Beispiel #1
0
Datei: db.py Projekt: imclab/iepy
 def get_documents_lacking_preprocess(self, step):
     """Return the documents that have not yet gone through ``step``.

     ``step`` must be a ``PreProcessSteps`` instance; anything else raises
     ``InvalidPreprocessSteps``. The result is an ``IEDocument`` query for
     documents that have no ``preprocess_metadata`` entry named after the
     step, with ``timeout(False)`` applied to it.
     """
     if isinstance(step, PreProcessSteps):
         # A step is considered pending while its metadata key is absent.
         lookup = 'preprocess_metadata__%s__exists' % step.name
         pending = IEDocument.objects(**{lookup: False})
         return pending.timeout(False)
     raise InvalidPreprocessSteps
Beispiel #2
0
Datei: db.py Projekt: imclab/iepy
    def create_document(self, identifier, text, metadata=None):
        """Build, save and return a new ``IEDocument`` holding ``text``.

        ``identifier`` is stored as the document's ``human_identifier``; it
        is meant to distinguish one document from another (this method does
        no uniqueness check of its own).
        ``metadata`` is an optional dictionary of caller-defined data stored
        on the document as given. When omitted, a fresh empty dictionary is
        used instead.
        """
        extra = {} if metadata is None else metadata
        new_doc = IEDocument(
            human_identifier=identifier,
            text=text,
            metadata=extra,
        )
        new_doc.save()
        return new_doc
Beispiel #3
0
Datei: db.py Projekt: imclab/iepy
 def get_raw_documents(self):
     """Return the documents whose ``text`` is the empty string.

     The result is an ``IEDocument`` query with ``timeout(False)`` applied.
     NOTE(review): the filter is ``text=''``; whether documents that lack
     the field entirely are also matched depends on how ``IEDocument``
     declares ``text`` -- confirm against the model.
     """
     empty_text_docs = IEDocument.objects(text='')
     return empty_text_docs.timeout(False)
Beispiel #4
0
 def setUp(self):
     """Stub out ``IEDocument.save`` and create a document fixture."""
     # Replace save with a mock for each test; the patch is undone by the
     # cleanup hook registered right after it is started.
     save_patch = mock.patch.object(IEDocument, 'save')
     self.mock_save = save_patch.start()
     self.addCleanup(save_patch.stop)
     fixture_metadata = {'raw_text': 'hello world'}
     self.doc = IEDocument(metadata=fixture_metadata)