Exemple #1
0
    def put(self, path: str, document: Document,
            force_replace: bool = False) -> DocumentFamily:
        """Store ``document`` as a new family at ``path`` if none exists there.

        When ``get_family_by_path(path)`` finds nothing, a new
        ``DocumentFamily`` is created, the document is registered through
        ``add_document``, written to ``<store_path>/<content object id>.kdxa``,
        the family is appended to the metastore, the metastore is written and
        the listeners are notified with the resulting event.  When a family is
        already present at the path nothing is written.

        Args:
          path (str): The path to the document family
          document (Document): The document you wish to upload
          force_replace (bool): Part of the signature, but this
              implementation does not read it

        Returns:
            The document family found at ``path`` after the (possible) insert

        Raises:
            Exception: if no family can be found at ``path`` afterwards
        """
        already_stored = self.get_family_by_path(path) is not None

        # A document can only be added when the path is not yet a family
        if not already_stored:
            family = DocumentFamily(path=path)
            event = self.add_document(family, document)

            kdxa_target = os.path.join(self.store_path,
                                       event.content_object.id) + ".kdxa"
            document.to_kdxa(kdxa_target)

            self.metastore.append(family)
            self.write_metastore()

            # Let the listeners know about the new content
            self.notify_listeners(event)

        stored_family = self.get_family_by_path(path)
        if stored_family is None:
            raise Exception("Unable to get document family?")
        return stored_family
Exemple #2
0
    def __next__(self):
        """Return the document for the next entry in ``self.files``.

        Raises ``StopIteration`` once ``self.index`` has moved past the last
        file.  With ``self.unpack`` set the file itself is loaded through
        ``Document.from_kdxa``; otherwise a new document is created whose
        metadata and source describe the file.  Note that ``mime_type`` holds
        the raw return value of ``mimetypes.guess_type`` (a
        ``(type, encoding)`` tuple).
        """
        if self.index >= len(self.files):
            raise StopIteration

        self.index += 1
        current_file = self.files[self.index - 1]

        if self.unpack:
            return Document.from_kdxa(current_file)

        metadata = DocumentMetadata({
            "source_path": current_file,
            "connector": self.get_name(),
            "mime_type": mimetypes.guess_type(current_file),
            "connector_options": {
                "path": self.path,
                "file_filter": self.file_filter
            }
        })
        document = Document(metadata)

        document.source.original_filename = os.path.basename(current_file)
        document.source.original_path = self.path
        document.source.connector = self.get_name()

        # TODO we need to get the checksum and last_updated and created times
        return document
def get_test_document():
    """Build a test document: a 'foo' root ("cheese") with one 'bar' child."""
    doc = Document(DocumentMetadata())

    root = doc.create_node(type='foo')
    root.content = "cheese"
    doc.content_node = root

    child = doc.create_node(type='bar', content='fishstick')
    doc.content_node.add_child(child)
    return doc
Exemple #4
0
def test_doc_from_text():
    """Check ``Document.from_text`` with and without a separator."""
    sentence = 'It is going to be a great day'

    # No separator: the whole text lands on the root node
    whole_root = Document.from_text(sentence).get_root()
    assert whole_root.content == sentence
    assert len(whole_root.get_children()) == 0

    # Space separator: the root is empty and each word becomes a child
    split_root = Document.from_text(sentence, separator=' ').get_root()
    assert split_root.content is None
    words = split_root.get_children()
    assert len(words) == 8
    assert words[4].content == 'be'
def create_document():
    """Build a document with a 'foo' root ("cheese") and a 'bar' child."""
    doc = Document(DocumentMetadata())

    root = doc.create_node(type='foo')
    root.content = "cheese"
    doc.content_node = root

    child = doc.create_node(type='bar')
    child.content = "fishstick"
    doc.content_node.add_child(child)

    return doc
def create_document():
    """Build a document named "test.doc" with a 'foo' root and a 'bar' child."""
    doc = Document(DocumentMetadata())
    doc.source.original_filename = "test.doc"

    root = doc.create_node(node_type='foo')
    root.content = "cheese"
    doc.content_node = root

    child = doc.create_node(node_type='bar')
    child.content = "fishstick"
    doc.content_node.add_child(child)

    return doc
Exemple #7
0
    def replace_content_object(self, document_family: DocumentFamily,
                               content_object_id: str,
                               document: Document) -> Optional[DocumentFamily]:
        """Overwrite the stored document behind one content object of a family.

        The first content object whose id equals ``content_object_id`` has its
        ``.kdxa`` file rewritten from ``document`` and its labels and classes
        copied from it; the metastore is then written.

        Args:
          document_family (DocumentFamily): The family holding the object
          content_object_id (str): The id of the content object to replace
          document (Document): The document providing the new content

        Returns:
          The document family, or None when no content object has that id
        """
        target = next((candidate
                       for candidate in document_family.content_objects
                       if candidate.id == content_object_id), None)
        if target is None:
            return None

        kdxa_path = os.path.join(self.store_path, content_object_id) + ".kdxa"
        document.to_kdxa(kdxa_path)

        target.labels = document.labels
        target.classes = document.classes
        self.write_metastore()
        return document_family
Exemple #8
0
    def process(self, document: Document):
        """Add or remove ``self.label`` on the document.

        Args:
          document (Document): The document to update

        Returns:
          The same document; the label is removed when ``self.remove`` is
          truthy and added otherwise

        """
        change_label = (document.remove_label
                        if self.remove else document.add_label)
        change_label(self.label)
        return document
Exemple #9
0
    def put(self, path: str, document: Document) -> DocumentFamily:
        """Upload a document to the store's ``fs`` endpoint at ``path``.

        The document is sent as KDDB in a multipart POST together with its
        version and the path.

        Args:
          path (str): The path to store the document under
          document (Document): The document to upload

        Returns:
          The ``DocumentFamily`` parsed from the JSON body of a 200 response

        Raises:
          Exception: when the response status is not 200
          JSONDecodeError: re-raised (after a warning) when the response body
              cannot be decoded
        """
        from kodexa import KodexaPlatform
        try:
            logger.info(f"Putting document to path {path}")

            upload = {"file": document.to_kddb()}
            form_fields = {
                "path": path,
                "documentVersion": document.version,
                "document": True
            }
            endpoint = f"{KodexaPlatform.get_url()}/api/stores/{self.ref.replace(':', '/')}/fs"
            auth_headers = {
                "x-access-token": KodexaPlatform.get_access_token()
            }
            response = requests.post(endpoint,
                                     params={"path": path},
                                     headers=auth_headers,
                                     files=upload,
                                     data=form_fields)

            if response.status_code != 200:
                msg = "Document family create failed [" + response.text + "], response " + str(
                    response.status_code)
                logger.warning(msg)
                raise Exception(msg)

            return DocumentFamily.parse_obj(response.json())
        except JSONDecodeError:
            logger.warning("Unable to decode the JSON response")
            raise
Exemple #10
0
    def replace_content_object(self, document_family: DocumentFamily,
                               content_object_id: str,
                               document: Document) -> DocumentFamily:
        """Replace the content of one content object in a remote family.

        The document is sent as KDDB in a multipart PUT to the content
        endpoint of the given family / content object.

        Args:
          document_family (DocumentFamily): The family holding the object
          content_object_id (str): The id of the content object to replace
          document (Document): The document providing the new content

        Returns:
          The ``DocumentFamily`` parsed from the JSON body of a 200 response

        Raises:
          Exception: when the response status is not 200
          JSONDecodeError: re-raised (after a warning) when the response body
              cannot be decoded
        """
        from kodexa import KodexaPlatform
        try:
            logger.info(
                f"Replacing document in family {document_family.id} content object {content_object_id}"
            )

            upload = {"file": document.to_kddb()}
            endpoint = f"{KodexaPlatform.get_url()}/api/stores/{self.ref.replace(':', '/')}/families/{document_family.id}/objects/{content_object_id}/content"
            auth_headers = {
                "x-access-token": KodexaPlatform.get_access_token()
            }
            response = requests.put(endpoint,
                                    headers=auth_headers,
                                    files=upload)

            if response.status_code != 200:
                msg = "Document replace failed [" + response.text + "], response " + str(
                    response.status_code)
                logger.warning(msg)
                raise Exception(msg)

            return DocumentFamily.parse_obj(response.json())
        except JSONDecodeError:
            logger.warning("Unable to decode the JSON response")
            raise
Exemple #11
0
    def add_related_document_to_family(self, document_family_id: str,
                                       transition: DocumentTransition,
                                       document: Document) -> ContentObject:
        """Upload a document as a new object of an existing remote family.

        The document is sent as KDDB in a multipart POST, together with the
        transition type, the document version and the id of the source
        content object taken from ``transition``.

        Args:
          document_family_id (str): The id of the family to add to
          transition (DocumentTransition): Describes how the new document
              relates to its source content object
          document (Document): The document to upload

        Returns:
          The ``ContentObject`` parsed from the JSON body of a 200 response

        Raises:
          Exception: when the response status is not 200
          JSONDecodeError: re-raised (after a warning) when the response body
              cannot be decoded
        """
        from kodexa import KodexaPlatform
        try:
            logger.info(f"Putting document to family id {document_family_id}")

            form_fields = {
                'transitionType': transition.transition_type.value,
                'documentVersion': document.version,
                'document': True,
                'sourceContentObjectId': transition.source_content_object_id
            }
            upload = {"file": document.to_kddb()}
            endpoint = f"{KodexaPlatform.get_url()}/api/stores/{self.ref.replace(':', '/')}/families/{document_family_id}/objects"
            auth_headers = {
                "x-access-token": KodexaPlatform.get_access_token()
            }
            response = requests.post(endpoint,
                                     headers=auth_headers,
                                     data=form_fields,
                                     files=upload)

            if response.status_code != 200:
                msg = "Document family create failed [" + response.text + "], response " + str(
                    response.status_code)
                logger.warning(msg)
                raise Exception(msg)

            return ContentObject.parse_obj(response.json())
        except JSONDecodeError:
            logger.warning("Unable to decode the JSON response")
            raise
Exemple #12
0
 def get_document_by_content_object(
         self, document_family: DocumentFamily,
         content_object: ContentObject) -> Optional[Document]:
     """Fetch the content of a content object and load it as a document.

     Args:
       document_family (DocumentFamily): The family holding the object
       content_object (ContentObject): The content object to download

     Returns:
       The document built from the response content through
       ``Document.from_kddb``, or None when the client returns no response
     """
     from kodexa import KodexaPlatform

     client = KodexaPlatform.get_client()
     response = client.get(
         f"api/stores/{self.ref.replace(':', '/')}/families/{document_family.id}/objects/{content_object.id}/content"
     )
     if response is None:
         return None
     return Document.from_kddb(response.content)
Exemple #13
0
def test_kbbd():
    """Round-trip a tagged document through KDDB and check it survives."""
    original = Document.from_text('It is going to be a great day')
    root = original.content_node
    root.tag('cheese', fixed_position=[1, 2])
    root.tag('foo', fixed_position=[3, 4])

    serialized = original.to_kddb()
    restored = original.from_kddb(serialized)

    restored_root = restored.content_node
    assert restored_root.get_all_content() == 'It is going to be a great day'
    assert len(restored_root.get_features()) == 2
Exemple #14
0
    def load(self, document_id: str):
        """
        Read ``<store_path>/<document_id>.json`` (UTF-8) and build a document
        from its contents

        :return the document created by ``Document.from_json``
        """
        json_path = os.path.join(self.store_path, document_id + '.json')
        with open(json_path, encoding='utf8') as handle:
            raw_json = handle.read()
        return Document.from_json(raw_json)
Exemple #15
0
    def put_native(self, path: str, content: Any, force_replace=False):
        """Store a native file plus a document that references it.

        The family at ``path`` is looked up (and created and appended to the
        metastore when missing).  A ``NATIVE`` content object with a fresh id
        is added to the family and ``content`` is written in binary mode to
        ``<store_path>/<id>``.  An empty document whose source headers point
        at the family and the native object is then added through
        ``add_document`` and written as a ``.kdxa`` file.

        Args:
          path (str): The path to the native file
          content (Any): The content to store; it is written to a file
              opened in binary mode
          force_replace (bool): Part of the signature, but this
              implementation does not read it
        Returns:
          None

        """
        family = self.get_family_by_path(path)
        if family is None:
            family = DocumentFamily(path=path)
            self.metastore.append(family)

        # Content object describing the native file itself
        native_fields = {'contentType': 'NATIVE'}
        native_object = ContentObject(**native_fields)
        native_object.id = uuid.uuid4().hex
        native_object.created_on = datetime.now()

        if family.content_objects is None:
            family.content_objects = []
        family.content_objects.append(native_object)

        native_path = os.path.join(self.store_path, native_object.id)
        with open(native_path, 'wb') as native_file:
            native_file.write(content)

        # Document that points back at the native content object
        reference_doc = Document()
        reference_doc.source.connector = "document-store"
        reference_doc.source.headers = {
            "ref": family.store_ref,
            "family": family.id,
            "id": native_object.id
        }
        event = self.add_document(family, reference_doc)
        kdxa_path = os.path.join(self.store_path,
                                 event.content_object.id) + ".kdxa"
        reference_doc.to_kdxa(kdxa_path)
Exemple #16
0
    def load_kdxa(self, path: str):
        """Load a KDXA file and put the document in the store under its UUID.

        Args:
          path (str): The location handed to ``Document.from_kdxa``

        Returns:
          None

        """
        loaded = Document.from_kdxa(path)
        family_path = loaded.uuid
        self.put(family_path, loaded)
Exemple #17
0
    def add_related_document_to_family(self, document_family_id: str,
                                       transition: DocumentTransition,
                                       document: Document):
        """Add a document to every stored family with the given id.

        The metastore is re-read first.  For each matching family the document
        is registered through ``add_document`` with the transition, written to
        ``<store_path>/<content object id>.kdxa`` and the metastore is written.

        Args:
          document_family_id (str): The id of the family to add to
          transition (DocumentTransition): Passed through to ``add_document``
          document (Document): The document to add

        Returns:
          None

        """
        self.read_metastore()
        for candidate in self.metastore:
            if candidate.id != document_family_id:
                continue

            event = self.add_document(candidate, document, transition)
            kdxa_path = os.path.join(self.store_path,
                                     event.content_object.id) + ".kdxa"
            document.to_kdxa(kdxa_path)
            self.write_metastore()
def test_url_pipeline():
    """Run a URL document through a parse-and-store pipeline and check it."""
    source = Document.from_url("http://www.google.com")
    store = LocalDocumentStore()

    pipeline = Pipeline(source)
    pipeline = pipeline.add_step(TextParser(encoding='ISO-8859-1'))
    pipeline = pipeline.add_step(DocumentStoreWriter(store))
    stats = pipeline.run().statistics

    assert stats.documents_processed == 1
    assert stats.document_exceptions == 0
    assert store.count() == 1

    stored_doc = store.get_latest_document("http://www.google.com")
    print(stored_doc.content_node.get_all_content())
Exemple #19
0
 def __next__(self):
     """Return one document describing ``self.file``, then stop.

     The first call returns a new document whose metadata records the file
     path, the connector name, the result of ``mimetypes.guess_type`` and the
     connector options.  Every later call raises ``StopIteration``.
     """
     if self.completed:
         raise StopIteration

     # Mark the connector as exhausted before handing out the document;
     # without this the iterator never terminates and yields the same file
     # on every call.
     self.completed = True
     return Document(
         DocumentMetadata({
             "source_path": self.file,
             "connector": self.get_name(),
             "mime_type": mimetypes.guess_type(self.file),
             "connector_options": {
                 "file": self.file
             }
         }))
Exemple #20
0
 def __next__(self):
     """Return one document describing the URL source, then stop.

     The first call marks the connector as completed and returns a new
     document whose metadata records the connector name, the URL and the
     headers.  Every later call raises ``StopIteration``.
     """
     if self.completed:
         raise StopIteration

     self.completed = True
     metadata = DocumentMetadata({
         "connector": self.get_name(),
         "connector_options": {
             "url": self.url,
             "headers": self.headers
         }
     })
     return Document(metadata)
Exemple #21
0
    def from_file(file_path: str, *args, **kwargs) -> Pipeline:
        """Create a pipeline whose source is a document built from a file.

        Args:
          file_path (str): The path to the file, handed to
              ``Document.from_file``
          *args: Extra positional arguments forwarded to ``Pipeline``
          **kwargs: Extra keyword arguments forwarded to ``Pipeline``

        Returns:
          Pipeline: A new pipeline

        """
        source_document = Document.from_file(file_path)
        return Pipeline(source_document, *args, **kwargs)
Exemple #22
0
    def from_text(text: str, *args, **kwargs) -> Pipeline:
        """Create a pipeline whose source is a document built from text.

        Args:
          text (str): The text handed to ``Document.from_text``
          *args: Extra positional arguments forwarded to ``Pipeline``
          **kwargs: Extra keyword arguments forwarded to ``Pipeline``

        Returns:
          Pipeline: A new pipeline

        """
        source_document = Document.from_text(text)
        return Pipeline(source_document, *args, **kwargs)
Exemple #23
0
    def from_url(url, headers=None, *args, **kwargs):
        """Create a pipeline whose source is a document built from a URL.

        Args:
          url: The URL ie. https://www.google.com
          headers: A dictionary of headers, handed to ``Document.from_url``
              together with the URL (Default value = None)
          *args: Extra positional arguments forwarded to ``Pipeline``
          **kwargs: Extra keyword arguments forwarded to ``Pipeline``

        Returns:
          A new instance of a pipeline

        """
        source_document = Document.from_url(url, headers)
        return Pipeline(source_document, *args, **kwargs)
Exemple #24
0
    def get_document_by_content_object(
            self, document_family: DocumentFamily,
            content_object: ContentObject) -> Document:
        """Load the stored document that belongs to a content object.

        Args:
          document_family (DocumentFamily): The document family (not read by
              this implementation)
          content_object (ContentObject): The content object whose id names
              the ``.kdxa`` file in the store path

        Returns:
          The document loaded from ``<store_path>/<content object id>.kdxa``

        """
        kdxa_path = os.path.join(self.store_path, content_object.id) + ".kdxa"
        return Document.from_kdxa(kdxa_path)
Exemple #25
0
    def get_by_path(self, path: str) -> Optional[Document]:
        """Return the latest document in the family at the given path

        Args:
          path (str): The family path to look for in the metastore

        Returns:
          The document loaded from the ``.kdxa`` file of the latest content
          of the first family with that path, or None when no family matches

        """
        matching_family = next(
            (family for family in self.metastore if family.path == path),
            None)
        if matching_family is None:
            return None

        latest_id = matching_family.get_latest_content().id
        return Document.from_kdxa(
            os.path.join(self.store_path, latest_id) + ".kdxa")
def get_test_document_with_three_children():
    """Build a test document: a 'foo' root ("cheese") with three 'bar' children."""
    doc = Document(DocumentMetadata())

    root = doc.create_node(type='foo')
    root.content = "cheese"
    doc.content_node = root

    # The children are added in this order
    for child_content in ('fishstick', 'cheeseburger', 'beans'):
        child = doc.create_node(type='bar', content=child_content)
        doc.content_node.add_child(child)

    return doc
Exemple #27
0
    def __next__(self):
        """Return one document describing the URL source, then stop.

        The first call marks the connector as completed and returns a new
        document whose metadata and source carry the connector name, the URL
        and the headers.  Every later call raises ``StopIteration``.
        """
        if self.completed:
            raise StopIteration
        self.completed = True

        metadata = DocumentMetadata({
            "connector": self.get_name(),
            "connector_options": {
                "url": self.url,
                "headers": self.headers
            }
        })
        url_document = Document(metadata)

        url_document.source.connector = self.get_name()
        url_document.source.original_path = self.url
        url_document.source.headers = self.headers
        return url_document
Exemple #28
0
    def get_by_uuid(self, uuid: str) -> Optional[Document]:
        """Load the document stored for the content object with the given id.

        Args:
          uuid (str): The content object id to look for across all families

        Returns:
          The document loaded from ``<store_path>/<id>.kdxa`` for the first
          content object whose id matches, or None when there is no match

        """
        every_content_object = (stored_object
                                for family in self.metastore
                                for stored_object in family.content_objects)
        for candidate in every_content_object:
            if candidate.id != uuid:
                continue
            kdxa_path = os.path.join(self.store_path, candidate.id) + ".kdxa"
            return Document.from_kdxa(kdxa_path)
        return None
def test_virtual_navigation_with_no_0_index():
    """Navigate from index 0 when the only real child sits at index 2.

    The root gets a single child at index 2.  The nodes reached at index 0
    and index 1 are expected to have no content, and the next node after
    those is expected to carry the real child's content.
    """
    document = Document(DocumentMetadata())
    document.add_mixin('core')
    node = document.create_node(type='loopy')
    node.content = "banana"
    document.content_node = node

    document.content_node.add_child(document.create_node(type='loopy',
                                                         content='banana2'),
                                    index=2)

    assert document.content_node.get_node_at_index(0).content is None
    assert document.content_node.get_node_at_index(
        0).next_node().content is None
    # Compare by value: `is` against a str literal only passes when the two
    # strings happen to be the same interned object, and it raises a
    # SyntaxWarning on Python 3.8+.
    assert document.content_node.get_node_at_index(
        0).next_node().next_node().content == 'banana2'
Exemple #30
0
 def __next__(self):
     """Return a new document for the next entry in ``self.files``.

     Raises ``StopIteration`` once ``self.index`` has moved past the last
     file.  The document metadata records the file path, the connector name,
     the result of ``mimetypes.guess_type`` and the connector options.
     """
     if self.index >= len(self.files):
         raise StopIteration

     self.index += 1
     current_file = self.files[self.index - 1]

     metadata = DocumentMetadata({
         "source_path": current_file,
         "connector": self.get_name(),
         "mime_type": mimetypes.guess_type(current_file),
         "connector_options": {
             "path": self.path,
             "file_filter": self.file_filter
         }
     })
     return Document(metadata)