import outline


def test_indexer():
    with open('texts/gleanings.txt', 'r', encoding='utf8') as myfile:
        text = myfile.read()
    doc = outline.DocumentIndex(text)
    assert doc.lookup(100) == {
        "section": "I: LAUDED AND GLORIFIED ART THOU, O LORD, MY...",
        "paragraph": 1,
        "section_seq": 1
    }
    assert doc.lookup(1000) == {
        "section": "I: LAUDED AND GLORIFIED ART THOU, O LORD, MY...",
        "paragraph": 2,
        "section_seq": 1
    }
    assert doc.lookup(10000) == {
        "section": "VI: BEHOLD, HOW THE DIVERS PEOPLES AND KINDREDS...",
        "paragraph": 19,
        "section_seq": 6
    }
    assert doc.lookup(100000) == {
        "section": "XXIX: THE PURPOSE OF GOD IN CREATING MAN HATH...",
        "paragraph": 132,
        "section_seq": 29
    }
    # Overflow: an offset past the end of the text should point to the
    # terminal datum (the last paragraph of the last section).
    assert doc.lookup(500000) == {
        "section": "CLXVI: WHOSO LAYETH CLAIM TO A REVELATION DIRECT...",
        "paragraph": 718,
        "section_seq": 166
    }
def test_para_getter():
    with open('texts/gleanings.txt', 'r', encoding='utf8') as myfile:
        text = myfile.read()
    doc = outline.DocumentIndex(text)
    span = doc.get_paragraph_span(2)
    assert span == (349, 1081)
def test_sect_getter():
    with open('texts/gleanings.txt', 'r', encoding='utf8') as myfile:
        text = myfile.read()
    doc = outline.DocumentIndex(text)
    span = doc.get_section_span(4)
    assert span == (4850, 5875)
def test_simple_getters():
    with open('texts/gleanings.txt', 'r', encoding='utf8') as myfile:
        text = myfile.read()
    doc = outline.DocumentIndex(text)
    assert doc.get_number_of_paragraphs() == 718
    assert doc.get_number_of_sections() == 166
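# The tests above pin down the DocumentIndex contract: character-offset
# lookup, 1-based paragraph and section spans, and simple counts. The
# following is a minimal sketch of one way to satisfy that contract; it is
# an assumption, NOT the actual outline.py implementation. It supposes
# sections are headed by lines like "I: LAUDED AND GLORIFIED..." and that
# paragraphs are separated by blank lines, and it resolves lookups with
# bisect over recorded start offsets. (The real index also appears to
# truncate long headings with "...", which this sketch omits.)
import bisect
import re


class DocumentIndexSketch:
    SECTION_RE = re.compile(r'^[IVXLCDM]+: .+', re.MULTILINE)

    def __init__(self, text):
        self.text = text
        # (start_offset, heading, 1-based sequence number) per section
        self.sections = [(m.start(), m.group(0), seq + 1)
                         for seq, m in enumerate(self.SECTION_RE.finditer(text))]
        # (start, end) character spans for blank-line-delimited paragraphs
        self.paragraphs = []
        pos = 0
        for chunk in text.split('\n\n'):
            self.paragraphs.append((pos, pos + len(chunk)))
            pos += len(chunk) + 2
        self._section_starts = [s for s, _, _ in self.sections]
        self._para_starts = [s for s, _ in self.paragraphs]

    def lookup(self, offset):
        # Clamp overflow so an out-of-range offset resolves to the
        # terminal datum, as the overflow assertion above expects.
        offset = min(offset, len(self.text) - 1)
        s = max(bisect.bisect_right(self._section_starts, offset) - 1, 0)
        p = max(bisect.bisect_right(self._para_starts, offset) - 1, 0)
        _, heading, seq = self.sections[s]
        return {"section": heading, "paragraph": p + 1, "section_seq": seq}

    def get_paragraph_span(self, n):
        return self.paragraphs[n - 1]

    def get_section_span(self, n):
        # A section ends where the next one begins (or at end of text).
        start = self.sections[n - 1][0]
        end = self.sections[n][0] if n < len(self.sections) else len(self.text)
        return (start, end)

    def get_number_of_paragraphs(self):
        return len(self.paragraphs)

    def get_number_of_sections(self):
        return len(self.sections)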
def __init__(self, document_index=DOCUMENT_INDEX, nlpengine=None):
    """
    Initializes the engine; reads and indexes every document.

    :param document_index: File metadata, as per the object declaration
        in docmetadata.py under docmetadata.DOCUMENT_INDEX. In practice,
        docmetadata.DOCUMENT_INDEX itself will be used, which contains
        all of the documents to be analyzed.
    :param nlpengine: The spaCy NLP engine. Normally this is not passed
        in, because the whole purpose of DocumentCollection is to
        encapsulate that engine. It is necessary, however, if you
        construct multiple DocumentCollections (e.g. for test purposes),
        because of the size of that object.
    """
    if nlpengine is None:
        self.nlp = spacy.load('en')  # use English
    else:
        self.nlp = nlpengine
    self.document_index = document_index
    for doc_obj in self.document_index:
        # read in the text file to be analyzed
        with open(DocumentCollection.DOC_FOLDER + self.document_index[doc_obj]["file"],
                  'r', encoding='utf8') as next_file:
            text = next_file.read()
        # create a document index
        doc_index = outline.DocumentIndex(text)
        # tokenize, then run the spaCy pipeline stages over the document
        doc = self.nlp.tokenizer(text)
        self.nlp.tagger(doc)
        self.nlp.parser(doc)
        self.nlp.entity(doc)
        self.document_index[doc_obj]["raw"] = text
        self.document_index[doc_obj]["index"] = doc_index
        self.document_index[doc_obj]["nlpdoc"] = doc
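# A minimal usage sketch for the constructor above, assuming the module's
# own imports (spacy, outline) and docmetadata.DOCUMENT_INDEX: load the
# spaCy model once and share it across collections, which is exactly the
# situation the nlpengine parameter exists for. TEST_INDEX and the
# "sample" key are hypothetical, introduced here only for illustration.
TEST_INDEX = {  # hypothetical metadata, shaped like docmetadata.DOCUMENT_INDEX
    "sample": {"file": "sample.txt"},
}

nlp = spacy.load('en')  # expensive; do this once and reuse it
main_docs = DocumentCollection(nlpengine=nlp)
test_docs = DocumentCollection(document_index=TEST_INDEX, nlpengine=nlp)

# After construction, each entry carries the raw text, its structural
# index, and the processed spaCy document:
entry = test_docs.document_index["sample"]
print(entry["index"].get_number_of_sections())
print(entry["nlpdoc"][:10])  # first ten tokens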