import outline


def test_indexer():
    with open('texts/gleanings.txt', 'r', encoding='utf8') as myfile:
        text = myfile.read()

    doc = outline.DocumentIndex(text)
    assert doc.lookup(100) == {
        "section": "I: LAUDED AND GLORIFIED ART THOU, O LORD, MY...",
        "paragraph": 1,
        "section_seq": 1
    }
    assert doc.lookup(1000) == {
        "section": "I: LAUDED AND GLORIFIED ART THOU, O LORD, MY...",
        "paragraph": 2,
        "section_seq": 1
    }
    assert doc.lookup(10000) == {
        "section": "VI: BEHOLD, HOW THE DIVERS PEOPLES AND KINDREDS...",
        "paragraph": 19,
        "section_seq": 6
    }
    assert doc.lookup(100000) == {
        "section": "XXIX: THE PURPOSE OF GOD IN CREATING MAN HATH...",
        "paragraph": 132,
        "section_seq": 29
    }
    # offset past the end of the document: lookup returns the terminal datum
    # (the last section and paragraph)
    assert doc.lookup(500000) == {
        "section": "CLXVI: WHOSO LAYETH CLAIM TO A REVELATION DIRECT...",
        "paragraph": 718,
        "section_seq": 166
    }


def test_para_getter():
    with open('texts/gleanings.txt', 'r', encoding='utf8') as myfile:
        text = myfile.read()
    doc = outline.DocumentIndex(text)

    span = doc.get_paragraph_span(2)
    assert span == (349, 1081)


def test_sect_getter():
    with open('texts/gleanings.txt', 'r', encoding='utf8') as myfile:
        text = myfile.read()
    doc = outline.DocumentIndex(text)

    span = doc.get_section_span(4)
    assert span == (4850, 5875)


def test_simple_getters():
    with open('texts/gleanings.txt', 'r', encoding='utf8') as myfile:
        text = myfile.read()
    doc = outline.DocumentIndex(text)

    assert doc.get_number_of_paragraphs() == 718
    assert doc.get_number_of_sections() == 166
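
The index and the span getters agree with each other: offset 1000, which
test_indexer resolves to paragraph 2, falls inside the (349, 1081) span that
test_para_getter reports for that paragraph. A minimal cross-check on the same
gleanings.txt fixture, assuming spans are half-open (start, end) character
offsets (the test values above are consistent with that reading):

import outline

with open('texts/gleanings.txt', 'r', encoding='utf8') as myfile:
    doc = outline.DocumentIndex(myfile.read())

start, end = doc.get_paragraph_span(2)       # (349, 1081) per test_para_getter
assert start <= 1000 < end                   # offset 1000 lies in paragraph 2
assert doc.lookup(1000)["paragraph"] == 2    # matches test_indexer
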
Example #5
    def __init__(self, document_index=DOCUMENT_INDEX, nlpengine=None):
        """
        Initializes engine; reads and indexes everything.
        :param document_index: List of file metadata, as per the object declaration
        in documetadata.py under documentadata.DOCUMENT_INDEX. In practice, the
        docmetadata.DOCUMENT_INDEX will be used, which will contain all of
        the documents analyzed.
        :param nlpengine: the spacy nlpengine. Normally this is not passed in
        because the whole purpose of DocumentCollection is to encapsulate said
        engine. But this is necessary if you contruct different DocumentCollections
        (e.g. for test purposes) because of the size of that object.
        """
        if nlpengine is None:
            # load the English model; 'en_core_web_sm' is the current spaCy
            # name for the small English pipeline ('en' in older releases)
            self.nlp = spacy.load('en_core_web_sm')
        else:
            self.nlp = nlpengine
        self.document_index = document_index

        for doc_obj in self.document_index:
            # read in the next document's text file
            with open(DocumentCollection.DOC_FOLDER +
                      self.document_index[doc_obj]["file"],
                      'r',
                      encoding='utf8') as next_file:
                text = next_file.read()

                # build a document index over the raw text
                doc_index = outline.DocumentIndex(text)

                # run the full spaCy pipeline (tokenizer, tagger,
                # parser, NER) over the text
                doc = self.nlp(text)

                self.document_index[doc_obj]["raw"] = text
                self.document_index[doc_obj]["index"] = doc_index
                self.document_index[doc_obj]["nlpdoc"] = doc