import outline


def test_indexer():
    with open('texts/gleanings.txt', 'r', encoding='utf8') as myfile:
        text = myfile.read()
    doc = outline.DocumentIndex(text)
    assert doc.lookup(100) == {
        "section": "I: LAUDED AND GLORIFIED ART THOU, O LORD, MY...",
        "paragraph": 1,
        "section_seq": 1
    }
    assert doc.lookup(1000) == {
        "section": "I: LAUDED AND GLORIFIED ART THOU, O LORD, MY...",
        "paragraph": 2,
        "section_seq": 1
    }
    assert doc.lookup(10000) == {
        "section": "VI: BEHOLD, HOW THE DIVERS PEOPLES AND KINDREDS...",
        "paragraph": 19,
        "section_seq": 6
    }
    assert doc.lookup(100000) == {
        "section": "XXIX: THE PURPOSE OF GOD IN CREATING MAN HATH...",
        "paragraph": 132,
        "section_seq": 29
    }
    # Overflow: an offset past the end of the text should point to the
    # terminal datum (the last paragraph of the last section).
    assert doc.lookup(500000) == {
        "section": "CLXVI: WHOSO LAYETH CLAIM TO A REVELATION DIRECT...",
        "paragraph": 718,
        "section_seq": 166
    }
def test_para_getter():
    with open('texts/gleanings.txt', 'r', encoding='utf8') as myfile:
        text = myfile.read()
    doc = outline.DocumentIndex(text)
    span = doc.get_paragraph_span(2)
    assert span == (349, 1081)
def test_sect_getter():
    with open('texts/gleanings.txt', 'r', encoding='utf8') as myfile:
        text = myfile.read()
    doc = outline.DocumentIndex(text)
    span = doc.get_section_span(4)
    assert span == (4850, 5875)
def test_simple_getters():
    with open('texts/gleanings.txt', 'r', encoding='utf8') as myfile:
        text = myfile.read()
    doc = outline.DocumentIndex(text)
    assert doc.get_number_of_paragraphs() == 718
    assert doc.get_number_of_sections() == 166
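# The tests above pin down the DocumentIndex contract: character-offset
# lookup, 1-based paragraph and section spans, and simple counts. The
# following is a minimal sketch of one way to satisfy that contract; it is
# an assumption, NOT the actual outline.py implementation. It supposes
# sections are headed by lines like "I: LAUDED AND GLORIFIED..." and that
# paragraphs are separated by blank lines, and it resolves lookups with
# bisect over recorded start offsets. (The real index also appears to
# truncate long headings with "...", which this sketch omits.)
import bisect
import re


class DocumentIndexSketch:
    SECTION_RE = re.compile(r'^[IVXLCDM]+: .+', re.MULTILINE)

    def __init__(self, text):
        self.text = text
        # (start_offset, heading, 1-based sequence number) per section
        self.sections = [(m.start(), m.group(0), seq + 1)
                         for seq, m in enumerate(self.SECTION_RE.finditer(text))]
        # (start, end) character spans for blank-line-delimited paragraphs
        self.paragraphs = []
        pos = 0
        for chunk in text.split('\n\n'):
            self.paragraphs.append((pos, pos + len(chunk)))
            pos += len(chunk) + 2
        self._section_starts = [s for s, _, _ in self.sections]
        self._para_starts = [s for s, _ in self.paragraphs]

    def lookup(self, offset):
        # Clamp overflow so an out-of-range offset resolves to the
        # terminal datum, as the overflow assertion above expects.
        offset = min(offset, len(self.text) - 1)
        s = max(bisect.bisect_right(self._section_starts, offset) - 1, 0)
        p = max(bisect.bisect_right(self._para_starts, offset) - 1, 0)
        _, heading, seq = self.sections[s]
        return {"section": heading, "paragraph": p + 1, "section_seq": seq}

    def get_paragraph_span(self, n):
        return self.paragraphs[n - 1]

    def get_section_span(self, n):
        # A section ends where the next one begins (or at end of text).
        start = self.sections[n - 1][0]
        end = self.sections[n][0] if n < len(self.sections) else len(self.text)
        return (start, end)

    def get_number_of_paragraphs(self):
        return len(self.paragraphs)

    def get_number_of_sections(self):
        return len(self.sections)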
def __init__(self, document_index=DOCUMENT_INDEX, nlpengine=None):
    """
    Initializes the engine; reads and indexes every document.

    :param document_index: File metadata, as per the object declaration
        in docmetadata.py under docmetadata.DOCUMENT_INDEX. In practice,
        docmetadata.DOCUMENT_INDEX itself will be used, which contains
        all of the documents to be analyzed.
    :param nlpengine: The spaCy NLP engine. Normally this is not passed
        in, because the whole purpose of DocumentCollection is to
        encapsulate that engine. It is necessary, however, if you
        construct multiple DocumentCollections (e.g. for test purposes),
        because of the size of that object.
    """
    if nlpengine is None:
        self.nlp = spacy.load('en')  # use English
    else:
        self.nlp = nlpengine
    self.document_index = document_index
    for doc_obj in self.document_index:
        # read in the text file to be analyzed
        with open(DocumentCollection.DOC_FOLDER + self.document_index[doc_obj]["file"],
                  'r', encoding='utf8') as next_file:
            text = next_file.read()
        # create a document index
        doc_index = outline.DocumentIndex(text)
        # tokenize, then run the spaCy pipeline stages over the document
        doc = self.nlp.tokenizer(text)
        self.nlp.tagger(doc)
        self.nlp.parser(doc)
        self.nlp.entity(doc)
        self.document_index[doc_obj]["raw"] = text
        self.document_index[doc_obj]["index"] = doc_index
        self.document_index[doc_obj]["nlpdoc"] = doc
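# A minimal usage sketch for the constructor above, assuming the module's
# own imports (spacy, outline) and docmetadata.DOCUMENT_INDEX: load the
# spaCy model once and share it across collections, which is exactly the
# situation the nlpengine parameter exists for. TEST_INDEX and the
# "sample" key are hypothetical, introduced here only for illustration.
TEST_INDEX = {  # hypothetical metadata, shaped like docmetadata.DOCUMENT_INDEX
    "sample": {"file": "sample.txt"},
}

nlp = spacy.load('en')  # expensive; do this once and reuse it
main_docs = DocumentCollection(nlpengine=nlp)
test_docs = DocumentCollection(document_index=TEST_INDEX, nlpengine=nlp)

# After construction, each entry carries the raw text, its structural
# index, and the processed spaCy document:
entry = test_docs.document_index["sample"]
print(entry["index"].get_number_of_sections())
print(entry["nlpdoc"][:10])  # first ten tokens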