def doc_to_fields(doc_id, radius=100):

    """
    Scan a document's text for field / department codes and link each
    matching subfield to the document.

    Args:
        doc_id (int): Id used to look up the document's `Document_Text` row.
        radius (int): Number of characters of context kept on either side
            of a match when cutting out the snippet.
    """

    text_row = Document_Text.get(Document_Text.document == doc_id)
    body = text_row.text

    # Try every subfield against the document text.
    for subfield in Subfield.select():

        hit = subfield.search(body)

        # No match -> nothing to link for this subfield.
        if not hit:
            continue

        # Clamp the context window so it never runs outside the text.
        start = max(hit.start() - radius, 0)
        stop = min(hit.end() + radius, len(body))

        Subfield_Document.create(
            subfield=subfield,
            document=text_row.document,
            offset=hit.start(),
            snippet=crunch(body[start:stop]),
        )
def doc_to_fields(doc_id, radius=100):

    """
    Scan a document's text for field / department codes and link each
    matching subfield to the document.

    Args:
        doc_id (int): Id used to look up the document's `Document_Text` row.
        radius (int): Number of characters of context kept on either side
            of a match when cutting out the snippet.
    """

    text_row = Document_Text.get(Document_Text.document == doc_id)
    body = text_row.text

    # Try every subfield against the document text.
    for subfield in Subfield.select():

        hit = subfield.search(body)

        # No match -> nothing to link for this subfield.
        if not hit:
            continue

        # Clamp the context window so it never runs outside the text.
        start = max(hit.start() - radius, 0)
        stop = min(hit.end() + radius, len(body))

        Subfield_Document.create(
            subfield=subfield,
            document=text_row.document,
            offset=hit.start(),
            snippet=crunch(body[start:stop]),
        )
def test_text_extraction_succeeds(mock_osp):

    """
    ext_text() should extract the text of a document and store it in a
    `Document_Text` row linked to that document.
    """

    # Register a file with known content, then a document row pointing at it.
    file_path = mock_osp.add_file(content='text')
    doc = Document.create(path=file_path)

    ext_text(doc.id)

    # Fetch the text row written for the document and check its content.
    text_row = Document_Text.get(Document_Text.document == doc)
    assert text_row.text == 'text'
def test_text_extraction_succeeds(mock_osp):

    """
    ext_text() should extract the text of a document and store it in a
    `Document_Text` row linked to that document.
    """

    # Register a file with known content, then a document row pointing at it.
    file_path = mock_osp.add_file(content='text')
    doc = Document.create(path=file_path)

    ext_text(doc.id)

    # Fetch the text row written for the document and check its content.
    text_row = Document_Text.get(Document_Text.document == doc)
    assert text_row.text == 'text'