def doc_to_fields(doc_id, radius=100):

    """
    Scan one document's text for subfield codes and record every hit.

    For each subfield whose `search()` finds something in the text, a
    `Subfield_Document` row is created holding the match offset and a
    snippet of the surrounding text.

    Args:
        doc_id (int): Value matched against `Document_Text.document`.
        radius (int): Characters kept on either side of the match when
            cutting the snippet.
    """

    row = Document_Text.get(Document_Text.document == doc_id)
    text = row.text

    for subfield in Subfield.select():

        hit = subfield.search(text)

        # Nothing found for this subfield -> no link to write.
        if not hit:
            continue

        # Clamp the snippet window so it never runs off either end.
        start = max(hit.start() - radius, 0)
        stop = min(hit.end() + radius, len(text))

        Subfield_Document.create(
            subfield=subfield,
            document=row.document,
            offset=hit.start(),
            snippet=crunch(text[start:stop]),
        )
def doc_to_fields(doc_id, radius=100):

    """
    Link a document to each subfield that is mentioned in its text.

    Args:
        doc_id (int): Looked up through `Document_Text.document`.
        radius (int): Width of the context kept before and after a match.
    """

    doc_text = Document_Text.get(Document_Text.document == doc_id)
    body = doc_text.text
    body_length = len(body)

    # Try every known subfield against the document text.
    for subfield in Subfield.select():

        found = subfield.search(body)

        if found:

            # Snippet bounds: `radius` characters of context on each side,
            # limited to the extent of the text.
            left = max(found.start() - radius, 0)
            right = min(found.end() + radius, body_length)
            context = body[left:right]

            Subfield_Document.create(
                subfield=subfield,
                document=doc_text.document,
                offset=found.start(),
                snippet=crunch(context),
            )
def test_unique_pairs(add_subfield, add_doc):

    """
    A second link for an already-linked subfield -> document pair must
    raise an IntegrityError.
    """

    pair = dict(subfield=add_subfield(), document=add_doc())

    # The first link for the pair is created normally.
    Subfield_Document.create(offset=1, snippet='abc', **pair)

    # Same pair again, with a different offset and snippet, is expected
    # to be refused.
    with pytest.raises(IntegrityError):
        Subfield_Document.create(offset=2, snippet='def', **pair)
def _subfield_document(
    subfield=None,
    document=None,
    snippet='field',
    offset=100,
):

    """
    Create a subfield -> document link, filling in whichever side is
    missing.

    Args:
        subfield: Subfield to link; when falsy, `add_subfield()` supplies one.
        document: Document to link; when falsy, `add_doc()` supplies one.
        snippet (str)
        offset (int)

    Returns: Whatever `Subfield_Document.create` returns for the new link.
    """

    # Subfield is resolved before document, so a missing subfield is
    # created first.
    link = dict(
        subfield=subfield or add_subfield(),
        document=document or add_doc(),
        offset=offset,
        snippet=snippet,
    )

    return Subfield_Document.create(**link)
def _subfield_document(
    subfield=None,
    document=None,
    snippet='field',
    offset=100,
):

    """
    Build a `Subfield_Document` row, creating the subfield and / or the
    document on the fly when the caller does not pass them.

    Args:
        subfield: Defaults to a fresh `add_subfield()` when falsy.
        document: Defaults to a fresh `add_doc()` when falsy.
        snippet (str)
        offset (int)

    Returns: The result of `Subfield_Document.create`.
    """

    # Fall back to newly created rows for anything not provided.
    linked_subfield = subfield if subfield else add_subfield()
    linked_document = document if document else add_doc()

    return Subfield_Document.create(
        subfield=linked_subfield,
        document=linked_document,
        offset=offset,
        snippet=snippet,
    )