def test_clean_field_names():
    """
    Ingesting the fixture CSV should produce sanitized field / subfield names.
    """
    Subfield.ingest(
        'osp.test.fields.models.subfield',
        'fixtures/ingest/clean_field_names.csv',
    )

    # Each assertion relies on the truthiness of the select query itself
    # (presumably truthy only when at least one row matches -- confirm
    # against the ORM in use).
    field_query = Field.select().where(Field.name == 'Field1')
    assert field_query

    # Same check for every expected subfield name, in order.
    for clean_name in ('Subfield1', 'Subfield2', 'Subfield3'):
        subfield_query = Subfield.select().where(Subfield.name == clean_name)
        assert subfield_query
def test_clean_field_names():
    """
    After ingest, rows should exist under the cleaned field / subfield names.
    """
    Subfield.ingest(
        'osp.test.fields.models.subfield',
        'fixtures/ingest/clean_field_names.csv',
    )

    # (model, expected name) pairs, checked in this order.
    expected_rows = (
        (Field, 'Field1'),
        (Subfield, 'Subfield1'),
        (Subfield, 'Subfield2'),
        (Subfield, 'Subfield3'),
    )

    for model, clean_name in expected_rows:
        # The select query object itself is what gets asserted.
        assert model.select().where(model.name == clean_name)
def doc_to_fields(doc_id, radius=100):
    """
    Scan a document's text for every subfield and record each hit.

    Args:
        doc_id (int): Matched against `Document_Text.document`.
        radius (int): Characters of context kept on each side of a match.
    """
    doc_text = Document_Text.get(Document_Text.document == doc_id)
    text = doc_text.text

    for subfield in Subfield.select():
        hit = subfield.search(text)

        # Nothing found for this subfield -- move on to the next one.
        if not hit:
            continue

        # Clamp the snippet window to the bounds of the text.
        start = max(hit.start() - radius, 0)
        stop = min(hit.end() + radius, len(text))

        # Link subfield -> document, storing the match offset and context.
        Subfield_Document.create(
            subfield=subfield,
            document=doc_text.document,
            offset=hit.start(),
            snippet=crunch(text[start:stop]),
        )
def doc_to_fields(doc_id, radius=100):
    """
    Link a document to each subfield whose search matches its text.

    Args:
        doc_id (int): Matched against `Document_Text.document`.
        radius (int): Size of the context window on either side of a match.
    """
    record = Document_Text.get(Document_Text.document == doc_id)
    body = record.text

    for candidate in Subfield.select():
        found = candidate.search(body)
        if not found:
            continue

        # Context window around the match, clamped to the text bounds.
        lo = max(found.start() - radius, 0)
        hi = min(found.end() + radius, len(body))
        context = body[lo:hi]

        # One link row per matching subfield.
        link = {
            'subfield': candidate,
            'document': record.document,
            'offset': found.start(),
            'snippet': crunch(context),
        }
        Subfield_Document.create(**link)
def subfield(self):
    """
    Look up the subfield linked to this row's document at the lowest offset.

    Returns:
        Subfield: The first match in ascending offset order, if any.
    """
    # Subfields joined through their document links to the documents table,
    # restricted to the document referenced by this row.
    linked = (
        Subfield
        .select()
        .join(Subfield_Document)
        .join(Document)
        .where(Document.id == self.document)
    )

    # Earliest occurrence first.
    by_offset = linked.order_by(Subfield_Document.offset.asc())

    return by_offset.first()
def es_stream_docs(cls):
    """
    Stream subfield rows as index documents.

    Yields:
        dict: `_id` and `name` taken from the next subfield row.
    """
    subfields = query_bar(Subfield.select())

    for subfield in subfields:
        yield {
            '_id': subfield.id,
            'name': subfield.name,
        }
def test_es_insert():
    """
    Subfield_Index.es_insert() should load all fields into Elasticsearch.
    """
    Subfield.ingest()
    Subfield_Index.es_insert()

    # Every subfield row must be retrievable by id with a matching name.
    for subfield in Subfield.select():
        indexed = config.es.get(index='subfield', id=subfield.id)
        source = indexed['_source']
        assert source['name'] == subfield.name
def test_insert_rows():
    """
    Subfield.ingest() should load field and subfield rows.
    """
    Subfield.ingest(
        'osp.test.fields.models.subfield',
        'fixtures/ingest/insert_rows.csv',
    )

    assert Field.select().count() == 3
    assert Subfield.select().count() == 9

    # Expected layout: each field owns three consecutive subfields.
    expected = (
        ('Field1', ('Subfield1', 'Subfield2', 'Subfield3')),
        ('Field2', ('Subfield4', 'Subfield5', 'Subfield6')),
        ('Field3', ('Subfield7', 'Subfield8', 'Subfield9')),
    )

    # Fetch all fields first, then all subfields, before asserting anything.
    fields = {}
    for field_name, _ in expected:
        fields[field_name] = Field.get(Field.name == field_name)

    subfields = {}
    for _, subfield_names in expected:
        for subfield_name in subfield_names:
            subfields[subfield_name] = Subfield.get(
                Subfield.name == subfield_name
            )

    # Each subfield must point back at its parent field.
    for field_name, subfield_names in expected:
        for subfield_name in subfield_names:
            assert subfields[subfield_name].field == fields[field_name]
def test_insert_rows():
    """
    Ingesting the fixture should create 3 fields with 3 subfields each.
    """
    Subfield.ingest(
        'osp.test.fields.models.subfield',
        'fixtures/ingest/insert_rows.csv',
    )

    assert Field.select().count() == 3
    assert Subfield.select().count() == 9

    field_names = ('Field1', 'Field2', 'Field3')

    # Subfield name -> name of the field it should belong to, in row order.
    parent_of = (
        ('Subfield1', 'Field1'),
        ('Subfield2', 'Field1'),
        ('Subfield3', 'Field1'),
        ('Subfield4', 'Field2'),
        ('Subfield5', 'Field2'),
        ('Subfield6', 'Field2'),
        ('Subfield7', 'Field3'),
        ('Subfield8', 'Field3'),
        ('Subfield9', 'Field3'),
    )

    # Load every row up front: fields first, then subfields.
    field_rows = [Field.get(Field.name == name) for name in field_names]
    subfield_rows = [
        Subfield.get(Subfield.name == name) for name, _ in parent_of
    ]

    by_name = dict(zip(field_names, field_rows))

    # Verify the parent link of each subfield.
    for subfield_row, (_, parent_name) in zip(subfield_rows, parent_of):
        assert subfield_row.field == by_name[parent_name]