def test_require_title_and_author(models, mock_hlom):
    """
    Only records carrying both a title and an author should be inserted.
    """
    title, author = 'War and Peace', 'Leo Tolstoy'

    # Three incomplete records: neither field, title only, author only.
    mock_hlom.add_marc(title='', author='')
    mock_hlom.add_marc(title=title, author='')
    mock_hlom.add_marc(title='', author=author)

    # One complete record.
    complete = mock_hlom.add_marc(title=title, author=author)

    HLOM_Record.insert_records()

    # Exactly one row should have been written.
    assert HLOM_Record.select().count() == 1

    # And that row is the complete record.
    assert HLOM_Record.get(
        HLOM_Record.control_number == complete.control_number()
    )
def deduplicate(self):
    """
    Remove duplicate nodes.

    Each node id is looked up as an HLOM control number; two nodes are
    duplicates when their HLOM records share the same `hash`. The first
    node encountered for a given hash is kept and later ones are removed
    from the graph.
    """
    seen = set()

    # Iterate over a snapshot of the node ids: nodes are removed inside the
    # loop, and on networkx 2.x `nodes()` is a live view that must not be
    # mutated while it is being iterated.
    for cn in bar(list(self.graph.nodes())):

        # Pop out the HLOM record.
        text = HLOM_Record.get(HLOM_Record.control_number == cn)

        # If the node is a duplicate, remove it.
        if text.hash in seen:
            self.graph.remove_node(cn)
        else:
            seen.add(text.hash)
def hydrate_nodes(self):
    """
    Hydrate node metadata.

    For every node, look up the HLOM record whose control number is the
    node id and store prettified `title`, `author`, `publisher` and
    `pubyear` values on the node. Falsy values are stored as ''.
    """
    for control_number in bar(self.graph.nodes()):

        record = HLOM_Record.get(
            HLOM_Record.control_number == control_number
        )

        # Prettify every field up front, before touching the node.
        metadata = {
            'title': prettify_field(record.marc.title()),
            'author': prettify_field(record.marc.author()),
            'publisher': prettify_field(record.marc.publisher()),
            'pubyear': prettify_field(record.marc.pubyear()),
        }

        # Write the fields onto the node, defaulting falsy values to ''.
        attrs = self.graph.node[control_number]
        for key, value in metadata.items():
            attrs[key] = value or ''
def test_insert_records(models, mock_hlom):
    """
    HLOM_Record.insert_records() should create one row per MARC record.
    """
    # 10 segments (data files), 10 records in each.
    records = [
        mock_hlom.add_marc(
            data_file=str(segment),
            title='title',
            author='author'
        )
        for segment in range(10)
        for _ in range(10)
    ]

    # Insert record rows.
    HLOM_Record.insert_records()

    # Should insert 100 records.
    assert HLOM_Record.select().count() == 100

    for marc in records:

        # Find the row by control number.
        control_number = marc.control_number()
        row = HLOM_Record.get(HLOM_Record.control_number == control_number)

        # The stored body should match the source record.
        assert row.marc.as_marc() == marc.as_marc()
def query(id):
    """
    Query a MARC record against the OSP corpus and store the matches.

    The record's `query` value is run as a phrase match (slop 50) on the
    `body` field. When the search reports at least one hit, one citation
    row linking the hit's `doc_id` to this record is written per returned
    hit; otherwise nothing is written.

    :param id: The hlom_record row id.
    """
    record = HLOM_Record.get(HLOM_Record.id == id)

    phrase_match = {
        'match_phrase': {
            'body': {
                'query': record.query,
                'slop': 50
            }
        }
    }

    # Execute the query.
    response = config.es.search('osp', 'document', timeout=30, body={
        'fields': ['doc_id'],
        'size': 100000,
        'filter': {
            'query': phrase_match
        }
    })

    hits = response['hits']

    # Nothing matched, so there are no citation links to write.
    if not hits['total'] > 0:
        return

    citations = [
        {
            'document': hit['fields']['doc_id'][0],
            'record': record.id
        }
        for hit in hits['hits']
    ]

    # Write the citation links.
    HLOM_Citation.insert_many(citations).execute()