def insert_records(page_size):

    """
    Write the records into the database.
    """

    HLOM_Record.insert_records(page_size)
def test_require_title_and_author(models, mock_hlom):

    """
    Records that don't have both a title and an author should be ignored.
    """

    # No author, no title:
    m1 = mock_hlom.add_marc(title='', author='')

    # Title, no author:
    m2 = mock_hlom.add_marc(title='War and Peace', author='')

    # Author, no title:
    m3 = mock_hlom.add_marc(title='', author='Leo Tolstoy')

    # Title and author:
    m4 = mock_hlom.add_marc(title='War and Peace', author='Leo Tolstoy')

    HLOM_Record.insert_records()

    # Should just insert 1 record.
    assert HLOM_Record.select().count() == 1

    # Should insert the record with title/author.
    assert HLOM_Record.get(
        HLOM_Record.control_number==m4.control_number()
    )
def hlom_index(requires_es):

    """
    Clear the HLOM index.
    """

    HLOM_Record.es_reset()
def queue_queries():

    """
    Queue citation extraction queries.
    """

    for record in ServerSide(HLOM_Record.select()):
        config.rq.enqueue(query, record.id)
def _hlom(*args, **kwargs):

    # Write a MARC record.
    marc = mock_hlom.add_marc(*args, **kwargs)

    # Create a `hlom_record` row.
    return HLOM_Record.create(
        control_number=marc.control_number(),
        record=marc.as_marc()
    )
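# A hedged usage sketch (not in the original module): how a test might use a
# factory fixture built from `_hlom` above. The fixture name `add_hlom` is an
# assumption for illustration only.
def test_example_factory_usage(models, add_hlom):

    # Create a single `hlom_record` row via the factory.
    row = add_hlom(title='War and Peace', author='Leo Tolstoy')

    # The factory should have written exactly one row,
    # with the raw MARC body stored on it.
    assert HLOM_Record.select().count() == 1
    assert row.record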
def test_insert_records(models, mock_hlom):

    """
    HLOM_Record.insert_records() should create a row for each MARC record.
    """

    records = []

    # 10 segments:
    for i in range(10):

        # 10 records in each:
        for j in range(10):

            marc = mock_hlom.add_marc(
                data_file=str(i),
                title='title',
                author='author'
            )

            records.append(marc)

    # Insert record rows.
    HLOM_Record.insert_records()

    # Should insert 100 records.
    assert HLOM_Record.select().count() == 100

    for marc in records:

        # Pop out the `hlom_record` row.
        row = HLOM_Record.get(
            HLOM_Record.control_number==marc.control_number()
        )

        # Should store the record body.
        assert row.marc.as_marc() == marc.as_marc()
def copy_records(cls, min_rank=1000):

    """
    Copy in cited records.

    Args:
        min_rank (int): The cutoff for "frequent" words.
    """

    cited = (
        HLOM_Record.select()
        .join(HLOM_Citation)
        .group_by(HLOM_Record.id)

        # Coalesce duplicates.
        .distinct([HLOM_Record.metadata['deduping_hash']])
        .order_by(
            HLOM_Record.metadata['deduping_hash'],
            HLOM_Record.id
        )
    )

    counts = Counts()

    for r in cited:

        t = termify(r.marc.title())
        a = termify(r.marc.author())

        # Title and author empty.
        if not t or not a:
            continue

        # Title and author repeat words.
        if set.intersection(t, a):
            continue

        # No focused words in title.
        if counts.max_rank(t) < min_rank:
            continue

        # No focused words in author.
        if counts.max_rank(a) < min_rank:
            continue

        cls.create(**r._data)
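# A hedged sketch of what `termify` might look like; the real helper is defined
# elsewhere in the codebase. It assumes the function lowercases the field,
# strips punctuation, and returns the tokens as a set, which is what the
# `set.intersection` and `counts.max_rank` checks above appear to expect.
import re

def termify_sketch(value):

    """
    Tokenize a MARC field into a set of lowercased terms.
    """

    if not value:
        return set()

    return set(re.findall(r'[a-z0-9]+', value.lower()))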
def deduplicate(self):

    """
    Remove duplicate nodes.
    """

    seen = set()

    for cn in bar(self.graph.nodes()):

        # Pop out the HLOM record.
        text = HLOM_Record.get(HLOM_Record.control_number==cn)

        # If the node is a duplicate, remove it.
        if text.hash in seen:
            self.graph.remove_node(cn)

        else:
            seen.add(text.hash)
def hydrate_nodes(self):

    """
    Hydrate node metadata.
    """

    for cn in bar(self.graph.nodes()):

        # Pop out the HLOM record.
        text = HLOM_Record.get(HLOM_Record.control_number==cn)

        # Prettify the title / author.
        title = prettify_field(text.marc.title())
        author = prettify_field(text.marc.author())
        publisher = prettify_field(text.marc.publisher())
        pubyear = prettify_field(text.marc.pubyear())

        self.graph.node[cn]['title'] = title or ''
        self.graph.node[cn]['author'] = author or ''
        self.graph.node[cn]['publisher'] = publisher or ''
        self.graph.node[cn]['pubyear'] = pubyear or ''
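# A hedged sketch of what `prettify_field` might do; the real helper lives
# elsewhere. It assumes MARC fields often end with ISBD punctuation (slashes,
# commas, periods) that should be trimmed before display, and that empty
# fields should fall through to the `or ''` defaults above.
def prettify_field_sketch(value):

    """
    Trim whitespace and trailing punctuation from a MARC field.
    """

    if not value:
        return None

    return value.strip().rstrip(' /,;:.')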
def query(id):

    """
    Query a MARC record against the OSP corpus.

    :param id: The hlom_record row id.
    """

    row = HLOM_Record.get(HLOM_Record.id==id)

    # Execute the query.
    results = config.es.search('osp', 'document', timeout=30, body={
        'fields': ['doc_id'],
        'size': 100000,
        'filter': {
            'query': {
                'match_phrase': {
                    'body': {
                        'query': row.query,
                        'slop': 50
                    }
                }
            }
        }
    })

    if results['hits']['total'] > 0:

        citations = []
        for hit in results['hits']['hits']:

            citations.append({
                'document': hit['fields']['doc_id'][0],
                'record': row.id
            })

        # Write the citation links.
        HLOM_Citation.insert_many(citations).execute()
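# A hedged usage sketch (not part of the original module): run the citation
# query for a single record directly and count the links it produced. It
# assumes the 'osp' Elasticsearch index is populated and that
# `HLOM_Citation.record` is a foreign key back to `hlom_record`.
def query_one_example(control_number):

    # Look up the row and run the query synchronously,
    # instead of enqueueing it with `config.rq`.
    row = HLOM_Record.get(HLOM_Record.control_number==control_number)
    query(row.id)

    # Count the citation links written for this record.
    return (
        HLOM_Citation
        .select()
        .where(HLOM_Citation.record==row.id)
        .count()
    )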