def test_matches(add_doc, add_text):

    """
    When documents match the query, write doc -> text rows.
    """

    wp1 = add_doc(content='War and Peace, Leo Tolstoy 1')
    wp2 = add_doc(content='War and Peace, Leo Tolstoy 2')
    wp3 = add_doc(content='War and Peace, Leo Tolstoy 3')

    ak1 = add_doc(content='Anna Karenina, Leo Tolstoy 1')
    ak2 = add_doc(content='Anna Karenina, Leo Tolstoy 2')

    Document_Index.es_insert()

    text = add_text(title='War and Peace', surname='Tolstoy')

    text_to_docs(text.id)

    # Should write 3 citation links.
    assert Citation.select().count() == 3

    # Should match "War and Peace," ignore "Anna Karenina."
    for doc in [wp1, wp2, wp3]:
        assert Citation.select().where(
            Citation.text==text,
            Citation.document==doc,
            Citation.tokens.contains(text.hash_tokens),
        )
def fuzz(out_file, n):

    """
    Write N citation fuzz scores.
    """

    cols = ['fuzz', 'tokens']

    writer = csv.DictWriter(out_file, cols)
    writer.writeheader()

    # Draw N random ids. randint's upper bound is exclusive, so add 1 to
    # keep the range inclusive of the max id.
    cids = np.random.randint(1, Citation.max_id() + 1, n)

    for cid in progress.bar(cids):

        try:

            c = Citation.get(Citation.id==cid)

            writer.writerow(dict(
                fuzz=c.fuzz,
                tokens=c.tokens,
            ))

        # Skip ids with no row (gaps left by deleted citations).
        except Citation.DoesNotExist:
            pass
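# Citation.max_id() is used above but not defined in this excerpt. A minimal
# sketch of what such a helper might look like on the Citation model with
# peewee, assuming a plain auto-incrementing integer primary key (the gaps
# from deleted rows are why the DoesNotExist case is skipped in the loop):

@classmethod
def max_id(cls):

    """
    Return the highest citation id (assumed helper, for illustration).
    """

    return cls.select(fn.Max(cls.id)).scalar() or 0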
def add_edges(self, max_texts=20):

    """
    For each syllabus, register citation pairs as edges.

    Args:
        max_texts (int): Skip documents with more than N citations.
    """

    text_ids = (
        fn.array_agg(Text.id)
        .coerce(False)
        .alias('text_ids')
    )

    docs = (
        Citation
        .select(Citation.document, text_ids)
        .join(Text)
        .having(fn.count(Text.id) <= max_texts)
        .where(Text.display==True)
        .where(Text.valid==True)
        .group_by(Citation.document)
    )

    for row in query_bar(docs):
        for tid1, tid2 in combinations(row.text_ids, 2):

            # If the edge exists, increment the weight.
            if self.graph.has_edge(tid1, tid2):
                self.graph[tid1][tid2]['weight'] += 1

            # Otherwise, initialize the edge.
            else:
                self.graph.add_edge(tid1, tid2, weight=1)
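# A self-contained illustration of the edge-weighting pattern used in
# add_edges() above, with hard-coded text-id lists standing in for the
# aggregated query results. The ids and the bare nx.Graph() are illustrative
# only; the real class presumably manages self.graph itself.

import networkx as nx
from itertools import combinations

graph = nx.Graph()

for text_ids in [[1, 2, 3], [2, 3], [2, 3, 4]]:
    for tid1, tid2 in combinations(text_ids, 2):

        if graph.has_edge(tid1, tid2):
            graph[tid1][tid2]['weight'] += 1

        else:
            graph.add_edge(tid1, tid2, weight=1)

# Texts 2 and 3 co-occur on all three syllabi, so that edge's weight is 3.
assert graph[2][3]['weight'] == 3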
def test_no_matches(add_doc, add_text):

    """
    When no documents match, don't write any rows.
    """

    add_doc(content='War and Peace, Leo Tolstoy')
    Document_Index.es_insert()

    text = add_text(title='Master and Man', surname='Tolstoy')

    text_to_docs(text.id)

    # Shouldn't write any rows.
    assert Citation.select().count() == 0
def _citation(
    text=None,
    document=None,
    tokens=['one', 'two'],
):

    """
    Create a citation, backfilling a text and document when not provided.
    """

    if not text:
        text = add_text()

    if not document:
        document = add_doc()

    return Citation.create(
        text=text,
        document=document,
        tokens=tokens,
    )
def es_stream_docs(cls):

    """
    Stream Elasticsearch docs.

    Yields:
        dict: The next document.
    """

    query = (
        Citation
        .select()
        .join(Text)
        .where(Text.display==True)
        .where(Text.valid==True)
    )

    for row in query_bar(query):

        doc = {}

        # Local fields:
        doc['_id'] = row.id
        doc['text_id'] = row.text_id
        doc['document_id'] = row.document_id
        doc['corpus'] = row.text.corpus

        # Field references:
        subfield = row.subfield
        if subfield:
            doc['subfield_id'] = subfield.id
            doc['field_id'] = subfield.field_id

        # Institution reference:
        inst = row.institution
        if inst:
            doc['institution_id'] = inst.id
            doc['state'] = inst.state
            doc['country'] = inst.country

        yield doc
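# For illustration only: one way the generator above could be consumed to
# index the rows, using the elasticsearch-py bulk helper. The client, index
# name, chunk size, and the assumption that es_stream_docs() is exposed as a
# classmethod on Citation are all illustrative, not part of the original
# module.

from elasticsearch.helpers import bulk

def es_index_citations(es_client, index='citation'):

    """
    Bulk-index the streamed citation docs (hypothetical helper).
    """

    actions = (
        dict(doc, _index=index)
        for doc in Citation.es_stream_docs()
    )

    bulk(es_client, actions, chunk_size=1000)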
def test_stream(add_citation):

    """
    BaseModel.stream() should generate all records in the table.
    """

    for i in range(100):
        add_citation()

    ids = []
    for row in Citation.stream(10):

        ids.append(row.id)

        # Possible to save, since we're not in a transaction.
        row.valid = False
        row.save()

    assert ids == list(range(1, 101))
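# BaseModel.stream() is exercised above but not defined in this excerpt. A
# minimal sketch of the batching behavior the test implies, assuming integer
# primary keys and a keyset-style scan so that saving rows mid-iteration
# doesn't disturb the paging; the real implementation may differ.

@classmethod
def stream(cls, batch_size=1000):

    """
    Yield every row in id order, fetching batch_size rows at a time.
    """

    last_id = 0

    while True:

        batch = list(
            cls.select()
            .where(cls.id > last_id)
            .order_by(cls.id)
            .limit(batch_size)
        )

        if not batch:
            break

        for row in batch:
            yield row

        last_id = batch[-1].id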
def test_citation_formats(title, surname, content, add_doc, add_text):

    """
    Test title/author -> citation formats.
    """

    # Pad tokens around the match.
    padded = ('XXX '*1000) + content + (' XXX'*1000)

    doc = add_doc(content=padded)
    Document_Index.es_insert()

    text = add_text(title=title, surname=surname)

    text_to_docs(text.id)

    assert Citation.select().where(
        Citation.text==text,
        Citation.document==doc,
        Citation.tokens.contains(text.hash_tokens),
    )
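# test_citation_formats() takes title / surname / content as arguments, which
# suggests a pytest.mark.parametrize decorator that is not part of this
# excerpt. A hypothetical sketch of the decorator's shape; the citation
# strings below are illustrative, not the project's real case list.

import pytest

citation_format_cases = pytest.mark.parametrize('title,surname,content', [
    ('War and Peace', 'Tolstoy', 'War and Peace, Leo Tolstoy'),
    ('War and Peace', 'Tolstoy', 'Tolstoy, Leo. War and Peace.'),
    ('War and Peace', 'Tolstoy', 'War and Peace (Leo Tolstoy)'),
])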
def text_to_docs(text_id):

    """
    Query a text against the OSP corpus.

    Args:
        text_id (int): A text row id.
    """

    row = Text.get(Text.id==text_id)

    doc_ids = set()
    for tokens in row.queries:

        # Execute the query.
        results = config.es.search(
            index='document',
            request_timeout=90,
            body={
                'fields': [],
                'size': 1000000,
                'filter': {
                    'query': {
                        'match_phrase': {
                            'body': {
                                'query': ' '.join(tokens),
                                'slop': 5,
                            }
                        }
                    }
                }
            }
        )

        # Fail the job if the result is incomplete.
        if results['timed_out']:
            raise TimeoutError()

        # Register the doc ids.
        if results['hits']['total'] > 0:
            for hit in results['hits']['hits']:
                doc_ids.add(int(hit['_id']))

    # Build doc -> text links.
    citations = []
    for doc_id in doc_ids:
        citations.append({
            'document': doc_id,
            'text': row.id,
            'tokens': row.hash_tokens,
        })

    # Bulk-insert the results.
    if citations:
        Citation.insert_many(citations).execute()
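# An illustrative driver for text_to_docs(), not part of the original module:
# it walks every valid, displayed text and runs the matcher inline. In
# practice this kind of fan-out is usually pushed onto a task queue; the
# helper below is only a sketch.

def match_all_texts():

    """
    Run text_to_docs() for every queryable text (hypothetical helper).
    """

    query = (
        Text
        .select(Text.id)
        .where(Text.display==True)
        .where(Text.valid==True)
    )

    for text in query.iterator():
        text_to_docs(text.id)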