def test_deduplicate(add_text, add_citation):
    """
    Text.deduplicate() sets the `display` flag on every cited text.

    Texts 1-2 and 3-4 share a title and surname; text 5 is unique. The
    test expects the first text of each duplicate pair and the unique text
    to end up with `display` true, and the second of each pair with
    `display` false.
    """
    # (title, surname, expected `display` value after deduplication)
    cases = [
        ('one', 'two', True),
        ('one', 'two', False),
        ('three', 'four', True),
        ('three', 'four', False),
        ('five', 'six', True),
    ]

    # Create all of the texts first, then cite each one in creation order.
    texts = [
        add_text(title=title, surname=surname)
        for title, surname, _ in cases
    ]
    for text in texts:
        add_citation(text=text)

    Text.deduplicate()

    # Fetch every row again by id so the flags are read after the
    # deduplicate() call.
    reloaded = [Text.get(Text.id == text.id) for text in texts]

    for row, (_, _, expected) in zip(reloaded, cases):
        assert row.display == expected
def test_whitelist(add_text, add_citation):
    """
    Whitelisted texts should be exempt from the fuzziness cutoff.

    Three texts with default fields are cited and validated against the
    whitelist fixture. The first two are expected to be valid and the
    third invalid (presumably the fixture whitelists the first two ids -
    confirm against whitelist.yml).
    """
    # Create the three texts, then cite each of them in the same order.
    texts = [add_text() for _ in range(3)]
    for text in texts:
        add_citation(text=text)

    Text.validate(
        package='osp.test.citations.models.text',
        path='fixtures/validate/whitelist.yml',
    )

    # Fetch the rows again by id so `valid` is read after validation ran.
    reloaded = [Text.get(Text.id == text.id) for text in texts]

    expected_flags = (True, True, False)
    for row, expected in zip(reloaded, expected_flags):
        assert row.valid == expected
def test_deduplicate(add_text, add_citation):
    """
    Text.deduplicate() sets `display` flags for all cited texts.

    Two pairs of texts with identical title/surname plus one unique text
    are created and cited. After deduplication the expected `display`
    values, in creation order, are: True, False, True, False, True.
    """
    identities = [
        ("one", "two"),
        ("one", "two"),
        ("three", "four"),
        ("three", "four"),
        ("five", "six"),
    ]
    expected_display = [True, False, True, False, True]

    # All texts are created before any citation is added.
    created = []
    for title, surname in identities:
        created.append(add_text(title=title, surname=surname))

    for text in created:
        add_citation(text=text)

    Text.deduplicate()

    # Re-fetch each row by id before inspecting its flag.
    refreshed = []
    for text in created:
        refreshed.append(Text.get(Text.id == text.id))

    for position, row in enumerate(refreshed):
        assert row.display == expected_display[position]
def test_whitelist(add_text, add_citation):
    """
    Whitelisted texts should be exempt from the fuzziness cutoff.

    The test cites three default texts, runs Text.validate() with the
    whitelist fixture, and expects `valid` to be True, True, False in
    creation order (NOTE(review): which ids the fixture whitelists is not
    visible here - check whitelist.yml).
    """
    first, second, third = add_text(), add_text(), add_text()

    for text in (first, second, third):
        add_citation(text=text)

    validate_args = {
        'package': 'osp.test.citations.models.text',
        'path': 'fixtures/validate/whitelist.yml',
    }
    Text.validate(**validate_args)

    # Look each row up again by id so the flag is read after validation.
    outcomes = [
        (Text.get(Text.id == first.id), True),
        (Text.get(Text.id == second.id), True),
        (Text.get(Text.id == third.id), False),
    ]

    for row, should_be_valid in outcomes:
        assert row.valid == should_be_valid
def hydrate_nodes(self):
    """
    Load text metadata onto the nodes.

    For every node id in `self.graph`, the matching Text row is fetched
    and its prettified `authors` and `title` values are stored as node
    attributes under the same keys.
    """
    for node_id in progress.bar(self.graph.nodes()):

        # Node ids are used directly as Text row ids.
        record = Text.get(Text.id == node_id)

        # Copy the fields in the same order as before: authors, then title.
        for field in ('authors', 'title'):
            self.graph.node[node_id][field] = record.pretty(field)
def test_validate(fields, add_text, add_citation):
    """
    A cited text created from `fields` should be flagged as invalid after
    Text.validate() runs with the validate.yml fixture.
    """
    created = add_text(**fields)
    add_citation(text=created)

    fixture = dict(
        package='osp.test.citations.models.text',
        path='fixtures/validate/validate.yml',
    )
    Text.validate(**fixture)

    # Fetch the row again by id so `valid` is read after validation ran.
    reloaded = Text.get(Text.id == created.id)

    # Explicit comparison: a `valid` of None would fail here, whereas a
    # bare `not reloaded.valid` would accept it.
    assert reloaded.valid == False
def test_validate(fields, add_text, add_citation):
    """
    Text.validate() should mark a cited text built from `fields` as not
    valid when the validate.yml fixture is applied.
    """
    # Arrange: one text with the supplied fields, cited once.
    subject = add_text(**fields)
    add_citation(text=subject)

    # Act: run validation against the packaged fixture.
    validate_kwargs = {
        'package': 'osp.test.citations.models.text',
        'path': 'fixtures/validate/validate.yml',
    }
    Text.validate(**validate_kwargs)

    # Assert: look the row up again by id and check the flag.
    is_valid = Text.get(Text.id == subject.id).valid
    assert is_valid == False
def text_to_docs(text_id):
    """
    Query a text against the OSP corpus.

    Runs one Elasticsearch phrase query per entry in the text's `queries`,
    collects the ids of all matching documents (de-duplicated across
    queries), and bulk-inserts one citation row per matched document.

    Args:
        text_id (int): A text row id.

    Raises:
        TimeoutError: If Elasticsearch reports that a search timed out, so
            that the job fails rather than storing an incomplete result.
    """
    row = Text.get(Text.id == text_id)

    doc_ids = set()
    for tokens in row.queries:
        phrase = ' '.join(tokens)

        # Execute the query.
        results = config.es.search(
            index='document',
            request_timeout=90,
            body={
                # Only the `_id` of each hit is read below.
                'fields': [],
                # Ask for up to 1,000,000 hits in a single response.
                'size': 1000000,
                'filter': {
                    'query': {
                        'match_phrase': {
                            'body': {
                                'query': phrase,
                                'slop': 5,
                            }
                        }
                    }
                }
            }
        )

        # Fail the job if the result is incomplete. The message names the
        # text and phrase so a failed job can be traced to its query.
        if results['timed_out']:
            raise TimeoutError(
                'Elasticsearch search timed out for text {0} '
                '(query: {1!r})'.format(text_id, phrase)
            )

        # Register the doc ids.
        if results['hits']['total'] > 0:
            doc_ids.update(
                int(hit['_id']) for hit in results['hits']['hits']
            )

    # Build doc -> text links.
    citations = [
        {
            'document': doc_id,
            'text': row.id,
            'tokens': row.hash_tokens,
        }
        for doc_id in doc_ids
    ]

    # Bulk-insert the results; nothing is written when no document matched.
    if citations:
        Citation.insert_many(citations).execute()