def test_whitelist(add_text, add_citation):
    """
    Texts named in the whitelist fixture stay valid after validation;
    the remaining cited text is marked invalid.
    """
    texts = [add_text() for _ in range(3)]

    for text in texts:
        add_citation(text=text)

    Text.validate(
        package='osp.test.citations.models.text',
        path='fixtures/validate/whitelist.yml',
    )

    # Re-read every row so the assertions see the persisted flags.
    reloaded = [Text.get(Text.id == text.id) for text in texts]

    # Expected `valid` flag for the first, second and third text.
    for row, expected in zip(reloaded, (True, True, False)):
        assert row.valid == expected
def test_whitelist(add_text, add_citation):
    """
    Validation against the whitelist fixture should leave the first two
    cited texts valid and flag the third as invalid.
    """
    first, second, third = add_text(), add_text(), add_text()

    for text in (first, second, third):
        add_citation(text=text)

    Text.validate(
        package='osp.test.citations.models.text',
        path='fixtures/validate/whitelist.yml',
    )

    # Fetch fresh copies of the rows before checking the flags.
    first = Text.get(Text.id == first.id)
    second = Text.get(Text.id == second.id)
    third = Text.get(Text.id == third.id)

    assert first.valid == True
    assert second.valid == True
    assert third.valid == False
def ingest_jstor():
    """
    Run the JSTOR ingest by delegating to `Text.ingest_jstor()`.
    """
    Text.ingest_jstor()
def ingest_hlom():
    """
    Run the HLOM ingest by delegating to `Text.ingest_hlom()`.
    """
    Text.ingest_hlom()
def test_require_title_and_author(title, author, mock_hlom):
    """
    HLOM records without a query-able title and author are skipped, so
    no text rows exist after the ingest.
    """
    # `title` / `author` are presumably parametrized outside this view.
    mock_hlom.add_marc(title=title, author=author)

    Text.ingest_hlom()

    row_count = Text.select().count()
    assert row_count == 0
def test_require_title_and_author(title, author, mock_jstor):
    """
    JSTOR articles without a query-able title and author are skipped,
    so no text rows exist after the ingest.
    """
    # `title` / `author` are presumably parametrized outside this view.
    mock_jstor.add_article(article_title=title, author=author)

    Text.ingest_jstor()

    row_count = Text.select().count()
    assert row_count == 0
def test_set_multiple_authors(mock_jstor):
    """
    Every (given, surname) author pair on the article should be stored
    as a "Surname, Given" string, in order.
    """
    author_pairs = [
        ('David W.', 'McClure'),
        ('Kara G.', 'Weisman'),
    ]
    mock_jstor.add_article(author=author_pairs)

    Text.ingest_jstor()

    row = Text.select().first()
    assert row.authors == [
        'McClure, David W.',
        'Weisman, Kara G.',
    ]
def test_load_multiple(mock_jstor):
    """
    Text.ingest_jstor() should create one row per mocked article.
    """
    # Register 100 mock articles.
    for _ in range(100):
        mock_jstor.add_article()

    Text.ingest_jstor()

    # One row per article.
    total = Text.select().count()
    assert total == 100
def test_load_multiple(mock_hlom):
    """
    Text.ingest_hlom() should create one row per mocked MARC record.
    """
    # Register 100 mock MARC records.
    for _ in range(100):
        mock_hlom.add_marc()

    Text.ingest_hlom()

    # One row per record.
    total = Text.select().count()
    assert total == 100
def test_validate(fields, add_text, add_citation):
    """
    A cited text built from `fields` should be flagged invalid once
    validated against the validate.yml fixture.
    """
    row = add_text(**fields)
    add_citation(text=row)

    Text.validate(
        package='osp.test.citations.models.text',
        path='fixtures/validate/validate.yml',
    )

    # Reload to observe the persisted flag.
    reloaded = Text.get(Text.id == row.id)
    assert reloaded.valid == False
def test_validate(fields, add_text, add_citation):
    """
    Validation with the validate.yml fixture should mark the cited text
    created from `fields` as invalid.
    """
    created = add_text(**fields)
    add_citation(text=created)

    validate_options = dict(
        package='osp.test.citations.models.text',
        path='fixtures/validate/validate.yml',
    )
    Text.validate(**validate_options)

    # Read the row back so the stored flag is what gets checked.
    stored = Text.get(Text.id == created.id)
    assert stored.valid == False
def rank_texts(cls):
    """
    Get total citation counts and ranks for texts.

    Only texts with `display` and `valid` set that are joined to
    citations are considered; rows are ordered by text id.

    Returns:
        list: One dict per text with keys `text` (the row), `rank`
        (1 is the most-cited; tied texts share a rank) and `score`
        (the dense rank divided by the highest dense rank). An empty
        list when no text matches.
    """
    count = fn.Count(Citation.id)

    query = (
        Text.select(Text, count)
        .join(Citation)
        .where(Text.display == True)
        .where(Text.valid == True)
        .group_by(Text.id)
        .order_by(Text.id)
        .naive()
    )

    # Materialize once so the counts and the returned rows are the
    # same objects in the same order.
    texts = list(query)

    # Nothing to rank: max() below would raise on an empty sequence.
    if not texts:
        return []

    counts = [t.count for t in texts]

    # Compute dense-rank ratios.
    dense_ranks = rankdata(counts, "dense")
    top_dense = max(dense_ranks)
    scores = [float(r / top_dense) for r in dense_ranks]

    # Compute overall ranks (#1 is most frequent).
    max_ranks = rankdata(counts, "max")
    top_max = max(max_ranks)
    ranks = [int(top_max - r + 1) for r in max_ranks]

    return [
        dict(text=text, rank=rank, score=score)
        for text, rank, score in zip(texts, ranks, scores)
    ]
def test_empty_field():
    """
    If the field is empty, return None.
    """
    text = Text(title=None)

    # Identity check: the contract is the None singleton itself, not
    # merely something that compares equal to it.
    assert text.pretty('title') is None
def test_string_field():
    """
    For a string field, Text#pretty() should return the same value as
    calling prettify() on the raw string.
    """
    raw_title = 'war and peace'
    text = Text(title=raw_title)

    expected = prettify('war and peace')
    assert text.pretty('title') == expected
def test_deduplicate(add_text, add_citation):
    """
    Text.deduplicate() sets `display` flags for all cited texts: the
    first text of each title/surname pair is shown, later ones hidden.
    """
    # (title, surname, expected display flag), in creation order.
    cases = [
        ("one", "two", True),
        ("one", "two", False),
        ("three", "four", True),
        ("three", "four", False),
        ("five", "six", True),
    ]

    texts = [
        add_text(title=title, surname=surname)
        for title, surname, _ in cases
    ]

    for text in texts:
        add_citation(text=text)

    Text.deduplicate()

    # Reload every row before checking the persisted flags.
    reloaded = [Text.get(Text.id == text.id) for text in texts]

    for row, (_, _, shown) in zip(reloaded, cases):
        assert row.display == shown
def test_deduplicate(add_text, add_citation):
    """
    Text.deduplicate() sets `display` flags for all cited texts; of two
    texts sharing a title and surname only the first stays displayed.
    """
    pairs = [
        ('one', 'two'),
        ('one', 'two'),
        ('three', 'four'),
        ('three', 'four'),
        ('five', 'six'),
    ]
    expected_display = [True, False, True, False, True]

    created = [add_text(title=t, surname=s) for t, s in pairs]

    for text in created:
        add_citation(text=text)

    Text.deduplicate()

    # Fetch fresh rows so the stored flags are what gets compared.
    fresh = [Text.get(Text.id == text.id) for text in created]

    for row, shown in zip(fresh, expected_display):
        assert row.display == shown
def test_surname_blacklisted(surname, blacklisted):
    """
    Text#surname_blacklisted() should report whether the surname is in
    the tokenized blacklist.
    """
    raw_blacklist = [
        'may',
        'world bank',
    ]

    # Kept as a lazy map, exactly what the method is handed today.
    surnames = map(tokenize_field, raw_blacklist)

    text = Text(surname=surname)

    result = text.surname_blacklisted(surnames)
    assert result == blacklisted
def test_title_blacklisted(title, blacklisted):
    """
    Text#title_blacklisted() should report whether the title is in the
    tokenized blacklist.
    """
    raw_blacklist = [
        'letter',
        'the white house',
    ]

    # Kept as a lazy map, exactly what the method is handed today.
    titles = map(tokenize_field, raw_blacklist)

    text = Text(title=title)

    result = text.title_blacklisted(titles)
    assert result == blacklisted
def hydrate_nodes(self):
    """
    Load text metadata onto the nodes.

    Each graph node key is used as a text id; the matching row's
    prettified `authors` and `title` are stored on the node.
    """
    for node_id in progress.bar(self.graph.nodes()):
        row = Text.get(Text.id == node_id)

        attrs = self.graph.node[node_id]
        attrs['authors'] = row.pretty('authors')
        attrs['title'] = row.pretty('title')
def test_array_field():
    """
    For an array field, Text#pretty() should return a list holding the
    prettified form of each element.
    """
    text = Text(authors=['david mcclure', 'joe karaganis'])

    expected = [
        prettify(name)
        for name in ('david mcclure', 'joe karaganis')
    ]

    assert text.pretty('authors') == expected
def test_select_cited(add_text, add_citation):
    """
    Text.select_cited() should yield only the texts that have at least
    one citation.
    """
    cited_a = add_text()
    cited_b = add_text()

    # Third text is created but never cited.
    add_text()

    for text in (cited_a, cited_b):
        add_citation(text=text)

    selected = list(Text.select_cited())
    assert selected == [cited_a, cited_b]
def test_page_cursor(add_text):
    """
    BaseModel.page_cursor() should generate record instances in an
    id-ordered "page", defined by a page count and 0-based index.
    """
    for _ in range(100):
        add_text()

    # Collect the ids from each of the 7 pages.
    pages = [
        [row.id for row in Text.page_cursor(7, index)]
        for index in range(7)
    ]

    # 7 pages:
    assert len(pages) == 7

    # Concatenated, the pages cover ids 1-100 in order:
    flattened = [row_id for page in pages for row_id in page]
    assert flattened == list(range(1, 101))
def _text(corpus='corpus', identifier=None, title='Title',
          surname='Surname', authors=['Author'], valid=True,
          display=True, **kwargs):
    """
    Create a text row, using placeholder values for omitted fields.

    Args:
        corpus (str): Corpus name.
        identifier: Row identifier; any falsy value is replaced with a
            fresh `uuid.uuid4()`.
        title (str): Title.
        surname (str): Surname.
        authors (list): Author names.
        valid (bool): The `valid` flag.
        display (bool): The `display` flag.
        **kwargs: Extra fields passed through to `Text.create()`.

    Returns:
        The value returned by `Text.create()`.
    """
    if not identifier:
        identifier = uuid.uuid4()

    # The default list is a single object shared by every call. Pass
    # the model a copy so mutating one row's authors can never leak
    # into the default (or into the caller's own list).
    if isinstance(authors, list):
        authors = list(authors)

    return Text.create(
        corpus=corpus,
        identifier=identifier,
        title=title,
        surname=surname,
        authors=authors,
        valid=valid,
        display=display,
        **kwargs
    )
def _text(
    corpus='corpus',
    identifier=None,
    title='Title',
    surname='Surname',
    authors=['Author'],
    valid=True,
    display=True,
    **kwargs
):
    """
    Create a text row, using placeholder values for omitted fields.

    Args:
        corpus (str): Corpus name.
        identifier: Row identifier; any falsy value is replaced with a
            fresh `uuid.uuid4()`.
        title (str): Title.
        surname (str): Surname.
        authors (list): Author names.
        valid (bool): The `valid` flag.
        display (bool): The `display` flag.
        **kwargs: Extra fields passed through to `Text.create()`.

    Returns:
        The value returned by `Text.create()`.
    """
    if not identifier:
        identifier = uuid.uuid4()

    # The default list is a single object shared by every call. Pass
    # the model a copy so mutating one row's authors can never leak
    # into the default (or into the caller's own list).
    if isinstance(authors, list):
        authors = list(authors)

    return Text.create(
        corpus=corpus,
        identifier=identifier,
        title=title,
        surname=surname,
        authors=authors,
        valid=valid,
        display=display,
        **kwargs
    )
def test_set_title(mock_jstor):
    """
    The article title should be stored as the text's `title`.
    """
    mock_jstor.add_article(article_title='Article Title')

    Text.ingest_jstor()

    row = Text.select().first()
    assert row.title == 'Article Title'
def test_set_identifier(mock_jstor):
    """
    The article id should be stored as the text's `identifier`.
    """
    mock_jstor.add_article(article_id='001')

    Text.ingest_jstor()

    row = Text.select().first()
    assert row.identifier == '001'
def test_set_journal_title(mock_jstor):
    """
    The journal title should be stored as the text's `journal_title`.
    """
    mock_jstor.add_article(journal_title='Critical Inquiry')

    Text.ingest_jstor()

    row = Text.select().first()
    assert row.journal_title == 'Critical Inquiry'
def test_set_issue_volume(mock_jstor):
    """
    The issue volume should be stored on the text as a string.
    """
    mock_jstor.add_article(issue_volume=200)

    Text.ingest_jstor()

    row = Text.select().first()
    assert row.issue_volume == '200'
def test_set_issue_number(mock_jstor):
    """
    The issue number should be stored on the text as a string.
    """
    mock_jstor.add_article(issue_number=10)

    Text.ingest_jstor()

    row = Text.select().first()
    assert row.issue_number == '10'
def test_set_corpus(mock_jstor):
    """
    Texts ingested from JSTOR should have the corpus 'jstor'.
    """
    mock_jstor.add_article()

    Text.ingest_jstor()

    row = Text.select().first()
    assert row.corpus == 'jstor'
def test_set_publisher(mock_jstor):
    """
    The publisher name should be stored as the text's `publisher`.
    """
    mock_jstor.add_article(publisher_name='Chicago Journals')

    Text.ingest_jstor()

    row = Text.select().first()
    assert row.publisher == 'Chicago Journals'
def test_set_surname(mock_jstor):
    """
    The author's surname should be stored as the text's `surname`.
    """
    author_pairs = [('David W.', 'McClure')]
    mock_jstor.add_article(author=author_pairs)

    Text.ingest_jstor()

    row = Text.select().first()
    assert row.surname == 'McClure'
def test_set_url(mock_jstor):
    """
    The article URL should be stored as the text's `url`.
    """
    mock_jstor.add_article(url='http://test.org')

    Text.ingest_jstor()

    row = Text.select().first()
    assert row.url == 'http://test.org'
def test_set_date(mock_jstor):
    """
    The publication year, month and day should be combined into a
    zero-padded 'YYYY-MM-DD' string on the text's `date`.
    """
    mock_jstor.add_article(
        pub_year=1987,
        pub_month=6,
        pub_day=25,
    )

    Text.ingest_jstor()

    row = Text.select().first()
    assert row.date == '1987-06-25'
def test_set_journal_identifier(mock_jstor):
    """
    The journal id should be stored as the text's
    `journal_identifier`.
    """
    mock_jstor.add_article(journal_id='criticalinquiry')

    Text.ingest_jstor()

    row = Text.select().first()
    assert row.journal_identifier == 'criticalinquiry'
def text_to_docs(text_id):
    """
    Query a text against the OSP corpus.

    Runs one phrase query per entry in the text's `queries`, gathers
    the distinct ids of the matching documents and bulk-inserts one
    citation row for each.

    Args:
        text_id (int): A text row id.

    Raises:
        TimeoutError: If any search reports that it timed out.
    """
    text = Text.get(Text.id == text_id)

    matched = set()

    for tokens in text.queries:

        phrase = {
            'query': ' '.join(tokens),
            'slop': 5,
        }

        # Execute the query.
        response = config.es.search(
            index='document',
            request_timeout=90,
            body={
                'fields': [],
                'size': 1000000,
                'filter': {
                    'query': {
                        'match_phrase': {
                            'body': phrase,
                        }
                    }
                }
            }
        )

        # Fail the job if the result is incomplete.
        if response['timed_out']:
            raise TimeoutError()

        # Register the doc ids.
        hits = response['hits']
        if hits['total'] > 0:
            matched.update(int(hit['_id']) for hit in hits['hits'])

    # Build doc -> text links.
    links = [
        {
            'document': doc_id,
            'text': text.id,
            'tokens': text.hash_tokens,
        }
        for doc_id in matched
    ]

    # Bulk-insert the results.
    if links:
        Citation.insert_many(links).execute()
def test_set_single_author(mock_jstor):
    """
    A single (given, surname) author pair should be stored as a
    one-element list holding the "Surname, Given" string.
    """
    author_pairs = [('David W.', 'McClure')]
    mock_jstor.add_article(author=author_pairs)

    Text.ingest_jstor()

    row = Text.select().first()
    assert row.authors == ['McClure, David W.']
def test_set_date(mock_hlom):
    """
    The MARC publication year should be stored as the text's `date`.
    """
    mock_hlom.add_marc(pubyear='1987')

    Text.ingest_hlom()

    row = Text.select().first()
    assert row.date == '1987'
def test_surname_is_toponym(surname, is_toponym):
    """
    Text#surname_is_toponym should match the expected flag for the
    given surname.
    """
    # `surname` / `is_toponym` are presumably parametrized outside
    # this view.
    candidate = Text(surname=surname)

    result = candidate.surname_is_toponym
    assert result == is_toponym
def test_set_pagination(mock_jstor):
    """
    The first and last page should be joined with a hyphen on the
    text's `pagination`.
    """
    mock_jstor.add_article(fpage=200, lpage=300)

    Text.ingest_jstor()

    row = Text.select().first()
    assert row.pagination == '200-300'