def test_search_on_total_counts(params, add_text, add_citation):

    """
    If a search query is provided, filter the results on the query.
    """

    t1 = add_text(**params('match one'))
    t2 = add_text(**params('two'))
    t3 = add_text(**params('match three'))
    t4 = add_text(**params('four'))

    for i in range(4): add_citation(text=t1)
    for i in range(3): add_citation(text=t2)
    for i in range(2): add_citation(text=t3)
    for i in range(1): add_citation(text=t4)

    Text_Index.es_insert()

    texts = Text_Index.materialize_ranking(query='match')

    assert len(texts['hits']) == 2
    assert texts['hits'][0]['_id'] == str(t1.id)
    assert texts['hits'][1]['_id'] == str(t3.id)

def test_sort_on_filtered_counts(add_text, add_citation):

    """
    If a text -> count map is passed, sort on the filtered counts.
    """

    t1 = add_text()
    t2 = add_text()
    t3 = add_text()

    for i in range(30): add_citation(text=t1)
    for i in range(20): add_citation(text=t2)
    for i in range(10): add_citation(text=t3)

    Text_Index.es_insert()

    texts = Text_Index.materialize_ranking(ranks={
        t1.id: 1,
        t2.id: 2,
        t3.id: 3,
    })

    assert texts['hits'][0]['_id'] == str(t3.id)
    assert texts['hits'][1]['_id'] == str(t2.id)
    assert texts['hits'][2]['_id'] == str(t1.id)

def test_index_metadata(add_text, add_citation):

    """
    Text_Index.es_insert() should index texts.
    """

    text = add_text(
        corpus='corpus',
        identifier='identifier',
        url='url',
        title='title',
        authors=['author1', 'author2'],
        publisher='publisher',
        date='date',
        journal='journal',
    )

    # Cite the text.
    add_citation(text=text)

    Text_Index.es_insert()

    doc = config.es.get(
        index='text',
        id=text.id,
    )

    assert doc['_source']['corpus'] == text.corpus
    assert doc['_source']['identifier'] == text.identifier
    assert doc['_source']['url'] == text.url
    assert doc['_source']['title'] == text.pretty('title')
    assert doc['_source']['authors'] == text.pretty('authors')
    assert doc['_source']['publisher'] == text.pretty('publisher')
    assert doc['_source']['date'] == text.pretty('date')
    assert doc['_source']['journal'] == text.pretty('journal_title')

def test_search_filter(add_text, add_citation):

    """
    Free-text search query should be applied.
    """

    t1 = add_text(title='match one')
    t2 = add_text(title='two')
    t3 = add_text(title='match three')
    t4 = add_text(title='four')

    for i in range(4): add_citation(t1)
    for i in range(3): add_citation(t2)
    for i in range(2): add_citation(t3)
    for i in range(1): add_citation(t4)

    Citation_Index.es_insert()
    Text_Index.es_insert()

    texts = rank_texts(query='match')

    assert len(texts['hits']) == 2
    assert texts['hits'][0]['_id'] == str(t1.id)
    assert texts['hits'][1]['_id'] == str(t3.id)

def test_unfiltered(add_text, add_citation):

    """
    When no filters or query are passed, return the overall rankings.
    """

    t1 = add_text()
    t2 = add_text()
    t3 = add_text()

    for i in range(3): add_citation(t1)
    for i in range(2): add_citation(t2)
    for i in range(1): add_citation(t3)

    Citation_Index.es_insert()
    Text_Index.es_insert()

    texts = rank_texts()

    assert len(texts['hits']) == 3
    assert texts['hits'][0]['_id'] == str(t1.id)
    assert texts['hits'][1]['_id'] == str(t2.id)
    assert texts['hits'][2]['_id'] == str(t3.id)

def test_size(add_text, add_citation):

    """
    The 'size' argument should control the page length.
    """

    t1 = add_text()
    t2 = add_text()
    t3 = add_text()

    for i in range(3): add_citation(t1)
    for i in range(2): add_citation(t2)
    for i in range(1): add_citation(t3)

    Citation_Index.es_insert()
    Text_Index.es_insert()

    texts = rank_texts(size=2)

    assert len(texts['hits']) == 2
    assert texts['hits'][0]['_id'] == str(t1.id)
    assert texts['hits'][1]['_id'] == str(t2.id)

def test_metadata_filters(add_text, add_citation):

    """
    Citation metadata filters should be applied.
    """

    t1 = add_text(corpus='corpus1')
    t2 = add_text(corpus='corpus2')
    t3 = add_text(corpus='corpus1')
    t4 = add_text(corpus='corpus2')

    for i in range(4): add_citation(t1)
    for i in range(3): add_citation(t2)
    for i in range(2): add_citation(t3)
    for i in range(1): add_citation(t4)

    Citation_Index.es_insert()
    Text_Index.es_insert()

    texts = rank_texts(filters=dict(corpus='corpus2'))

    assert len(texts['hits']) == 2
    assert texts['hits'][0]['_id'] == str(t2.id)
    assert texts['hits'][1]['_id'] == str(t4.id)

def test_sort_on_total_counts_by_default(add_text, add_citation):

    """
    By default, return results sorted on the total citation count.
    """

    t1 = add_text()
    t2 = add_text()
    t3 = add_text()

    for i in range(3): add_citation(text=t1)
    for i in range(2): add_citation(text=t2)
    for i in range(1): add_citation(text=t3)

    Text_Index.es_insert()

    texts = Text_Index.materialize_ranking()

    assert texts['hits'][0]['_id'] == str(t1.id)
    assert texts['hits'][1]['_id'] == str(t2.id)
    assert texts['hits'][2]['_id'] == str(t3.id)

def test_size(add_text, add_doc, add_citation):

    """
    The 'size' argument should control the page length.
    """

    t1 = add_text()
    t2 = add_text()
    t3 = add_text()
    t4 = add_text()

    for i in range(3):
        doc = add_doc()
        add_citation(text=t1, document=doc)
        add_citation(text=t2, document=doc)

    for i in range(2):
        doc = add_doc()
        add_citation(text=t1, document=doc)
        add_citation(text=t3, document=doc)

    for i in range(1):
        doc = add_doc()
        add_citation(text=t1, document=doc)
        add_citation(text=t4, document=doc)

    Citation_Index.es_insert()
    Text_Index.es_insert()

    texts = assigned_with(t1.id, size=2)

    assert len(texts['hits']) == 2
    assert texts['hits'][0]['_id'] == str(t2.id)
    assert texts['hits'][1]['_id'] == str(t3.id)

def isbn_to_text(in_file, out_file):

    """
    Link ISBNs -> text rankings.
    """

    isbns = pickle.load(in_file)

    cols = ['isbn', 'title', 'author', 'count']

    writer = csv.DictWriter(out_file, cols)
    writer.writeheader()

    ranks = Text_Index.rank_texts()

    # Sort count DESC.
    ranks = sorted(
        ranks,
        key=lambda r: r['text'].count,
        reverse=True,
    )

    for i, text in enumerate(ranks):

        isbn = isbns.get(text['text'].identifier)

        writer.writerow(dict(
            isbn=isbn,
            title=text['text'].title,
            author=text['text'].authors[0],
            count=text['text'].count,
        ))

        if i % 10000 == 0:
            print(i)

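# A minimal usage sketch for isbn_to_text(), assuming the pickle maps text
# identifiers to ISBNs; the file paths here are hypothetical.
def write_isbn_ranking():

    with open('isbns.p', 'rb') as in_file, \
         open('isbn_ranking.csv', 'w') as out_file:

        isbn_to_text(in_file, out_file)
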
def rank_texts(filters={}, query=None, size=1000, page=1):

    """
    Filter and rank texts.

    Args:
        filters (dict): Citation metadata filters.
        query (str): A text metadata search query.
        size (int): Return N results.
        page (int): 1-indexed page offset.

    Returns:
        dict: Elasticsearch hits.
    """

    # Filter citation counts, if non-empty filters.
    if any(filters.values()):
        ranks = Citation_Index.compute_ranking(filters)

    else:
        ranks = None

    # Materialize the text metadata.
    texts = Text_Index.materialize_ranking(ranks, query, size, page)

    return texts

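# A minimal usage sketch for rank_texts(), assuming the citation and text
# indexes have already been populated via es_insert(); the 'corpus1' filter
# value and the 'match' query are illustrative only.
def example_rank_texts():

    # Overall ranking, first page.
    overall = rank_texts()

    # Corpus-filtered counts, narrowed by a title search, second page of 50.
    filtered = rank_texts(
        filters=dict(corpus='corpus1'),
        query='match',
        size=50,
        page=2,
    )

    return overall, filtered
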
def test_join_citation_count(add_text, add_citation):

    """
    Text_Index.rank_texts() should join the citation count for each text.
    """

    t1 = add_text()
    t2 = add_text()
    t3 = add_text()

    for i in range(3): add_citation(t1)
    for i in range(2): add_citation(t2)
    for i in range(1): add_citation(t3)

    texts = Text_Index.rank_texts()

    assert texts[0]['text'] == t1
    assert texts[0]['text'].count == 3

    assert texts[1]['text'] == t2
    assert texts[1]['text'].count == 2

    assert texts[2]['text'] == t3
    assert texts[2]['text'].count == 1

def assigned_with(text_id, size=200):

    """
    Given a "seed" text, rank other texts assigned on the same syllabi.

    Args:
        text_id (int): The text id.
        size (int): Return N results.

    Returns:
        dict: Elasticsearch hits.
    """

    # Get syllabi that assign the text.
    doc_ids = Citation_Index.docs_with_text(text_id)

    # Rank texts assigned by those syllabi.
    ranks = Citation_Index.compute_ranking(dict(
        document_id=doc_ids,
    ))

    # Omit the seed text.
    ranks.pop(str(text_id))

    # Materialize the text metadata.
    texts = Text_Index.materialize_ranking(ranks, size=size)

    return texts

def ranks(out_file, depth):

    """
    Write the top N text ranks.
    """

    cols = [
        'count',
        'title',
        'author',
    ]

    writer = csv.DictWriter(out_file, cols)
    writer.writeheader()

    ranks = Text_Index.rank_texts()
    ranks = sorted(ranks, key=lambda x: x['rank'])

    for r in ranks[:depth]:

        text = r['text']

        writer.writerow(dict(
            count=text.count,
            title=text.title,
            author=text.authors[0],
        ))

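# A minimal sketch of calling ranks(), assuming a writable CSV path; the
# output path and depth are illustrative only.
def write_top_ranks():

    with open('top_texts.csv', 'w') as out_file:
        ranks(out_file, depth=1000)
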
def assigned_with(text_id, size=1000):

    """
    Given a "seed" text, rank other texts assigned on the same syllabi.

    Args:
        text_id (int): The text id.
        size (int): Return N results.

    Returns:
        dict: Elasticsearch hits.
    """

    # Get syllabi that assign the text.
    doc_ids = Citation_Index.docs_with_text(text_id)

    # Rank texts assigned by those syllabi.
    ranks = Citation_Index.compute_ranking(dict(
        document_id=doc_ids,
    ))

    # Omit the seed text.
    ranks.pop(str(text_id))

    # Materialize the text metadata.
    texts = Text_Index.materialize_ranking(ranks, size=size)

    return texts

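# A minimal sketch of calling assigned_with(), assuming the citation and text
# indexes are already populated; the seed text id is illustrative only.
def example_assigned_with(seed_text_id=1):

    related = assigned_with(seed_text_id, size=10)

    # Ids of the texts most often co-assigned with the seed.
    return [hit['_id'] for hit in related['hits']]
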
def test_index_counts_and_ranks(add_text, add_citation):

    """
    Index total citation counts and ranks.
    """

    t1 = add_text()
    t2 = add_text()
    t3 = add_text()
    t4 = add_text()
    t5 = add_text()
    t6 = add_text()

    for i in range(9):
        add_citation(text=t1)

    for i in range(3):
        add_citation(text=t2)
        add_citation(text=t3)

    for i in range(1):
        add_citation(text=t4)
        add_citation(text=t5)
        add_citation(text=t6)

    Text_Index.es_insert()

    for t in [t1]:
        doc = config.es.get(index='text', id=t.id)
        assert doc['_source']['count'] == 9
        assert doc['_source']['rank'] == 1
        assert doc['_source']['score'] == 3/3

    for t in [t2, t3]:
        doc = config.es.get(index='text', id=t.id)
        assert doc['_source']['count'] == 3
        assert doc['_source']['rank'] == 2
        assert doc['_source']['score'] == 2/3

    for t in [t4, t5, t6]:
        doc = config.es.get(index='text', id=t.id)
        assert doc['_source']['count'] == 1
        assert doc['_source']['rank'] == 4
        assert doc['_source']['score'] == 1/3

def corpus_facets():

    """
    Materialize corpus facets with counts.

    Returns:
        dict: {label, value, count}
    """

    counts = Citation_Index.count_facets('corpus')

    return Text_Index.materialize_corpus_facets(counts)

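# A minimal sketch of consuming corpus_facets(), assuming it returns an
# iterable of {label, value, count} facet dicts as the docstring describes.
def print_corpus_facets():

    for facet in corpus_facets():
        print(facet['label'], facet['value'], facet['count'])
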
def test_paginate_results(add_text, add_citation):

    """
    When a page is provided, return the 1-indexed page.
    """

    # 9 texts - the first with 9 citations, the second with 8, etc.
    texts = []
    for i in reversed(range(1, 10)):

        text = add_text()

        for j in range(i):
            add_citation(text=text)

        texts.append(text)

    Text_Index.es_insert()

    # Get the first page by default.
    p1 = Text_Index.materialize_ranking(size=3)

    assert len(p1['hits']) == 3
    assert p1['hits'][0]['_id'] == str(texts[0].id)
    assert p1['hits'][1]['_id'] == str(texts[1].id)
    assert p1['hits'][2]['_id'] == str(texts[2].id)

    p2 = Text_Index.materialize_ranking(size=3, page=2)

    assert len(p2['hits']) == 3
    assert p2['hits'][0]['_id'] == str(texts[3].id)
    assert p2['hits'][1]['_id'] == str(texts[4].id)
    assert p2['hits'][2]['_id'] == str(texts[5].id)

    p3 = Text_Index.materialize_ranking(size=3, page=3)

    assert len(p3['hits']) == 3
    assert p3['hits'][0]['_id'] == str(texts[6].id)
    assert p3['hits'][1]['_id'] == str(texts[7].id)
    assert p3['hits'][2]['_id'] == str(texts[8].id)

def test_page(add_text, add_citation):

    """
    The 'page' argument should control the page offset.
    """

    t1 = add_text()
    t2 = add_text()
    t3 = add_text()
    t4 = add_text()

    for i in range(4): add_citation(t1)
    for i in range(3): add_citation(t2)
    for i in range(2): add_citation(t3)
    for i in range(1): add_citation(t4)

    Citation_Index.es_insert()
    Text_Index.es_insert()

    p1 = rank_texts(size=2, page=1)

    assert len(p1['hits']) == 2
    assert p1['hits'][0]['_id'] == str(t1.id)
    assert p1['hits'][1]['_id'] == str(t2.id)

    p2 = rank_texts(size=2, page=2)

    assert len(p2['hits']) == 2
    assert p2['hits'][0]['_id'] == str(t3.id)
    assert p2['hits'][1]['_id'] == str(t4.id)

def test_assigned_with(add_text, add_doc, add_citation):

    """
    Given a seed text, assigned_with() should pull a ranking for all texts
    that are co-assigned on a syllabus with the seed.
    """

    t1 = add_text()
    t2 = add_text()
    t3 = add_text()
    t4 = add_text()

    for i in range(3):
        doc = add_doc()
        add_citation(text=t1, document=doc)
        add_citation(text=t2, document=doc)

    for i in range(2):
        doc = add_doc()
        add_citation(text=t1, document=doc)
        add_citation(text=t3, document=doc)

    for i in range(1):
        doc = add_doc()
        add_citation(text=t1, document=doc)
        add_citation(text=t4, document=doc)

    Citation_Index.es_insert()
    Text_Index.es_insert()

    texts = assigned_with(t1.id)

    assert len(texts['hits']) == 3
    assert texts['hits'][0]['_id'] == str(t2.id)
    assert texts['hits'][1]['_id'] == str(t3.id)
    assert texts['hits'][2]['_id'] == str(t4.id)

def test_skip_uncited_texts(add_text, add_citation):

    """
    Texts without any citations should be excluded.
    """

    t1 = add_text()
    t2 = add_text()

    add_citation(text=t1)

    texts = Text_Index.rank_texts()

    assert texts == [
        dict(text=t1, rank=1, score=1),
        # Exclude t2.
    ]

def add_nodes(self):

    """
    Register displayed texts.
    """

    for t in progress.bar(Text_Index.rank_texts()):

        text = t['text']

        self.graph.add_node(text.id, dict(
            label=text.pretty('title'),
            author=text.pretty('surname'),
            count=text.count,
            score=t['score'],
        ))

def test_only_consider_displayed_texts(add_text, add_citation):

    """
    Only rank texts that have been marked for display.
    """

    t1 = add_text(display=None)
    t2 = add_text(display=False)
    t3 = add_text(display=True)

    add_citation(text=t1)
    add_citation(text=t2)
    add_citation(text=t3)

    texts = Text_Index.rank_texts()

    assert texts == [
        dict(text=t3, rank=1, score=1),
    ]

def test_only_consider_valid_texts(add_text, add_citation):

    """
    Only rank texts that have passed validation.
    """

    t1 = add_text(valid=None)
    t2 = add_text(valid=False)
    t3 = add_text(valid=True)

    add_citation(text=t1)
    add_citation(text=t2)
    add_citation(text=t3)

    texts = Text_Index.rank_texts()

    assert texts == [
        dict(text=t3, rank=1, score=1),
    ]

def test_compute_metrics(add_text, add_citation):

    """
    Zip ranks and scores with the texts.
    """

    t1 = add_text()
    t2 = add_text()
    t3 = add_text()
    t4 = add_text()
    t5 = add_text()
    t6 = add_text()

    for i in range(9):
        add_citation(text=t1)

    for i in range(3):
        add_citation(text=t2)
        add_citation(text=t3)

    for i in range(1):
        add_citation(text=t4)
        add_citation(text=t5)
        add_citation(text=t6)

    texts = Text_Index.rank_texts()

    assert texts == [
        dict(text=t1, rank=1, score=3/3),
        dict(text=t2, rank=2, score=2/3),
        dict(text=t3, rank=2, score=2/3),
        dict(text=t4, rank=4, score=1/3),
        dict(text=t5, rank=4, score=1/3),
        dict(text=t6, rank=4, score=1/3),
    ]