def ranks(out_file, depth):

    """
    Write the best-ranked texts to a CSV file.

    Rows come from Text_Index.rank_texts(), ordered by ascending 'rank';
    only the first `depth` of them are written, after a header row.

    Args:
        out_file: Writable file-like object handed to csv.DictWriter.
        depth: Upper slice bound on the number of rows written.
    """

    fieldnames = ['count', 'title', 'author']

    csv_out = csv.DictWriter(out_file, fieldnames)
    csv_out.writeheader()

    # Lowest rank value first.
    ordered = list(Text_Index.rank_texts())
    ordered.sort(key=lambda entry: entry['rank'])

    for entry in ordered[:depth]:

        text = entry['text']

        # Only the first listed author is exported.
        row = {
            'count': text.count,
            'title': text.title,
            'author': text.authors[0],
        }

        csv_out.writerow(row)
def test_join_citation_count(add_text, add_citation):

    """
    Each entry from Text_Index.rank_texts() should carry its text with the
    citation count available as `.count`.
    """

    first, second, third = add_text(), add_text(), add_text()

    # 3, 2 and 1 citations respectively.
    for _ in range(3):
        add_citation(first)

    for _ in range(2):
        add_citation(second)

    add_citation(third)

    ranked = Text_Index.rank_texts()

    expected = [
        (first, 3),
        (second, 2),
        (third, 1),
    ]

    # Index explicitly so a too-short result still raises.
    for position, (text, count) in enumerate(expected):
        assert ranked[position]['text'] == text
        assert ranked[position]['text'].count == count
def isbn_to_text(in_file, out_file):

    """
    Write a CSV that pairs every ranked text with an ISBN.

    A pickled mapping is read from `in_file` and queried with each text's
    `identifier`; texts with no entry get None in the isbn column. Rows
    are written in descending order of citation count, and the running
    row index is printed every 10,000 rows.

    Args:
        in_file: Readable binary file-like object holding the pickle.
        out_file: Writable file-like object handed to csv.DictWriter.
    """

    # NOTE(review): unpickling can execute arbitrary code - confirm that
    # in_file always comes from a trusted source.
    isbn_by_identifier = pickle.load(in_file)

    fieldnames = ['isbn', 'title', 'author', 'count']

    csv_out = csv.DictWriter(out_file, fieldnames)
    csv_out.writeheader()

    # Most-cited texts first.
    by_count = list(Text_Index.rank_texts())
    by_count.sort(key=lambda entry: entry['text'].count, reverse=True)

    for index, entry in enumerate(by_count):

        text = entry['text']

        row = {
            'isbn': isbn_by_identifier.get(text.identifier),
            'title': text.title,
            'author': text.authors[0],
            'count': text.count,
        }

        csv_out.writerow(row)

        # Coarse progress indicator.
        if not index % 10000:
            print(index)
def test_skip_uncited_texts(add_text, add_citation):

    """
    A text that has no citations should not appear in the ranking.
    """

    cited = add_text()
    uncited = add_text()  # Created, but never cited.

    add_citation(text=cited)

    ranked = Text_Index.rank_texts()

    # Only the cited text is expected back.
    expected = [
        {'text': cited, 'rank': 1, 'score': 1},
    ]

    assert ranked == expected
def add_nodes(self):

    """
    Add one graph node for each entry of Text_Index.rank_texts().

    Nodes are keyed by the text id and carry the label, author, count and
    score attributes.
    """

    ranked = Text_Index.rank_texts()

    for entry in progress.bar(ranked):

        text = entry['text']
        node_id = text.id

        attrs = {
            'label': text.pretty('title'),
            'author': text.pretty('surname'),
            'count': text.count,
            'score': entry['score'],
        }

        self.graph.add_node(node_id, attrs)
def test_only_consider_displayed_texts(add_text, add_citation):

    """
    Texts are ranked only when their display flag is True.
    """

    unset = add_text(display=None)
    hidden = add_text(display=False)
    shown = add_text(display=True)

    # Every text gets a citation, so only the flag can exclude it.
    for text in (unset, hidden, shown):
        add_citation(text=text)

    ranked = Text_Index.rank_texts()

    expected = [
        {'text': shown, 'rank': 1, 'score': 1},
    ]

    assert ranked == expected
def test_only_consider_valid_texts(add_text, add_citation):

    """
    Texts are ranked only when their valid flag is True.
    """

    unchecked = add_text(valid=None)
    rejected = add_text(valid=False)
    accepted = add_text(valid=True)

    # Every text gets a citation, so only the flag can exclude it.
    for text in (unchecked, rejected, accepted):
        add_citation(text=text)

    ranked = Text_Index.rank_texts()

    expected = [
        {'text': accepted, 'rank': 1, 'score': 1},
    ]

    assert ranked == expected
def add_nodes(self):

    """
    Register every text returned by Text_Index.rank_texts() as a node.

    Each node is identified by the text id and annotated with a label,
    an author, the citation count and the ranking score.
    """

    for ranked in progress.bar(Text_Index.rank_texts()):

        text = ranked['text']
        node_id = text.id

        label = text.pretty('title')
        author = text.pretty('surname')

        attributes = dict(
            label=label,
            author=author,
            count=text.count,
            score=ranked['score'],
        )

        self.graph.add_node(node_id, attributes)
def test_compute_metrics(add_text, add_citation):

    """
    Every ranked entry should combine the text with its rank and score.
    """

    t1, t2, t3 = add_text(), add_text(), add_text()
    t4, t5, t6 = add_text(), add_text(), add_text()

    # 9 citations for the leader.
    for _ in range(9):
        add_citation(text=t1)

    # 3 citations each for the tied pair, interleaved.
    for _ in range(3):
        for tied in (t2, t3):
            add_citation(text=tied)

    # 1 citation each for the remaining three.
    for single in (t4, t5, t6):
        add_citation(text=single)

    ranked = Text_Index.rank_texts()

    expected = [
        {'text': t1, 'rank': 1, 'score': 3/3},
        {'text': t2, 'rank': 2, 'score': 2/3},
        {'text': t3, 'rank': 2, 'score': 2/3},
        {'text': t4, 'rank': 4, 'score': 1/3},
        {'text': t5, 'rank': 4, 'score': 1/3},
        {'text': t6, 'rank': 4, 'score': 1/3},
    ]

    assert ranked == expected
def test_compute_metrics(add_text, add_citation):

    """
    rank_texts() should return text / rank / score triples, with tied
    texts sharing both rank and score.
    """

    top = add_text()
    mid_a = add_text()
    mid_b = add_text()
    low_a = add_text()
    low_b = add_text()
    low_c = add_text()

    for _ in range(9):
        add_citation(text=top)

    # The middle pair is cited alternately, three times each.
    for _ in range(3):
        add_citation(text=mid_a)
        add_citation(text=mid_b)

    add_citation(text=low_a)
    add_citation(text=low_b)
    add_citation(text=low_c)

    ranked = Text_Index.rank_texts()

    # (text, rank, score numerator over 3)
    spec = [
        (top, 1, 3),
        (mid_a, 2, 2),
        (mid_b, 2, 2),
        (low_a, 4, 1),
        (low_b, 4, 1),
        (low_c, 4, 1),
    ]

    expected = [
        dict(text=text, rank=rank, score=numerator / 3)
        for text, rank, numerator in spec
    ]

    assert ranked == expected