def test_state_facets(add_institution, add_citation):
    """
    state_facets() should return label/value/count dicts, one per state,
    ordered from the most-cited state to the least.
    """
    california = add_institution(state='CA')
    alabama = add_institution(state='AL')
    massachusetts = add_institution(state='MA')

    # Attach 3, 2 and 1 cited documents to the three institutions.
    plan = ((california, 3), (alabama, 2), (massachusetts, 1))
    for institution, n_docs in plan:
        for _ in range(n_docs):
            citation = add_citation()
            Institution_Document.create(
                institution=institution,
                document=citation.document,
            )

    Citation_Index.es_insert()

    facets = state_facets()

    assert facets == [
        {'label': 'California', 'value': california.state, 'count': 3},
        {'label': 'Alabama', 'value': alabama.state, 'count': 2},
        {'label': 'Massachusetts', 'value': massachusetts.state, 'count': 1},
    ]
def test_country_facets(add_institution, add_citation):
    """
    country_facets() should return label/value/count dicts, one per country,
    ordered from the most-cited country to the least.
    """
    australia = add_institution(country='AU')
    canada = add_institution(country='CA')
    new_zealand = add_institution(country='NZ')

    # Attach 3, 2 and 1 cited documents to the three institutions.
    plan = ((australia, 3), (canada, 2), (new_zealand, 1))
    for institution, n_docs in plan:
        for _ in range(n_docs):
            citation = add_citation()
            Institution_Document.create(
                institution=institution,
                document=citation.document,
            )

    Citation_Index.es_insert()

    facets = country_facets()

    assert facets == [
        {'label': 'Australia', 'value': australia.country, 'count': 3},
        {'label': 'Canada', 'value': canada.country, 'count': 2},
        {'label': 'New Zealand', 'value': new_zealand.country, 'count': 1},
    ]
def test_index_institution_refs(add_citation, add_institution):
    """
    A citation whose document is linked to an institution should be indexed
    with that institution's id, state and country.
    """
    citation = add_citation()
    institution = add_institution(state='CA', country='US')

    # Link inst -> citation.
    Institution_Document.create(
        institution=institution,
        document=citation.document,
    )

    Citation_Index.es_insert()

    indexed = config.es.get(index='citation', id=citation.id)

    assert indexed['_source']['institution_id'] == institution.id
    assert indexed['_source']['state'] == 'CA'
    assert indexed['_source']['country'] == 'US'
def test_size(add_text, add_doc, add_citation):
    """
    The 'size' argument of assigned_with() should cap the number of hits.
    """
    seed, t2, t3, t4 = (add_text() for _ in range(4))

    # Co-assign the seed with t2 on 3 documents, t3 on 2, and t4 on 1.
    for companion, n_docs in ((t2, 3), (t3, 2), (t4, 1)):
        for _ in range(n_docs):
            doc = add_doc()
            add_citation(text=seed, document=doc)
            add_citation(text=companion, document=doc)

    Citation_Index.es_insert()
    Text_Index.es_insert()

    texts = assigned_with(seed.id, size=2)
    hits = texts['hits']

    assert len(hits) == 2
    assert hits[0]['_id'] == str(t2.id)
    assert hits[1]['_id'] == str(t3.id)
def test_count_facets(add_citation, add_subfield, add_subfield_document):
    """
    Citation_Index.count_facets() should return (value, count) tuples for
    the requested field, highest count first.
    """
    sf1, sf2, sf3 = (add_subfield() for _ in range(3))

    # Tag 3, 2 and 1 cited documents with the three subfields.
    for subfield, n_docs in ((sf1, 3), (sf2, 2), (sf3, 1)):
        for _ in range(n_docs):
            citation = add_citation()
            add_subfield_document(
                subfield=subfield,
                document=citation.document,
            )

    Citation_Index.es_insert()

    counts = Citation_Index.count_facets('subfield_id')

    assert counts == [(sf1.id, 3), (sf2.id, 2), (sf3.id, 1)]
def test_search_filter(add_text, add_citation):
    """
    A free-text query should restrict the ranking to matching texts.
    """
    t1 = add_text(title='match one')
    t2 = add_text(title='two')
    t3 = add_text(title='match three')
    t4 = add_text(title='four')

    # Cite the texts 4, 3, 2 and 1 times respectively.
    for text, n_citations in ((t1, 4), (t2, 3), (t3, 2), (t4, 1)):
        for _ in range(n_citations):
            add_citation(text)

    Citation_Index.es_insert()
    Text_Index.es_insert()

    texts = rank_texts(query='match')
    hits = texts['hits']

    assert len(hits) == 2
    assert hits[0]['_id'] == str(t1.id)
    assert hits[1]['_id'] == str(t3.id)
def test_unfiltered(add_text, add_citation):
    """
    With neither filters nor a query, rank_texts() should return the overall
    ranking.
    """
    t1, t2, t3 = (add_text() for _ in range(3))

    # Cite the texts 3, 2 and 1 times respectively.
    for text, n_citations in ((t1, 3), (t2, 2), (t3, 1)):
        for _ in range(n_citations):
            add_citation(text)

    Citation_Index.es_insert()
    Text_Index.es_insert()

    texts = rank_texts()
    hits = texts['hits']

    assert len(hits) == 3
    assert hits[0]['_id'] == str(t1.id)
    assert hits[1]['_id'] == str(t2.id)
    assert hits[2]['_id'] == str(t3.id)
def test_subfield_facets(add_citation, add_subfield, add_subfield_document):
    """
    subfield_facets() should return label/value/count dicts, one per
    subfield, ordered from the most-cited subfield to the least.
    """
    sf1 = add_subfield(name="Subfield 1")
    sf2 = add_subfield(name="Subfield 2")
    sf3 = add_subfield(name="Subfield 3")

    # Tag 3, 2 and 1 cited documents with the three subfields.
    for subfield, n_docs in ((sf1, 3), (sf2, 2), (sf3, 1)):
        for _ in range(n_docs):
            citation = add_citation()
            add_subfield_document(
                subfield=subfield,
                document=citation.document,
            )

    Citation_Index.es_insert()
    Subfield_Index.es_insert()

    facets = subfield_facets()

    assert facets == [
        {"label": "Subfield 1", "value": sf1.id, "count": 3},
        {"label": "Subfield 2", "value": sf2.id, "count": 2},
        {"label": "Subfield 3", "value": sf3.id, "count": 1},
    ]
def test_metadata_filters(add_text, add_citation):
    """
    rank_texts() should apply citation metadata filters to the ranking.
    """
    t1 = add_text(corpus='corpus1')
    t2 = add_text(corpus='corpus2')
    t3 = add_text(corpus='corpus1')
    t4 = add_text(corpus='corpus2')

    # Cite the texts 4, 3, 2 and 1 times respectively.
    for text, n_citations in ((t1, 4), (t2, 3), (t3, 2), (t4, 1)):
        for _ in range(n_citations):
            add_citation(text)

    Citation_Index.es_insert()
    Text_Index.es_insert()

    texts = rank_texts(filters={'corpus': 'corpus2'})
    hits = texts['hits']

    assert len(hits) == 2
    assert hits[0]['_id'] == str(t2.id)
    assert hits[1]['_id'] == str(t4.id)
def assigned_with(text_id, size=200):
    """
    Given a "seed" text, rank other texts assigned on the same syllabi.

    Args:
        text_id (int): The text id.
        size (int): Page length passed through to
            `Text_Index.materialize_ranking`.

    Returns:
        dict: Elasticsearch hits.
    """
    # Get syllabi that assign the text.
    doc_ids = Citation_Index.docs_with_text(text_id)

    # Rank texts assigned by those syllabi.
    ranks = Citation_Index.compute_ranking(dict(document_id=doc_ids))

    # Omit the seed text. The ranking is keyed by stringified text ids; pop
    # with a default so a seed that is absent from the ranking (e.g. a text
    # with no indexed citations) does not raise KeyError.
    ranks.pop(str(text_id), None)

    # Materialize the text metadata.
    return Text_Index.materialize_ranking(ranks, size=size)
def test_institution_facets(add_institution, add_citation):
    """
    institution_facets() should return label/value/count dicts, one per
    institution, ordered from the most-cited institution to the least.
    """
    i1 = add_institution(name='Institution 1')
    i2 = add_institution(name='Institution 2')
    i3 = add_institution(name='Institution 3')

    # Attach 3, 2 and 1 cited documents to the three institutions.
    for institution, n_docs in ((i1, 3), (i2, 2), (i3, 1)):
        for _ in range(n_docs):
            citation = add_citation()
            Institution_Document.create(
                institution=institution,
                document=citation.document,
            )

    Citation_Index.es_insert()
    Institution_Index.es_insert()

    facets = institution_facets()

    assert facets == [
        {'label': 'Institution 1', 'value': i1.id, 'count': 3},
        {'label': 'Institution 2', 'value': i2.id, 'count': 2},
        {'label': 'Institution 3', 'value': i3.id, 'count': 1},
    ]
def test_filter_corpus(add_text, add_citation):
    """
    compute_ranking() should filter on corpus as a keyword value.
    """
    t1 = add_text(corpus='corpus1')
    t2 = add_text(corpus='corpus1')
    t3 = add_text(corpus='corpus2')

    # Cite the texts 3, 2 and 1 times respectively.
    for text, n_citations in ((t1, 3), (t2, 2), (t3, 1)):
        for _ in range(n_citations):
            add_citation(text=text)

    Citation_Index.es_insert()

    ranks = Citation_Index.compute_ranking({'corpus': 'corpus1'})

    # Only the two `corpus1` texts are counted.
    assert ranks == {str(t1.id): 3, str(t2.id): 2}
def test_merge_included_facets(add_institution, add_citation):
    """
    Facets requested via `include` that are already present in the ranking
    should not be duplicated.
    """
    i1 = add_institution(name='Institution 1')
    i2 = add_institution(name='Institution 2')
    i3 = add_institution(name='Institution 3')

    # Attach 3, 2 and 1 cited documents to the three institutions.
    for institution, n_docs in ((i1, 3), (i2, 2), (i3, 1)):
        for _ in range(n_docs):
            citation = add_citation()
            Institution_Document.create(
                institution=institution,
                document=citation.document,
            )

    Citation_Index.es_insert()

    included_ids = [i2.id, i3.id]
    counts = Citation_Index.count_facets(
        'institution_id',
        include=included_ids,
    )

    # Dedupe 2 and 3.
    assert counts == [(i1.id, 3), (i2.id, 2), (i3.id, 1)]
def test_size(add_text, add_citation):
    """
    The 'size' argument of rank_texts() should cap the number of hits.
    """
    t1, t2, t3 = (add_text() for _ in range(3))

    # Cite the texts 3, 2 and 1 times respectively.
    for text, n_citations in ((t1, 3), (t2, 2), (t3, 1)):
        for _ in range(n_citations):
            add_citation(text)

    Citation_Index.es_insert()
    Text_Index.es_insert()

    texts = rank_texts(size=2)
    hits = texts['hits']

    assert len(hits) == 2
    assert hits[0]['_id'] == str(t1.id)
    assert hits[1]['_id'] == str(t2.id)
def assigned_with(text_id, size=1000):
    """
    Given a "seed" text, rank other texts assigned on the same syllabi.

    Args:
        text_id (int): The text id.
        size (int): Page length passed through to
            `Text_Index.materialize_ranking`.

    Returns:
        dict: Elasticsearch hits.
    """
    # Get syllabi that assign the text.
    doc_ids = Citation_Index.docs_with_text(text_id)

    # Rank texts assigned by those syllabi.
    ranks = Citation_Index.compute_ranking(dict(document_id=doc_ids))

    # Omit the seed text. The ranking is keyed by stringified text ids; pop
    # with a default so a seed that is absent from the ranking (e.g. a text
    # with no indexed citations) does not raise KeyError.
    ranks.pop(str(text_id), None)

    # Materialize the text metadata.
    return Text_Index.materialize_ranking(ranks, size=size)
def test_metadata_filters(add_text, add_citation):
    """
    rank_texts() should apply citation metadata filters to the ranking.
    """
    t1 = add_text(corpus='corpus1')
    t2 = add_text(corpus='corpus2')
    t3 = add_text(corpus='corpus1')
    t4 = add_text(corpus='corpus2')

    # Cite the texts 4, 3, 2 and 1 times respectively.
    citation_plan = ((t1, 4), (t2, 3), (t3, 2), (t4, 1))
    for text, n_citations in citation_plan:
        for _ in range(n_citations):
            add_citation(text)

    Citation_Index.es_insert()
    Text_Index.es_insert()

    texts = rank_texts(filters={'corpus': 'corpus2'})

    # Only the `corpus2` texts remain, still ordered by citation count.
    assert len(texts['hits']) == 2
    first, second = texts['hits'][0], texts['hits'][1]
    assert first['_id'] == str(t2.id)
    assert second['_id'] == str(t4.id)
def test_unfiltered(add_text, add_citation):
    """
    Without filters, compute_ranking() should return the total citation
    count for every text.
    """
    t1, t2, t3 = (add_text() for _ in range(3))

    # Cite the texts 3, 2 and 1 times respectively.
    for text, n_citations in ((t1, 3), (t2, 2), (t3, 1)):
        for _ in range(n_citations):
            add_citation(text=text)

    Citation_Index.es_insert()

    ranks = Citation_Index.compute_ranking()

    assert ranks == {str(t1.id): 3, str(t2.id): 2, str(t3.id): 1}
def test_filter_institution(add_text, add_citation, add_institution):
    """
    compute_ranking() should filter citations by institution id.
    """
    t1, t2, t3 = (add_text() for _ in range(3))
    i1, i2 = (add_institution() for _ in range(2))

    # t1 (x3) and t2 (x2) are cited at i1; t3 (x1) is cited at i2.
    for text, institution, n_docs in ((t1, i1, 3), (t2, i1, 2), (t3, i2, 1)):
        for _ in range(n_docs):
            citation = add_citation(text=text)
            Institution_Document.create(
                institution=institution,
                document=citation.document,
            )

    Citation_Index.es_insert()

    ranks = Citation_Index.compute_ranking({'institution_id': i1.id})

    assert ranks == {str(t1.id): 3, str(t2.id): 2}
def test_ignore_filters_with_empty_values(empty, add_text, add_citation):
    """
    A filter whose value is empty should have no effect on the ranking.
    """
    t1 = add_text(corpus='corpus1')
    t2 = add_text(corpus='corpus2')
    t3 = add_text(corpus='corpus3')

    # Cite the texts 3, 2 and 1 times respectively.
    for text, n_citations in ((t1, 3), (t2, 2), (t3, 1)):
        for _ in range(n_citations):
            add_citation(text=text)

    Citation_Index.es_insert()

    ranks = Citation_Index.compute_ranking({'corpus': empty})

    # Every text is still counted.
    assert ranks == {str(t1.id): 3, str(t2.id): 2, str(t3.id): 1}
def test_filter_country(add_text, add_citation, add_institution):
    """
    compute_ranking() should filter on country as a keyword value.
    """
    t1, t2, t3 = (add_text() for _ in range(3))

    usa = add_institution(country='USA')
    canada = add_institution(country='CAN')

    # t1 (x3) and t2 (x2) are cited in the USA; t3 (x1) in Canada.
    for text, institution, n_docs in ((t1, usa, 3), (t2, usa, 2), (t3, canada, 1)):
        for _ in range(n_docs):
            citation = add_citation(text=text)
            Institution_Document.create(
                institution=institution,
                document=citation.document,
            )

    Citation_Index.es_insert()

    ranks = Citation_Index.compute_ranking({'country': 'USA'})

    assert ranks == {str(t1.id): 3, str(t2.id): 2}
def test_filter_multiple_values(add_text, add_citation):
    """
    A list of values for one filter key should match citations carrying
    _any_ of those values.
    """
    t1 = add_text(corpus='corpus1')
    t2 = add_text(corpus='corpus2')
    t3 = add_text(corpus='corpus3')

    # Cite the texts 3, 2 and 1 times respectively.
    for text, n_citations in ((t1, 3), (t2, 2), (t3, 1)):
        for _ in range(n_citations):
            add_citation(text=text)

    Citation_Index.es_insert()

    wanted = ['corpus1', 'corpus3']
    ranks = Citation_Index.compute_ranking({'corpus': wanted})

    # Count both `corpus1` and `corpus3` citations.
    assert ranks == {str(t1.id): 3, str(t3.id): 1}
def test_filter_multiple_values(add_text, add_citation):
    """
    A list of values for one filter key should match citations carrying
    _any_ of those values.
    """
    t1 = add_text(corpus='corpus1')
    t2 = add_text(corpus='corpus2')
    t3 = add_text(corpus='corpus3')

    # Cite the texts 3, 2 and 1 times respectively.
    citation_plan = ((t1, 3), (t2, 2), (t3, 1))
    for text, n_citations in citation_plan:
        for _ in range(n_citations):
            add_citation(text=text)

    Citation_Index.es_insert()

    filters = {'corpus': ['corpus1', 'corpus3']}
    ranks = Citation_Index.compute_ranking(filters)

    # Count both `corpus1` and `corpus3` citations.
    assert ranks == {str(t1.id): 3, str(t3.id): 1}
def test_docs_with_text(add_text, add_doc, add_citation):
    """
    Citation_Index.docs_with_text() should return the ids of the documents
    that assign a given text.
    """
    t1, t2 = (add_text() for _ in range(2))
    d1, d2, d3, d4 = (add_doc() for _ in range(4))

    # t1 is assigned on d1 and d2; t2 on d3 and d4.
    for text, document in ((t1, d1), (t1, d2), (t2, d3), (t2, d4)):
        add_citation(text=text, document=document)

    Citation_Index.es_insert()

    doc_ids = Citation_Index.docs_with_text(t1.id)

    assert doc_ids == [d1.id, d2.id]
def test_ignore_filters_with_empty_values(empty, add_text, add_citation):
    """
    A filter whose value is empty should have no effect on the ranking.
    """
    t1 = add_text(corpus='corpus1')
    t2 = add_text(corpus='corpus2')
    t3 = add_text(corpus='corpus3')

    # Cite the texts 3, 2 and 1 times respectively.
    citation_plan = ((t1, 3), (t2, 2), (t3, 1))
    for text, n_citations in citation_plan:
        for _ in range(n_citations):
            add_citation(text=text)

    Citation_Index.es_insert()

    filters = {'corpus': empty}
    ranks = Citation_Index.compute_ranking(filters)

    # Every text is still counted.
    assert ranks == {str(t1.id): 3, str(t2.id): 2, str(t3.id): 1}
def test_filter_country(add_text, add_citation, add_institution):
    """
    compute_ranking() should filter on country as a keyword value.
    """
    t1, t2, t3 = (add_text() for _ in range(3))

    usa = add_institution(country='USA')
    canada = add_institution(country='CAN')

    # t1 (x3) and t2 (x2) are cited in the USA; t3 (x1) in Canada.
    link_plan = ((t1, usa, 3), (t2, usa, 2), (t3, canada, 1))
    for text, institution, n_docs in link_plan:
        for _ in range(n_docs):
            citation = add_citation(text=text)
            Institution_Document.create(
                institution=institution,
                document=citation.document,
            )

    Citation_Index.es_insert()

    filters = {'country': 'USA'}
    ranks = Citation_Index.compute_ranking(filters)

    assert ranks == {str(t1.id): 3, str(t2.id): 2}
def test_filter_corpus(add_text, add_citation):
    """
    compute_ranking() should filter on corpus as a keyword value.
    """
    t1 = add_text(corpus='corpus1')
    t2 = add_text(corpus='corpus1')
    t3 = add_text(corpus='corpus2')

    # Cite the texts 3, 2 and 1 times respectively.
    citation_plan = ((t1, 3), (t2, 2), (t3, 1))
    for text, n_citations in citation_plan:
        for _ in range(n_citations):
            add_citation(text=text)

    Citation_Index.es_insert()

    filters = {'corpus': 'corpus1'}
    ranks = Citation_Index.compute_ranking(filters)

    # Only the two `corpus1` texts are counted.
    assert ranks == {str(t1.id): 3, str(t2.id): 2}
def test_subfield_facets(add_citation, add_subfield, add_subfield_document):
    """
    subfield_facets() should return label/value/count dicts, one per
    subfield, ordered from the most-cited subfield to the least.
    """
    sf1 = add_subfield(name='Subfield 1')
    sf2 = add_subfield(name='Subfield 2')
    sf3 = add_subfield(name='Subfield 3')

    # Tag 3, 2 and 1 cited documents with the three subfields.
    tagging_plan = ((sf1, 3), (sf2, 2), (sf3, 1))
    for subfield, n_docs in tagging_plan:
        for _ in range(n_docs):
            citation = add_citation()
            add_subfield_document(
                subfield=subfield,
                document=citation.document,
            )

    Citation_Index.es_insert()
    Subfield_Index.es_insert()

    facets = subfield_facets()

    assert facets == [
        {'label': 'Subfield 1', 'value': sf1.id, 'count': 3},
        {'label': 'Subfield 2', 'value': sf2.id, 'count': 2},
        {'label': 'Subfield 3', 'value': sf3.id, 'count': 1},
    ]
def test_filter_institution(add_text, add_citation, add_institution):
    """
    compute_ranking() should filter citations by institution id.
    """
    t1, t2, t3 = (add_text() for _ in range(3))
    i1, i2 = (add_institution() for _ in range(2))

    # t1 (x3) and t2 (x2) are cited at i1; t3 (x1) is cited at i2.
    link_plan = ((t1, i1, 3), (t2, i1, 2), (t3, i2, 1))
    for text, institution, n_docs in link_plan:
        for _ in range(n_docs):
            citation = add_citation(text=text)
            Institution_Document.create(
                institution=institution,
                document=citation.document,
            )

    Citation_Index.es_insert()

    filters = {'institution_id': i1.id}
    ranks = Citation_Index.compute_ranking(filters)

    assert ranks == {str(t1.id): 3, str(t2.id): 2}
def test_merge_included(add_institution, add_citation):
    """
    Institution ids passed via `include` that fall outside the default page
    should be merged into the baseline ranking, without duplicating ids that
    are already on it.
    """
    i1 = add_institution(name='Institution 1')
    i2 = add_institution(name='Institution 2')
    i3 = add_institution(name='Institution 3')

    # Attach 3, 2 and 1 cited documents to the three institutions.
    for institution, n_docs in ((i1, 3), (i2, 2), (i3, 1)):
        for _ in range(n_docs):
            citation = add_citation()
            Institution_Document.create(
                institution=institution,
                document=citation.document,
            )

    Citation_Index.es_insert()
    Institution_Index.es_insert()

    included_ids = [i2.id, i3.id]
    facets = institution_facets(depth=2, include=included_ids)

    assert facets == [
        {'label': 'Institution 1', 'value': i1.id, 'count': 3},
        # Dedupe 2.
        {'label': 'Institution 2', 'value': i2.id, 'count': 2},
        # Append 3.
        {'label': 'Institution 3', 'value': i3.id, 'count': 1},
    ]
def rank_texts(filters=None, query=None, size=1000, page=1):
    """
    Filter and rank texts.

    Args:
        filters (dict): Citation metadata filters. `None` (the default) or a
            dict whose values are all empty means "no filtering".
        query (str): A text metadata search query.
        size (int): Return N results.
        page (int): 1-indexed page offset.

    Returns:
        dict: Elasticsearch hits.
    """
    # Filter citation counts only when at least one filter has a non-empty
    # value. `None` replaces the former mutable `{}` default and is treated
    # the same as an empty dict.
    if filters and any(filters.values()):
        ranks = Citation_Index.compute_ranking(filters)
    else:
        # No ranking is handed to the materializer when there are no filters.
        ranks = None

    # Materialize the text metadata.
    return Text_Index.materialize_ranking(ranks, query, size, page)
def test_only_index_citations_with_displayed_texts(add_text, add_citation):
    """
    Citations should be indexed only when their text is marked for display.
    """
    unset_text = add_text(display=None)
    hidden_text = add_text(display=False)
    shown_text = add_text(display=True)

    # Only the last citation points at a displayed text.
    add_citation(text=unset_text)
    add_citation(text=hidden_text)
    shown_citation = add_citation(text=shown_text)

    Citation_Index.es_insert()

    assert config.es.get(index='citation', id=shown_citation.id)
    assert Citation_Index.es_count() == 1
def test_append_included_facets(add_institution, add_citation):
    """
    "Included" facets whose counts place them below the baseline ranking
    should be appended to the bottom of the list.
    """
    i1 = add_institution(name='Institution 1')
    i2 = add_institution(name='Institution 2')
    i3 = add_institution(name='Institution 3')
    i4 = add_institution(name='Institution 4')

    # Attach 4, 3, 2 and 1 cited documents to the four institutions.
    for institution, n_docs in ((i1, 4), (i2, 3), (i3, 2), (i4, 1)):
        for _ in range(n_docs):
            citation = add_citation()
            Institution_Document.create(
                institution=institution,
                document=citation.document,
            )

    Citation_Index.es_insert()

    included_ids = [i3.id, i4.id]
    counts = Citation_Index.count_facets(
        'institution_id',
        include=included_ids,
        depth=2,
    )

    assert counts == [
        (i1.id, 4),
        (i2.id, 3),
        # Include 3 and 4.
        (i3.id, 2),
        (i4.id, 1),
    ]
def test_only_index_citations_with_valid_texts(add_text, add_citation):
    """
    Citations should be indexed only when their text has been validated.
    """
    unset_text = add_text(valid=None)
    invalid_text = add_text(valid=False)
    valid_text = add_text(valid=True)

    # Only the last citation points at a validated text.
    add_citation(text=unset_text)
    add_citation(text=invalid_text)
    valid_citation = add_citation(text=valid_text)

    Citation_Index.es_insert()

    assert config.es.get(index='citation', id=valid_citation.id)
    assert Citation_Index.es_count() == 1
def test_index_citation_fields(add_citation):
    """
    The indexed Elasticsearch document should carry the citation's text_id
    and document_id, plus the corpus of its text.
    """
    citation = add_citation()

    Citation_Index.es_insert()

    indexed = config.es.get(index='citation', id=citation.id)

    assert indexed['_source']['text_id'] == citation.text_id
    assert indexed['_source']['document_id'] == citation.document_id
    assert indexed['_source']['corpus'] == citation.text.corpus
def test_compute_scores(add_text, add_citation):
    """
    For each text, compute the ratio between the square roots of the text's
    assignment count and the max assignment count.
    """
    t1, t2, t3, t4, t5, t6 = (add_text() for _ in range(6))

    # t1 is cited 3 times, t2-t3 twice each, t4-t6 once each.
    for group, n_rounds in (((t1,), 3), ((t2, t3), 2), ((t4, t5, t6), 1)):
        for _ in range(n_rounds):
            for text in group:
                add_citation(text=text)

    Citation_Index.es_insert()

    scores = Citation_Index.compute_scores()

    max_root = np.sqrt(3)
    expected = {
        str(t1.id): np.sqrt(3) / max_root,
        str(t2.id): np.sqrt(2) / max_root,
        str(t3.id): np.sqrt(2) / max_root,
        str(t4.id): np.sqrt(1) / max_root,
        str(t5.id): np.sqrt(1) / max_root,
        str(t6.id): np.sqrt(1) / max_root,
    }

    # The set of scored texts must match exactly.
    assert sorted(scores) == sorted(expected)

    # Compare the floating-point scores with a tolerance: an implementation
    # computing e.g. sqrt(count / max) instead of sqrt(count) / sqrt(max) is
    # mathematically identical but may differ in the last bit.
    for text_id, score in expected.items():
        assert np.isclose(scores[text_id], score)
def field_facets():
    """
    Count citations per `field_id` and materialize the counts as facets.

    Returns:
        Facets in {label, value, count} form, as produced by
        `Field_Index.materialize_facets`.
    """
    return Field_Index.materialize_facets(
        Citation_Index.count_facets('field_id')
    )
def corpus_facets():
    """
    Count citations per `corpus` and materialize the counts as facets.

    Returns:
        Facets in {label, value, count} form, as produced by
        `Text_Index.materialize_corpus_facets`.
    """
    return Text_Index.materialize_corpus_facets(
        Citation_Index.count_facets('corpus')
    )
def country_facets():
    """
    Count citations per `country` and materialize the counts as facets.

    Returns:
        list: {label, value, count} dicts, one per country.
    """
    return Institution_Index.materialize_country_facets(
        Citation_Index.count_facets('country')
    )
def institution_facets(depth=None, include=None):
    """
    Materialize institution facets with counts.

    Args:
        depth (int): Optional. Forwarded to `Citation_Index.count_facets`
            as `depth` when given (presumably the length of the baseline
            ranking -- confirm against `count_facets`).
        include (list): Optional. Institution ids forwarded to
            `Citation_Index.count_facets` as `include` when given.

    Returns:
        list: {label, value, count} dicts, one per institution.
    """
    # Forward only the options the caller actually supplied, so that
    # `count_facets` keeps its own defaults otherwise.
    options = {}

    if depth is not None:
        options['depth'] = depth

    if include is not None:
        options['include'] = include

    counts = Citation_Index.count_facets('institution_id', **options)
    return Institution_Index.materialize_institution_facets(counts)
def country_facets():
    """
    Count citations per `country` and materialize the counts as facets.

    Returns:
        list: {label, value, count} dicts, one per country.
    """
    return Institution_Index.materialize_country_facets(
        Citation_Index.count_facets("country")
    )
def corpus_facets():
    """
    Count citations per `corpus` and materialize the counts as facets.

    Returns:
        Facets in {label, value, count} form, as produced by
        `Text_Index.materialize_corpus_facets`.
    """
    return Text_Index.materialize_corpus_facets(
        Citation_Index.count_facets("corpus")
    )
def field_facets():
    """
    Count citations per `field_id` and materialize the counts as facets.

    Returns:
        Facets in {label, value, count} form, as produced by
        `Field_Index.materialize_facets`.
    """
    return Field_Index.materialize_facets(
        Citation_Index.count_facets("field_id")
    )
def institution_facets(depth=None, include=None):
    """
    Materialize institution facets with counts.

    Args:
        depth (int): Optional. Forwarded to `Citation_Index.count_facets`
            as `depth` when given (presumably the length of the baseline
            ranking -- confirm against `count_facets`).
        include (list): Optional. Institution ids forwarded to
            `Citation_Index.count_facets` as `include` when given.

    Returns:
        list: {label, value, count} dicts, one per institution.
    """
    # Forward only the options the caller actually supplied, so that
    # `count_facets` keeps its own defaults otherwise.
    options = {}

    if depth is not None:
        options["depth"] = depth

    if include is not None:
        options["include"] = include

    counts = Citation_Index.count_facets("institution_id", **options)
    return Institution_Index.materialize_institution_facets(counts)
def test_filter_subfield(
    add_text,
    add_citation,
    add_subfield,
    add_subfield_document,
):
    """
    compute_ranking() should filter citations by subfield id.
    """
    t1, t2, t3 = (add_text() for _ in range(3))
    sf1, sf2 = (add_subfield() for _ in range(2))

    # t1 (x3) and t2 (x2) are cited under sf1; t3 (x1) under sf2.
    for text, subfield, n_docs in ((t1, sf1, 3), (t2, sf1, 2), (t3, sf2, 1)):
        for _ in range(n_docs):
            citation = add_citation(text=text)
            add_subfield_document(
                subfield=subfield,
                document=citation.document,
            )

    Citation_Index.es_insert()

    ranks = Citation_Index.compute_ranking({'subfield_id': sf1.id})

    assert ranks == {str(t1.id): 3, str(t2.id): 2}
def test_index_field_refs(add_citation, add_subfield, add_subfield_document):
    """
    A citation whose document is linked to a subfield should be indexed with
    both the subfield id and the parent field id.
    """
    citation = add_citation()
    subfield = add_subfield()

    # Link subfield -> citation.
    add_subfield_document(subfield=subfield, document=citation.document)

    Citation_Index.es_insert()

    indexed = config.es.get(index='citation', id=citation.id)

    assert indexed['_source']['subfield_id'] == subfield.id
    assert indexed['_source']['field_id'] == subfield.field_id