Esempio n. 1
0
def test_max_citations(models, add_hlom, add_doc):

    """
    Syllabi that cite more texts than the limit passed to `add_edges()`
    should not contribute any edges.
    """

    first, second, third = add_hlom(), add_hlom(), add_hlom()

    small = add_doc('syllabus1')
    large = add_doc('syllabus2')

    # `small` cites 2 texts, `large` cites 3.
    citations = (
        (small, (first, second)),
        (large, (first, second, third)),
    )

    for syllabus, cited in citations:
        for text in cited:
            HLOM_Citation.create(document=syllabus, record=text)

    network = Network()
    network.add_edges(2)

    # Only the pair from the 2-citation syllabus should be registered.
    edge = network.graph.edge[first.control_number][second.control_number]
    assert edge['weight'] == 1
    assert nx.number_of_edges(network.graph) == 1
Esempio n. 2
0
def test_matches(corpus_index, mock_hlom, add_doc, add_hlom):

    """
    When OSP documents match the query, write link rows.
    """

    # Documents that mention the queried title / author.
    d1 = add_doc('War and Peace, Leo Tolstoy 1')
    d2 = add_doc('War and Peace, Leo Tolstoy 2')
    d3 = add_doc('War and Peace, Leo Tolstoy 3')

    # Documents for a different title; these must not be linked.
    add_doc('Anna Karenina, Leo Tolstoy 1')
    add_doc('Anna Karenina, Leo Tolstoy 2')

    Document_Text.es_insert()

    record = add_hlom('War and Peace', 'Leo Tolstoy')
    query(record.id)

    # Should write 3 citation links.
    assert HLOM_Citation.select().count() == 3

    # Should match the right documents. Ask the database explicitly whether
    # a row exists instead of relying on the truthiness of a query object.
    for doc in [d1, d2, d3]:

        assert HLOM_Citation.select().where(
            HLOM_Citation.document == doc,
            HLOM_Citation.record == record
        ).exists()
Esempio n. 3
0
def test_unique_pairs(models, add_hlom, add_doc):

    """
    Creating a second citation for an existing (document, record) pair
    should raise an IntegrityError.
    """

    syllabus = add_doc()
    text = add_hlom()

    link = dict(document=syllabus, record=text)

    # The first link is accepted.
    HLOM_Citation.create(**link)

    # An identical second link is rejected.
    with pytest.raises(IntegrityError):
        HLOM_Citation.create(**link)
Esempio n. 4
0
def test_no_matches(corpus_index, add_doc, add_hlom):

    """
    A query that matches no documents should leave the citation table empty.
    """

    # Index a single document for an unrelated title.
    add_doc('War and Peace, Leo Tolstoy')
    Document_Text.es_insert()

    unmatched = add_hlom('Master and Man', 'Leo Tolstoy')
    query(unmatched.id)

    # No citation rows should have been written.
    written = HLOM_Citation.select().count()
    assert written == 0
Esempio n. 5
0
    def build(self):
        """
        Add one graph node for each distinct HLOM record that appears in
        the citation table.
        """

        # One row per distinct cited record.
        cited = HLOM_Citation.select(HLOM_Citation.record)
        cited = cited.distinct(HLOM_Citation.record)

        # Key each node by control number; attach title and author.
        for citation in ServerSide(cited):
            record = citation.record
            self.graph.add_node(
                record.control_number,
                title=record.title(),
                author=record.author()
            )
Esempio n. 6
0
def query(id):

    """
    Search the OSP document index for one HLOM record and write a citation
    row for every document hit.

    :param id: The hlom_record row id.
    """

    row = HLOM_Record.get(HLOM_Record.id==id)

    # Phrase-match the record's query string against the document bodies.
    phrase = {
        'query': row.query,
        'slop': 50
    }

    body = {
        'fields': ['doc_id'],
        'size': 100000,
        'filter': {
            'query': {
                'match_phrase': {
                    'body': phrase
                }
            }
        }
    }

    results = config.es.search('osp', 'document', timeout=30, body=body)
    hits = results['hits']

    if hits['total'] > 0:

        # One (document, record) link per hit.
        citations = [
            {'document': hit['fields']['doc_id'][0], 'record': row.id}
            for hit in hits['hits']
        ]

        # Write the citation links.
        HLOM_Citation.insert_many(citations).execute()
Esempio n. 7
0
def test_state_abbreviations(add_hlom, add_doc):

    """
    HLOM_Citation.index_institutions() should denormalize state
    abbreviations onto the citation rows.
    """

    t1 = add_hlom()
    t2 = add_hlom()
    t3 = add_hlom()

    s1 = add_doc('syllabus1')
    s2 = add_doc('syllabus2')
    s3 = add_doc('syllabus3')

    c1 = HLOM_Citation.create(document=s1, record=t1)
    c2 = HLOM_Citation.create(document=s2, record=t2)
    c3 = HLOM_Citation.create(document=s3, record=t3)

    # Create institutions with states.
    AL = Institution.create(metadata={'Institution_State': 'AL'})
    CT = Institution.create(metadata={'Institution_State': 'CT'})
    CA = Institution.create(metadata={'Institution_State': 'CA'})

    # Link documents -> institutions. The link must reference the syllabus
    # documents (s1..s3), not the HLOM records (t1..t3).
    Document_Institution.create(document=s1, institution=AL)
    Document_Institution.create(document=s2, institution=CT)
    Document_Institution.create(document=s3, institution=CA)

    HLOM_Citation.index_institutions()

    # Re-read the rows so the assertions see the indexed values.
    c1 = HLOM_Citation.reload(c1)
    c2 = HLOM_Citation.reload(c2)
    c3 = HLOM_Citation.reload(c3)

    assert c1.state == 'AL'
    assert c2.state == 'CT'
    assert c3.state == 'CA'
Esempio n. 8
0
def test_institution_ids(add_hlom, add_doc):

    """
    HLOM_Citation.index_institutions() should denormalize institution ids
    onto the citation rows.
    """

    t1 = add_hlom()
    t2 = add_hlom()
    t3 = add_hlom()

    s1 = add_doc('syllabus1')
    s2 = add_doc('syllabus2')
    s3 = add_doc('syllabus3')

    c1 = HLOM_Citation.create(document=s1, record=t1)
    c2 = HLOM_Citation.create(document=s2, record=t2)
    c3 = HLOM_Citation.create(document=s3, record=t3)

    i1 = Institution.create()
    i2 = Institution.create()
    i3 = Institution.create()

    # Link documents -> institutions. The link must reference the syllabus
    # documents (s1..s3), not the HLOM records (t1..t3).
    Document_Institution.create(document=s1, institution=i1)
    Document_Institution.create(document=s2, institution=i2)
    Document_Institution.create(document=s3, institution=i3)

    HLOM_Citation.index_institutions()

    # Re-read the rows so the assertions see the indexed values.
    c1 = HLOM_Citation.reload(c1)
    c2 = HLOM_Citation.reload(c2)
    c3 = HLOM_Citation.reload(c3)

    assert c1.institution == i1
    assert c2.institution == i2
    assert c3.institution == i3
Esempio n. 9
0
def test_add_edges(models, add_hlom, add_doc):

    """
    Network#add_edges() should turn co-citations in the citation table into
    weighted edges.
    """

    texts = [add_hlom() for _ in range(6)]

    syllabi = [
        add_doc('syllabus1'),
        add_doc('syllabus2'),
        add_doc('syllabus3'),
    ]

    # Each syllabus cites a sliding window of four texts:
    # 1-4 in the first, 2-5 in the second, 3-6 in the third.
    for start, syllabus in enumerate(syllabi):
        for text in texts[start:start + 4]:
            HLOM_Citation.create(document=syllabus, record=text)

    network = Network()
    network.add_edges()

    # Neighbouring texts share 1, 2, 3, 2 and 1 syllabi respectively.
    expected_weights = [1, 2, 3, 2, 1]
    neighbours = zip(texts, texts[1:])

    for (left, right), weight in zip(neighbours, expected_weights):
        edge = network.graph.edge[left.control_number][right.control_number]
        assert edge['weight'] == weight