def test_matches(add_doc, add_text):

    """
    Each document that matches the text gets a citation row linking it to
    the text; documents that cite a different title get none.
    """

    # Three documents whose content contains the target title and surname.
    matching_docs = [
        add_doc(content='War and Peace, Leo Tolstoy 1'),
        add_doc(content='War and Peace, Leo Tolstoy 2'),
        add_doc(content='War and Peace, Leo Tolstoy 3'),
    ]

    # Two documents with the same surname but a different title. They are
    # only indexed; the assertions below never refer to them directly.
    add_doc(content='Anna Karenina, Leo Tolstoy 1')
    add_doc(content='Anna Karenina, Leo Tolstoy 2')

    Document_Index.es_insert()

    text = add_text(title='War and Peace', surname='Tolstoy')
    text_to_docs(text.id)

    # One citation per "War and Peace" document, nothing else.
    assert Citation.select().count() == 3

    for doc in matching_docs:

        # The link must point at this text and document, and carry the
        # text's hash tokens.
        link = Citation.select().where(
            Citation.text == text,
            Citation.document == doc,
            Citation.tokens.contains(text.hash_tokens),
        )

        assert link
def test_matches(add_doc, add_text):

    """
    Matching documents are linked to the text with one citation row each.
    """

    # Documents that should be matched by the query.
    first = add_doc(content='War and Peace, Leo Tolstoy 1')
    second = add_doc(content='War and Peace, Leo Tolstoy 2')
    third = add_doc(content='War and Peace, Leo Tolstoy 3')

    # Documents that should be ignored (different title). The returned rows
    # are not needed by any assertion below.
    add_doc(content='Anna Karenina, Leo Tolstoy 1')
    add_doc(content='Anna Karenina, Leo Tolstoy 2')

    Document_Index.es_insert()

    text = add_text(title='War and Peace', surname='Tolstoy')
    text_to_docs(text.id)

    # Only the three "War and Peace" documents produce rows.
    total = Citation.select().count()
    assert total == 3

    for doc in (first, second, third):

        # A row exists for this (text, document) pair with the text's
        # hash tokens.
        rows = Citation.select().where(
            Citation.text == text,
            Citation.document == doc,
            Citation.tokens.contains(text.hash_tokens),
        )

        assert rows
# Example no. 3
0
    def add_edges(self, max_texts=20):
        """
        Add a weighted edge for every pair of texts cited by one document.

        Each time a pair of text ids appears together in a document, the
        weight of the edge between them grows by one.

        Args:
            max_texts (int): Skip documents that cite more than N texts.
        """

        # Aggregate the cited text ids of each document into one array.
        cited_ids = fn.array_agg(Text.id).coerce(False).alias('text_ids')

        per_document = (
            Citation
            .select(Citation.document, cited_ids)
            .join(Text)
            .where(Text.display == True)
            .where(Text.valid == True)
            .group_by(Citation.document)
            .having(fn.count(Text.id) <= max_texts)
        )

        graph = self.graph

        for row in query_bar(per_document):
            for source, target in combinations(row.text_ids, 2):

                # First time this pair is seen: create the edge.
                if not graph.has_edge(source, target):
                    graph.add_edge(source, target, weight=1)
                    continue

                # Pair already connected: bump the weight.
                graph[source][target]['weight'] += 1
def test_no_matches(add_doc, add_text):

    """
    A text that none of the indexed documents match yields no citations.
    """

    # Index a single document that cites a different title.
    add_doc(content='War and Peace, Leo Tolstoy')
    Document_Index.es_insert()

    unmatched = add_text(title='Master and Man', surname='Tolstoy')
    text_to_docs(unmatched.id)

    # The citation table must stay empty.
    row_count = Citation.select().count()
    assert row_count == 0
def test_no_matches(add_doc, add_text):

    """
    No citation rows are written when the text matches no document.
    """

    # The only indexed document mentions another title.
    add_doc(content='War and Peace, Leo Tolstoy')
    Document_Index.es_insert()

    text = add_text(title='Master and Man', surname='Tolstoy')
    text_to_docs(text.id)

    # Nothing should have been linked.
    citations = Citation.select()
    assert citations.count() == 0
    def es_stream_docs(cls):

        """
        Generate one Elasticsearch document per citation whose text is
        flagged as both displayed and valid.

        Yields:
            dict: The next document.
        """

        citations = (
            Citation
            .select()
            .join(Text)
            .where(Text.display==True, Text.valid==True)
        )

        for citation in query_bar(citations):

            # Fields read straight off the citation row and its text.
            doc = {
                '_id': citation.id,
                'text_id': citation.text_id,
                'document_id': citation.document_id,
                'corpus': citation.text.corpus,
            }

            # Subfield / field ids, only when the row has a subfield.
            subfield = citation.subfield

            if subfield:
                doc.update(
                    subfield_id=subfield.id,
                    field_id=subfield.field_id,
                )

            # Institution id and location, only when the row has one.
            institution = citation.institution

            if institution:
                doc.update(
                    institution_id=institution.id,
                    state=institution.state,
                    country=institution.country,
                )

            yield doc
def test_citation_formats(title, surname, content, add_doc, add_text):

    """
    A citation embedded in a long run of filler tokens is still matched
    to the text with the given title and surname.
    """

    # Surround the citation with 1000 filler tokens on each side.
    leading = 'XXX ' * 1000
    trailing = ' XXX' * 1000

    doc = add_doc(content=leading + content + trailing)
    Document_Index.es_insert()

    text = add_text(title=title, surname=surname)
    text_to_docs(text.id)

    # The padded document must be linked to the text, with the text's
    # hash tokens recorded on the citation.
    link = Citation.select().where(
        Citation.text == text,
        Citation.document == doc,
        Citation.tokens.contains(text.hash_tokens),
    )

    assert link
    def es_stream_docs(cls):
        """
        Yield an Elasticsearch document for each citation of a text that is
        marked as displayed and valid.

        Yields:
            dict: The next document.
        """

        # Build the query step by step; each call returns a new query.
        query = Citation.select().join(Text)
        query = query.where(Text.display == True)
        query = query.where(Text.valid == True)

        for row in query_bar(query):

            # Local fields, taken from the citation row and its text.
            doc = {
                '_id': row.id,
                'text_id': row.text_id,
                'document_id': row.document_id,
                'corpus': row.text.corpus,
            }

            # Field references, added only when a subfield is present.
            subfield = row.subfield

            if subfield:
                doc.update(
                    subfield_id=subfield.id,
                    field_id=subfield.field_id,
                )

            # Institution reference, added only when one is present.
            inst = row.institution

            if inst:
                doc.update(
                    institution_id=inst.id,
                    state=inst.state,
                    country=inst.country,
                )

            yield doc