def test_matches(add_doc, add_text):

    """
    When documents match the query, write doc -> text rows.
    """

    wp1 = add_doc(content='War and Peace, Leo Tolstoy 1')
    wp2 = add_doc(content='War and Peace, Leo Tolstoy 2')
    wp3 = add_doc(content='War and Peace, Leo Tolstoy 3')

    ak1 = add_doc(content='Anna Karenina, Leo Tolstoy 1')
    ak2 = add_doc(content='Anna Karenina, Leo Tolstoy 2')

    Document_Index.es_insert()

    text = add_text(title='War and Peace', surname='Tolstoy')
    text_to_docs(text.id)

    # Should write 3 citation links.
    assert Citation.select().count() == 3

    # Should match "War and Peace," ignore "Anna Karenina".
    for doc in [wp1, wp2, wp3]:

        assert Citation.select().where(
            Citation.text==text,
            Citation.document==doc,
            Citation.tokens.contains(text.hash_tokens),
        )
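
# The add_doc / add_text factories above are pytest fixtures supplied by
# conftest; a minimal, hypothetical sketch of the shape these tests assume
# (the model names and any fields beyond those used here are guesses):

import pytest

@pytest.fixture
def add_doc():
    def _doc(content):
        # Assumed: a document row whose body is indexed by es_insert().
        return Document.create(content=content)
    return _doc

@pytest.fixture
def add_text():
    def _text(title, surname):
        return Text.create(title=title, surname=surname)
    return _text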
def fuzz(out_file, n):

    """
    Write N citation fuzz scores.
    """

    cols = ['fuzz', 'tokens']
    writer = csv.DictWriter(out_file, cols)
    writer.writeheader()

    # Draw N random ids. (np.random.random_integers is deprecated;
    # randint's upper bound is exclusive, hence the +1.)
    cids = np.random.randint(1, Citation.max_id() + 1, n)

    for cid in progress.bar(cids):

        try:

            c = Citation.get(Citation.id==cid)

            writer.writerow(dict(
                fuzz=c.fuzz,
                tokens=c.tokens,
            ))

        # Skip ids that fall in gaps left by deleted rows, rather than
        # swallowing every exception with a bare except.
        except Citation.DoesNotExist:
            pass
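
# Usage sketch (hypothetical wiring; in the source this is presumably
# driven by a CLI command that opens the output file):

if __name__ == '__main__':
    with open('fuzz.csv', 'w') as out_file:
        fuzz(out_file, 10000)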
    def add_edges(self, max_texts=20):

        """
        For each syllabus, register citation pairs as edges.

        Args:
            max_texts (int): Ignore documents with more than N citations.
        """

        text_ids = fn.array_agg(Text.id).coerce(False).alias('text_ids')

        docs = (
            Citation
            .select(Citation.document, text_ids)
            .join(Text)
            .where(Text.display==True)
            .where(Text.valid==True)
            .group_by(Citation.document)
            .having(fn.count(Text.id) <= max_texts)
        )

        for row in query_bar(docs):
            for tid1, tid2 in combinations(row.text_ids, 2):

                # If the edge exists, increment the weight.
                if self.graph.has_edge(tid1, tid2):
                    self.graph[tid1][tid2]['weight'] += 1

                # Otherwise, initialize the edge.
                else:
                    self.graph.add_edge(tid1, tid2, weight=1)
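
# A hedged illustration of the weighting rule above, assuming self.graph
# is a networkx.Graph: two syllabi that both assign texts 1 and 2 produce
# a single (1, 2) edge with weight 2.

import networkx as nx
from itertools import combinations

graph = nx.Graph()

for text_ids in [[1, 2, 3], [1, 2]]:
    for tid1, tid2 in combinations(text_ids, 2):
        if graph.has_edge(tid1, tid2):
            graph[tid1][tid2]['weight'] += 1
        else:
            graph.add_edge(tid1, tid2, weight=1)

assert graph[1][2]['weight'] == 2
assert graph[1][3]['weight'] == 1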
def test_no_matches(add_doc, add_text):

    """
    When no documents match, don't write any rows.
    """

    add_doc(content='War and Peace, Leo Tolstoy')
    Document_Index.es_insert()

    text = add_text(title='Master and Man', surname='Tolstoy')
    text_to_docs(text.id)

    # Shouldn't write any rows.
    assert Citation.select().count() == 0
    def _citation(
        text=None,
        document=None,
        tokens=None,
    ):

        # Avoid a mutable default argument for the token list.
        if tokens is None:
            tokens = ['one', 'two']

        if not text:
            text = add_text()

        if not document:
            document = add_doc()

        return Citation.create(
            text=text,
            document=document,
            tokens=tokens,
        )
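
# _citation reads like the inner factory of a pytest fixture; a minimal
# sketch of the enclosing wrapper (the fixture name matches the
# add_citation argument used in test_stream below, the rest is assumed):

import pytest

@pytest.fixture
def add_citation(add_text, add_doc):
    def _citation(text=None, document=None, tokens=None):
        ...
    return _citation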
    @classmethod
    def es_stream_docs(cls):

        """
        Stream Elasticsearch docs.

        Yields:
            dict: The next document.
        """

        query = (
            Citation.select()
            .join(Text)
            .where(Text.display==True)
            .where(Text.valid==True)
        )

        for row in query_bar(query):

            doc = {}

            # Local fields:

            doc['_id'] = row.id
            doc['text_id'] = row.text_id
            doc['document_id'] = row.document_id
            doc['corpus'] = row.text.corpus

            # Field references:

            subfield = row.subfield

            if subfield:
                doc['subfield_id'] = subfield.id
                doc['field_id'] = subfield.field_id

            # Institution reference:

            inst = row.institution

            if inst:
                doc['institution_id'] = inst.id
                doc['state'] = inst.state
                doc['country'] = inst.country

            yield doc
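
# es_stream_docs yields dicts keyed with _id, which is the action shape
# elasticsearch.helpers.bulk() accepts; a hedged sketch of the es_insert()
# wrapper the tests call, meant to sit on the same index class (the index
# and doc_type names are assumptions):

from elasticsearch.helpers import bulk

@classmethod
def es_insert(cls):
    bulk(
        config.es,
        cls.es_stream_docs(),
        index='citation',
        doc_type='citation',
    )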
def test_stream(add_citation):

    """
    BaseModel.stream() should generate all records in the table.
    """

    for i in range(100):
        add_citation()

    ids = []
    for row in Citation.stream(10):

        ids.append(row.id)

        # Possible to save, since we're not in a transaction.
        row.valid = False
        row.save()

    assert ids == list(range(1, 101))
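
# A hedged sketch of the stream() contract this test exercises: page
# through the table in id order, page_size rows at a time, re-querying
# between pages so nothing is held open in a transaction (which is why
# row.save() works mid-iteration). The actual implementation is assumed:

@classmethod
def stream(cls, page_size=100):
    last_id = 0
    while True:
        page = list(
            cls.select()
            .where(cls.id > last_id)
            .order_by(cls.id)
            .limit(page_size)
        )
        if not page:
            break
        for row in page:
            yield row
        last_id = page[-1].id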
def test_citation_formats(title, surname, content, add_doc, add_text):

    """
    Test title/author -> citation formats.
    """

    # Pad tokens around the match.
    padded = ('XXX '*1000) + content + (' XXX'*1000)

    doc = add_doc(content=padded)
    Document_Index.es_insert()

    text = add_text(title=title, surname=surname)
    text_to_docs(text.id)

    assert Citation.select().where(
        Citation.text==text,
        Citation.document==doc,
        Citation.tokens.contains(text.hash_tokens),
    )
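
# The (title, surname, content) parameters imply a pytest.mark.parametrize
# decorator sitting above test_citation_formats; hypothetical cases in the
# spirit of the fixtures used elsewhere in these tests:
#
#     @pytest.mark.parametrize('title,surname,content', [
#         ('War and Peace', 'Tolstoy', 'War and Peace, Leo Tolstoy'),
#         ('War and Peace', 'Tolstoy', 'Tolstoy, Leo. War and Peace.'),
#     ])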
def text_to_docs(text_id):

    """
    Query a text against the OSP corpus.

    Args:
        text_id (int): A text row id.
    """

    row = Text.get(Text.id==text_id)

    doc_ids = set()
    for tokens in row.queries:

        # Execute the query.
        results = config.es.search(
            index='document',
            request_timeout=90,
            body={
                'fields': [],
                'size': 1000000,
                'filter': {
                    'query': {
                        'match_phrase': {
                            'body': {
                                'query': ' '.join(tokens),
                                'slop': 5,
                            }
                        }
                    }
                }
            },
        )

        # Fail the job if the result is incomplete.
        if results['timed_out']:
            raise TimeoutError()

        # Register the doc ids.
        if results['hits']['total'] > 0:
            for hit in results['hits']['hits']:
                doc_ids.add(int(hit['_id']))

    # Build doc -> text links.
    citations = []
    for doc_id in doc_ids:

        citations.append({
            'document': doc_id,
            'text': row.id,
            'tokens': row.hash_tokens,
        })

    # Bulk-insert the results.
    if citations:
        Citation.insert_many(citations).execute()
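
# Usage sketch (hypothetical driver; in the source this is presumably
# queued as a background job per text):

for text in Text.select().where(Text.display==True):
    text_to_docs(text.id)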