Ejemplo n.º 1
0
def insert():

    """
    Index documents by delegating to `Document_Text.es_insert()`.

    The result of that call is discarded; this function returns None.
    """

    Document_Text.es_insert()
Ejemplo n.º 2
0
def corpus_index(requires_es):

    """
    Clear the corpus index by calling `Document_Text.es_reset()`.

    Args:
        requires_es: Not used in the body; presumably requested only for
            its setup side effects -- TODO confirm against its definition.
    """

    Document_Text.es_reset()
Ejemplo n.º 3
0
def test_matches(corpus_index, mock_hlom, add_doc, add_hlom):

    """
    When OSP documents match the query, write link rows.
    """

    # Three documents carrying the title / author that is queried below.
    matching = [
        add_doc('War and Peace, Leo Tolstoy 1'),
        add_doc('War and Peace, Leo Tolstoy 2'),
        add_doc('War and Peace, Leo Tolstoy 3'),
    ]

    # Two documents with a different title. They are only added so the
    # index contains rows that should NOT be linked; the return values are
    # not needed.
    add_doc('Anna Karenina, Leo Tolstoy 1')
    add_doc('Anna Karenina, Leo Tolstoy 2')

    Document_Text.es_insert()

    record = add_hlom('War and Peace', 'Leo Tolstoy')
    query(record.id)

    # Should write 3 citation links.
    assert HLOM_Citation.select().count() == 3

    # Should match the right documents.
    for doc in matching:

        # Ask the database explicitly whether the link row exists instead of
        # relying on the truthiness of the query object.
        assert HLOM_Citation.select().where(
            HLOM_Citation.document==doc,
            HLOM_Citation.record==record
        ).exists()
Ejemplo n.º 4
0
def reset():

    """
    Reset the index by delegating to `Document_Text.es_reset()`.

    The result of that call is discarded; this function returns None.
    """

    Document_Text.es_reset()
Ejemplo n.º 5
0
def delete():

    """
    Delete the index by delegating to `Document_Text.es_delete()`.

    The result of that call is discarded; this function returns None.
    """

    Document_Text.es_delete()
Ejemplo n.º 6
0
def create():

    """
    Create the index by delegating to `Document_Text.es_create()`.

    The result of that call is discarded; this function returns None.
    """

    Document_Text.es_create()
Ejemplo n.º 7
0
def test_no_matches(corpus_index, add_doc, add_hlom):

    """
    A query that matches no indexed document should leave the citation
    table empty.
    """

    # Index a single document with one title ...
    add_doc('War and Peace, Leo Tolstoy')
    Document_Text.es_insert()

    # ... then query for a record with a different title.
    hlom_record = add_hlom('Master and Man', 'Leo Tolstoy')
    query(hlom_record.id)

    # No citation rows should have been written.
    citation_count = HLOM_Citation.select().count()
    assert citation_count == 0
Ejemplo n.º 8
0
def ext_semester(doc_id):

    """
    Look for the first "<semester> YY/YYYY" marker (e.g. "Fall 2012") in
    the text of a document and store it.

    Args:
        doc_id (int): The document id.

    Returns:
        The saved `Document_Date_Semester` row when a marker is found whose
        date has a year after 1980 and lies before the current time;
        otherwise None (nothing is saved).
    """

    text_row = Document_Text.get(Document_Text.document==doc_id)

    # Case-insensitive, verbose pattern: a semester name, then whitespace
    # and / or apostrophes, then a 4- or 2-digit year.
    semester_re = re.compile(r'''
        (?P<semester>fall|autumn|winter|spring|summer)
        [\s\']+
        (?P<year>\d{4}|\d{2})
    ''', re.I | re.X)

    found = semester_re.search(text_row.text)

    if not found:
        return None

    # Build the row first (unsaved) so that its `date` attribute can be
    # used to validate the match.
    candidate = Document_Date_Semester(
        document=doc_id,
        offset=found.start(),
        semester=found.group('semester'),
        year=found.group('year')
    )

    plausible = (
        candidate.date.year > 1980 and
        candidate.date < datetime.now()
    )

    if not plausible:
        return None

    candidate.save()
    return candidate
Ejemplo n.º 9
0
def count():

    """
    Print the document count reported by `Document_Text.es_count()`.
    """

    total = Document_Text.es_count()
    click.echo(total)
Ejemplo n.º 10
0
def test_es_doc(models):

    """
    `es_doc` on a Document_Text row should expose the document path as
    `_id`, the document id as `doc_id` and the row text as `body`.
    """

    document = Document.create(path='000/abc')
    text_row = Document_Text.create(document=document, text='text')

    assert text_row.es_doc['_id'] == '000/abc'
    assert text_row.es_doc['doc_id'] == document.id
    assert text_row.es_doc['body'] == 'text'
Ejemplo n.º 11
0
def test_es_insert(models, config, corpus_index):

    """
    Document_Text.es_insert() should index all rows in Elasticsearch.
    """

    # Create 10 documents, each with one text row. Path and text are both
    # set to str(i).
    for i in range(10):
        doc = Document.create(path=str(i))
        Document_Text.create(document=doc, text=str(i))

    Document_Text.es_insert()

    # Should insert 10 docs.
    assert Document_Text.es_count() == 10

    # For each text row:
    for t in Document_Text.select():

        # A document should exist in the 'osp' index under the document path.
        es_doc = config.es.get('osp', t.document.path)

        # Should index text / doc ID. The body is compared against the row's
        # own text rather than the path, so the check does not depend on the
        # two happening to hold the same value.
        assert es_doc['_source']['doc_id'] == t.document.id
        assert es_doc['_source']['body'] == t.text
Ejemplo n.º 12
0
def test_text_extraction_fails(models, mock_osp):

    """
    ext_text() should not write a `Document_Text` row for a file with empty
    content.
    """

    # Register an empty file and a document row pointing at it.
    empty_path = mock_osp.add_file(content="")
    empty_doc = Document.create(path=empty_path)

    ext_text(empty_doc.id)

    # The text table should still be empty.
    row_count = Document_Text.select().count()
    assert row_count == 0
Ejemplo n.º 13
0
def test_text_extraction_succeeds(models, mock_osp):

    """
    ext_text() should extract the text of a document and store it in a
    `Document_Text` row linked to that document.
    """

    # Register a file with known content and a document row for it.
    file_path = mock_osp.add_file(content="text")
    source_doc = Document.create(path=file_path)

    ext_text(source_doc.id)

    # Look up the row written for this document and check its text.
    text_row = Document_Text.get(Document_Text.document == source_doc)
    assert text_row.text == "text"
Ejemplo n.º 14
0
def ext_text(doc_id):

    """
    Write the document as plain text.

    Args:
        doc_id (int): The document id.

    Returns:
        The row returned by `Document_Text.create()`, or None when
        `doc.syllabus.text` is falsy (no row is written in that case).
    """

    doc = Document.get(Document.id==doc_id)

    # Read the text once and reuse it for both the check and the insert.
    # NOTE(review): `syllabus.text` looks like a computed attribute; if so,
    # this avoids doing the extraction twice -- confirm.
    text = doc.syllabus.text

    if not text:
        return None

    return Document_Text.create(
        text=text,
        document=doc
    )
Ejemplo n.º 15
0
def test_link_with_document(models, mock_osp):

    """
    The row returned by ext_semester() should reference the document whose
    id was passed in, not a document that shares the text row's id.
    """

    # Two document rows, but a text row only for the second one, so that
    # the text row id and its document id differ (asserted below).
    Document.create(path='path1')
    target = Document.create(path='path2')

    text_row = Document_Text.create(document=target, text='Fall 2012')
    assert text_row.id != text_row.document.id

    semester_row = ext_semester(target.id)
    assert semester_row.document == target
Ejemplo n.º 16
0
def term_counts(out_file, n):

    """
    Write term frequency counts as CSV with the columns `term` and `count`.

    Args:
        out_file: Writable file object handed to `csv.DictWriter`.
        n: Passed through to `Document_Text.term_counts()`; per the original
            description, the number of documents to count over.
    """

    # Header row first.
    fieldnames = ['term', 'count']
    csv_writer = csv.DictWriter(out_file, fieldnames)
    csv_writer.writeheader()

    # The counts object must provide `most_common()`, which yields
    # (term, count) pairs.
    ranked = Document_Text.term_counts(n).most_common()

    csv_writer.writerows(
        {'term': term, 'count': count}
        for term, count in ranked
    )