def doc_to_fields(doc_id, radius=100):
    """
    Scan one document's text for subfield matches and record each hit.

    Every subfield is searched once against the document text. For each
    subfield that matches, a single Subfield_Document row is created with the
    match offset and a snippet of the surrounding text.

    Args:
        doc_id (int): Id used to look up the Document_Text row.
        radius (int): Characters of context kept on each side of the match
            when cutting the snippet.
    """

    text_row = Document_Text.get(Document_Text.document == doc_id)
    body = text_row.text

    for subfield in Subfield.select():

        hit = subfield.search(body)

        # No match for this subfield -> nothing to link.
        if not hit:
            continue

        # Context window around the match, clamped to the text bounds.
        lo = max(hit.start() - radius, 0)
        hi = min(hit.end() + radius, len(body))

        # Link field -> doc; the snippet is passed through crunch() first.
        Subfield_Document.create(
            subfield=subfield,
            document=text_row.document,
            offset=hit.start(),
            snippet=crunch(body[lo:hi]),
        )
def doc_to_fields(doc_id, radius=100):
    """
    Scan one document's text for subfield matches and record each hit.

    Every subfield is searched once against the document text. For each
    subfield that matches, a single Subfield_Document row is created with the
    match offset and a snippet of the surrounding text.

    Args:
        doc_id (int): Id used to look up the Document_Text row.
        radius (int): Characters of context kept on each side of the match
            when cutting the snippet.
    """

    text_row = Document_Text.get(Document_Text.document == doc_id)
    body = text_row.text

    for subfield in Subfield.select():

        hit = subfield.search(body)

        # No match for this subfield -> nothing to link.
        if not hit:
            continue

        # Context window around the match, clamped to the text bounds.
        lo = max(hit.start() - radius, 0)
        hi = min(hit.end() + radius, len(body))

        # Link field -> doc; the snippet is passed through crunch() first.
        Subfield_Document.create(
            subfield=subfield,
            document=text_row.document,
            offset=hit.start(),
            snippet=crunch(body[lo:hi]),
        )
Example #3
0
def test_unique_pairs(add_subfield, add_doc):
    """
    A given subfield -> document pair may only be linked once.
    """

    subfield = add_subfield()
    document = add_doc()

    # First link for the pair.
    Subfield_Document.create(
        subfield=subfield,
        document=document,
        offset=1,
        snippet='abc',
    )

    # Linking the same pair again must fail, even though the offset and
    # snippet differ from the first row.
    with pytest.raises(IntegrityError):
        Subfield_Document.create(
            subfield=subfield,
            document=document,
            offset=2,
            snippet='def',
        )
Example #4
0
def test_no_matches(add_doc, add_subfield):
    """
    When no fields match, don't write any rows.
    """

    # The content mentions 'Field2' only.
    doc = add_doc(content='abc Field2 101 def')

    # Register a subfield named 'Field1', a name absent from the content.
    # The returned row is never used afterwards, so it is not bound to a name.
    add_subfield(name='Field1')

    doc_to_fields(doc.id)

    # Shouldn't write any rows.
    assert Subfield_Document.select().count() == 0
def test_no_matches(add_doc, add_subfield):
    """
    When no fields match, don't write any rows.
    """

    # The content mentions 'Field2' only.
    doc = add_doc(content='abc Field2 101 def')

    # Register a subfield named 'Field1', a name absent from the content.
    # The returned row is never used afterwards, so it is not bound to a name.
    add_subfield(name='Field1')

    doc_to_fields(doc.id)

    # Shouldn't write any rows.
    assert Subfield_Document.select().count() == 0
Example #6
0
def test_matches(add_doc, add_subfield):
    """
    When a document contains a field code, write a doc->field link.
    """

    doc = add_doc(content='abc Field1 101 def Field2 101 ghi')

    sf1 = add_subfield(name='Field1')
    sf2 = add_subfield(name='Field2')

    # 'Field3' does not occur in the content; the row only needs to exist,
    # so the return value is not bound to a name.
    add_subfield(name='Field3')

    doc_to_fields(doc.id)

    # Should write 2 field -> doc links.
    assert Subfield_Document.select().count() == 2

    # Should match the right fields. `.exists()` makes the row check
    # explicit instead of relying on the truthiness of the query object.
    for sf in (sf1, sf2):

        assert Subfield_Document.select().where(
            Subfield_Document.subfield == sf,
            Subfield_Document.document == doc,
        ).exists()
def test_matches(add_doc, add_subfield):
    """
    When a document contains a field code, write a doc->field link.
    """

    doc = add_doc(content='abc Field1 101 def Field2 101 ghi')

    sf1 = add_subfield(name='Field1')
    sf2 = add_subfield(name='Field2')

    # 'Field3' does not occur in the content; the row only needs to exist,
    # so the return value is not bound to a name.
    add_subfield(name='Field3')

    doc_to_fields(doc.id)

    # Should write 2 field -> doc links.
    assert Subfield_Document.select().count() == 2

    # Should match the right fields. `.exists()` makes the row check
    # explicit instead of relying on the truthiness of the query object.
    for sf in (sf1, sf2):

        assert Subfield_Document.select().where(
            Subfield_Document.subfield == sf,
            Subfield_Document.document == doc,
        ).exists()
Example #8
0
def test_character_offset(add_doc, add_subfield):
    """
    Record the character offset of the first match.
    """

    # 'Field1' occurs twice; only the first occurrence should be recorded.
    #                      01234
    doc = add_doc(content='abc Field1 101 def Field1 201 ghi')

    sf1 = add_subfield(name='Field1')

    doc_to_fields(doc.id)

    # Index 3 is the space just before the first 'Field1' (see ruler above).
    # NOTE(review): presumably the subfield pattern starts matching at that
    # leading separator rather than at the 'F' -- confirm against
    # Subfield.search.
    # `.exists()` makes the row check explicit instead of relying on the
    # truthiness of the query object.
    assert Subfield_Document.select().where(
        Subfield_Document.subfield == sf1,
        Subfield_Document.document == doc,
        Subfield_Document.offset == 3,
    ).exists()
def test_character_offset(add_doc, add_subfield):
    """
    Record the character offset of the first match.
    """

    # 'Field1' occurs twice; only the first occurrence should be recorded.
    #                      01234
    doc = add_doc(content='abc Field1 101 def Field1 201 ghi')

    sf1 = add_subfield(name='Field1')

    doc_to_fields(doc.id)

    # Index 3 is the space just before the first 'Field1' (see ruler above).
    # NOTE(review): presumably the subfield pattern starts matching at that
    # leading separator rather than at the 'F' -- confirm against
    # Subfield.search.
    # `.exists()` makes the row check explicit instead of relying on the
    # truthiness of the query object.
    assert Subfield_Document.select().where(
        Subfield_Document.subfield == sf1,
        Subfield_Document.document == doc,
        Subfield_Document.offset == 3,
    ).exists()
    def _subfield_document(
        subfield=None,
        document=None,
        snippet='field',
        offset=100,
    ):
        """
        Create a Subfield_Document row, filling in any missing side of the link.

        Args:
            subfield: Subfield to link; when falsy, a new one is obtained
                from add_subfield().
            document: Document to link; when falsy, a new one is obtained
                from add_doc().
            snippet (str): Snippet value stored on the row.
            offset (int): Offset value stored on the row.

        Returns:
            Whatever Subfield_Document.create() returns for the new row.
        """

        # Fall back to freshly created rows for whichever side was omitted.
        subfield = subfield or add_subfield()
        document = document or add_doc()

        link_fields = dict(
            subfield=subfield,
            document=document,
            offset=offset,
            snippet=snippet,
        )

        return Subfield_Document.create(**link_fields)
    def _subfield_document(
        subfield=None,
        document=None,
        snippet='field',
        offset=100,
    ):
        """
        Create a Subfield_Document row, filling in any missing side of the link.

        Args:
            subfield: Subfield to link; when falsy, a new one is obtained
                from add_subfield().
            document: Document to link; when falsy, a new one is obtained
                from add_doc().
            snippet (str): Snippet value stored on the row.
            offset (int): Offset value stored on the row.

        Returns:
            Whatever Subfield_Document.create() returns for the new row.
        """

        # Fall back to freshly created rows for whichever side was omitted.
        subfield = subfield or add_subfield()
        document = document or add_doc()

        link_fields = dict(
            subfield=subfield,
            document=document,
            offset=offset,
            snippet=snippet,
        )

        return Subfield_Document.create(**link_fields)