Beispiel #1
0
def test_deduplicate(add_text, add_citation):

    """
    Text.deduplicate() should set the `display` flag on cited texts.

    Two pairs of texts share a title / surname and one text is unique.
    The test expects the first text of each pair and the unique text to
    end up with `display` true, and the second text of each pair to end
    up with `display` false.
    """

    # (title, surname, expected `display` value after deduplication)
    cases = [
        ('one', 'two', True),
        ('one', 'two', False),
        ('three', 'four', True),
        ('three', 'four', False),
        ('five', 'six', True),
    ]

    texts = [
        add_text(title=title, surname=surname)
        for title, surname, _ in cases
    ]

    # Every text gets one citation.
    for text in texts:
        add_citation(text=text)

    Text.deduplicate()

    # Re-read each row so the assertions see the stored flags.
    reloaded = [Text.get(Text.id == text.id) for text in texts]

    for row, (_, _, shown) in zip(reloaded, cases):
        assert row.display == shown
def test_whitelist(add_text, add_citation):

    """
    Whitelisted texts should be exempt from the fuzziness cutoff.

    Three cited texts are validated against the `whitelist.yml` fixture;
    the test expects the first two to be valid and the third invalid.
    """

    texts = [add_text() for _ in range(3)]

    for text in texts:
        add_citation(text=text)

    Text.validate(
        package='osp.test.citations.models.text',
        path='fixtures/validate/whitelist.yml',
    )

    # Re-read each row so the assertions see the stored flags.
    first, second, third = [
        Text.get(Text.id == text.id) for text in texts
    ]

    assert first.valid == True
    assert second.valid == True
    assert third.valid == False
def test_deduplicate(add_text, add_citation):

    """
    Check the `display` flags written by Text.deduplicate().

    Texts are created as two duplicate pairs plus one unique text, each
    cited once. The test expects the first member of each pair and the
    unique text to be displayed, and the second member of each pair to
    be hidden.
    """

    fields = [
        {'title': 'one', 'surname': 'two'},
        {'title': 'one', 'surname': 'two'},
        {'title': 'three', 'surname': 'four'},
        {'title': 'three', 'surname': 'four'},
        {'title': 'five', 'surname': 'six'},
    ]

    # Expected `display` value for each text, in creation order.
    expected = [True, False, True, False, True]

    created = [add_text(**kwargs) for kwargs in fields]

    for text in created:
        add_citation(text=text)

    Text.deduplicate()

    # Fetch fresh copies of the rows before checking the flags.
    fresh = [Text.get(Text.id == text.id) for text in created]

    for row, flag in zip(fresh, expected):
        assert row.display == flag
def test_whitelist(add_text, add_citation):
    """
    Whitelisted texts should be exempt from the fuzziness cutoff.

    After validating three cited texts against the `whitelist.yml`
    fixture, the first two are expected to be valid and the last one
    invalid.
    """

    created = [add_text() for _ in range(3)]

    for text in created:
        add_citation(text=text)

    Text.validate(
        package='osp.test.citations.models.text',
        path='fixtures/validate/whitelist.yml',
    )

    # Fetch fresh copies of the rows before checking the flags.
    fresh = [Text.get(Text.id == text.id) for text in created]

    # Expected `valid` value for each text, in creation order.
    for row, flag in zip(fresh, [True, True, False]):
        assert row.valid == flag
Beispiel #5
0
    def hydrate_nodes(self):

        """
        Copy text metadata onto the graph nodes.

        Each node id is used as a `Text` row id; the row's prettified
        `authors` and `title` values are stored on the node under the
        same keys.
        """

        for node_id in progress.bar(self.graph.nodes()):

            record = Text.get(Text.id == node_id)

            # Same order as the attributes are listed: authors, then title.
            for field in ('authors', 'title'):
                self.graph.node[node_id][field] = record.pretty(field)
Beispiel #6
0
def test_validate(fields, add_text, add_citation):

    """
    A cited text built from `fields` should be marked invalid when
    validated against the `validate.yml` fixture.
    """

    row = add_text(**fields)
    add_citation(text=row)

    Text.validate(
        package='osp.test.citations.models.text',
        path='fixtures/validate/validate.yml',
    )

    # Re-read the row so the assertion sees the stored flag.
    reloaded = Text.get(Text.id == row.id)

    assert reloaded.valid == False
def test_validate(fields, add_text, add_citation):

    """
    Text.validate() with the `validate.yml` fixture should flag a cited
    text created from `fields` as not valid.
    """

    fixture = {
        'package': 'osp.test.citations.models.text',
        'path': 'fixtures/validate/validate.yml',
    }

    created = add_text(**fields)
    add_citation(text=created)

    Text.validate(**fixture)

    # Fetch a fresh copy of the row before checking the flag.
    fresh = Text.get(Text.id == created.id)

    assert fresh.valid == False
def text_to_docs(text_id):

    """
    Match a text against the document index and record the citations.

    Every token sequence in the text's `queries` is run as a phrase match
    (slop 5) on the `body` field of the `document` index. The ids of all
    matching documents are collected into a set, and one citation row per
    document id is bulk-inserted.

    Args:
        text_id (int): A text row id.

    Raises:
        TimeoutError: If a search response is flagged as timed out.
    """

    text = Text.get(Text.id == text_id)

    matched_ids = set()

    for query_tokens in text.queries:

        phrase = {
            'query': ' '.join(query_tokens),
            'slop': 5,
        }

        # Execute the query.
        response = config.es.search(
            index='document',
            request_timeout=90,
            body={
                'fields': [],
                'size': 1000000,
                'filter': {
                    'query': {
                        'match_phrase': {'body': phrase},
                    },
                },
            },
        )

        # An incomplete result fails the whole job.
        if response['timed_out']:
            raise TimeoutError()

        # Register the doc ids; the set drops repeats across queries.
        hits = response['hits']
        if hits['total'] > 0:
            matched_ids.update(int(hit['_id']) for hit in hits['hits'])

    # Build doc -> text links.
    links = [
        {
            'document': doc_id,
            'text': text.id,
            'tokens': text.hash_tokens,
        }
        for doc_id in matched_ids
    ]

    # Bulk-insert the results, skipping the insert when nothing matched.
    if links:
        Citation.insert_many(links).execute()