def test_whitelist(add_text, add_citation):
    """
    Whitelisted texts should be exempt from the fuzziness cutoff: after
    validating against the whitelist fixture, the first two texts stay valid
    and the third is marked invalid.
    """

    texts = [add_text() for _ in range(3)]

    # Give every text one citation.
    for text in texts:
        add_citation(text=text)

    Text.validate(
        package='osp.test.citations.models.text',
        path='fixtures/validate/whitelist.yml',
    )

    # Re-read the rows so the assertions see the persisted flags.
    reloaded = [Text.get(Text.id == text.id) for text in texts]

    for row, expected in zip(reloaded, (True, True, False)):
        assert row.valid == expected
def test_whitelist(add_text, add_citation):
    """
    Texts named in the whitelist fixture should be exempt from the fuzziness
    cutoff applied by Text.validate().
    """

    first, second, third = add_text(), add_text(), add_text()
    created = (first, second, third)

    for text in created:
        add_citation(text=text)

    fixture = dict(
        package='osp.test.citations.models.text',
        path='fixtures/validate/whitelist.yml',
    )
    Text.validate(**fixture)

    # Fetch fresh copies of each row and collect the `valid` flags.
    valid_flags = [Text.get(Text.id == text.id).valid for text in created]

    assert valid_flags == [True, True, False]
def ingest_jstor():

    """
    Ingest JSTOR texts.

    Thin entry point: takes no arguments and delegates all of the work to
    `Text.ingest_jstor()`. Nothing is returned.
    """

    Text.ingest_jstor()
def ingest_hlom():

    """
    Ingest HLOM texts.

    Thin entry point: takes no arguments and delegates all of the work to
    `Text.ingest_hlom()`. Nothing is returned.
    """

    Text.ingest_hlom()
def test_require_title_and_author(title, author, mock_hlom):
    """
    A MARC record without a query-able title and author must be skipped, so
    no text rows exist after the HLOM ingest.
    """

    mock_hlom.add_marc(author=author, title=title)
    Text.ingest_hlom()

    ingested = Text.select().count()
    assert ingested == 0
Example #6
0
def test_require_title_and_author(title, author, mock_jstor):
    """
    An article without a query-able title and author must be skipped, so no
    text rows exist after the JSTOR ingest.
    """

    mock_jstor.add_article(author=author, article_title=title)
    Text.ingest_jstor()

    ingested = Text.select().count()
    assert ingested == 0
def test_require_title_and_author(title, author, mock_hlom):
    """
    Records lacking a query-able title and author are not ingested from HLOM.
    """

    record = dict(title=title, author=author)
    mock_hlom.add_marc(**record)

    Text.ingest_hlom()

    # The only record was unusable, so the table must stay empty.
    row_count = Text.select().count()
    assert row_count == 0
def test_require_title_and_author(title, author, mock_jstor):
    """
    Records lacking a query-able title and author are not ingested from JSTOR.
    """

    article = dict(article_title=title, author=author)
    mock_jstor.add_article(**article)

    Text.ingest_jstor()

    # The only article was unusable, so the table must stay empty.
    row_count = Text.select().count()
    assert row_count == 0
def test_set_multiple_authors(mock_jstor):
    """
    Every (given names, surname) pair on the article should be stored in
    `authors` as a "Surname, Given" string, in the original order.
    """

    name_pairs = [
        ('David W.', 'McClure'),
        ('Kara G.', 'Weisman'),
    ]
    mock_jstor.add_article(author=name_pairs)

    Text.ingest_jstor()

    expected = [
        'McClure, David W.',
        'Weisman, Kara G.',
    ]
    row = Text.select().first()
    assert row.authors == expected
Example #10
0
def test_load_multiple(mock_jstor):
    """
    Text.ingest_jstor() should ingest multiple records: 100 mock articles
    yield 100 text rows.
    """

    for _ in range(100):
        mock_jstor.add_article()

    Text.ingest_jstor()

    row_count = Text.select().count()
    assert row_count == 100
Example #11
0
def test_set_multiple_authors(mock_jstor):
    """
    An article with two authors should produce a two-element `authors` list
    formatted as "Surname, Given names".
    """

    david = ('David W.', 'McClure')
    kara = ('Kara G.', 'Weisman')
    mock_jstor.add_article(author=[david, kara])

    Text.ingest_jstor()

    stored = Text.select().first().authors
    assert stored == ['McClure, David W.', 'Weisman, Kara G.']
def test_load_multiple(mock_hlom):
    """
    Text.ingest_hlom() should ingest multiple records: 100 mock MARC records
    yield 100 text rows.
    """

    for _ in range(100):
        mock_hlom.add_marc()

    Text.ingest_hlom()

    row_count = Text.select().count()
    assert row_count == 100
def test_load_multiple(mock_hlom):
    """
    Text.ingest_hlom() should create one row per MARC record.
    """

    expected_rows = 100

    # Queue up the mock records before running the ingest.
    for _ in range(expected_rows):
        mock_hlom.add_marc()

    Text.ingest_hlom()

    assert Text.select().count() == expected_rows
def test_validate(fields, add_text, add_citation):
    """
    A cited text built from `fields` should be marked invalid once
    Text.validate() runs against the validate fixture.
    """

    created = add_text(**fields)
    add_citation(text=created)

    fixture = dict(
        package='osp.test.citations.models.text',
        path='fixtures/validate/validate.yml',
    )
    Text.validate(**fixture)

    # Re-read the row to pick up the persisted flag.
    reloaded = Text.get(Text.id == created.id)
    assert reloaded.valid == False
def test_load_multiple(mock_jstor):
    """
    Text.ingest_jstor() should create one row per mock article.
    """

    expected_rows = 100

    # Queue up the mock articles before running the ingest.
    for _ in range(expected_rows):
        mock_jstor.add_article()

    Text.ingest_jstor()

    assert Text.select().count() == expected_rows
Example #16
0
def test_validate(fields, add_text, add_citation):
    """
    Text.validate() should flag the text created from `fields` as invalid.
    """

    original = add_text(**fields)

    # Attach a citation to the text before validating.
    add_citation(text=original)

    Text.validate(
        path='fixtures/validate/validate.yml',
        package='osp.test.citations.models.text',
    )

    refreshed = Text.get(Text.id == original.id)

    assert refreshed.valid == False
    def rank_texts(cls):

        """
        Get total citation counts and ranks for texts.

        Only texts with `display` and `valid` set that join to at least one
        citation are considered. Rows are ordered by text id.

        Returns:
            list: One dict per text with the keys:
                - `text`: the text row, carrying its citation `count`.
                - `rank`: overall rank, where 1 is the most-cited text and
                  tied texts share the best rank of their group.
                - `score`: the text's dense rank divided by the highest
                  dense rank.
            An empty list is returned when no texts match.
        """

        count = fn.Count(Citation.id)

        query = (
            Text.select(Text, count)
            .join(Citation)
            .where(Text.display == True)
            .where(Text.valid == True)
            .group_by(Text.id)
            .order_by(Text.id)
            .naive()
        )

        # Materialize the rows once, so the counts and the returned texts are
        # guaranteed to come from the same result set.
        texts = list(query)

        # Nothing to rank. Without this guard, max() below would raise a
        # ValueError on the empty rank array.
        if not texts:
            return []

        counts = [t.count for t in texts]

        # Compute dense-rank ratios.
        dense_ranks = rankdata(counts, "dense")
        top = max(dense_ranks)
        scores = [float(r / top) for r in dense_ranks]

        # Compute overall ranks (#1 is most frequent).
        max_ranks = rankdata(counts, "max")
        top = max(max_ranks)
        ranks = [int(top - r + 1) for r in max_ranks]

        return [
            dict(text=text, rank=rank, score=score)
            for text, rank, score in zip(texts, ranks, scores)
        ]
def test_empty_field():
    """
    Text#pretty() should return None when the requested field is empty.
    """

    prettified = Text(title=None).pretty('title')

    assert prettified == None
def test_string_field():
    """
    For a string field, Text#pretty() should return the same value that
    prettify() produces for the raw field.
    """

    raw_title = 'war and peace'

    result = Text(title=raw_title).pretty('title')

    assert result == prettify(raw_title)
def test_deduplicate(add_text, add_citation):
    """
    Text.deduplicate() should set `display` flags for all cited texts: within
    each group of texts sharing a title and surname, only the first created
    text stays displayed.
    """

    # (title, surname, expected display flag), in creation order.
    cases = [
        ("one", "two", True),
        ("one", "two", False),
        ("three", "four", True),
        ("three", "four", False),
        ("five", "six", True),
    ]

    texts = [
        add_text(title=title, surname=surname)
        for title, surname, _ in cases
    ]

    for text in texts:
        add_citation(text=text)

    Text.deduplicate()

    # Re-read every row so the persisted flags are checked.
    reloaded = [Text.get(Text.id == text.id) for text in texts]

    for row, (_, _, expected) in zip(reloaded, cases):
        assert row.display == expected
Example #21
0
def test_deduplicate(add_text, add_citation):
    """
    Text.deduplicate() should keep `display` on for one text per
    title/surname pair and switch it off for the later duplicates.
    """

    pairs = [
        ('one', 'two'),
        ('one', 'two'),
        ('three', 'four'),
        ('three', 'four'),
        ('five', 'six'),
    ]

    created = [add_text(title=title, surname=surname) for title, surname in pairs]

    # Every text gets exactly one citation.
    for text in created:
        add_citation(text=text)

    Text.deduplicate()

    refreshed = [Text.get(Text.id == text.id) for text in created]
    flags = [row.display for row in refreshed]

    assert flags == [True, False, True, False, True]
def test_surname_blacklisted(surname, blacklisted):
    """
    Text#surname_blacklisted() should return the expected flag for `surname`
    when given a blacklist of tokenized surnames.
    """

    raw_blacklist = ['may', 'world bank']
    tokenized = map(tokenize_field, raw_blacklist)

    result = Text(surname=surname).surname_blacklisted(tokenized)

    assert result == blacklisted
Example #23
0
def test_title_blacklisted(title, blacklisted):
    """
    Text#title_blacklisted() should return the expected flag for `title`
    when given a blacklist of tokenized titles.
    """

    raw_blacklist = ['letter', 'the white house']
    tokenized = map(tokenize_field, raw_blacklist)

    result = Text(title=title).title_blacklisted(tokenized)

    assert result == blacklisted
Example #24
0
    def hydrate_nodes(self):
        """
        Load text metadata onto the nodes.

        Each node id is treated as a text row id; the prettified `authors`
        and `title` of that text are written into the node's attributes.
        """

        for text_id in progress.bar(self.graph.nodes()):

            text = Text.get(Text.id == text_id)
            attrs = self.graph.node[text_id]

            # Copy the fields over in the same order: authors, then title.
            for field in ('authors', 'title'):
                attrs[field] = text.pretty(field)
def test_array_field():
    """
    For an array field, Text#pretty() should return a list holding the
    prettified form of each element.
    """

    raw_authors = ['david mcclure', 'joe karaganis']

    # Hand the model its own copy of the list.
    text = Text(authors=list(raw_authors))

    result = text.pretty('authors')

    assert result == [prettify(name) for name in raw_authors]
def test_select_cited(add_text, add_citation):
    """
    Text.select_cited() should return only the texts that have at least one
    citation.
    """

    cited = [add_text() for _ in range(2)]

    # A third text that never receives a citation.
    add_text()

    for text in cited:
        add_citation(text=text)

    assert list(Text.select_cited()) == cited
def test_select_cited(add_text, add_citation):
    """
    Text.select_cited() returns texts that have been cited at least once,
    leaving out texts with no citations.
    """

    first, second, _uncited = add_text(), add_text(), add_text()

    # Only the first two texts are cited.
    for text in (first, second):
        add_citation(text=text)

    selected = list(Text.select_cited())

    assert selected == [first, second]
Example #28
0
def test_page_cursor(add_text):
    """
    BaseModel.page_cursor() should generate record instances in an id-ordered
    "page", defined by a page count and 0-based index. Seven pages over 100
    rows must together cover ids 1-100 in order.
    """

    page_count = 7

    for _ in range(100):
        add_text()

    # Collect the ids yielded for each page index.
    pages = [
        [text.id for text in Text.page_cursor(7, index)]
        for index in range(page_count)
    ]

    # 7 pages:
    assert len(pages) == 7

    # 1-100 range:
    flattened = [text_id for page in pages for text_id in page]
    assert flattened == list(range(1, 101))
    def _text(corpus='corpus', identifier=None, title='Title',
              surname='Surname', authors=['Author'], valid=True,
              display=True, **kwargs):
        """
        Create and return a Text row, filling in defaults for any field the
        caller leaves out. A falsy `identifier` is replaced by a fresh UUID;
        extra keyword arguments are passed straight to `Text.create()`.
        """

        # NOTE(review): the `authors` default list is shared across calls; it
        # is only passed through here, never mutated.
        fields = dict(
            corpus=corpus,
            identifier=identifier or uuid.uuid4(),
            title=title,
            surname=surname,
            authors=authors,
            valid=valid,
            display=display,
        )
        fields.update(kwargs)

        return Text.create(**fields)
    def _text(corpus='corpus',
              identifier=None,
              title='Title',
              surname='Surname',
              authors=['Author'],
              valid=True,
              display=True,
              **kwargs):
        """
        Insert a Text row built from the given fields and return it.

        When `identifier` is falsy a random UUID is generated for it. Any
        additional keyword arguments are forwarded to `Text.create()`.
        """

        row_id = identifier if identifier else uuid.uuid4()

        # Explicit fields first, then whatever extras the caller supplied.
        attributes = {
            'corpus': corpus,
            'identifier': row_id,
            'title': title,
            'surname': surname,
            'authors': authors,
            'valid': valid,
            'display': display,
        }
        attributes.update(kwargs)

        return Text.create(**attributes)
def test_set_title(mock_jstor):
    """
    The article title should be stored as the ingested text's `title`.
    """

    expected = 'Article Title'
    mock_jstor.add_article(article_title=expected)

    Text.ingest_jstor()

    row = Text.select().first()
    assert row.title == expected
Example #32
0
def test_set_identifier(mock_jstor):
    """
    The article id should be stored as the ingested text's `identifier`.
    """

    expected = '001'
    mock_jstor.add_article(article_id=expected)

    Text.ingest_jstor()

    row = Text.select().first()
    assert row.identifier == expected
Example #33
0
def test_set_journal_title(mock_jstor):
    """
    The journal title should be stored as the ingested text's
    `journal_title`.
    """

    expected = 'Critical Inquiry'
    mock_jstor.add_article(journal_title=expected)

    Text.ingest_jstor()

    row = Text.select().first()
    assert row.journal_title == expected
def test_set_issue_volume(mock_jstor):
    """
    An issue volume of 200 on the article should be stored on the text as
    the string '200'.
    """

    mock_jstor.add_article(issue_volume=200)

    Text.ingest_jstor()

    row = Text.select().first()
    assert row.issue_volume == '200'
def test_set_issue_number(mock_jstor):
    """
    An issue number of 10 on the article should be stored on the text as the
    string '10'.
    """

    mock_jstor.add_article(issue_number=10)

    Text.ingest_jstor()

    row = Text.select().first()
    assert row.issue_number == '10'
Example #36
0
def test_set_corpus(mock_jstor):
    """
    Texts ingested from JSTOR should have `corpus` set to 'jstor'.
    """

    mock_jstor.add_article()

    Text.ingest_jstor()

    row = Text.select().first()
    assert row.corpus == 'jstor'
def test_set_publisher(mock_jstor):
    """
    The publisher name should be stored as the ingested text's `publisher`.
    """

    expected = 'Chicago Journals'
    mock_jstor.add_article(publisher_name=expected)

    Text.ingest_jstor()

    row = Text.select().first()
    assert row.publisher == expected
Example #38
0
def test_set_surname(mock_jstor):
    """
    The author's surname should be stored as the ingested text's `surname`.
    """

    name_pair = ('David W.', 'McClure')
    mock_jstor.add_article(author=[name_pair])

    Text.ingest_jstor()

    row = Text.select().first()
    assert row.surname == 'McClure'
def test_set_url(mock_jstor):
    """
    The article URL should be stored as the ingested text's `url`.
    """

    expected = 'http://test.org'
    mock_jstor.add_article(url=expected)

    Text.ingest_jstor()

    row = Text.select().first()
    assert row.url == expected
def test_set_date(mock_jstor):
    """
    The publication year, month and day should be combined into a
    zero-padded 'YYYY-MM-DD' `date` string.
    """

    published = dict(pub_year=1987, pub_month=6, pub_day=25)
    mock_jstor.add_article(**published)

    Text.ingest_jstor()

    row = Text.select().first()
    assert row.date == '1987-06-25'
Example #41
0
def test_set_issue_volume(mock_jstor):
    """
    The ingested text's `issue_volume` should hold the article's issue
    volume as a string.
    """

    mock_jstor.add_article(issue_volume=200)
    Text.ingest_jstor()

    stored = Text.select().first().issue_volume
    assert stored == '200'
def test_set_journal_title(mock_jstor):
    """
    The ingested text's `journal_title` should match the article's journal
    title.
    """

    mock_jstor.add_article(journal_title='Critical Inquiry')
    Text.ingest_jstor()

    stored = Text.select().first().journal_title
    assert stored == 'Critical Inquiry'
def test_set_journal_identifier(mock_jstor):
    """
    The journal id should be stored as the ingested text's
    `journal_identifier`.
    """

    expected = 'criticalinquiry'
    mock_jstor.add_article(journal_id=expected)

    Text.ingest_jstor()

    row = Text.select().first()
    assert row.journal_identifier == expected
def test_set_identifier(mock_jstor):
    """
    The ingested text's `identifier` should match the article id.
    """

    mock_jstor.add_article(article_id='001')
    Text.ingest_jstor()

    stored = Text.select().first().identifier
    assert stored == '001'
def text_to_docs(text_id):
    """
    Query a text against the OSP corpus.

    Each token sequence in the text's `queries` is run as a phrase match
    against the `document` index. The ids of all matching documents are
    collected, and one citation row per document is bulk-inserted.

    Args:
        text_id (int): A text row id.

    Raises:
        TimeoutError: If a search response is flagged as timed out.
    """

    text = Text.get(Text.id == text_id)

    matched_ids = set()
    for tokens in text.queries:

        phrase = {
            'query': ' '.join(tokens),
            'slop': 5,
        }

        search_body = {
            'fields': [],
            'size': 1000000,
            'filter': {
                'query': {
                    'match_phrase': {
                        'body': phrase,
                    }
                }
            }
        }

        # Execute the query.
        response = config.es.search(
            index='document',
            request_timeout=90,
            body=search_body,
        )

        # Fail the job if the result is incomplete.
        if response['timed_out']:
            raise TimeoutError()

        # Register the doc ids.
        hits = response['hits']
        if hits['total'] > 0:
            matched_ids.update(int(hit['_id']) for hit in hits['hits'])

    # Build doc -> text links.
    links = [
        {
            'document': doc_id,
            'text': text.id,
            'tokens': text.hash_tokens,
        }
        for doc_id in matched_ids
    ]

    # Bulk-insert the results; skip the insert when nothing matched.
    if links:
        Citation.insert_many(links).execute()
Example #46
0
def test_set_title(mock_jstor):
    """
    The ingested text's `title` should match the article title.
    """

    mock_jstor.add_article(article_title='Article Title')
    Text.ingest_jstor()

    stored = Text.select().first().title
    assert stored == 'Article Title'
def test_set_corpus(mock_jstor):
    """
    The JSTOR ingest should tag each text with the 'jstor' corpus.
    """

    mock_jstor.add_article()
    Text.ingest_jstor()

    stored = Text.select().first().corpus
    assert stored == 'jstor'
Example #48
0
def test_set_single_author(mock_jstor):
    """
    A single (given names, surname) pair should be stored as a one-element
    `authors` list in "Surname, Given names" form.
    """

    name_pair = ('David W.', 'McClure')
    mock_jstor.add_article(author=[name_pair])

    Text.ingest_jstor()

    row = Text.select().first()
    assert row.authors == ['McClure, David W.']
def test_set_single_author(mock_jstor):
    """
    An article with one author should yield a single-entry `authors` list.
    """

    mock_jstor.add_article(author=[('David W.', 'McClure')])
    Text.ingest_jstor()

    stored = Text.select().first().authors
    expected = ['McClure, David W.']
    assert stored == expected
Example #50
0
def test_set_publisher(mock_jstor):
    """
    The ingested text's `publisher` should match the article's publisher
    name.
    """

    mock_jstor.add_article(publisher_name='Chicago Journals')
    Text.ingest_jstor()

    stored = Text.select().first().publisher
    assert stored == 'Chicago Journals'
def test_set_surname(mock_jstor):
    """
    The ingested text's `surname` should be the surname of the article's
    author.
    """

    mock_jstor.add_article(author=[('David W.', 'McClure')])
    Text.ingest_jstor()

    stored = Text.select().first().surname
    assert stored == 'McClure'
Example #52
0
def test_set_date(mock_jstor):
    """
    The ingested text's `date` should be the article's publication date
    formatted as 'YYYY-MM-DD'.
    """

    mock_jstor.add_article(pub_day=25, pub_month=6, pub_year=1987)
    Text.ingest_jstor()

    stored = Text.select().first().date
    assert stored == '1987-06-25'
Example #53
0
def test_set_url(mock_jstor):
    """
    The ingested text's `url` should match the article URL.
    """

    mock_jstor.add_article(url='http://test.org')
    Text.ingest_jstor()

    stored = Text.select().first().url
    assert stored == 'http://test.org'
Example #54
0
def test_set_journal_identifier(mock_jstor):
    """
    The ingested text's `journal_identifier` should match the article's
    journal id.
    """

    mock_jstor.add_article(journal_id='criticalinquiry')
    Text.ingest_jstor()

    stored = Text.select().first().journal_identifier
    assert stored == 'criticalinquiry'
def test_set_date(mock_hlom):
    """
    The MARC record's publication year should be stored as the ingested
    text's `date`.
    """

    expected = '1987'
    mock_hlom.add_marc(pubyear=expected)

    Text.ingest_hlom()

    row = Text.select().first()
    assert row.date == expected
Example #56
0
def test_surname_is_toponym(surname, is_toponym):
    """
    Text#surname_is_toponym should equal the expected flag for `surname`.
    """

    result = Text(surname=surname).surname_is_toponym

    assert result == is_toponym
def test_set_pagination(mock_jstor):
    """
    The article's first and last page should be joined with a hyphen and
    stored as the ingested text's `pagination`.
    """

    mock_jstor.add_article(fpage=200, lpage=300)

    Text.ingest_jstor()

    row = Text.select().first()
    assert row.pagination == '200-300'