Esempio n. 1
0
File: hlom.py Progetto: overview/osp
def insert_records(page_size):

    """
    Write the records into the database.

    Thin wrapper that forwards to `HLOM_Record.insert_records`.

    Args:
        page_size: Forwarded unchanged to `HLOM_Record.insert_records`.
            Presumably the number of rows per bulk-insert page -- TODO
            confirm against the model method.
    """

    HLOM_Record.insert_records(page_size)
Esempio n. 2
0
def test_require_title_and_author(models, mock_hlom):

    """
    Only MARC records carrying both a title and an author get inserted.
    """

    # Three incomplete records: neither field, title only, author only.
    mock_hlom.add_marc(title='', author='')
    mock_hlom.add_marc(title='War and Peace', author='')
    mock_hlom.add_marc(title='', author='Leo Tolstoy')

    # One complete record, which is the only one expected to survive.
    complete = mock_hlom.add_marc(
        title='War and Peace',
        author='Leo Tolstoy',
    )

    HLOM_Record.insert_records()

    # Exactly one row should have been written ...
    row_count = HLOM_Record.select().count()
    assert row_count == 1

    # ... and it should be the row for the complete record.
    expected_cn = complete.control_number()
    assert HLOM_Record.get(HLOM_Record.control_number == expected_cn)
Esempio n. 3
0
def hlom_index(requires_es):

    """
    Clear the HLOM index by calling `HLOM_Record.es_reset()`.

    Args:
        requires_es: Not used in the body. NOTE(review): looks like a
            pytest fixture dependency that ensures Elasticsearch is
            available -- confirm against the fixture definitions.
    """

    HLOM_Record.es_reset()
Esempio n. 4
0
File: hlom.py Progetto: overview/osp
def queue_queries():

    """
    Enqueue one citation-extraction job per HLOM record.

    Every row selected from `HLOM_Record` is handed to `config.rq` as a
    `query` job, keyed by the row id.
    """

    # Wrap the select in `ServerSide` before iterating (presumably a
    # server-side cursor, so the table is streamed -- confirm).
    all_rows = ServerSide(HLOM_Record.select())

    for row in all_rows:
        config.rq.enqueue(query, row.id)
Esempio n. 5
0
    def _hlom(*args, **kwargs):

        # Generate a MARC record; every argument is forwarded to
        # `mock_hlom.add_marc` untouched.
        mock_record = mock_hlom.add_marc(*args, **kwargs)

        # Column values for the matching `hlom_record` row.
        row_fields = dict(
            control_number=mock_record.control_number(),
            record=mock_record.as_marc(),
        )

        # Persist the row and hand the model instance back.
        return HLOM_Record.create(**row_fields)
Esempio n. 6
0
def test_insert_records(models, mock_hlom):

    """
    Each mocked MARC record should end up as one `hlom_record` row whose
    stored body matches the source record.
    """

    # 10 data files ("segments") with 10 records apiece.
    records = [
        mock_hlom.add_marc(
            data_file=str(segment),
            title='title',
            author='author',
        )
        for segment in range(10)
        for _ in range(10)
    ]

    # Load everything into the database.
    HLOM_Record.insert_records()

    # One row per mocked record.
    assert HLOM_Record.select().count() == 100

    for expected in records:

        # Look the row up by its control number.
        control_number = expected.control_number()
        row = HLOM_Record.get(HLOM_Record.control_number == control_number)

        # The stored body should match the original MARC record.
        assert row.marc.as_marc() == expected.as_marc()
Esempio n. 7
0
    def copy_records(cls, min_rank=1000):

        """
        Copy cited HLOM records into this table, skipping unusable ones.

        Source rows are `HLOM_Record` rows joined to `HLOM_Citation`,
        coalesced on the `deduping_hash` metadata key. A row is skipped
        when its title or author yields no terms, when the two share a
        term, or when `Counts.max_rank` of either falls below `min_rank`.

        Args:
            min_rank (int): The cutoff for "frequent" words; compared
                against `counts.max_rank(...)` of the title and author
                terms.
        """

        dedupe_key = HLOM_Record.metadata['deduping_hash']

        # Cited records, coalesced on the deduping hash.
        cited_query = (
            HLOM_Record.select()
            .join(HLOM_Citation)
            .group_by(HLOM_Record.id)
            .distinct([dedupe_key])
            .order_by(dedupe_key, HLOM_Record.id)
        )

        counts = Counts()

        for record in cited_query:

            title_terms = termify(record.marc.title())
            author_terms = termify(record.marc.author())

            # Title or author produced no terms.
            if not (title_terms and author_terms):
                continue

            # Title and author share at least one word.
            if set.intersection(title_terms, author_terms):
                continue

            # No focused words in the title, or none in the author
            # (title is checked first; the author only if the title passes).
            if any(
                counts.max_rank(terms) < min_rank
                for terms in (title_terms, author_terms)
            ):
                continue

            cls.create(**record._data)
Esempio n. 8
0
    def deduplicate(self):

        """
        Remove duplicate nodes.

        Nodes are visited in the order `self.graph.nodes()` yields them.
        The first node seen for a given `HLOM_Record.hash` is kept; every
        later node whose record has the same hash is removed from the
        graph.
        """

        seen = set()

        # Walk a snapshot of the node ids. Nodes are removed inside the
        # loop, and iterating a live node view while mutating the graph
        # raises "dictionary changed size during iteration" (networkx
        # >= 2.0 returns a view from `nodes()` rather than a list).
        for cn in bar(list(self.graph.nodes())):

            # Pop out the HLOM record.
            text = HLOM_Record.get(HLOM_Record.control_number==cn)
            record_hash = text.hash

            # If the node is a duplicate, remove it; otherwise remember
            # its hash so later copies get dropped.
            if record_hash in seen:
                self.graph.remove_node(cn)
            else:
                seen.add(record_hash)
Esempio n. 9
0
    def hydrate_nodes(self):

        """
        Attach display metadata to every node in the graph.

        For each node (keyed by control number) the matching HLOM record
        is loaded, and its prettified title, author, publisher and
        publication year are stored on the node. Falsy values are stored
        as empty strings.
        """

        for control_number in bar(self.graph.nodes()):

            # Look up the HLOM record behind this node.
            record = HLOM_Record.get(
                HLOM_Record.control_number==control_number
            )

            # Clean up each field before touching the node.
            pretty_title = prettify_field(record.marc.title())
            pretty_author = prettify_field(record.marc.author())
            pretty_publisher = prettify_field(record.marc.publisher())
            pretty_pubyear = prettify_field(record.marc.pubyear())

            # Write the values onto the node's attribute mapping.
            attrs = self.graph.node[control_number]
            attrs['title'] = pretty_title or ''
            attrs['author'] = pretty_author or ''
            attrs['publisher'] = pretty_publisher or ''
            attrs['pubyear'] = pretty_pubyear or ''
Esempio n. 10
0
def query(id):

    """
    Search the OSP corpus for one MARC record and store the matches.

    Runs a phrase match of the record's `query` text against the `body`
    field of the `osp` / `document` index, then writes one
    `HLOM_Citation` row per hit. Nothing is written when there are no
    hits.

    :param id: The hlom_record row id.
    """

    row = HLOM_Record.get(HLOM_Record.id==id)

    # Phrase match on the document body, with a slop of 50.
    phrase_match = {
        'body': {
            'query': row.query,
            'slop': 50,
        },
    }

    search_body = {
        'fields': ['doc_id'],
        'size': 100000,
        'filter': {
            'query': {
                'match_phrase': phrase_match,
            },
        },
    }

    # Execute the query.
    results = config.es.search(
        'osp', 'document', timeout=30, body=search_body
    )

    hits = results['hits']

    if hits['total'] > 0:

        # One citation link per matching document.
        citations = [
            {
                'document': hit['fields']['doc_id'][0],
                'record': row.id,
            }
            for hit in hits['hits']
        ]

        # Write the citation links.
        HLOM_Citation.insert_many(citations).execute()