Esempio n. 1
0
def test_require_title_and_author(models, mock_hlom):

    """
    Only records carrying both a title and an author should be inserted;
    anything missing either field is skipped.
    """

    # (title, author) pairs that are each missing at least one field.
    incomplete = [
        ('', ''),
        ('War and Peace', ''),
        ('', 'Leo Tolstoy'),
    ]

    for title, author in incomplete:
        mock_hlom.add_marc(title=title, author=author)

    # The one record with both fields present.
    complete = mock_hlom.add_marc(
        title='War and Peace',
        author='Leo Tolstoy'
    )

    HLOM_Record.insert_records()

    # Exactly one row should exist...
    inserted = HLOM_Record.select().count()
    assert inserted == 1

    # ...and it should be the complete record.
    control_number = complete.control_number()
    assert HLOM_Record.get(HLOM_Record.control_number==control_number)
Esempio n. 2
0
    def deduplicate(self):

        """
        Remove duplicate nodes.

        Each node id is treated as an HLOM control number. Two nodes are
        duplicates when their `HLOM_Record` rows share the same `hash`; the
        first node encountered for a given hash is kept and later ones are
        removed from `self.graph`.
        """

        seen = set()

        # Snapshot the node ids before looping: nodes are removed inside the
        # loop, and mutating the graph while iterating a live node view
        # raises a RuntimeError.
        # NOTE(review): assumes `bar` accepts any sized iterable - confirm.
        for cn in bar(list(self.graph.nodes())):

            # Pop out the HLOM record.
            text = HLOM_Record.get(HLOM_Record.control_number==cn)

            # If the node is a duplicate, remove it.
            if text.hash in seen:
                self.graph.remove_node(cn)

            # Otherwise, remember the hash for later nodes.
            else:
                seen.add(text.hash)
Esempio n. 3
0
    def hydrate_nodes(self):

        """
        Copy prettified title / author / publisher / pubyear values from
        each node's HLOM record onto the node's attributes.
        """

        for cn in bar(self.graph.nodes()):

            # Look up the HLOM record behind this node.
            record = HLOM_Record.get(HLOM_Record.control_number==cn)

            # Prettify every field before touching the node.
            cleaned = [
                ('title',       prettify_field(record.marc.title())),
                ('author',      prettify_field(record.marc.author())),
                ('publisher',   prettify_field(record.marc.publisher())),
                ('pubyear',     prettify_field(record.marc.pubyear())),
            ]

            # Falsy values are stored as empty strings.
            for key, value in cleaned:
                self.graph.node[cn][key] = value or ''
Esempio n. 4
0
def test_insert_records(models, mock_hlom):

    """
    Every mocked MARC record should get its own `hlom_record` row after
    HLOM_Record.insert_records() runs.
    """

    # 10 segments, 10 records in each.
    marcs = [
        mock_hlom.add_marc(
            data_file=str(segment),
            title='title',
            author='author'
        )
        for segment in range(10)
        for _ in range(10)
    ]

    # Insert record rows.
    HLOM_Record.insert_records()

    # One row per record.
    total = HLOM_Record.select().count()
    assert total == 100

    for marc in marcs:

        # Look up the row by control number.
        control_number = marc.control_number()
        row = HLOM_Record.get(HLOM_Record.control_number==control_number)

        # The stored body should round-trip unchanged.
        assert row.marc.as_marc() == marc.as_marc()
Esempio n. 5
0
def query(id):

    """
    Search the OSP corpus for a MARC record and store a citation link for
    each matching document.

    :param id: The hlom_record row id.
    """

    record = HLOM_Record.get(HLOM_Record.id==id)

    # Phrase-match the record's query string against document bodies.
    phrase = {
        'match_phrase': {
            'body': {
                'query': record.query,
                'slop': 50
            }
        }
    }

    response = config.es.search('osp', 'document', timeout=30, body={
        'fields': ['doc_id'],
        'size': 100000,
        'filter': {
            'query': phrase
        }
    })

    # Nothing to write when the search came back empty.
    if not response['hits']['total'] > 0:
        return

    # One document -> record link per hit.
    links = [
        {
            'document': hit['fields']['doc_id'][0],
            'record': record.id
        }
        for hit in response['hits']['hits']
    ]

    # Write the citation links.
    HLOM_Citation.insert_many(links).execute()