def test_kb_writer_multiple_runs(tmpdir):
    """Check that a second writer run on the same path replaces the first.

    Two separate ``KbWriter`` contexts are opened on the same file, each
    adding one entry; only the entry from the last run must remain.
    """
    target = tmpdir.join('multiple_flushes.kb')
    runs = [
        ('Journal of Testing', 'J.Testing'),
        ('Second Journal of Testing', 'Sec.J.Testing'),
    ]

    # One full open/close cycle per entry, so every run flushes by itself.
    for title, abbreviation in runs:
        with KbWriter(str(target)) as kb:
            kb.add_entry(title, abbreviation)

    assert [
        'SECOND JOURNAL OF TESTING---Sec.J.Testing\n',
    ] == target.readlines()
def test_kb_writer_many_lines(tmpdir):
    """Check that a large number of entries all end up in the KB file."""
    entry_count = 100000
    target = tmpdir.join('many_lines.kb')

    with KbWriter(str(target)) as kb:
        # The same entry is added over and over; only the count matters.
        for _ in xrange(entry_count):
            kb.add_entry('Journal of Testing', 'J.Testing')

    written = target.readlines()
    assert len(written) == entry_count
def test_kb_writer_unicode(tmpdir):
    """Check that an entry with a non-ASCII title is written correctly.

    The expected line keeps the accented character (uppercased) and has a
    space where the title had an apostrophe.
    """
    target = tmpdir.join('unicode.kb')

    with KbWriter(str(target)) as kb:
        kb.add_entry(u'Journal de l\'Académie', 'J.Acad.')

    assert [
        'JOURNAL DE L ACADÉMIE---J.Acad.\n',
    ] == target.readlines()
def test_kb_writer_keeps_colons(tmpdir):
    """Check that a colon inside a title is kept in the written KB line."""
    title = 'J PHYS G: NUCL PART PHYS'
    target = tmpdir.join('keeps_colons.kb')

    with KbWriter(str(target)) as kb:
        kb.add_entry(title, 'J.Phys.')

    # The title is already in its final form, so it must come back verbatim.
    assert [
        'J PHYS G: NUCL PART PHYS---J.Phys.\n',
    ] == target.readlines()
def create_journal_kb_file():
    """Populate refextract's journal KB from the database.

    Runs two raw queries, written with PostgreSQL-specific syntax, over the
    records whose ``_collections`` contain ``Journals`` and writes the
    results to the file configured as ``REFEXTRACT_JOURNAL_KB_PATH``. The
    file has the format that refextract expects, a list of lines like::

        SOURCE---DESTINATION

    meaning that ``SOURCE`` is translated to ``DESTINATION`` when found.

    For every journal the short title is mapped to itself, and the journal
    title and each title variant are mapped to the short title.

    Note that refextract expects ``SOURCE`` to be normalized, which means
    removing all non alphanumeric characters, collapsing all contiguous
    whitespace to one space and uppercasing the resulting string.
    """
    titles_sql = (
        " SELECT r.json -> 'short_title' AS short_title,"
        " r.json -> 'journal_title' -> 'title' AS journal_title"
        " FROM records_metadata AS r"
        " WHERE (r.json -> '_collections')::jsonb ? 'Journals' "
    )
    title_variants_sql = (
        " SELECT r.json -> 'short_title' AS short_title,"
        " jsonb_array_elements((r.json -> 'title_variants')::jsonb)"
        " AS title_variant"
        " FROM records_metadata AS r"
        " WHERE (r.json -> '_collections')::jsonb ? 'Journals' "
    )

    kb_path = current_app.config['REFEXTRACT_JOURNAL_KB_PATH']

    # Both queries are issued before the KB file is opened for writing.
    titles_query = db.session.execute(titles_sql)
    title_variants_query = db.session.execute(title_variants_sql)

    with KbWriter(kb_path=kb_path) as kb_writer:
        for row in titles_query:
            # First the short title onto itself, then the full journal title.
            for column in ('short_title', 'journal_title'):
                kb_writer.add_entry(
                    value=row[column],
                    kb_key=row['short_title'],
                )

        # jsonb_array_elements yields one row per title variant.
        for row in title_variants_query:
            kb_writer.add_entry(
                value=row['title_variant'],
                kb_key=row['short_title'],
            )
def test_kb_writer_two_entries(tmpdir):
    """Check that two entries added in one run are written in order.

    The expected lines show the titles uppercased, with the dot of
    ``J.Testing`` turned into a space, while the key is left untouched.
    """
    target = tmpdir.join('two_entries.kb')
    entries = [
        ('Journal of Testing', 'J.Testing'),
        ('J.Testing', 'J.Testing'),
    ]

    with KbWriter(str(target)) as kb:
        for title, abbreviation in entries:
            kb.add_entry(title, abbreviation)

    assert [
        'JOURNAL OF TESTING---J.Testing\n',
        'J TESTING---J.Testing\n',
    ] == target.readlines()