def _parse_stuff_to_db(fname, db):
    """Create the tables in ``db`` and parse the wikidump ``fname`` into it.

    ``fname`` is resolved relative to the directory that contains this
    module. The contents of the file at ``createtables_path()`` are run
    through ``executescript`` on a cursor of ``db``, after which
    ``parse_dump`` is called on the dump with ``N=2``.

    Returns the same ``db`` object that was passed in.
    """
    cursor = db.cursor()
    with open(createtables_path()) as schema_file:
        schema_sql = schema_file.read()
    cursor.executescript(schema_sql)

    here = dirname(abspath(__file__))
    dump_path = join(here, fname)
    parse_dump(dump_path, db, N=2)
    return db
def test_parse_dump():
    """Parse the sample dump with ``N=None`` and look up one known title.

    A fresh in-memory SQLite database is initialised from the script at
    ``createtables_path()``; after ``parse_dump`` has run, the title
    "Heinrich Tessenow" must appear both as an ``ngram`` in ``ngrams`` and
    as a ``target`` in ``linkstats``.
    """
    conn = sqlite3.connect(":memory:")
    cursor = conn.cursor()
    with open(createtables_path()) as schema_file:
        schema_sql = schema_file.read()
    cursor.executescript(schema_sql)

    # The sample dump lives next to this module.
    here = dirname(abspath(__file__))
    sample_path = join(here, "nlwiki-20140927-pages-articles-sample.xml")
    parse_dump(sample_path, conn, N=None)

    # Each query yields two-column rows, which map directly onto dicts.
    cursor.execute("select ngram, tf from ngrams;")
    ngram_count = dict(cursor.fetchall())
    cursor.execute("select target, count from linkstats;")
    link_count = dict(cursor.fetchall())

    assert_in("Heinrich Tessenow", ngram_count)
    assert_in("Heinrich Tessenow", link_count)
def test_parse_dump():
    """Check that a known title ends up in both ``ngrams`` and ``linkstats``.

    The sample dump stored next to this module is parsed with ``N=None``
    into an in-memory SQLite database whose tables are created from the
    script at ``createtables_path()``.
    """
    connection = sqlite3.connect(':memory:')
    cursor = connection.cursor()
    with open(createtables_path()) as ddl:
        cursor.executescript(ddl.read())

    module_dir = dirname(abspath(__file__))
    parse_dump(
        join(module_dir, 'nlwiki-20140927-pages-articles-sample.xml'),
        connection,
        N=None,
    )

    # Two-column result rows become {first column: second column} dicts.
    ngram_rows = cursor.execute('select ngram, tf from ngrams;').fetchall()
    link_rows = cursor.execute(
        'select target, count from linkstats;').fetchall()
    ngram_count = dict(ngram_rows)
    link_count = dict(link_rows)

    title = 'Heinrich Tessenow'
    assert_in(title, ngram_count)
    assert_in(title, link_count)
def test_parse_dump_ngrams():
    """Parse the test dump with ``N=2`` and check n-gram and link counts.

    An in-memory SQLite database is initialised from the script at
    ``createtables_path()`` and filled by ``parse_dump``. The test then
    checks that a two-word n-gram is present in ``ngrams``, that a link
    target is present in ``linkstats``, and that the count stored for the
    target 'AMX Index' is greater than zero.
    """
    db = sqlite3.connect(':memory:')
    cur = db.cursor()
    with open(createtables_path()) as create:
        cur.executescript(create.read())

    dump = _test_dump_path()
    parse_dump(dump, db, N=2)

    # Each query yields two-column rows, so dict() maps column 1 -> column 2.
    ngram_count = dict(cur.execute('select ngram, tf from ngrams;'))
    link_count = dict(cur.execute('select target, count from linkstats;'))

    # Non-ASCII characters are spelled as escapes in plain u'' literals: the
    # ``ur`` prefix is a syntax error on Python 3, and a raw u-umlaut in the
    # source makes the value depend on the file's encoding declaration.
    assert_in(u'van M\xfcnchen', ngram_count)
    assert_in(u'Vrede van M\xfcnster', link_count)

    # Earlier form of the check below, which indexed link_count by a pair of
    # strings; kept for reference.
    # assert_greater(link_count[('AMX Index', 'Amsterdam Midkap Index')], 0)
    assert_greater(link_count['AMX Index'], 0)
def test_parse_dump_ngrams():
    """Parse the test dump with ``N=2`` and check n-gram and link counts.

    An in-memory SQLite database is initialised from the script at
    ``createtables_path()`` and filled by ``parse_dump``. The test then
    checks that a two-word n-gram is present in ``ngrams``, that a link
    target is present in ``linkstats``, and that the count stored for the
    target "AMX Index" is greater than zero.
    """
    db = sqlite3.connect(":memory:")
    cur = db.cursor()
    with open(createtables_path()) as create:
        cur.executescript(create.read())

    dump = _test_dump_path()
    parse_dump(dump, db, N=2)

    # Each query yields two-column rows, so dict() maps column 1 -> column 2.
    ngram_count = dict(cur.execute("select ngram, tf from ngrams;"))
    link_count = dict(cur.execute("select target, count from linkstats;"))

    # Non-ASCII characters are spelled as escapes in plain u"" literals: the
    # ``ur`` prefix is a syntax error on Python 3, and a raw u-umlaut in the
    # source makes the value depend on the file's encoding declaration.
    assert_in(u"van M\xfcnchen", ngram_count)
    assert_in(u"Vrede van M\xfcnster", link_count)

    # Earlier form of the check below, which indexed link_count by a pair of
    # strings; kept for reference.
    # assert_greater(link_count[('AMX Index', 'Amsterdam Midkap Index')], 0)
    assert_greater(link_count["AMX Index"], 0)