def test_infer_cardinals():
    """Check that the CARDINALS_HASH table group is classified as "cardinal"."""
    group = get_table_group_by_hash(CARDINALS_HASH,
                                    vectorization_type="lemmatize")
    merged = join_tables(group)
    # The result is not inspected; the call is kept so lemmatization still
    # runs on the joined table as part of this test.
    lemmatize_table(merged)
    inferred = infer_table_class(merged, skip_header=True)
    assert inferred == "cardinal"
def test_rdfize_table():
    """Run rdfize_table over every lazily loaded table group, dumping results.

    For each (hash, table group) pair that has not been processed yet, the
    joined table is written to ``table/<hash>`` and the output of
    ``rdfize_table`` to ``run/<hash>``; the hash is then recorded as processed.
    Groups whose joined table has more than 10 columns, or that contain more
    than 1000 tables, are skipped.

    NOTE(review): another ``test_rdfize_table`` is defined later in this file.
    The later definition rebinds the module-level name, so this function is
    shadowed unless one of the two is renamed.
    """
    max_columns = 10
    max_tables = 1000

    hash_count = get_hash_count(vectorization_type="lemmatize")
    table_groups = load_table_groups_lazy(vectorization_type="lemmatize")
    # enumerate() advances the progress counter on every iteration, including
    # the skip paths below, so the printed progress no longer lags behind.
    for current_count, (_hash, table_group) in enumerate(table_groups):
        print("Processing {} out of {}".format(current_count, hash_count))
        if is_hash_to_processed(_hash):
            continue

        joined_table = join_tables(table_group)
        # Skip tables with more than `max_columns` columns.
        if len(joined_table[0]) > max_columns:
            continue
        # Skip table groups with more than `max_tables` tables.
        if len(table_group) > max_tables:
            continue

        # Context managers close the files even if a write raises.
        with open("table/{}".format(str(_hash)), "wb") as table_file:
            table_file.write(str(joined_table).encode("utf-8"))

        rdf = rdfize_table(joined_table)
        # Written in binary mode without encoding, so rdf is presumably
        # bytes -- confirm against rdfize_table.
        with open("run/{}".format(str(_hash)), "wb") as rdf_file:
            rdf_file.write(rdf)

        add_hash_to_processed(_hash)
def test_infer_column_name():
    """Check the inferred names of the first two columns of the example table."""
    group = get_table_group_by_hash(EXAMPLE_HASH,
                                    vectorization_type="lemmatize")
    columns = columnize_table(join_tables(group))
    # Expected names, in column order; compared as bytes.
    expected_names = (b"label", b"place")
    for index, expected in enumerate(expected_names):
        assert infer_column_name(columns[index]) == expected
def test_infer_table_class():
    """Check that a class is inferred for the example table with rows=5 and rows=30."""
    group = get_table_group_by_hash(EXAMPLE_HASH,
                                    vectorization_type="lemmatize")
    merged = join_tables(group)
    for row_limit in (5, 30):
        inferred = infer_table_class(merged, rows=row_limit)
        assert inferred is not None
def test_join():
    """Smoke test: join each table group, print a preview and its inferred class.

    No assertions are made; the output is meant to be inspected by hand.

    NOTE(review): a second ``test_join`` is defined later in this file and
    rebinds the name, so this definition is shadowed.
    """
    groups = load_table_groups_lazy(vectorization_type="lemmatize")
    for group_hash, group in groups:
        print(group_hash)
        merged = join_tables(group)
        # Preview only the first 10 rows of the joined table.
        preview = merged[:10]
        pprinter.pprint(preview)
        category = infer_table_class_by_category(merged, skip_header=True)
        print("Table class: %s" % (category))
def test_rdfize_table():
    """Smoke test: rdfize_table runs on the joined CARDINALS_HASH table group.

    The result is not inspected; the test only fails if a call raises.
    """
    group = get_table_group_by_hash(CARDINALS_HASH,
                                    vectorization_type="lemmatize")
    rdfize_table(join_tables(group))
def test_rdfize_table_test_1():
    """Smoke test: rdfize_table runs on the joined TEST_HASH_1 table group.

    The result is not inspected; the test only fails if a call raises.
    """
    group = get_table_group_by_hash(TEST_HASH_1,
                                    vectorization_type="lemmatize")
    rdfize_table(join_tables(group))
def test_get_table_class_uri():
    """Check that the CARDINALS_HASH table resolves to the DBpedia Cardinal URI."""
    group = get_table_group_by_hash(CARDINALS_HASH,
                                    vectorization_type="lemmatize")
    class_uri = get_table_class_uri(join_tables(group))
    assert class_uri == 'http://dbpedia.org/ontology/Cardinal'
def test_join():
    """Smoke test: join every lazily loaded table group and pretty-print it.

    No assertions are made; the test only fails if loading, joining or
    printing raises.

    NOTE(review): an earlier ``test_join`` in this file has the same name;
    this later definition is the one left bound to the module-level name.
    """
    table_groups = load_table_groups_lazy(vectorization_type="lemmatize")
    for _hash, table_group in table_groups:
        print(_hash)
        # Join once and reuse the result; previously the group was joined
        # twice per iteration and the first result was thrown away.
        joined_table = join_tables(table_group)
        pprinter.pprint(joined_table)
def test_infer_table_properties():
    """Check the property inferred at index 1 of the example table."""
    # Original author's note: "table, rows=30, skip_header=False" --
    # presumably the parameters of infer_table_properties; confirm against
    # its definition.
    group = get_table_group_by_hash(EXAMPLE_HASH,
                                    vectorization_type="lemmatize")
    inferred = infer_table_properties(join_tables(group))
    assert inferred[1] == "http://dbpedia.org/ontology/type"
def test_infer_porn_actors():
    """Check that the PORN_ACTORS_HASH table is categorised as b'living_people'."""
    group = get_table_group_by_hash(PORN_ACTORS_HASH,
                                    vectorization_type="lemmatize")
    merged = join_tables(group)
    inferred = infer_table_class_by_category(merged, skip_header=True)
    # The expected category is compared as bytes.
    assert inferred == b'living_people'
def test_infer():
    """Pin the class inferred for the PROBLEMATIC_HASH table (rows=5, header skipped)."""
    group = get_table_group_by_hash(PROBLEMATIC_HASH,
                                    vectorization_type="lemmatize")
    merged = join_tables(group)
    inferred = infer_table_class(merged, rows=5, skip_header=True)
    # The original author annotated this assertion with "This is wrong";
    # presumably "aegean_sea" is a known misclassification -- confirm.
    assert inferred == "aegean_sea"
def test_join_tables():
    """Check that joining TEST_TABLES yields exactly DEDUPLICATED_TABLE."""
    assert join_tables(TEST_TABLES) == DEDUPLICATED_TABLE