def main(path_to_serialized_model):
    # Deserialize model
    network = fieldnetwork.deserialize_network(path_to_serialized_model)
    # Create client
    store_client = StoreHandler()
    # Load glove model
    print("Loading language model...")
    path_to_glove_model = "../glove/glove.6B.100d.txt"
    glove_api.load_model(path_to_glove_model)
    print("Loading language model...OK")
    # Retrieve indexes
    schema_sim_index = io.deserialize_object(path_to_serialized_model + 'schema_sim_index.pkl')
    content_sim_index = io.deserialize_object(path_to_serialized_model + 'content_sim_index.pkl')

    om = SSAPI(network, store_client, schema_sim_index, content_sim_index)
    om.add_krs([("dbpedia", "cache_onto/dbpedia.pkl")], parsed=True)

    matchings = om.find_matchings()

    print("Found: " + str(len(matchings)))
    for m in matchings:
        print(m)

    return om
def test(path_to_serialized_model):
    # Load glove model
    print("Loading language model...")
    path_to_glove_model = "../glove/glove.6B.100d.txt"
    glove_api.load_model(path_to_glove_model)
    print("Loading language model...OK")

    total_tables = 0
    avg_attrs_per_table = 0
    avg_groups_per_table = 0
    for db, t, attrs in read_table_columns(path_to_serialized_model):
        total_tables += 1
        groups = extract_cohesive_groups(t, attrs)
        avg_attrs_per_table += len(attrs)
        avg_groups_per_table += len(groups)
        print("Table: " + str(t))
        print("num groups: " + str(len(groups)))
        for score, tokens in groups:
            print("Score: " + str(score))
            print(tokens)
        print("#####")

    avg_attrs_per_table = avg_attrs_per_table / total_tables
    avg_groups_per_table = avg_groups_per_table / total_tables
    print("Avg attr per table: " + str(avg_attrs_per_table))
    print("Avg group per table: " + str(avg_groups_per_table))
def test_find_links(path_to_serialized_model, matchings):
    # Deserialize model
    network = fieldnetwork.deserialize_network(path_to_serialized_model)
    # Create client
    store_client = StoreHandler()
    # Load glove model
    print("Loading language model...")
    path_to_glove_model = "../glove/glove.6B.100d.txt"
    glove_api.load_model(path_to_glove_model)
    print("Loading language model...OK")
    # Retrieve indexes
    schema_sim_index = io.deserialize_object(path_to_serialized_model + 'schema_sim_index.pkl')
    content_sim_index = io.deserialize_object(path_to_serialized_model + 'content_sim_index.pkl')

    om = SSAPI(network, store_client, schema_sim_index, content_sim_index)
    om.add_krs([("efo", "cache_onto/efo.pkl")], parsed=True)
    om.add_krs([("clo", "cache_onto/clo.pkl")], parsed=True)
    om.add_krs([("bao", "cache_onto/bao.pkl")], parsed=True)

    links = om.find_links(matchings)
    for link in links:
        print(link)
def generate_matchings(input_model_path, input_ontology_name_path, output_file):
    # Deserialize model
    network = fieldnetwork.deserialize_network(input_model_path)
    # Create client
    store_client = StoreHandler()
    # Load glove model
    print("Loading language model...")
    path_to_glove_model = "../glove/glove.6B.100d.txt"
    glove_api.load_model(path_to_glove_model)
    print("Loading language model...OK")
    # Retrieve indexes
    schema_sim_index = io.deserialize_object(input_model_path + 'schema_sim_index.pkl')
    content_sim_index = io.deserialize_object(input_model_path + 'content_sim_index.pkl')
    # Create ontomatch api
    om = SSAPI(network, store_client, schema_sim_index, content_sim_index)
    for onto_name, onto_parsed_path in input_ontology_name_path:
        # Load parsed ontology
        om.add_krs([(onto_name, onto_parsed_path)], parsed=True)

    matchings = om.find_matchings()

    with open(output_file, 'w') as f:
        for m in matchings:
            f.write(str(m) + '\n')
    print("Done!")
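# Example usage sketch for generate_matchings (assumption: the model directory,
# ontology cache file, and output path below are illustrative placeholders, not
# fixtures shipped with this repository):
#
#   generate_matchings("../models/mymodel/",
#                      [("efo", "cache_onto/efo.pkl")],
#                      "matchings_efo.txt")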
def test(path_to_serialized_model):
    # NOTE: this redefines the test() function declared above; at module load
    # time only this later definition remains bound to the name.
    # Deserialize model
    network = fieldnetwork.deserialize_network(path_to_serialized_model)
    # Create client
    store_client = StoreHandler()
    # Load glove model
    print("Loading language model...")
    path_to_glove_model = "../glove/glove.6B.100d.txt"
    glove_api.load_model(path_to_glove_model)
    print("Loading language model...OK")
    # Retrieve indexes
    schema_sim_index = io.deserialize_object(path_to_serialized_model + 'schema_sim_index.pkl')
    content_sim_index = io.deserialize_object(path_to_serialized_model + 'content_sim_index.pkl')
    # Create ontomatch api
    om = SSAPI(network, store_client, schema_sim_index, content_sim_index)
    # Load parsed ontology
    om.add_krs([("efo", "cache_onto/efo.pkl")], parsed=True)
    om.add_krs([("clo", "cache_onto/clo.pkl")], parsed=True)
    om.add_krs([("bao", "cache_onto/bao.pkl")], parsed=True)
    # om.add_krs([("go", "cache_onto/go.pkl")], parsed=True)  # parse again

    print("Finding matchings...")
    st = time.time()
    matchings = om.find_matchings()
    et = time.time()
    print("Finding matchings...OK")
    print("Took: " + str(et - st))
    for k, v in matchings:
        print(v)
    return om
def test_find_semantic_sim():
    # Load onto
    om = SSAPI(None, None, None, None)
    # Load parsed ontology
    om.add_krs([("dbpedia", "cache_onto/schemaorg.pkl")], parsed=True)

    # Load glove model
    print("Loading language model...")
    path_to_glove_model = "../glove/glove.6B.100d.txt"
    glove_api.load_model(path_to_glove_model)
    print("Loading language model...OK")

    print("Loading ontology classes...")
    names = []
    # Load classes
    for kr_name, kr_handler in om.kr_handlers.items():
        all_classes = kr_handler.classes()
        for cl in all_classes:
            cl = nlp.camelcase_to_snakecase(cl)
            cl = cl.replace('-', ' ')
            cl = cl.replace('_', ' ')
            cl = cl.lower()
            svs = []
            for token in cl.split():
                if token not in stopwords.words('english'):
                    sv = glove_api.get_embedding_for_word(token)
                    if sv is not None:
                        svs.append(sv)
            names.append(('class', cl, svs))
    print("Loading ontology classes...OK")

    while True:
        # Get words
        i = input("Enter two words separated by a space to get their similarity (EXIT to quit): ")
        tokens = i.split(' ')
        if tokens[0] == "EXIT":
            print("bye!")
            break
        svs = []
        for t in tokens:
            sv = glove_api.get_embedding_for_word(t)
            if sv is not None:
                svs.append(sv)
            else:
                print("No vec for : " + str(t))
        for _, cl, vecs in names:
            sim = SS.compute_semantic_similarity(svs, vecs)
            if sim > 0.4:
                print(str(cl) + " -> " + str(sim))
if __name__ == "__main__":
    print("No matcher")

    # Deserialize model
    path_to_serialized_model = "../models/testwithserialdwh/"
    network = fieldnetwork.deserialize_network(path_to_serialized_model)
    # Create client
    store_client = StoreHandler()
    # Load glove model
    print("Loading language model...")
    path_to_glove_model = "../glove/glove.6B.100d.txt"
    glove_api.load_model(path_to_glove_model)
    print("Loading language model...OK")

    ref_path = "/Users/ra-mit/development/discovery_proto/ontomatch/enwiki-latest-all-titles-in-ns0"

    def gen(ref_path):
        i = 0
        with open(ref_path, 'r') as f:
            for l in f:
                i += 1
                if i % 50000 == 0:
                    print(i)
                yield l

    pos, neg = find_matching_to_text(network, reference_name="wikipedia",
    # Expected invocation (argument order inferred from the assignments below):
    #   python <this_script>.py <model_dir> <onto_name> <parsed_onto_pkl> \
    #          <glove_model_txt> <results_out> <gold_standard>
    path_to_serialized_model = argv[1]
    onto_name = argv[2]
    path_to_ontology = argv[3]
    path_to_sem_model = argv[4]
    path_to_results = argv[5]
    path_to_gold_standard = argv[6]

    # Deserialize model
    network = fieldnetwork.deserialize_network(path_to_serialized_model)
    # Create client
    store_client = StoreHandler()
    # Load glove model
    print("Loading language model...")
    glove_api.load_model(path_to_sem_model)
    print("Loading language model...OK")
    # Retrieve indexes
    schema_sim_index = io.deserialize_object(path_to_serialized_model + 'schema_sim_index.pkl')
    content_sim_index = io.deserialize_object(path_to_serialized_model + 'content_sim_index.pkl')
    # Create ontomatch api
    om = SSAPI(network, store_client, schema_sim_index, content_sim_index)
    # Load parsed ontology
    om.add_krs([(onto_name, path_to_ontology)], parsed=True)

    # Build content sim
    om.priv_build_content_sim(0.6)
def test_4_n_42(path_to_serialized_model):
    # Deserialize model
    network = fieldnetwork.deserialize_network(path_to_serialized_model)
    # Create client
    store_client = StoreHandler()
    # Load glove model
    print("Loading language model...")
    path_to_glove_model = "../glove/glove.6B.100d.txt"
    glove_api.load_model(path_to_glove_model)
    print("Loading language model...OK")
    # Retrieve indexes
    schema_sim_index = io.deserialize_object(path_to_serialized_model + 'schema_sim_index.pkl')
    content_sim_index = io.deserialize_object(path_to_serialized_model + 'content_sim_index.pkl')
    # Create ontomatch api
    om = SSAPI(network, store_client, schema_sim_index, content_sim_index)
    # Load parsed ontology
    # om.add_krs([("efo", "cache_onto/efo.pkl")], parsed=True)
    # om.add_krs([("clo", "cache_onto/clo.pkl")], parsed=True)
    # om.add_krs([("bao", "cache_onto/bao.pkl")], parsed=True)
    om.add_krs([("dbpedia", "cache_onto/dbpedia.pkl")], parsed=True)  # parse again

    # L6: [Relations] -> [Class names] (semantic groups)
    print("Finding L6 matchings...")
    st = time.time()
    l6_matchings, sem_coh_groups = matcherlib.find_sem_coh_matchings(om.network, om.kr_handlers)
    print("Finding L6 matchings...OK, " + str(len(l6_matchings)) + " found")
    et = time.time()
    print("Took: " + str(et - st))

    for m in l6_matchings:
        print(m)

    for k, v in sem_coh_groups.items():
        print(str(k) + " -> " + str(v))

    exit()  # NOTE: early exit; everything below is unreachable while this call remains

    print("Finding matchings...")
    st = time.time()

    # L4: [Relation names] -> [Class names] (syntax)
    print("Finding L4 matchings...")
    st = time.time()
    l4_matchings = matcherlib.find_relation_class_name_matchings(om.network, om.kr_handlers)
    print("Finding L4 matchings...OK, " + str(len(l4_matchings)) + " found")
    et = time.time()
    print("Took: " + str(et - st))

    print("computing fanout")
    fanout = defaultdict(int)
    for m in l4_matchings:
        sch, cla = m
        fanout[sch] += 1
    ordered = sorted(fanout.items(), key=operator.itemgetter(1), reverse=True)
    for o in ordered:
        print(o)

    # for match in l4_matchings:
    #     print(match)

    # L4.2: [Relation names] -> [Class names] (semantic)
    print("Finding L42 matchings...")
    st = time.time()
    l42_matchings = matcherlib.find_relation_class_name_sem_matchings(om.network, om.kr_handlers)
    print("Finding L42 matchings...OK, " + str(len(l42_matchings)) + " found")
    et = time.time()
    print("Took: " + str(et - st))

    et = time.time()
    print("Finding matchings...OK")
    print("Took: " + str(et - st))

    print("are l4 subsumed by l42?")
    not_in_l42 = 0
    not_subsumed = []
    for m in l4_matchings:
        if m not in l42_matchings:
            not_in_l42 += 1
            not_subsumed.append(m)
    print("NOT-subsumed: " + str(not_in_l42))

    """
    # L5: [Attribute names] -> [Class names] (syntax)
    print("Finding L5 matchings...")
    st = time.time()
    l5_matchings = matcherlib.find_relation_class_attr_name_matching(om.network, om.kr_handlers)
    print("Finding L5 matchings...OK, " + str(len(l5_matchings)) + " found")
    et = time.time()
    print("Took: " + str(et - st))

    # for match in l5_matchings:
    #     print(match)

    # l52_matchings = []

    # L52: [Attribute names] -> [Class names] (semantic)
    print("Finding L52 matchings...")
    st = time.time()
    l52_matchings = matcherlib.find_relation_class_attr_name_sem_matchings(om.network, om.kr_handlers)
    print("Finding L52 matchings...OK, " + str(len(l52_matchings)) + " found")
    et = time.time()
    print("Took: " + str(et - st))
    """

    with open('OUTPUT_442_only', 'w') as f:
        f.write("L4" + '\n')
        for m in l4_matchings:
            f.write(str(m) + '\n')
        f.write("L42" + '\n')
        for m in l42_matchings:
            f.write(str(m) + '\n')
        f.write("L5" + '\n')
def add_language_model(self, path_to_sem_model):
    print("Loading language model...")
    glove_api.load_model(path_to_sem_model)
    print("Loading language model...OK")
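# Usage sketch (assumption: `om` is an SSAPI instance exposing add_language_model,
# and the GloVe vectors path below is illustrative, not bundled with the repo):
#
#   om.add_language_model("../glove/glove.6B.100d.txt")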