def execute(graph1, dim):
    """Train word2vec on graph1's corpus and attach one vector per graph element.

    Elements whose lower-cased descriptor is not in the model vocabulary get the
    embedding of the dedicated out-of-vocabulary token "<>".
    Returns PipelineDataTuple(graph1) in all cases (also when there is no corpus).
    """
    if graph1.corpus is None:
        # Nothing to train on; pass the graph through unchanged.
        print("!!! Graph has no corpus !!!")
        return PipelineDataTuple(graph1)
    # CBOW (sg=0) with a very large window so whole walk-sentences act as context.
    model = word2vec_embedding_from_sentences_v2(graph1.corpus, CONFIGURATION, sg=0, size=dim, window=500)
    for descriptor, resource in graph1.elements.items():
        try:
            resource.embeddings.append(np.array(model[descriptor.lower()]).astype(float).tolist())
        except KeyError:
            # Fall back to the OOV token's vector so every element has an embedding.
            resource.embeddings.append(np.array(model["<>"]).astype(float).tolist())
            print("Key " + descriptor + " not found ... proceeding")
    return PipelineDataTuple(graph1)
def exec(graph1, graph2):
    """Materialize prepared train/test DataFrames for the downstream matcher.

    For every raw trainset, save() serializes string-combined features and ids to
    rundir; the two CSVs are merged on their row index, persisted as *_merged.csv
    and appended to CONFIGURATION.gold_mapping.prepared_trainsets. Testsets are
    handled the same way, unless match_cross_product is set, in which case a
    blocked cross product is generated into a temp file and registered as the
    only raw testset instead.
    """
    for gold_mapping in CONFIGURATION.gold_mapping.raw_trainsets:
        print(" --> Preparing training data.")
        # save() writes <basename>-strcombined.csv and <basename>-strcombined_ids.csv.
        save(graph1, graph2, ntpath.basename(gold_mapping), gold_mapping)
        path_to_set = CONFIGURATION.rundir + ntpath.basename(gold_mapping) + "-strcombined.csv"
        path_to_idset = CONFIGURATION.rundir + ntpath.basename(gold_mapping) + "-strcombined_ids.csv"
        # Features and ids share the same row index ('Unnamed: 0'), so an
        # index-on-index merge re-unites them one row per candidate pair.
        df = pd.read_csv(path_to_set, index_col=['Unnamed: 0'], sep="\t", encoding="UTF-8").\
            merge(pd.read_csv(path_to_idset, index_col=['Unnamed: 0'], sep="\t", encoding="UTF-8"),
                  left_index=True, right_index=True)
        df.to_csv(CONFIGURATION.rundir + ntpath.basename(gold_mapping) + "_merged.csv", sep="\t")
        CONFIGURATION.gold_mapping.prepared_trainsets.append(df)
    if CONFIGURATION.match_cross_product:
        print(" --> No testset provided. Preparing cross product.")
        # Syntactic blocking over the full cross product; result lands in a temp file
        # that then serves as the raw testset.
        filepath = CONFIGURATION.rundir + str(uuid.uuid4().hex) + ".tmp"
        print(' Blocking by syntax, progress: 0%', end="\r")
        parallel.main(CONFIGURATION.src_triples, CONFIGURATION.tgt_triples, CONFIGURATION.src_properties, filepath)
        print(' Blocking by syntax, progress: 100%')
        CONFIGURATION.gold_mapping.raw_testsets = [filepath]
    else:
        print(" --> Preparing testset.")
        for gold_mapping in CONFIGURATION.gold_mapping.raw_testsets:
            # Same save-then-merge procedure as for the trainsets above.
            save(graph1, graph2, ntpath.basename(gold_mapping), gold_mapping)
            path_to_set = CONFIGURATION.rundir + ntpath.basename(gold_mapping) + "-strcombined.csv"
            path_to_idset = CONFIGURATION.rundir + ntpath.basename(gold_mapping) + "-strcombined_ids.csv"
            df = pd.read_csv(path_to_set, index_col=['Unnamed: 0'], sep="\t", encoding="UTF-8").\
                merge(pd.read_csv(path_to_idset, index_col=['Unnamed: 0'], sep="\t", encoding="UTF-8"),
                      left_index=True, right_index=True)
            df.to_csv(CONFIGURATION.rundir + ntpath.basename(gold_mapping) + "_merged.csv", sep="\t")
            CONFIGURATION.gold_mapping.prepared_testsets.append(df)
    # Final step in the pipeline: hand the original graphs through unchanged.
    return PipelineDataTuple(graph1, graph2)
def execute(graph1, graph2, dim):
    """Embed both graphs in one shared vector space.

    Documents from graph1 and graph2 are pooled, a single model of the given
    dimensionality is trained on the union, and the resulting vectors are
    written back into each graph's elements.
    """
    pooled_documents = prepare_data(graph1) + prepare_data(graph2)
    shared_model = train(pooled_documents, dim)
    for graph in (graph1, graph2):
        fill_graph(graph, shared_model)
    return PipelineDataTuple(graph1, graph2)
def execute(graph):
    """Concatenate each resource's embeddings into a single flat vector.

    After this step every resource carries exactly one embedding: the
    concatenation of all its previous (possibly multiple) embedding vectors.
    Returns PipelineDataTuple(graph).
    """
    for descriptor, resource in graph.elements.items():
        # Flatten in one pass; the original `tmp = tmp + [num]` rebuilt the
        # accumulator list per element, which is quadratic in total length.
        flat = [num for embedding in resource.embeddings for num in embedding]
        resource.embeddings = [flat]
    return PipelineDataTuple(graph)
def execute(graph1, graph2, dim, properties):
    """Train one shared model on per-property documents of both graphs.

    prepare_data yields (predicates, documents) per graph; the document sets
    are pooled for training, then each graph is filled using its own predicates.
    """
    predicates1, docs_from_src = prepare_data(graph1, properties)
    predicates2, docs_from_tgt = prepare_data(graph2, properties)
    shared_model = train(docs_from_src + docs_from_tgt, dim)
    fill_graph(graph1, shared_model, predicates1)
    fill_graph(graph2, shared_model, predicates2)
    return PipelineDataTuple(graph1, graph2)
def load_kg_with_rdflib(path, format=None):
    """Parse an RDF file with rdflib and expose its triples as two generators.

    Returns PipelineDataTuple(object-triples generator, literal-triples generator).
    `format` is forwarded to rdflib's parser (None lets rdflib guess).
    """
    kg = Graph()
    with open(path, 'rb') as source:
        kg.parse(source, format=format)
    return PipelineDataTuple(__yield_object(kg), __yield_literal(kg))
def interface(main_input, args, configuration):
    """Pipeline adapter: run execute() on one or both input graphs.

    With a single graph the result of execute(graph1) is returned directly;
    with two graphs each is processed independently and the first element of
    each result tuple is re-wrapped into a new PipelineDataTuple.
    """
    global CONFIGURATION
    CONFIGURATION = configuration
    graph1 = main_input.get(0)
    graph2 = main_input.get(1)
    assert graph1 is not None, "Graph not found in " + os.path.basename(sys.argv[0])
    if graph2 is None:
        return execute(graph1)
    first_result = execute(graph1).elems[0]
    second_result = execute(graph2).elems[0]
    return PipelineDataTuple(first_result, second_result)
def exec(graph1, graph2):
    """Persist the first raw trainset under the name 'train' and register it.

    The string-combined CSV path produced by save() is appended to the
    prepared trainsets. Returns the original graphs unchanged — this is
    assumed to be the final step in the pipeline.
    """
    first_trainset = CONFIGURATION.gold_mapping.raw_trainsets[0]
    save(graph1, graph2, 'train', first_trainset)
    combined_csv = CONFIGURATION.rundir + 'train' + "-strcombined.csv"
    CONFIGURATION.gold_mapping.prepared_trainsets.append(combined_csv)
    return PipelineDataTuple(graph1, graph2)
def interface(main_input, args, configuration):
    """Pipeline adapter: assemble a Graph from triple generators and an NT path.

    Expects the S-P-O generator at main_input[0], the S-P-L generator at
    main_input[1] and the NT source-file path as the first argument.
    """
    global CONFIGURATION
    CONFIGURATION = configuration
    nt_filepath = args.get(0)
    spo_generator = main_input.get(0)
    spl_generator = main_input.get(1)
    caller = os.path.basename(sys.argv[0])
    assert spo_generator is not None, "S-P-O generator not found in " + caller
    assert spl_generator is not None, "S-P-L generator not found in " + caller
    assert nt_filepath is not None, "Path to NT-sourcefile not found in " + caller
    return PipelineDataTuple(Graph(spo_generator, spl_generator, nt_filepath))
def execute(graph1, graph2, dim, sentence_generation_method, ngrams=False, maxdepth=1):
    """Generate walk sentences for both graphs, train one model, fill both graphs.

    sentence_generation_method and maxdepth control walk generation; ngrams is
    forwarded both to sentence generation and to training.
    """
    walk_sentences = prepare_data(graph1, sentence_generation_method, ngrams, maxdepth)
    walk_sentences = walk_sentences + prepare_data(graph2, sentence_generation_method, ngrams, maxdepth)
    embedding_model = train(walk_sentences, dim, ngrams)
    for graph in (graph1, graph2):
        fill_graph(graph, embedding_model)
    return PipelineDataTuple(graph1, graph2)
def execute(graph1, graph2, dim):
    """Train document embeddings over both graphs and write them back.

    A dedicated out-of-vocabulary document "<>" is appended (using the global
    running counter `ctr` as its id) before training, the descriptor->id map is
    dumped to rundir/document_ids.csv, and both graphs are filled from the model.
    """
    global ctr
    documents, documents_ids = prepare_data(graph1, dict(), list())
    documents, documents_ids = prepare_data(graph2, documents_ids, documents)
    # Register the OOV sentinel so unseen descriptors still resolve to a vector.
    documents.append(["<>", "<>"])
    documents_ids["<>"] = ctr
    ctr += 1
    model = train(documents, dim)
    with open(CONFIGURATION.rundir + "document_ids.csv", mode="w+") as id_file:
        for descriptor, index in documents_ids.items():
            id_file.write(f"{descriptor},{index}\n")
    for graph in (graph1, graph2):
        fill_graph(graph, model, documents_ids)
    return PipelineDataTuple(graph1, graph2)
def exec(graph1, graph2, matchings_filename):
    """Convert a matchings CSV into an OAEI alignment XML file.

    Reads rundir/<matchings_filename>, creates a directory named after the file
    (without .csv) and writes darkscape~oldschoolrunescape~results.xml there,
    one <Cell> (via create_elem) per matched src_id/tgt_id pair.
    Returns PipelineDataTuple(graph1, graph2).
    """
    married_matches = pd.read_csv(CONFIGURATION.rundir + matchings_filename, sep="\t", encoding="UTF-8")
    starttag = '<?xml version="1.0" encoding="utf-8"?>\n<rdf:RDF xmlns="http://knowledgeweb.semanticweb.org/heterogeneity/alignment"\n xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"\n xmlns:xsd="http://www.w3.org/2001/XMLSchema#">\n<Alignment>\n <xml>yes</xml>\n <level>0</level>\n <type>??</type>\n <onto1>\n <Ontology rdf:about="darkscape">\n <location>http://darkscape.wikia.com</location>\n </Ontology>\n </onto1>\n <onto2>\n <Ontology rdf:about="oldschoolrunescape">\n <location>http://oldschoolrunescape.wikia.com</location>\n </Ontology>\n </onto2>\n'
    endtag = '</Alignment>\n</rdf:RDF>'
    os.mkdir(CONFIGURATION.rundir + matchings_filename.replace(".csv", ""))
    with open(CONFIGURATION.rundir + matchings_filename.replace(".csv", "") + str(os.sep) +
              'darkscape~oldschoolrunescape~results.xml', "w+", encoding="UTF-8") as f:
        f.write(starttag)
        for index, row in married_matches.iterrows():
            # BUGFIX: escape '&' in entity IDs so the output stays well-formed XML.
            # The previous replace("&", "&") was a no-op (an '&amp;' lost in transit).
            f.write(
                create_elem(
                    str(row.src_id).replace("&", "&amp;"),
                    str(row.tgt_id).replace("&", "&amp;")) + "\n")
        f.write(endtag)
    return PipelineDataTuple(graph1, graph2)
def execute(graph, dim=20):
    """Reduce every element's embedding to `dim` dimensions via RBF kernel PCA.

    Builds a matrix with the descriptor in column 0 and the embedding values in
    the remaining columns, fits KernelPCA on the value columns only, re-attaches
    the descriptors as index and writes the reduced vectors back into the graph.
    Returns PipelineDataTuple(graph).
    """
    embeddings = None
    for descriptor, resource in graph.elements.items():
        if embeddings is None:
            embeddings = np.array([[descriptor] + resource.embeddings[0]])
        else:
            embeddings = np.append(embeddings, [[descriptor] + resource.embeddings[0]], axis=0)
    df = pd.DataFrame(embeddings)
    # NOTE: the original called `df.set_index(0)` and discarded the result
    # (set_index is not in-place). The dead call is removed rather than applied,
    # because the code below relies on column 0 still being present (`df[0]`).
    pca = decomposition.KernelPCA(n_components=dim, kernel='rbf')
    # Fit on every column except column 0 (the descriptor).
    reduced_df = pca.fit_transform(
        df[[df.columns[i] for i in range(len(df.columns)) if not i == 0]])
    reduced_df = pd.DataFrame(reduced_df)
    # Re-attach descriptors (stored in a new column `dim`) and index by them.
    reduced_df.loc[:, dim] = df[0]
    reduced_df = reduced_df.set_index(dim)
    for descriptor, reduced_embedding in reduced_df.iterrows():
        graph.elements[descriptor].embeddings[0] = reduced_embedding.tolist()
    return PipelineDataTuple(graph)
def execute(graph1, dim, properties):
    """Train per-predicate embedding models on graph1 and attach the vectors."""
    predicates, property_documents = prepare_data(graph1, properties)
    trained_models = train(property_documents, dim)
    fill_graph(graph1, trained_models, predicates)
    return PipelineDataTuple(graph1)
def execute(graph1, graph2):
    """Align the two graphs' embedding spaces with MUSE.

    Steps: wipe previous MUSE artifacts; export both graphs' embeddings in
    word2vec text format (embeddings1.vec / embeddings2.vec); export the
    positive gold-standard pairs as the src-tgt dictionaries; run align();
    then read MUSE's dumped vectors-src.txt / vectors-tgt.txt back into the
    graphs' elements. Returns PipelineDataTuple(graph1, graph2).
    """
    # --- Clean up artifacts from any previous run -------------------------
    for root, dir, files in os.walk(os.path.join(CONFIGURATION.musedir, "data", "dumped"), topdown=False):
        for name in files:
            os.remove(os.path.join(root, name))
        for name in dir:
            os.rmdir(os.path.join(root, name))
    try:
        os.remove(os.path.join(CONFIGURATION.musedir, "data", "crosslingual", "dictionaries", "src-tgt.txt"))
    except FileNotFoundError:
        pass
    try:
        os.remove(os.path.join(CONFIGURATION.musedir, "data", "embeddings1.vec"))
    except FileNotFoundError:
        pass
    try:
        os.remove(os.path.join(CONFIGURATION.musedir, "data", "embeddings2.vec"))
    except FileNotFoundError:
        pass
    # --- Export graph1's embeddings in word2vec text format ---------------
    f = open(os.path.join(CONFIGURATION.musedir, "data", "embeddings1.vec"), "w+")
    ctr = 0
    dim = None
    for descriptor, resource in graph1.elements.items():
        # One line per element: "<descriptor> v1 v2 ... vn " (brackets/commas stripped).
        f.write(descriptor + " " + str(resource.embeddings).replace(
            "[", "").replace("]", "").replace(",", "") + " \n")
        ctr = ctr + 1
        if dim is None:
            dim = len(resource.embeddings[0])
    f.close()
    # word2vec text format requires "<count> <dim>" as the first line.
    line_pre_adder(
        os.path.join(CONFIGURATION.musedir, "data", "embeddings1.vec"),
        str(ctr) + " " + str(dim) + "\n")
    # --- Same export for graph2 -------------------------------------------
    f = open(os.path.join(CONFIGURATION.musedir, "data", "embeddings2.vec"), "w+")
    ctr = 0
    dim = None
    for descriptor, resource in graph2.elements.items():
        f.write(descriptor + " " + str(resource.embeddings).replace(
            "[", "").replace("]", "").replace(",", "") + " \n")
        ctr = ctr + 1
        if dim is None:
            dim = len(resource.embeddings[0])
    f.close()
    line_pre_adder(
        os.path.join(CONFIGURATION.musedir, "data", "embeddings2.vec"),
        str(ctr) + " " + str(dim) + "\n")
    # --- Build the supervision dictionary from positive gold pairs --------
    gs = None
    for path_to_gs in CONFIGURATION.gold_mapping.raw_trainsets:
        if gs is None:
            gs = pd.read_csv(path_to_gs, header=None, delimiter='\t')
        else:
            tmp_gs = pd.read_csv(path_to_gs, header=None, delimiter='\t')
            gs = gs.append(tmp_gs, ignore_index=True)
    # Keep only positive pairs (label column 2 == 1).
    gs = gs.loc[gs[2] == 1]
    gs.to_csv(os.path.join(CONFIGURATION.musedir, "data", "crosslingual",
                           "dictionaries", "src-tgt.txt"),
              header=False, index=False, sep='\t')
    gs.to_csv(os.path.join(CONFIGURATION.musedir, "data", "crosslingual",
                           "dictionaries", "src-tgt.0-5000.txt"),
              header=False, index=False, sep='\t')
    # --- Run MUSE and read the aligned vectors back ------------------------
    align(os.path.join(CONFIGURATION.musedir, "data", "embeddings1.vec"),
          os.path.join(CONFIGURATION.musedir, "data", "embeddings2.vec"), dim)
    for root, dirs, files in os.walk(os.path.join(CONFIGURATION.musedir, "dumped", "debug"), topdown=False):
        for dir in dirs:
            emb_dir = root + str(os.sep) + dir
            ctr = 0
            for line in open(emb_dir + str(os.sep) + "vectors-src.txt", "r"):
                if ctr < 1:
                    # Skip the "<count> <dim>" header line.
                    ctr = ctr + 1
                    continue
                line = line.split()
                try:
                    tmp = list()
                    tmp.append(np.array(line[1:len(line)]).astype(float).tolist())
                    graph1.elements[line[0]].embeddings = tmp
                except KeyError:
                    print("key not found for " + line[0])
            ctr = 0
            for line in open(emb_dir + str(os.sep) + "vectors-tgt.txt", "r"):
                if ctr < 1:
                    ctr = ctr + 1
                    continue
                line = line.split()
                try:
                    tmp = list()
                    tmp.append(np.array(line[1:len(line)]).astype(float).tolist())
                    graph2.elements[line[0]].embeddings = tmp
                except KeyError:
                    print("key not found for " + line[0])
    return PipelineDataTuple(graph1, graph2)
def execute(graph1, corpus_file, properties):
    """Read a sentence corpus for the given properties and attach it to graph1."""
    corpus = read_from_file(corpus_file, properties)
    graph1.corpus = corpus
    return PipelineDataTuple(graph1)
def main():
    """Configure and run the 'balanced walks' matching experiment.

    Wires a pipeline: load both NT graphs, walk-embed them jointly, concatenate
    embeddings, train/apply the FlatMatcher (logistic regression) and finally
    the StableRankMatcher, then executes it via ConfigurationHandler.
    """
    # Input data: triples, corpora and the gold standard of the balanced-walks dataset.
    src_triples = os.path.join(package_directory, '..', 'data', 'sap_hilti_data', 'balanced_walks',
                               'graph_triples_hilti_erp.nt')
    tgt_triples = os.path.join(package_directory, '..', 'data', 'sap_hilti_data', 'balanced_walks',
                               'graph_triples_hilti_web.nt')
    src_corpus = os.path.join(package_directory, '..', 'data', 'sap_hilti_data', 'balanced_walks',
                              'corpus_hilti_erp.txt')
    tgt_corpus = os.path.join(package_directory, '..', 'data', 'sap_hilti_data', 'balanced_walks',
                              'corpus_hilti_web.txt')
    gold_mapping = InternalGoldStandard({'trainsets': [os.path.join(package_directory, '..', 'data',
                                                                    'sap_hilti_data', 'balanced_walks',
                                                                    'final_trainset.csv')],
                                         'testsets': [os.path.join(package_directory, '..', 'data',
                                                                   'sap_hilti_data', 'balanced_walks',
                                                                   'possible_matches.csv')]
                                         })
    # Embedding dimensionality and the matcher's ML model.
    dim = 1
    model = LogisticRegression()  # alternative: XGBClassifier()
    # Both sides use the same label properties, read from labels.txt.
    labelfile = os.path.join(package_directory, '..', 'data', 'sap_hilti_data', 'balanced_walks',
                             'labels.txt')
    src_properties = StringMatcher_Interface.get_labels_from_file(labelfile)
    tgt_properties = StringMatcher_Interface.get_labels_from_file(labelfile)
    use_streams = False
    name = "test"
    # Pipeline wiring: line_a / line_b load the two graphs, line_ab joins them.
    pipeline = Pipeline()
    line_a = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None, PipelineDataTuple(src_triples))
    line_a = pipeline.append_step(GraphToolbox.interface, PipelineDataTuple(line_a), PipelineDataTuple(src_triples))
    line_b = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None, PipelineDataTuple(tgt_triples))
    line_b = pipeline.append_step(GraphToolbox.interface, PipelineDataTuple(line_b), PipelineDataTuple(tgt_triples))
    # Walk embedding over both graphs: 'steps' generation, no ngrams, maxdepth 1.
    line_ab = pipeline.append_step(WalkEmbedder_1.interface, PipelineDataTuple(line_a, line_b),
                                   PipelineDataTuple(dim, 'steps', False, 1))
    line_ab = pipeline.append_step(concat_combiner.interface, PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(FlatMatcher.interface, PipelineDataTuple(line_ab),
                                   PipelineDataTuple(model))
    line_ab = pipeline.append_step(StableRankMatcher.interface, PipelineDataTuple(line_ab), None)
    configuration = Configuration(name, src_corpus, tgt_corpus, src_triples, tgt_triples, gold_mapping,
                                  dim, pipeline, src_properties, tgt_properties, use_streams, False, True)
    configuration_handler = ConfigurationHandler()
    configuration_handler.execute(configuration)
def main():
    """Configure and run the 'full strings' experiment with W2V + MUSE + visualizers.

    Loads both NT graphs, trains W2V embeddings, aligns the spaces with MUSE
    on the gold mapping, applies the EmbeddingMatcher (XGBoost) and several
    visualizers, then executes everything via ConfigurationHandler.
    """
    src_triples = os.path.join(package_directory, '..', 'data', 'sap_hilti_data',
                               'sap_hilti_full_strings', 'graph_triples_hilti_erp.nt')
    tgt_triples = os.path.join(package_directory, '..', 'data', 'sap_hilti_data',
                               'sap_hilti_full_strings', 'graph_triples_hilti_web.nt')
    src_corpus = os.path.join(package_directory, '..', 'data', 'sap_hilti_data',
                              'sap_hilti_full_strings', 'corpus_hilti_erp.txt')
    tgt_corpus = os.path.join(package_directory, '..', 'data', 'sap_hilti_data',
                              'sap_hilti_full_strings', 'corpus_hilti_web.txt')
    gold_mapping = os.path.join(package_directory, '..', 'data', 'sap_hilti_data',
                                'sap_hilti_full_strings', 'train_simple_sap_hilti.csv')
    dim = 3
    # XGBoost is the active matcher model (regression/SVC variants were discarded).
    from sklearn.linear_model import LogisticRegression
    model = XGBClassifier()
    src_properties = ["http://rdata2graph.sap.com/hilti_erp/property/mara_fert.maktx"]
    tgt_properties = ["http://rdata2graph.sap.com/hilti_web/property/products.name"]
    name = "jaccard_no_props_given"
    # Pipeline wiring: line_a / line_b load the two graphs, line_ab joins them.
    pipeline = Pipeline()
    line_a = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None, PipelineDataTuple(src_triples))
    line_a = pipeline.append_step(GraphToolbox.interface, PipelineDataTuple(line_a), PipelineDataTuple(src_triples))
    line_b = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None, PipelineDataTuple(tgt_triples))
    line_b = pipeline.append_step(GraphToolbox.interface, PipelineDataTuple(line_b), PipelineDataTuple(tgt_triples))
    line_ab = pipeline.append_step(W2V_1InterfaceWrapper.interface, PipelineDataTuple(line_a, line_b),
                                   PipelineDataTuple(dim))
    line_ab = pipeline.append_step(muse.interface, PipelineDataTuple(line_a, line_b),
                                   PipelineDataTuple(gold_mapping))
    line_ab = pipeline.append_step(EmbeddingMatcher.interface, PipelineDataTuple(line_ab),
                                   PipelineDataTuple(model))
    line_ab = pipeline.append_step(StratifiedVisualizer.interface, PipelineDataTuple(line_a, line_b), None)
    line_ab = pipeline.append_step(TypeVisualizer.interface, PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(CategoriesVisualizer.interface, PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(FullVisualizer.interface, PipelineDataTuple(line_ab), None)
    configuration = Configuration(name, src_corpus, tgt_corpus, src_triples, tgt_triples, gold_mapping,
                                  dim, pipeline, src_properties, tgt_properties)
    configuration_handler = ConfigurationHandler()
    configuration_handler.execute(configuration)
def exec(graph1, graph2, ml_model):
    """Train ml_model on the prepared trainsets and evaluate on every testset.

    Trains the given model on all feature columns except ids/categories and the
    held-out 'syntactic_diff'/'plus_diff' columns, plus a purely syntactic
    logistic-regression baseline. Both models are dumped to rundir. For each
    testset, predictions are logged, positive pairs are exported to
    ml_matchings.csv and converted to alignment XML; optionally a
    "performance+" score on plus_diff-filtered rows is logged as well.
    """
    # Make sure the per-pair feature CSVs exist in rundir.
    Matchdata_Saver.interface(PipelineDataTuple(graph1, graph2), None, CONFIGURATION)
    # --- Assemble the training frame (drop id/category columns) -----------
    train = None
    for trainset in CONFIGURATION.gold_mapping.prepared_trainsets:
        if train is None:
            train = trainset.loc[:, ~(trainset.columns.isin(
                ['src_id', 'tgt_id', 'src_category', 'tgt_category']))]
        else:
            tmp_train = trainset.loc[:, ~(trainset.columns.isin(
                ['src_id', 'tgt_id', 'src_category', 'tgt_category']))]
            train = train.append(tmp_train, ignore_index=True)
    x_train = train.loc[:, train.columns != 'label']
    y_train = train['label']
    # --- Fit the main model (without the syntactic/plus columns) and a
    # --- syntactic-only logistic-regression baseline -----------------------
    model = ml_model
    model = model.fit(
        x_train[[
            col for col in x_train.columns
            if not col == 'syntactic_diff' and not col == 'plus_diff'
        ]], y_train)
    syntactic_model = LogisticRegression(solver='lbfgs')
    syntactic_model = syntactic_model.fit(
        pd.DataFrame(x_train['syntactic_diff']), y_train)
    dump(model, CONFIGURATION.rundir + 'model.joblib')
    dump(syntactic_model, CONFIGURATION.rundir + 'syntactic_model.joblib')
    # --- Evaluate on every prepared testset --------------------------------
    for testset in CONFIGURATION.gold_mapping.prepared_testsets:
        test = testset.loc[:, ~(testset.columns.isin(
            ['src_id', 'tgt_id', 'src_category', 'tgt_category']))]
        x_test1 = test.loc[:, test.columns != 'label']
        y_test1 = test['label']
        CONFIGURATION.log(
            "\n################################################################\n\n"
        )
        prediction = model.predict(x_test1[[
            col for col in x_test1.columns
            if not col == 'syntactic_diff' and not col == 'plus_diff'
        ]])
        result = classification_report(np.array(y_test1), prediction,
                                       target_names=['false', 'true'])
        CONFIGURATION.log("EmbeddingMatcher - ml_model performance:\n")
        CONFIGURATION.log(str(result))
        CONFIGURATION.log(str(ConfusionMatrix(np.array(y_test1), prediction)))
        CONFIGURATION.log(
            "\n\n--------------------------------------------------------------\n"
        )
        CONFIGURATION.log("\n" + str([
            col for col in x_test1.columns
            if not col == 'syntactic_diff' and not col == 'plus_diff'
        ]))
        # Log coefficients of a freshly fitted logistic regression for reference
        # (independent of ml_model's type).
        CONFIGURATION.log("\n" + str(
            LogisticRegression(
                random_state=0, solver='lbfgs', multi_class='ovr').fit(
                    x_train[[
                        col for col in x_test1.columns
                        if not col == 'syntactic_diff' and not col == 'plus_diff'
                    ]], y_train).coef_) + "\n")
        # Export predicted-positive pairs and convert them to alignment XML.
        testset.loc[prediction == 1, ['src_id', 'tgt_id']].to_csv(
            CONFIGURATION.rundir + 'ml_matchings.csv',
            sep="\t", index=False, encoding='UTF-8')
        PredictionToXMLConverter.interface(
            PipelineDataTuple(graph1, graph2),
            PipelineDataTuple('ml_matchings.csv'), CONFIGURATION)
        CONFIGURATION.log(
            "\n################################################################\n\n"
        )
        if CONFIGURATION.calc_PLUS_SCORE:
            # "Performance+" subset: confident positives (plus_diff > 0.68, label 1)
            # plus confident negatives (plus_diff < 0.68, label 0).
            test_plus = test.loc[(test.plus_diff > 0.68) & (test.label == 1)]
            test_plus = test_plus.append(test.loc[(test.plus_diff < 0.68) &
                                                  (test.label == 0)],
                                         ignore_index=True)
            x_test_plus = test_plus.loc[:, test_plus.columns != 'label']
            y_test_plus = test_plus['label']
            prediction_plus = model.predict(x_test_plus[[
                col for col in x_train.columns
                if not col == 'syntactic_diff' and not col == 'plus_diff'
            ]])
            result_plus = classification_report(np.array(y_test_plus),
                                                prediction_plus,
                                                target_names=['false', 'true'])
            CONFIGURATION.log("EmbeddingMatcher - ml_model performance+:\n")
            CONFIGURATION.log(str(result_plus))
            CONFIGURATION.log(
                str(ConfusionMatrix(np.array(y_test_plus), prediction_plus)))
            CONFIGURATION.log(
                "\n\n--------------------------------------------------------------\n"
            )
        else:
            CONFIGURATION.log("No performance+ calculated")
            CONFIGURATION.log(
                "\n\n--------------------------------------------------------------\n"
            )
        CONFIGURATION.log(
            "\n################################################################\n\n"
        )
    # Schema-correspondence prediction with the instance model was removed from
    # the active path; it is not recommended because the model is (most likely)
    # trained only on instance correspondences.
    '''import scipy
    from cle.matcher.DatasetHelperTools import extend_features, get_schema_data_from_graph
    schema_data, schema_data_ids = get_schema_data_from_graph(graph1, graph2)
    schema_data = extend_features(schema_data)
    y_pred = model.predict(schema_data)
    y_pred = scipy.stats.zscore(np.array(y_pred))
    predictions = [1 if value > 0 else 0 for value in y_pred]
    schema_predicted = pd.concat([pd.DataFrame({"prediction":predictions}), schema_data_ids], axis=1, sort=False)
    schema_predicted.to_csv(index=False,path_or_buf=CONFIGURATION.rundir+"predicted_data.csv", header=False)
    pd.options.display.max_colwidth = 100
    pd.set_option('display.max_colwidth', -1)
    CONFIGURATION.log("\nschema matches predicted with ML model:\n")
    schema_predicted = schema_predicted[schema_predicted['prediction'] == 0]
    CONFIGURATION.log(schema_predicted.to_string()+"\n")'''
    return PipelineDataTuple(graph1, graph2)
def exec(graph1, graph2):
    """Rank candidate matches, filter them with XGBoost and marry them greedily.

    Reads the blocked candidate pairs, adds a reciprocal-rank 'syntax_score'
    per source node, sums cos/syntax/euclid/probability scores into
    'total_score', keeps pairs an XGBoost classifier (trained on the prepared
    trainset) predicts positive, and finally performs a greedy stable-marriage
    pass by descending total_score. Results and metrics go to rundir / the log.
    """
    additional_features = None
    progress = 0

    def mergedf(df1, df2):
        # Append helper that tolerates an empty (None) accumulator.
        if df1 is None:
            return df2
        else:
            return df1.append(df2, ignore_index=True)

    basedir = CONFIGURATION.rundir
    current_process_dir = basedir
    dirpath = basedir
    # --- Load the blocked candidate pairs (both directions) ----------------
    all_possible_matches_path = CONFIGURATION.gold_mapping.raw_testsets[0]
    documents_ids_A = dict()
    documents_ids_B = dict()
    all_possible_matches = dict()
    all_nodeids = set()
    with open(all_possible_matches_path, encoding="UTF-8") as f:
        for line in f:
            line = line.replace("\n", "").split("\t")
            all_nodeids.add(line[0])
            if line[0] in all_possible_matches.keys():
                all_possible_matches[line[0]].add(line[1])
            else:
                all_possible_matches[line[0]] = set([line[1]])
            if line[1] in all_possible_matches.keys():
                all_possible_matches[line[1]].add(line[0])
            else:
                all_possible_matches[line[1]] = set([line[0]])
    possible_matches = CONFIGURATION.gold_mapping.prepared_testsets[0]
    oaei_gold_standard3 = CONFIGURATION.gold_mapping.prepared_trainsets[0]

    def get_possible_matches(nid):
        # Candidate partners of nid from the blocking step.
        final_matches = list(all_possible_matches[nid])
        return final_matches

    def get_training_material(nid):
        # NOTE(review): appears unused; also references the loop variable
        # `nodeid` instead of its own `nid` parameter — latent bug if revived.
        res = list()
        with open(dirpath + "w2v_training_material.csv", mode="r", encoding="UTF-8") as f:
            for line in f:
                if nodeid in line.split(" "):
                    res = res + line.split(" ")
        return list(set(res))

    model = Word2Vec.load(dirpath + "w2v.model")
    total = len(all_nodeids)
    matchings = None
    # --- Per source node: rank candidates by syntactic distance ------------
    with open(dirpath + 'additional_features.csv', mode="w+", encoding="UTF-8") as f:
        for nodeid in all_nodeids:
            possible_matches_for_nodeid = possible_matches.loc[
                ((possible_matches.src_id == nodeid) &
                 (possible_matches.tgt_id.isin(get_possible_matches(nodeid))))]
            progress += 1
            if len(get_possible_matches(nodeid)) < 1:
                continue

            def edits(v1, v2s):
                # Normalized edit distance of URI fragments (kept for reference;
                # 'syntactic_diff' from the prepared testset is used instead).
                res = list()
                v1 = v1.split("/")[-1]
                for v2 in v2s:
                    v2 = v2.split("/")[-1]
                    res.append(editdistance.eval(v1, v2) / min(len(v1), len(v2)))
                return np.array([res])

            # Reciprocal-rank score: best syntactic candidate gets 1, next 1/2, ...
            sorted_x = possible_matches_for_nodeid.sort_values(by=['syntactic_diff'], ascending=True)
            sorted_x.loc[:, 'syntax_score'] = 0
            ctr = 1
            for index, row in sorted_x.iterrows():
                sorted_x.loc[index, 'syntax_score'] = row['syntax_score'] + 1 / ctr
                ctr += 1
            # Combine all partial scores into the final ranking criterion.
            x = sorted_x
            x.loc[:, 'total_score'] = x['cos_score'] + x['syntax_score'] + x['euclid_score'] + x['probability_score']
            sorted_x = x.sort_values(by=['total_score'], ascending=False)
            for index, row in sorted_x.iterrows():
                matching_pair = pd.DataFrame([sorted_x.loc[index]])
                matching_pair.loc[:, 'src_id'] = nodeid
                matchings = mergedf(matchings, matching_pair)
            print(" Computing syntax-ranks: " + str(int(100 * progress / total)) + "%.", end='\r')
    print(" Computing syntax-ranks: 100%")
    matchings.to_csv(dirpath + "additional_features.csv")
    # --- Filter candidates with an XGBoost classifier ----------------------
    cols = [col for col in oaei_gold_standard3.columns
            if col not in ['label', 'src_id', 'tgt_id', 'src_category', 'tgt_category']]
    X, y = oaei_gold_standard3[cols], oaei_gold_standard3.label
    clf = XGBClassifier().fit(X, y)
    X, y = matchings[cols], matchings.label
    matchings = matchings.loc[clf.predict(X) == 1]
    try:
        # coef_/intercept_ only exist for linear models; XGBoost raises here,
        # which the bare except silently ignores.
        CONFIGURATION.log("\nStableRankMatcher - logistic regression hyperparameters:\n")
        CONFIGURATION.log("Coefficients: " + str(clf.coef_) + " for " + str(list(set(cols))) + "\n")
        CONFIGURATION.log("Intercept: " + str(clf.intercept_) + "\n")
    except:
        pass
    matchings.to_csv(dirpath + "remaining_matchings.csv", sep="\t")
    # --- Greedy marriage: repeatedly take the best-scored remaining pair ---
    matchings = matchings.sort_values(by=['total_score', 'src_tgt_angle'], ascending=[False, False])
    married_matchings = None
    ctr = 0
    while len(matchings) > 0:
        ctr += 1
        row = matchings.head(1)
        married_matchings = mergedf(married_matchings, pd.DataFrame(row))
        # Remove every other pair touching either of the two married nodes.
        matchings = matchings.loc[~(matchings.src_id == row.src_id.values[0]) &
                                  ~(matchings.tgt_id == row.tgt_id.values[0])]
    if married_matchings is not None:
        married_matchings[['src_id', 'tgt_id']].to_csv(dirpath + "married_matchings.csv", sep="\t", index=False)
        PredictionToXMLConverter.interface(PipelineDataTuple(graph1, graph2),
                                           PipelineDataTuple('married_matchings.csv'), CONFIGURATION)
        CONFIGURATION.log("\n\nStableRankMatcher - logistic regression performance:\n")
        CONFIGURATION.log(classification_report(np.array(y), clf.predict(X)))
        if len(married_matchings) > 0:
            # Mark married pairs and score the marriage against the gold labels.
            married_matchings.loc[:, 'married'] = 'x'
            possible_matches = possible_matches.merge(
                married_matchings[['src_id', 'tgt_id', 'married', 'total_score']],
                left_on=['src_id', 'tgt_id'], right_on=['src_id', 'tgt_id'], how='left')
            possible_matches.loc[:, 'prediction'] = 0
            possible_matches.loc[~(possible_matches.married.isna()), 'prediction'] = 1
            CONFIGURATION.log("\n\nStableRankMatcher - marriage performance:\n")
            CONFIGURATION.log(classification_report(np.array(possible_matches.label),
                                                    np.array(possible_matches.prediction)))
        else:
            CONFIGURATION.log("\n\nStableRankEmbeddingsMatcher - marriage performance: 00.00 (no matches found)\n")
    return PipelineDataTuple(graph1, graph2)
def main():
    """Configure and run the 'jacc_no_schema_given' experiment on the SAP/Hilti data.

    Builds a pipeline that loads both knowledge graphs, embeds them with a
    pseudo-D2V wrapper plus a simple-triples embedder, concatenates the
    embeddings, matches with a FlatMatcher and finally persists the embeddings.
    The resulting Configuration is executed by a ConfigurationHandler.

    Note: several alternative experiment setups that were kept here as large
    commented-out blocks have been removed; retrieve them from version control
    if they are needed again.
    """
    src_triples = os.path.join(package_directory, '..', 'data', 'sap_hilti_data',
                               'sap_hilti_full_strings', 'graph_triples_hilti_erp.nt')
    tgt_triples = os.path.join(package_directory, '..', 'data', 'sap_hilti_data',
                               'sap_hilti_full_strings', 'graph_triples_hilti_web.nt')
    src_corpus = os.path.join(package_directory, '..', 'data', 'sap_hilti_data',
                              'sap_hilti_full_strings', 'corpus_hilti_erp.txt')
    tgt_corpus = os.path.join(package_directory, '..', 'data', 'sap_hilti_data',
                              'sap_hilti_full_strings', 'corpus_hilti_web.txt')
    gold_mapping = InternalGoldStandard(
        {'trainsets': [os.path.join(package_directory, '..', 'data', 'sap_hilti_data',
                                    'sap_hilti_full_strings', 'train_simple_sap_hilti.csv'),
                       os.path.join(package_directory, '..', 'data', 'sap_hilti_data',
                                    'sap_hilti_full_strings', 'train_hard_sap_hilti.csv')],
         'testsets': [os.path.join(package_directory, '..', 'data', 'sap_hilti_data',
                                   'sap_hilti_full_strings', 'test_simple_sap_hilti.csv'),
                      os.path.join(package_directory, '..', 'data', 'sap_hilti_data',
                                   'sap_hilti_full_strings', 'test_hard_sap_hilti.csv')]
         })
    dim = 20
    model = LogisticRegression()
    # No schema hints are given for this experiment; e.g.
    # ["http://rdata2graph.sap.com/hilti_erp/property/mara_fert.maktx"] /
    # ["http://rdata2graph.sap.com/hilti_web/property/products.name"] would be possible values.
    src_properties = None
    tgt_properties = None

    name = "jacc_no_schema_given"
    pipeline = Pipeline()
    # Source graph: load triples, then index/prepare the graph structure.
    line_a = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None, PipelineDataTuple(src_triples))
    line_a = pipeline.append_step(GraphToolbox.interface, PipelineDataTuple(line_a), PipelineDataTuple(src_triples))
    # Target graph: same preparation.
    line_b = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None, PipelineDataTuple(tgt_triples))
    line_b = pipeline.append_step(GraphToolbox.interface, PipelineDataTuple(line_b), PipelineDataTuple(tgt_triples))
    # Joint embedding of both graphs, then concatenation of the embedding spaces.
    line_ab = pipeline.append_step(PseudoD2V_1InterfaceWrapper_2.interface, PipelineDataTuple(line_a, line_b),
                                   PipelineDataTuple(dim))
    line_ab = pipeline.append_step(SimpleTriplesEmbedder_1.interface, PipelineDataTuple(line_a, line_b),
                                   PipelineDataTuple(dim))
    line_ab = pipeline.append_step(concat_combiner.interface, PipelineDataTuple(line_ab), None)
    # Matching and persistence of the produced embeddings.
    line_ab = pipeline.append_step(FlatMatcher.interface, PipelineDataTuple(line_ab), PipelineDataTuple(model))
    line_ab = pipeline.append_step(EmbeddingSaver.interface, PipelineDataTuple(line_ab), None)

    configuration = Configuration(name, src_corpus, tgt_corpus, src_triples, tgt_triples, gold_mapping, dim,
                                  pipeline, src_properties, tgt_properties)
    configuration_handler = ConfigurationHandler()
    configuration_handler.execute(configuration)
def execute(self, configuration): #try: clean_cache(configuration.cachedir) prepare_dir(configuration.rundir) prepare_dir(configuration.cachedir) prepare_file(configuration.logfile) #open(configuration.logfile,"w+").close() current_time_in_millis = int(round(time.time())) print("-----------------------------------------------") print("Starting '" + configuration.name + "':\n\n") configuration.logs_ = open(configuration.logfile, "a+") configuration.log(configuration.to_string()) prepare_file(configuration.src_corpus) prepare_file(configuration.tgt_corpus) prepare_file(configuration.src_triples) prepare_file(configuration.tgt_triples) for path_to_file in configuration.gold_mapping.raw_trainsets + configuration.gold_mapping.raw_testsets: prepare_file(path_to_file) step = configuration.pipeline.get_first_step() while step is not None: print("Performing step " + str(step.func.__module__) + "." + str(step.func.__name__) + " with " + str(step.args)) x = [] if step.input_step is not None: for input_step in step.input_step.elems: for elem in input_step.output.elems: x.append(elem) t = PipelineDataTuple(*x) out = step.func(t, step.args, configuration) if step.persist_output: step.output = out step = step.next_step configuration.log( "\n\n\n\nNeeded " + str(int(round(time.time())) - current_time_in_millis) + "s.") configuration.logs_.close() print("-----------------------------------------------") #except Exception as e: # try: # configuration.logs_.close() # except: # pass # logs = open(configuration.logfile, "a+") # logs.write("\n\n\n") # logs.write(configuration.name + " FAILED due to:") # logs.write(str(e)) # logs.close() # print(configuration.name + " FAILED.") # print("-----------------------------------------------") del configuration
def exec(graph1, graph2):
    """Train an XGBoost matcher on the first prepared train set, filter the
    candidate pairs of the first prepared test set, greedily 'marry' the
    ranked survivors and log the resulting evaluation reports.

    Fix: the bare ``except:`` around the coefficient logging is narrowed to
    ``except Exception:`` so KeyboardInterrupt/SystemExit are not swallowed
    (the handler is best-effort anyway: XGBClassifier exposes no
    ``coef_``/``intercept_``, only linear models do).
    """
    def mergedf(df1, df2):
        # Append df2 to df1, treating None as an empty accumulator.
        if df1 is None:
            return df2
        else:
            return df1.append(df2, ignore_index=True)

    basedir = CONFIGURATION.rundir
    current_process_dir = basedir
    dirpath = basedir
    # First prepared test set = candidate pairs; first prepared train set = gold standard.
    possible_matches = CONFIGURATION.gold_mapping.prepared_testsets[0]
    oaei_gold_standard3 = CONFIGURATION.gold_mapping.prepared_trainsets[0]
    # Feature columns: everything except ids, label, categories and the
    # hand-crafted syntactic features.
    cols = [
        col for col in oaei_gold_standard3.columns if col not in [
            'label', 'src_id', 'tgt_id', 'syntactic_diff', 'plus_diff',
            'src_category', 'tgt_category'
        ]
    ]
    X, y = oaei_gold_standard3[cols], oaei_gold_standard3.label
    # Re-weight the classes to counter the label imbalance of the train set.
    weight_ratio = float(len(y[y == 0])) / float(len(y[y == 1]))
    w_array = np.array([1] * y.shape[0])
    w_array[y == 1] = weight_ratio * 2.0
    w_array[y == 0] = (1 - weight_ratio)
    clf = XGBClassifier().fit(X, y, sample_weight=w_array)
    X, y = possible_matches[cols], possible_matches.label
    # Keep only the candidate pairs the classifier accepts.
    matchings = possible_matches.loc[clf.predict(X) == 1]
    try:
        CONFIGURATION.log(
            "\nStableRankMatcher - logistic regression hyperparameters:\n")
        CONFIGURATION.log("Coefficients: " + str(clf.coef_) + " for " +
                          str(list(set(cols))) + "\n")
        CONFIGURATION.log("Intercept: " + str(clf.intercept_) + "\n")
    except Exception:
        # Best-effort logging only; tree models have no linear coefficients.
        pass
    matchings.to_csv(dirpath + "remaining_matchings.csv", sep="\t")
    matchings = matchings.sort_values(by=['total_score', 'src_tgt_angle'],
                                      ascending=[False, False])
    # Greedy stable-marriage-style selection: repeatedly take the best-ranked
    # remaining pair and drop every other candidate sharing one of its endpoints.
    married_matchings = None
    ctr = 0
    while len(matchings) > 0:
        ctr += 1
        row = matchings.head(1)
        married_matchings = mergedf(married_matchings, pd.DataFrame(row))
        matchings = matchings.loc[
            ~(matchings.src_id == row.src_id.values[0])
            & ~(matchings.tgt_id == row.tgt_id.values[0])]
    if married_matchings is not None:
        married_matchings[['src_id', 'tgt_id']].to_csv(dirpath + "married_matchings.csv",
                                                       sep="\t", index=False)
        PredictionToXMLConverter.interface(
            PipelineDataTuple(graph1, graph2),
            PipelineDataTuple('married_matchings.csv'), CONFIGURATION)
        CONFIGURATION.log(
            "\n\nStableRankEmbeddingsMatcher - logistic regression performance:\n"
        )
        CONFIGURATION.log(classification_report(np.array(y), clf.predict(X)))
        married_matchings.loc[:, 'married'] = 'x'
        if len(married_matchings) > 0:
            # Mark married pairs inside the full candidate set and score the marriage.
            possible_matches = possible_matches.merge(
                married_matchings[[
                    'src_id', 'tgt_id', 'married', 'total_score'
                ]],
                left_on=['src_id', 'tgt_id'],
                right_on=['src_id', 'tgt_id'],
                how='left')
            possible_matches.loc[:, 'prediction'] = 0
            possible_matches.loc[~(possible_matches.married.isna()), 'prediction'] = 1
            CONFIGURATION.log(
                "\n\nStableRankEmbeddingsMatcher - marriage performance:\n")
            CONFIGURATION.log(
                classification_report(np.array(possible_matches.label),
                                      np.array(possible_matches.prediction)))
        else:
            CONFIGURATION.log(
                "\n\nStableRankEmbeddingsMatcher - marriage performance: 00.00 (no matches found)\n"
            )
    return PipelineDataTuple(graph1, graph2)
def main():
    """Run six FlatMatcher experiments on the SAP/Hilti data: W2V and/or D2V
    embeddings, each with and without MUSE alignment, all scored by XGBoost.

    Fixes:
    * ``PipelineDataTule`` typo in the "w2v d2v concat xgb" experiment
      (would raise NameError at runtime) corrected to ``PipelineDataTuple``.
    * Bare ``except:`` around ``os.remove`` narrowed to ``OSError``.
    """
    logfile = os.path.join(package_directory, '..', 'results.log')
    try:
        os.remove(logfile)  # start each run with a fresh results log
    except OSError:
        pass  # log file did not exist yet - nothing to remove
    src_triples = os.path.join(package_directory, '..', 'data', 'sap_hilti_data',
                               'sap_hilti_full_strings', 'graph_triples_hilti_erp.nt')
    tgt_triples = os.path.join(package_directory, '..', 'data', 'sap_hilti_data',
                               'sap_hilti_full_strings', 'graph_triples_hilti_web.nt')
    src_corpus = os.path.join(package_directory, '..', 'data', 'sap_hilti_data',
                              'sap_hilti_full_strings', 'corpus_hilti_erp.txt')
    tgt_corpus = os.path.join(package_directory, '..', 'data', 'sap_hilti_data',
                              'sap_hilti_full_strings', 'corpus_hilti_web.txt')
    gold_mapping = os.path.join(package_directory, '..', 'data', 'sap_hilti_data',
                                'sap_hilti_full_strings', 'sap_hilti_gold.csv')
    dim = 20
    model = XGBClassifier()

    # --- Experiment 1: W2V + D2V concatenated, MUSE-aligned ---
    name = "w2v d2v concat muse xgb"
    pipeline = Pipeline()
    line_a = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None, PipelineDataTuple(src_triples))
    line_a = pipeline.append_step(GraphToolbox.interface, PipelineDataTuple(line_a), PipelineDataTuple(src_triples))
    line_a = pipeline.append_step(ReadSentencesInterfaceWrapper.interface, PipelineDataTuple(line_a),
                                  PipelineDataTuple(src_corpus))
    line_a = pipeline.append_step(W2VInterfaceWrapper.interface, PipelineDataTuple(line_a), PipelineDataTuple(dim))
    line_a = pipeline.append_step(D2VInterfaceWrapper.interface, PipelineDataTuple(line_a), PipelineDataTuple(dim))
    line_a = pipeline.append_step(concat_combiner.interface, PipelineDataTuple(line_a), None)
    line_b = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None, PipelineDataTuple(tgt_triples))
    line_b = pipeline.append_step(GraphToolbox.interface, PipelineDataTuple(line_b), PipelineDataTuple(tgt_triples))
    line_b = pipeline.append_step(ReadSentencesInterfaceWrapper.interface, PipelineDataTuple(line_b),
                                  PipelineDataTuple(tgt_corpus))
    line_b = pipeline.append_step(W2VInterfaceWrapper.interface, PipelineDataTuple(line_b), PipelineDataTuple(dim))
    line_b = pipeline.append_step(D2VInterfaceWrapper.interface, PipelineDataTuple(line_b), PipelineDataTuple(dim))
    line_b = pipeline.append_step(concat_combiner.interface, PipelineDataTuple(line_b), None)
    line_ab = pipeline.append_step(muse.interface, PipelineDataTuple(line_a, line_b), PipelineDataTuple(gold_mapping))
    line_ab = pipeline.append_step(FlatMatcher.interface, PipelineDataTuple(line_ab),
                                   PipelineDataTuple(gold_mapping, model, logfile, name))
    configuration = Configuration(name, src_corpus, tgt_corpus, src_triples, tgt_triples, gold_mapping, logfile,
                                  dim, pipeline)
    configuration_handler = ConfigurationHandler()
    configuration_handler.execute(configuration)

    # --- Experiment 2: W2V + D2V concatenated, no alignment ---
    name = "w2v d2v concat xgb"
    pipeline = Pipeline()
    line_a = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None, PipelineDataTuple(src_triples))
    line_a = pipeline.append_step(GraphToolbox.interface, PipelineDataTuple(line_a), PipelineDataTuple(src_triples))
    line_a = pipeline.append_step(ReadSentencesInterfaceWrapper.interface, PipelineDataTuple(line_a),
                                  PipelineDataTuple(src_corpus))
    line_a = pipeline.append_step(W2VInterfaceWrapper.interface, PipelineDataTuple(line_a), PipelineDataTuple(dim))
    line_a = pipeline.append_step(D2VInterfaceWrapper.interface, PipelineDataTuple(line_a), PipelineDataTuple(dim))
    line_a = pipeline.append_step(concat_combiner.interface, PipelineDataTuple(line_a), None)
    line_b = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None, PipelineDataTuple(tgt_triples))
    line_b = pipeline.append_step(GraphToolbox.interface, PipelineDataTuple(line_b), PipelineDataTuple(tgt_triples))
    line_b = pipeline.append_step(ReadSentencesInterfaceWrapper.interface, PipelineDataTuple(line_b),
                                  PipelineDataTuple(tgt_corpus))
    line_b = pipeline.append_step(W2VInterfaceWrapper.interface, PipelineDataTuple(line_b), PipelineDataTuple(dim))
    line_b = pipeline.append_step(D2VInterfaceWrapper.interface, PipelineDataTuple(line_b), PipelineDataTuple(dim))
    line_b = pipeline.append_step(concat_combiner.interface, PipelineDataTuple(line_b), None)
    # Fixed: was "PipelineDataTule(line_a, line_b)" (NameError).
    line_ab = pipeline.append_step(FlatMatcher.interface, PipelineDataTuple(line_a, line_b),
                                   PipelineDataTuple(gold_mapping, model, logfile, name))
    configuration = Configuration(name, src_corpus, tgt_corpus, src_triples, tgt_triples, gold_mapping, logfile,
                                  dim, pipeline)
    configuration_handler = ConfigurationHandler()
    configuration_handler.execute(configuration)

    # --- Experiment 3: W2V only, MUSE-aligned ---
    name = "w2v muse xgb"
    pipeline = Pipeline()
    line_a = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None, PipelineDataTuple(src_triples))
    line_a = pipeline.append_step(GraphToolbox.interface, PipelineDataTuple(line_a), PipelineDataTuple(src_triples))
    line_a = pipeline.append_step(ReadSentencesInterfaceWrapper.interface, PipelineDataTuple(line_a),
                                  PipelineDataTuple(src_corpus))
    line_a = pipeline.append_step(W2VInterfaceWrapper.interface, PipelineDataTuple(line_a), PipelineDataTuple(dim))
    line_b = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None, PipelineDataTuple(tgt_triples))
    line_b = pipeline.append_step(GraphToolbox.interface, PipelineDataTuple(line_b), PipelineDataTuple(tgt_triples))
    line_b = pipeline.append_step(ReadSentencesInterfaceWrapper.interface, PipelineDataTuple(line_b),
                                  PipelineDataTuple(tgt_corpus))
    line_b = pipeline.append_step(W2VInterfaceWrapper.interface, PipelineDataTuple(line_b), PipelineDataTuple(dim))
    line_ab = pipeline.append_step(muse.interface, PipelineDataTuple(line_a, line_b), PipelineDataTuple(gold_mapping))
    line_ab = pipeline.append_step(FlatMatcher.interface, PipelineDataTuple(line_ab),
                                   PipelineDataTuple(gold_mapping, model, logfile, name))
    configuration = Configuration(name, src_corpus, tgt_corpus, src_triples, tgt_triples, gold_mapping, logfile,
                                  dim, pipeline)
    configuration_handler = ConfigurationHandler()
    configuration_handler.execute(configuration)

    # --- Experiment 4: W2V only, no alignment ---
    name = "w2v xgb"
    pipeline = Pipeline()
    line_a = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None, PipelineDataTuple(src_triples))
    line_a = pipeline.append_step(GraphToolbox.interface, PipelineDataTuple(line_a), PipelineDataTuple(src_triples))
    line_a = pipeline.append_step(ReadSentencesInterfaceWrapper.interface, PipelineDataTuple(line_a),
                                  PipelineDataTuple(src_corpus))
    line_a = pipeline.append_step(W2VInterfaceWrapper.interface, PipelineDataTuple(line_a), PipelineDataTuple(dim))
    line_b = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None, PipelineDataTuple(tgt_triples))
    line_b = pipeline.append_step(GraphToolbox.interface, PipelineDataTuple(line_b), PipelineDataTuple(tgt_triples))
    line_b = pipeline.append_step(ReadSentencesInterfaceWrapper.interface, PipelineDataTuple(line_b),
                                  PipelineDataTuple(tgt_corpus))
    line_b = pipeline.append_step(W2VInterfaceWrapper.interface, PipelineDataTuple(line_b), PipelineDataTuple(dim))
    line_ab = pipeline.append_step(FlatMatcher.interface, PipelineDataTuple(line_a, line_b),
                                   PipelineDataTuple(gold_mapping, model, logfile, name))
    configuration = Configuration(name, src_corpus, tgt_corpus, src_triples, tgt_triples, gold_mapping, logfile,
                                  dim, pipeline)
    configuration_handler = ConfigurationHandler()
    configuration_handler.execute(configuration)

    # --- Experiment 5: D2V only, MUSE-aligned ---
    name = "d2v muse xgb"
    pipeline = Pipeline()
    line_a = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None, PipelineDataTuple(src_triples))
    line_a = pipeline.append_step(GraphToolbox.interface, PipelineDataTuple(line_a), PipelineDataTuple(src_triples))
    line_a = pipeline.append_step(D2VInterfaceWrapper.interface, PipelineDataTuple(line_a), PipelineDataTuple(dim))
    line_b = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None, PipelineDataTuple(tgt_triples))
    line_b = pipeline.append_step(GraphToolbox.interface, PipelineDataTuple(line_b), PipelineDataTuple(tgt_triples))
    line_b = pipeline.append_step(D2VInterfaceWrapper.interface, PipelineDataTuple(line_b), PipelineDataTuple(dim))
    line_ab = pipeline.append_step(muse.interface, PipelineDataTuple(line_a, line_b), PipelineDataTuple(gold_mapping))
    line_ab = pipeline.append_step(FlatMatcher.interface, PipelineDataTuple(line_ab),
                                   PipelineDataTuple(gold_mapping, model, logfile, name))
    configuration = Configuration(name, src_corpus, tgt_corpus, src_triples, tgt_triples, gold_mapping, logfile,
                                  dim, pipeline)
    configuration_handler = ConfigurationHandler()
    configuration_handler.execute(configuration)

    # --- Experiment 6: D2V only, no alignment ---
    name = "d2v xgb"
    pipeline = Pipeline()
    line_a = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None, PipelineDataTuple(src_triples))
    line_a = pipeline.append_step(GraphToolbox.interface, PipelineDataTuple(line_a), PipelineDataTuple(src_triples))
    line_a = pipeline.append_step(D2VInterfaceWrapper.interface, PipelineDataTuple(line_a), PipelineDataTuple(dim))
    line_b = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None, PipelineDataTuple(tgt_triples))
    line_b = pipeline.append_step(GraphToolbox.interface, PipelineDataTuple(line_b), PipelineDataTuple(tgt_triples))
    line_b = pipeline.append_step(D2VInterfaceWrapper.interface, PipelineDataTuple(line_b), PipelineDataTuple(dim))
    line_ab = pipeline.append_step(FlatMatcher.interface, PipelineDataTuple(line_a, line_b),
                                   PipelineDataTuple(gold_mapping, model, logfile, name))
    configuration = Configuration(name, src_corpus, tgt_corpus, src_triples, tgt_triples, gold_mapping, logfile,
                                  dim, pipeline)
    configuration_handler = ConfigurationHandler()
    configuration_handler.execute(configuration)
def exec(graph1, graph2, ml_model):
    """Train *ml_model* on all prepared train sets and stream-evaluate on the first raw test set.

    graph1/graph2: source and target graphs; ml_model: any sklearn-style
    estimator providing fit/predict.  The fitted model is persisted to
    ``CONFIGURATION.rundir + 'model.joblib'``.  Returns
    PipelineDataTuple(graph1, graph2).

    NOTE(review): the accumulation of `prediction`/`gold` inside the streaming
    loop is commented out below, so the arrays handed to
    ``classification_report`` at the end are empty — this block appears to be
    mid-refactor; confirm before relying on the logged reports.
    """
    # Persist the match data of both graphs before training.
    OAEIMatchdata_Saver.interface(PipelineDataTuple(graph1, graph2), None, CONFIGURATION)
    # Concatenate all prepared training sets into one DataFrame.
    train = None
    for trainset in CONFIGURATION.gold_mapping.prepared_trainsets:
        if train is None:
            train = pd.read_csv(trainset, index_col=['Unnamed: 0'])
        else:
            tmp_train = pd.read_csv(trainset, index_col=['Unnamed: 0'])
            train = train.append(tmp_train, ignore_index=True)

    # stream test data
    #
    #### Alternative 1: Sample the training data manually.
    #a = train_simple.loc[train_simple['label']==1].sample(n=100, replace=False)
    #b = train_simple.loc[train_simple['label']==0].sample(n=100, replace=False)
    #c = train_hard.loc[train_hard['label']==1].sample(n=0, replace=False)
    #d = train_hard.loc[train_hard['label']==0].sample(n=600, replace=False)
    #train = d.append(c.append(a.append(b, ignore_index=True), ignore_index=True), ignore_index=True)

    # Cache lookup: the cache file name is a sha256 hash of the test-set path.
    cachefile_path = None
    import hashlib
    import re
    cachefile = hashlib.sha256(bytes(re.escape(CONFIGURATION.gold_mapping.raw_testsets[0]), encoding='UTF-8')).hexdigest() + '.cache'
    if os.path.exists(CONFIGURATION.cachedir + cachefile) and CONFIGURATION.use_cache:
        cachefile_path = CONFIGURATION.cachedir + cachefile
    # `test` is a stream (generator) of per-sample DataFrames.
    test = stream_prepare_data_from_graph(graph1, graph2, CONFIGURATION.gold_mapping.raw_testsets[0], cachefile_path)

    # ## Prepare train/test/prediction data
    x_train = train.loc[:, train.columns != 'label']
    y_train = train['label']

    # ## Prediction
    model = ml_model#RandomForestClassifier(n_estimators=100, max_depth=10, random_state=0) #LogisticRegression(solver='lbfgs')
    model = model.fit(x_train, y_train)
    #syntactic_model = LogisticRegression(solver='lbfgs')#RandomForestClassifier(n_estimators=100, max_depth=10, random_state=0)
    #syntactic_model = syntactic_model.fit(pd.DataFrame(x_train['syntactic_diff']), y_train)
    dump(model, CONFIGURATION.rundir + 'model.joblib')
    #dump(model, CONFIGURATION.rundir + 'syntactic_model.joblib')

    prediction = list()
    gold = list()
    plus_prediction = list()
    plus_gold = list()
    ctr = 0
    df = None
    # Stream over the test samples, concatenating them into one frame.
    # NOTE(review): axis=1 concatenates samples as *columns* — verify this is
    # intended rather than axis=0 (row-wise stacking of samples).
    for sample in test:
        ctr = ctr + 1
        print(str(ctr))
        if df is None:
            df = sample
        else:
            df = pd.concat((df, sample), axis=1)
        # ctr = ctr + 1
        # prediction = prediction + model.predict(sample.loc[:, sample.columns != 'label']).tolist()
        # gold = gold + sample['label'].tolist()
        # print(str(ctr))
        # if sample.plus_diff.values[0] > 0.68 and sample.label.values[0] == 1 or sample.plus_diff.values[0] < 0.68 and sample.label.values[0] == 0:
        #     plus_prediction = plus_prediction + model.predict(sample.loc[:, sample.columns != 'label']).tolist()
        #     plus_gold = plus_gold + sample['label'].tolist()
        # CONFIGURATION.log(str(sample.iloc[0].tolist() + [prediction]) + '\n')
    prediction = np.array(prediction)
    gold = np.array(gold)
    plus_prediction = np.array(plus_prediction)
    plus_gold = np.array(plus_gold)
    # NOTE(review): classification_report's signature is (y_true, y_pred);
    # predictions are passed first here — confirm the intended orientation.
    result = classification_report(prediction, gold, target_names=['false', 'true'])
    print("Results on test:")
    print(result)
    print(ConfusionMatrix(prediction, gold))
    print("\n\n--------------------------------------------------------------\n")
    CONFIGURATION.log("Results on test:")
    CONFIGURATION.log(str(result))
    CONFIGURATION.log(str(ConfusionMatrix(prediction, gold)))
    CONFIGURATION.log("\n\n--------------------------------------------------------------\n")
    # Same report restricted to the 'plus' subset (see commented filter above).
    plus_result = classification_report(plus_prediction, plus_gold, target_names=['false', 'true'])
    print("Results on test:")
    print(plus_result)
    print(ConfusionMatrix(plus_prediction, plus_gold))
    print("\n\n--------------------------------------------------------------\n")
    CONFIGURATION.log("Results on test:")
    CONFIGURATION.log(str(plus_result))
    CONFIGURATION.log(str(ConfusionMatrix(plus_prediction, plus_gold)))
    CONFIGURATION.log("\n\n--------------------------------------------------------------\n")
    print("Syntactic matching results+ on test: 0.0%")
    CONFIGURATION.log("Syntactic matching results+ on test: 0.0%")
    print("\n################################################################\n\n")
    CONFIGURATION.log("\n################################################################\n\n")
    return PipelineDataTuple(graph1, graph2)
def exec(graph1, graph2, model):
    """Train *model* on the combined sample set, evaluate it (overall and on
    non-trivial matches only), and log heuristic schema-level correspondences.

    graph1/graph2: source and target graphs; model: sklearn-style estimator.
    Returns PipelineDataTuple(graph1, graph2).

    Fixes applied:
    * removed three no-op ``DataFrame.set_index(...)`` calls (the return value
      was discarded and ``inplace`` was not set, so they had no effect);
    * bare ``except:`` narrowed to ``(KeyError, AttributeError)``;
    * ``== None`` replaced by ``is None``; typo'd local ``correspondece_types``
      renamed; dead commented-out schema-prediction code removed.
    """
    setsize = 1000  # only used in the "Report" log header below
    # Now start prediction:
    positive_samples, negative_samples, combined_samples, combined_samples_ids = batch_prepare_data_from_graph(graph1, graph2, CONFIGURATION.gold_mapping)
    positive_samples, negative_samples, combined_samples = extend_features(positive_samples), extend_features(negative_samples), extend_features(combined_samples)
    non_trivial_matches_ids = extract_non_trivial_matches(graph1, graph2, combined_samples_ids, CONFIGURATION.src_properties, CONFIGURATION.tgt_properties, combined_samples)
    # Persist all intermediate sample sets for inspection.
    combined_samples.to_csv(CONFIGURATION.rundir + "combined.csv")
    combined_samples.to_csv(CONFIGURATION.projectdir + "combined.csv")
    combined_samples_ids.to_csv(CONFIGURATION.rundir + "combined_ids.csv")
    negative_samples.to_csv(CONFIGURATION.rundir + "negatives.csv")
    positive_samples.to_csv(CONFIGURATION.rundir + "positives.csv")
    CONFIGURATION.log("\n\n")
    CONFIGURATION.log("#####################################################\n")
    CONFIGURATION.log("#" + CONFIGURATION.name + " / " + str(model) + "\n")
    CONFIGURATION.log("-----------------------------------------------------\n")
    # Train/Test split; ids travel along with the features so the test split
    # can be joined back to the id frames below.
    X = pd.DataFrame(combined_samples.loc[:, combined_samples.columns != 'label'])
    X = pd.concat([X, combined_samples_ids], axis=1, sort=False)
    Y = pd.DataFrame(combined_samples.loc[:, 'label'])
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, Y, test_size=0.5, random_state=7)
    # Restrict the id frames to rows present in the test split (merge on the
    # common columns, including src_id/tgt_id).
    combined_samples_ids = X_test.merge(combined_samples_ids, how='inner')
    non_trivial_matches_ids = X_test.merge(non_trivial_matches_ids, how='inner')
    combined_samples_ids = combined_samples_ids.reset_index(drop=True)
    non_trivial_matches_ids = non_trivial_matches_ids.reset_index(drop=True)
    # Ids are not features - drop them before fitting/predicting.
    X_train = X_train.drop(['src_id', 'tgt_id'], axis=1)
    X_test = X_test.drop(['src_id', 'tgt_id'], axis=1)
    # fit model to training data
    model.fit(X_train, y_train.values.ravel())
    # Training-set performance.
    y_pred = np.array(model.predict(X_train))
    predictions = [1 if value > 0.5 else 0 for value in y_pred]
    CONFIGURATION.log("Macro train: " + str(precision_recall_fscore_support(y_train, predictions, average='macro')) + "\n")
    CONFIGURATION.log("Micro train: " + str(precision_recall_fscore_support(y_train, predictions, average='micro')) + "\n")
    CONFIGURATION.log("#####################################################\n")
    y_test = y_test.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)
    # Test-set performance.
    y_pred = np.array(model.predict(X_test))
    persisted_predictions = [1 if value > 0.5 else 0 for value in y_pred]
    CONFIGURATION.log("Macro test: " + str(precision_recall_fscore_support(y_test, persisted_predictions, average='macro')) + "\n")
    CONFIGURATION.log("Micro test: " + str(precision_recall_fscore_support(y_test, persisted_predictions, average='micro')) + "\n")
    CONFIGURATION.log("#####################################################\n")
    target_names = ['neg', 'pos']
    CONFIGURATION.log("Report (pos: " + str(setsize) + " / neg: " + str(setsize) + "):\n")
    CONFIGURATION.log(str(classification_report(y_test, persisted_predictions, target_names=target_names)) + "\n")
    # Restrict the report to non-trivial matches contained in the test split.
    non_trivials = pd.merge(non_trivial_matches_ids, combined_samples_ids, left_on=['src_id', 'tgt_id'], right_on=['src_id', 'tgt_id'], how='right', indicator=True)
    non_trivials = non_trivials.loc[non_trivials['_merge'] == 'both'].index.tolist()
    CONFIGURATION.log("#####################################################\n")
    CONFIGURATION.log("Report+ :" + str(classification_report(y_test.loc[y_test.index.isin(non_trivials)], np.array(persisted_predictions)[non_trivials], target_names=target_names)) + "\n")
    # Predicting schema correspondences with the instance-matching model is not
    # recommended (the model is most likely only trained on instance
    # correspondences); the dead code that did so has been removed.
    CONFIGURATION.log("\nschema matches predicted with heuristics:\n")
    # Heuristic: for every positively predicted instance pair, count the
    # (source rdf:type, target rdf:type) combination and report, per source
    # type, the target type with the highest count.
    persisted_predictions = [x == 1 for x in persisted_predictions]
    positive_predictions = combined_samples_ids[persisted_predictions]
    correspondence_types = dict()
    for index, row in positive_predictions.iterrows():
        try:
            srckey = str(graph1.elements[row['src_id']].relations['http://www.w3.org/1999/02/22-rdf-syntax-ns#type'].descriptor)
            tgtkey = str(graph2.elements[row['tgt_id']].relations['http://www.w3.org/1999/02/22-rdf-syntax-ns#type'].descriptor)
            inner = correspondence_types.setdefault(srckey, dict())
            inner[tgtkey] = inner.get(tgtkey, 0) + 1
        except (KeyError, AttributeError):
            # Element or its rdf:type relation is missing - skip this pair.
            pass
    for srckey, val in correspondence_types.items():
        # First target type with the maximal count (ties keep insertion order).
        maxtgtkey = max(val, key=val.get) if val else None
        CONFIGURATION.log(str(srckey) + " --> " + str(maxtgtkey) + "\n")
    CONFIGURATION.log("\n\n\n")
    print(" --> Evaluated; logs written to " + str(CONFIGURATION.logfile))
    return PipelineDataTuple(graph1, graph2)  # just return the original graph data; this is assumed to be the final step in the pipeline!
def main():
    """Run the HILTI matching experiment suite on the balanced-walks dataset.

    Executes a sequence of pipeline configurations that differ only in the
    embedder class, embedding dimension, walk strategy ('steps'/'batch'),
    n-gram usage and walk length; a final configuration produces
    2-dimensional embeddings for t-SNE visualization.
    """
    data_dir = os.path.join(package_directory, '..', 'data', 'sap_hilti_data', 'balanced_walks')
    src_triples = os.path.join(data_dir, 'graph_triples_hilti_erp.nt')
    tgt_triples = os.path.join(data_dir, 'graph_triples_hilti_web.nt')
    src_corpus = os.path.join(data_dir, 'corpus_hilti_erp.txt')
    tgt_corpus = os.path.join(data_dir, 'corpus_hilti_web.txt')
    gold_mapping = InternalGoldStandard({
        'trainsets': [os.path.join(data_dir, 'final_trainset.csv')],
        'testsets': [os.path.join(data_dir, 'possible_matches.csv')],
    })
    dim = 20
    model = XGBClassifier()
    src_properties = ["http://rdata2graph.sap.com/hilti_erp/property/mara_fert.maktx"]
    tgt_properties = ["http://rdata2graph.sap.com/hilti_web/property/products.name"]

    def _load_graphs(pipeline):
        # Common prefix of every experiment: parse and index both knowledge graphs.
        line_a = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None, PipelineDataTuple(src_triples))
        line_a = pipeline.append_step(GraphToolbox.interface, PipelineDataTuple(line_a), PipelineDataTuple(src_triples))
        line_b = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None, PipelineDataTuple(tgt_triples))
        line_b = pipeline.append_step(GraphToolbox.interface, PipelineDataTuple(line_b), PipelineDataTuple(tgt_triples))
        return line_a, line_b

    def _execute(name, pipeline):
        # Common suffix of every experiment: wrap the pipeline in a configuration and run it.
        # NOTE(review): the Configuration always receives `dim` (20) even when the
        # embedder itself uses a different dimension (1, 100 or 2) — kept as in the original.
        configuration = Configuration(name, src_corpus, tgt_corpus, src_triples, tgt_triples, gold_mapping, dim,
                                      pipeline, src_properties, tgt_properties,
                                      calc_PLUS_SCORE=False, use_cache=False, use_streams=False)
        ConfigurationHandler().execute(configuration)

    # Baseline: purely syntactic matching (embedding dimension fixed to 1).
    pipeline = Pipeline()
    line_a, line_b = _load_graphs(pipeline)
    line_ab = pipeline.append_step(WalkEmbedder_1.interface, PipelineDataTuple(line_a, line_b),
                                   PipelineDataTuple(1, 'steps', False, 1))
    line_ab = pipeline.append_step(concat_combiner.interface, PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(PureSyntaxMatcher.interface, PipelineDataTuple(line_ab), PipelineDataTuple(model))
    line_ab = pipeline.append_step(EmbeddingSaver.interface, PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(StableRankSyntaxMatcher.interface, PipelineDataTuple(line_ab), None)
    _execute("HILTI_pure_syntax", pipeline)

    # word2vec walk-embedding variants: (name, embedding dim, walk mode, use n-grams, walk length).
    # All share the same matcher chain: FlatMatcher -> EmbeddingSaver -> StableRankMatcher.
    for name, emb_dim, mode, ngrams, walk_length in [
            ("HILTI_w2v_steps_walklength1", dim, 'steps', False, 1),
            ("HILTI_w2v_steps_walklength1_muse", dim, 'steps', False, 1),
            ("HILTI_w2v_steps_walklength3", dim, 'steps', False, 3),
            ("HILTI_w2v_steps_walklength1_3grams", dim, 'steps', True, 1),
            ("HILTI_w2v_batch_walklength1", dim, 'batch', False, 1),
            ("HILTI_w2v_steps_walklength1_dim100", 100, 'steps', False, 1)]:
        pipeline = Pipeline()
        line_a, line_b = _load_graphs(pipeline)
        line_ab = pipeline.append_step(WalkEmbedder_1.interface, PipelineDataTuple(line_a, line_b),
                                       PipelineDataTuple(emb_dim, mode, ngrams, walk_length))
        line_ab = pipeline.append_step(concat_combiner.interface, PipelineDataTuple(line_ab), None)
        line_ab = pipeline.append_step(FlatMatcher.interface, PipelineDataTuple(line_ab), PipelineDataTuple(model))
        line_ab = pipeline.append_step(EmbeddingSaver.interface, PipelineDataTuple(line_ab), None)
        line_ab = pipeline.append_step(StableRankMatcher.interface, PipelineDataTuple(line_ab), None)
        _execute(name, pipeline)

    # doc2vec walk-embedding variant (same matcher chain as the w2v variants).
    pipeline = Pipeline()
    line_a, line_b = _load_graphs(pipeline)
    line_ab = pipeline.append_step(WalkD2V_1Embedder.interface, PipelineDataTuple(line_a, line_b),
                                   PipelineDataTuple(dim, 'steps', False, 1))
    line_ab = pipeline.append_step(concat_combiner.interface, PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(FlatMatcher.interface, PipelineDataTuple(line_ab), PipelineDataTuple(model))
    line_ab = pipeline.append_step(EmbeddingSaver.interface, PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(StableRankMatcher.interface, PipelineDataTuple(line_ab), None)
    _execute("HILTI_d2v_steps_walklength1_muse", pipeline)

    # Visualization run: 2-dimensional embeddings, projected with t-SNE and plotted.
    pipeline = Pipeline()
    line_a, line_b = _load_graphs(pipeline)
    line_ab = pipeline.append_step(WalkEmbedder_1.interface, PipelineDataTuple(line_a, line_b),
                                   PipelineDataTuple(2, 'steps', False, 1))
    line_ab = pipeline.append_step(concat_combiner.interface, PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(TSNEInterface.interface, PipelineDataTuple(line_ab), PipelineDataTuple(2))
    line_ab = pipeline.append_step(EmbeddingSaver.interface, PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(CategoriesVisualizer.interface, PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(StratifiedVisualizer.interface, PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(TypeVisualizer.interface, PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(FullVisualizer.interface, PipelineDataTuple(line_ab), None)
    _execute("HILTI_visualization", pipeline)
def main_ngram_string():
    """Run the 3-gram string experiment suite on the sap_hilti_3grams dataset.

    Compares several embedding strategies (per-graph simple-triples, word2vec,
    doc2vec, and their joint *_1 variants) with an XGBoost matcher, each
    followed by t-SNE projection and visualization steps.
    """
    data_dir = os.path.join(package_directory, '..', 'data', 'sap_hilti_data', 'sap_hilti_3grams')
    src_triples = os.path.join(data_dir, 'graph_triples_hilti_erp.nt')
    tgt_triples = os.path.join(data_dir, 'graph_triples_hilti_web.nt')
    src_corpus = os.path.join(data_dir, 'corpus_hilti_erp.txt')
    tgt_corpus = os.path.join(data_dir, 'corpus_hilti_web.txt')
    gold_mapping = InternalGoldStandard({
        'trainsets': [os.path.join(data_dir, 'train_simple_sap_hilti.csv'),
                      os.path.join(data_dir, 'train_hard_sap_hilti.csv')],
        'testsets': [os.path.join(data_dir, 'test_simple_sap_hilti.csv'),
                     os.path.join(data_dir, 'test_hard_sap_hilti.csv')],
    })
    dim = 20
    model = XGBClassifier()
    src_properties = ["http://rdata2graph.sap.com/hilti_erp/property/mara_fert.maktx"]
    tgt_properties = ["http://rdata2graph.sap.com/hilti_web/property/products.name"]

    def _load_graph(pipeline, triples):
        # Parse one triples file and index it with the graph toolbox.
        line = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None, PipelineDataTuple(triples))
        return pipeline.append_step(GraphToolbox.interface, PipelineDataTuple(line), PipelineDataTuple(triples))

    def _execute(name, pipeline):
        # Wrap the finished pipeline in a configuration and run it.
        configuration = Configuration(name, src_corpus, tgt_corpus, src_triples, tgt_triples, gold_mapping, dim,
                                      pipeline, src_properties, tgt_properties)
        ConfigurationHandler().execute(configuration)

    # Per-graph SimpleTriplesEmbedder, embeddings combined afterwards.
    pipeline = Pipeline()
    line_a = _load_graph(pipeline, src_triples)
    line_a = pipeline.append_step(SimpleTriplesEmbedder.interface, PipelineDataTuple(line_a), PipelineDataTuple(dim))
    line_b = _load_graph(pipeline, tgt_triples)
    line_b = pipeline.append_step(SimpleTriplesEmbedder.interface, PipelineDataTuple(line_b), PipelineDataTuple(dim))
    line_ab = pipeline.append_step(concat_combiner.interface, PipelineDataTuple(line_a, line_b), None)
    line_ab = pipeline.append_step(FlatMatcher.interface, PipelineDataTuple(line_ab), PipelineDataTuple(model))
    line_a = pipeline.append_step(TSNEInterface.interface, PipelineDataTuple(line_a), PipelineDataTuple(2))
    line_b = pipeline.append_step(TSNEInterface.interface, PipelineDataTuple(line_b), PipelineDataTuple(2))
    line_ab = pipeline.append_step(CategoriesVisualizer.interface, PipelineDataTuple(line_a, line_b), None)
    line_ab = pipeline.append_step(StratifiedVisualizer.interface, PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(TypeVisualizer.interface, PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(FullVisualizer.interface, PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(EmbeddingSaver.interface, PipelineDataTuple(line_ab), None)
    _execute("3gram: simpletriplesembedding xgb", pipeline)

    # Joint SimpleTriplesEmbedder_1 over both graphs.
    # BUGFIX: the original constructed and executed this identical configuration
    # twice in a row (copy-paste duplicate); it now runs only once.
    pipeline = Pipeline()
    line_a = _load_graph(pipeline, src_triples)
    line_b = _load_graph(pipeline, tgt_triples)
    line_ab = pipeline.append_step(SimpleTriplesEmbedder_1.interface, PipelineDataTuple(line_a, line_b),
                                   PipelineDataTuple(dim))
    line_ab = pipeline.append_step(FlatMatcher.interface, PipelineDataTuple(line_ab), PipelineDataTuple(model))
    line_ab = pipeline.append_step(TSNEInterface.interface, PipelineDataTuple(line_ab), PipelineDataTuple(2))
    line_ab = pipeline.append_step(CategoriesVisualizer.interface, PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(StratifiedVisualizer.interface, PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(TypeVisualizer.interface, PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(FullVisualizer.interface, PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(EmbeddingSaver.interface, PipelineDataTuple(line_ab), None)
    _execute("3gram: simpletriplesembedding_1 xgb", pipeline)

    # Per-graph word2vec over the walk corpora, embeddings combined afterwards.
    pipeline = Pipeline()
    line_a = _load_graph(pipeline, src_triples)
    line_a = pipeline.append_step(ReadSentencesInterfaceWrapper.interface, PipelineDataTuple(line_a),
                                  PipelineDataTuple(src_corpus))
    line_a = pipeline.append_step(W2VInterfaceWrapper.interface, PipelineDataTuple(line_a), PipelineDataTuple(dim))
    line_b = _load_graph(pipeline, tgt_triples)
    line_b = pipeline.append_step(ReadSentencesInterfaceWrapper.interface, PipelineDataTuple(line_b),
                                  PipelineDataTuple(tgt_corpus))
    line_b = pipeline.append_step(W2VInterfaceWrapper.interface, PipelineDataTuple(line_b), PipelineDataTuple(dim))
    line_ab = pipeline.append_step(concat_combiner.interface, PipelineDataTuple(line_a, line_b), None)
    line_ab = pipeline.append_step(FlatMatcher.interface, PipelineDataTuple(line_ab), PipelineDataTuple(model))
    line_a = pipeline.append_step(TSNEInterface.interface, PipelineDataTuple(line_a), PipelineDataTuple(2))
    line_b = pipeline.append_step(TSNEInterface.interface, PipelineDataTuple(line_b), PipelineDataTuple(2))
    line_ab = pipeline.append_step(CategoriesVisualizer.interface, PipelineDataTuple(line_a, line_b), None)
    line_ab = pipeline.append_step(StratifiedVisualizer.interface, PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(TypeVisualizer.interface, PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(FullVisualizer.interface, PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(EmbeddingSaver.interface, PipelineDataTuple(line_ab), None)
    _execute("3gram: w2v xgb", pipeline)

    # Per-graph doc2vec, embeddings combined afterwards.
    pipeline = Pipeline()
    line_a = _load_graph(pipeline, src_triples)
    line_a = pipeline.append_step(D2VInterfaceWrapper.interface, PipelineDataTuple(line_a), PipelineDataTuple(dim))
    line_b = _load_graph(pipeline, tgt_triples)
    line_b = pipeline.append_step(D2VInterfaceWrapper.interface, PipelineDataTuple(line_b), PipelineDataTuple(dim))
    line_ab = pipeline.append_step(concat_combiner.interface, PipelineDataTuple(line_a, line_b), None)
    line_ab = pipeline.append_step(FlatMatcher.interface, PipelineDataTuple(line_ab), PipelineDataTuple(model))
    line_a = pipeline.append_step(TSNEInterface.interface, PipelineDataTuple(line_a), PipelineDataTuple(2))
    line_b = pipeline.append_step(TSNEInterface.interface, PipelineDataTuple(line_b), PipelineDataTuple(2))
    line_ab = pipeline.append_step(CategoriesVisualizer.interface, PipelineDataTuple(line_a, line_b), None)
    line_ab = pipeline.append_step(StratifiedVisualizer.interface, PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(TypeVisualizer.interface, PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(FullVisualizer.interface, PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(EmbeddingSaver.interface, PipelineDataTuple(line_ab), None)
    _execute("3gram: d2v xgb", pipeline)

    # Joint word2vec over both corpora (W2V_1 variant).
    # Note: the visualizer order differs here from the per-graph variants
    # (Stratified/Type before Categories) — preserved from the original.
    pipeline = Pipeline()
    line_a = _load_graph(pipeline, src_triples)
    line_a = pipeline.append_step(ReadSentencesInterfaceWrapper.interface, PipelineDataTuple(line_a),
                                  PipelineDataTuple(src_corpus))
    line_b = _load_graph(pipeline, tgt_triples)
    line_b = pipeline.append_step(ReadSentencesInterfaceWrapper.interface, PipelineDataTuple(line_b),
                                  PipelineDataTuple(tgt_corpus))
    line_ab = pipeline.append_step(W2V_1InterfaceWrapper.interface, PipelineDataTuple(line_a, line_b),
                                   PipelineDataTuple(dim))
    line_ab = pipeline.append_step(FlatMatcher.interface, PipelineDataTuple(line_ab), PipelineDataTuple(model))
    line_ab = pipeline.append_step(TSNEInterface.interface, PipelineDataTuple(line_ab), PipelineDataTuple(2))
    line_ab = pipeline.append_step(StratifiedVisualizer.interface, PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(TypeVisualizer.interface, PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(CategoriesVisualizer.interface, PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(FullVisualizer.interface, PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(EmbeddingSaver.interface, PipelineDataTuple(line_ab), None)
    _execute("3gram: W2V_1 xgb", pipeline)

    # Joint doc2vec over both graphs (D2V_1 variant).
    pipeline = Pipeline()
    line_a = _load_graph(pipeline, src_triples)
    line_b = _load_graph(pipeline, tgt_triples)
    line_ab = pipeline.append_step(D2V_1InterfaceWrapper.interface, PipelineDataTuple(line_a, line_b),
                                   PipelineDataTuple(dim))
    line_ab = pipeline.append_step(FlatMatcher.interface, PipelineDataTuple(line_ab), PipelineDataTuple(model))
    line_ab = pipeline.append_step(TSNEInterface.interface, PipelineDataTuple(line_ab), PipelineDataTuple(2))
    line_ab = pipeline.append_step(StratifiedVisualizer.interface, PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(TypeVisualizer.interface, PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(CategoriesVisualizer.interface, PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(FullVisualizer.interface, PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(EmbeddingSaver.interface, PipelineDataTuple(line_ab), None)
    _execute("3gram: D2V_1 xgb", pipeline)
def main():
    """Run a battery of OAEI embedding-matching experiment configurations.

    Every experiment shares the same source/target knowledge graphs and
    gold standard; they differ only in the walk embedder used, its
    parameters (dimension, walk generation mode, n-gram flag, walk length)
    and whether the optional muse alignment / TSNE projection steps are
    included.  Each configuration is built and executed independently.
    """
    src_triples = os.path.join(package_directory, '..', 'data', 'oaei_data',
                               'graph_triples_darkscape.nt')
    tgt_triples = os.path.join(package_directory, '..', 'data', 'oaei_data',
                               'graph_triples_oldschoolrunescape.nt')
    src_corpus = os.path.join(package_directory, '..', 'data', 'oaei_data',
                              'corpus_darkscape.txt')
    tgt_corpus = os.path.join(package_directory, '..', 'data', 'oaei_data',
                              'corpus_oldschoolrunescape.txt')
    gold_mapping = InternalGoldStandard({
        'trainsets': [os.path.join(package_directory, '..', 'data',
                                   'oaei_data', 'oaei_gold_standard2.csv')],
        'testsets': [os.path.join(package_directory, '..', 'data',
                                  'oaei_data', 'possible_matches.csv')]
    })
    dim = 20
    model = XGBClassifier()  # alternative: LogisticRegression()
    labelfile = os.path.join(package_directory, '..', 'data', 'oaei_data',
                             'labels.txt')
    src_properties = StringMatcher_Interface.get_labels_from_file(labelfile)
    tgt_properties = StringMatcher_Interface.get_labels_from_file(labelfile)

    def run_config(name, embedder, embedder_args,
                   use_muse=False, use_tsne=False):
        # Build the shared load/index pipeline, append the per-experiment
        # embedding and matching steps, then execute the configuration.
        pipeline = Pipeline()
        line_a = pipeline.append_step(load_kg_with_rdflib_ttl_interface,
                                      None, PipelineDataTuple(src_triples))
        line_a = pipeline.append_step(GraphToolbox.interface,
                                      PipelineDataTuple(line_a),
                                      PipelineDataTuple(src_triples))
        line_b = pipeline.append_step(load_kg_with_rdflib_ttl_interface,
                                      None, PipelineDataTuple(tgt_triples))
        line_b = pipeline.append_step(GraphToolbox.interface,
                                      PipelineDataTuple(line_b),
                                      PipelineDataTuple(tgt_triples))
        line_ab = pipeline.append_step(embedder.interface,
                                       PipelineDataTuple(line_a, line_b),
                                       PipelineDataTuple(*embedder_args))
        line_ab = pipeline.append_step(concat_combiner.interface,
                                       PipelineDataTuple(line_ab), None)
        if use_muse:
            # Optional cross-lingual embedding alignment before matching.
            line_ab = pipeline.append_step(muse.interface,
                                           PipelineDataTuple(line_ab),
                                           PipelineDataTuple(gold_mapping))
        line_ab = pipeline.append_step(EmbeddingMatcher.interface,
                                       PipelineDataTuple(line_ab),
                                       PipelineDataTuple(model))
        if use_tsne:
            # Optional 2-D projection of the embeddings for inspection.
            line_ab = pipeline.append_step(TSNEInterface.interface,
                                           PipelineDataTuple(line_ab),
                                           PipelineDataTuple(2))
        line_ab = pipeline.append_step(EmbeddingSaver.interface,
                                       PipelineDataTuple(line_ab), None)
        line_ab = pipeline.append_step(StableRankEmbeddingsMatcher.interface,
                                       PipelineDataTuple(line_ab), None)
        configuration = Configuration(name, src_corpus, tgt_corpus,
                                      src_triples, tgt_triples, gold_mapping,
                                      dim, pipeline, src_properties,
                                      tgt_properties, calc_PLUS_SCORE=False,
                                      use_cache=False, use_streams=False)
        configuration_handler = ConfigurationHandler()
        configuration_handler.execute(configuration)

    # embedder args: (dimension, walk mode, use 3-grams, walk length)
    run_config("OAEI_emb_w2v_steps_walklength1", WalkEmbedder_1,
               (dim, 'steps', False, 1))
    run_config("OAEI_emb_w2v_steps_walklength1_muse", WalkEmbedder_1,
               (dim, 'steps', False, 1), use_muse=True)
    run_config("OAEI_emb_w2v_steps_walklength1_tsne", WalkEmbedder_1,
               (dim, 'steps', False, 1), use_tsne=True)
    run_config("OAEI_emb_w2v_steps_walklength3", WalkEmbedder_1,
               (dim, 'steps', False, 3))
    run_config("OAEI_emb_w2v_steps_walklength1_3grams", WalkEmbedder_1,
               (dim, 'steps', True, 1))
    run_config("OAEI_emb_w2v_batch_walklength1", WalkEmbedder_1,
               (dim, 'batch', False, 1))
    # NOTE(review): the embedder here uses dimension 100 while the
    # Configuration still receives dim=20 — preserved from the original
    # code; confirm this asymmetry is intended.
    run_config("OAEI_emb_w2v_steps_walklength1_dim100", WalkEmbedder_1,
               (100, 'steps', False, 1))
    run_config("OAEI_emb_d2v_steps_walklength1_muse", WalkD2V_1Embedder,
               (dim, 'steps', False, 1), use_muse=True)
def exec(graph1, graph2, model):
    """Cross-validate *model* on match candidates derived from two graphs and
    log an extensive evaluation report.

    Builds a labelled sample set from the SAP/Hilti gold standard, runs a
    stratified 5-fold cross-validation, logs F1/precision/recall for the full
    set and for the non-trivial matches only, and finally derives class-level
    (schema) correspondences by majority vote over predicted instance matches.
    Returns a PipelineDataTuple of the unchanged input graphs.
    """
    setsize = 1000  # NOTE(review): appears unused in this function
    # Now start prediction:
    package_directory = os.path.dirname(os.path.abspath(__file__))
    # Hard-coded gold standard path overrides whatever was configured before.
    CONFIGURATION.gold_mapping = os.path.join(
        package_directory, '..', '..', 'data', 'sap_hilti_data',
        'sap_hilti_full_strings', 'hq_sap_hilti_gold_stratified.csv')
    positive_samples, negative_samples, combined_samples, combined_samples_ids = batch_prepare_data_from_graph(
        graph1, graph2, CONFIGURATION.gold_mapping)
    positive_samples, negative_samples, combined_samples = extend_features(
        positive_samples), extend_features(negative_samples), extend_features(
        combined_samples)
    # Non-trivial matches = pairs that plain string property comparison would
    # not already resolve; used below for the stricter "Report+" evaluation.
    non_trivial_matches_ids = extract_non_trivial_matches(
        graph1, graph2, combined_samples_ids, CONFIGURATION.src_properties,
        CONFIGURATION.tgt_properties, combined_samples)
    # Persist the prepared sample sets for later inspection / reuse.
    combined_samples.to_csv(CONFIGURATION.rundir + "strcombined.csv")
    # pd.merge(pd.merge(non_trivial_matches_ids, combined_samples_ids, left_on=['src_id','tgt_id'], right_on=['src_id','tgt_id'], how='inner', indicator=False),
    #          combined_samples, right_index=True, left_index=True).drop(['src_id','tgt_id'], axis=1).to_csv(CONFIGURATION.rundir+"snon_trivials.csv")
    combined_samples_ids.to_csv(CONFIGURATION.rundir + "strcombined_ids.csv")
    negative_samples.to_csv(CONFIGURATION.rundir + "strnegatives.csv")
    positive_samples.to_csv(CONFIGURATION.rundir + "strpositives.csv")
    # Report header identifying this run and model in the log file.
    CONFIGURATION.log("\n\n")
    CONFIGURATION.log(
        "#####################################################\n")
    CONFIGURATION.log("#" + CONFIGURATION.name + " / " + str(model) + "\n")
    CONFIGURATION.log(
        "-----------------------------------------------------\n")
    #Train/Test split
    X = pd.DataFrame(combined_samples.loc[:, combined_samples.columns != 'label'])
    #X = pd.concat([X, combined_samples_ids], axis=1, sort=False)
    Y = pd.DataFrame(combined_samples.loc[:, 'label'])
    from sklearn import metrics  # NOTE(review): imported but unused here
    # Stratified 5-fold CV; shuffle without a fixed seed, so fold assignment
    # (and thus the exact scores) varies between runs.
    cv = StratifiedKFold(n_splits=5, random_state=None, shuffle=True)
    per = cross_validate(model, X, Y, cv=cv,
                         scoring=('f1_micro', 'f1_macro', 'precision',
                                  'recall'), return_train_score=True)
    # Log mean +/- std plus the raw per-fold scores for each metric.
    CONFIGURATION.log("F1-macro test\t" + str(np.average(per['test_f1_macro'])) + " +/-" + str(np.std(per['test_f1_macro'])) + "\t" + str(per['test_f1_macro']) + "\n")
    CONFIGURATION.log("F1-macro train\t" + str(np.average(per['train_f1_macro'])) + " +/-" + str(np.std(per['train_f1_macro'])) + "\t" + str(per['train_f1_macro']) + "\n")
    CONFIGURATION.log("F1-micro test:\t" + str(np.average(per['test_f1_micro'])) + " +/-" + str(np.std(per['test_f1_micro'])) + "\t" + str(per['test_f1_micro']) + "\n")
    CONFIGURATION.log("F1-micro train:\t" + str(np.average(per['train_f1_micro'])) + " +/-" + str(np.std(per['train_f1_micro'])) + "\t" + str(per['train_f1_micro']) + "\n")
    CONFIGURATION.log("Precision test:\t" + str(np.average(per['test_precision'])) + " +/-" + str(np.std(per['test_precision'])) + "\t" + str(per['test_precision']) + "\n")
    CONFIGURATION.log("Precision train:\t" + str(np.average(per['train_precision'])) + " +/-" + str(np.std(per['train_precision'])) + "\t" + str(per['train_precision']) + "\n")
    CONFIGURATION.log("Recall test:\t\t" + str(np.average(per['test_recall'])) + " +/-" + str(np.std(per['test_recall'])) + "\t" + str(per['test_recall']) + "\n")
    CONFIGURATION.log("Recall train:\t\t" + str(np.average(per['train_recall'])) + " +/-" + str(np.std(per['train_recall'])) + "\t" + str(per['train_recall']) + "\n")
    from sklearn.model_selection import cross_val_predict
    # Out-of-fold predictions for every sample, then thresholded at 0.5.
    y_pred = cross_val_predict(model, X, Y, cv=cv)
    y_pred = np.array(y_pred) #scipy.stats.zscore(np.array(y_pred))
    predictions = [1 if value > 0.5 else 0 for value in y_pred]
    # evaluate predictions
    persisted_predictions = [1 if value > 0.5 else 0 for value in y_pred]
    CONFIGURATION.log('\nDataset meta info:\n')
    CONFIGURATION.log('Actual samples ' + str(len(Y)) + ' / Positive samples '
                      + str(len(Y.loc[Y['label'] == 1])) +
                      ' / Negative samples ' +
                      str(len(Y.loc[Y['label'] == 0])) + '\n')
    CONFIGURATION.log(
        'Predicted samples ' + str(len(Y)) + ' / Positive samples ' +
        str(len(np.where(np.array(persisted_predictions) == 1)[0])) +
        ' / Negative samples ' +
        str(len(np.where(np.array(persisted_predictions) == 0)[0])) + '\n')
    CONFIGURATION.log(
        "#####################################################\n")
    # evaluate predictions
    # Right-merge keeps combined_samples_ids' row order/index; rows present in
    # both frames ('_merge' == 'both') are the non-trivial candidates, and
    # their positional indices are used to slice the prediction arrays below.
    non_trivials = pd.merge(non_trivial_matches_ids, combined_samples_ids,
                            left_on=['src_id', 'tgt_id'],
                            right_on=['src_id', 'tgt_id'], how='right',
                            indicator=True)
    non_trivials = non_trivials.loc[non_trivials['_merge'] == 'both'].index.tolist()
    #y_test = y_test['label']
    target_names = ['pos', 'neg']
    # "Report+" = classification report restricted to non-trivial matches.
    CONFIGURATION.log("Report+:" + str(
        classification_report(Y.loc[Y.index.isin(non_trivials)],
                              np.array(persisted_predictions)[non_trivials],
                              target_names=target_names)) + "\n")
    CONFIGURATION.log('\nDataset meta info:\n')
    # NOTE(review): the 'Actual'/'Predicted' labels look swapped relative to
    # the data sources used (predictions vs. gold labels) — confirm intent.
    CONFIGURATION.log('Actual samples ' + str(
        len(non_trivials)
    ) + ' / Positive samples ' + str(
        len(np.where(np.array(persisted_predictions)[non_trivials] == 1)[0])
    ) + ' / Negative samples ' + str(
        len(np.where(np.array(persisted_predictions)[non_trivials] == 0)[0])) + '\n')
    CONFIGURATION.log(
        'Predicted samples ' + str(len(non_trivials)) +
        ' / Positive samples ' + str(len(np.where(
            np.array(Y.loc[Y.index.isin(non_trivials)]) == 1)[0])) +
        ' / Negative samples ' + str(len(np.where(
            np.array(Y.loc[Y.index.isin(non_trivials)]) == 0)[0])) + '\n')
    # Schema correspondence predictions
    # In the following code segment, schema correspondences are predicted using the instance-matching model.
    # However, this method is not recommended, as the model is (most likely) primarily or only trained on
    # instance-correspondences.
    '''schema_data, schema_data_ids = get_schema_data_from_graph(graph1, graph2)
    schema_data = extend_features(schema_data)
    y_pred = model.predict(schema_data)
    y_pred = scipy.stats.zscore(np.array(y_pred))
    predictions = [1 if value > 0 else 0 for value in y_pred]
    schema_predicted = pd.concat([pd.DataFrame({"prediction":predictions}), schema_data_ids], axis=1, sort=False)
    schema_predicted.to_csv(index=False,path_or_buf=package_directory+"/../../predicted_data.csv", header=False)
    pd.options.display.max_colwidth = 100
    pd.set_option('display.max_colwidth', -1)
    CONFIGURATION.log("\nschema matches predicted with ML model:\n")
    schema_predicted = schema_predicted[schema_predicted['prediction'] == 1]
    #CONFIGURATION.log(schema_predicted.to_string()+"\n")'''
    CONFIGURATION.log("\nschema matches predicted with heuristics:\n")
    # Boolean mask selecting the ID rows the model predicted as matches.
    persisted_predictions = [x == 1 for x in persisted_predictions]
    positive_predictions = combined_samples_ids[persisted_predictions]
    # Count, per source rdf:type, how often each target rdf:type co-occurs
    # among the predicted instance matches.
    correspondece_types = dict()
    for index, row in positive_predictions.iterrows():
        try:
            srckey = str(graph1.elements[row['src_id']].relations[
                'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'].descriptor)
            tgtkey = str(graph2.elements[row['tgt_id']].relations[
                'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'].descriptor)
            if (srckey in correspondece_types.keys()):
                if (tgtkey in correspondece_types[srckey].keys()):
                    correspondece_types[srckey][
                        tgtkey] = correspondece_types[srckey][tgtkey] + 1
                else:
                    correspondece_types[srckey][tgtkey] = 1
            else:
                correspondece_types[srckey] = dict()
                correspondece_types[srckey][tgtkey] = 1
        except:
            # Elements without an rdf:type relation are skipped silently.
            pass
    # Majority vote: report the most frequent target type per source type.
    for srckey, val in correspondece_types.items():
        maxtgtkey = None
        for tgtkey, count in val.items():
            if maxtgtkey == None:
                maxtgtkey = tgtkey
            if count > val[maxtgtkey]:
                maxtgtkey = tgtkey
        CONFIGURATION.log(str(srckey) + " --> " + str(maxtgtkey) + "\n")
    CONFIGURATION.log("\n\n\n")
    print(" --> Evaluated; logs written to " + str(CONFIGURATION.logfile))
    return PipelineDataTuple(
        graph1, graph2
    )  # just return the original graph data; this is assumed to be the final step in the pipeline!