Example #1
def execute(graph1, dim):
    if graph1.corpus is None:
        print("!!! Graph has no corpus !!!")
        return PipelineDataTuple(graph1)
    model = word2vec_embedding_from_sentences_v2(graph1.corpus, CONFIGURATION, sg=0, size=dim, window=500)
    for descriptor, resource in graph1.elements.items():
        try:
            resource.embeddings.append(np.array(model[descriptor.lower()]).astype(float).tolist())
        except KeyError:
            resource.embeddings.append(np.array(model["<>"]).astype(float).tolist())
            print("Key " + descriptor + " not found ... proceeding")
    return PipelineDataTuple(graph1)
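
# A minimal, self-contained sketch of the lookup-with-fallback pattern used above. It assumes a
# plain gensim 4.x Word2Vec model; the project-specific helper word2vec_embedding_from_sentences_v2
# and the graph/PipelineDataTuple classes are not required here.
import numpy as np
from gensim.models import Word2Vec

sentences = [["node_a", "node_b", "<>"], ["node_b", "node_c", "<>"]]  # toy corpus with the "<>" filler token
toy_model = Word2Vec(sentences, vector_size=4, window=2, min_count=1, sg=0)

def lookup(model, descriptor):
    # Fall back to the "<>" vector when the descriptor was never seen during training.
    try:
        return np.asarray(model.wv[descriptor.lower()], dtype=float).tolist()
    except KeyError:
        return np.asarray(model.wv["<>"], dtype=float).tolist()

print(lookup(toy_model, "node_a"))   # known key
print(lookup(toy_model, "unknown"))  # falls back to "<>"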
def exec(graph1, graph2):

    for gold_mapping in CONFIGURATION.gold_mapping.raw_trainsets:
        print("     --> Preparing training data.")
        # package_directory = os.path.dirname(os.path.abspath(__file__))
        #gold_mapping = CONFIGURATION.gold_mapping.raw_trainsets[0]#os.path.join(package_directory, '..','..', 'data', 'sap_hilti_data','sap_hilti_full_strings',
        #     'train_simple_sap_hilti.csv')
        save(graph1, graph2, ntpath.basename(gold_mapping), gold_mapping)
        path_to_set = CONFIGURATION.rundir + ntpath.basename(
            gold_mapping) + "-strcombined.csv"
        path_to_idset = CONFIGURATION.rundir + ntpath.basename(
            gold_mapping) + "-strcombined_ids.csv"
        df = pd.read_csv(path_to_set, index_col=['Unnamed: 0'], sep="\t", encoding="UTF-8").\
                merge(pd.read_csv(path_to_idset, index_col=['Unnamed: 0'], sep="\t", encoding="UTF-8"), left_index=True,\
                right_index=True)
        df.to_csv(CONFIGURATION.rundir + ntpath.basename(gold_mapping) +
                  "_merged.csv",
                  sep="\t")
        CONFIGURATION.gold_mapping.prepared_trainsets.append(df)

    #gold_mapping = CONFIGURATION.gold_mapping.raw_trainsets[1] #os.path.join(package_directory, '..','..', 'data', 'sap_hilti_data','sap_hilti_full_strings',
    #                #'train_hard_sap_hilti.csv')
    #save(graph1, graph2, 'train_hard', gold_mapping)
    #CONFIGURATION.gold_mapping.prepared_trainsets.append(CONFIGURATION.rundir + 'train_hard' + "-strcombined.csv")

    if CONFIGURATION.match_cross_product:
        print("     --> No testset provided. Preparing cross product.")
        filepath = CONFIGURATION.rundir + str(uuid.uuid4().hex) + ".tmp"
        print('         Blocking by syntax, progress: 0%', end="\r")
        parallel.main(CONFIGURATION.src_triples, CONFIGURATION.tgt_triples,
                      CONFIGURATION.src_properties, filepath)
        print('         Blocking by syntax, progress: 100%')
        CONFIGURATION.gold_mapping.raw_testsets = [filepath]
    else:
        print("     --> Preparing testset.")
    for gold_mapping in CONFIGURATION.gold_mapping.raw_testsets:
        #gold_mapping = CONFIGURATION.gold_mapping.raw_testsets[0]#os.path.join(package_directory, '..','..', 'data', 'sap_hilti_data','sap_hilti_full_strings',
        #     'test_simple_sap_hilti.csv')
        save(graph1, graph2, ntpath.basename(gold_mapping), gold_mapping)
        path_to_set = CONFIGURATION.rundir + ntpath.basename(
            gold_mapping) + "-strcombined.csv"
        path_to_idset = CONFIGURATION.rundir + ntpath.basename(
            gold_mapping) + "-strcombined_ids.csv"
        df = pd.read_csv(path_to_set, index_col=['Unnamed: 0'], sep="\t", encoding="UTF-8").\
                merge(pd.read_csv(path_to_idset, index_col=['Unnamed: 0'], sep="\t", encoding="UTF-8"), left_index=True,\
                right_index=True)
        df.to_csv(CONFIGURATION.rundir + ntpath.basename(gold_mapping) +
                  "_merged.csv",
                  sep="\t")
        CONFIGURATION.gold_mapping.prepared_testsets.append(df)


#            gold_mapping = CONFIGURATION.gold_mapping.raw_testsets[1]#os.path.join(package_directory, '..','..', 'data', 'sap_hilti_data','sap_hilti_full_strings',
#               #     'test_hard_sap_hilti.csv')
#            save(graph1, graph2, 'test_hard', gold_mapping)
#            CONFIGURATION.gold_mapping.prepared_testsets.append(CONFIGURATION.rundir + 'test_hard' + "-strcombined.csv")

    return PipelineDataTuple(
        graph1, graph2
    )  # just return the original graph data; this is assumed to be the final step in the pipeline!
def execute(graph1, graph2, dim):
    documents = prepare_data(graph1)
    documents = documents + prepare_data(graph2)
    model = train(documents, dim)
    fill_graph(graph1, model)
    fill_graph(graph2, model)
    return PipelineDataTuple(graph1, graph2)
def execute(graph):
    for descriptor, resource in graph.elements.items():
        tmp = list()
        for embedding in resource.embeddings:
            for num in embedding:
                tmp = tmp + [num]
        resource.embeddings = [tmp]
    return PipelineDataTuple(graph)
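
# The loop above concatenates all of a resource's embedding vectors into a single flat vector.
# An equivalent stand-alone sketch of that flattening step using itertools:
from itertools import chain

embeddings = [[0.1, 0.2], [0.3, 0.4, 0.5]]        # e.g. a word2vec part plus a doc2vec part
flat = [list(chain.from_iterable(embeddings))]    # -> [[0.1, 0.2, 0.3, 0.4, 0.5]]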
Example #5
def execute(graph1, graph2, dim, properties):
    predicates1, documents1 = prepare_data(graph1, properties)
    predicates2, documents2 = prepare_data(graph2, properties)
    documents = documents1 + documents2
    model = train(documents, dim)
    fill_graph(graph1, model, predicates1)
    fill_graph(graph2, model, predicates2)
    return PipelineDataTuple(graph1, graph2)
def load_kg_with_rdflib(path, format=None):
    g = Graph()
    with open(path, 'rb') as f:
        g.parse(f, format=format)

    #test = __get_namespace(g)
    #test = list(g.namespaces())

    return PipelineDataTuple(__yield_object(g), __yield_literal(g))
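
# Minimal rdflib sketch of what the loader above does before handing the parsed graph to the
# project-specific __yield_object/__yield_literal generators: parse an N-Triples file and iterate
# over its triples. The file name is only an illustration.
from rdflib import Graph, Literal

g = Graph()
g.parse("example_triples.nt", format="nt")   # hypothetical path
for s, p, o in g:
    kind = "literal" if isinstance(o, Literal) else "object"
    print(s, p, o, "->", kind)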
def interface(main_input, args, configuration):
    global CONFIGURATION
    CONFIGURATION = configuration
    graph1 = main_input.get(0)
    graph2 = main_input.get(1)
    assert graph1 is not None, "Graph not found in " + os.path.basename(sys.argv[0])
    if graph2 is None:
        return execute(graph1)
    else:
        return PipelineDataTuple(execute(graph1).elems[0], execute(graph2).elems[0])
def exec(graph1, graph2):

    gold_mapping = CONFIGURATION.gold_mapping.raw_trainsets[0]
    save(graph1, graph2, 'train', gold_mapping)
    CONFIGURATION.gold_mapping.prepared_trainsets.append(CONFIGURATION.rundir +
                                                         'train' +
                                                         "-strcombined.csv")

    return PipelineDataTuple(
        graph1, graph2
    )  # just return the original graph data; this is assumed to be the final step in the pipeline!
Example #9
def interface(main_input, args, configuration):
    global CONFIGURATION
    CONFIGURATION = configuration
    nt_filepath = args.get(0)
    spo_generator = main_input.get(0)
    spl_generator = main_input.get(1)
    assert spo_generator is not None, "S-P-O generator not found in " + os.path.basename(
        sys.argv[0])
    assert spl_generator is not None, "S-P-L generator not found in " + os.path.basename(
        sys.argv[0])
    assert nt_filepath is not None, "Path to NT-sourcefile not found in " + os.path.basename(
        sys.argv[0])
    return PipelineDataTuple(Graph(spo_generator, spl_generator, nt_filepath))
def execute(graph1,
            graph2,
            dim,
            sentence_generation_method,
            ngrams=False,
            maxdepth=1):
    documents = prepare_data(graph1, sentence_generation_method, ngrams,
                             maxdepth)
    documents = documents + prepare_data(graph2, sentence_generation_method,
                                         ngrams, maxdepth)
    model = train(documents, dim, ngrams)
    fill_graph(graph1, model)
    fill_graph(graph2, model)
    return PipelineDataTuple(graph1, graph2)
Example #11
def execute(graph1, graph2, dim):
    documents, documents_ids = prepare_data(graph1, dict(), list())
    documents, documents_ids = prepare_data(graph2, documents_ids, documents)

    global ctr
    documents.append(["<>", "<>"])
    documents_ids["<>"] = ctr
    ctr += 1

    model = train(documents, dim)
    with open(CONFIGURATION.rundir + "document_ids.csv", mode="w+") as f:
        for descriptor, index in documents_ids.items():
            f.write(descriptor + "," + str(index) + "\n")
    fill_graph(graph1, model, documents_ids)
    fill_graph(graph2, model, documents_ids)
    return PipelineDataTuple(graph1, graph2)
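
# The train/fill_graph helpers are not shown above; a plausible stand-alone sketch under the
# assumption that a doc2vec-style model keyed by the integer ids in documents_ids is meant
# (gensim 4.x API):
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

documents = [["node_a", "label_a"], ["node_b", "label_b"], ["<>", "<>"]]
documents_ids = {"node_a": 0, "node_b": 1, "<>": 2}
tagged = [TaggedDocument(words, [documents_ids[words[0]]]) for words in documents]
model = Doc2Vec(tagged, vector_size=4, min_count=1, epochs=5)
print(model.dv[documents_ids["node_a"]])   # embedding for the document tagged 0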
Example #12
def exec(graph1, graph2, matchings_filename):

    married_matches = pd.read_csv(CONFIGURATION.rundir + matchings_filename,
                                  sep="\t",
                                  encoding="UTF-8")
    starttag = '<?xml version="1.0" encoding="utf-8"?>\n<rdf:RDF xmlns="http://knowledgeweb.semanticweb.org/heterogeneity/alignment"\n  xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"\n  xmlns:xsd="http://www.w3.org/2001/XMLSchema#">\n<Alignment>\n  <xml>yes</xml>\n  <level>0</level>\n  <type>??</type>\n  <onto1>\n    <Ontology rdf:about="darkscape">\n      <location>http://darkscape.wikia.com</location>\n    </Ontology>\n  </onto1>\n  <onto2>\n    <Ontology rdf:about="oldschoolrunescape">\n      <location>http://oldschoolrunescape.wikia.com</location>\n    </Ontology>\n  </onto2>\n'
    endtag = '</Alignment>\n</rdf:RDF>'
    os.mkdir(CONFIGURATION.rundir + matchings_filename.replace(".csv", ""))
    with open(CONFIGURATION.rundir + matchings_filename.replace(".csv", "") +
              str(os.sep) + 'darkscape~oldschoolrunescape~results.xml',
              "w+",
              encoding="UTF-8") as f:
        f.write(starttag)
        for index, row in married_matches.iterrows():
            f.write(
                create_elem(
                    str(row.src_id).replace("&", "&amp;"),
                    str(row.tgt_id).replace("&", "&amp;")) + "\n")
        f.write(endtag)

    return PipelineDataTuple(graph1, graph2)
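
# create_elem is not shown above; a plausible sketch (an assumption, not the project's actual
# helper) that emits one correspondence as an Alignment-format "Cell" element between the start
# and end tags written above:
def create_elem(src_id, tgt_id, measure="1.0"):
    return ('  <map>\n'
            '    <Cell>\n'
            '      <entity1 rdf:resource="' + src_id + '"/>\n'
            '      <entity2 rdf:resource="' + tgt_id + '"/>\n'
            '      <relation>=</relation>\n'
            '      <measure rdf:datatype="xsd:float">' + measure + '</measure>\n'
            '    </Cell>\n'
            '  </map>')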
Example #13
def execute(graph, dim=20):
    embeddings = None
    for descriptor, resource in graph.elements.items():
        if embeddings is None:
            embeddings = np.array([[descriptor] + resource.embeddings[0]])
        else:
            embeddings = np.append(embeddings,
                                   [[descriptor] + resource.embeddings[0]],
                                   axis=0)
    df = pd.DataFrame(embeddings)
    df.set_index(0)

    pca = decomposition.KernelPCA(n_components=dim, kernel='rbf')
    reduced_df = pca.fit_transform(
        df[[df.columns[i] for i in range(len(df.columns)) if not i == 0]])
    reduced_df = pd.DataFrame(reduced_df)
    reduced_df.loc[:, dim] = df[0]
    reduced_df = reduced_df.set_index(dim)
    for descriptor, reduced_embedding in reduced_df.iterrows():
        graph.elements[descriptor].embeddings[0] = reduced_embedding.tolist()

    return PipelineDataTuple(graph)
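
# Stand-alone sketch of the dimensionality-reduction step above, using scikit-learn's KernelPCA
# directly on a toy embedding matrix (no graph structures involved):
import numpy as np
from sklearn import decomposition

X = np.random.rand(10, 50)                           # 10 resources, 50-dimensional embeddings
pca = decomposition.KernelPCA(n_components=5, kernel='rbf')
X_reduced = pca.fit_transform(X)                     # shape (10, 5)
print(X_reduced.shape)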
def execute(graph1, dim, properties):
    predicates, documents = prepare_data(graph1, properties)
    models = train(documents, dim)
    fill_graph(graph1, models, predicates)
    return PipelineDataTuple(graph1)
Example #15
def execute(graph1, graph2):
    for root, dir, files in os.walk(os.path.join(CONFIGURATION.musedir, "data",
                                                 "dumped"),
                                    topdown=False):
        for name in files:
            os.remove(os.path.join(root, name))
        for name in dir:
            os.rmdir(os.path.join(root, name))

    try:
        os.remove(
            os.path.join(CONFIGURATION.musedir, "data", "crosslingual",
                         "dictionaries", "src-tgt.txt"))
    except FileNotFoundError:
        pass
    try:
        os.remove(
            os.path.join(CONFIGURATION.musedir, "data", "embeddings1.vec"))
    except FileNotFoundError:
        pass
    try:
        os.remove(
            os.path.join(CONFIGURATION.musedir, "data", "embeddings2.vec"))
    except FileNotFoundError:
        pass

    f = open(os.path.join(CONFIGURATION.musedir, "data", "embeddings1.vec"),
             "w+")
    ctr = 0
    dim = None
    for descriptor, resource in graph1.elements.items():
        f.write(descriptor + " " + str(resource.embeddings).replace(
            "[", "").replace("]", "").replace(",", "") + " \n")
        ctr = ctr + 1
        if dim is None:
            dim = len(resource.embeddings[0])
    f.close()
    line_pre_adder(
        os.path.join(CONFIGURATION.musedir, "data", "embeddings1.vec"),
        str(ctr) + " " + str(dim) + "\n")

    f = open(os.path.join(CONFIGURATION.musedir, "data", "embeddings2.vec"),
             "w+")
    ctr = 0
    dim = None
    for descriptor, resource in graph2.elements.items():
        f.write(descriptor + " " + str(resource.embeddings).replace(
            "[", "").replace("]", "").replace(",", "") + " \n")
        ctr = ctr + 1
        if dim is None:
            dim = len(resource.embeddings[0])
    f.close()
    line_pre_adder(
        os.path.join(CONFIGURATION.musedir, "data", "embeddings2.vec"),
        str(ctr) + " " + str(dim) + "\n")

    gs = None
    for path_to_gs in CONFIGURATION.gold_mapping.raw_trainsets:
        if gs is None:
            gs = pd.read_csv(path_to_gs, header=None, delimiter='\t')
        else:
            tmp_gs = pd.read_csv(path_to_gs, header=None, delimiter='\t')
            gs = gs.append(tmp_gs, ignore_index=True)
    gs = gs.loc[gs[2] == 1]
    gs.to_csv(os.path.join(CONFIGURATION.musedir, "data", "crosslingual",
                           "dictionaries", "src-tgt.txt"),
              header=False,
              index=False,
              sep='\t')
    gs.to_csv(os.path.join(CONFIGURATION.musedir, "data", "crosslingual",
                           "dictionaries", "src-tgt.0-5000.txt"),
              header=False,
              index=False,
              sep='\t')

    align(os.path.join(CONFIGURATION.musedir, "data", "embeddings1.vec"),
          os.path.join(CONFIGURATION.musedir, "data", "embeddings2.vec"), dim)

    for root, dirs, files in os.walk(os.path.join(CONFIGURATION.musedir,
                                                  "dumped", "debug"),
                                     topdown=False):
        for dir in dirs:
            emb_dir = root + str(os.sep) + dir

    ctr = 0
    for line in open(emb_dir + str(os.sep) + "vectors-src.txt", "r"):
        if ctr < 1:
            ctr = ctr + 1
            continue
        line = line.split()
        try:
            tmp = list()
            tmp.append(np.array(line[1:len(line)]).astype(float).tolist())
            graph1.elements[line[0]].embeddings = tmp
        except KeyError:
            print("key not found for " + line[0])

    ctr = 0
    for line in open(emb_dir + str(os.sep) + "vectors-tgt.txt", "r"):
        if ctr < 1:
            ctr = ctr + 1
            continue
        line = line.split()
        try:
            tmp = list()
            tmp.append(np.array(line[1:len(line)]).astype(float).tolist())
            graph2.elements[line[0]].embeddings = tmp
        except KeyError:
            print("key not found for " + line[0])

    return PipelineDataTuple(graph1, graph2)
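
# The two .vec files written above use the plain word2vec text format: a header line
# "<vocab_size> <dim>" (prepended afterwards via line_pre_adder) followed by one
# "<token> v1 v2 ..." line per entry. A minimal sketch of writing such a file directly;
# the path is only an illustration:
vectors = {"node_a": [0.1, 0.2, 0.3], "node_b": [0.4, 0.5, 0.6]}
with open("embeddings_demo.vec", "w", encoding="utf-8") as f:
    f.write(str(len(vectors)) + " " + str(len(next(iter(vectors.values())))) + "\n")
    for token, vec in vectors.items():
        f.write(token + " " + " ".join(str(v) for v in vec) + "\n")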
def execute(graph1, corpus_file, properties):
    graph1.corpus = read_from_file(corpus_file, properties)
    return PipelineDataTuple(graph1)
Example #17
def main():



    #src_triples = os.path.join(package_directory, '..', 'data', 'sap_hilti_data', 'sap_hilti_full_strings',
    #                           'graph_triples_hilti_erp.nt')
    #tgt_triples = os.path.join(package_directory, '..', 'data', 'sap_hilti_data', 'sap_hilti_full_strings',
    #                           'graph_triples_hilti_web.nt')
    #src_corpus = os.path.join(package_directory, '..', 'data', 'sap_hilti_data', 'sap_hilti_full_strings',
    #                          'corpus_hilti_erp.txt')
    #tgt_corpus = os.path.join(package_directory, '..', 'data', 'sap_hilti_data', 'sap_hilti_full_strings',
    #                          'corpus_hilti_web.txt')
    #gold_mapping = os.path.join(package_directory, '..', 'data', 'sap_hilti_data', 'sap_hilti_full_strings',
    #                            'train_simple_sap_hilti.csv')
    src_triples = os.path.join(package_directory, '..', 'data', 'sap_hilti_data', 'balanced_walks',
                               'graph_triples_hilti_erp.nt')
    tgt_triples = os.path.join(package_directory, '..', 'data', 'sap_hilti_data', 'balanced_walks',
                               'graph_triples_hilti_web.nt')
    src_corpus = os.path.join(package_directory, '..', 'data', 'sap_hilti_data', 'balanced_walks',
                              'corpus_hilti_erp.txt')
    tgt_corpus = os.path.join(package_directory, '..', 'data', 'sap_hilti_data', 'balanced_walks',
                              'corpus_hilti_web.txt')
    gold_mapping = InternalGoldStandard({'trainsets':
                                            [os.path.join(package_directory, '..', 'data',
                                            'sap_hilti_data', 'balanced_walks', 'final_trainset.csv')],
                                         'testsets': [os.path.join(package_directory, '..', 'data',
                                            'sap_hilti_data', 'balanced_walks', 'possible_matches.csv')]
                                        })
    dim = 1
    model = LogisticRegression()#XGBClassifier()
    labelfile = os.path.join(package_directory, '..', 'data', 'sap_hilti_data','balanced_walks',
                              'labels.txt')
    src_properties = StringMatcher_Interface.get_labels_from_file(labelfile)#["http://rdata2graph.sap.com/hilti_erp/property/mara_fert.maktx"]
    tgt_properties = StringMatcher_Interface.get_labels_from_file(labelfile)#["http://rdata2graph.sap.com/hilti_web/property/products.name"]
    use_streams = False


    name = "test"
    pipeline = Pipeline()
    line_a = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None, PipelineDataTuple(src_triples))
    line_a = pipeline.append_step(GraphToolbox.interface, PipelineDataTuple(line_a), PipelineDataTuple(src_triples))
    #line_a = pipeline.append_step(ReadSentencesInterfaceWrapper.interface, PipelineDataTuple(line_a),
    #                              PipelineDataTuple(src_corpus))
    line_b = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None, PipelineDataTuple(tgt_triples))
    line_b = pipeline.append_step(GraphToolbox.interface, PipelineDataTuple(line_b), PipelineDataTuple(tgt_triples))
    #line_b = pipeline.append_step(ReadSentencesInterfaceWrapper.interface, PipelineDataTuple(line_b),
    #                              PipelineDataTuple(tgt_corpus))
    line_ab = pipeline.append_step(WalkEmbedder_1.interface, PipelineDataTuple(line_a, line_b),
                                   PipelineDataTuple(dim, 'steps', False, 1))
    line_ab = pipeline.append_step(concat_combiner.interface, PipelineDataTuple(line_ab), None)

    #line_ab = pipeline.append_step(muse.interface, PipelineDataTuple(line_ab), PipelineDataTuple(gold_mapping))
    line_ab = pipeline.append_step(FlatMatcher.interface, PipelineDataTuple(line_ab),
                                   PipelineDataTuple(model))
    #line_ab = pipeline.append_step(TSNEInterface.interface, PipelineDataTuple(line_ab), PipelineDataTuple(2))
    #line_ab = pipeline.append_step(CategoriesVisualizer.interface, PipelineDataTuple(line_ab), None)
    #line_ab = pipeline.append_step(StratifiedVisualizer.interface, PipelineDataTuple(line_ab), None)
    #line_ab = pipeline.append_step(TypeVisualizer.interface, PipelineDataTuple(line_ab), None)
    #line_ab = pipeline.append_step(FullVisualizer.interface, PipelineDataTuple(line_ab), None)
    #line_ab = pipeline.append_step(EmbeddingSaver.interface, PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(StableRankMatcher.interface, PipelineDataTuple(line_ab), None)


    configuration = Configuration(name, src_corpus, tgt_corpus, src_triples, tgt_triples, gold_mapping, dim,
                                  pipeline, src_properties, tgt_properties, use_streams, False, True)
    configuration_handler = ConfigurationHandler()
    configuration_handler.execute(configuration)
Example #18
def main():



    src_triples = os.path.join(package_directory, '..', 'data', 'sap_hilti_data', 'sap_hilti_full_strings',
                               'graph_triples_hilti_erp.nt')
    tgt_triples = os.path.join(package_directory, '..', 'data', 'sap_hilti_data', 'sap_hilti_full_strings',
                               'graph_triples_hilti_web.nt')
    src_corpus = os.path.join(package_directory, '..', 'data', 'sap_hilti_data', 'sap_hilti_full_strings',
                              'corpus_hilti_erp.txt')
    tgt_corpus = os.path.join(package_directory, '..', 'data', 'sap_hilti_data', 'sap_hilti_full_strings',
                              'corpus_hilti_web.txt')
    gold_mapping = os.path.join(package_directory, '..', 'data', 'sap_hilti_data', 'sap_hilti_full_strings',
                                'train_simple_sap_hilti.csv')

    dim = 3
    #model = make_pipeline(PolynomialFeatures(6), Ridge())#DecisionTreeClassifier() #make_pipeline(PolynomialFeatures(8), Ridge())
    #model = sklearn.linear_model.LinearRegression()
    #from sklearn.ensemble import RandomForestRegressor
    #model = RandomForestRegressor(max_depth=2, random_state=0, n_estimators=100)
    #model = LinearSVC(C=0.01, class_weight=None, dual=True, fit_intercept=True,
    #                  intercept_scaling=1, loss='squared_hinge', max_iter=1000,
    #                  multi_class='ovr', penalty='l2', random_state=0, tol=1e-05, verbose=0)
    from sklearn.linear_model import LogisticRegression
    model = XGBClassifier()
    src_properties = ["http://rdata2graph.sap.com/hilti_erp/property/mara_fert.maktx"]
    tgt_properties = ["http://rdata2graph.sap.com/hilti_web/property/products.name"]


    name = "jaccard_no_props_given"
    pipeline = Pipeline()
    line_a = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None, PipelineDataTuple(src_triples))
    line_a = pipeline.append_step(GraphToolbox.interface, PipelineDataTuple(line_a), PipelineDataTuple(src_triples))
    line_b = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None, PipelineDataTuple(tgt_triples))
    line_b = pipeline.append_step(GraphToolbox.interface, PipelineDataTuple(line_b), PipelineDataTuple(tgt_triples))
    line_ab = pipeline.append_step(W2V_1InterfaceWrapper.interface, PipelineDataTuple(line_a, line_b), PipelineDataTuple(dim))
    line_ab = pipeline.append_step(muse.interface, PipelineDataTuple(line_a, line_b), PipelineDataTuple(gold_mapping))
    line_ab = pipeline.append_step(EmbeddingMatcher.interface, PipelineDataTuple(line_ab),
                                   PipelineDataTuple(model))
    line_ab = pipeline.append_step(StratifiedVisualizer.interface, PipelineDataTuple(line_a, line_b), None)
    line_ab = pipeline.append_step(TypeVisualizer.interface, PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(CategoriesVisualizer.interface, PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(FullVisualizer.interface, PipelineDataTuple(line_ab), None)
    configuration = Configuration(name, src_corpus, tgt_corpus, src_triples, tgt_triples, gold_mapping, dim,
                                  pipeline, src_properties, tgt_properties)
    configuration_handler = ConfigurationHandler()
    configuration_handler.execute(configuration)
Example #19
def exec(graph1, graph2, ml_model):

    Matchdata_Saver.interface(PipelineDataTuple(graph1, graph2), None,
                              CONFIGURATION)

    #train_simple = pd.read_csv(CONFIGURATION.rundir + 'train_simple-strcombined.csv', index_col=['Unnamed: 0'])
    #train_hard = pd.read_csv(CONFIGURATION.rundir + 'train_hard-strcombined.csv', index_col=['Unnamed: 0'])
    #test_simple = pd.read_csv(CONFIGURATION.rundir + 'test_simple-strcombined.csv', index_col=['Unnamed: 0'])
    #test_hard = pd.read_csv(CONFIGURATION.rundir + 'test_hard-strcombined.csv', index_col=['Unnamed: 0'])

    train = None
    for trainset in CONFIGURATION.gold_mapping.prepared_trainsets:
        if train is None:
            train = trainset.loc[:, ~(trainset.columns.isin(
                ['src_id', 'tgt_id', 'src_category', 'tgt_category']))]
        else:
            tmp_train = trainset.loc[:, ~(trainset.columns.isin(
                ['src_id', 'tgt_id', 'src_category', 'tgt_category']))]
            train = train.append(tmp_train, ignore_index=True)

    # #### Alternative 1: Sample the training data manually.
    #a = train_simple.loc[train_simple['label']==1].sample(n=100, replace=False)
    #b = train_simple.loc[train_simple['label']==0].sample(n=100, replace=False)
    #c = train_hard.loc[train_hard['label']==1].sample(n=0, replace=False)
    #d = train_hard.loc[train_hard['label']==0].sample(n=600, replace=False)
    #train = d.append(c.append(a.append(b, ignore_index=True), ignore_index=True), ignore_index=True)

    # #### Alternative 2: Use all available data for training.
    #train = train_simple.append(train_hard, ignore_index=True)

    # ## Prepare train/test/prediction data
    x_train = train.loc[:, train.columns != 'label']
    y_train = train['label']

    # ## Prediction
    model = ml_model  # RandomForestClassifier(n_estimators=100, max_depth=10, random_state=0) #LogisticRegression(solver='lbfgs')
    model = model.fit(
        x_train[[
            col for col in x_train.columns
            if not col == 'syntactic_diff' and not col == 'plus_diff'
        ]], y_train)
    syntactic_model = LogisticRegression(
        solver='lbfgs'
    )  # RandomForestClassifier(n_estimators=100, max_depth=10, random_state=0)
    syntactic_model = syntactic_model.fit(
        pd.DataFrame(x_train['syntactic_diff']), y_train)
    dump(model, CONFIGURATION.rundir + 'model.joblib')
    dump(syntactic_model, CONFIGURATION.rundir + 'syntactic_model.joblib')

    for testset in CONFIGURATION.gold_mapping.prepared_testsets:
        test = testset.loc[:, ~(testset.columns.isin(
            ['src_id', 'tgt_id', 'src_category', 'tgt_category']))]

        x_test1 = test.loc[:, test.columns != 'label']
        y_test1 = test['label']

        CONFIGURATION.log(
            "\n################################################################\n\n"
        )

        prediction = model.predict(x_test1[[
            col for col in x_test1.columns
            if not col == 'syntactic_diff' and not col == 'plus_diff'
        ]])
        result = classification_report(np.array(y_test1),
                                       prediction,
                                       target_names=['false', 'true'])
        CONFIGURATION.log("EmbeddingMatcher - ml_model performance:\n")
        CONFIGURATION.log(str(result))
        CONFIGURATION.log(str(ConfusionMatrix(np.array(y_test1), prediction)))
        CONFIGURATION.log(
            "\n\n--------------------------------------------------------------\n"
        )

        CONFIGURATION.log("\n" + str([
            col for col in x_test1.columns
            if not col == 'syntactic_diff' and not col == 'plus_diff'
        ]))
        CONFIGURATION.log("\n" + str(
            LogisticRegression(
                random_state=0, solver='lbfgs', multi_class='ovr').fit(
                    x_train[[
                        col for col in x_test1.columns if
                        not col == 'syntactic_diff' and not col == 'plus_diff'
                    ]], y_train).coef_) + "\n")

        testset.loc[prediction == 1, ['src_id', 'tgt_id']].to_csv(
            CONFIGURATION.rundir + 'ml_matchings.csv',
            sep="\t",
            index=False,
            encoding='UTF-8')
        PredictionToXMLConverter.interface(
            PipelineDataTuple(graph1, graph2),
            PipelineDataTuple('ml_matchings.csv'), CONFIGURATION)

        #prediction = syntactic_model.predict(pd.DataFrame(x_test1['syntactic_diff']))
        #result = classification_report(prediction, np.array(y_test1), target_names=['false','true'])
        #print("Syntactic matching results on simple test:")
        #print(result)
        #print(ConfusionMatrix(prediction, np.array(y_test1)))
        #CONFIGURATION.log("Syntactic matching results on simple test:")
        #CONFIGURATION.log(str(result))
        #CONFIGURATION.log(str(ConfusionMatrix(prediction, np.array(y_test1))))
        #
        CONFIGURATION.log(
            "\n################################################################\n\n"
        )

        if CONFIGURATION.calc_PLUS_SCORE:
            test_plus = test.loc[(test.plus_diff > 0.68) & (test.label == 1)]
            test_plus = test_plus.append(test.loc[(test.plus_diff < 0.68)
                                                  & (test.label == 0)],
                                         ignore_index=True)
            x_test_plus = test_plus.loc[:, test_plus.columns != 'label']
            y_test_plus = test_plus['label']
            prediction_plus = model.predict(x_test_plus[[
                col for col in x_train.columns
                if not col == 'syntactic_diff' and not col == 'plus_diff'
            ]])
            result_plus = classification_report(np.array(y_test_plus),
                                                prediction_plus,
                                                target_names=['false', 'true'])
            CONFIGURATION.log("EmbeddingMatcher - ml_model performance+:\n")
            CONFIGURATION.log(str(result_plus))
            CONFIGURATION.log(
                str(ConfusionMatrix(np.array(y_test_plus), prediction_plus)))
            CONFIGURATION.log(
                "\n\n--------------------------------------------------------------\n"
            )
        else:
            CONFIGURATION.log("No performance+ calculated")
            CONFIGURATION.log(
                "\n\n--------------------------------------------------------------\n"
            )
        #print("Syntactic matching results+ on simple test: 0.0%")
        #CONFIGURATION.log("Syntactic matching results+ on simple test: 0.0%")

        CONFIGURATION.log(
            "\n################################################################\n\n"
        )

    # Schema correspondence predictions
    # In the following code segment, schema correspondences are predicted using the instance-matching model.
    # However, this method is not recommended, as the model is (most likely) primarily or only trained on
    # instance-correspondences.
    '''import scipy
    from cle.matcher.DatasetHelperTools import extend_features, get_schema_data_from_graph
    schema_data, schema_data_ids = get_schema_data_from_graph(graph1, graph2)
    schema_data = extend_features(schema_data)
    y_pred = model.predict(schema_data)
    y_pred = scipy.stats.zscore(np.array(y_pred))
    predictions = [1 if value > 0 else 0 for value in y_pred]
    schema_predicted = pd.concat([pd.DataFrame({"prediction":predictions}), schema_data_ids], axis=1, sort=False)
    schema_predicted.to_csv(index=False,path_or_buf=CONFIGURATION.rundir+"predicted_data.csv", header=False)
    pd.options.display.max_colwidth = 100
    pd.set_option('display.max_colwidth', -1)
    CONFIGURATION.log("\nschema matches predicted with ML model:\n")
    schema_predicted = schema_predicted[schema_predicted['prediction'] == 0]
    CONFIGURATION.log(schema_predicted.to_string()+"\n")'''

    return PipelineDataTuple(graph1, graph2)
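
# Reduced sketch of the train/predict/report cycle in exec above, using scikit-learn only on toy
# data; the feature names are illustrative, not the project's actual feature set:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

train = pd.DataFrame({"feat_a": [0.1, 0.9, 0.2, 0.8],
                      "feat_b": [0.2, 0.7, 0.1, 0.9],
                      "label":  [0, 1, 0, 1]})
x_train, y_train = train.drop(columns=["label"]), train["label"]
model = LogisticRegression(solver="lbfgs").fit(x_train, y_train)

test = pd.DataFrame({"feat_a": [0.15, 0.85], "feat_b": [0.25, 0.75], "label": [0, 1]})
prediction = model.predict(test.drop(columns=["label"]))
print(classification_report(np.array(test["label"]), prediction, target_names=["false", "true"]))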
Example #20
def exec(graph1, graph2):


        # In[270]:
        additional_features = None
        progress = 0

        def mergedf(df1, df2):
            if df1 is None:
                return df2
            else:
                return df1.append(df2, ignore_index=True)

        basedir = CONFIGURATION.rundir
        current_process_dir = basedir
        dirpath = basedir
        all_possible_matches_path = CONFIGURATION.gold_mapping.raw_testsets[0]

        documents_ids_A = dict()
        documents_ids_B = dict()
        all_possible_matches = dict()
        all_nodeids = set()
        with open(all_possible_matches_path, encoding="UTF-8") as f:
            for line in f:
                line = line.replace("\n","").split("\t")
                all_nodeids.add(line[0])
                if line[0] in all_possible_matches.keys():
                    all_possible_matches[line[0]].add(line[1])
                else:
                    all_possible_matches[line[0]] = set([line[1]])

                if line[1] in all_possible_matches.keys():
                    all_possible_matches[line[1]].add(line[0])
                else:
                    all_possible_matches[line[1]] = set([line[0]])

        possible_matches = CONFIGURATION.gold_mapping.prepared_testsets[0]#pd.read_csv(dirpath + "possible_matches.csv-strcombined.csv", sep=",", encoding="UTF-8")
        #possible_matches_ids = pd.read_csv(dirpath + "possible_matches.csv-strcombined_ids.csv", sep=",", encoding="UTF-8")
        #possible_matches = possible_matches.merge(possible_matches_ids, left_on=['Unnamed: 0'], right_on=['Unnamed: 0'])


        oaei_gold_standard3 = CONFIGURATION.gold_mapping.prepared_trainsets[0]#pd.read_csv(dirpath + "oaei_gold_standard3.csv-strcombined.csv", sep=",", encoding="UTF-8")
        #oaei_gold_standard3_ids = pd.read_csv(dirpath + "oaei_gold_standard3.csv-strcombined_ids.csv", sep=",", encoding="UTF-8")
        #oaei_gold_standard3 = oaei_gold_standard3.merge(oaei_gold_standard3_ids, left_on=['Unnamed: 0'], right_on=['Unnamed: 0'])




        def get_possible_matches(nid):
                final_matches = list(all_possible_matches[nid])
                #if nid in resources.label.tolist():
                #    for m in matches:
                #        if m in resources.label.tolist():
                #            pass
                #            final_matches.append(m)
#
                #if nid in classes.label.tolist():
                #    for m in matches:
                #        if m in classes.label.tolist():
                #            final_matches.append(m)
#
                #if nid in properties.label.tolist():
                #    for m in matches:
                #        if m in properties.label.tolist():
                #            final_matches.append(m)

                return final_matches


        # In[311]:


        def get_training_material(nid):
            res = list()
            with open(dirpath+"w2v_training_material.csv", mode="r", encoding="UTF-8") as f:
                for line in f:
                    if nid in line.split(" "):  # use the parameter; 'nodeid' is only defined in the loop further below
                        res = res + line.split(" ")
                return list(set(res))
        model = Word2Vec.load(dirpath+"w2v.model")

        total = len(all_nodeids)
        matchings = None
        with open(dirpath+'additional_features.csv', mode="w+", encoding="UTF-8") as f:
            for nodeid in all_nodeids:

                possible_matches_for_nodeid = possible_matches.loc[((possible_matches.src_id==nodeid) & (possible_matches.tgt_id.isin(get_possible_matches(nodeid))))]



                progress += 1
                if len(get_possible_matches(nodeid))<1:
                    continue


                #vecs = model.wv[get_possible_matches(nodeid)]
                def edits(v1, v2s):
                    res = list()
                    v1 = v1.split("/")[-1]
                    for v2 in v2s:
                        v2 = v2.split("/")[-1]
                        res.append(editdistance.eval(v1, v2)/min(len(v1), len(v2)))
                    return np.array([res])
                #x = edits(nodeid, get_possible_matches(nodeid))
                #x = np.concatenate((x, np.array([get_possible_matches(nodeid)])), axis=0)
                #sorted_x = pd.DataFrame(x).T.sort_values(by=[0], ascending=True)
                sorted_x = possible_matches_for_nodeid.sort_values(by=['syntactic_diff'], ascending=True)
                sorted_x.loc[:,'syntax_score'] = 0
                ctr = 1
                #sorted_x.columns = ['syntax_diff' if col==0 else col for col in sorted_x.columns]
                for index, row in sorted_x.iterrows():
                    #print(row[1] + " - " + str(row['syntax_diff']))
                    sorted_x.loc[index, 'syntax_score'] = row['syntax_score'] + 1/ctr
                    ctr += 1




                #print('Closest in sum:')
                x = sorted_x
                x.loc[:,'total_score'] = x['cos_score'] + x['syntax_score'] + x['euclid_score'] + x['probability_score']
                sorted_x = x.sort_values(by=['total_score'], ascending=False)
                #sorted_x.columns = ['tgt_id' if col==1 else col for col in sorted_x.columns]
                for index, row in sorted_x.iterrows():#sorted_x.loc[sorted_x.total_score == max(sorted_x.total_score.values),:].iterrows():
                    matching_pair = pd.DataFrame([sorted_x.loc[index]])
                    matching_pair.loc[:,'src_id'] = nodeid
                    #print(nodeid + "\t" + row[1] + "\t" + str(row['total_score']) + "\t" + str(row['cos_score']) + "\t" + str(row['euclid_score']))
                    matchings = mergedf(matchings, matching_pair)



                print("         Computing syntax-ranks: " + str(int(100*progress/total)) + "%.", end='\r')

        print("         Computing syntax-ranks: 100%")



        matchings.to_csv(dirpath+"additional_features.csv")

        cols = [col for col in oaei_gold_standard3.columns if col not in ['label','src_id','tgt_id','src_category','tgt_category']]#['src_tgt_angle', 'src_tgt_veclen', 'plus_diff', 'syntactic_diff']
        X, y = oaei_gold_standard3[cols], oaei_gold_standard3.label
        clf = XGBClassifier().fit(X, y)
        #random_state=0, solver='lbfgs', multi_class='ovr', class_weight={1:0.1,0:0.9}).fit(X, y)

        X, y = matchings[cols], matchings.label
        matchings = matchings.loc[clf.predict(X)==1]


        try:
            CONFIGURATION.log("\nStableRankMatcher - logistic regression hyperparameters:\n")
            CONFIGURATION.log("Coefficients: " + str(clf.coef_) + " for " + str(list(set(cols))) +"\n")
            CONFIGURATION.log("Intercept: " + str(clf.intercept_) + "\n")
        except:
            pass
        matchings.to_csv(dirpath+"remaining_matchings.csv", sep="\t")


        matchings = matchings.sort_values(by=['total_score','src_tgt_angle'], ascending=[False, False])
        married_matchings = None
        ctr = 0
        while len(matchings) > 0:
                ctr += 1
                row = matchings.head(1)
                married_matchings = mergedf(married_matchings, pd.DataFrame(row))
                matchings = matchings.loc[~(matchings.src_id == row.src_id.values[0]) & ~(matchings.tgt_id == row.tgt_id.values[0])]

        if married_matchings is not None:
            married_matchings[['src_id','tgt_id']].to_csv(dirpath+"married_matchings.csv", sep="\t", index=False)


            PredictionToXMLConverter.interface(PipelineDataTuple(graph1, graph2), PipelineDataTuple('married_matchings.csv'), CONFIGURATION)




            CONFIGURATION.log("\n\nStableRankMatcher - logistic regression performance:\n")
            CONFIGURATION.log(classification_report(np.array(y), clf.predict(X)))

            if len(married_matchings)>0:
                married_matchings.loc[:,'married'] = 'x'
                possible_matches = possible_matches.merge(married_matchings[['src_id','tgt_id', 'married','total_score']], left_on=['src_id', 'tgt_id'], right_on=['src_id', 'tgt_id'], how='left')
                possible_matches.loc[:, 'prediction'] = 0
                possible_matches.loc[~(possible_matches.married.isna()), 'prediction'] = 1
                CONFIGURATION.log("\n\nStableRankMatcher - marriage performance:\n")
                CONFIGURATION.log(classification_report(np.array(possible_matches.label), np.array(possible_matches.prediction)))
            else:
                CONFIGURATION.log("\n\nStableRankEmbeddingsMatcher - marriage performance: 00.00 (no matches found)\n")



        return PipelineDataTuple(graph1, graph2)
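
# Stand-alone sketch of the greedy one-to-one "marriage" loop above: repeatedly take the
# best-ranked remaining pair and drop every other candidate that reuses its source or target.
import pandas as pd

matchings = pd.DataFrame({"src_id": ["s1", "s1", "s2"],
                          "tgt_id": ["t1", "t2", "t1"],
                          "total_score": [0.9, 0.4, 0.5]}).sort_values(by="total_score", ascending=False)
married = []
while len(matchings) > 0:
    row = matchings.head(1)
    married.append((row.src_id.values[0], row.tgt_id.values[0]))
    matchings = matchings.loc[~(matchings.src_id == row.src_id.values[0])
                              & ~(matchings.tgt_id == row.tgt_id.values[0])]
print(married)   # [('s1', 't1')]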
Example #21
def main():



    src_triples = os.path.join(package_directory, '..', 'data', 'sap_hilti_data', 'sap_hilti_full_strings',
                               'graph_triples_hilti_erp.nt')
    tgt_triples = os.path.join(package_directory, '..', 'data', 'sap_hilti_data', 'sap_hilti_full_strings',
                               'graph_triples_hilti_web.nt')
    src_corpus = os.path.join(package_directory, '..', 'data', 'sap_hilti_data', 'sap_hilti_full_strings',
                              'corpus_hilti_erp.txt')
    tgt_corpus = os.path.join(package_directory, '..', 'data', 'sap_hilti_data', 'sap_hilti_full_strings',
                              'corpus_hilti_web.txt')
    gold_mapping = InternalGoldStandard({'trainsets':
                                            [os.path.join(package_directory, '..', 'data',
                                            'sap_hilti_data', 'sap_hilti_full_strings', 'train_simple_sap_hilti.csv'),
                                            os.path.join(package_directory, '..', 'data', 'sap_hilti_data',
                                            'sap_hilti_full_strings', 'train_hard_sap_hilti.csv')],
                                         'testsets': [os.path.join(package_directory, '..', 'data',
                                            'sap_hilti_data', 'sap_hilti_full_strings', 'test_simple_sap_hilti.csv'),
                                            os.path.join(package_directory, '..', 'data', 'sap_hilti_data',
                                            'sap_hilti_full_strings', 'test_hard_sap_hilti.csv')]
                                        })
    dim = 20
    model = LogisticRegression()
    src_properties = None#["http://rdata2graph.sap.com/hilti_erp/property/mara_fert.maktx"]
    tgt_properties = None#["http://rdata2graph.sap.com/hilti_web/property/products.name"]




    ##name = "W2V_1 muse xgb with 50k only on embeddings"
    ##pipeline = Pipeline()
    ##line_a = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None, PipelineDataTuple(src_triples))
    ##line_a = pipeline.append_step(GraphToolbox.interface, PipelineDataTuple(line_a), PipelineDataTuple(src_triples))
    ##line_a = pipeline.append_step(ReadSentencesInterfaceWrapper.interface, PipelineDataTuple(line_a),
    ##                              PipelineDataTuple(src_corpus))
    ##line_b = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None, PipelineDataTuple(tgt_triples))
    ##line_b = pipeline.append_step(GraphToolbox.interface, PipelineDataTuple(line_b), PipelineDataTuple(tgt_triples))
    ##line_b = pipeline.append_step(ReadSentencesInterfaceWrapper.interface, PipelineDataTuple(line_b),
    ##                              PipelineDataTuple(tgt_corpus))
    ##line_ab = pipeline.append_step(W2V_1InterfaceWrapper.interface, PipelineDataTuple(line_a, line_b), PipelineDataTuple(dim))
    ##line_ab = pipeline.append_step(concat_combiner.interface, PipelineDataTuple(line_ab), None)
    ##line_ab = pipeline.append_step(muse.interface, PipelineDataTuple(line_ab), PipelineDataTuple(gold_mapping))
    ##line_ab = pipeline.append_step(EmbeddingMatcher.interface, PipelineDataTuple(line_ab),
    ##                               PipelineDataTuple(model))
    ##line_ab = pipeline.append_step(CategoriesVisualizer.interface, PipelineDataTuple(line_ab), None)
    ##line_ab = pipeline.append_step(StratifiedVisualizer.interface, PipelineDataTuple(line_ab), None)
    ##line_ab = pipeline.append_step(TypeVisualizer.interface, PipelineDataTuple(line_ab), None)
    ##line_ab = pipeline.append_step(FullVisualizer.interface, PipelineDataTuple(line_ab), None)
##
    ##configuration = Configuration(name, src_corpus, tgt_corpus, src_triples, tgt_triples, gold_mapping, dim,
    ##                              pipeline, src_properties, tgt_properties)
    ##configuration_handler = ConfigurationHandler()
    ##configuration_handler.execute(configuration)
##
##
##
##
    ##name = "W2V_1 muse xgb with 50k on embeddings and sim"
    ##pipeline = Pipeline()
    ##line_a = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None, PipelineDataTuple(src_triples))
    ##line_a = pipeline.append_step(GraphToolbox.interface, PipelineDataTuple(line_a), PipelineDataTuple(src_triples))
    ##line_a = pipeline.append_step(ReadSentencesInterfaceWrapper.interface, PipelineDataTuple(line_a),
    ##                              PipelineDataTuple(src_corpus))
    ##line_b = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None, PipelineDataTuple(tgt_triples))
    ##line_b = pipeline.append_step(GraphToolbox.interface, PipelineDataTuple(line_b), PipelineDataTuple(tgt_triples))
    ##line_b = pipeline.append_step(ReadSentencesInterfaceWrapper.interface, PipelineDataTuple(line_b),
    ##                              PipelineDataTuple(tgt_corpus))
    ##line_ab = pipeline.append_step(W2V_1InterfaceWrapper.interface, PipelineDataTuple(line_a, line_b), PipelineDataTuple(dim))
    ##line_ab = pipeline.append_step(concat_combiner.interface, PipelineDataTuple(line_ab), None)
    ##line_ab = pipeline.append_step(muse.interface, PipelineDataTuple(line_ab), PipelineDataTuple(gold_mapping))
    ##line_ab = pipeline.append_step(EmbeddingMatcher.interface, PipelineDataTuple(line_ab),
    ##                               PipelineDataTuple(model))
    ##line_ab = pipeline.append_step(CategoriesVisualizer.interface, PipelineDataTuple(line_ab), None)
    ##line_ab = pipeline.append_step(StratifiedVisualizer.interface, PipelineDataTuple(line_ab), None)
    ##line_ab = pipeline.append_step(TypeVisualizer.interface, PipelineDataTuple(line_ab), None)
    ##line_ab = pipeline.append_step(FullVisualizer.interface, PipelineDataTuple(line_ab), None)
##
    ##configuration = Configuration(name, src_corpus, tgt_corpus, src_triples, tgt_triples, gold_mapping, dim,
    ##                              pipeline, src_properties, tgt_properties)
    ##configuration_handler = ConfigurationHandler()
    ##configuration_handler.execute(configuration)

    name = "jacc_no_schema_given"
    pipeline = Pipeline()
    line_a = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None, PipelineDataTuple(src_triples))
    line_a = pipeline.append_step(GraphToolbox.interface, PipelineDataTuple(line_a), PipelineDataTuple(src_triples))
    #line_a = pipeline.append_step(ReadSentencesInterfaceWrapper.interface, PipelineDataTuple(line_a),
    #                              PipelineDataTuple(src_corpus))
    line_b = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None, PipelineDataTuple(tgt_triples))
    line_b = pipeline.append_step(GraphToolbox.interface, PipelineDataTuple(line_b), PipelineDataTuple(tgt_triples))
    #line_b = pipeline.append_step(ReadSentencesInterfaceWrapper.interface, PipelineDataTuple(line_b),
    #                              PipelineDataTuple(tgt_corpus))
    line_ab = pipeline.append_step(PseudoD2V_1InterfaceWrapper_2.interface, PipelineDataTuple(line_a, line_b), PipelineDataTuple(dim))
    line_ab = pipeline.append_step(SimpleTriplesEmbedder_1.interface, PipelineDataTuple(line_a, line_b), PipelineDataTuple(dim))

    line_ab = pipeline.append_step(concat_combiner.interface, PipelineDataTuple(line_ab), None)
    #line_ab = pipeline.append_step(muse.interface, PipelineDataTuple(line_ab), PipelineDataTuple(gold_mapping))
    line_ab = pipeline.append_step(FlatMatcher.interface, PipelineDataTuple(line_ab),
                                   PipelineDataTuple(model))
    #line_ab = pipeline.append_step(TSNEInterface.interface, PipelineDataTuple(line_ab), PipelineDataTuple(2))
    #line_ab = pipeline.append_step(CategoriesVisualizer.interface, PipelineDataTuple(line_ab), None)
    #line_ab = pipeline.append_step(StratifiedVisualizer.interface, PipelineDataTuple(line_ab), None)
    #line_ab = pipeline.append_step(TypeVisualizer.interface, PipelineDataTuple(line_ab), None)
    #line_ab = pipeline.append_step(FullVisualizer.interface, PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(EmbeddingSaver.interface, PipelineDataTuple(line_ab), None)

    configuration = Configuration(name, src_corpus, tgt_corpus, src_triples, tgt_triples, gold_mapping, dim,
                                  pipeline, src_properties, tgt_properties)
    configuration_handler = ConfigurationHandler()
    configuration_handler.execute(configuration)
    def execute(self, configuration):

        #try:

        clean_cache(configuration.cachedir)
        prepare_dir(configuration.rundir)
        prepare_dir(configuration.cachedir)
        prepare_file(configuration.logfile)

        #open(configuration.logfile,"w+").close()

        current_time_in_millis = int(round(time.time()))

        print("-----------------------------------------------")
        print("Starting '" + configuration.name + "':\n\n")

        configuration.logs_ = open(configuration.logfile, "a+")
        configuration.log(configuration.to_string())

        prepare_file(configuration.src_corpus)
        prepare_file(configuration.tgt_corpus)
        prepare_file(configuration.src_triples)
        prepare_file(configuration.tgt_triples)
        for path_to_file in configuration.gold_mapping.raw_trainsets + configuration.gold_mapping.raw_testsets:
            prepare_file(path_to_file)

        step = configuration.pipeline.get_first_step()
        while step is not None:
            print("Performing step " + str(step.func.__module__) + "." +
                  str(step.func.__name__) + " with " + str(step.args))
            x = []
            if step.input_step is not None:
                for input_step in step.input_step.elems:
                    for elem in input_step.output.elems:
                        x.append(elem)
            t = PipelineDataTuple(*x)
            out = step.func(t, step.args, configuration)
            if step.persist_output:
                step.output = out

            step = step.next_step

        configuration.log(
            "\n\n\n\nNeeded " +
            str(int(round(time.time())) - current_time_in_millis) + "s.")
        configuration.logs_.close()

        print("-----------------------------------------------")
        #except Exception as e:
        #    try:
        #        configuration.logs_.close()
        #    except:
        #        pass
        #    logs = open(configuration.logfile, "a+")
        #    logs.write("\n\n\n")
        #    logs.write(configuration.name + " FAILED due to:")
        #    logs.write(str(e))
        #    logs.close()
        #    print(configuration.name + " FAILED.")
        #    print("-----------------------------------------------")

        del configuration
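
# Sketch of the step/pipeline structure that the execution loop above assumes. The attribute
# names (func, args, input_step, output, persist_output, next_step, get_first_step, append_step)
# are taken from the loop and the main() examples; the real classes may differ in detail.
class Step:
    def __init__(self, func, input_step, args):
        self.func = func                # callable(PipelineDataTuple, args, configuration)
        self.input_step = input_step    # PipelineDataTuple of predecessor Steps, or None
        self.args = args
        self.output = None              # filled by the loop when persist_output is set
        self.persist_output = True      # assumption: outputs are kept by default
        self.next_step = None           # steps form a linked list walked by the loop

class Pipeline:
    def __init__(self):
        self.first_step = None
        self.last_step = None
    def append_step(self, func, input_step, args):
        step = Step(func, input_step, args)
        if self.first_step is None:
            self.first_step = step
        else:
            self.last_step.next_step = step
        self.last_step = step
        return step                     # callers wrap the returned Step in PipelineDataTuple
    def get_first_step(self):
        return self.first_step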
Example #23
def exec(graph1, graph2):
    def mergedf(df1, df2):
        if df1 is None:
            return df2
        else:
            return df1.append(df2, ignore_index=True)

    basedir = CONFIGURATION.rundir
    current_process_dir = basedir
    dirpath = basedir

    possible_matches = CONFIGURATION.gold_mapping.prepared_testsets[
        0]  #pd.read_csv(dirpath + "possible_matches.csv-strcombined.csv", sep=",", encoding="UTF-8")
    #possible_matches_ids = pd.read_csv(dirpath + "possible_matches.csv-strcombined_ids.csv", sep=",", encoding="UTF-8")
    #possible_matches = possible_matches.merge(possible_matches_ids, left_on=['Unnamed: 0'], right_on=['Unnamed: 0'])

    oaei_gold_standard3 = CONFIGURATION.gold_mapping.prepared_trainsets[
        0]  #pd.read_csv(dirpath + "oaei_gold_standard3.csv-strcombined.csv", sep=",", encoding="UTF-8")
    #oaei_gold_standard3_ids = pd.read_csv(dirpath + "oaei_gold_standard3.csv-strcombined_ids.csv", sep=",", encoding="UTF-8")
    #oaei_gold_standard3 = oaei_gold_standard3.merge(oaei_gold_standard3_ids, left_on=['Unnamed: 0'], right_on=['Unnamed: 0'])

    cols = [
        col for col in oaei_gold_standard3.columns if col not in [
            'label', 'src_id', 'tgt_id', 'syntactic_diff', 'plus_diff',
            'src_category', 'tgt_category'
        ]
    ]  #['src_tgt_angle', 'src_tgt_veclen', 'plus_diff', 'syntactic_diff']
    X, y = oaei_gold_standard3[cols], oaei_gold_standard3.label
    weight_ratio = float(len(y[y == 0])) / float(len(y[y == 1]))
    w_array = np.array([1] * y.shape[0])
    w_array[y == 1] = weight_ratio * 2.0
    w_array[y == 0] = (1 - weight_ratio)
    clf = XGBClassifier().fit(X, y, sample_weight=w_array)
    #random_state=0, solver='lbfgs', multi_class='ovr', class_weight={1:0.5,0:0.5}).fit(X, y)

    X, y = possible_matches[cols], possible_matches.label
    matchings = possible_matches.loc[clf.predict(X) == 1]

    try:
        CONFIGURATION.log(
            "\nStableRankMatcher - logistic regression hyperparameters:\n")
        CONFIGURATION.log("Coefficients: " + str(clf.coef_) + " for " +
                          str(list(set(cols))) + "\n")
        CONFIGURATION.log("Intercept: " + str(clf.intercept_) + "\n")
    except:
        pass
    matchings.to_csv(dirpath + "remaining_matchings.csv", sep="\t")

    matchings = matchings.sort_values(by=['total_score', 'src_tgt_angle'],
                                      ascending=[False, False])
    married_matchings = None
    ctr = 0
    while len(matchings) > 0:
        ctr += 1
        row = matchings.head(1)
        married_matchings = mergedf(married_matchings, pd.DataFrame(row))
        matchings = matchings.loc[
            ~(matchings.src_id == row.src_id.values[0])
            & ~(matchings.tgt_id == row.tgt_id.values[0])]

    if married_matchings is not None:
        married_matchings[['src_id',
                           'tgt_id']].to_csv(dirpath + "married_matchings.csv",
                                             sep="\t",
                                             index=False)

        PredictionToXMLConverter.interface(
            PipelineDataTuple(graph1, graph2),
            PipelineDataTuple('married_matchings.csv'), CONFIGURATION)

        CONFIGURATION.log(
            "\n\nStableRankEmbeddingsMatcher - logistic regression performance:\n"
        )
        CONFIGURATION.log(classification_report(np.array(y), clf.predict(X)))

        married_matchings.loc[:, 'married'] = 'x'
        if len(married_matchings) > 0:
            possible_matches = possible_matches.merge(
                married_matchings[[
                    'src_id', 'tgt_id', 'married', 'total_score'
                ]],
                left_on=['src_id', 'tgt_id'],
                right_on=['src_id', 'tgt_id'],
                how='left')
            possible_matches.loc[:, 'prediction'] = 0
            possible_matches.loc[~(possible_matches.married.isna()),
                                 'prediction'] = 1
            CONFIGURATION.log(
                "\n\nStableRankEmbeddingsMatcher - marriage performance:\n")
            CONFIGURATION.log(
                classification_report(np.array(possible_matches.label),
                                      np.array(possible_matches.prediction)))
        else:
            CONFIGURATION.log(
                "\n\nStableRankEmbeddingsMatcher - marriage performance: 00.00 (no matches found)\n"
            )

    return PipelineDataTuple(graph1, graph2)
Example #24
def main():
    logfile = os.path.join(package_directory, '..', 'results.log')
    try:
        os.remove(logfile)
    except:
        pass

    src_triples = os.path.join(package_directory, '..', 'data', 'sap_hilti_data', 'sap_hilti_full_strings',
                               'graph_triples_hilti_erp.nt')
    tgt_triples = os.path.join(package_directory, '..', 'data', 'sap_hilti_data', 'sap_hilti_full_strings',
                               'graph_triples_hilti_web.nt')
    src_corpus = os.path.join(package_directory, '..', 'data', 'sap_hilti_data', 'sap_hilti_full_strings',
                              'corpus_hilti_erp.txt')
    tgt_corpus = os.path.join(package_directory, '..', 'data', 'sap_hilti_data', 'sap_hilti_full_strings',
                              'corpus_hilti_web.txt')
    gold_mapping = os.path.join(package_directory, '..', 'data', 'sap_hilti_data', 'sap_hilti_full_strings',
                                'sap_hilti_gold.csv')
    dim = 20
    model = XGBClassifier()

    name = "w2v d2v concat muse xgb"
    pipeline = Pipeline()
    line_a = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None, PipelineDataTuple(src_triples))
    line_a = pipeline.append_step(GraphToolbox.interface, PipelineDataTuple(line_a), PipelineDataTuple(src_triples))
    line_a = pipeline.append_step(ReadSentencesInterfaceWrapper.interface, PipelineDataTuple(line_a),
                                  PipelineDataTuple(src_corpus))
    line_a = pipeline.append_step(W2VInterfaceWrapper.interface, PipelineDataTuple(line_a), PipelineDataTuple(dim))
    line_a = pipeline.append_step(D2VInterfaceWrapper.interface, PipelineDataTuple(line_a), PipelineDataTuple(dim))
    line_a = pipeline.append_step(concat_combiner.interface, PipelineDataTuple(line_a), None)
    line_b = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None, PipelineDataTuple(tgt_triples))
    line_b = pipeline.append_step(GraphToolbox.interface, PipelineDataTuple(line_b), PipelineDataTuple(tgt_triples))
    line_b = pipeline.append_step(ReadSentencesInterfaceWrapper.interface, PipelineDataTuple(line_b),
                                  PipelineDataTuple(tgt_corpus))
    line_b = pipeline.append_step(W2VInterfaceWrapper.interface, PipelineDataTuple(line_b), PipelineDataTuple(dim))
    line_b = pipeline.append_step(D2VInterfaceWrapper.interface, PipelineDataTuple(line_b), PipelineDataTuple(dim))
    line_b = pipeline.append_step(concat_combiner.interface, PipelineDataTuple(line_b), None)
    line_ab = pipeline.append_step(muse.interface, PipelineDataTuple(line_a, line_b), PipelineDataTuple(gold_mapping))
    line_ab = pipeline.append_step(FlatMatcher.interface, PipelineDataTuple(line_ab),
                                   PipelineDataTuple(gold_mapping, model, logfile, name))

    configuration = Configuration(name, src_corpus, tgt_corpus, src_triples, tgt_triples, gold_mapping, logfile, dim,
                                  pipeline)
    configuration_handler = ConfigurationHandler()
    configuration_handler.execute(configuration)





    name = "w2v d2v concat xgb"
    pipeline = Pipeline()
    line_a = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None, PipelineDataTuple(src_triples))
    line_a = pipeline.append_step(GraphToolbox.interface, PipelineDataTuple(line_a), PipelineDataTuple(src_triples))
    line_a = pipeline.append_step(ReadSentencesInterfaceWrapper.interface, PipelineDataTuple(line_a),
                                  PipelineDataTuple(src_corpus))
    line_a = pipeline.append_step(W2VInterfaceWrapper.interface, PipelineDataTuple(line_a), PipelineDataTuple(dim))
    line_a = pipeline.append_step(D2VInterfaceWrapper.interface, PipelineDataTuple(line_a), PipelineDataTuple(dim))
    line_a = pipeline.append_step(concat_combiner.interface, PipelineDataTuple(line_a), None)
    line_b = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None, PipelineDataTuple(tgt_triples))
    line_b = pipeline.append_step(GraphToolbox.interface, PipelineDataTuple(line_b), PipelineDataTuple(tgt_triples))
    line_b = pipeline.append_step(ReadSentencesInterfaceWrapper.interface, PipelineDataTuple(line_b),
                                  PipelineDataTuple(tgt_corpus))
    line_b = pipeline.append_step(W2VInterfaceWrapper.interface, PipelineDataTuple(line_b), PipelineDataTuple(dim))
    line_b = pipeline.append_step(D2VInterfaceWrapper.interface, PipelineDataTuple(line_b), PipelineDataTuple(dim))
    line_b = pipeline.append_step(concat_combiner.interface, PipelineDataTuple(line_b), None)
    line_ab = pipeline.append_step(FlatMatcher.interface, PipelineDataTuple(line_a, line_b),
                                   PipelineDataTuple(gold_mapping, model, logfile, name))

    configuration = Configuration(name, src_corpus, tgt_corpus, src_triples, tgt_triples, gold_mapping, logfile, dim,
                                  pipeline)
    configuration_handler = ConfigurationHandler()
    configuration_handler.execute(configuration)





    name = "w2v muse xgb"
    pipeline = Pipeline()
    line_a = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None, PipelineDataTuple(src_triples))
    line_a = pipeline.append_step(GraphToolbox.interface, PipelineDataTuple(line_a), PipelineDataTuple(src_triples))
    line_a = pipeline.append_step(ReadSentencesInterfaceWrapper.interface, PipelineDataTuple(line_a),
                                  PipelineDataTuple(src_corpus))
    line_a = pipeline.append_step(W2VInterfaceWrapper.interface, PipelineDataTuple(line_a), PipelineDataTuple(dim))
    line_b = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None, PipelineDataTuple(tgt_triples))
    line_b = pipeline.append_step(GraphToolbox.interface, PipelineDataTuple(line_b), PipelineDataTuple(tgt_triples))
    line_b = pipeline.append_step(ReadSentencesInterfaceWrapper.interface, PipelineDataTuple(line_b),
                                  PipelineDataTuple(tgt_corpus))
    line_b = pipeline.append_step(W2VInterfaceWrapper.interface, PipelineDataTuple(line_b), PipelineDataTuple(dim))
    line_ab = pipeline.append_step(muse.interface, PipelineDataTuple(line_a, line_b), PipelineDataTuple(gold_mapping))
    line_ab = pipeline.append_step(FlatMatcher.interface, PipelineDataTuple(line_ab),
                                   PipelineDataTuple(gold_mapping, model, logfile, name))

    configuration = Configuration(name, src_corpus, tgt_corpus, src_triples, tgt_triples, gold_mapping, logfile, dim,
                                  pipeline)
    configuration_handler = ConfigurationHandler()
    configuration_handler.execute(configuration)





    name = "w2v xgb"
    pipeline = Pipeline()
    line_a = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None, PipelineDataTuple(src_triples))
    line_a = pipeline.append_step(GraphToolbox.interface, PipelineDataTuple(line_a), PipelineDataTuple(src_triples))
    line_a = pipeline.append_step(ReadSentencesInterfaceWrapper.interface, PipelineDataTuple(line_a),
                                  PipelineDataTuple(src_corpus))
    line_a = pipeline.append_step(W2VInterfaceWrapper.interface, PipelineDataTuple(line_a), PipelineDataTuple(dim))
    line_b = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None, PipelineDataTuple(tgt_triples))
    line_b = pipeline.append_step(GraphToolbox.interface, PipelineDataTuple(line_b), PipelineDataTuple(tgt_triples))
    line_b = pipeline.append_step(ReadSentencesInterfaceWrapper.interface, PipelineDataTuple(line_b),
                                  PipelineDataTuple(tgt_corpus))
    line_b = pipeline.append_step(W2VInterfaceWrapper.interface, PipelineDataTuple(line_b), PipelineDataTuple(dim))
    line_ab = pipeline.append_step(FlatMatcher.interface, PipelineDataTuple(line_a, line_b),
                                   PipelineDataTuple(gold_mapping, model, logfile, name))

    configuration = Configuration(name, src_corpus, tgt_corpus, src_triples, tgt_triples, gold_mapping, logfile, dim,
                                  pipeline)
    configuration_handler = ConfigurationHandler()
    configuration_handler.execute(configuration)





    name = "d2v muse xgb"
    pipeline = Pipeline()
    line_a = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None, PipelineDataTuple(src_triples))
    line_a = pipeline.append_step(GraphToolbox.interface, PipelineDataTuple(line_a), PipelineDataTuple(src_triples))
    line_a = pipeline.append_step(D2VInterfaceWrapper.interface, PipelineDataTuple(line_a), PipelineDataTuple(dim))
    line_b = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None, PipelineDataTuple(tgt_triples))
    line_b = pipeline.append_step(GraphToolbox.interface, PipelineDataTuple(line_b), PipelineDataTuple(tgt_triples))
    line_b = pipeline.append_step(D2VInterfaceWrapper.interface, PipelineDataTuple(line_b), PipelineDataTuple(dim))
    line_ab = pipeline.append_step(muse.interface, PipelineDataTuple(line_a, line_b), PipelineDataTuple(gold_mapping))
    line_ab = pipeline.append_step(FlatMatcher.interface, PipelineDataTuple(line_ab),
                                   PipelineDataTuple(gold_mapping, model, logfile, name))

    configuration = Configuration(name, src_corpus, tgt_corpus, src_triples, tgt_triples, gold_mapping, logfile, dim,
                                  pipeline)
    configuration_handler = ConfigurationHandler()
    configuration_handler.execute(configuration)




    name = "d2v xgb"
    pipeline = Pipeline()
    line_a = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None, PipelineDataTuple(src_triples))
    line_a = pipeline.append_step(GraphToolbox.interface, PipelineDataTuple(line_a), PipelineDataTuple(src_triples))
    line_a = pipeline.append_step(D2VInterfaceWrapper.interface, PipelineDataTuple(line_a), PipelineDataTuple(dim))
    line_b = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None, PipelineDataTuple(tgt_triples))
    line_b = pipeline.append_step(GraphToolbox.interface, PipelineDataTuple(line_b), PipelineDataTuple(tgt_triples))
    line_b = pipeline.append_step(D2VInterfaceWrapper.interface, PipelineDataTuple(line_b), PipelineDataTuple(dim))
    line_ab = pipeline.append_step(FlatMatcher.interface, PipelineDataTuple(line_a, line_b),
                                   PipelineDataTuple(gold_mapping, model, logfile, name))

    configuration = Configuration(name, src_corpus, tgt_corpus, src_triples, tgt_triples, gold_mapping, logfile, dim,
                                  pipeline)
    configuration_handler = ConfigurationHandler()
    configuration_handler.execute(configuration)
Beispiel #25
0
def exec(graph1, graph2, ml_model):

    OAEIMatchdata_Saver.interface(PipelineDataTuple(graph1, graph2), None, CONFIGURATION)

    train = None
    for trainset in CONFIGURATION.gold_mapping.prepared_trainsets:
        if train is None:
            train = pd.read_csv(trainset, index_col=['Unnamed: 0'])
        else:
            tmp_train = pd.read_csv(trainset, index_col=['Unnamed: 0'])
            train = pd.concat([train, tmp_train], ignore_index=True)

    # stream test data


    # #### Alternative 1: Sample the training data manually.
    #a = train_simple.loc[train_simple['label']==1].sample(n=100, replace=False)
    #b = train_simple.loc[train_simple['label']==0].sample(n=100, replace=False)
    #c = train_hard.loc[train_hard['label']==1].sample(n=0, replace=False)
    #d = train_hard.loc[train_hard['label']==0].sample(n=600, replace=False)
    #train = d.append(c.append(a.append(b, ignore_index=True), ignore_index=True), ignore_index=True)
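    # Alternative 2 (illustrative sketch only, not part of the original pipeline): a class-balanced
    # sample could also be drawn from the combined trainset, assuming it carries a binary 'label' column:
    #n_per_class = min(len(train.loc[train['label'] == 1]), len(train.loc[train['label'] == 0]))
    #pos = train.loc[train['label'] == 1].sample(n=n_per_class, replace=False)
    #neg = train.loc[train['label'] == 0].sample(n=n_per_class, replace=False)
    #train = pd.concat([pos, neg], ignore_index=True)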
    cachefile_path = None
    import hashlib
    import re
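    # the cache key is the SHA-256 hash of the (re.escape'd) path of the first raw testset, so a
    # previously prepared testset is only reused when exactly the same file is matched again and
    # use_cache is enabled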
    cachefile = hashlib.sha256(bytes(re.escape(CONFIGURATION.gold_mapping.raw_testsets[0]), encoding='UTF-8')).hexdigest() + '.cache'
    if os.path.exists(CONFIGURATION.cachedir + cachefile) and CONFIGURATION.use_cache:
        cachefile_path = CONFIGURATION.cachedir + cachefile

    test = stream_prepare_data_from_graph(graph1, graph2, CONFIGURATION.gold_mapping.raw_testsets[0], cachefile_path)


    # ## Prepare train/test/prediction data
    x_train = train.loc[:, train.columns != 'label']
    y_train = train['label']


    # ## Prediction
    model = ml_model  # e.g. RandomForestClassifier(n_estimators=100, max_depth=10, random_state=0) or LogisticRegression(solver='lbfgs')
    model = model.fit(x_train, y_train)
    #syntactic_model = LogisticRegression(solver='lbfgs')#RandomForestClassifier(n_estimators=100, max_depth=10, random_state=0)
    #syntactic_model = syntactic_model.fit(pd.DataFrame(x_train['syntactic_diff']), y_train)
    dump(model, CONFIGURATION.rundir + 'model.joblib')
    #dump(model, CONFIGURATION.rundir + 'syntactic_model.joblib')

    prediction = list()
    gold = list()
    plus_prediction = list()
    plus_gold = list()
    ctr = 0
    df = None
    for sample in test:
        ctr = ctr + 1
        print(str(ctr))
        # collect predictions and gold labels per streamed sample so that the
        # classification report below covers the complete testset
        prediction = prediction + model.predict(sample.loc[:, sample.columns != 'label']).tolist()
        gold = gold + sample['label'].tolist()
        if df is None:
            df = sample
        else:
            df = pd.concat((df, sample), axis=1)
    #    if sample.plus_diff.values[0] > 0.68 and sample.label.values[0] == 1 or sample.plus_diff.values[0] < 0.68 and sample.label.values[0] == 0:
    #        plus_prediction = plus_prediction + model.predict(sample.loc[:, sample.columns != 'label']).tolist()
    #        plus_gold = plus_gold + sample['label'].tolist()
    #    CONFIGURATION.log(str(sample.iloc[0].tolist() + [prediction]) + '\n')

    prediction = np.array(prediction)
    gold = np.array(gold)
    plus_prediction = np.array(plus_prediction)
    plus_gold = np.array(plus_gold)

    result = classification_report(prediction, gold, target_names=['false', 'true'])
    print("Results on test:")
    print(result)
    print(ConfusionMatrix(prediction, gold))
    print("\n\n--------------------------------------------------------------\n")
    CONFIGURATION.log("Results on test:")
    CONFIGURATION.log(str(result))
    CONFIGURATION.log(str(ConfusionMatrix(prediction, gold)))
    CONFIGURATION.log("\n\n--------------------------------------------------------------\n")

    # the plus-score evaluation is only meaningful if plus samples were collected above
    if len(plus_gold) > 0:
        plus_result = classification_report(plus_prediction, plus_gold, target_names=['false', 'true'])
        print("Results+ on test:")
        print(plus_result)
        print(ConfusionMatrix(plus_prediction, plus_gold))
        print("\n\n--------------------------------------------------------------\n")
        CONFIGURATION.log("Results+ on test:")
        CONFIGURATION.log(str(plus_result))
        CONFIGURATION.log(str(ConfusionMatrix(plus_prediction, plus_gold)))
        CONFIGURATION.log("\n\n--------------------------------------------------------------\n")

    print("Syntactic matching results+ on test: 0.0%")
    CONFIGURATION.log("Syntactic matching results+ on test: 0.0%")

    print("\n################################################################\n\n")
    CONFIGURATION.log("\n################################################################\n\n")


    return PipelineDataTuple(graph1, graph2)
Beispiel #26
0
def exec(graph1, graph2, model):

            setsize = 1000
            # Now start prediction:
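            # batch_prepare_data_from_graph builds feature vectors for all gold-standard pairs
            # (positive, negative and combined) together with their (src_id, tgt_id) identifiers;
            # extract_non_trivial_matches (judging by its name and property arguments) restricts
            # these to pairs that are not trivially resolvable by a syntactic comparison of the
            # configured label properties.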


            positive_samples, negative_samples, combined_samples, combined_samples_ids = batch_prepare_data_from_graph(graph1, graph2, CONFIGURATION.gold_mapping)
            positive_samples, negative_samples, combined_samples = extend_features(positive_samples), extend_features(negative_samples), extend_features(combined_samples)
            non_trivial_matches_ids = extract_non_trivial_matches(graph1, graph2, combined_samples_ids, CONFIGURATION.src_properties, CONFIGURATION.tgt_properties, combined_samples)

            combined_samples.to_csv(CONFIGURATION.rundir+"combined.csv")
            combined_samples.to_csv(CONFIGURATION.projectdir+"combined.csv")
            combined_samples_ids.to_csv(CONFIGURATION.rundir+"combined_ids.csv")
            negative_samples.to_csv(CONFIGURATION.rundir+"negatives.csv")
            positive_samples.to_csv(CONFIGURATION.rundir+"positives.csv")
            package_directory = os.path.dirname(os.path.abspath(__file__))

            CONFIGURATION.log("\n\n")
            CONFIGURATION.log("#####################################################\n")
            CONFIGURATION.log("#" + CONFIGURATION.name + " / " + str(model) + "\n")
            CONFIGURATION.log("-----------------------------------------------------\n")


            #Train/Test split
            X = pd.DataFrame(combined_samples.loc[:,combined_samples.columns != 'label'])
            X = pd.concat([X, combined_samples_ids], axis=1, sort=False)
            Y = pd.DataFrame(combined_samples.loc[:,'label'])

            X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, Y, test_size=0.5,
                                                                                        random_state=7)


            # note: set_index returns a new frame and the results of the three calls below are not
            # kept, so the subsequent merges simply join on the columns the frames share (src_id/tgt_id)
            X_test.set_index(['src_id','tgt_id'])
            combined_samples_ids.set_index(['src_id','tgt_id'])
            non_trivial_matches_ids.set_index(['src_id','tgt_id'])
            combined_samples_ids = X_test.merge(combined_samples_ids, how='inner')
            non_trivial_matches_ids = X_test.merge(non_trivial_matches_ids, how='inner')
            combined_samples_ids = combined_samples_ids.reset_index(drop = True)
            non_trivial_matches_ids = non_trivial_matches_ids.reset_index(drop = True)

            X_train = X_train.drop(['src_id','tgt_id'], axis=1)
            X_test = X_test.drop(['src_id','tgt_id'], axis=1)


            # fit model to training data
            model.fit(X_train, y_train.values.ravel())



            #scaler = StandardScaler()

            # Fit only to the training data
            #scaler.fit(X_train)



            # Now apply the transformations to the data:
            #X_train = scaler.transform(X_train)
            #X_test = scaler.transform(X_test)


            y_pred = model.predict(X_train)
            y_pred = np.array(y_pred)#scipy.stats.zscore(np.array(y_pred))
            predictions = [1 if value > 0.5 else 0 for value in y_pred]
            # evaluate predictions
            CONFIGURATION.log("Macro train: "+str(precision_recall_fscore_support(y_train, predictions, average='macro')) + "\n")
            CONFIGURATION.log("Micro train: "+str(precision_recall_fscore_support(y_train, predictions, average='micro')) + "\n")
            CONFIGURATION.log("#####################################################\n")


            y_test = y_test.reset_index(drop=True)
            y_train = y_train.reset_index(drop=True)

            y_pred = model.predict(X_test)
            y_pred = np.array(y_pred)
            #y_pred = scipy.stats.zscore(np.array(y_pred))
            persisted_predictions = [1 if value > 0.5 else 0 for value in y_pred]
            # evaluate predictions
            CONFIGURATION.log("Macro test: "+str(precision_recall_fscore_support(y_test, persisted_predictions, average='macro')) + "\n")
            CONFIGURATION.log("Micro test: "+str(precision_recall_fscore_support(y_test, persisted_predictions, average='micro')) + "\n")
            CONFIGURATION.log("#####################################################\n")
            target_names = ['neg', 'pos']
            CONFIGURATION.log("Report (pos: "+str(setsize)+" / neg: "+str(setsize)+"):\n")
            CONFIGURATION.log(str(classification_report(y_test, persisted_predictions, target_names=target_names)) + "\n")
            non_trivials = pd.merge(non_trivial_matches_ids, combined_samples_ids, left_on=['src_id','tgt_id'], right_on=['src_id','tgt_id'], how='right', indicator=True)
            non_trivials = non_trivials.loc[non_trivials['_merge'] == 'both'].index.tolist()
            #y_test = y_test['label']
            CONFIGURATION.log("#####################################################\n")
            CONFIGURATION.log("Report+ :" + str(classification_report(y_test.loc[y_test.index.isin(non_trivials)], np.array(persisted_predictions)[non_trivials], target_names=target_names)) + "\n")

            # Schema correspondence predictions
            # In the following code segment, schema correspondences are predicted using the instance-matching model.
            # However, this method is not recommended, as the model is (most likely) primarily or only trained on
            # instance-correspondences.
            '''schema_data, schema_data_ids = get_schema_data_from_graph(graph1, graph2)
            schema_data = extend_features(schema_data)
            y_pred = model.predict(schema_data)
            y_pred = scipy.stats.zscore(np.array(y_pred))
            predictions = [1 if value > 0 else 0 for value in y_pred]
            schema_predicted = pd.concat([pd.DataFrame({"prediction":predictions}), schema_data_ids], axis=1, sort=False)
            schema_predicted.to_csv(index=False,path_or_buf=package_directory+"/../../predicted_data.csv", header=False)
            pd.options.display.max_colwidth = 100
            pd.set_option('display.max_colwidth', -1)
            CONFIGURATION.log("\nschema matches predicted with ML model:\n")
            schema_predicted = schema_predicted[schema_predicted['prediction'] == 1]
            #CONFIGURATION.log(schema_predicted.to_string()+"\n")'''


            CONFIGURATION.log("\nschema matches predicted with heuristics:\n")
            persisted_predictions = [x == 1 for x in persisted_predictions]
            positive_predictions = combined_samples_ids[persisted_predictions]
            correspondence_types = dict()
            for index, row in positive_predictions.iterrows():
                try:
                    srckey = str(graph1.elements[row['src_id']].relations['http://www.w3.org/1999/02/22-rdf-syntax-ns#type'].descriptor)
                    tgtkey = str(graph2.elements[row['tgt_id']].relations['http://www.w3.org/1999/02/22-rdf-syntax-ns#type'].descriptor)
                    if srckey in correspondence_types.keys():
                        if tgtkey in correspondence_types[srckey].keys():
                            correspondence_types[srckey][tgtkey] = correspondence_types[srckey][tgtkey] + 1
                        else:
                            correspondence_types[srckey][tgtkey] = 1
                    else:
                        correspondence_types[srckey] = dict()
                        correspondence_types[srckey][tgtkey] = 1
                except (KeyError, AttributeError):
                    # pairs whose resources have no rdf:type relation are skipped
                    pass

            for srckey, val in correspondence_types.items():
                maxtgtkey = None
                for tgtkey, count in val.items():
                    if maxtgtkey is None:
                        maxtgtkey = tgtkey
                    if count > val[maxtgtkey]:
                        maxtgtkey = tgtkey
                CONFIGURATION.log(str(srckey) + " --> " + str(maxtgtkey) + "\n")

            CONFIGURATION.log("\n\n\n")
            print("     --> Evaluated; logs written to " + str(CONFIGURATION.logfile))

            return PipelineDataTuple(graph1, graph2)# just return the original graph data; this is assumed to be the final step in the pipeline!
Beispiel #27
0
def main():

    logfile = os.path.join(package_directory, '..', 'results.log')

    src_triples = os.path.join(package_directory, '..', 'data',
                               'sap_hilti_data', 'balanced_walks',
                               'graph_triples_hilti_erp.nt')
    tgt_triples = os.path.join(package_directory, '..', 'data',
                               'sap_hilti_data', 'balanced_walks',
                               'graph_triples_hilti_web.nt')
    src_corpus = os.path.join(package_directory, '..', 'data',
                              'sap_hilti_data', 'balanced_walks',
                              'corpus_hilti_erp.txt')
    tgt_corpus = os.path.join(package_directory, '..', 'data',
                              'sap_hilti_data', 'balanced_walks',
                              'corpus_hilti_web.txt')
    gold_mapping = InternalGoldStandard({
        'trainsets': [
            os.path.join(package_directory, '..', 'data', 'sap_hilti_data',
                         'balanced_walks', 'final_trainset.csv')
        ],
        'testsets': [
            os.path.join(package_directory, '..', 'data', 'sap_hilti_data',
                         'balanced_walks', 'possible_matches.csv')
        ]
    })
    dim = 20
    model = XGBClassifier()
    src_properties = [
        "http://rdata2graph.sap.com/hilti_erp/property/mara_fert.maktx"
    ]
    tgt_properties = [
        "http://rdata2graph.sap.com/hilti_web/property/products.name"
    ]
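
    # The configurations below compare walk-based embeddings (WalkEmbedder_1 / WalkD2V_1Embedder) on
    # the balanced_walks HILTI data, varying walk mode ('steps' vs. 'batch'), walk length, 3-gram
    # tokenisation and embedding dimension. The first configuration is a purely syntactic baseline and
    # the last one only produces visualisations; the rest use the same XGBoost FlatMatcher followed by
    # a stable-rank post-processing step.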

    name = "HILTI_pure_syntax"
    pipeline = Pipeline()
    line_a = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None,
                                  PipelineDataTuple(src_triples))
    line_a = pipeline.append_step(GraphToolbox.interface,
                                  PipelineDataTuple(line_a),
                                  PipelineDataTuple(src_triples))
    line_b = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None,
                                  PipelineDataTuple(tgt_triples))
    line_b = pipeline.append_step(GraphToolbox.interface,
                                  PipelineDataTuple(line_b),
                                  PipelineDataTuple(tgt_triples))
    line_ab = pipeline.append_step(WalkEmbedder_1.interface,
                                   PipelineDataTuple(line_a, line_b),
                                   PipelineDataTuple(1, 'steps', False, 1))
    line_ab = pipeline.append_step(concat_combiner.interface,
                                   PipelineDataTuple(line_ab), None)
    #line_ab = pipeline.append_step(muse.interface, PipelineDataTuple(line_ab), PipelineDataTuple(gold_mapping))
    line_ab = pipeline.append_step(PureSyntaxMatcher.interface,
                                   PipelineDataTuple(line_ab),
                                   PipelineDataTuple(model))
    line_ab = pipeline.append_step(EmbeddingSaver.interface,
                                   PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(StableRankSyntaxMatcher.interface,
                                   PipelineDataTuple(line_ab), None)
    #line_ab = pipeline.append_step(TSNEInterface.interface, PipelineDataTuple(line_ab), PipelineDataTuple(2))
    #line_ab = pipeline.append_step(EmbeddingSaver.interface, PipelineDataTuple(line_ab), None)
    #line_ab = pipeline.append_step(CategoriesVisualizer.interface, PipelineDataTuple(line_ab), None)
    #line_ab = pipeline.append_step(StratifiedVisualizer.interface, PipelineDataTuple(line_ab), None)
    #line_ab = pipeline.append_step(TypeVisualizer.interface, PipelineDataTuple(line_ab), None)
    #line_ab = pipeline.append_step(FullVisualizer.interface, PipelineDataTuple(line_ab), None)

    configuration = Configuration(name,
                                  src_corpus,
                                  tgt_corpus,
                                  src_triples,
                                  tgt_triples,
                                  gold_mapping,
                                  dim,
                                  pipeline,
                                  src_properties,
                                  tgt_properties,
                                  calc_PLUS_SCORE=False,
                                  use_cache=False,
                                  use_streams=False)
    configuration_handler = ConfigurationHandler()
    configuration_handler.execute(configuration)

    name = "HILTI_w2v_steps_walklength1"
    pipeline = Pipeline()
    line_a = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None,
                                  PipelineDataTuple(src_triples))
    line_a = pipeline.append_step(GraphToolbox.interface,
                                  PipelineDataTuple(line_a),
                                  PipelineDataTuple(src_triples))
    line_b = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None,
                                  PipelineDataTuple(tgt_triples))
    line_b = pipeline.append_step(GraphToolbox.interface,
                                  PipelineDataTuple(line_b),
                                  PipelineDataTuple(tgt_triples))
    line_ab = pipeline.append_step(WalkEmbedder_1.interface,
                                   PipelineDataTuple(line_a, line_b),
                                   PipelineDataTuple(dim, 'steps', False, 1))
    line_ab = pipeline.append_step(concat_combiner.interface,
                                   PipelineDataTuple(line_ab), None)
    #line_ab = pipeline.append_step(muse.interface, PipelineDataTuple(line_ab), PipelineDataTuple(gold_mapping))
    line_ab = pipeline.append_step(FlatMatcher.interface,
                                   PipelineDataTuple(line_ab),
                                   PipelineDataTuple(model))
    line_ab = pipeline.append_step(EmbeddingSaver.interface,
                                   PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(StableRankMatcher.interface,
                                   PipelineDataTuple(line_ab), None)
    #line_ab = pipeline.append_step(TSNEInterface.interface, PipelineDataTuple(line_ab), PipelineDataTuple(2))
    #line_ab = pipeline.append_step(EmbeddingSaver.interface, PipelineDataTuple(line_ab), None)
    #line_ab = pipeline.append_step(CategoriesVisualizer.interface, PipelineDataTuple(line_ab), None)
    #line_ab = pipeline.append_step(StratifiedVisualizer.interface, PipelineDataTuple(line_ab), None)
    #line_ab = pipeline.append_step(TypeVisualizer.interface, PipelineDataTuple(line_ab), None)
    #line_ab = pipeline.append_step(FullVisualizer.interface, PipelineDataTuple(line_ab), None)

    configuration = Configuration(name,
                                  src_corpus,
                                  tgt_corpus,
                                  src_triples,
                                  tgt_triples,
                                  gold_mapping,
                                  dim,
                                  pipeline,
                                  src_properties,
                                  tgt_properties,
                                  calc_PLUS_SCORE=False,
                                  use_cache=False,
                                  use_streams=False)
    configuration_handler = ConfigurationHandler()
    configuration_handler.execute(configuration)

    name = "HILTI_w2v_steps_walklength1_muse"
    pipeline = Pipeline()
    line_a = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None,
                                  PipelineDataTuple(src_triples))
    line_a = pipeline.append_step(GraphToolbox.interface,
                                  PipelineDataTuple(line_a),
                                  PipelineDataTuple(src_triples))
    line_b = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None,
                                  PipelineDataTuple(tgt_triples))
    line_b = pipeline.append_step(GraphToolbox.interface,
                                  PipelineDataTuple(line_b),
                                  PipelineDataTuple(tgt_triples))
    line_ab = pipeline.append_step(WalkEmbedder_1.interface,
                                   PipelineDataTuple(line_a, line_b),
                                   PipelineDataTuple(dim, 'steps', False, 1))
    line_ab = pipeline.append_step(concat_combiner.interface,
                                   PipelineDataTuple(line_ab), None)
    #line_ab = pipeline.append_step(muse.interface, PipelineDataTuple(line_ab), PipelineDataTuple(gold_mapping))
    line_ab = pipeline.append_step(FlatMatcher.interface,
                                   PipelineDataTuple(line_ab),
                                   PipelineDataTuple(model))
    #line_ab = pipeline.append_step(TSNEInterface.interface, PipelineDataTuple(line_ab), PipelineDataTuple(2))
    line_ab = pipeline.append_step(EmbeddingSaver.interface,
                                   PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(StableRankMatcher.interface,
                                   PipelineDataTuple(line_ab), None)

    configuration = Configuration(name,
                                  src_corpus,
                                  tgt_corpus,
                                  src_triples,
                                  tgt_triples,
                                  gold_mapping,
                                  dim,
                                  pipeline,
                                  src_properties,
                                  tgt_properties,
                                  calc_PLUS_SCORE=False,
                                  use_cache=False,
                                  use_streams=False)
    configuration_handler = ConfigurationHandler()
    configuration_handler.execute(configuration)

    name = "HILTI_w2v_steps_walklength3"
    pipeline = Pipeline()
    line_a = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None,
                                  PipelineDataTuple(src_triples))
    line_a = pipeline.append_step(GraphToolbox.interface,
                                  PipelineDataTuple(line_a),
                                  PipelineDataTuple(src_triples))
    line_b = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None,
                                  PipelineDataTuple(tgt_triples))
    line_b = pipeline.append_step(GraphToolbox.interface,
                                  PipelineDataTuple(line_b),
                                  PipelineDataTuple(tgt_triples))
    line_ab = pipeline.append_step(WalkEmbedder_1.interface,
                                   PipelineDataTuple(line_a, line_b),
                                   PipelineDataTuple(dim, 'steps', False, 3))
    line_ab = pipeline.append_step(concat_combiner.interface,
                                   PipelineDataTuple(line_ab), None)
    #line_ab = pipeline.append_step(muse.interface, PipelineDataTuple(line_ab), PipelineDataTuple(gold_mapping))
    line_ab = pipeline.append_step(FlatMatcher.interface,
                                   PipelineDataTuple(line_ab),
                                   PipelineDataTuple(model))
    #line_ab = pipeline.append_step(TSNEInterface.interface, PipelineDataTuple(line_ab), PipelineDataTuple(2))
    line_ab = pipeline.append_step(EmbeddingSaver.interface,
                                   PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(StableRankMatcher.interface,
                                   PipelineDataTuple(line_ab), None)

    configuration = Configuration(name,
                                  src_corpus,
                                  tgt_corpus,
                                  src_triples,
                                  tgt_triples,
                                  gold_mapping,
                                  dim,
                                  pipeline,
                                  src_properties,
                                  tgt_properties,
                                  calc_PLUS_SCORE=False,
                                  use_cache=False,
                                  use_streams=False)
    configuration_handler = ConfigurationHandler()
    configuration_handler.execute(configuration)

    name = "HILTI_w2v_steps_walklength1_3grams"
    pipeline = Pipeline()
    line_a = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None,
                                  PipelineDataTuple(src_triples))
    line_a = pipeline.append_step(GraphToolbox.interface,
                                  PipelineDataTuple(line_a),
                                  PipelineDataTuple(src_triples))
    line_b = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None,
                                  PipelineDataTuple(tgt_triples))
    line_b = pipeline.append_step(GraphToolbox.interface,
                                  PipelineDataTuple(line_b),
                                  PipelineDataTuple(tgt_triples))
    line_ab = pipeline.append_step(WalkEmbedder_1.interface,
                                   PipelineDataTuple(line_a, line_b),
                                   PipelineDataTuple(dim, 'steps', True, 1))
    line_ab = pipeline.append_step(concat_combiner.interface,
                                   PipelineDataTuple(line_ab), None)
    #line_ab = pipeline.append_step(muse.interface, PipelineDataTuple(line_ab), PipelineDataTuple(gold_mapping))
    line_ab = pipeline.append_step(FlatMatcher.interface,
                                   PipelineDataTuple(line_ab),
                                   PipelineDataTuple(model))
    #line_ab = pipeline.append_step(TSNEInterface.interface, PipelineDataTuple(line_ab), PipelineDataTuple(2))
    line_ab = pipeline.append_step(EmbeddingSaver.interface,
                                   PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(StableRankMatcher.interface,
                                   PipelineDataTuple(line_ab), None)

    configuration = Configuration(name,
                                  src_corpus,
                                  tgt_corpus,
                                  src_triples,
                                  tgt_triples,
                                  gold_mapping,
                                  dim,
                                  pipeline,
                                  src_properties,
                                  tgt_properties,
                                  calc_PLUS_SCORE=False,
                                  use_cache=False,
                                  use_streams=False)
    configuration_handler = ConfigurationHandler()
    configuration_handler.execute(configuration)

    name = "HILTI_w2v_batch_walklength1"
    pipeline = Pipeline()
    line_a = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None,
                                  PipelineDataTuple(src_triples))
    line_a = pipeline.append_step(GraphToolbox.interface,
                                  PipelineDataTuple(line_a),
                                  PipelineDataTuple(src_triples))
    line_b = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None,
                                  PipelineDataTuple(tgt_triples))
    line_b = pipeline.append_step(GraphToolbox.interface,
                                  PipelineDataTuple(line_b),
                                  PipelineDataTuple(tgt_triples))
    line_ab = pipeline.append_step(WalkEmbedder_1.interface,
                                   PipelineDataTuple(line_a, line_b),
                                   PipelineDataTuple(dim, 'batch', False, 1))
    line_ab = pipeline.append_step(concat_combiner.interface,
                                   PipelineDataTuple(line_ab), None)
    #line_ab = pipeline.append_step(muse.interface, PipelineDataTuple(line_ab), PipelineDataTuple(gold_mapping))
    line_ab = pipeline.append_step(FlatMatcher.interface,
                                   PipelineDataTuple(line_ab),
                                   PipelineDataTuple(model))
    #line_ab = pipeline.append_step(TSNEInterface.interface, PipelineDataTuple(line_ab), PipelineDataTuple(2))
    line_ab = pipeline.append_step(EmbeddingSaver.interface,
                                   PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(StableRankMatcher.interface,
                                   PipelineDataTuple(line_ab), None)

    configuration = Configuration(name,
                                  src_corpus,
                                  tgt_corpus,
                                  src_triples,
                                  tgt_triples,
                                  gold_mapping,
                                  dim,
                                  pipeline,
                                  src_properties,
                                  tgt_properties,
                                  calc_PLUS_SCORE=False,
                                  use_cache=False,
                                  use_streams=False)
    configuration_handler = ConfigurationHandler()
    configuration_handler.execute(configuration)

    name = "HILTI_w2v_steps_walklength1_dim100"
    pipeline = Pipeline()
    line_a = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None,
                                  PipelineDataTuple(src_triples))
    line_a = pipeline.append_step(GraphToolbox.interface,
                                  PipelineDataTuple(line_a),
                                  PipelineDataTuple(src_triples))
    line_b = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None,
                                  PipelineDataTuple(tgt_triples))
    line_b = pipeline.append_step(GraphToolbox.interface,
                                  PipelineDataTuple(line_b),
                                  PipelineDataTuple(tgt_triples))
    line_ab = pipeline.append_step(WalkEmbedder_1.interface,
                                   PipelineDataTuple(line_a, line_b),
                                   PipelineDataTuple(100, 'steps', False, 1))
    line_ab = pipeline.append_step(concat_combiner.interface,
                                   PipelineDataTuple(line_ab), None)
    #line_ab = pipeline.append_step(muse.interface, PipelineDataTuple(line_ab), PipelineDataTuple(gold_mapping))
    line_ab = pipeline.append_step(FlatMatcher.interface,
                                   PipelineDataTuple(line_ab),
                                   PipelineDataTuple(model))
    #line_ab = pipeline.append_step(TSNEInterface.interface, PipelineDataTuple(line_ab), PipelineDataTuple(2))
    line_ab = pipeline.append_step(EmbeddingSaver.interface,
                                   PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(StableRankMatcher.interface,
                                   PipelineDataTuple(line_ab), None)

    configuration = Configuration(name,
                                  src_corpus,
                                  tgt_corpus,
                                  src_triples,
                                  tgt_triples,
                                  gold_mapping,
                                  dim,
                                  pipeline,
                                  src_properties,
                                  tgt_properties,
                                  calc_PLUS_SCORE=False,
                                  use_cache=False,
                                  use_streams=False)
    configuration_handler = ConfigurationHandler()
    configuration_handler.execute(configuration)

    name = "HILTI_d2v_steps_walklength1_muse"
    pipeline = Pipeline()
    line_a = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None,
                                  PipelineDataTuple(src_triples))
    line_a = pipeline.append_step(GraphToolbox.interface,
                                  PipelineDataTuple(line_a),
                                  PipelineDataTuple(src_triples))
    line_b = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None,
                                  PipelineDataTuple(tgt_triples))
    line_b = pipeline.append_step(GraphToolbox.interface,
                                  PipelineDataTuple(line_b),
                                  PipelineDataTuple(tgt_triples))
    line_ab = pipeline.append_step(WalkD2V_1Embedder.interface,
                                   PipelineDataTuple(line_a, line_b),
                                   PipelineDataTuple(dim, 'steps', False, 1))
    line_ab = pipeline.append_step(concat_combiner.interface,
                                   PipelineDataTuple(line_ab), None)
    #line_ab = pipeline.append_step(muse.interface, PipelineDataTuple(line_ab), PipelineDataTuple(gold_mapping))
    line_ab = pipeline.append_step(FlatMatcher.interface,
                                   PipelineDataTuple(line_ab),
                                   PipelineDataTuple(model))
    #line_ab = pipeline.append_step(TSNEInterface.interface, PipelineDataTuple(line_ab), PipelineDataTuple(2))
    line_ab = pipeline.append_step(EmbeddingSaver.interface,
                                   PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(StableRankMatcher.interface,
                                   PipelineDataTuple(line_ab), None)

    configuration = Configuration(name,
                                  src_corpus,
                                  tgt_corpus,
                                  src_triples,
                                  tgt_triples,
                                  gold_mapping,
                                  dim,
                                  pipeline,
                                  src_properties,
                                  tgt_properties,
                                  calc_PLUS_SCORE=False,
                                  use_cache=False,
                                  use_streams=False)
    configuration_handler = ConfigurationHandler()
    configuration_handler.execute(configuration)

    name = "HILTI_visualization"
    pipeline = Pipeline()
    line_a = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None,
                                  PipelineDataTuple(src_triples))
    line_a = pipeline.append_step(GraphToolbox.interface,
                                  PipelineDataTuple(line_a),
                                  PipelineDataTuple(src_triples))
    line_b = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None,
                                  PipelineDataTuple(tgt_triples))
    line_b = pipeline.append_step(GraphToolbox.interface,
                                  PipelineDataTuple(line_b),
                                  PipelineDataTuple(tgt_triples))
    line_ab = pipeline.append_step(WalkEmbedder_1.interface,
                                   PipelineDataTuple(line_a, line_b),
                                   PipelineDataTuple(2, 'steps', False, 1))
    line_ab = pipeline.append_step(concat_combiner.interface,
                                   PipelineDataTuple(line_ab), None)
    #line_ab = pipeline.append_step(muse.interface, PipelineDataTuple(line_ab), PipelineDataTuple(gold_mapping))
    line_ab = pipeline.append_step(TSNEInterface.interface,
                                   PipelineDataTuple(line_ab),
                                   PipelineDataTuple(2))
    line_ab = pipeline.append_step(EmbeddingSaver.interface,
                                   PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(CategoriesVisualizer.interface,
                                   PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(StratifiedVisualizer.interface,
                                   PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(TypeVisualizer.interface,
                                   PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(FullVisualizer.interface,
                                   PipelineDataTuple(line_ab), None)

    configuration = Configuration(name,
                                  src_corpus,
                                  tgt_corpus,
                                  src_triples,
                                  tgt_triples,
                                  gold_mapping,
                                  dim,
                                  pipeline,
                                  src_properties,
                                  tgt_properties,
                                  calc_PLUS_SCORE=False,
                                  use_cache=False,
                                  use_streams=False)
    configuration_handler = ConfigurationHandler()
    configuration_handler.execute(configuration)
Beispiel #28
0
def main_ngram_string():
    logfile = os.path.join(package_directory, '..', 'results.log')

    src_triples = os.path.join(package_directory, '..', 'data',
                               'sap_hilti_data', 'sap_hilti_3grams',
                               'graph_triples_hilti_erp.nt')
    tgt_triples = os.path.join(package_directory, '..', 'data',
                               'sap_hilti_data', 'sap_hilti_3grams',
                               'graph_triples_hilti_web.nt')
    src_corpus = os.path.join(package_directory, '..', 'data',
                              'sap_hilti_data', 'sap_hilti_3grams',
                              'corpus_hilti_erp.txt')
    tgt_corpus = os.path.join(package_directory, '..', 'data',
                              'sap_hilti_data', 'sap_hilti_3grams',
                              'corpus_hilti_web.txt')
    gold_mapping = InternalGoldStandard({
        'trainsets': [
            os.path.join(package_directory, '..', 'data', 'sap_hilti_data',
                         'sap_hilti_3grams', 'train_simple_sap_hilti.csv'),
            os.path.join(package_directory, '..', 'data', 'sap_hilti_data',
                         'sap_hilti_3grams', 'train_hard_sap_hilti.csv')
        ],
        'testsets': [
            os.path.join(package_directory, '..', 'data', 'sap_hilti_data',
                         'sap_hilti_3grams', 'test_simple_sap_hilti.csv'),
            os.path.join(package_directory, '..', 'data', 'sap_hilti_data',
                         'sap_hilti_3grams', 'test_hard_sap_hilti.csv')
        ]
    })
    dim = 20
    model = XGBClassifier()
    src_properties = [
        "http://rdata2graph.sap.com/hilti_erp/property/mara_fert.maktx"
    ]
    tgt_properties = [
        "http://rdata2graph.sap.com/hilti_web/property/products.name"
    ]
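
    # The following configurations repeat the embedder comparison on the 3-gram variant of the
    # SAP/HILTI data (SimpleTriplesEmbedder, SimpleTriplesEmbedder_1, w2v, d2v), each evaluated with
    # the same XGBoost FlatMatcher and followed by t-SNE projection and visualisation steps.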

    name = "3gram: simpletriplesembedding xgb"
    pipeline = Pipeline()
    line_a = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None,
                                  PipelineDataTuple(src_triples))
    line_a = pipeline.append_step(GraphToolbox.interface,
                                  PipelineDataTuple(line_a),
                                  PipelineDataTuple(src_triples))
    line_a = pipeline.append_step(SimpleTriplesEmbedder.interface,
                                  PipelineDataTuple(line_a),
                                  PipelineDataTuple(dim))
    line_b = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None,
                                  PipelineDataTuple(tgt_triples))
    line_b = pipeline.append_step(GraphToolbox.interface,
                                  PipelineDataTuple(line_b),
                                  PipelineDataTuple(tgt_triples))
    line_b = pipeline.append_step(SimpleTriplesEmbedder.interface,
                                  PipelineDataTuple(line_b),
                                  PipelineDataTuple(dim))
    line_ab = pipeline.append_step(concat_combiner.interface,
                                   PipelineDataTuple(line_a, line_b), None)
    line_ab = pipeline.append_step(FlatMatcher.interface,
                                   PipelineDataTuple(line_ab),
                                   PipelineDataTuple(model))
    line_a = pipeline.append_step(TSNEInterface.interface,
                                  PipelineDataTuple(line_a),
                                  PipelineDataTuple(2))
    line_b = pipeline.append_step(TSNEInterface.interface,
                                  PipelineDataTuple(line_b),
                                  PipelineDataTuple(2))
    line_ab = pipeline.append_step(CategoriesVisualizer.interface,
                                   PipelineDataTuple(line_a, line_b), None)
    line_ab = pipeline.append_step(StratifiedVisualizer.interface,
                                   PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(TypeVisualizer.interface,
                                   PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(FullVisualizer.interface,
                                   PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(EmbeddingSaver.interface,
                                   PipelineDataTuple(line_ab), None)
    configuration = Configuration(name, src_corpus, tgt_corpus, src_triples,
                                  tgt_triples, gold_mapping, dim, pipeline,
                                  src_properties, tgt_properties)
    configuration_handler = ConfigurationHandler()
    configuration_handler.execute(configuration)

    name = "3gram: simpletriplesembedding_1 xgb"
    pipeline = Pipeline()
    line_a = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None,
                                  PipelineDataTuple(src_triples))
    line_a = pipeline.append_step(GraphToolbox.interface,
                                  PipelineDataTuple(line_a),
                                  PipelineDataTuple(src_triples))
    line_b = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None,
                                  PipelineDataTuple(tgt_triples))
    line_b = pipeline.append_step(GraphToolbox.interface,
                                  PipelineDataTuple(line_b),
                                  PipelineDataTuple(tgt_triples))
    line_ab = pipeline.append_step(SimpleTriplesEmbedder_1.interface,
                                   PipelineDataTuple(line_a, line_b),
                                   PipelineDataTuple(dim))
    line_ab = pipeline.append_step(FlatMatcher.interface,
                                   PipelineDataTuple(line_ab),
                                   PipelineDataTuple(model))
    line_ab = pipeline.append_step(TSNEInterface.interface,
                                   PipelineDataTuple(line_ab),
                                   PipelineDataTuple(2))
    line_ab = pipeline.append_step(CategoriesVisualizer.interface,
                                   PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(StratifiedVisualizer.interface,
                                   PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(TypeVisualizer.interface,
                                   PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(FullVisualizer.interface,
                                   PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(EmbeddingSaver.interface,
                                   PipelineDataTuple(line_ab), None)
    configuration = Configuration(name, src_corpus, tgt_corpus, src_triples,
                                  tgt_triples, gold_mapping, dim, pipeline,
                                  src_properties, tgt_properties)
    configuration_handler = ConfigurationHandler()
    configuration_handler.execute(configuration)

    name = "3gram: w2v xgb"
    pipeline = Pipeline()
    line_a = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None,
                                  PipelineDataTuple(src_triples))
    line_a = pipeline.append_step(GraphToolbox.interface,
                                  PipelineDataTuple(line_a),
                                  PipelineDataTuple(src_triples))
    line_a = pipeline.append_step(ReadSentencesInterfaceWrapper.interface,
                                  PipelineDataTuple(line_a),
                                  PipelineDataTuple(src_corpus))
    line_a = pipeline.append_step(W2VInterfaceWrapper.interface,
                                  PipelineDataTuple(line_a),
                                  PipelineDataTuple(dim))
    line_b = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None,
                                  PipelineDataTuple(tgt_triples))
    line_b = pipeline.append_step(GraphToolbox.interface,
                                  PipelineDataTuple(line_b),
                                  PipelineDataTuple(tgt_triples))
    line_b = pipeline.append_step(ReadSentencesInterfaceWrapper.interface,
                                  PipelineDataTuple(line_b),
                                  PipelineDataTuple(tgt_corpus))
    line_b = pipeline.append_step(W2VInterfaceWrapper.interface,
                                  PipelineDataTuple(line_b),
                                  PipelineDataTuple(dim))
    line_ab = pipeline.append_step(concat_combiner.interface,
                                   PipelineDataTuple(line_a, line_b), None)
    line_ab = pipeline.append_step(FlatMatcher.interface,
                                   PipelineDataTuple(line_ab),
                                   PipelineDataTuple(model))
    line_a = pipeline.append_step(TSNEInterface.interface,
                                  PipelineDataTuple(line_a),
                                  PipelineDataTuple(2))
    line_b = pipeline.append_step(TSNEInterface.interface,
                                  PipelineDataTuple(line_b),
                                  PipelineDataTuple(2))
    line_ab = pipeline.append_step(CategoriesVisualizer.interface,
                                   PipelineDataTuple(line_a, line_b), None)
    line_ab = pipeline.append_step(StratifiedVisualizer.interface,
                                   PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(TypeVisualizer.interface,
                                   PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(FullVisualizer.interface,
                                   PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(EmbeddingSaver.interface,
                                   PipelineDataTuple(line_ab), None)

    configuration = Configuration(name, src_corpus, tgt_corpus, src_triples,
                                  tgt_triples, gold_mapping, dim, pipeline,
                                  src_properties, tgt_properties)
    configuration_handler = ConfigurationHandler()
    configuration_handler.execute(configuration)

    name = "3gram: d2v xgb"
    pipeline = Pipeline()
    line_a = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None,
                                  PipelineDataTuple(src_triples))
    line_a = pipeline.append_step(GraphToolbox.interface,
                                  PipelineDataTuple(line_a),
                                  PipelineDataTuple(src_triples))
    line_a = pipeline.append_step(D2VInterfaceWrapper.interface,
                                  PipelineDataTuple(line_a),
                                  PipelineDataTuple(dim))
    line_b = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None,
                                  PipelineDataTuple(tgt_triples))
    line_b = pipeline.append_step(GraphToolbox.interface,
                                  PipelineDataTuple(line_b),
                                  PipelineDataTuple(tgt_triples))
    line_b = pipeline.append_step(D2VInterfaceWrapper.interface,
                                  PipelineDataTuple(line_b),
                                  PipelineDataTuple(dim))
    line_ab = pipeline.append_step(concat_combiner.interface,
                                   PipelineDataTuple(line_a, line_b), None)
    line_ab = pipeline.append_step(FlatMatcher.interface,
                                   PipelineDataTuple(line_ab),
                                   PipelineDataTuple(model))
    line_a = pipeline.append_step(TSNEInterface.interface,
                                  PipelineDataTuple(line_a),
                                  PipelineDataTuple(2))
    line_b = pipeline.append_step(TSNEInterface.interface,
                                  PipelineDataTuple(line_b),
                                  PipelineDataTuple(2))
    line_ab = pipeline.append_step(CategoriesVisualizer.interface,
                                   PipelineDataTuple(line_a, line_b), None)
    line_ab = pipeline.append_step(StratifiedVisualizer.interface,
                                   PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(TypeVisualizer.interface,
                                   PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(FullVisualizer.interface,
                                   PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(EmbeddingSaver.interface,
                                   PipelineDataTuple(line_ab), None)

    configuration = Configuration(name, src_corpus, tgt_corpus, src_triples,
                                  tgt_triples, gold_mapping, dim, pipeline,
                                  src_properties, tgt_properties)
    configuration_handler = ConfigurationHandler()
    configuration_handler.execute(configuration)

    #    name = "3gram: pseudod2v xgb"
    #    pipeline = Pipeline()
    #    line_a = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None, PipelineDataTuple(src_triples))
    #    line_a = pipeline.append_step(GraphToolbox.interface, PipelineDataTuple(line_a), PipelineDataTuple(src_triples))
    #    line_a = pipeline.append_step(PseudoD2VInterfaceWrapper.interface, PipelineDataTuple(line_a), PipelineDataTuple(dim))
    #    line_b = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None, PipelineDataTuple(tgt_triples))
    #    line_b = pipeline.append_step(GraphToolbox.interface, PipelineDataTuple(line_b), PipelineDataTuple(tgt_triples))
    #    line_b = pipeline.append_step(PseudoD2VInterfaceWrapper.interface, PipelineDataTuple(line_b), PipelineDataTuple(dim))
    #    line_ab = pipeline.append_step(concat_combiner.interface, PipelineDataTuple(line_a, line_b), None)
    #    line_ab = pipeline.append_step(FlatMatcher.interface, PipelineDataTuple(line_ab),
    #                                   PipelineDataTuple(model))
    #    line_ab = pipeline.append_step(TSNEInterface.interface, PipelineDataTuple(line_ab), PipelineDataTuple(2))
    #    line_ab = pipeline.append_step(StratifiedVisualizer.interface, PipelineDataTuple(line_ab), None)
    #    line_ab = pipeline.append_step(TypeVisualizer.interface, PipelineDataTuple(line_ab), None)
    #    line_ab = pipeline.append_step(CategoriesVisualizer.interface, PipelineDataTuple(line_ab), None)
    #    line_ab = pipeline.append_step(FullVisualizer.interface, PipelineDataTuple(line_ab), None)
    #    line_ab = pipeline.append_step(EmbeddingSaver.interface, PipelineDataTuple(line_ab), None)
    #
    #    configuration = Configuration(name, src_corpus, tgt_corpus, src_triples, tgt_triples, gold_mapping, dim,
    #                                  pipeline, src_properties, tgt_properties)
    #    configuration_handler = ConfigurationHandler()
    #    configuration_handler.execute(configuration)

    name = "3gram: W2V_1 xgb"
    pipeline = Pipeline()
    line_a = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None,
                                  PipelineDataTuple(src_triples))
    line_a = pipeline.append_step(GraphToolbox.interface,
                                  PipelineDataTuple(line_a),
                                  PipelineDataTuple(src_triples))
    line_a = pipeline.append_step(ReadSentencesInterfaceWrapper.interface,
                                  PipelineDataTuple(line_a),
                                  PipelineDataTuple(src_corpus))
    line_b = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None,
                                  PipelineDataTuple(tgt_triples))
    line_b = pipeline.append_step(GraphToolbox.interface,
                                  PipelineDataTuple(line_b),
                                  PipelineDataTuple(tgt_triples))
    line_b = pipeline.append_step(ReadSentencesInterfaceWrapper.interface,
                                  PipelineDataTuple(line_b),
                                  PipelineDataTuple(tgt_corpus))
    line_ab = pipeline.append_step(W2V_1InterfaceWrapper.interface,
                                   PipelineDataTuple(line_a, line_b),
                                   PipelineDataTuple(dim))
    line_ab = pipeline.append_step(FlatMatcher.interface,
                                   PipelineDataTuple(line_ab),
                                   PipelineDataTuple(model))
    line_ab = pipeline.append_step(TSNEInterface.interface,
                                   PipelineDataTuple(line_ab),
                                   PipelineDataTuple(2))
    line_ab = pipeline.append_step(StratifiedVisualizer.interface,
                                   PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(TypeVisualizer.interface,
                                   PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(CategoriesVisualizer.interface,
                                   PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(FullVisualizer.interface,
                                   PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(EmbeddingSaver.interface,
                                   PipelineDataTuple(line_ab), None)

    configuration = Configuration(name, src_corpus, tgt_corpus, src_triples,
                                  tgt_triples, gold_mapping, dim, pipeline,
                                  src_properties, tgt_properties)
    configuration_handler = ConfigurationHandler()
    configuration_handler.execute(configuration)

    name = "3gram: D2V_1 xgb"
    pipeline = Pipeline()
    line_a = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None,
                                  PipelineDataTuple(src_triples))
    line_a = pipeline.append_step(GraphToolbox.interface,
                                  PipelineDataTuple(line_a),
                                  PipelineDataTuple(src_triples))
    line_b = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None,
                                  PipelineDataTuple(tgt_triples))
    line_b = pipeline.append_step(GraphToolbox.interface,
                                  PipelineDataTuple(line_b),
                                  PipelineDataTuple(tgt_triples))
    line_ab = pipeline.append_step(D2V_1InterfaceWrapper.interface,
                                   PipelineDataTuple(line_a, line_b),
                                   PipelineDataTuple(dim))
    line_ab = pipeline.append_step(FlatMatcher.interface,
                                   PipelineDataTuple(line_ab),
                                   PipelineDataTuple(model))
    line_ab = pipeline.append_step(TSNEInterface.interface,
                                   PipelineDataTuple(line_ab),
                                   PipelineDataTuple(2))
    line_ab = pipeline.append_step(StratifiedVisualizer.interface,
                                   PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(TypeVisualizer.interface,
                                   PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(CategoriesVisualizer.interface,
                                   PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(FullVisualizer.interface,
                                   PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(EmbeddingSaver.interface,
                                   PipelineDataTuple(line_ab), None)

    configuration = Configuration(name, src_corpus, tgt_corpus, src_triples,
                                  tgt_triples, gold_mapping, dim, pipeline,
                                  src_properties, tgt_properties)
    configuration_handler = ConfigurationHandler()
    configuration_handler.execute(configuration)
def main():

    #src_triples = os.path.join(package_directory, '..', 'data', 'sap_hilti_data', 'sap_hilti_full_strings',
    #                           'graph_triples_hilti_erp.nt')
    #tgt_triples = os.path.join(package_directory, '..', 'data', 'sap_hilti_data', 'sap_hilti_full_strings',
    #                           'graph_triples_hilti_web.nt')
    #src_corpus = os.path.join(package_directory, '..', 'data', 'sap_hilti_data', 'sap_hilti_full_strings',
    #                          'corpus_hilti_erp.txt')
    #tgt_corpus = os.path.join(package_directory, '..', 'data', 'sap_hilti_data', 'sap_hilti_full_strings',
    #                          'corpus_hilti_web.txt')
    #gold_mapping = os.path.join(package_directory, '..', 'data', 'sap_hilti_data', 'sap_hilti_full_strings',
    #                            'train_simple_sap_hilti.csv')
    src_triples = os.path.join(package_directory, '..', 'data', 'oaei_data',
                               'graph_triples_darkscape.nt')
    tgt_triples = os.path.join(package_directory, '..', 'data', 'oaei_data',
                               'graph_triples_oldschoolrunescape.nt')
    src_corpus = os.path.join(package_directory, '..', 'data', 'oaei_data',
                              'corpus_darkscape.txt')
    tgt_corpus = os.path.join(package_directory, '..', 'data', 'oaei_data',
                              'corpus_oldschoolrunescape.txt')
    gold_mapping = InternalGoldStandard({
        'trainsets': [
            os.path.join(package_directory, '..', 'data', 'oaei_data',
                         'oaei_gold_standard2.csv')
        ],
        'testsets': [
            os.path.join(package_directory, '..', 'data', 'oaei_data',
                         'possible_matches.csv')
        ]
    })
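    # The trainsets presumably provide labelled correspondences for fitting the matcher,
    # while the testsets list the candidate pairs that are to be classified.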
    dim = 20
    model = XGBClassifier()  #LogisticRegression()
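    # XGBClassifier is the estimator handed to the matcher steps below; the commented-out
    # LogisticRegression() hints that any other sklearn-style classifier could be dropped in instead.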
    labelfile = os.path.join(package_directory, '..', 'data', 'oaei_data',
                             'labels.txt')
    src_properties = StringMatcher_Interface.get_labels_from_file(labelfile)
    tgt_properties = StringMatcher_Interface.get_labels_from_file(labelfile)

    name = "OAEI_emb_w2v_steps_walklength1"
    pipeline = Pipeline()
    line_a = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None,
                                  PipelineDataTuple(src_triples))
    line_a = pipeline.append_step(GraphToolbox.interface,
                                  PipelineDataTuple(line_a),
                                  PipelineDataTuple(src_triples))
    line_b = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None,
                                  PipelineDataTuple(tgt_triples))
    line_b = pipeline.append_step(GraphToolbox.interface,
                                  PipelineDataTuple(line_b),
                                  PipelineDataTuple(tgt_triples))
    line_ab = pipeline.append_step(WalkEmbedder_1.interface,
                                   PipelineDataTuple(line_a, line_b),
                                   PipelineDataTuple(dim, 'steps', False, 1))
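    # WalkEmbedder_1 parameters (inferred from the configuration names used in this file):
    # embedding dimension, walk-generation mode ('steps' vs. 'batch'), whether to add
    # character 3-grams, and the walk length.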
    line_ab = pipeline.append_step(concat_combiner.interface,
                                   PipelineDataTuple(line_ab), None)
    #line_ab = pipeline.append_step(muse.interface, PipelineDataTuple(line_ab), PipelineDataTuple(gold_mapping))
    line_ab = pipeline.append_step(EmbeddingMatcher.interface,
                                   PipelineDataTuple(line_ab),
                                   PipelineDataTuple(model))
    #line_ab = pipeline.append_step(TSNEInterface.interface, PipelineDataTuple(line_ab), PipelineDataTuple(2))
    line_ab = pipeline.append_step(EmbeddingSaver.interface,
                                   PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(StableRankEmbeddingsMatcher.interface,
                                   PipelineDataTuple(line_ab), None)

    #
    configuration = Configuration(name,
                                  src_corpus,
                                  tgt_corpus,
                                  src_triples,
                                  tgt_triples,
                                  gold_mapping,
                                  dim,
                                  pipeline,
                                  src_properties,
                                  tgt_properties,
                                  calc_PLUS_SCORE=False,
                                  use_cache=False,
                                  use_streams=False)
    configuration_handler = ConfigurationHandler()
    configuration_handler.execute(configuration)
    #
    #
    #
    name = "OAEI_emb_w2v_steps_walklength1_muse"
    pipeline = Pipeline()
    line_a = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None,
                                  PipelineDataTuple(src_triples))
    line_a = pipeline.append_step(GraphToolbox.interface,
                                  PipelineDataTuple(line_a),
                                  PipelineDataTuple(src_triples))
    line_b = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None,
                                  PipelineDataTuple(tgt_triples))
    line_b = pipeline.append_step(GraphToolbox.interface,
                                  PipelineDataTuple(line_b),
                                  PipelineDataTuple(tgt_triples))
    line_ab = pipeline.append_step(WalkEmbedder_1.interface,
                                   PipelineDataTuple(line_a, line_b),
                                   PipelineDataTuple(dim, 'steps', False, 1))
    line_ab = pipeline.append_step(concat_combiner.interface,
                                   PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(muse.interface, PipelineDataTuple(line_ab),
                                   PipelineDataTuple(gold_mapping))
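    # The muse step presumably aligns the source and target embedding spaces with the help of the
    # gold mapping (MUSE-style supervised embedding alignment) before the matcher is applied.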
    line_ab = pipeline.append_step(EmbeddingMatcher.interface,
                                   PipelineDataTuple(line_ab),
                                   PipelineDataTuple(model))
    #line_ab = pipeline.append_step(TSNEInterface.interface, PipelineDataTuple(line_ab), PipelineDataTuple(2))
    line_ab = pipeline.append_step(EmbeddingSaver.interface,
                                   PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(StableRankEmbeddingsMatcher.interface,
                                   PipelineDataTuple(line_ab), None)

    #
    configuration = Configuration(name,
                                  src_corpus,
                                  tgt_corpus,
                                  src_triples,
                                  tgt_triples,
                                  gold_mapping,
                                  dim,
                                  pipeline,
                                  src_properties,
                                  tgt_properties,
                                  calc_PLUS_SCORE=False,
                                  use_cache=False,
                                  use_streams=False)
    configuration_handler = ConfigurationHandler()
    configuration_handler.execute(configuration)
    #
    #
    name = "OAEI_emb_w2v_steps_walklength1_tsne"
    pipeline = Pipeline()
    line_a = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None,
                                  PipelineDataTuple(src_triples))
    line_a = pipeline.append_step(GraphToolbox.interface,
                                  PipelineDataTuple(line_a),
                                  PipelineDataTuple(src_triples))
    line_b = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None,
                                  PipelineDataTuple(tgt_triples))
    line_b = pipeline.append_step(GraphToolbox.interface,
                                  PipelineDataTuple(line_b),
                                  PipelineDataTuple(tgt_triples))
    line_ab = pipeline.append_step(WalkEmbedder_1.interface,
                                   PipelineDataTuple(line_a, line_b),
                                   PipelineDataTuple(dim, 'steps', False, 1))
    line_ab = pipeline.append_step(concat_combiner.interface,
                                   PipelineDataTuple(line_ab), None)
    #line_ab = pipeline.append_step(muse.interface, PipelineDataTuple(line_ab), PipelineDataTuple(gold_mapping))
    line_ab = pipeline.append_step(EmbeddingMatcher.interface,
                                   PipelineDataTuple(line_ab),
                                   PipelineDataTuple(model))
    line_ab = pipeline.append_step(TSNEInterface.interface,
                                   PipelineDataTuple(line_ab),
                                   PipelineDataTuple(2))
    line_ab = pipeline.append_step(EmbeddingSaver.interface,
                                   PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(StableRankEmbeddingsMatcher.interface,
                                   PipelineDataTuple(line_ab), None)

    #
    configuration = Configuration(name,
                                  src_corpus,
                                  tgt_corpus,
                                  src_triples,
                                  tgt_triples,
                                  gold_mapping,
                                  dim,
                                  pipeline,
                                  src_properties,
                                  tgt_properties,
                                  calc_PLUS_SCORE=False,
                                  use_cache=False,
                                  use_streams=False)
    configuration_handler = ConfigurationHandler()
    configuration_handler.execute(configuration)
    #
    #
    #
    name = "OAEI_emb_w2v_steps_walklength3"
    pipeline = Pipeline()
    line_a = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None,
                                  PipelineDataTuple(src_triples))
    line_a = pipeline.append_step(GraphToolbox.interface,
                                  PipelineDataTuple(line_a),
                                  PipelineDataTuple(src_triples))
    line_b = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None,
                                  PipelineDataTuple(tgt_triples))
    line_b = pipeline.append_step(GraphToolbox.interface,
                                  PipelineDataTuple(line_b),
                                  PipelineDataTuple(tgt_triples))
    line_ab = pipeline.append_step(WalkEmbedder_1.interface,
                                   PipelineDataTuple(line_a, line_b),
                                   PipelineDataTuple(dim, 'steps', False, 3))
    line_ab = pipeline.append_step(concat_combiner.interface,
                                   PipelineDataTuple(line_ab), None)
    #line_ab = pipeline.append_step(muse.interface, PipelineDataTuple(line_ab), PipelineDataTuple(gold_mapping))
    line_ab = pipeline.append_step(EmbeddingMatcher.interface,
                                   PipelineDataTuple(line_ab),
                                   PipelineDataTuple(model))
    #line_ab = pipeline.append_step(TSNEInterface.interface, PipelineDataTuple(line_ab), PipelineDataTuple(2))
    line_ab = pipeline.append_step(EmbeddingSaver.interface,
                                   PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(StableRankEmbeddingsMatcher.interface,
                                   PipelineDataTuple(line_ab), None)

    configuration = Configuration(name,
                                  src_corpus,
                                  tgt_corpus,
                                  src_triples,
                                  tgt_triples,
                                  gold_mapping,
                                  dim,
                                  pipeline,
                                  src_properties,
                                  tgt_properties,
                                  calc_PLUS_SCORE=False,
                                  use_cache=False,
                                  use_streams=False)
    configuration_handler = ConfigurationHandler()
    configuration_handler.execute(configuration)
    #
    name = "OAEI_emb_w2v_steps_walklength1_3grams"
    pipeline = Pipeline()
    line_a = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None,
                                  PipelineDataTuple(src_triples))
    line_a = pipeline.append_step(GraphToolbox.interface,
                                  PipelineDataTuple(line_a),
                                  PipelineDataTuple(src_triples))
    line_b = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None,
                                  PipelineDataTuple(tgt_triples))
    line_b = pipeline.append_step(GraphToolbox.interface,
                                  PipelineDataTuple(line_b),
                                  PipelineDataTuple(tgt_triples))
    line_ab = pipeline.append_step(WalkEmbedder_1.interface,
                                   PipelineDataTuple(line_a, line_b),
                                   PipelineDataTuple(dim, 'steps', True, 1))
    line_ab = pipeline.append_step(concat_combiner.interface,
                                   PipelineDataTuple(line_ab), None)
    #line_ab = pipeline.append_step(muse.interface, PipelineDataTuple(line_ab), PipelineDataTuple(gold_mapping))
    line_ab = pipeline.append_step(EmbeddingMatcher.interface,
                                   PipelineDataTuple(line_ab),
                                   PipelineDataTuple(model))
    #line_ab = pipeline.append_step(TSNEInterface.interface, PipelineDataTuple(line_ab), PipelineDataTuple(2))
    line_ab = pipeline.append_step(EmbeddingSaver.interface,
                                   PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(StableRankEmbeddingsMatcher.interface,
                                   PipelineDataTuple(line_ab), None)

    #
    configuration = Configuration(name,
                                  src_corpus,
                                  tgt_corpus,
                                  src_triples,
                                  tgt_triples,
                                  gold_mapping,
                                  dim,
                                  pipeline,
                                  src_properties,
                                  tgt_properties,
                                  calc_PLUS_SCORE=False,
                                  use_cache=False,
                                  use_streams=False)
    configuration_handler = ConfigurationHandler()
    configuration_handler.execute(configuration)

    name = "OAEI_emb_w2v_batch_walklength1"
    pipeline = Pipeline()
    line_a = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None,
                                  PipelineDataTuple(src_triples))
    line_a = pipeline.append_step(GraphToolbox.interface,
                                  PipelineDataTuple(line_a),
                                  PipelineDataTuple(src_triples))
    line_b = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None,
                                  PipelineDataTuple(tgt_triples))
    line_b = pipeline.append_step(GraphToolbox.interface,
                                  PipelineDataTuple(line_b),
                                  PipelineDataTuple(tgt_triples))
    line_ab = pipeline.append_step(WalkEmbedder_1.interface,
                                   PipelineDataTuple(line_a, line_b),
                                   PipelineDataTuple(dim, 'batch', False, 1))
    line_ab = pipeline.append_step(concat_combiner.interface,
                                   PipelineDataTuple(line_ab), None)
    #line_ab = pipeline.append_step(muse.interface, PipelineDataTuple(line_ab), PipelineDataTuple(gold_mapping))
    line_ab = pipeline.append_step(EmbeddingMatcher.interface,
                                   PipelineDataTuple(line_ab),
                                   PipelineDataTuple(model))
    #line_ab = pipeline.append_step(TSNEInterface.interface, PipelineDataTuple(line_ab), PipelineDataTuple(2))
    line_ab = pipeline.append_step(EmbeddingSaver.interface,
                                   PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(StableRankEmbeddingsMatcher.interface,
                                   PipelineDataTuple(line_ab), None)

    configuration = Configuration(name,
                                  src_corpus,
                                  tgt_corpus,
                                  src_triples,
                                  tgt_triples,
                                  gold_mapping,
                                  dim,
                                  pipeline,
                                  src_properties,
                                  tgt_properties,
                                  calc_PLUS_SCORE=False,
                                  use_cache=False,
                                  use_streams=False)
    configuration_handler = ConfigurationHandler()
    configuration_handler.execute(configuration)

    name = "OAEI_emb_w2v_steps_walklength1_dim100"
    pipeline = Pipeline()
    line_a = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None,
                                  PipelineDataTuple(src_triples))
    line_a = pipeline.append_step(GraphToolbox.interface,
                                  PipelineDataTuple(line_a),
                                  PipelineDataTuple(src_triples))
    line_b = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None,
                                  PipelineDataTuple(tgt_triples))
    line_b = pipeline.append_step(GraphToolbox.interface,
                                  PipelineDataTuple(line_b),
                                  PipelineDataTuple(tgt_triples))
    line_ab = pipeline.append_step(WalkEmbedder_1.interface,
                                   PipelineDataTuple(line_a, line_b),
                                   PipelineDataTuple(100, 'steps', False, 1))
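    # Note: this configuration embeds with 100 dimensions (hence the _dim100 name), while the
    # Configuration object below is still constructed with dim (20).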
    line_ab = pipeline.append_step(concat_combiner.interface,
                                   PipelineDataTuple(line_ab), None)
    #line_ab = pipeline.append_step(muse.interface, PipelineDataTuple(line_ab), PipelineDataTuple(gold_mapping))
    line_ab = pipeline.append_step(EmbeddingMatcher.interface,
                                   PipelineDataTuple(line_ab),
                                   PipelineDataTuple(model))
    #line_ab = pipeline.append_step(TSNEInterface.interface, PipelineDataTuple(line_ab), PipelineDataTuple(2))
    line_ab = pipeline.append_step(EmbeddingSaver.interface,
                                   PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(StableRankEmbeddingsMatcher.interface,
                                   PipelineDataTuple(line_ab), None)

    configuration = Configuration(name,
                                  src_corpus,
                                  tgt_corpus,
                                  src_triples,
                                  tgt_triples,
                                  gold_mapping,
                                  dim,
                                  pipeline,
                                  src_properties,
                                  tgt_properties,
                                  calc_PLUS_SCORE=False,
                                  use_cache=False,
                                  use_streams=False)
    configuration_handler = ConfigurationHandler()
    configuration_handler.execute(configuration)

    name = "OAEI_emb_d2v_steps_walklength1_muse"
    pipeline = Pipeline()
    line_a = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None,
                                  PipelineDataTuple(src_triples))
    line_a = pipeline.append_step(GraphToolbox.interface,
                                  PipelineDataTuple(line_a),
                                  PipelineDataTuple(src_triples))
    line_b = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None,
                                  PipelineDataTuple(tgt_triples))
    line_b = pipeline.append_step(GraphToolbox.interface,
                                  PipelineDataTuple(line_b),
                                  PipelineDataTuple(tgt_triples))
    line_ab = pipeline.append_step(WalkD2V_1Embedder.interface,
                                   PipelineDataTuple(line_a, line_b),
                                   PipelineDataTuple(dim, 'steps', False, 1))
    line_ab = pipeline.append_step(concat_combiner.interface,
                                   PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(muse.interface, PipelineDataTuple(line_ab),
                                   PipelineDataTuple(gold_mapping))
    line_ab = pipeline.append_step(EmbeddingMatcher.interface,
                                   PipelineDataTuple(line_ab),
                                   PipelineDataTuple(model))
    #line_ab = pipeline.append_step(TSNEInterface.interface, PipelineDataTuple(line_ab), PipelineDataTuple(2))
    line_ab = pipeline.append_step(EmbeddingSaver.interface,
                                   PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(StableRankEmbeddingsMatcher.interface,
                                   PipelineDataTuple(line_ab), None)

    configuration = Configuration(name,
                                  src_corpus,
                                  tgt_corpus,
                                  src_triples,
                                  tgt_triples,
                                  gold_mapping,
                                  dim,
                                  pipeline,
                                  src_properties,
                                  tgt_properties,
                                  calc_PLUS_SCORE=False,
                                  use_cache=False,
                                  use_streams=False)
    configuration_handler = ConfigurationHandler()
    configuration_handler.execute(configuration)
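
# Minimal entry-point sketch (an assumption; the original listing does not show how main() is invoked):
if __name__ == '__main__':
    main()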
Beispiel #30
0
def exec(graph1, graph2, model):

    setsize = 1000
    # Now start prediction:

    package_directory = os.path.dirname(os.path.abspath(__file__))
    CONFIGURATION.gold_mapping = os.path.join(
        package_directory, '..', '..', 'data', 'sap_hilti_data',
        'sap_hilti_full_strings', 'hq_sap_hilti_gold_stratified.csv')
    positive_samples, negative_samples, combined_samples, combined_samples_ids = batch_prepare_data_from_graph(
        graph1, graph2, CONFIGURATION.gold_mapping)
    positive_samples, negative_samples, combined_samples = extend_features(
        positive_samples), extend_features(negative_samples), extend_features(
            combined_samples)
    non_trivial_matches_ids = extract_non_trivial_matches(
        graph1, graph2, combined_samples_ids, CONFIGURATION.src_properties,
        CONFIGURATION.tgt_properties, combined_samples)
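    # combined_samples holds the pairwise feature vectors and combined_samples_ids the matching
    # (src_id, tgt_id) pairs; non_trivial_matches_ids presumably marks pairs that plain string
    # comparison of the configured properties would not already resolve.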

    combined_samples.to_csv(CONFIGURATION.rundir + "strcombined.csv")
    # pd.merge(pd.merge(non_trivial_matches_ids, combined_samples_ids, left_on=['src_id','tgt_id'], right_on=['src_id','tgt_id'], how='inner', indicator=False),
    #         combined_samples, right_index=True, left_index=True).drop(['src_id','tgt_id'], axis=1).to_csv(CONFIGURATION.rundir+"snon_trivials.csv")
    combined_samples_ids.to_csv(CONFIGURATION.rundir + "strcombined_ids.csv")
    negative_samples.to_csv(CONFIGURATION.rundir + "strnegatives.csv")
    positive_samples.to_csv(CONFIGURATION.rundir + "strpositives.csv")

    CONFIGURATION.log("\n\n")
    CONFIGURATION.log(
        "#####################################################\n")
    CONFIGURATION.log("#" + CONFIGURATION.name + " / " + str(model) + "\n")
    CONFIGURATION.log(
        "-----------------------------------------------------\n")

    # Split into feature matrix X and label column Y; evaluation below uses stratified
    # 5-fold cross-validation rather than a single train/test split.
    X = pd.DataFrame(combined_samples.loc[:,
                                          combined_samples.columns != 'label'])
    #X = pd.concat([X, combined_samples_ids], axis=1, sort=False)
    Y = pd.DataFrame(combined_samples.loc[:, 'label'])

    from sklearn import metrics
    cv = StratifiedKFold(n_splits=5, random_state=None, shuffle=True)
    per = cross_validate(model,
                         X,
                         Y,
                         cv=cv,
                         scoring=('f1_micro', 'f1_macro', 'precision',
                                  'recall'),
                         return_train_score=True)
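    # cross_validate returns a dict with one array of per-fold scores per metric
    # ('test_f1_macro', 'train_f1_macro', ...); the train_* keys are present because
    # return_train_score=True. Averages and standard deviations are logged below.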
    CONFIGURATION.log("F1-macro test\t" +
                      str(np.average(per['test_f1_macro'])) + " +/-" +
                      str(np.std(per['test_f1_macro'])) + "\t" +
                      str(per['test_f1_macro']) + "\n")
    CONFIGURATION.log("F1-macro train\t" +
                      str(np.average(per['train_f1_macro'])) + " +/-" +
                      str(np.std(per['train_f1_macro'])) + "\t" +
                      str(per['train_f1_macro']) + "\n")
    CONFIGURATION.log("F1-micro test:\t" +
                      str(np.average(per['test_f1_micro'])) + " +/-" +
                      str(np.std(per['test_f1_micro'])) + "\t" +
                      str(per['test_f1_micro']) + "\n")
    CONFIGURATION.log("F1-micro train:\t" +
                      str(np.average(per['train_f1_micro'])) + " +/-" +
                      str(np.std(per['train_f1_micro'])) + "\t" +
                      str(per['train_f1_micro']) + "\n")
    CONFIGURATION.log("Precision test:\t" +
                      str(np.average(per['test_precision'])) + " +/-" +
                      str(np.std(per['test_precision'])) + "\t" +
                      str(per['test_precision']) + "\n")
    CONFIGURATION.log("Precision train:\t" +
                      str(np.average(per['train_precision'])) + " +/-" +
                      str(np.std(per['train_precision'])) + "\t" +
                      str(per['train_precision']) + "\n")
    CONFIGURATION.log("Recall test:\t\t" +
                      str(np.average(per['test_recall'])) + " +/-" +
                      str(np.std(per['test_recall'])) + "\t" +
                      str(per['test_recall']) + "\n")
    CONFIGURATION.log("Recall train:\t\t" +
                      str(np.average(per['train_recall'])) + " +/-" +
                      str(np.std(per['train_recall'])) + "\t" +
                      str(per['train_recall']) + "\n")
    from sklearn.model_selection import cross_val_predict
    y_pred = cross_val_predict(model, X, Y, cv=cv)
    y_pred = np.array(y_pred)  #scipy.stats.zscore(np.array(y_pred))
    predictions = [1 if value > 0.5 else 0 for value in y_pred]
    # evaluate predictions

    persisted_predictions = [1 if value > 0.5 else 0 for value in y_pred]
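    # cross_val_predict with its default method='predict' already returns hard class labels,
    # so the 0.5 threshold above merely casts them to 0/1 integers; persisted_predictions is
    # the copy reused for the per-subset reports below.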

    CONFIGURATION.log('\nDataset meta info:\n')
    CONFIGURATION.log('Actual samples ' + str(len(Y)) +
                      ' / Positive samples ' +
                      str(len(Y.loc[Y['label'] == 1])) +
                      ' / Negative samples ' +
                      str(len(Y.loc[Y['label'] == 0])) + '\n')
    CONFIGURATION.log(
        'Predicted samples ' + str(len(Y)) + ' / Positive samples ' +
        str(len(np.where(np.array(persisted_predictions) == 1)[0])) +
        ' / Negative samples ' +
        str(len(np.where(np.array(persisted_predictions) == 0)[0])) + '\n')
    CONFIGURATION.log(
        "#####################################################\n")

    # evaluate predictions
    non_trivials = pd.merge(non_trivial_matches_ids,
                            combined_samples_ids,
                            left_on=['src_id', 'tgt_id'],
                            right_on=['src_id', 'tgt_id'],
                            how='right',
                            indicator=True)
    non_trivials = non_trivials.loc[non_trivials['_merge'] ==
                                    'both'].index.tolist()
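    # The right merge with indicator=True keeps every labelled pair and flags those that also
    # occur in non_trivial_matches_ids ('_merge' == 'both'); their positional indices restrict
    # the following report to the non-trivial subset.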
    #y_test = y_test['label']
    target_names = ['neg', 'pos']  # index 0 -> label 0 (negative), index 1 -> label 1 (positive)
    CONFIGURATION.log("Report+:" + str(
        classification_report(Y.loc[Y.index.isin(non_trivials)],
                              np.array(persisted_predictions)[non_trivials],
                              target_names=target_names)) + "\n")
    CONFIGURATION.log('\nDataset meta info:\n')
    CONFIGURATION.log(
        'Actual samples ' + str(len(non_trivials)) + ' / Positive samples ' +
        str(len(np.where(
            np.array(Y.loc[Y.index.isin(non_trivials)]) == 1)[0])) +
        ' / Negative samples ' +
        str(len(np.where(
            np.array(Y.loc[Y.index.isin(non_trivials)]) == 0)[0])) + '\n')
    CONFIGURATION.log(
        'Predicted samples ' + str(len(non_trivials)) +
        ' / Positive samples ' +
        str(len(np.where(np.array(persisted_predictions)[non_trivials] == 1)[0])) +
        ' / Negative samples ' +
        str(len(np.where(np.array(persisted_predictions)[non_trivials] == 0)[0])) +
        '\n')

    # Schema correspondence predictions
    # In the following code segment, schema correspondences are predicted using the instance-matching model.
    # However, this method is not recommended, as the model is (most likely) primarily or only trained on
    # instance-correspondences.
    '''schema_data, schema_data_ids = get_schema_data_from_graph(graph1, graph2)
    schema_data = extend_features(schema_data)
    y_pred = model.predict(schema_data)
    y_pred = scipy.stats.zscore(np.array(y_pred))
    predictions = [1 if value > 0 else 0 for value in y_pred]
    schema_predicted = pd.concat([pd.DataFrame({"prediction": predictions}), schema_data_ids], axis=1, sort=False)
    schema_predicted.to_csv(index=False, path_or_buf=package_directory + "/../../predicted_data.csv", header=False)
    pd.options.display.max_colwidth = 100
    pd.set_option('display.max_colwidth', -1)
    CONFIGURATION.log("\nschema matches predicted with ML model:\n")
    schema_predicted = schema_predicted[schema_predicted['prediction'] == 1]
    #CONFIGURATION.log(schema_predicted.to_string()+"\n")'''

    CONFIGURATION.log("\nschema matches predicted with heuristics:\n")
    persisted_predictions = [x == 1 for x in persisted_predictions]
    positive_predictions = combined_samples_ids[persisted_predictions]
    # Count how often each source rdf:type is matched to each target rdf:type among the
    # positively predicted instance pairs.
    correspondence_types = dict()
    for index, row in positive_predictions.iterrows():
        try:
            srckey = str(graph1.elements[row['src_id']].relations[
                'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'].descriptor)
            tgtkey = str(graph2.elements[row['tgt_id']].relations[
                'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'].descriptor)
            if srckey not in correspondence_types:
                correspondence_types[srckey] = dict()
            correspondence_types[srckey][tgtkey] = \
                correspondence_types[srckey].get(tgtkey, 0) + 1
        except Exception:
            # Skip pairs whose resources have no rdf:type relation.
            pass

    # For every source type, report the target type it was matched to most often.
    for srckey, val in correspondence_types.items():
        maxtgtkey = None
        for tgtkey, count in val.items():
            if maxtgtkey is None or count > val[maxtgtkey]:
                maxtgtkey = tgtkey
        CONFIGURATION.log(str(srckey) + " --> " + str(maxtgtkey) + "\n")

    CONFIGURATION.log("\n\n\n")
    print("     --> Evaluated; logs written to " + str(CONFIGURATION.logfile))

    return PipelineDataTuple(
        graph1, graph2
    )  # just return the original graph data; this is assumed to be the final step in the pipeline!