예제 #1
0
def main():
    """Run the "OAEI_visualization" experiment on the OAEI data set.

    Builds a single pipeline over the darkscape (source) and
    oldschoolrunescape (target) graphs: each graph is loaded with
    ``load_kg_with_rdflib_ttl_interface`` and passed to
    ``GraphToolbox.interface``; both are then fed to
    ``WalkEmbedder_1.interface``, followed by ``concat_combiner``,
    ``TSNEInterface``, ``EmbeddingSaver`` and the four visualizer steps.
    The pipeline is wrapped in a ``Configuration`` and executed through a
    ``ConfigurationHandler``.
    """
    # Every input file of this experiment lives in the same directory.
    oaei_dir = os.path.join(package_directory, '..', 'data', 'oaei_data')

    src_triples = os.path.join(oaei_dir, 'graph_triples_darkscape.nt')
    tgt_triples = os.path.join(oaei_dir, 'graph_triples_oldschoolrunescape.nt')
    src_corpus = os.path.join(oaei_dir, 'corpus_darkscape.txt')
    tgt_corpus = os.path.join(oaei_dir, 'corpus_oldschoolrunescape.txt')
    gold_mapping = InternalGoldStandard({
        'trainsets': [os.path.join(oaei_dir, 'oaei_gold_standard2.csv')],
        'testsets': [os.path.join(oaei_dir, 'possible_matches.csv')],
    })
    dim = 2

    # Source and target properties are both read from the same label file;
    # the file is read twice so each side gets its own result object.
    labelfile = os.path.join(oaei_dir, 'labels.txt')
    src_properties = StringMatcher_Interface.get_labels_from_file(labelfile)
    tgt_properties = StringMatcher_Interface.get_labels_from_file(labelfile)

    name = "OAEI_visualization"
    pipeline = Pipeline()

    # Branch A: source graph.
    line_a = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None,
                                  PipelineDataTuple(src_triples))
    line_a = pipeline.append_step(GraphToolbox.interface,
                                  PipelineDataTuple(line_a),
                                  PipelineDataTuple(src_triples))

    # Branch B: target graph.
    line_b = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None,
                                  PipelineDataTuple(tgt_triples))
    line_b = pipeline.append_step(GraphToolbox.interface,
                                  PipelineDataTuple(line_b),
                                  PipelineDataTuple(tgt_triples))

    # Joint branch: embed both graphs, combine, then TSNE and save.
    line_ab = pipeline.append_step(WalkEmbedder_1.interface,
                                   PipelineDataTuple(line_a, line_b),
                                   PipelineDataTuple(dim, 'steps', True, 1))
    line_ab = pipeline.append_step(concat_combiner.interface,
                                   PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(TSNEInterface.interface,
                                   PipelineDataTuple(line_ab),
                                   PipelineDataTuple(2))
    line_ab = pipeline.append_step(EmbeddingSaver.interface,
                                   PipelineDataTuple(line_ab), None)

    # The visualizers are chained one after another in this fixed order.
    for visualizer in (CategoriesVisualizer, StratifiedVisualizer,
                       TypeVisualizer, FullVisualizer):
        line_ab = pipeline.append_step(visualizer.interface,
                                       PipelineDataTuple(line_ab), None)

    configuration = Configuration(name, src_corpus, tgt_corpus, src_triples,
                                  tgt_triples, gold_mapping, dim, pipeline,
                                  src_properties, tgt_properties,
                                  calc_PLUS_SCORE=False, use_cache=False,
                                  use_streams=False)
    configuration_handler = ConfigurationHandler()
    configuration_handler.execute(configuration)
예제 #2
0
def main():
    """Run the "jaccard_no_props_given" experiment on the HILTI full-strings data.

    Builds one pipeline over the HILTI ERP (source) and web (target)
    graphs, using ``W2V_1InterfaceWrapper``, ``muse``, ``EmbeddingMatcher``
    (with an ``XGBClassifier``) and the four visualizer steps, then executes
    it through a ``ConfigurationHandler``. Here ``gold_mapping`` is the plain
    path of the training CSV.
    """
    # Every input file of this experiment lives in the same directory.
    data_dir = os.path.join(package_directory, '..', 'data', 'sap_hilti_data',
                            'sap_hilti_full_strings')

    src_triples = os.path.join(data_dir, 'graph_triples_hilti_erp.nt')
    tgt_triples = os.path.join(data_dir, 'graph_triples_hilti_web.nt')
    src_corpus = os.path.join(data_dir, 'corpus_hilti_erp.txt')
    tgt_corpus = os.path.join(data_dir, 'corpus_hilti_web.txt')
    gold_mapping = os.path.join(data_dir, 'train_simple_sap_hilti.csv')

    dim = 3
    model = XGBClassifier()
    src_properties = [
        "http://rdata2graph.sap.com/hilti_erp/property/mara_fert.maktx"
    ]
    tgt_properties = [
        "http://rdata2graph.sap.com/hilti_web/property/products.name"
    ]

    name = "jaccard_no_props_given"
    pipeline = Pipeline()

    # Branch A: source graph.
    line_a = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None,
                                  PipelineDataTuple(src_triples))
    line_a = pipeline.append_step(GraphToolbox.interface,
                                  PipelineDataTuple(line_a),
                                  PipelineDataTuple(src_triples))

    # Branch B: target graph.
    line_b = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None,
                                  PipelineDataTuple(tgt_triples))
    line_b = pipeline.append_step(GraphToolbox.interface,
                                  PipelineDataTuple(line_b),
                                  PipelineDataTuple(tgt_triples))

    line_ab = pipeline.append_step(W2V_1InterfaceWrapper.interface,
                                   PipelineDataTuple(line_a, line_b),
                                   PipelineDataTuple(dim))
    # NOTE(review): this step takes (line_a, line_b) as input, so the handle
    # returned by the W2V step above is overwritten without being consumed.
    # Presumably PipelineDataTuple(line_ab) was intended - confirm against
    # the Pipeline semantics before changing.
    line_ab = pipeline.append_step(muse.interface,
                                   PipelineDataTuple(line_a, line_b),
                                   PipelineDataTuple(gold_mapping))
    line_ab = pipeline.append_step(EmbeddingMatcher.interface,
                                   PipelineDataTuple(line_ab),
                                   PipelineDataTuple(model))
    # NOTE(review): same pattern - the visualizer is wired to
    # (line_a, line_b) instead of the matcher output in line_ab. Verify
    # whether this is intentional.
    line_ab = pipeline.append_step(StratifiedVisualizer.interface,
                                   PipelineDataTuple(line_a, line_b), None)
    line_ab = pipeline.append_step(TypeVisualizer.interface,
                                   PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(CategoriesVisualizer.interface,
                                   PipelineDataTuple(line_ab), None)
    line_ab = pipeline.append_step(FullVisualizer.interface,
                                   PipelineDataTuple(line_ab), None)

    configuration = Configuration(name, src_corpus, tgt_corpus, src_triples,
                                  tgt_triples, gold_mapping, dim, pipeline,
                                  src_properties, tgt_properties)
    configuration_handler = ConfigurationHandler()
    configuration_handler.execute(configuration)
예제 #3
0
def main():
    """Run the HILTI ``balanced_walks`` experiment series, one after another.

    Nine named experiments are executed in a fixed order. Each one builds a
    fresh ``Pipeline`` with the same prefix (load both graphs, pass each to
    ``GraphToolbox.interface``, embed both, ``concat_combiner``) and then
    appends its own downstream steps. All experiments share the same input
    files, gold standard, ``XGBClassifier`` instance, property lists and the
    ``dim`` value handed to ``Configuration``.
    """
    # Every input file of this experiment series lives in the same directory.
    data_dir = os.path.join(package_directory, '..', 'data', 'sap_hilti_data',
                            'balanced_walks')

    src_triples = os.path.join(data_dir, 'graph_triples_hilti_erp.nt')
    tgt_triples = os.path.join(data_dir, 'graph_triples_hilti_web.nt')
    src_corpus = os.path.join(data_dir, 'corpus_hilti_erp.txt')
    tgt_corpus = os.path.join(data_dir, 'corpus_hilti_web.txt')
    gold_mapping = InternalGoldStandard({
        'trainsets': [os.path.join(data_dir, 'final_trainset.csv')],
        'testsets': [os.path.join(data_dir, 'possible_matches.csv')],
    })
    dim = 20
    model = XGBClassifier()
    src_properties = [
        "http://rdata2graph.sap.com/hilti_erp/property/mara_fert.maktx"
    ]
    tgt_properties = [
        "http://rdata2graph.sap.com/hilti_web/property/products.name"
    ]

    def run_experiment(name, embedder, embedder_args, downstream_steps):
        """Build and execute one experiment pipeline.

        Args:
            name: Experiment name passed to ``Configuration``.
            embedder: Step callable that receives both graph branches.
            embedder_args: Positional values wrapped in a
                ``PipelineDataTuple`` for the embedder step.
            downstream_steps: Iterable of ``(step, args)`` pairs appended
                after ``concat_combiner``. ``args`` is either ``None`` or a
                tuple of values that is wrapped in a ``PipelineDataTuple``.
        """
        pipeline = Pipeline()

        # Branch A: source graph.
        line_a = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None,
                                      PipelineDataTuple(src_triples))
        line_a = pipeline.append_step(GraphToolbox.interface,
                                      PipelineDataTuple(line_a),
                                      PipelineDataTuple(src_triples))

        # Branch B: target graph.
        line_b = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None,
                                      PipelineDataTuple(tgt_triples))
        line_b = pipeline.append_step(GraphToolbox.interface,
                                      PipelineDataTuple(line_b),
                                      PipelineDataTuple(tgt_triples))

        # Joint branch: embed both graphs and combine the result.
        line_ab = pipeline.append_step(embedder,
                                       PipelineDataTuple(line_a, line_b),
                                       PipelineDataTuple(*embedder_args))
        line_ab = pipeline.append_step(concat_combiner.interface,
                                       PipelineDataTuple(line_ab), None)

        # Experiment-specific tail; each step consumes the previous one.
        for step, step_args in downstream_steps:
            line_ab = pipeline.append_step(
                step,
                PipelineDataTuple(line_ab),
                None if step_args is None else PipelineDataTuple(*step_args))

        # ``dim`` is always the outer value (20), even for experiments whose
        # embedder is given a different dimension.
        configuration = Configuration(name,
                                      src_corpus,
                                      tgt_corpus,
                                      src_triples,
                                      tgt_triples,
                                      gold_mapping,
                                      dim,
                                      pipeline,
                                      src_properties,
                                      tgt_properties,
                                      calc_PLUS_SCORE=False,
                                      use_cache=False,
                                      use_streams=False)
        # A new handler is created for every experiment.
        configuration_handler = ConfigurationHandler()
        configuration_handler.execute(configuration)

    run_experiment("HILTI_pure_syntax",
                   WalkEmbedder_1.interface,
                   (1, 'steps', False, 1),
                   [(PureSyntaxMatcher.interface, (model,)),
                    (EmbeddingSaver.interface, None),
                    (StableRankSyntaxMatcher.interface, None)])

    # Downstream steps shared by all FlatMatcher-based experiments below.
    flat_matcher_steps = [(FlatMatcher.interface, (model,)),
                          (EmbeddingSaver.interface, None),
                          (StableRankMatcher.interface, None)]

    run_experiment("HILTI_w2v_steps_walklength1",
                   WalkEmbedder_1.interface,
                   (dim, 'steps', False, 1),
                   flat_matcher_steps)

    # NOTE(review): despite the "_muse" suffix no muse step is part of this
    # pipeline; it is configured identically to the previous experiment.
    run_experiment("HILTI_w2v_steps_walklength1_muse",
                   WalkEmbedder_1.interface,
                   (dim, 'steps', False, 1),
                   flat_matcher_steps)

    run_experiment("HILTI_w2v_steps_walklength3",
                   WalkEmbedder_1.interface,
                   (dim, 'steps', False, 3),
                   flat_matcher_steps)

    run_experiment("HILTI_w2v_steps_walklength1_3grams",
                   WalkEmbedder_1.interface,
                   (dim, 'steps', True, 1),
                   flat_matcher_steps)

    run_experiment("HILTI_w2v_batch_walklength1",
                   WalkEmbedder_1.interface,
                   (dim, 'batch', False, 1),
                   flat_matcher_steps)

    run_experiment("HILTI_w2v_steps_walklength1_dim100",
                   WalkEmbedder_1.interface,
                   (100, 'steps', False, 1),
                   flat_matcher_steps)

    # NOTE(review): as above, the "_muse" suffix is not reflected by a muse
    # step in the pipeline.
    run_experiment("HILTI_d2v_steps_walklength1_muse",
                   WalkD2V_1Embedder.interface,
                   (dim, 'steps', False, 1),
                   flat_matcher_steps)

    run_experiment("HILTI_visualization",
                   WalkEmbedder_1.interface,
                   (2, 'steps', False, 1),
                   [(TSNEInterface.interface, (2,)),
                    (EmbeddingSaver.interface, None),
                    (CategoriesVisualizer.interface, None),
                    (StratifiedVisualizer.interface, None),
                    (TypeVisualizer.interface, None),
                    (FullVisualizer.interface, None)])
예제 #4
0
def main_ngram_string():
    """Run the XGBoost matching experiments on the SAP/Hilti 3-gram data.

    Six pipelines are built and executed one after another. They differ only
    in how the source and target graphs are embedded:

    * ``SimpleTriplesEmbedder``, ``W2VInterfaceWrapper`` and
      ``D2VInterfaceWrapper`` are applied to each graph separately; the two
      results are combined with ``concat_combiner`` before matching.
    * ``SimpleTriplesEmbedder_1``, ``W2V_1InterfaceWrapper`` and
      ``D2V_1InterfaceWrapper`` receive both graphs in one step.

    Every pipeline then appends a ``FlatMatcher`` step (all six share one
    ``XGBClassifier`` instance), ``TSNEInterface`` parameterised with ``2``,
    the four visualizers and ``EmbeddingSaver``. Each pipeline is executed
    exactly once through a fresh ``ConfigurationHandler``.
    """
    data_dir = os.path.join(package_directory, '..', 'data', 'sap_hilti_data',
                            'sap_hilti_3grams')
    src_triples = os.path.join(data_dir, 'graph_triples_hilti_erp.nt')
    tgt_triples = os.path.join(data_dir, 'graph_triples_hilti_web.nt')
    src_corpus = os.path.join(data_dir, 'corpus_hilti_erp.txt')
    tgt_corpus = os.path.join(data_dir, 'corpus_hilti_web.txt')
    gold_mapping = InternalGoldStandard({
        'trainsets': [
            os.path.join(data_dir, 'train_simple_sap_hilti.csv'),
            os.path.join(data_dir, 'train_hard_sap_hilti.csv'),
        ],
        'testsets': [
            os.path.join(data_dir, 'test_simple_sap_hilti.csv'),
            os.path.join(data_dir, 'test_hard_sap_hilti.csv'),
        ],
    })
    dim = 20
    model = XGBClassifier()
    src_properties = [
        "http://rdata2graph.sap.com/hilti_erp/property/mara_fert.maktx"
    ]
    tgt_properties = [
        "http://rdata2graph.sap.com/hilti_web/property/products.name"
    ]

    # The two orders in which the joint pipelines chain their output steps.
    categories_first = (CategoriesVisualizer.interface,
                        StratifiedVisualizer.interface,
                        TypeVisualizer.interface,
                        FullVisualizer.interface,
                        EmbeddingSaver.interface)
    categories_third = (StratifiedVisualizer.interface,
                        TypeVisualizer.interface,
                        CategoriesVisualizer.interface,
                        FullVisualizer.interface,
                        EmbeddingSaver.interface)

    def _load_graph(pipeline, triples):
        """Append the rdflib loading step and the GraphToolbox step."""
        line = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None,
                                    PipelineDataTuple(triples))
        return pipeline.append_step(GraphToolbox.interface,
                                    PipelineDataTuple(line),
                                    PipelineDataTuple(triples))

    def _read_sentences(pipeline, line, corpus):
        """Append the ReadSentencesInterfaceWrapper step for one corpus."""
        return pipeline.append_step(ReadSentencesInterfaceWrapper.interface,
                                    PipelineDataTuple(line),
                                    PipelineDataTuple(corpus))

    def _embed(pipeline, interface, *lines):
        """Append an embedding step over ``lines`` parameterised by ``dim``."""
        return pipeline.append_step(interface, PipelineDataTuple(*lines),
                                    PipelineDataTuple(dim))

    def _chain(pipeline, line, interfaces):
        """Append parameterless steps, each fed by the previous one."""
        for interface in interfaces:
            line = pipeline.append_step(interface, PipelineDataTuple(line),
                                        None)
        return line

    def _match_separate(pipeline, line_a, line_b):
        """Append matching and output steps for separately embedded graphs."""
        line_ab = pipeline.append_step(concat_combiner.interface,
                                       PipelineDataTuple(line_a, line_b),
                                       None)
        # The matcher's output line is not consumed by any later step: the
        # visualizers below are fed from the per-graph t-SNE lines instead.
        pipeline.append_step(FlatMatcher.interface,
                             PipelineDataTuple(line_ab),
                             PipelineDataTuple(model))
        line_a = pipeline.append_step(TSNEInterface.interface,
                                      PipelineDataTuple(line_a),
                                      PipelineDataTuple(2))
        line_b = pipeline.append_step(TSNEInterface.interface,
                                      PipelineDataTuple(line_b),
                                      PipelineDataTuple(2))
        line_ab = pipeline.append_step(CategoriesVisualizer.interface,
                                       PipelineDataTuple(line_a, line_b),
                                       None)
        _chain(pipeline, line_ab, categories_first[1:])

    def _match_joint(pipeline, line_ab, output_steps):
        """Append matching, t-SNE and output steps for a joint embedding."""
        line_ab = pipeline.append_step(FlatMatcher.interface,
                                       PipelineDataTuple(line_ab),
                                       PipelineDataTuple(model))
        line_ab = pipeline.append_step(TSNEInterface.interface,
                                       PipelineDataTuple(line_ab),
                                       PipelineDataTuple(2))
        _chain(pipeline, line_ab, output_steps)

    def _execute(name, pipeline):
        """Wrap ``pipeline`` in a Configuration and execute it once."""
        configuration = Configuration(name, src_corpus, tgt_corpus,
                                      src_triples, tgt_triples, gold_mapping,
                                      dim, pipeline, src_properties,
                                      tgt_properties)
        configuration_handler = ConfigurationHandler()
        configuration_handler.execute(configuration)

    pipeline = Pipeline()
    line_a = _embed(pipeline, SimpleTriplesEmbedder.interface,
                    _load_graph(pipeline, src_triples))
    line_b = _embed(pipeline, SimpleTriplesEmbedder.interface,
                    _load_graph(pipeline, tgt_triples))
    _match_separate(pipeline, line_a, line_b)
    _execute("3gram: simpletriplesembedding xgb", pipeline)

    pipeline = Pipeline()
    line_a = _load_graph(pipeline, src_triples)
    line_b = _load_graph(pipeline, tgt_triples)
    line_ab = _embed(pipeline, SimpleTriplesEmbedder_1.interface, line_a,
                     line_b)
    _match_joint(pipeline, line_ab, categories_first)
    _execute("3gram: simpletriplesembedding_1 xgb", pipeline)

    pipeline = Pipeline()
    line_a = _read_sentences(pipeline, _load_graph(pipeline, src_triples),
                             src_corpus)
    line_a = _embed(pipeline, W2VInterfaceWrapper.interface, line_a)
    line_b = _read_sentences(pipeline, _load_graph(pipeline, tgt_triples),
                             tgt_corpus)
    line_b = _embed(pipeline, W2VInterfaceWrapper.interface, line_b)
    _match_separate(pipeline, line_a, line_b)
    _execute("3gram: w2v xgb", pipeline)

    pipeline = Pipeline()
    line_a = _embed(pipeline, D2VInterfaceWrapper.interface,
                    _load_graph(pipeline, src_triples))
    line_b = _embed(pipeline, D2VInterfaceWrapper.interface,
                    _load_graph(pipeline, tgt_triples))
    _match_separate(pipeline, line_a, line_b)
    _execute("3gram: d2v xgb", pipeline)

    # NOTE: a "3gram: pseudod2v xgb" variant built on
    # PseudoD2VInterfaceWrapper used to sit here as commented-out code.

    pipeline = Pipeline()
    line_a = _read_sentences(pipeline, _load_graph(pipeline, src_triples),
                             src_corpus)
    line_b = _read_sentences(pipeline, _load_graph(pipeline, tgt_triples),
                             tgt_corpus)
    line_ab = _embed(pipeline, W2V_1InterfaceWrapper.interface, line_a,
                     line_b)
    _match_joint(pipeline, line_ab, categories_third)
    _execute("3gram: W2V_1 xgb", pipeline)

    pipeline = Pipeline()
    line_a = _load_graph(pipeline, src_triples)
    line_b = _load_graph(pipeline, tgt_triples)
    line_ab = _embed(pipeline, D2V_1InterfaceWrapper.interface, line_a,
                     line_b)
    _match_joint(pipeline, line_ab, categories_third)
    _execute("3gram: D2V_1 xgb", pipeline)
예제 #5
0
def main():
    """Run six W2V/D2V matching experiments on the full-string SAP/Hilti data.

    Three per-graph embeddings are evaluated (W2V followed by D2V and ``concat_combiner``, W2V alone, D2V alone),
    each once with a ``muse`` step in front of ``FlatMatcher`` and once without it. All experiments share one
    ``XGBClassifier`` instance and hand the same ``results.log`` path to ``FlatMatcher`` and ``Configuration``;
    a log file left over from an earlier run is deleted before the first experiment starts.
    """
    logfile = os.path.join(package_directory, '..', 'results.log')
    try:
        os.remove(logfile)
    except OSError:
        # Best effort only: usually there is simply no log from an earlier run. Catching OSError instead of
        # using a bare ``except`` keeps KeyboardInterrupt/SystemExit from being swallowed here.
        pass

    data_dir = os.path.join(package_directory, '..', 'data', 'sap_hilti_data', 'sap_hilti_full_strings')
    src_triples = os.path.join(data_dir, 'graph_triples_hilti_erp.nt')
    tgt_triples = os.path.join(data_dir, 'graph_triples_hilti_web.nt')
    src_corpus = os.path.join(data_dir, 'corpus_hilti_erp.txt')
    tgt_corpus = os.path.join(data_dir, 'corpus_hilti_web.txt')
    gold_mapping = os.path.join(data_dir, 'sap_hilti_gold.csv')
    dim = 20
    model = XGBClassifier()

    def _load_graph(pipeline, triples):
        """Append the rdflib loading step and the GraphToolbox step for one triples file."""
        line = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None, PipelineDataTuple(triples))
        return pipeline.append_step(GraphToolbox.interface, PipelineDataTuple(line), PipelineDataTuple(triples))

    def _w2v(pipeline, line, corpus):
        """Append the sentence-reading step for ``corpus`` followed by the W2V step."""
        line = pipeline.append_step(ReadSentencesInterfaceWrapper.interface, PipelineDataTuple(line),
                                    PipelineDataTuple(corpus))
        return pipeline.append_step(W2VInterfaceWrapper.interface, PipelineDataTuple(line), PipelineDataTuple(dim))

    def _d2v(pipeline, line, corpus):
        """Append the D2V step; ``corpus`` is accepted only so all embedders share one signature."""
        return pipeline.append_step(D2VInterfaceWrapper.interface, PipelineDataTuple(line), PipelineDataTuple(dim))

    def _w2v_d2v_concat(pipeline, line, corpus):
        """Append W2V, then D2V on its output, then ``concat_combiner`` on the D2V output."""
        line = _d2v(pipeline, _w2v(pipeline, line, corpus), corpus)
        return pipeline.append_step(concat_combiner.interface, PipelineDataTuple(line), None)

    def _run_experiment(name, embed, use_muse):
        """Build the pipeline for one experiment and execute it."""
        pipeline = Pipeline()
        line_a = embed(pipeline, _load_graph(pipeline, src_triples), src_corpus)
        line_b = embed(pipeline, _load_graph(pipeline, tgt_triples), tgt_corpus)
        if use_muse:
            line_ab = pipeline.append_step(muse.interface, PipelineDataTuple(line_a, line_b),
                                           PipelineDataTuple(gold_mapping))
            matcher_input = PipelineDataTuple(line_ab)
        else:
            # Without muse the matcher is fed both embedding lines directly.
            matcher_input = PipelineDataTuple(line_a, line_b)
        pipeline.append_step(FlatMatcher.interface, matcher_input,
                             PipelineDataTuple(gold_mapping, model, logfile, name))

        configuration = Configuration(name, src_corpus, tgt_corpus, src_triples, tgt_triples, gold_mapping, logfile,
                                      dim, pipeline)
        configuration_handler = ConfigurationHandler()
        configuration_handler.execute(configuration)

    _run_experiment("w2v d2v concat muse xgb", _w2v_d2v_concat, True)
    _run_experiment("w2v d2v concat xgb", _w2v_d2v_concat, False)
    _run_experiment("w2v muse xgb", _w2v, True)
    _run_experiment("w2v xgb", _w2v, False)
    _run_experiment("d2v muse xgb", _d2v, True)
    _run_experiment("d2v xgb", _d2v, False)
예제 #6
0
def main():
    """Configure and run the "jacc_no_schema_given" experiment.

    All input files are resolved below
    ``<package_directory>/../data/sap_hilti_data/sap_hilti_full_strings``.
    The function builds a ``Pipeline`` (steps are appended in the order
    written here), bundles it with the data paths, gold standard and
    settings into a ``Configuration`` and passes that to
    ``ConfigurationHandler.execute``.
    """
    data_dir = os.path.join(package_directory, '..', 'data', 'sap_hilti_data', 'sap_hilti_full_strings')

    def data_file(filename):
        # Resolve a file name inside the experiment's data directory.
        return os.path.join(data_dir, filename)

    # Knowledge-graph triples and text corpora for both sides.
    src_triples = data_file('graph_triples_hilti_erp.nt')
    tgt_triples = data_file('graph_triples_hilti_web.nt')
    src_corpus = data_file('corpus_hilti_erp.txt')
    tgt_corpus = data_file('corpus_hilti_web.txt')

    # Gold standard: "simple" and "hard" CSV files for training and testing.
    train_files = [data_file('train_simple_sap_hilti.csv'),
                   data_file('train_hard_sap_hilti.csv')]
    test_files = [data_file('test_simple_sap_hilti.csv'),
                  data_file('test_hard_sap_hilti.csv')]
    gold_mapping = InternalGoldStandard({'trainsets': train_files,
                                         'testsets': test_files})

    dim = 20
    model = LogisticRegression()
    # No explicit properties are passed; previously used values were e.g.
    #   ["http://rdata2graph.sap.com/hilti_erp/property/mara_fert.maktx"]
    #   ["http://rdata2graph.sap.com/hilti_web/property/products.name"]
    src_properties = None
    tgt_properties = None

    # Two earlier experiments ("W2V_1 muse xgb with 50k only on embeddings"
    # and "W2V_1 muse xgb with 50k on embeddings and sim") used to live here
    # as commented-out code. Their step chain was: load + GraphToolbox +
    # ReadSentencesInterfaceWrapper per side, then W2V_1InterfaceWrapper,
    # concat_combiner, muse, EmbeddingMatcher and the Categories/Stratified/
    # Type/Full visualizers.

    name = "jacc_no_schema_given"
    pipeline = Pipeline()

    def load_graph_line(triples_path):
        # Append the loader step followed by the GraphToolbox step for one
        # side and return the handle produced by the second step.
        loaded = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None,
                                      PipelineDataTuple(triples_path))
        return pipeline.append_step(GraphToolbox.interface, PipelineDataTuple(loaded),
                                    PipelineDataTuple(triples_path))

    # Disabled optional step per side: ReadSentencesInterfaceWrapper on the
    # respective corpus file.
    src_line = load_graph_line(src_triples)
    tgt_line = load_graph_line(tgt_triples)

    # NOTE(review): this step is appended to the pipeline, but its returned
    # handle is not consumed by any later step -- confirm this is intended.
    pipeline.append_step(PseudoD2V_1InterfaceWrapper_2.interface,
                         PipelineDataTuple(src_line, tgt_line),
                         PipelineDataTuple(dim))
    merged = pipeline.append_step(SimpleTriplesEmbedder_1.interface,
                                  PipelineDataTuple(src_line, tgt_line),
                                  PipelineDataTuple(dim))

    merged = pipeline.append_step(concat_combiner.interface, PipelineDataTuple(merged), None)
    # Disabled optional step: muse.interface with the gold mapping.
    merged = pipeline.append_step(FlatMatcher.interface, PipelineDataTuple(merged),
                                  PipelineDataTuple(model))
    # Disabled optional steps: TSNEInterface (2 components) and the
    # Categories/Stratified/Type/Full visualizers.
    pipeline.append_step(EmbeddingSaver.interface, PipelineDataTuple(merged), None)

    configuration = Configuration(name, src_corpus, tgt_corpus, src_triples, tgt_triples,
                                  gold_mapping, dim, pipeline, src_properties, tgt_properties)
    ConfigurationHandler().execute(configuration)
예제 #7
0
def main():
    """Configure and run the "test" experiment on the balanced-walks data.

    All input files are resolved below
    ``<package_directory>/../data/sap_hilti_data/balanced_walks``. The
    function builds a ``Pipeline`` (steps are appended in the order written
    here), bundles it with the data paths, gold standard, label properties
    and flags into a ``Configuration`` and passes that to
    ``ConfigurationHandler.execute``.
    """
    # An earlier variant of this script pointed at the
    # 'sap_hilti_full_strings' directory instead and used the plain path of
    # 'train_simple_sap_hilti.csv' as gold mapping.
    data_dir = os.path.join(package_directory, '..', 'data', 'sap_hilti_data', 'balanced_walks')

    def data_file(filename):
        # Resolve a file name inside the experiment's data directory.
        return os.path.join(data_dir, filename)

    src_triples = data_file('graph_triples_hilti_erp.nt')
    tgt_triples = data_file('graph_triples_hilti_web.nt')
    src_corpus = data_file('corpus_hilti_erp.txt')
    tgt_corpus = data_file('corpus_hilti_web.txt')

    gold_mapping = InternalGoldStandard({
        'trainsets': [data_file('final_trainset.csv')],
        'testsets': [data_file('possible_matches.csv')],
    })

    dim = 1
    model = LogisticRegression()  # alternative tried before: XGBClassifier()

    # Both sides read their label properties from the same file; the helper
    # is deliberately called once per side, as in the original script.
    # Previously hard-coded examples:
    #   ["http://rdata2graph.sap.com/hilti_erp/property/mara_fert.maktx"]
    #   ["http://rdata2graph.sap.com/hilti_web/property/products.name"]
    labelfile = data_file('labels.txt')
    src_properties = StringMatcher_Interface.get_labels_from_file(labelfile)
    tgt_properties = StringMatcher_Interface.get_labels_from_file(labelfile)
    use_streams = False

    name = "test"
    pipeline = Pipeline()

    # Source side: loader step, then GraphToolbox step.
    # (Disabled optional step: ReadSentencesInterfaceWrapper on src_corpus.)
    src_loaded = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None,
                                      PipelineDataTuple(src_triples))
    line_a = pipeline.append_step(GraphToolbox.interface, PipelineDataTuple(src_loaded),
                                  PipelineDataTuple(src_triples))

    # Target side: same two steps.
    # (Disabled optional step: ReadSentencesInterfaceWrapper on tgt_corpus.)
    tgt_loaded = pipeline.append_step(load_kg_with_rdflib_ttl_interface, None,
                                      PipelineDataTuple(tgt_triples))
    line_b = pipeline.append_step(GraphToolbox.interface, PipelineDataTuple(tgt_loaded),
                                  PipelineDataTuple(tgt_triples))

    # Joint steps operating on both sides.
    walk_args = PipelineDataTuple(dim, 'steps', False, 1)
    line_ab = pipeline.append_step(WalkEmbedder_1.interface,
                                   PipelineDataTuple(line_a, line_b), walk_args)
    line_ab = pipeline.append_step(concat_combiner.interface, PipelineDataTuple(line_ab), None)
    # Disabled optional step: muse.interface with the gold mapping.
    line_ab = pipeline.append_step(FlatMatcher.interface, PipelineDataTuple(line_ab),
                                   PipelineDataTuple(model))
    # Disabled optional steps: TSNEInterface (2 components), the
    # Categories/Stratified/Type/Full visualizers and EmbeddingSaver.
    line_ab = pipeline.append_step(StableRankMatcher.interface, PipelineDataTuple(line_ab), None)

    # The two trailing positional flags (False, True) are passed through
    # unchanged; their meaning is defined by Configuration.
    configuration = Configuration(name, src_corpus, tgt_corpus, src_triples, tgt_triples,
                                  gold_mapping, dim, pipeline, src_properties, tgt_properties,
                                  use_streams, False, True)
    configuration_handler = ConfigurationHandler()
    configuration_handler.execute(configuration)