Example #1

    args = parser.parse_args()

    idnt = args.kg_identifier

    if args.kg:
        kg_triples = FileTriplesSource(args.kg,
                                       prefix='http://exp-data.org/',
                                       safe_urls=True)

        kg_indexer = Indexer(endpoint='http://halimede:8890/sparql', identifier=idnt)
        if not kg_indexer.graph_exists():
            kg_indexer.index_triples(kg_triples)

    vos_executer = EndPointKGQueryInterfaceExtended(endpoint='http://halimede:8890/sparql',
                                                    identifiers=[idnt, idnt + '.alltypes'],
                                                    labels_identifier=idnt + '.labels.gt')

    t_entities = EntitiesLabelsFile(args.target_entities)
    # t_entities = EntitiesLabelsFile(args.target_entities, prefix='http://exp-data.org/', safe_urls=True)

    # cls = ['http://clusteringtype#UndergraduateCourse','http://clusteringtype#GraduateCourse']
    # cls=['http://clusteringtype#PublicationByProfessor','http://clusteringtype#PublicationByGraduateStudent']
    # cls=t_entities.get_uniq_labels()
    # labels_indexer = Indexer(endpoint='http://halimede:8890/sparql', identifier=idnt + '.labels.gt')
    cd = PathBasedClustersExplainerExtended(vos_executer,
                                            relation=t_entities.get_relation(),
                                            quality_method='x_coverage',
                                            with_constants=True,
                                            language_bias={'max_length': args.max_length,
                                                           'structure': ExplanationStructure[args.language_structure]})
    # cd.prepare_data(t_entities)

Example #2

clustering_results_as_triples = EntityLabelsToTriples(np.column_stack((target_entities.get_entities(), y_pred)))


# to save clustering results as triples
write_triples(clustering_results_as_triples, os.path.join(experiment_dir, 'clustering.tsv'))

# evaluate clustering using normal measures and add them to methods results
current_method_result.update(
    clms.evaluate(target_entities.get_labels(), clustering_results_as_triples.get_labels(), verbose=True))

########################## Explain #############################
# Note: this is not the perfect interface; I will try to simplify it along the way.

# Interface to the location where the data is indexed (Virtuoso).
query_interface = EndPointKGQueryInterfaceExtended(endpoint='http://tracy:8890/sparql',
                                                   identifiers=['http://yago-expr.org', 'http://yago-expr.org.types'],
                                                   labels_identifier='http://yago-expr.org.labels.m')
# labels_indexer = Indexer(endpoint='http://tracy:8890/sparql', identifier='http://yago-expr.org.labels.m')

# Explaining engine
quality_method = 'x_coverage'
clusters_explainer = PathBasedClustersExplainerExtended(query_interface,
                                                        quality_method=quality_method,
                                                        min_coverage=0.5,
                                                        top=3,
                                                        language_bias={'max_length': 2,
                                                                       'structure': ExplanationStructure.SUBGRAPH})

# index the labels
explanations_dict = clusters_explainer.explain(clustering_results_as_triples,
                                               os.path.join(experiment_dir, 'explanations.txt'))
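
# A quick way to inspect the results (a sketch, not in the original; this
# assumes explain() returns a dict mapping each cluster label to its mined
# explanation rules, as the variable name suggests):
for cluster_label, rules in explanations_dict.items():
    print(cluster_label)
    for rule in rules:
        print('\t%s' % rule)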


# evaluate rule quality
Example #3

    def _get_patterns_with_bindable_args(self, query_descriptions):
        # if the last atom's predicate is a relation with interesting constant objects, check its constants
        patterns_with_bindable_args = []
        for pattern_to_expand in query_descriptions:
            if pattern_to_expand.get_last_atom().predicate in self.relations_with_const_object:
                if is_var(pattern_to_expand.get_dangling_arg()):  # I think this is a redundant check
                    patterns_with_bindable_args.append(pattern_to_expand)
                    # logger.debug(str(level_query_patterns[-1]))
                    # print('Extended to bind constants\n%s' % pattern_to_expand.str_readable())

        return patterns_with_bindable_args


if __name__ == '__main__':
    vos_executer = EndPointKGQueryInterfaceExtended('http://halimede:8890/sparql',
                                                    ['http://yago-expr.org', 'http://yago-expr.org.alltypes'],
                                                    labels_identifier='http://yago-expr.org.art-labels')
    p = DescriptionMinerExtended(vos_executer,
                                 per_pattern_binding_limit=30,
                                 pattern_structure=ExplanationStructure.SUBGRAPH)
    # print(p.mine_iteratively(head=('?x', 'http://execute_aux.org/auxBelongsTo', 'http://execute_aux.org/auxC2'),
    #                          min_coverage=0.4,
    #                          negative_heads=[('?x', 'http://execute_aux.org/auxBelongsTo', 'http://execute_aux.org/auxC0'),
    #                                          ('?x', 'http://execute_aux.org/auxBelongsTo', 'http://execute_aux.org/auxC1'),
    #                                          ('?x', 'http://execute_aux.org/auxBelongsTo', 'http://execute_aux.org/auxC3'),
    #                                          ('?x', 'http://execute_aux.org/auxBelongsTo', 'http://execute_aux.org/auxC4')]))

    ds = p.mine_with_constants(head=Atom('?x', 'http://excute.org/label', 'http://exp-data.org/wordnet_song_107048000'),
                               max_length=2,
                               min_coverage=0.2
                               )
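
    # Inspect the mined descriptions (a sketch, not in the original; this
    # assumes the returned value is an iterable of description objects):
    for d in ds:
        print(d)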
Example #4

def initiate_problem(args, time_now):
    # objective quality
    objective_quality_measure = args.objective_quality
    # kg file
    kg_filepath = args.kg
    # initialize output_dir
    # output_dir = args.output_folder if args.output_folder else os.path.join(args.output_folder, "run_%s" % time_now)
    output_dir = os.path.join(args.output_folder, "run_%s" % time_now)
    embedding_output_dir = os.path.join(output_dir, 'embedding')
    base_embedding_dir = args.embedding_dir
    # encoding_dict_dir = args.encoding_dict_dir
    # embedding_base_model = os.path.join(embedding_dir, 'base')
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    logger.info('Output Dir: %s' % output_dir)
    # Target entities
    data_prefix = args.data_prefix
    target_entities_filepath = args.target_entities
    target_entities = EntitiesLabelsFile(target_entities_filepath,
                                         prefix=data_prefix,
                                         safe_urls=args.data_safe_urls)
    number_of_clusters = args.number_of_clusters if args.number_of_clusters else target_entities.get_num_clusters()
    logger.info('Number of clusters %i' % number_of_clusters)
    # file source to read the kg
    kg_file_source = FileTriplesSource(kg_filepath,
                                       prefix=data_prefix,
                                       safe_urls=args.data_safe_urls)
    # place_holder_triples = PlaceHolderTriplesSource(10, 10, prefix=data_prefix, safe_urls=args.data_safe_urls)
    kg_and_place_holders = kg_file_source  # JointTriplesSource(kg_file_source, place_holder_triples)
    kg_identifier = args.kg_identifier
    labels_identifier = kg_identifier + '_%s.labels' % time_now
    logger.info("KG Identifiers: %s\t%s" % (kg_identifier, labels_identifier))
    # index data if required
    host = args.host
    graph = None
    labels_indexer = None
    if args.index is not None:
        if args.index == 'remote':
            indexer = Indexer(endpoint=host, identifier=kg_identifier)
            labels_indexer = Indexer(endpoint=host,
                                     identifier=labels_identifier)
        else:
            indexer = Indexer(store='memory', identifier=kg_identifier)
            labels_indexer = Indexer(store='memory',
                                     identifier=labels_identifier)

        if args.drop_index or not indexer.graph_exists():
            logger.info("KG will be indexed to %s (%s %s)" %
                        (args.index, args.drop_index, indexer.graph_exists()))
            graph = indexer.index_triples(kg_file_source,
                                          drop_old=args.drop_index)
            logger.info("Done indexing!")
    logger.info("Embedding adapter chosen: %s" % args.embedding_adapter)
    update_data_mode = UpdateDataMode[args.update_data_mode.upper()]
    update_mode = UpdateMode[args.update_mode.upper()]
    iterations_history = args.update_triples_history
    update_lr = args.update_learning_rate
    # executor
    if args.index == 'remote':
        kg_query_interface = EndPointKGQueryInterfaceExtended(
            endpoint=host,
            identifiers=[kg_identifier, kg_identifier + '.types'],
            labels_identifier=labels_identifier)
    else:
        kg_query_interface = RdflibKGQueryInterfaceExtended(
            graphs=[graph, Graph(identifier=labels_identifier)])
    update_subKG = None
    if update_data_mode == UpdateDataMode.ASSIGNMENTS_SUBGRAPH or args.sub_kg or \
            update_data_mode == UpdateDataMode.SUBGRAPH_ONLY:
        logger.info("Generating SubKG for target entities!")
        # reuse a pre-computed context file if one exists
        if args.context_filepath and os.path.exists(args.context_filepath):
            update_subKG = FileTriplesSource(args.context_filepath,
                                             prefix=data_prefix,
                                             safe_urls=args.data_safe_urls)
        else:
            kg_slicer = KGSlicer(kg_query_interface)
            update_subKG = kg_slicer.subgraph(target_entities.get_entities(),
                                              args.update_context_depth)
            # write_triples(update_subKG, os.path.join(output_dir, 'subkg.tsv'))
            if args.context_filepath:
                write_triples(update_subKG, args.context_filepath)

        logger.info("Done Generating SubKG for target entities!")
        if args.sub_kg:  # only use the related subset of the graph
            kg_and_place_holders = update_subKG  # JointTriplesSource(update_subKG, place_holder_triples)
    # Init embedding
    # if args.embedding_adapter == 'ampligraph':

    embedding_adapter = AmpligraphEmbeddingAdapter(
        embedding_output_dir,
        kg_and_place_holders,
        context_subgraph=update_subKG,
        base_model_folder=base_embedding_dir,
        model_name=args.embedding_method,
        update_mode=update_mode,
        update_data_mode=update_data_mode,
        update_params={'lr': update_lr},
        iterations_history=iterations_history,
        seed=args.seed)
    # elif args.embedding_adapter == 'openke_api':
    #     embedding_adapter = OpenKETFEmbeddingAdapter(embedding_output_dir, kg_and_place_holders,
    #                                                  base_model_folder=base_embedding_dir,
    #                                                  kg_encoding_folder=encoding_dict_dir,
    #                                                  model_name=args.embedding_method)
    # else:
    #     raise Exception("Adapter %s not supported!" % args.embedding_adapter)
    aug_relation = DEFUALT_AUX_RELATION
    # relation_full_url('http://execute_aux.org/auxBelongsTo', data_prefix)
    # clusters explaining engine
    clusters_explainer = PathBasedClustersExplainerExtended(
        kg_query_interface,
        labels_indexer=labels_indexer,
        quality_method=objective_quality_measure,
        relation=aug_relation,
        min_coverage=args.expl_c_coverage,
        with_constants=True,
        language_bias={
            'max_length': args.max_length,
            'structure': ExplanationStructure[args.language_structure]
        })
    # Augmentation strategy
    aug_strategy = strategies.get_strategy(
        args.update_strategy,
        kg_query_interface=kg_query_interface,
        quality_method=objective_quality_measure,
        predictions_min_quality=args.prediction_min_q,
        aux_relation=aug_relation)

    explainable_clustering_engine = ExClusteringImpl(
        target_entities,
        embedding_adapter=embedding_adapter,
        clusters_explainer=clusters_explainer,
        augmentation_strategy=aug_strategy,
        clustering_method=args.clustering_method,
        clustering_params={
            'k': number_of_clusters,
            'distance_metric': args.clustering_distance,
            'p': args.cut_prob
        },
        out_dir=output_dir,
        max_iterations=args.max_iterations,
        save_steps=True,
        objective_quality_measure=objective_quality_measure,
        seed=args.seed)
    with open(os.path.join(output_dir, 'cli_args.txt'), 'w') as f:
        json.dump(args.__dict__, f, indent=2)
    return explainable_clustering_engine.run()
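
# How the `args` object above might be assembled for a quick local run.
# This is a sketch, not part of the original: the attribute names are taken
# from what initiate_problem() reads, but all values are illustrative and
# the enum/strategy names marked below are guesses.
import argparse
import time

args = argparse.Namespace(
    objective_quality='x_coverage',
    kg='data/kg.tsv',                      # placeholder path
    output_folder='output',
    embedding_dir=None,
    data_prefix='http://exp-data.org/',
    target_entities='data/targets.tsv',    # placeholder path
    data_safe_urls=True,
    number_of_clusters=None,               # fall back to the labels file
    kg_identifier='http://exp-data.org',
    host='http://localhost:8890/sparql',
    index='memory',                        # avoids needing a SPARQL endpoint
    drop_index=False,
    embedding_adapter='ampligraph',
    embedding_method='TransE',
    update_data_mode='subgraph_only',      # member seen above; others exist
    update_mode='retrain',                 # guessed UpdateMode member
    update_triples_history=1,
    update_learning_rate=0.0005,
    sub_kg=False,
    context_filepath=None,
    update_context_depth=1,
    expl_c_coverage=0.4,
    max_length=2,
    language_structure='SUBGRAPH',         # ExplanationStructure member
    update_strategy='direct',              # guessed strategy name
    prediction_min_q=0.5,
    clustering_method='kmeans',            # guessed method name
    clustering_distance='euclidean',
    cut_prob=0.5,
    max_iterations=10,
    seed=0)

initiate_problem(args, time.strftime('%Y%m%d_%H%M%S'))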
Example #5

current_method_result.update(
    clms.evaluate(target_entities.get_labels(),
                  clustering_results_as_triples.get_labels(),
                  verbose=True))

########################## Explain #############################
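
# kg_triples is not defined in this excerpt; a plausible construction,
# mirroring Example #1 (the file path is a placeholder):
kg_triples = FileTriplesSource('imdb_example.tsv',
                               prefix='http://exp-data.org/',
                               safe_urls=True)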

##### RUN ONLY ONCE ######
kg_indexer = Indexer(store='remote',
                     endpoint='http://badr:8890/sparql',
                     identifier='http://imdb_example.org')
kg_indexer.index_triples(kg_triples, drop_old=False)
############################ End

# Interface to the location where the data is indexed (Virtuoso).
query_interface = EndPointKGQueryInterfaceExtended(
    endpoint='http://badr:8890/sparql',
    identifiers=['http://imdb_example.org', 'http://imdb_example.org.types'],
    labels_identifier='http://imdb_example.org.labels.m')
# labels_indexer = Indexer(endpoint='http://tracy:8890/sparql', identifier='http://yago-expr.org.labels.m')

# Explaining engine
quality_method = 'x_coverage'
clusters_explainer = PathBasedClustersExplainerExtended(
    query_interface,
    quality_method=quality_method,
    min_coverage=0.5,
    top=3,
    language_bias={
        'max_length': 2,
        'structure': ExplanationStructure.SUBGRAPH
    })
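
# The excerpt stops here; mirroring the explain() call in Example #2 above,
# the explanations would then be produced with (output path is a placeholder):
explanations_dict = clusters_explainer.explain(clustering_results_as_triples,
                                               'explanations.txt')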
Example #6

            triples |= triples_level
            processed |= to_process

            # parentheses matter here: '-' binds tighter than '|'
            to_process = (set(map(lambda t: t[0], triples_level)) |
                          set(map(lambda t: t[2], triples_level))) - processed

        triples = np.array([list(t) for t in triples], dtype=object)

        return SimpleTriplesSource(triples, 'subgraph_%i' % hops)


if __name__ == '__main__':

    query_ex = EndPointKGQueryInterfaceExtended(
        endpoint='http://halimede:8890/sparql',
        identifiers=['http://yago-expr.org'])
    kg_sl = KGSlicer(query_ex)
    triples = kg_sl.subgraph(['http://exp-data.org/Everything_Louder'], 3)
    print(triples.size())
    for t in triples:
        print(t)


    # input_data='/GW/D5data-11/gadelrab/yago2018/train2id.txt.all'
    # output_dir='/GW/D5data-11/gadelrab/yago2018/'
    #
    # percentages=range(25,100,50)
    #
    # for p in percentages: