Example #1
    def construct_feedback(self):
        logger.info("Construct Feedback Triples!")

        output_file = None
        if self.save_steps:
            output_file = os.path.join(self.get_current_itr_directory(), 'feedback_triples.tsv')

        triples = self.augmentation_strategy.get_augmentation_triples(
            descriptions=self.current_itr.clusters_explanations_dict,
            target_entities=self.current_itr.entity_clusters_triples,
            output_file=output_file,
            iter_num=self.current_itr.id)
        
        if self.save_steps:
            write_triples(triples, output_file)

        logger.info("Done Constructing Feedback Triples!")
        return triples
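
For reference, write_triples serializes a triples source to a tab-separated file. A minimal stand-in, assuming each triple exposes subject, predicate, and object attributes (the project's actual helper may differ):

def write_triples_sketch(triples, output_file):
    # Write one <subject, predicate, object> triple per line, tab-separated.
    with open(output_file, 'w') as f:
        for t in triples:
            f.write('%s\t%s\t%s\n' % (t.subject, t.predicate, t.object))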
Example #2
    def cluster(self):
        logger.info("Start clustering")
        entity_vectors = self.current_itr.target_entities_embeddings
        logger.debug(entity_vectors)
        logger.info("size of the data " + str(entity_vectors.shape))

        y_pred = self.clustering_method.cluster(entity_vectors, clustering_params=self.clustering_params,
                                                output_folder=self.get_current_itr_directory())
        triples = EntityLabelsToTriples(np.column_stack((self.target_entities.get_entities(), y_pred.reshape(-1, 1))),
                                        iter_id=self.current_itr.id)

        if self.save_steps:
            output_file = os.path.join(self.get_current_itr_directory(), 'clustering.tsv')
            output_vecs_file = os.path.join(self.get_current_itr_directory(), 'embeddings_vecs.tsv')
            write_triples(triples, output_file)
            write_vectors(entity_vectors, output_vecs_file)
            output_labels_file = os.path.join(self.get_current_itr_directory(), 'clustering_labels_only.tsv')
            write_vectors(y_pred.reshape(-1, 1), output_labels_file)

        return triples
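
Conceptually, EntityLabelsToTriples wraps (entity, cluster_label) pairs as triples linking each entity to its cluster via an auxiliary relation. A hypothetical illustration of the idea (the relation URL is the auxiliary one used in Example #4; the exact output format is an assumption):

import numpy as np

entities = np.array(['e1', 'e2', 'e3'])
y_pred = np.array([0, 1, 0])
# Pair each entity with its cluster label, then emit one triple per entity.
for subject, label in np.column_stack((entities, y_pred.reshape(-1, 1))):
    print((subject, 'http://execute_aux.org/auxBelongsTo', 'cluster_%s' % label))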
Example #3
import os
import numpy as np
from sklearn.cluster import KMeans

# (Excerpt from a larger script: `model`, `target_entities`, `experiment_dir`,
# `clms`, and `current_method_result` are assumed to be defined earlier.
# `exist` and `missing` presumably hold entities found / not found in the KG.)
print(exist)
print(missing[:5])

target_entities_embedding_vectors = model.get_embeddings(target_entities.get_entities())


# Cluster with any method; here, KMeans
# (n_jobs was deprecated in scikit-learn 0.23 and removed in 1.0)
km = KMeans(n_clusters=number_of_clusters, n_init=20, n_jobs=8)
y_pred = km.fit_predict(target_entities_embedding_vectors)

# Convert the clustering results to triples format
clustering_results_as_triples = EntityLabelsToTriples(np.column_stack((target_entities.get_entities(), y_pred)))


# Save the clustering results as triples
write_triples(clustering_results_as_triples, os.path.join(experiment_dir, 'clustering.tsv'))

# Evaluate clustering using standard measures and add them to the method's results
current_method_result.update(
    clms.evaluate(target_entities.get_labels(), clustering_results_as_triples.get_labels(), verbose=True))
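
# clms.evaluate presumably wraps standard clustering measures; an equivalent
# sanity check with scikit-learn would be (a sketch, not the project's own code):
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
gold, pred = target_entities.get_labels(), clustering_results_as_triples.get_labels()
print('ARI: %.3f' % adjusted_rand_score(gold, pred))
print('NMI: %.3f' % normalized_mutual_info_score(gold, pred))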

########################## Explain #############################
# Note: this is not the perfect interface; I will try to simplify it along the way.

# Interface to the locations where the data is indexed (Virtuoso).
query_interface = EndPointKGQueryInterfaceExtended(endpoint='http://tracy:8890/sparql',
                                                   identifiers=['http://yago-expr.org', 'http://yago-expr.org.types'],
                                                   labels_identifier='http://yago-expr.org.labels.m')
# labels_indexer = Indexer(endpoint='http://tracy:8890/sparql', identifier='http://yago-expr.org.labels.m')

# Explaining engine
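# The snippet is cut off here. A sketch of what plausibly follows, mirroring the
# explainer construction shown in Example #4 below (parameter values are assumptions):
clusters_explainer = PathBasedClustersExplainerExtended(query_interface,
                                                        relation='http://execute_aux.org/auxBelongsTo',
                                                        min_coverage=0.5,
                                                        with_constants=True)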
Example #4
def initiate_problem(args, time_now):
    # objective quality
    objective_quality_measure = args.objective_quality
    # kg file
    kg_filepath = args.kg
    # initialize output_dir
    # output_dir = args.output_folder if args.output_folder else os.path.join(args.output_folder, "run_%s" % time_now)
    output_dir = os.path.join(args.output_folder, "run_%s" % time_now)
    embedding_output_dir = os.path.join(output_dir, 'embedding')
    base_embedding_dir = args.embedding_dir
    # encoding_dict_dir = args.encoding_dict_dir
    # embedding_base_model = os.path.join(embedding_dir, 'base')
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    logger.info('Output Dir: %s' % output_dir)
    # Target entities
    data_prefix = args.data_prefix
    target_entities_filepath = args.target_entities
    target_entities = EntitiesLabelsFile(target_entities_filepath,
                                         prefix=data_prefix,
                                         safe_urls=args.data_safe_urls)
    number_of_clusters = args.number_of_clusters or target_entities.get_num_clusters()
    logger.info('Number of clusters %i' % number_of_clusters)
    # file source to read the kg
    kg_file_source = FileTriplesSource(kg_filepath,
                                       prefix=data_prefix,
                                       safe_urls=args.data_safe_urls)
    # place_holder_triples = PlaceHolderTriplesSource(10, 10, prefix=data_prefix, safe_urls=args.data_safe_urls)
    kg_and_place_holders = kg_file_source  #JointTriplesSource(kg_file_source, place_holder_triples)
    kg_identifier = args.kg_identifier
    labels_identifier = kg_identifier + '_%s.labels' % time_now
    logger.info("KG Identifiers: %s\t%s" % (kg_identifier, labels_identifier))
    # index data if required
    host = args.host
    graph = None
    labels_indexer = None
    if args.index is not None:
        if args.index == 'remote':
            indexer = Indexer(endpoint=host, identifier=kg_identifier)
            labels_indexer = Indexer(endpoint=host,
                                     identifier=labels_identifier)
        else:
            indexer = Indexer(store='memory', identifier=kg_identifier)
            labels_indexer = Indexer(store='memory',
                                     identifier=labels_identifier)

        if args.drop_index or not indexer.graph_exists():
            logger.info("KG will be indexed to %s (%s %s)" %
                        (args.index, args.drop_index, indexer.graph_exists()))
            graph = indexer.index_triples(kg_file_source,
                                          drop_old=args.drop_index)
            logger.info("Done indexing!")
    logger.info("Embedding adapter chosen: %s" % args.embedding_adapter)
    update_data_mode = UpdateDataMode[args.update_data_mode.upper()]
    update_mode = UpdateMode[args.update_mode.upper()]
    iterations_history = args.update_triples_history
    update_lr = args.update_learning_rate
    # executor
    if args.index == 'remote':
        kg_query_interface = EndPointKGQueryInterfaceExtended(
            endpoint=host,
            identifiers=[kg_identifier, kg_identifier + '.types'],
            labels_identifier=labels_identifier)
    else:
        kg_query_interface = RdflibKGQueryInterfaceExtended(
            graphs=[graph, Graph(identifier=labels_identifier)])
    update_subKG = None
    if update_data_mode == UpdateDataMode.ASSIGNMENTS_SUBGRAPH or args.sub_kg or \
            update_data_mode == UpdateDataMode.SUBGRAPH_ONLY:
        logger.info("Generating SubKG for target entities!")
        # If a precomputed context file exists, reuse it; otherwise slice the subgraph from the KG
        if args.context_filepath and os.path.exists(args.context_filepath):
            update_subKG = FileTriplesSource(args.context_filepath,
                                             prefix=data_prefix,
                                             safe_urls=args.data_safe_urls)
        else:
            kg_slicer = KGSlicer(kg_query_interface)
            update_subKG = kg_slicer.subgraph(target_entities.get_entities(),
                                              args.update_context_depth)
            # write_triples(update_subKG, os.path.join(output_dir, 'subkg.tsv'))
            if args.context_filepath:
                write_triples(update_subKG, args.context_filepath)

        logger.info("Done Generating SubKG for target entities!")
        if args.sub_kg:  # only use the related subset of the graph
            kg_and_place_holders = update_subKG  #JointTriplesSource(update_subKG, place_holder_triples)
    # Init embedding
    # if args.embedding_adapter == 'ampligraph':

    embedding_adapter = AmpligraphEmbeddingAdapter(
        embedding_output_dir,
        kg_and_place_holders,
        context_subgraph=update_subKG,
        base_model_folder=base_embedding_dir,
        model_name=args.embedding_method,
        update_mode=update_mode,
        update_data_mode=update_data_mode,
        update_params={'lr': update_lr},
        iterations_history=iterations_history,
        seed=args.seed)
    # elif args.embedding_adapter == 'openke_api':
    #     embedding_adapter = OpenKETFEmbeddingAdapter(embedding_output_dir, kg_and_place_holders,
    #                                                  base_model_folder=base_embedding_dir,
    #                                                  kg_encoding_folder=encoding_dict_dir,
    #                                                  model_name=args.embedding_method)
    # else:
    #     raise Exception("Adapter %s not supported!" % args.embedding_adapter)
    aug_relation = DEFUALT_AUX_RELATION
    # relation_full_url('http://execute_aux.org/auxBelongsTo', data_prefix)
    # clusters explaining engine
    clusters_explainer = PathBasedClustersExplainerExtended(
        kg_query_interface,
        labels_indexer=labels_indexer,
        quality_method=objective_quality_measure,
        relation=aug_relation,
        min_coverage=args.expl_c_coverage,
        with_constants=True,
        language_bias={
            'max_length': args.max_length,
            'structure': ExplanationStructure[args.language_structure]
        })
    # Augmentation strategy
    aug_strategy = strategies.get_strategy(
        args.update_strategy,
        kg_query_interface=kg_query_interface,
        quality_method=objective_quality_measure,
        predictions_min_quality=args.prediction_min_q,
        aux_relation=aug_relation)

    explainable_clustering_engine = ExClusteringImpl(
        target_entities,
        embedding_adapter=embedding_adapter,
        clusters_explainer=clusters_explainer,
        augmentation_strategy=aug_strategy,
        clustering_method=args.clustering_method,
        clustering_params={
            'k': number_of_clusters,
            'distance_metric': args.clustering_distance,
            'p': args.cut_prob
        },
        out_dir=output_dir,
        max_iterations=args.max_iterations,
        save_steps=True,
        objective_quality_measure=objective_quality_measure,
        seed=args.seed)
    with open(os.path.join(output_dir, 'cli_args.txt'), 'w') as f:
        json.dump(args.__dict__, f, indent=2)
    return explainable_clustering_engine.run()
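
Example #4 reads a fixed set of attributes from args. A hypothetical driver constructing an argparse-style namespace (all values are placeholders; valid choices for the enum-backed fields come from the project's CLI, and the spellings of the assumed values may differ):

from argparse import Namespace
from datetime import datetime

args = Namespace(
    objective_quality='x_coverage',           # quality measure name (assumed value)
    kg='data/kg.tsv',
    output_folder='out',
    embedding_dir=None,
    data_prefix='http://example.org/',
    target_entities='data/target_entities.tsv',
    data_safe_urls=False,
    number_of_clusters=None,                  # None: infer from the labels file
    kg_identifier='http://example.org/kg',
    host='http://localhost:8890/sparql',
    index='remote',                           # 'remote', 'memory', or None
    drop_index=False,
    embedding_adapter='ampligraph',
    update_data_mode='assignments_subgraph',  # must name an UpdateDataMode member
    update_mode='retrain',                    # must name an UpdateMode member (assumed)
    update_triples_history=1,
    update_learning_rate=0.0005,
    sub_kg=False,
    context_filepath=None,
    update_context_depth=1,
    embedding_method='TransE',
    seed=42,
    expl_c_coverage=0.4,
    max_length=2,
    language_structure='PATH',                # must name an ExplanationStructure member (assumed)
    update_strategy='direct',                 # strategy name (assumed)
    prediction_min_q=0.5,
    clustering_method='kmeans',               # clustering method identifier (assumed)
    clustering_distance='euclidean',
    cut_prob=0.1,
    max_iterations=10)

initiate_problem(args, datetime.now().strftime('%Y%m%d_%H%M%S'))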