def construct_feedback(self):
    logger.info("Construct Feedback Triples!")
    output_file = None
    if self.save_steps:
        output_file = os.path.join(self.get_current_itr_directory(), 'feedback_triples.tsv')
    triples = self.augmentation_strategy.get_augmentation_triples(
        descriptions=self.current_itr.clusters_explanations_dict,
        target_entities=self.current_itr.entity_clusters_triples,
        output_file=output_file,
        iter_num=self.current_itr.id)
    if self.save_steps:
        write_triples(triples, output_file)
    logger.info("Done Constructing Feedback Triples!")
    return triples
def cluster(self):
    logger.info("Start clustering")
    entity_vectors = self.current_itr.target_entities_embeddings
    logger.debug(entity_vectors)
    logger.info("size of the data " + str(entity_vectors.shape))
    y_pred = self.clustering_method.cluster(entity_vectors,
                                            clustering_params=self.clustering_params,
                                            output_folder=self.get_current_itr_directory())
    triples = EntityLabelsToTriples(
        np.column_stack((self.target_entities.get_entities(), y_pred.reshape(-1, 1))),
        iter_id=self.current_itr.id)
    if self.save_steps:
        output_file = os.path.join(self.get_current_itr_directory(), 'clustering.tsv')
        output_vecs_file = os.path.join(self.get_current_itr_directory(), 'embeddings_vecs.tsv')
        write_triples(triples, output_file)
        write_vectors(entity_vectors, output_vecs_file)
        output_labels_file = os.path.join(self.get_current_itr_directory(), 'clustering_labels_only.tsv')
        write_vectors(y_pred.reshape(-1, 1), output_labels_file)
    return triples
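# Illustration (a sketch, not part of the class above): the np.column_stack call in
# cluster() simply pairs each entity with its predicted cluster id before the result
# is wrapped in EntityLabelsToTriples. With made-up placeholder URIs:
#
#   entities = np.array(['http://ex.org/e1', 'http://ex.org/e2', 'http://ex.org/e3'])
#   y_pred = np.array([0, 2, 0])
#   np.column_stack((entities, y_pred.reshape(-1, 1)))
#   # -> [['http://ex.org/e1', '0'],
#   #     ['http://ex.org/e2', '2'],
#   #     ['http://ex.org/e3', '0']]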
print(exist)
print(missing[:5])

target_entities_embedding_vectors = model.get_embeddings(target_entities.get_entities())

# Cluster with whatever method (here: plain k-means on the embedding vectors)
km = KMeans(n_clusters=number_of_clusters, n_init=20, n_jobs=8)  # n_jobs: see the compatibility note after this snippet
y_pred = km.fit_predict(target_entities_embedding_vectors)

# Convert the clustering results to triples format
clustering_results_as_triples = EntityLabelsToTriples(
    np.column_stack((target_entities.get_entities(), y_pred)))

# Save the clustering results as triples
write_triples(clustering_results_as_triples, os.path.join(experiment_dir, 'clustering.tsv'))

# Evaluate the clustering with standard measures and add them to the method's results
current_method_result.update(
    clms.evaluate(target_entities.get_labels(), clustering_results_as_triples.get_labels(), verbose=True))

########################## Explain #############################
# Note: this is not the perfect interface; I will try to simplify it along the way.
# Interfaces to the locations where the data is indexed (Virtuoso).
query_interface = EndPointKGQueryInterfaceExtended(
    endpoint='http://tracy:8890/sparql',
    identifiers=['http://yago-expr.org', 'http://yago-expr.org.types'],
    labels_identifier='http://yago-expr.org.labels.m')

# labels_indexer = Indexer(endpoint='http://tracy:8890/sparql', identifier='http://yago-expr.org.labels.m')

# Explaining engine
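# Compatibility note for the KMeans call above (an aside, not part of the original
# script): the `n_jobs` argument was deprecated in scikit-learn 0.23 and removed in
# 1.0, so on current scikit-learn it raises a TypeError. An equivalent call for
# scikit-learn >= 1.0 simply drops it:
#
#   km = KMeans(n_clusters=number_of_clusters, n_init=20)
#   y_pred = km.fit_predict(target_entities_embedding_vectors)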
def initiate_problem(args, time_now):
    # objective quality measure
    objective_quality_measure = args.objective_quality

    # KG file
    kg_filepath = args.kg

    # initialize output_dir
    # output_dir = args.output_folder if args.output_folder else os.path.join(args.output_folder, "run_%s" % time_now)
    output_dir = os.path.join(args.output_folder, "run_%s" % time_now)
    embedding_output_dir = os.path.join(output_dir, 'embedding')
    base_embedding_dir = args.embedding_dir
    # encoding_dict_dir = args.encoding_dict_dir
    # embedding_base_model = os.path.join(embedding_dir, 'base')

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    logger.info('Output Dir: %s' % output_dir)

    # Target entities
    data_prefix = args.data_prefix
    target_entities_filepath = args.target_entities
    target_entities = EntitiesLabelsFile(target_entities_filepath, prefix=data_prefix,
                                         safe_urls=args.data_safe_urls)
    number_of_clusters = args.number_of_clusters if args.number_of_clusters else target_entities.get_num_clusters()
    logger.info('Number of clusters %i' % number_of_clusters)

    # file source to read the KG
    kg_file_source = FileTriplesSource(kg_filepath, prefix=data_prefix, safe_urls=args.data_safe_urls)
    # place_holder_triples = PlaceHolderTriplesSource(10, 10, prefix=data_prefix, safe_urls=args.data_safe_urls)
    kg_and_place_holders = kg_file_source  # JointTriplesSource(kg_file_source, place_holder_triples)

    kg_identifier = args.kg_identifier
    labels_identifier = kg_identifier + '_%s.labels' % time_now
    logger.info("KG Identifiers: %s\t%s" % (kg_identifier, labels_identifier))

    # index the data if required
    host = args.host
    graph = None
    labels_indexer = None
    if args.index is not None:
        if args.index == 'remote':
            indexer = Indexer(endpoint=host, identifier=kg_identifier)
            labels_indexer = Indexer(endpoint=host, identifier=labels_identifier)
        else:
            indexer = Indexer(store='memory', identifier=kg_identifier)
            labels_indexer = Indexer(store='memory', identifier=labels_identifier)
        if args.drop_index or not indexer.graph_exists():
            logger.info("KG will be indexed to %s (%s %s)" % (args.index, args.drop_index, indexer.graph_exists()))
            graph = indexer.index_triples(kg_file_source, drop_old=args.drop_index)
            logger.info("Done indexing!")

    logger.info("Embedding adapter chosen: %s" % args.embedding_adapter)

    update_data_mode = UpdateDataMode[args.update_data_mode.upper()]
    update_mode = UpdateMode[args.update_mode.upper()]
    iterations_history = args.update_triples_history
    update_lr = args.update_learning_rate

    # query executor
    if args.index == 'remote':
        kg_query_interface = EndPointKGQueryInterfaceExtended(
            endpoint=host,
            identifiers=[kg_identifier, kg_identifier + '.types'],
            labels_identifier=labels_identifier)
    else:
        kg_query_interface = RdflibKGQueryInterfaceExtended(
            graphs=[graph, Graph(identifier=labels_identifier)])

    update_subKG = None
    if update_data_mode == UpdateDataMode.ASSIGNMENTS_SUBGRAPH or args.sub_kg or \
            update_data_mode == UpdateDataMode.SUBGRAPH_ONLY:
        logger.info("Generating SubKG for target entities!")
        if args.context_filepath and os.path.exists(args.context_filepath):
            # reuse a previously exported context subgraph
            update_subKG = FileTriplesSource(args.context_filepath, prefix=data_prefix,
                                             safe_urls=args.data_safe_urls)
        else:
            kg_slicer = KGSlicer(kg_query_interface)
            update_subKG = kg_slicer.subgraph(target_entities.get_entities(), args.update_context_depth)
            # write_triples(update_subKG, os.path.join(output_dir, 'subkg.tsv'))
            if args.context_filepath:
                write_triples(update_subKG, args.context_filepath)
        logger.info("Done Generating SubKG for target entities!")

        if args.sub_kg:
            # only use the related subset of the graph
            kg_and_place_holders = update_subKG  # JointTriplesSource(update_subKG, place_holder_triples)

    # Init embedding
    # if args.embedding_adapter == 'ampligragh':
    embedding_adapter = AmpligraphEmbeddingAdapter(
        embedding_output_dir,
        kg_and_place_holders,
        context_subgraph=update_subKG,
        base_model_folder=base_embedding_dir,
        model_name=args.embedding_method,
        update_mode=update_mode,
        update_data_mode=update_data_mode,
        update_params={'lr': update_lr},
        iterations_history=iterations_history,
        seed=args.seed)
    # elif args.embedding_adapter == 'openke_api':
    #     embedding_adapter = OpenKETFEmbeddingAdapter(embedding_output_dir, kg_and_place_holders,
    #                                                  base_model_folder=base_embedding_dir,
    #                                                  kg_encoding_folder=encoding_dict_dir,
    #                                                  model_name=args.embedding_method)
    # else:
    #     raise Exception("Adapter %s not supported!" % args.embedding_adapter)

    aug_relation = DEFUALT_AUX_RELATION  # relation_full_url('http://execute_aux.org/auxBelongsTo', data_prefix)

    # clusters explaining engine
    clusters_explainer = PathBasedClustersExplainerExtended(
        kg_query_interface,
        labels_indexer=labels_indexer,
        quality_method=objective_quality_measure,
        relation=aug_relation,
        min_coverage=args.expl_c_coverage,
        with_constants=True,
        language_bias={
            'max_length': args.max_length,
            'structure': ExplanationStructure[args.language_structure]
        })

    # Augmentation strategy
    aug_strategy = strategies.get_strategy(
        args.update_strategy,
        kg_query_interface=kg_query_interface,
        quality_method=objective_quality_measure,
        predictions_min_quality=args.prediction_min_q,
        aux_relation=aug_relation)

    explainable_clustering_engine = ExClusteringImpl(
        target_entities,
        embedding_adapter=embedding_adapter,
        clusters_explainer=clusters_explainer,
        augmentation_strategy=aug_strategy,
        clustering_method=args.clustering_method,
        clustering_params={
            'k': number_of_clusters,
            'distance_metric': args.clustering_distance,
            'p': args.cut_prob
        },
        out_dir=output_dir,
        max_iterations=args.max_iterations,
        save_steps=True,
        objective_quality_measure=objective_quality_measure,
        seed=args.seed)

    # dump the CLI arguments used for this run
    with open(os.path.join(output_dir, 'cli_args.txt'), 'w') as f:
        json.dump(args.__dict__, f, indent=2)

    return explainable_clustering_engine.run()
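# Hypothetical usage sketch (not part of the original code): how initiate_problem()
# might be driven from a command line. `parse_args()` is an assumed helper that would
# define every CLI flag read above (--kg, --target_entities, --output_folder,
# --kg_identifier, --index, --embedding_method, ...); the timestamp format is also an
# assumption.
#
#   if __name__ == '__main__':
#       from datetime import datetime
#
#       args = parse_args()  # assumed argparse-based parser exposing the attributes used above
#       time_now = datetime.now().strftime('%d-%m-%Y_%H-%M-%S')
#       initiate_problem(args, time_now)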