def process() -> tuple:
    X, Y = dataset_helper.get_dataset_cached(graph_cache_file)
    X = graph_helper.get_graphs_only(X)

    # Convert graphs to (adjacency matrix, node labels) tuples, then replace the labels with dummy zeros
    graph_helper.convert_graphs_to_adjs_tuples(X)
    X = [(x, [0] * len(y)) for x, y in X]

    estimator, params = task_helper.get_graph_estimator_and_params(X, Y)
    return ClassificationData(X, Y, estimator, params)

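# Hedged sketch (an assumption, not the repo's convert_graphs_to_adjs_tuples helper): what an
# "(adjacency, node labels)" tuple for a single graph plausibly looks like, built with plain
# networkx, to illustrate the dummy zero labels constructed in process() above.
import networkx as nx

g = nx.Graph()
g.add_edges_from([('a', 'b'), ('b', 'c')])

adj = nx.adjacency_matrix(g)   # sparse adjacency matrix
labels = list(g.nodes())       # ['a', 'b', 'c']
x = (adj, [0] * len(labels))   # node labels replaced by dummy zeros, as in process()

print(adj.shape, x[1])         # (3, 3) [0, 0, 0]
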
def process_dataset(cache_file, label_lookup_file, args):
    dataset = filename_utils.get_dataset_from_filename(cache_file)
    cache_filename = filename_utils.get_filename_only(cache_file, with_extension=False)
    threshold, topn = filename_utils.get_topn_threshold_from_lookupfilename(label_lookup_file)

    result_file = cache_file.replace(
        dataset, 'relabeled_threshold_{}_topn_{}_{}'.format(threshold, topn, dataset))

    if not args.force and os.path.exists(result_file):
        return

    with open(label_lookup_file, 'rb') as f:
        label_lookup = pickle.load(f)

    X, Y = dataset_helper.get_dataset_cached(cache_file)
    X = graph_helper.get_graphs_only(X)

    # Collect the node labels that occur rarely enough to be renamed
    node_labels = list(chain.from_iterable([x.nodes() for x in X]))
    unique_labels = set(node_labels)
    counter = collections.Counter(node_labels)

    node_labels_to_be_renamed = set([
        label for label, occurrences in counter.items()
        if occurrences <= args.max_occurrence
    ])

    # Restrict the lookup to those rare labels
    lookup_ = {
        label: new_label
        for label, new_label in label_lookup.items()
        if label in node_labels_to_be_renamed
    }

    new_labels = set(lookup_.values())

    # Also pick up every label in the lookup whose target is one of the new labels
    lookup__ = collections.defaultdict(list)
    for label, new_label in label_lookup.items():
        if new_label in new_labels:
            lookup__[label].append(new_label)

    lookup_ = dict(lookup_, **lookup__)

    LOGGER.info(
        '{:80} topn={:4} threshold={:4}\n\t\t#relabeled labels: {}\n\t\t#unique labels: {}\n\t\t#nodes: {}'
        .format(cache_filename, topn, threshold, len(lookup_), len(unique_labels), len(node_labels)))

    relabel_trans = transformers.RelabelGraphsTransformer(lookup_)
    X = relabel_trans.transform(X)

    with open(result_file, 'wb') as f:
        pickle.dump((X, Y), f)

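# Hedged sketch (standalone, standard library only): the rare-label filtering used in
# process_dataset above, shown in isolation. The labels, lookup, and max_occurrence values
# below are made-up examples, not taken from any real dataset.
import collections

node_labels = ['cat', 'cat', 'cat', 'dog', 'dog', 'feline', 'canine']
label_lookup = {'feline': 'cat', 'canine': 'dog', 'cat': 'cat'}
max_occurrence = 1  # only labels occurring at most once are candidates for relabeling

counter = collections.Counter(node_labels)
rare_labels = {label for label, occurrences in counter.items() if occurrences <= max_occurrence}
restricted_lookup = {old: new for old, new in label_lookup.items() if old in rare_labels}

print(rare_labels)        # e.g. {'feline', 'canine'} (set order varies)
print(restricted_lookup)  # {'feline': 'cat', 'canine': 'dog'}
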
def process() -> tuple:
    K, Y = dataset_helper.get_dataset_cached(gram_cache_file, check_validity=False)

    estimator = sklearn.pipeline.Pipeline([('classifier', None)])
    # Parameter values are wrapped in lists so the dict can be used as a grid,
    # consistent with the other param dicts in this module
    params = dict(
        classifier=[sklearn.svm.SVC()],
        classifier__kernel=['precomputed'],
        classifier__class_weight=['balanced']
    )
    return ClassificationData(K, Y, estimator, params)

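# Hedged sketch (not from this code base): how scikit-learn consumes a precomputed Gram
# matrix, which is what the 'precomputed' kernel setting above relies on. The toy features
# and labels are made up, and a linear kernel stands in for the cached graph kernel.
import numpy as np
from sklearn.svm import SVC

rng = np.random.RandomState(0)
features = rng.rand(6, 4)
K = features @ features.T                  # full Gram matrix, shape (n_samples, n_samples)
y = np.array([0, 0, 0, 1, 1, 1])

clf = SVC(kernel='precomputed', class_weight='balanced')
clf.fit(K[:4, :4], y[:4])                  # fit on the train/train block
print(clf.predict(K[4:, :4]))              # predict from the test/train block
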
def process_relabeled():
    X, Y = dataset_helper.get_dataset_cached(graph_cache_file)
    X = graph_helper.get_graphs_only(X)

    estimator, params = task_helper.get_graph_estimator_and_params(X, Y)
    params['graph_preprocessing'] = [transformers.RelabelGraphsTransformer()]
    params['graph_preprocessing__dataset'] = [dataset]
    params['graph_preprocessing__threshold'] = [0.99]
    params['graph_preprocessing__topn'] = [10]

    return ClassificationData(X, Y, estimator, params)

def process_dataset(dataset_name, args, embedding_models):
    print('\tdataset: {:20} - Processing'.format(dataset_name))
    results = {}

    used_models = (embedding_models + [
        ('trained', dataset_helper.get_w2v_embedding_for_dataset(dataset_name))
    ]) if args.check_own_embeddings else embedding_models

    all_graph_cache_files = [
        x for x in dataset_helper.get_all_cached_graph_datasets()
        if x.endswith('{}.npy'.format(dataset_name))
    ]

    # Pick at most one 'gml' and one 'all' graph cache file
    graph_cache_files = []
    found_all_cache = False
    found_gml_cache = False

    for cache_file in all_graph_cache_files:
        # ...
        if len(graph_cache_files) == 2:
            break
        if (not found_gml_cache and 'gml' in cache_file) or (not found_all_cache and 'all' in cache_file):
            found_all_cache = found_all_cache or 'all' in cache_file
            found_gml_cache = found_gml_cache or 'gml' in cache_file
            graph_cache_files.append(cache_file)

    if len(graph_cache_files) != 2:
        print('\tdataset: {:20} - Found: gml: {}, all: {}'.format(dataset_name, found_gml_cache, found_all_cache))

    for model_name, model in used_models:
        results[model_name] = {}
        print('\tdataset: {:20} - Model: {}'.format(dataset_name, model_name))

        for graph_cache_file in graph_cache_files:
            print('\tdataset: {:20} - Graph: {}'.format(dataset_name, graph_cache_file))

            X, Y = dataset_helper.get_dataset_cached(graph_cache_file)
            labels = graph_helper.get_all_node_labels_uniq(X)
            print('\tdataset: {:20} - #unique labels: {}'.format(dataset_name, len(labels)))

            # Count how many node labels are covered by the embedding model
            counter = {'found': 0, 'not_found': 0}
            not_found_labels = []
            for label in labels:
                if label in model:
                    counter['found'] += 1
                else:
                    if len(not_found_labels) < 100:
                        not_found_labels.append(label)
                    counter['not_found'] += 1

            print('\tdataset: {:20} - {}, Found: {}%, Missing Labels Sample: {}'.format(
                dataset_name, counter,
                int(100 * counter['found'] / len(labels)),
                not_found_labels[:10]))

            results[model_name][graph_cache_file] = {
                'num_labels': len(labels),
                'counts': counter,
                'not_found_sample': not_found_labels
            }

    print('\tdataset: {:20} - Finished'.format(dataset_name))
    return results

def test_convert_graph_datasets(self):
    for graph_dataset, dataset_name in self.iterate_graph_cache_datasets():
        X, Y = dataset_helper.get_dataset_cached(graph_dataset)
        self.assertTrue(len(X))
        self.assertTrue(len(Y))

        graph_helper.convert_graphs_to_adjs_tuples(X)

        for x in X:
            self.assertTrue(isinstance(x, tuple))
            self.assertTrue(isinstance(x[0], scipy.sparse.spmatrix))
            self.assertTrue(isinstance(x[1], list))
            break

def get_combined_text_graph_dataset(graph_cache_file, use_ana=False) -> typing.Tuple[typing.List[typing.Tuple], typing.List]:
    dataset_name = filename_utils.get_dataset_from_filename(graph_cache_file)

    X_text, Y_text = dataset_helper.get_dataset(dataset_name + ('-ana' if use_ana else ''))
    X_graph, Y_graph = dataset_helper.get_dataset_cached(graph_cache_file)

    # Same length and no per-document ID attached to the graphs: align texts and graphs by position
    if len(X_graph) == len(X_text) and (not isinstance(X_graph[0], tuple) or not isinstance(X_graph[0][1], str)):
        return list(zip(X_graph, X_text, [None] * len(X_graph))), Y_graph

    # Map each class to its texts; the per-class index serves as the document ID
    class_2_id = collections.defaultdict(list)
    for x, y in zip(X_text, Y_text):
        class_2_id[y].append(x)

    X_combined, Y_combined = [], Y_graph
    for (x_graph, y_id), y_graph in zip(X_graph, Y_graph):
        y_id = int(y_id)
        X_combined.append((x_graph, class_2_id[y_graph][y_id], y_id))

    return X_combined, Y_combined

def process_dataset(dataset_name, args):
    LOGGER.info('{:15} - Start'.format(dataset_name))

    LOGGER.info('{:15} - Retrieving trained embedding'.format(dataset_name))
    pre_trained_embedding = embeddings.get_embedding_model(
        args.pre_trained_embedding,
        binary=False,
        first_line_header=True,
        with_gensim=True)

    try:
        trained_embedding = dataset_helper.get_w2v_embedding_for_dataset(dataset_name)
    except FileNotFoundError as e:
        LOGGER.exception(e)
        return

    cmap_cache_files = dataset_helper.get_all_cached_graph_datasets(
        dataset_name=dataset_name, graph_type=constants.TYPE_CONCEPT_MAP)

    coo_cache_files = [
        x for x in dataset_helper.get_all_cached_graph_datasets(
            dataset_name=dataset_name, graph_type=constants.TYPE_COOCCURRENCE)
        if 'all' in x
    ]

    if not len(cmap_cache_files) or not len(coo_cache_files):
        return

    used_graphs = [cmap_cache_files[0], coo_cache_files[0]]

    LOGGER.info('{:15} - Retrieving dataset'.format(dataset_name))
    all_labels = set()
    for graph_cache_file in used_graphs:
        X, _ = dataset_helper.get_dataset_cached(graph_cache_file)
        X = graph_helper.get_graphs_only(X)
        all_labels |= graph_helper.get_all_node_labels_uniq(X, as_sorted_list=False)

    LOGGER.info('{:15} - Resolving embeddings'.format(dataset_name))
    embeddings_pre_trained, not_found_pre_trained_coreferenced, not_found_trained, not_found_pre_trained, lookup, similar_els = \
        embeddings.get_embeddings_for_labels_with_lookup(all_labels, trained_embedding, pre_trained_embedding)

    LOGGER.info('{:15} - Missing'.format(dataset_name))
    for label, s in [('trained', not_found_trained),
                     ('pre_trained', not_found_pre_trained),
                     ('after_coreference', not_found_pre_trained_coreferenced)]:
        LOGGER.info('\t{:20} {:>6}'.format(label, len(s)))

    embedding_file = '{}/{}.w2v.txt'.format(args.embeddings_result_folder, dataset_name)
    embeddings.save_embedding_dict(embeddings_pre_trained, embedding_file)
    embeddings_pre_trained = embeddings.load_word2vec_format(fname=embedding_file, binary=False)

    LOGGER.info('{:15} - Co-reference resolution'.format(dataset_name))
    max_topn = max(args.topn)
    similar_labels = coreference.get_most_similar_labels(all_labels, embeddings_pre_trained, max_topn)

    for topn in args.topn:
        for threshold in args.merge_threshold:
            LOGGER.info('{:15} - Co-reference resolution: topn: {}, threshold: {}'.format(
                dataset_name, topn, threshold))

            clique_lookup = coreference.create_label_cliques_by_similarity(
                similar_labels, threshold=threshold, topn=topn)
            new_lookup = embeddings.merge_lookups(clique_lookup, lookup)

            with open('{}/{}.threshold-{}.topn-{}.label-lookup.npy'.format(
                    args.embeddings_result_folder, dataset_name, threshold, topn), 'wb') as f:
                pickle.dump(new_lookup, f)

    LOGGER.info('{:15} - Finished'.format(dataset_name))

def process():
    X, Y = dataset_helper.get_dataset_cached(graph_cache_file)

    estimator, params = text_pipeline.get_params(reduced=False)
    estimator.steps.insert(0, ('graph_to_text', GraphToTextTransformer()))
    params = dict(params, graph_to_text__use_edges=[True, False])

    return ClassificationData(X, Y, estimator, params)

def process() -> tuple:
    X, Y = dataset_helper.get_dataset_cached(graph_cache_file)
    X = graph_helper.get_graphs_only(X)

    estimator, params = task_helper.get_graph_estimator_and_params(X, Y, with_node_weights=True)
    return ClassificationData(X, Y, estimator, params)

def process_graph_cache_file(graph_cache_file, args):
    graph_cache_filename = graph_cache_file.split('/')[-1].rsplit('.')[0]
    dataset = filename_utils.get_dataset_from_filename(graph_cache_file)

    if '.phi.' in graph_cache_filename or not filter_utils.file_should_be_processed(
            graph_cache_filename, args.include_filter, args.exclude_filter, args.limit_dataset):
        return

    LOGGER.info('{:15} starting ({})'.format(dataset, graph_cache_filename))

    fast_wl_trans = FastWLGraphKernelTransformer(
        h=args.wl_h, use_early_stopping=False, truncate_to_highest_label=False)

    try:
        phi_graph_cache_file = graph_cache_file.replace('.npy', '.phi.npy')

        X_graphs, Y = dataset_helper.get_dataset_cached(graph_cache_file)
        X_graphs = graph_helper.get_graphs_only(X_graphs)

        # Kernel: WL
        if args.use_wl:
            used_phi_graph_cache_file = phi_graph_cache_file
            splitted_phi_graph_cache_file = phi_graph_cache_file.replace('.phi', '.splitted.phi')
            phi_same_label_graph_cache_file = phi_graph_cache_file.replace(
                dataset, '{}_same-label'.format(dataset)).replace('.phi', '.splitted.phi')

            # Stop here if all files have already been created
            if not args.force and np.all([
                    os.path.exists(x) for x in [
                        splitted_phi_graph_cache_file, used_phi_graph_cache_file,
                        phi_same_label_graph_cache_file
                    ]
            ]):
                return

            X_, Y_ = np.array(np.copy(X_graphs)), np.array(np.copy(Y))

            if args.wl_sort_classes:
                X_, Y_ = sort(X_, Y_, by=Y_)

            num_vertices = len(graph_helper.get_all_node_labels(X_))
            fast_wl_trans.set_params(phi_dim=num_vertices)

            X_train, X_test, Y_train, Y_test = sklearn.model_selection.train_test_split(
                np.copy(X_), np.copy(Y_), stratify=Y_, test_size=args.wl_test_size)

            X_train, Y_train = sort(X_train, Y_train, by=Y_train)
            X_test, Y_test = sort(X_test, Y_test, by=Y_test)

            # Splitted version
            if args.force or not os.path.exists(splitted_phi_graph_cache_file):
                t = sklearn.base.clone(fast_wl_trans).set_params(same_label=True)
                phi_train = t.fit_transform(np.copy(X_train))
                phi_test = t.transform(np.copy(X_test))

                with open(splitted_phi_graph_cache_file, 'wb') as f:
                    pickle.dump((phi_train, phi_test, X_train, X_test, Y_train, Y_test), f)

            # Splitted, same label
            if args.force or not os.path.exists(phi_same_label_graph_cache_file):
                t = sklearn.base.clone(fast_wl_trans)
                phi_train = t.fit_transform(X_train)
                phi_test = t.transform(X_test)

                with open(phi_same_label_graph_cache_file, 'wb') as f:
                    pickle.dump((phi_train, phi_test, X_train, X_test, Y_train, Y_test), f)

            # Whole dataset
            if args.force or not os.path.exists(used_phi_graph_cache_file):
                t = sklearn.base.clone(fast_wl_trans)

                with open(used_phi_graph_cache_file, 'wb') as f:
                    pickle.dump((t.fit_transform(X_), Y_), f)

        # Kernel: spgk
        if args.use_spgk:
            for depth in args.spgk_depth:
                spgk_graph_cache_file = graph_cache_file.replace('.npy', '.spgk-{}.gram.npy'.format(depth))

                if args.force or not os.path.exists(spgk_graph_cache_file):
                    K = spgk.transform(X_graphs, depth=depth)

                    with open(spgk_graph_cache_file, 'wb') as f:
                        pickle.dump((K, Y), f)

    except Exception as e:
        LOGGER.exception(e)

    LOGGER.info('{:15} finished ({})'.format(dataset, graph_cache_filename))

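# Hedged sketch: the `sort` helper used in process_graph_cache_file is project-specific and not
# defined here. A minimal stand-in, assuming it simply co-sorts samples and labels by the `by`
# array so that samples of the same class end up adjacent before the WL phi matrices are built:
import numpy as np

def sort(X, Y, by):
    order = np.argsort(by, kind='stable')
    return X[order], Y[order]

X_demo = np.array(['g0', 'g1', 'g2', 'g3'])
Y_demo = np.array([1, 0, 1, 0])
print(sort(X_demo, Y_demo, by=Y_demo))  # graphs ordered g1, g3, g0, g2; labels [0, 0, 1, 1]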