def main():
    psi = 256
    t = 100

    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--dataset', type=str, default='ENZYMES')
    parser.add_argument('--crossvalidation', default=False, action='store_true',
                        help='Enable a 10-fold crossvalidation')
    parser.add_argument('--h', type=int, required=False, default=5,
                        help='(Max) number of WL iterations')

    args = parser.parse_args()
    dataset = args.dataset
    h = args.h

    data_path = os.path.join('./data', dataset)
    output_path = os.path.join('output', dataset)
    results_path = os.path.join('results', dataset)

    for path in [output_path, results_path]:
        if not os.path.exists(path):
            os.makedirs(path)

    label_file = os.path.join(data_path, 'Labels.txt')
    y = np.array(read_labels(label_file))

    cv = StratifiedKFold(n_splits=10, shuffle=True)
    accuracy_scores = []

    label_sequences = compute_wl_embeddings_continuous_by_IK(
        data_path, h, t, psi)
    label_sequences = np.array(label_sequences)

    for train_index, test_index in cv.split(label_sequences, y):
        X_train = label_sequences[train_index]
        X_test = label_sequences[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Average the per-vertex embeddings of each graph, then use
        # a linear SVM (liblinear) on the resulting graph vectors.
        X_train = average_graph(X_train, t)
        X_test = average_graph(X_test, t)

        gs = LinearSVC().fit(X_train, y_train)
        y_pred = gs.predict(X_test)

        accuracy_scores.append(accuracy_score(y_test, y_pred))
        print(accuracy_scores)

        if not args.crossvalidation:
            break

    if args.crossvalidation:
        print('Mean 10-fold accuracy: {:2.2f} +- {:2.2f} %'.format(
            np.mean(accuracy_scores) * 100, np.std(accuracy_scores) * 100))
    else:
        print('Final accuracy: {:2.3f} %'.format(
            np.mean(accuracy_scores) * 100))
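# Script entry point (assumed; the original file presumably invokes
# main() when run directly, e.g. `python <script>.py -d ENZYMES --crossvalidation`).
if __name__ == '__main__':
    main()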
def main(args, logger):
    graphs = [ig.read(filename) for filename in args.FILES]
    labels = read_labels(args.labels)

    # Set the label to be uniform over all graphs in case no labels are
    # available. This essentially changes our iteration to degree-based
    # checks.
    for graph in graphs:
        if 'label' not in graph.vs.attributes():
            graph.vs['label'] = [0] * len(graph.vs)

    logger.info('Read {} graphs and {} labels'.format(len(graphs), len(labels)))

    assert len(graphs) == len(labels)

    pwl = PersistentWeisfeilerLehman(
        use_cycle_persistence=args.use_cycle_persistence,
        use_original_features=args.use_original_features,
        use_label_persistence=args.use_persistence_features,
    )

    if args.use_cycle_persistence:
        logger.info('Using cycle persistence')

    y = LabelEncoder().fit_transform(labels)
    X, num_columns_per_iteration = pwl.transform(graphs, args.num_iterations)

    X = StandardScaler().fit_transform(X)
    X = MinMaxScaler().fit_transform(X)

    logger.info('Finished persistent Weisfeiler-Lehman transformation')
    logger.info('Obtained ({} x {}) feature matrix'.format(
        X.shape[0], X.shape[1]))

    num_classes = len(np.bincount(y))

    fig, ax = plt.subplots(nrows=num_classes, ncols=2, sharex=True,
                           sharey=False, squeeze=False)

    for index in range(num_classes):
        ax[index][0].matshow(X[y == index], aspect='auto')
        ax[index][0].set_title(f'Class {index} (features)')
        ax[index][1].matshow(np.mean(X[y == index], axis=0).reshape(1, -1),
                             aspect='auto')
        ax[index][1].set_title(f'Class {index} (mean)')

    plt.show()
def main(args, logger):
    graphs = [ig.read(filename) for filename in args.FILES]
    labels = read_labels(args.labels)

    # Set the label to be uniform over all graphs in case no labels are
    # available. This essentially changes our iteration to degree-based
    # checks.
    for graph in graphs:
        if 'label' not in graph.vs.attributes():
            graph.vs['label'] = [0] * len(graph.vs)

    logger.info('Read {} graphs and {} labels'.format(len(graphs), len(labels)))

    assert len(graphs) == len(labels)

    pwl = PersistentWeisfeilerLehman(
        use_cycle_persistence=args.use_cycle_persistence,
        use_original_features=args.use_original_features,
        metric=args.metric,
        use_label_persistence=True,
    )

    if args.use_cycle_persistence:
        logger.info('Using cycle persistence')

    y = LabelEncoder().fit_transform(labels)

    # This ignores *all* other feature generation methods and falls back
    # to the original Weisfeiler--Lehman subtree kernel.
    if args.use_subtree_features:
        logger.info('Using original subtree features')

        wl_subtree = WeisfeilerLehmanSubtree()
        X, num_columns_per_iteration = \
            wl_subtree.transform(graphs, args.num_iterations)
    else:
        X, num_columns_per_iteration = \
            pwl.transform(graphs, args.num_iterations)

    logger.info('Finished persistent Weisfeiler-Lehman transformation')
    logger.info('Obtained ({} x {}) feature matrix'.format(
        X.shape[0], X.shape[1]))

    np.random.seed(42)
    mean_accuracies = []

    params = [
        'balanced', 'num_iterations', 'filtration', 'use_cycle_persistence',
        'use_original_features', 'use_subtree_features', 'metric'
    ]

    cv_results = []
    entry = {}

    for param in params:
        entry[param] = args.__dict__[param]
    entry['dataset'] = dirname(args.FILES[0]).split('/')[1]

    for i in range(10):
        # Contains accuracy scores for each cross-validation step; the
        # means of this list will be used later on.
        accuracy_scores = []

        cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=i)

        for n, indices in enumerate(cv.split(X, y)):
            entry_fold = copy.copy(entry)
            train_index = indices[0]
            test_index = indices[1]

            pipeline = Pipeline(
                [('fs', FeatureSelector(num_columns_per_iteration)),
                 ('clf', RandomForestClassifier(
                     class_weight='balanced' if args.balanced else None,
                     random_state=42))],
            )

            grid_params = {
                'fs__num_iterations': np.arange(0, args.num_iterations + 1),
                'clf__n_estimators': [25, 50, 100],
            }

            clf = GridSearchCV(pipeline, grid_params,
                               cv=StratifiedKFold(n_splits=5, shuffle=True),
                               iid=False, scoring='accuracy', n_jobs=4)

            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            # TODO: need to discuss whether this is 'allowed' or smart
            # to do; this assumes normality of the attributes.
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

            scaler = MinMaxScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            acc = accuracy_score(y_test, y_pred)
            accuracy_scores.append(acc)

            for param, param_val in clf.best_params_.items():
                entry_fold[param] = param_val
                entry[param] = ''
            entry_fold['fold'] = n + 1
            entry_fold['it'] = i
            entry_fold['acc'] = acc * 100
            entry_fold['std'] = 0.0
            cv_results.append(entry_fold)

            logger.info('Best classifier for this fold: {}'.format(
                clf.best_params_))

        mean_accuracies.append(np.mean(accuracy_scores))
        logger.info(
            ' - Mean 10-fold accuracy: {:2.2f} [running mean over all folds: {:2.2f}]'
            .format(mean_accuracies[-1] * 100, np.mean(mean_accuracies) * 100))

    entry['fold'] = 'all'
    entry['it'] = 'all'
    entry['acc'] = np.mean(mean_accuracies) * 100
    entry['std'] = np.std(mean_accuracies) * 100
    cv_results.append(entry)

    logger.info('Accuracy: {:2.2f} +- {:2.2f}'.format(
        np.mean(mean_accuracies) * 100, np.std(mean_accuracies) * 100))

    if exists(args.result_file):
        with open(args.result_file, 'a') as f:
            pd.DataFrame(cv_results).to_csv(f, index=False, header=None)
    else:
        pd.DataFrame(cv_results).to_csv(args.result_file, index=False)
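# `FeatureSelector` is used in the pipelines above but not defined in this
# file. A minimal sketch of a compatible transformer follows, assuming that
# `num_columns_per_iteration` maps each WL iteration to the number of
# feature columns it contributes and that columns are stored in iteration
# order; the actual implementation may differ.
from sklearn.base import BaseEstimator, TransformerMixin


class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, num_columns_per_iteration, num_iterations=0):
        self.num_columns_per_iteration = num_columns_per_iteration
        self.num_iterations = num_iterations

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # Keep all columns belonging to iterations 0..num_iterations
        n = sum(self.num_columns_per_iteration[it]
                for it in sorted(self.num_columns_per_iteration)
                if it <= self.num_iterations)
        return X[:, :n]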
parser.add_argument('-l', '--labels', type=str,
                    help='Labels file', required=True)
parser.add_argument('-n', '--num-graphs', type=int, required=True,
                    help='Sample size')
parser.add_argument('-o', '--out-dir', type=str, required=True,
                    help='Output directory')

args = parser.parse_args()

labels = read_labels(args.labels)
y = LabelEncoder().fit_transform(labels)
n = len(y)

sss = StratifiedShuffleSplit(n_splits=1, random_state=23,
                             train_size=args.num_graphs)

for train_index, _ in sss.split(range(n), y):
    train_index = sorted(train_index)

files = np.array(args.FILES)
files = files[train_index]

try:
    os.makedirs(args.out_dir)
except OSError:
    # Directory already exists (assumed completion of the truncated try)
    pass
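# The script is truncated here; a plausible completion, assuming the
# sampled graph files are copied verbatim into the output directory
# (`shutil` is an assumed import, not part of the original fragment):
for filename in files:
    shutil.copy(filename, args.out_dir)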
def main(args, logger):
    graphs = [ig.read(filename) for filename in args.FILES]
    labels = read_labels(args.labels)

    # Set the label to be uniform over all graphs in case no labels are
    # available. This essentially changes our iteration to degree-based
    # checks.
    for graph in graphs:
        if 'label' not in graph.vs.attributes():
            graph.vs['label'] = [0] * len(graph.vs)

    logger.info('Read {} graphs and {} labels'.format(len(graphs), len(labels)))

    assert len(graphs) == len(labels)

    # Calculate the graph kernel (edge histogram baseline)
    gram_matrix = gk.CalculateEdgeHistKernel(graphs)

    y = LabelEncoder().fit_transform(labels)

    # np.random.seed(42)
    mean_accuracies = []

    params = ['balanced']
    cv_results = []
    entry = {}

    for param in params:
        entry[param] = args.__dict__[param]
    entry['dataset'] = dirname(args.FILES[0]).split('/')[1]
    entry['baseline'] = 'edge hist kernel'

    for i in range(10):
        # Contains accuracy scores for each cross-validation step; the
        # means of this list will be used later on.
        accuracy_scores = []

        cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=i)

        for n, indices in enumerate(cv.split(graphs, y)):
            entry_fold = copy.copy(entry)
            train_index = indices[0]
            test_index = indices[1]

            pipeline = Pipeline(
                [('clf', SVC(class_weight='balanced' if args.balanced else None,
                             random_state=42, kernel='precomputed'))],
            )

            grid_params = {'clf__C': [1e-1, 1e0, 1e1]}

            # Index the precomputed Gram matrix: rows are the training or
            # test samples, columns are always the training samples.
            X_train = gram_matrix[train_index][:, train_index]
            X_test = gram_matrix[test_index][:, train_index]
            y_train, y_test = y[train_index], y[test_index]

            kgscv = KernelGridSearchCV(
                pipeline,
                param_grid=grid_params,
                cv=cv,
            )
            kgscv.fit(X_train, y_train)

            p = kgscv._best_params
            sc = kgscv._best_score
            clf = kgscv._best_estimator
            clf.fit(X_train, y_train)

            y_pred = clf.predict(X_test)
            acc = accuracy_score(y_test, y_pred)
            accuracy_scores.append(acc)

            for param, param_val in kgscv._best_params.items():
                entry_fold[param] = param_val
                entry[param] = ''
            entry_fold['fold'] = n + 1
            entry_fold['it'] = i
            entry_fold['acc'] = acc * 100
            entry_fold['std'] = 0.0
            cv_results.append(entry_fold)

            logger.info('Best classifier for this fold: {}'.format(
                kgscv._best_params))

        mean_accuracies.append(np.mean(accuracy_scores))
        logger.info(
            ' - Mean 10-fold accuracy: {:2.2f} [running mean over all folds: {:2.2f}]'
            .format(mean_accuracies[-1] * 100, np.mean(mean_accuracies) * 100))

    entry['fold'] = 'all'
    entry['it'] = 'all'
    entry['acc'] = np.mean(mean_accuracies) * 100
    entry['std'] = np.std(mean_accuracies) * 100
    cv_results.append(entry)

    logger.info('Accuracy: {:2.2f} +- {:2.2f}'.format(
        np.mean(mean_accuracies) * 100, np.std(mean_accuracies) * 100))

    if exists(args.result_file):
        with open(args.result_file, 'a') as f:
            pd.DataFrame(cv_results).to_csv(f, index=False, header=None)
    else:
        pd.DataFrame(cv_results).to_csv(args.result_file, index=False)
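# `KernelGridSearchCV` is imported from elsewhere in the repository. A
# minimal sketch of a compatible class follows, assuming a grid search
# over precomputed-kernel submatrices; the attribute names
# (_best_params, _best_score, _best_estimator) match the usage above,
# but the actual implementation may differ.
import numpy as np
from sklearn.base import clone
from sklearn.model_selection import ParameterGrid, StratifiedKFold


class KernelGridSearchCV:
    def __init__(self, estimator, param_grid, cv=None):
        self.estimator = estimator
        self.param_grid = param_grid
        self.cv = cv if cv is not None else StratifiedKFold(n_splits=5)
        self._best_params = None
        self._best_score = -np.inf
        self._best_estimator = None

    def fit(self, K, y):
        for params in ParameterGrid(self.param_grid):
            scores = []
            for train, test in self.cv.split(K, y):
                clf = clone(self.estimator).set_params(**params)
                # Restrict the kernel matrix to the current split
                clf.fit(K[train][:, train], y[train])
                scores.append(clf.score(K[test][:, train], y[test]))
            score = np.mean(scores)
            if score > self._best_score:
                self._best_score = score
                self._best_params = params
                self._best_estimator = clone(self.estimator).set_params(**params)
        return self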
def main(args, logger):
    # Read all graphs and labels; there is no direct way of checking
    # that the labels are 'correct' for the graphs, but at least the
    # code will check that they have the same cardinality.
    graphs = [ig.read(filename) for filename in args.FILES]
    labels = read_labels(args.labels)

    # Simple pre-processing to ensure that all graphs are set up
    # equally.
    #
    # TODO: make this into a shared function?
    for graph in graphs:
        # Set the label to be uniform over all graphs in case no labels
        # are available. This essentially changes our iteration to
        # degree-based checks.
        if 'label' not in graph.vs.attributes():
            graph.vs['label'] = [0] * len(graph.vs)

        # Reset edge weights if they already exist
        if 'weight' in graph.es.attributes():
            graph.es['weight'] = [0] * len(graph.es)

    logger.info('Read {} graphs and {} labels'.format(len(graphs), len(labels)))

    assert len(graphs) == len(labels)

    # Replace selected metric if necessary; this only applies to the
    # uniform metric shortcut.
    if args.use_uniform_metric:
        args.metric = 'uniform'

    pwl = PersistentWeisfeilerLehman(
        use_cycle_persistence=args.use_cycle_persistence,
        use_original_features=args.use_original_features,
        use_label_persistence=True,
        metric=args.metric,
        p=args.power,
        smooth=args.smooth)

    if args.use_cycle_persistence:
        logger.info('Using cycle persistence')

    # Ensures that labels are encoded correctly, regardless of whether
    # they are numerical or not.
    y = LabelEncoder().fit_transform(labels)

    # This ignores *all* other feature generation methods and falls back
    # to the original Weisfeiler--Lehman subtree kernel.
    if args.use_subtree_features:
        logger.info('Using original subtree features')

        wl_subtree = WeisfeilerLehmanSubtree()
        X, num_columns_per_iteration = \
            wl_subtree.transform(graphs, args.num_iterations)
    else:
        X, num_columns_per_iteration = \
            pwl.transform(graphs, args.num_iterations)

    logger.info('Finished persistent Weisfeiler-Lehman transformation')
    logger.info('Obtained ({} x {}) feature matrix'.format(
        X.shape[0], X.shape[1]))

    np.random.seed(42)
    cv = StratifiedKFold(n_splits=10, shuffle=True)
    mean_accuracies = []

    for i in range(10):
        # Contains accuracy scores for each cross-validation step; the
        # means of this list will be used later on.
        accuracy_scores = []

        for train_index, test_index in cv.split(X, y):
            rf_clf = RandomForestClassifier(
                n_estimators=50,
                class_weight='balanced' if args.balanced else None,
                random_state=42)

            if args.grid_search:
                pipeline = Pipeline(
                    [('fs', FeatureSelector(num_columns_per_iteration)),
                     ('clf', rf_clf)],
                )

                grid_params = {
                    'fs__num_iterations': np.arange(0, args.num_iterations + 1),
                    'clf__n_estimators': [10, 20, 50, 100, 150, 200],
                }

                clf = GridSearchCV(pipeline, grid_params,
                                   cv=StratifiedKFold(n_splits=10, shuffle=True),
                                   iid=False, scoring='accuracy', n_jobs=4)
            else:
                clf = rf_clf

            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

            scaler = MinMaxScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            accuracy_scores.append(accuracy_score(y_test, y_pred))

            logger.debug('Best classifier for this fold: {}'.format(clf))

            if args.grid_search:
                logger.debug('Best parameters for this fold: {}'.format(
                    clf.best_params_))
            else:
                logger.debug('Best parameters for this fold: {}'.format(
                    clf.get_params()))

        mean_accuracies.append(np.mean(accuracy_scores))
        logger.info(
            ' - Mean 10-fold accuracy: {:2.2f} [running mean over all folds: {:2.2f}]'
            .format(mean_accuracies[-1] * 100, np.mean(mean_accuracies) * 100))

    logger.info('Accuracy: {:2.2f} +- {:2.2f}'.format(
        np.mean(mean_accuracies) * 100, np.std(mean_accuracies) * 100))
parser.add_argument('--balanced', action='store_true',
                    help='Make random forest classifier balanced')
parser.add_argument('-l', '--labels', type=str,
                    help='Labels file', required=True)
parser.add_argument('-n', '--num-iterations', default=3, type=int,
                    help='Number of Weisfeiler-Lehman iterations')

args = parser.parse_args()

graphs = [ig.read(filename) for filename in args.FILES]
y = np.array(read_labels(args.labels))

wl = WeisfeilerLehman()
label_dicts = wl.fit_transform(graphs, args.num_iterations)

# Each entry in the list represents the label sequence of a single
# graph. The label sequence contains the vertices in its rows, and
# the individual iterations in its columns.
#
# Hence, (i, j) will contain the label of vertex i at iteration j.
label_sequences = [
    np.full((len(graph.vs), args.num_iterations + 1), np.nan)
    for graph in graphs
]

for iteration in sorted(label_dicts.keys()):
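    # The loop body is truncated in the original. A plausible completion,
    # assuming `label_dicts[iteration]` yields the per-vertex labels of
    # each graph at that iteration (a hypothetical layout inferred from
    # the comment above), would be:
    for graph_index in range(len(graphs)):
        labels = label_dicts[iteration][graph_index]
        label_sequences[graph_index][:, iteration] = labels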
def main(args, logger):
    graphs = [ig.read(filename) for filename in args.FILES]
    labels = read_labels(args.labels)

    # Stores *all* vertex labels of the given graphs in order to
    # determine the conversion factor for persistence diagrams.
    vertex_labels = set()

    # Set the label to be uniform over all graphs in case no labels are
    # available. This essentially changes our iteration to degree-based
    # checks.
    for graph in graphs:
        if 'label' not in graph.vs.attributes():
            graph.vs['label'] = [0] * len(graph.vs)

        vertex_labels.update(graph.vs['label'])

    logger.info('Read {} graphs and {} labels'.format(len(graphs), len(labels)))

    assert len(graphs) == len(labels)

    pwl = PersistentWeisfeilerLehman(
        use_cycle_persistence=args.use_cycle_persistence,
        use_original_features=args.use_original_features,
        use_label_persistence=True,
        store_persistence_diagrams=True,
    )

    if args.use_cycle_persistence:
        logger.info('Using cycle persistence')

    y = LabelEncoder().fit_transform(labels)
    X, num_columns_per_iteration = pwl.transform(graphs, args.num_iterations)

    persistence_diagrams = pwl._persistence_diagrams

    fig, ax = plt.subplots(args.num_iterations + 1)

    for iteration in persistence_diagrams.keys():
        M = collections.defaultdict(list)

        # Collect destruction values per class label (`diagram` is used
        # instead of `pd` to avoid shadowing pandas)
        for index, diagram in enumerate(persistence_diagrams[iteration]):
            label = y[index]
            for _, d, _ in diagram:
                M[label].append(d)

        d_min = sys.float_info.max
        d_max = -d_min

        for hist in M.values():
            d_min = min(d_min, min(hist))
            d_max = max(d_max, max(hist))

        bins = np.linspace(d_min, d_max, 10)

        for label, hist in M.items():
            sns.distplot(hist, bins=bins, rug=True, kde=True, hist=False,
                         ax=ax[iteration])

    plt.show()

    L = len(vertex_labels)
    assert L > 0

    original_labels = pwl._original_labels

    # Will store *all* persistence diagrams in the form of a probability
    # distribution.
    M = np.zeros((len(graphs), (args.num_iterations + 1) * L))

    # Will store *all* pairwise distances according to the
    # Jensen--Shannon divergence (JS), or, alternatively,
    # the Kullback--Leibler divergence (KL).
    D_KL = np.zeros((len(graphs), len(graphs)))
    D_JS = np.zeros((len(graphs), len(graphs)))
    D = np.zeros((len(graphs), len(graphs)))

    for iteration in persistence_diagrams.keys():
        M, D_KL, D_JS = make_kernel_matrices(
            persistence_diagrams[iteration],
            original_labels,  # notice that they do *not* change
            L)

        D += D_JS

    D = -D

    fig, ax = plt.subplots(len(set(y)))

    for label in sorted(set(y)):
        ax[label].matshow(M[y == label], aspect='auto', vmin=0, vmax=1)

    plt.show()

    logger.info('Finished persistent Weisfeiler-Lehman transformation')
    logger.info('Obtained ({} x {}) feature matrix'.format(
        X.shape[0], X.shape[1]))

    np.random.seed(42)
    cv = StratifiedKFold(n_splits=10, shuffle=True)
    mean_accuracies = []

    for i in range(10):
        # Contains accuracy scores for each cross-validation step; the
        # means of this list will be used later on.
        accuracy_scores = []

        for train_index, test_index in cv.split(X, y):
            rf_clf = RandomForestClassifier(
                n_estimators=50,
                class_weight='balanced' if args.balanced else None)

            if args.grid_search:
                pipeline = Pipeline(
                    [('fs', FeatureSelector(num_columns_per_iteration)),
                     ('clf', rf_clf)],
                )

                grid_params = {
                    'fs__num_iterations': np.arange(0, args.num_iterations + 1),
                    'clf__n_estimators': [10, 20, 50, 100, 150, 200],
                }

                clf = GridSearchCV(pipeline, grid_params,
                                   cv=StratifiedKFold(n_splits=10, shuffle=True),
                                   iid=False, scoring='accuracy', n_jobs=4)
            else:
                clf = rf_clf

            # NOTE: the classifier above is overridden here; this
            # experiment trains and evaluates a precomputed-kernel SVM
            # on the *full* distance-based kernel matrix instead of the
            # train/test split below.
            clf = SVC(kernel='precomputed')
            clf.fit(D, y)
            y_test = y
            y_pred = clf.predict(D)

            # X_train, X_test = X[train_index], X[test_index]
            # y_train, y_test = y[train_index], y[test_index]

            # TODO: need to discuss whether this is 'allowed' or smart
            # to do; this assumes normality of the attributes.
            # scaler = StandardScaler()
            # X_train = scaler.fit_transform(X_train)
            # X_test = scaler.transform(X_test)

            # scaler = MinMaxScaler()
            # X_train = scaler.fit_transform(X_train)
            # X_test = scaler.transform(X_test)

            # clf.fit(X_train, y_train)
            # y_pred = clf.predict(X_test)

            accuracy_scores.append(accuracy_score(y_test, y_pred))

            logger.debug('Best classifier for this fold: {}'.format(clf))

            if args.grid_search:
                logger.debug('Best parameters for this fold: {}'.format(
                    clf.best_params_))
            else:
                logger.debug('Best parameters for this fold: {}'.format(
                    clf.get_params()))

        mean_accuracies.append(np.mean(accuracy_scores))
        logger.info(
            ' - Mean 10-fold accuracy: {:2.2f} [running mean over all folds: {:2.2f}]'
            .format(mean_accuracies[-1] * 100, np.mean(mean_accuracies) * 100))

    logger.info('Accuracy: {:2.2f} +- {:2.2f}'.format(
        np.mean(mean_accuracies) * 100, np.std(mean_accuracies) * 100))
import utilities

# Decide read/write mode based on python version
read_mode, write_mode = ('r', 'w') if six.PY2 else ('rt', 'wt')

# Set the path to your consolidated files
path = '/Users/chrysovalantis/Documents/UCY/EPL451/Project'
os.chdir(path)

# File names
ftrain = 'train_consolidation.txt'
ftest = 'test_consolidation.txt'
flabel = 'trainLabels.csv'
fsubmission = 'submission.csv'

labels = utilities.read_labels(flabel)

# Dimensions of the training set
ntrain = 10868
nfeature = 16 ** 2 + 1 + 1  # For two_byte_codes, no_que_marks, label

train = utilities.read_train(ntrain, nfeature, labels, ftrain)

X = train[:, :-1]
y = train[:, -1]

del labels
del train

# Parameters for trees
random_state = 5342
n_jobs = 8
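# The script is truncated here; a plausible continuation, assuming the
# goal is to fit a tree ensemble with the parameters defined above (the
# classifier choice and evaluation below are assumptions, not the
# original code):
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

clf = RandomForestClassifier(n_estimators=100,
                             random_state=random_state,
                             n_jobs=n_jobs)
scores = cross_val_score(clf, X, y, cv=5)
print('CV accuracy: {:.4f} +- {:.4f}'.format(scores.mean(), scores.std()))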
def main(args, logger):
    graphs = [ig.read(filename) for filename in args.FILES]
    labels = read_labels(args.labels)

    # Set the label to be uniform over all graphs in case no labels are
    # available. This essentially changes our iteration to degree-based
    # checks.
    for graph in graphs:
        if 'label' not in graph.vs.attributes():
            graph.vs['label'] = [0] * len(graph.vs)

    logger.info('Read {} graphs and {} labels'.format(len(graphs), len(labels)))

    assert len(graphs) == len(labels)

    pwl = PersistentWeisfeilerLehman(
        use_label_persistence=True,
        store_persistence_diagrams=False,  # TODO: might need this later on?
    )

    y = LabelEncoder().fit_transform(labels)
    X, num_columns_per_iteration = pwl.transform(graphs, args.num_iterations)

    logger.info('Finished persistent Weisfeiler-Lehman transformation')
    logger.info('Obtained ({} x {}) feature matrix'.format(
        X.shape[0], X.shape[1]))

    X = to_probability_distribution(X, num_columns_per_iteration)

    np.random.seed(42)
    cv = StratifiedKFold(n_splits=3, shuffle=True)
    mean_accuracies = []

    def product_kernel(X, Y, k):
        # Index for the current iteration; indicates which column is
        # used as the *first* one.
        start_index = 0

        K = np.zeros((X.shape[0], Y.shape[0]))

        for iteration in sorted(num_columns_per_iteration.keys()):
            num_columns = num_columns_per_iteration[iteration]
            end_index = start_index + num_columns

            P = X[:, start_index:end_index]
            Q = Y[:, start_index:end_index]

            # TODO: can this be made more efficient?
            K_iteration = np.array(
                [k(p, q) for p, q in itertools.product(P, Q)]).reshape(
                    X.shape[0], Y.shape[0])

            K += K_iteration
            start_index = end_index

        return K

    def jensen_shannon_kernel(X, Y):
        return product_kernel(X, Y, jensen_shannon)

    def kullback_leibler_kernel(X, Y):
        return -product_kernel(X, Y, kullback_leibler)

    for i in range(3):
        # Contains accuracy scores for each cross-validation step; the
        # means of this list will be used later on.
        accuracy_scores = []

        for train_index, test_index in cv.split(X, y):
            clf = SVC(
                kernel=jensen_shannon_kernel,
            )

            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            # TODO: need to discuss whether this is 'allowed' or smart
            # to do; this assumes normality of the attributes.
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

            scaler = MinMaxScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            accuracy_scores.append(accuracy_score(y_test, y_pred))

            logger.debug('Best classifier for this fold: {}'.format(clf))
            logger.debug('Best parameters for this fold: {}'.format(
                clf.get_params()))

        mean_accuracies.append(np.mean(accuracy_scores))
        logger.info(
            ' - Mean 3-fold accuracy: {:2.2f} [running mean over all folds: {:2.2f}]'
            .format(mean_accuracies[-1] * 100, np.mean(mean_accuracies) * 100))

    logger.info('Accuracy: {:2.2f} +- {:2.2f}'.format(
        np.mean(mean_accuracies) * 100, np.std(mean_accuracies) * 100))
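# The divergence helpers `jensen_shannon` and `kullback_leibler` are not
# defined in this file. Minimal sketches consistent with their use on
# probability vectors follow; the actual implementations may differ
# (e.g. in the base of the logarithm or in how zeros are smoothed).
import numpy as np


def kullback_leibler(p, q, eps=1e-12):
    # KL divergence between two discrete distributions, with a small
    # additive smoothing term to avoid division by zero
    p = np.asarray(p, dtype=float) + eps
    q = np.asarray(q, dtype=float) + eps
    return np.sum(p * np.log(p / q))


def jensen_shannon(p, q):
    # JS divergence: symmetrised KL via the mixture distribution m
    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)
    m = 0.5 * (p + q)
    return 0.5 * kullback_leibler(p, m) + 0.5 * kullback_leibler(q, m)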
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--dataset', type=str,
                        help='Provide the dataset name (MUTAG or ENZYMES)',
                        choices=['MUTAG', 'ENZYMES'])
    parser.add_argument('--crossvalidation', default=False, action='store_true',
                        help='Enable a 10-fold crossvalidation')
    parser.add_argument('--gridsearch', default=False, action='store_true',
                        help='Enable grid search')
    parser.add_argument('--sinkhorn', default=False, action='store_true',
                        help='Use Sinkhorn approximation')
    parser.add_argument('--h', type=int, required=False, default=2,
                        help='(Max) number of WL iterations')

    args = parser.parse_args()
    dataset = args.dataset
    h = args.h
    sinkhorn = args.sinkhorn

    print(f'Generating results for {dataset}...')

    # ---------------------------------
    # Setup
    # ---------------------------------
    # Start by making directories for intermediate and final files
    data_path = os.path.join('../data', dataset)
    output_path = os.path.join('output', dataset)
    results_path = os.path.join('results', dataset)

    for path in [output_path, results_path]:
        if not os.path.exists(path):
            os.makedirs(path)

    # ---------------------------------
    # Embeddings
    # ---------------------------------
    # Load the data and generate the embeddings
    embedding_type = 'continuous' if dataset == 'ENZYMES' else 'discrete'
    print(f'Generating {embedding_type} embeddings for {dataset}.')

    if dataset == 'ENZYMES':
        label_sequences = compute_wl_embeddings_continuous(data_path, h)
    else:
        label_sequences = compute_wl_embeddings_discrete(data_path, h)

    # Save embeddings to output folder
    out_name = f'{dataset}_wl_{embedding_type}_embeddings_h{h}.npy'
    np.save(os.path.join(output_path, out_name), label_sequences)
    print(f'Embeddings for {dataset} computed, saved to '
          f'{os.path.join(output_path, out_name)}.')
    print()

    # ---------------------------------
    # Wasserstein & kernel computations
    # ---------------------------------
    # Run Wasserstein distance computation
    print('Computing the Wasserstein distances...')
    wasserstein_distances = compute_wasserstein_distance(
        label_sequences, h, sinkhorn=sinkhorn,
        discrete=(dataset == 'MUTAG'))

    # Save Wasserstein distance matrices
    for i, D_w in enumerate(wasserstein_distances):
        filext = 'wasserstein_distance_matrix'
        if sinkhorn:
            filext += '_sinkhorn'
        filext += f'_it{i}.npy'
        np.save(os.path.join(output_path, filext), D_w)
    print('Wasserstein distances computation done. Saved to file.')
    print()

    # Transform to kernel; here the flags come into play.
    if args.gridsearch:
        # Gammas for the kernel exp(-gamma * M):
        gammas = np.logspace(-4, 1, num=6)
        # Iterate over the number of WL iterations, too
        hs = range(h)
        param_grid = [
            {'C': np.logspace(-3, 3, num=7)}
        ]
    else:
        gammas = [0.001]
        hs = [h]

    kernel_matrices = []
    kernel_params = []
    for i, current_h in enumerate(hs):
        # Generate the full list of kernel matrices from which to select
        M = wasserstein_distances[current_h]
        for g in gammas:
            K = np.exp(-g * M)
            kernel_matrices.append(K)
            kernel_params.append((current_h, g))

    # Check for no hyperparameters:
    if not args.gridsearch:
        assert len(kernel_matrices) == 1
    print('Kernel matrices computed.')
    print()

    # ---------------------------------
    # Classification
    # ---------------------------------
    # Run the hyperparameter search if needed
    print(f'Running SVMs, crossvalidation: {args.crossvalidation}, '
          f'gridsearch: {args.gridsearch}.')

    # Load labels
    label_file = os.path.join(data_path, 'Labels.txt')
    y = np.array(read_labels(label_file))

    # Contains accuracy scores for each cross-validation step; the
    # means of this list will be used later on.
    accuracy_scores = []
    np.random.seed(42)

    cv = StratifiedKFold(n_splits=10, shuffle=True)

    # Hyperparameter logging
    best_C = []
    best_h = []
    best_gamma = []

    for train_index, test_index in cv.split(kernel_matrices[0], y):
        K_train = [K[train_index][:, train_index] for K in kernel_matrices]
        K_test = [K[test_index][:, train_index] for K in kernel_matrices]
        y_train, y_test = y[train_index], y[test_index]

        # Grid search
        if args.gridsearch:
            gs, best_params = custom_grid_search_cv(
                SVC(kernel='precomputed'), param_grid, K_train, y_train, cv=5)
            # Store best params
            C_ = best_params['params']['C']
            h_, gamma_ = kernel_params[best_params['K_idx']]
            y_pred = gs.predict(K_test[best_params['K_idx']])
        else:
            gs = SVC(C=100, kernel='precomputed').fit(K_train[0], y_train)
            y_pred = gs.predict(K_test[0])
            h_, gamma_, C_ = h, gammas[0], 100

        best_C.append(C_)
        best_h.append(h_)
        best_gamma.append(gamma_)

        accuracy_scores.append(accuracy_score(y_test, y_pred))
        if not args.crossvalidation:
            break

    # ---------------------------------
    # Printing and logging
    # ---------------------------------
    if args.crossvalidation:
        print('Mean 10-fold accuracy: {:2.2f} +- {:2.2f} %'.format(
            np.mean(accuracy_scores) * 100, np.std(accuracy_scores) * 100))
    else:
        print('Final accuracy: {:2.3f} %'.format(
            np.mean(accuracy_scores) * 100))

    # Save to file
    if args.crossvalidation or args.gridsearch:
        extension = ''
        if args.crossvalidation:
            extension += '_crossvalidation'
        if args.gridsearch:
            extension += '_gridsearch'
        results_filename = os.path.join(
            results_path, f'results_{dataset}' + extension + '.csv')
        n_splits = 10 if args.crossvalidation else 1
        pd.DataFrame(
            np.array([best_h, best_C, best_gamma, accuracy_scores]).T,
            columns=['h', 'C', 'gamma', 'accuracy'],
            index=['fold_id{}'.format(i) for i in range(n_splits)]
        ).to_csv(results_filename)
        print(f'Results saved in {results_filename}.')
    else:
        print('No results saved to file as --crossvalidation or '
              '--gridsearch were not selected.')
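# `custom_grid_search_cv` is defined elsewhere in the repository. A
# minimal sketch of a compatible helper follows, assuming it jointly
# searches over the list of precomputed training kernels and the SVC
# parameters and returns the refitted best model together with a dict
# holding 'params' and 'K_idx'; the actual implementation may differ.
import numpy as np
from sklearn.base import clone
from sklearn.model_selection import ParameterGrid, StratifiedKFold


def custom_grid_search_cv(model, param_grid, kernel_matrices, y, cv=5):
    splitter = StratifiedKFold(n_splits=cv, shuffle=True)
    best = {'score': -np.inf, 'params': None, 'K_idx': None}

    for K_idx, K in enumerate(kernel_matrices):
        for params in ParameterGrid(param_grid):
            scores = []
            for train, test in splitter.split(K, y):
                clf = clone(model).set_params(**params)
                clf.fit(K[train][:, train], y[train])
                scores.append(clf.score(K[test][:, train], y[test]))
            score = np.mean(scores)
            if score > best['score']:
                best.update(score=score, params=params, K_idx=K_idx)

    # Refit the best configuration on the full training kernel
    gs = clone(model).set_params(**best['params'])
    gs.fit(kernel_matrices[best['K_idx']], y)
    return gs, best


# Script entry point (assumed; the original file presumably invokes
# main() when run directly).
if __name__ == '__main__':
    main()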
def main(args, logger):
    labels = read_labels(args.labels)

    # Load matrices
    matrices = np.load(args.MATRICES)
    print(f"Loaded {len(list(matrices.keys()))} matrices, "
          f"with shape {matrices['0'].shape}")

    matrix_dict = {}
    for h in matrices.keys():
        matrix_dict[int(h)] = {'gram': matrices[h]}

    y = LabelEncoder().fit_transform(labels)

    np.random.seed(42)
    mean_accuracies = []

    # Kernel parameters appear to be encoded in the matrix file name
    kernel_params = np.array(basename(args.MATRICES)[:-4].split('_'))[1:]
    params = ['dataset', 'max_h', 'sigma']

    cv_results = []
    entry = {}
    for i, param in enumerate(params):
        entry[param] = kernel_params[i]

    for i in range(10):
        # Contains accuracy scores for each cross-validation step; the
        # means of this list will be used later on.
        accuracy_scores = []

        cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=i)

        for n, indices in enumerate(cv.split(matrix_dict[0]['gram'], y)):
            entry_fold = copy.copy(entry)
            train_index = indices[0]
            test_index = indices[1]

            y_train = y[train_index]
            y_test = y[test_index]

            # Override current full matrices
            for h, m_dict in matrix_dict.items():
                X_train = m_dict['gram'][train_index][:, train_index]
                X_test = m_dict['gram'][test_index][:, train_index]
                m_dict['X_train'] = X_train
                m_dict['X_test'] = X_test

            pipeline = Pipeline(
                [('clf', SVC(class_weight='balanced' if args.balanced else None,
                             random_state=42, kernel='precomputed'))],
            )

            grid_params = {
                'clf__C': [1e-1, 1e0, 1e1],
            }

            clf, best_params = custom_grid_search_cv(pipeline, grid_params,
                                                     matrix_dict, y_train)

            # Sum the test kernels up to (and including) the best h
            X_test = np.zeros(shape=matrix_dict[0]['X_test'].shape)
            counter = 0
            for h in range(best_params['h'] + 1):
                X_test += matrix_dict[h]['X_test']
                counter += 1

            y_pred = clf.predict(X_test)
            acc = accuracy_score(y_test, y_pred)
            accuracy_scores.append(acc)

            best_params['params']['h'] = best_params['h']
            for param, param_val in best_params['params'].items():
                entry_fold[param] = param_val
                entry[param] = ''
            entry_fold['fold'] = n + 1
            entry_fold['it'] = i
            entry_fold['acc'] = acc * 100
            entry_fold['std'] = 0.0
            cv_results.append(entry_fold)

            print('Best classifier for this fold: {}'.format(
                best_params['params']))

        mean_accuracies.append(np.mean(accuracy_scores))
        print(
            ' - Mean 10-fold accuracy: {:2.2f} [running mean over all folds: {:2.2f}]'
            .format(mean_accuracies[-1] * 100, np.mean(mean_accuracies) * 100))

    entry['fold'] = 'all'
    entry['it'] = 'all'
    entry['acc'] = np.mean(mean_accuracies) * 100
    entry['std'] = np.std(mean_accuracies) * 100
    cv_results.append(entry)

    logger.info('Accuracy: {:2.2f} +- {:2.2f}'.format(
        np.mean(mean_accuracies) * 100, np.std(mean_accuracies) * 100))

    if exists(args.result_file):
        with open(args.result_file, 'a') as f:
            pd.DataFrame(cv_results).to_csv(f, index=False, header=None)
    else:
        pd.DataFrame(cv_results).to_csv(args.result_file, index=False)
def main(args, logger):
    graphs = [ig.read(filename) for filename in args.FILES]
    labels = read_labels(args.labels)

    for graph in graphs:
        # Make sure that no label information exists as a graph
        # attribute already.
        assert 'label' not in graph.vs.attributes()
        graph.vs['degree'] = graph.vs.degree()

    logger.info('Read {} graphs and {} labels'.format(len(graphs), len(labels)))

    assert len(graphs) == len(labels)

    prop = WeisfeilerLehmanAttributePropagation()
    attributes_per_iteration = prop.transform(
        graphs,
        'degree',
        args.num_iterations
    )

    use_vertex_weights = args.vertex_weights

    # Stores *all* persistence diagrams because they will be used to
    # represent the data set later on.
    persistence_diagrams_per_iteration = collections.defaultdict(list)

    for iteration in sorted(attributes_per_iteration.keys()):

        # Determine the maximum attribute value over *all* graphs and
        # their respective filtrations.
        max_attribute = max([
            np.max(attributes_per_iteration[iteration][index])
            for index, _ in enumerate(graphs)
        ])

        unpaired_value = 2 * max_attribute

        if use_vertex_weights:
            pdc = PersistenceDiagramCalculator(
                unpaired_value=unpaired_value,
                vertex_attribute='degree',
            )
        else:
            pdc = PersistenceDiagramCalculator(
                unpaired_value=unpaired_value
            )

        for index, graph in enumerate(graphs):
            attributes = attributes_per_iteration[iteration][index]
            graph.vs['degree'] = attributes

            weighted_graph = assign_filtration_values(
                graph,
                attributes,
                normalize=args.normalize
            )

            pd, edge_indices_cycles = pdc.fit_transform(graph)

            # Store the persistence diagram as a 2D array in order to
            # facilitate the subsequent kernel calculations.
            persistence_diagrams_per_iteration[iteration].append(
                np.array([(c, d) for c, d, _ in pd])
            )

            np.savetxt(
                '/tmp/{:04d}_d0_h{:d}.txt'.format(index, iteration),
                np.array([(c, d) for c, d, _ in pd]),
                fmt='%.f'
            )
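# `assign_filtration_values` is used above but not defined in this file.
# A minimal sketch of one plausible implementation follows, assuming a
# sublevel-set filtration in which each edge receives the maximum of its
# endpoint values; the attribute name 'f' and the normalisation rule are
# assumptions, and the actual function may differ.
import numpy as np


def assign_filtration_values(graph, attributes, normalize=False):
    values = np.asarray(attributes, dtype=float)
    if normalize and np.max(values) > 0:
        values = values / np.max(values)

    graph.vs['f'] = values
    # Each edge inherits the larger filtration value of its endpoints
    graph.es['weight'] = [
        max(values[e.source], values[e.target]) for e in graph.es
    ]
    return graph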
def main(args, logger):
    graphs = [ig.read(filename) for filename in args.FILES]
    labels = read_labels(args.labels)

    # Set the label to be uniform over all graphs in case no labels are
    # available. This essentially changes our iteration to degree-based
    # checks.
    for graph in graphs:
        if 'label' not in graph.vs.attributes():
            graph.vs['label'] = [0] * len(graph.vs)

    logger.info('Read {} graphs and {} labels'.format(len(graphs), len(labels)))

    assert len(graphs) == len(labels)

    pwl_list = []
    for p in [1, 2]:
        pwl = PersistentWeisfeilerLehman(
            use_cycle_persistence=args.use_cycle_persistence,
            use_original_features=args.use_original_features,
            metric=args.metric,
            use_label_persistence=True,
            p=p)

        X, num_columns_per_iteration = pwl.transform(graphs,
                                                     args.num_iterations)
        pwl_list.append({'p': p, 'X': X})

        logger.info(
            f'Finished persistent Weisfeiler-Lehman transformation for p={p}')
        logger.info('Obtained ({} x {}) feature matrix'.format(
            X.shape[0], X.shape[1]))

    if args.use_cycle_persistence:
        logger.info('Using cycle persistence')

    y = LabelEncoder().fit_transform(labels)

    np.random.seed(42)
    mean_accuracies = []

    params = [
        'balanced', 'num_iterations', 'filtration', 'use_cycle_persistence',
        'use_original_features', 'metric'
    ]

    cv_results = []
    entry = {}

    for param in params:
        entry[param] = args.__dict__[param]
    entry['dataset'] = dirname(args.FILES[0]).split('/')[1]

    for i in range(10):
        # Contains accuracy scores for each cross-validation step; the
        # means of this list will be used later on.
        accuracy_scores = []

        cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=i)

        for n, indices in enumerate(cv.split(X, y)):
            entry_fold = copy.copy(entry)
            train_index = indices[0]
            test_index = indices[1]

            y_train = y[train_index]
            y_test = y[test_index]

            # Override current full matrices
            for pwl_dict in pwl_list:
                scaler = StandardScaler()
                X_train = scaler.fit_transform(pwl_dict['X'][train_index])
                X_test = scaler.transform(pwl_dict['X'][test_index])

                scaler = MinMaxScaler()
                X_train = scaler.fit_transform(X_train)
                X_test = scaler.transform(X_test)

                pwl_dict['X_train'] = X_train
                pwl_dict['X_test'] = X_test

            pipeline = Pipeline(
                [('fs', FeatureSelector(num_columns_per_iteration)),
                 ('clf', RandomForestClassifier(
                     class_weight='balanced' if args.balanced else None,
                     random_state=42, n_jobs=4))],
            )

            grid_params = {
                'fs__num_iterations': np.arange(0, args.num_iterations + 1),
                'clf__n_estimators': [25, 50, 100],
            }

            clf, best_params = custom_grid_search_cv(pipeline, grid_params,
                                                     pwl_list, y_train)

            X_test = pwl_list[best_params['pwl_idx']]['X_test']
            y_pred = clf.predict(X_test)
            acc = accuracy_score(y_test, y_pred)
            accuracy_scores.append(acc)

            best_params['params']['p'] = best_params['pwl_idx'] + 1
            for param, param_val in best_params['params'].items():
                entry_fold[param] = param_val
                entry[param] = ''
            entry_fold['fold'] = n + 1
            entry_fold['it'] = i
            entry_fold['acc'] = acc * 100
            entry_fold['std'] = 0.0
            cv_results.append(entry_fold)

            logger.info('Best classifier for this fold: {}'.format(
                best_params['params']))

        mean_accuracies.append(np.mean(accuracy_scores))
        logger.info(
            ' - Mean 10-fold accuracy: {:2.2f} [running mean over all folds: {:2.2f}]'
            .format(mean_accuracies[-1] * 100, np.mean(mean_accuracies) * 100))

    entry['fold'] = 'all'
    entry['it'] = 'all'
    entry['acc'] = np.mean(mean_accuracies) * 100
    entry['std'] = np.std(mean_accuracies) * 100
    cv_results.append(entry)

    logger.info('Accuracy: {:2.2f} +- {:2.2f}'.format(
        np.mean(mean_accuracies) * 100, np.std(mean_accuracies) * 100))

    if exists(args.result_file):
        with open(args.result_file, 'a') as f:
            pd.DataFrame(cv_results).to_csv(f, index=False, header=None)
    else:
        pd.DataFrame(cv_results).to_csv(args.result_file, index=False)