def main(args, logger):
    graphs = [ig.read(filename) for filename in args.FILES]
    labels = read_labels(args.labels)

    # Set the label to be uniform over all graphs in case no labels are
    # available. This essentially changes our iteration to degree-based
    # checks.
    for graph in graphs:
        if 'label' not in graph.vs.attributes():
            graph.vs['label'] = [0] * len(graph.vs)

    logger.info('Read {} graphs and {} labels'.format(len(graphs),
                                                      len(labels)))

    assert len(graphs) == len(labels)

    pwl = PersistentWeisfeilerLehman(
        use_cycle_persistence=args.use_cycle_persistence,
        use_original_features=args.use_original_features,
        metric=args.metric,
        use_label_persistence=True,
    )

    if args.use_cycle_persistence:
        logger.info('Using cycle persistence')

    y = LabelEncoder().fit_transform(labels)

    # This ignores *all* other feature generation methods and falls back
    # to the original Weisfeiler--Lehman subtree kernel.
    if args.use_subtree_features:
        logger.info('Using original subtree features')

        wl_subtree = WeisfeilerLehmanSubtree()
        X, num_columns_per_iteration = \
            wl_subtree.transform(graphs, args.num_iterations)
    else:
        X, num_columns_per_iteration = \
            pwl.transform(graphs, args.num_iterations)

    logger.info('Finished persistent Weisfeiler-Lehman transformation')
    logger.info('Obtained ({} x {}) feature matrix'.format(
        X.shape[0], X.shape[1]))

    np.random.seed(42)

    mean_accuracies = []

    params = [
        'balanced', 'num_iterations', 'filtration', 'use_cycle_persistence',
        'use_original_features', 'use_subtree_features', 'metric'
    ]

    cv_results = []
    entry = {}
    for param in params:
        entry[param] = args.__dict__[param]

    entry['dataset'] = dirname(args.FILES[0]).split('/')[1]

    for i in range(10):
        # Contains accuracy scores for each cross validation step; the
        # means of this list will be used later on.
        accuracy_scores = []

        cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=i)

        for n, indices in enumerate(cv.split(X, y)):
            entry_fold = copy.copy(entry)
            train_index = indices[0]
            test_index = indices[1]

            pipeline = Pipeline(
                [('fs', FeatureSelector(num_columns_per_iteration)),
                 ('clf', RandomForestClassifier(
                     class_weight='balanced' if args.balanced else None,
                     random_state=42))],
            )

            grid_params = {
                'fs__num_iterations': np.arange(0, args.num_iterations + 1),
                'clf__n_estimators': [25, 50, 100],
            }

            clf = GridSearchCV(
                pipeline,
                grid_params,
                cv=StratifiedKFold(n_splits=5, shuffle=True),
                iid=False,
                scoring='accuracy',
                n_jobs=4)

            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            # TODO: need to discuss whether this is 'allowed' or smart
            # to do; this assumes normality of the attributes.
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

            scaler = MinMaxScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            acc = accuracy_score(y_test, y_pred)
            accuracy_scores.append(acc)

            for param, param_val in clf.best_params_.items():
                entry_fold[param] = param_val
                entry[param] = ''

            entry_fold['fold'] = n + 1
            entry_fold['it'] = i
            entry_fold['acc'] = acc * 100
            entry_fold['std'] = 0.0

            cv_results.append(entry_fold)

            logger.info('Best classifier for this fold: {}'.format(
                clf.best_params_))

        mean_accuracies.append(np.mean(accuracy_scores))
        logger.info(
            '  - Mean 10-fold accuracy: {:2.2f} '
            '[running mean over all folds: {:2.2f}]'.format(
                mean_accuracies[-1] * 100,
                np.mean(mean_accuracies) * 100))

    entry['fold'] = 'all'
    entry['it'] = 'all'
    entry['acc'] = np.mean(mean_accuracies) * 100
    entry['std'] = np.std(mean_accuracies) * 100
    cv_results.append(entry)

    logger.info('Accuracy: {:2.2f} +- {:2.2f}'.format(
        np.mean(mean_accuracies) * 100, np.std(mean_accuracies) * 100))

    if exists(args.result_file):
        with open(args.result_file, 'a') as f:
            pd.DataFrame(cv_results).to_csv(f, index=False, header=None)
    else:
        pd.DataFrame(cv_results).to_csv(args.result_file, index=False)
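# ---------------------------------------------------------------------------
# The main() functions in this collection are excerpted without their module
# headers. Below is a minimal sketch of what such a header and driver might
# look like: the flag names are inferred from the attributes accessed on
# `args`, the defaults are placeholders, and the project-local helpers
# (PersistentWeisfeilerLehman, WeisfeilerLehmanSubtree, read_labels,
# FeatureSelector) would come from the surrounding repository.
# ---------------------------------------------------------------------------

import argparse
import copy
import logging

from os.path import dirname, exists

import igraph as ig
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler


def build_argument_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument('FILES', nargs='+', help='Input graph files')
    parser.add_argument('--labels', required=True, help='Label file')
    parser.add_argument('--num-iterations', type=int, default=3)
    parser.add_argument('--metric', default='minkowski')
    parser.add_argument('--filtration', default='sublevel')
    parser.add_argument('--balanced', action='store_true')
    parser.add_argument('--use-cycle-persistence', action='store_true')
    parser.add_argument('--use-original-features', action='store_true')
    parser.add_argument('--use-subtree-features', action='store_true')
    parser.add_argument('--result-file', default='results.csv')
    return parser


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    main(build_argument_parser().parse_args(), logging.getLogger(__name__))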
def main(args, logger):
    # Read all graphs and labels; there is no direct way of checking
    # that the labels are 'correct' for the graphs, but at least the
    # code will check that they have the same cardinality.
    graphs = [ig.read(filename) for filename in args.FILES]
    labels = read_labels(args.labels)

    # Simple pre-processing to ensure that all graphs are set up
    # equally.
    #
    # TODO: make this into a shared function?
    for graph in graphs:
        # Set the label to be uniform over all graphs in case no labels
        # are available. This essentially changes our iteration to
        # degree-based checks.
        if 'label' not in graph.vs.attributes():
            graph.vs['label'] = [0] * len(graph.vs)

        # Reset edge weights if they already exist
        if 'weight' in graph.es.attributes():
            graph.es['weight'] = [0] * len(graph.es)

    logger.info('Read {} graphs and {} labels'.format(len(graphs),
                                                      len(labels)))

    assert len(graphs) == len(labels)

    # Replace selected metric if necessary; this only applies to the
    # uniform metric shortcut.
    if args.use_uniform_metric:
        args.metric = 'uniform'

    pwl = PersistentWeisfeilerLehman(
        use_cycle_persistence=args.use_cycle_persistence,
        use_original_features=args.use_original_features,
        use_label_persistence=True,
        metric=args.metric,
        p=args.power,
        smooth=args.smooth)

    if args.use_cycle_persistence:
        logger.info('Using cycle persistence')

    # Ensures that labels are encoded correctly, regardless of whether
    # they are numerical or not.
    y = LabelEncoder().fit_transform(labels)

    # This ignores *all* other feature generation methods and falls back
    # to the original Weisfeiler--Lehman subtree kernel.
    if args.use_subtree_features:
        logger.info('Using original subtree features')

        wl_subtree = WeisfeilerLehmanSubtree()
        X, num_columns_per_iteration = \
            wl_subtree.transform(graphs, args.num_iterations)
    else:
        X, num_columns_per_iteration = \
            pwl.transform(graphs, args.num_iterations)

    logger.info('Finished persistent Weisfeiler-Lehman transformation')
    logger.info('Obtained ({} x {}) feature matrix'.format(
        X.shape[0], X.shape[1]))

    np.random.seed(42)

    cv = StratifiedKFold(n_splits=10, shuffle=True)
    mean_accuracies = []

    for i in range(10):
        # Contains accuracy scores for each cross validation step; the
        # means of this list will be used later on.
        accuracy_scores = []

        for train_index, test_index in cv.split(X, y):
            rf_clf = RandomForestClassifier(
                n_estimators=50,
                class_weight='balanced' if args.balanced else None,
                random_state=42)

            if args.grid_search:
                pipeline = Pipeline(
                    [('fs', FeatureSelector(num_columns_per_iteration)),
                     ('clf', rf_clf)],
                )

                grid_params = {
                    'fs__num_iterations':
                        np.arange(0, args.num_iterations + 1),
                    'clf__n_estimators': [10, 20, 50, 100, 150, 200],
                }

                clf = GridSearchCV(
                    pipeline,
                    grid_params,
                    cv=StratifiedKFold(n_splits=10, shuffle=True),
                    iid=False,
                    scoring='accuracy',
                    n_jobs=4)
            else:
                clf = rf_clf

            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

            scaler = MinMaxScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            accuracy_scores.append(accuracy_score(y_test, y_pred))

            logger.debug('Best classifier for this fold: {}'.format(clf))

            if args.grid_search:
                logger.debug('Best parameters for this fold: {}'.format(
                    clf.best_params_))
            else:
                logger.debug('Best parameters for this fold: {}'.format(
                    clf.get_params()))

        mean_accuracies.append(np.mean(accuracy_scores))
        logger.info(
            '  - Mean 10-fold accuracy: {:2.2f} '
            '[running mean over all folds: {:2.2f}]'.format(
                mean_accuracies[-1] * 100,
                np.mean(mean_accuracies) * 100))

    logger.info('Accuracy: {:2.2f} +- {:2.2f}'.format(
        np.mean(mean_accuracies) * 100, np.std(mean_accuracies) * 100))
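# ---------------------------------------------------------------------------
# Design note: both main() variants above standardise and then min-max scale
# the training fold *outside* the classifier. When a GridSearchCV is used,
# the same preprocessing can be re-fit on every internal split -- and thus
# leak no information across the inner folds -- by making the scalers
# pipeline steps. A minimal sketch, reusing FeatureSelector and
# num_columns_per_iteration from the code above (not the original setup):
# ---------------------------------------------------------------------------

from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler

pipeline = Pipeline([
    ('std', StandardScaler()),    # zero mean, unit variance per column
    ('minmax', MinMaxScaler()),   # then rescale each column to [0, 1]
    ('fs', FeatureSelector(num_columns_per_iteration)),
    ('clf', RandomForestClassifier(n_estimators=50, random_state=42)),
])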
import pickle

import matplotlib.pyplot as plt

from features import FeatureSelector

# Load recorded game states and visualise each board. Encoding in the
# plot: 4 = our agent, 2 = coins, 3 = other agents.
with open('featureset.pt', 'rb') as f:
    states = pickle.load(f)

for state in states:
    selector = FeatureSelector(state)
    selector.agent_at_border()
    print(selector.features)

    plt.figure()

    vis_field = state['field'].copy()
    x, y = state['self'][3]
    vis_field[x, y] = 4

    #for bomb in state['bombs']:
    #    bx, by = bomb[0]
    #    if bomb[1] != 0:
    #        vis_field[bx, by] = 2

    for coin in state['coins']:
        cx, cy = coin
        vis_field[cx, cy] = 2

    for agent in state['others']:
        ax, ay = agent[3]
        vis_field[ax, ay] = 3

    plt.imshow(vis_field)
    plt.show()
def main(args, logger):
    graphs = [ig.read(filename) for filename in args.FILES]
    labels = read_labels(args.labels)

    # Stores *all* vertex labels of the given graph in order to
    # determine the conversion factor for persistence diagrams.
    vertex_labels = set()

    # Set the label to be uniform over all graphs in case no labels are
    # available. This essentially changes our iteration to degree-based
    # checks.
    for graph in graphs:
        if 'label' not in graph.vs.attributes():
            graph.vs['label'] = [0] * len(graph.vs)

        vertex_labels.update(graph.vs['label'])

    logger.info('Read {} graphs and {} labels'.format(len(graphs),
                                                      len(labels)))

    assert len(graphs) == len(labels)

    pwl = PersistentWeisfeilerLehman(
        use_cycle_persistence=args.use_cycle_persistence,
        use_original_features=args.use_original_features,
        use_label_persistence=True,
        store_persistence_diagrams=True,
    )

    if args.use_cycle_persistence:
        logger.info('Using cycle persistence')

    y = LabelEncoder().fit_transform(labels)
    X, num_columns_per_iteration = pwl.transform(graphs, args.num_iterations)

    persistence_diagrams = pwl._persistence_diagrams

    fig, ax = plt.subplots(args.num_iterations + 1)

    for iteration in persistence_diagrams.keys():
        M = collections.defaultdict(list)

        # Collect the destruction values of all points of the diagrams,
        # grouped by class label.
        for index, diagram in enumerate(persistence_diagrams[iteration]):
            label = y[index]
            for _, d, _ in diagram:
                M[label].append(d)

        d_min = sys.float_info.max
        d_max = -d_min

        for values in M.values():
            d_min = min(d_min, min(values))
            d_max = max(d_max, max(values))

        bins = np.linspace(d_min, d_max, 10)

        for label, values in M.items():
            sns.distplot(values,
                         bins=bins,
                         rug=True,
                         kde=True,
                         hist=False,
                         ax=ax[iteration])

    plt.show()

    L = len(vertex_labels)
    assert L > 0

    original_labels = pwl._original_labels

    # Will store *all* persistence diagrams in the form of a probability
    # distribution.
    M = np.zeros((len(graphs), (args.num_iterations + 1) * L))

    # Will store *all* pairwise distances according to the
    # Jensen--Shannon divergence (JS), or, alternatively,
    # the Kullback--Leibler divergence (KL).
    D_KL = np.zeros((len(graphs), len(graphs)))
    D_JS = np.zeros((len(graphs), len(graphs)))

    D = np.zeros((len(graphs), len(graphs)))

    for iteration in persistence_diagrams.keys():
        M, D_KL, D_JS = make_kernel_matrices(
            persistence_diagrams[iteration],
            original_labels,  # notice that they do *not* change
            L)

        D += D_JS

    D = -D

    fig, ax = plt.subplots(len(set(y)))

    for label in sorted(set(y)):
        ax[label].matshow(M[y == label], aspect='auto', vmin=0, vmax=1)

    plt.show()

    logger.info('Finished persistent Weisfeiler-Lehman transformation')
    logger.info('Obtained ({} x {}) feature matrix'.format(
        X.shape[0], X.shape[1]))

    np.random.seed(42)

    cv = StratifiedKFold(n_splits=10, shuffle=True)
    mean_accuracies = []

    for i in range(10):
        # Contains accuracy scores for each cross validation step; the
        # means of this list will be used later on.
        accuracy_scores = []

        for train_index, test_index in cv.split(X, y):
            rf_clf = RandomForestClassifier(
                n_estimators=50,
                class_weight='balanced' if args.balanced else None)

            if args.grid_search:
                pipeline = Pipeline(
                    [('fs', FeatureSelector(num_columns_per_iteration)),
                     ('clf', rf_clf)],
                )

                grid_params = {
                    'fs__num_iterations':
                        np.arange(0, args.num_iterations + 1),
                    'clf__n_estimators': [10, 20, 50, 100, 150, 200],
                }

                clf = GridSearchCV(
                    pipeline,
                    grid_params,
                    cv=StratifiedKFold(n_splits=10, shuffle=True),
                    iid=False,
                    scoring='accuracy',
                    n_jobs=4)
            else:
                clf = rf_clf

            # Experimental shortcut: train and evaluate an SVM on the full
            # precomputed (dis)similarity matrix, ignoring the train/test
            # split above; the random-forest branches are effectively dead
            # while this override is active. The original split-based code
            # is kept below for reference.
            clf = SVC(kernel='precomputed')
            clf.fit(D, y)

            y_test = y
            y_pred = clf.predict(D)

            #X_train, X_test = X[train_index], X[test_index]
            #y_train, y_test = y[train_index], y[test_index]

            ## TODO: need to discuss whether this is 'allowed' or smart
            ## to do; this assumes normality of the attributes.
            #scaler = StandardScaler()
            #X_train = scaler.fit_transform(X_train)
            #X_test = scaler.transform(X_test)

            #scaler = MinMaxScaler()
            #X_train = scaler.fit_transform(X_train)
            #X_test = scaler.transform(X_test)

            #clf.fit(X_train, y_train)
            #y_pred = clf.predict(X_test)

            accuracy_scores.append(accuracy_score(y_test, y_pred))

            logger.debug('Best classifier for this fold: {}'.format(clf))

            if args.grid_search:
                logger.debug('Best parameters for this fold: {}'.format(
                    clf.best_params_))
            else:
                logger.debug('Best parameters for this fold: {}'.format(
                    clf.get_params()))

        mean_accuracies.append(np.mean(accuracy_scores))
        logger.info(
            '  - Mean 10-fold accuracy: {:2.2f} '
            '[running mean over all folds: {:2.2f}]'.format(
                mean_accuracies[-1] * 100,
                np.mean(mean_accuracies) * 100))

    logger.info('Accuracy: {:2.2f} +- {:2.2f}'.format(
        np.mean(mean_accuracies) * 100, np.std(mean_accuracies) * 100))
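# ---------------------------------------------------------------------------
# The experimental block above negates the summed Jensen--Shannon divergences
# and uses the result directly as a precomputed SVM kernel; a negated
# divergence matrix is not positive semi-definite in general. A common
# alternative (not taken from this code base) is a generalised RBF kernel of
# the form exp(-gamma * D). A minimal sketch, assuming D holds the pairwise
# divergences before negation:
# ---------------------------------------------------------------------------

import numpy as np
from sklearn.svm import SVC

gamma = 1.0                 # bandwidth; would need tuning in practice
K = np.exp(-gamma * D)      # turn divergences into similarities

clf = SVC(kernel='precomputed')
clf.fit(K, y)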
]]  # NB: this excerpt begins mid-statement.

test = pd.read_csv('data/validation_data.csv', parse_dates=['date'])
X_test = test.merge(employee, on='employee id').loc[:, [
    'date',
    'category',
    'pre-tax amount',
    'role',
]]

model = Pipeline([
    ('features', FeatureUnion([
        # weekend?
        ('weekend', Pipeline([
            ('selector', FeatureSelector('date')),
            ('transform', IsWeekendTransformer()),
        ])),
        # category
        ('category', Pipeline([
            ('selector', FeatureSelector('category')),
            ('encode', PipelineLabelBinarizer()),
        ])),
        # role
        ('role', Pipeline([
            ('selector', FeatureSelector('role')),
            ('encode', PipelineLabelBinarizer()),
        ])),
        # (truncated in the source)
y_train = training.loc[:, ['category']].values.ravel()

X_val = validation.loc[:, ['expense description', 'pre-tax amount']]
y_val = validation.loc[:, ['category']].values.ravel()

# build data pipeline!
pipeline = Pipeline([
    (
        'features',
        FeatureUnion(
            [
                # expense description feature
                ('description', Pipeline([
                    ('selector', FeatureSelector('expense description')),
                    ('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                ])),
                # pretax amount feature
                ('pretax', Pipeline([
                    ('selector',
                     FeatureSelector('pre-tax amount', wrap=True)),
                    ('scaler', StandardScaler()),
                ]))
            ],
            transformer_weights={
                'description': 1.0,
                'pretax': 1.0,
                # (truncated in the source)
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
import os

from preprocess import Preprocessor
from features import FeatureSelector
from bayes import BayesClassifier


if __name__ == '__main__':
    train_file = sys.argv[1]
    test_file = sys.argv[2]

    pr = Preprocessor()
    pr.build_vocabulary_and_categories(train_file)

    fs = FeatureSelector(train_file, ck=500)
    fs.select_features()

    bc = BayesClassifier(train_file, test_file, model='bernoulli')
    bc.train()
    bc.test()
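# A typical invocation of the script above (the file name is illustrative):
#
#     python run_bayes.py train.txt test.txt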
def main(args, logger):
    graphs = [ig.read(filename) for filename in args.FILES]
    labels = read_labels(args.labels)

    # Set the label to be uniform over all graphs in case no labels are
    # available. This essentially changes our iteration to degree-based
    # checks.
    for graph in graphs:
        if 'label' not in graph.vs.attributes():
            graph.vs['label'] = [0] * len(graph.vs)

    logger.info('Read {} graphs and {} labels'.format(len(graphs),
                                                      len(labels)))

    assert len(graphs) == len(labels)

    pwl_list = []
    for p in [1, 2]:
        pwl = PersistentWeisfeilerLehman(
            use_cycle_persistence=args.use_cycle_persistence,
            use_original_features=args.use_original_features,
            metric=args.metric,
            use_label_persistence=True,
            p=p)

        X, num_columns_per_iteration = pwl.transform(graphs,
                                                     args.num_iterations)
        pwl_list.append({'p': p, 'X': X})

        logger.info(
            f'Finished persistent Weisfeiler-Lehman transformation '
            f'for p={p}')
        logger.info('Obtained ({} x {}) feature matrix'.format(
            X.shape[0], X.shape[1]))

    if args.use_cycle_persistence:
        logger.info('Using cycle persistence')

    y = LabelEncoder().fit_transform(labels)

    np.random.seed(42)

    mean_accuracies = []

    params = [
        'balanced', 'num_iterations', 'filtration', 'use_cycle_persistence',
        'use_original_features', 'metric'
    ]

    cv_results = []
    entry = {}
    for param in params:
        entry[param] = args.__dict__[param]

    entry['dataset'] = dirname(args.FILES[0]).split('/')[1]

    for i in range(10):
        # Contains accuracy scores for each cross validation step; the
        # means of this list will be used later on.
        accuracy_scores = []

        cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=i)

        # Note that X here is the matrix from the *last* value of p; the
        # split only depends on its number of rows, which is the same for
        # all matrices in pwl_list.
        for n, indices in enumerate(cv.split(X, y)):
            entry_fold = copy.copy(entry)
            train_index = indices[0]
            test_index = indices[1]

            y_train = y[train_index]
            y_test = y[test_index]

            # Override current full matrices
            for pwl_dict in pwl_list:
                scaler = StandardScaler()
                X_train = scaler.fit_transform(pwl_dict['X'][train_index])
                X_test = scaler.transform(pwl_dict['X'][test_index])

                scaler = MinMaxScaler()
                X_train = scaler.fit_transform(X_train)
                X_test = scaler.transform(X_test)

                pwl_dict['X_train'] = X_train
                pwl_dict['X_test'] = X_test

            pipeline = Pipeline(
                [('fs', FeatureSelector(num_columns_per_iteration)),
                 ('clf', RandomForestClassifier(
                     class_weight='balanced' if args.balanced else None,
                     random_state=42,
                     n_jobs=4))],
            )

            grid_params = {
                'fs__num_iterations': np.arange(0, args.num_iterations + 1),
                'clf__n_estimators': [25, 50, 100],
            }

            clf, best_params = custom_grid_search_cv(
                pipeline, grid_params, pwl_list, y_train)

            X_test = pwl_list[best_params['pwl_idx']]['X_test']
            y_pred = clf.predict(X_test)
            acc = accuracy_score(y_test, y_pred)
            accuracy_scores.append(acc)

            best_params['params']['p'] = best_params['pwl_idx'] + 1

            for param, param_val in best_params['params'].items():
                entry_fold[param] = param_val
                entry[param] = ''

            entry_fold['fold'] = n + 1
            entry_fold['it'] = i
            entry_fold['acc'] = acc * 100
            entry_fold['std'] = 0.0

            cv_results.append(entry_fold)

            logger.info('Best classifier for this fold: {}'.format(
                best_params['params']))

        mean_accuracies.append(np.mean(accuracy_scores))
        logger.info(
            '  - Mean 10-fold accuracy: {:2.2f} '
            '[running mean over all folds: {:2.2f}]'.format(
                mean_accuracies[-1] * 100,
                np.mean(mean_accuracies) * 100))

    entry['fold'] = 'all'
    entry['it'] = 'all'
    entry['acc'] = np.mean(mean_accuracies) * 100
    entry['std'] = np.std(mean_accuracies) * 100
    cv_results.append(entry)

    logger.info('Accuracy: {:2.2f} +- {:2.2f}'.format(
        np.mean(mean_accuracies) * 100, np.std(mean_accuracies) * 100))

    if exists(args.result_file):
        with open(args.result_file, 'a') as f:
            pd.DataFrame(cv_results).to_csv(f, index=False, header=None)
    else:
        pd.DataFrame(cv_results).to_csv(args.result_file, index=False)
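# ---------------------------------------------------------------------------
# custom_grid_search_cv() is a project-local helper whose implementation is
# not part of this excerpt. From its call site above it appears to take the
# pipeline, the parameter grid, the list of per-p dicts (with pre-scaled
# 'X_train'/'X_test' matrices), and the training labels, and to return the
# refit best estimator together with {'pwl_idx': ..., 'params': ...}. A
# minimal sketch consistent with that contract, not the original code:
# ---------------------------------------------------------------------------

from sklearn.base import clone
from sklearn.model_selection import GridSearchCV, StratifiedKFold


def custom_grid_search_cv(pipeline, grid_params, pwl_list, y_train):
    """Grid search over the parameter grid *and* the choice of p."""
    best = {'score': -1.0, 'pwl_idx': None, 'params': None}

    for idx, pwl_dict in enumerate(pwl_list):
        search = GridSearchCV(clone(pipeline), grid_params,
                              cv=StratifiedKFold(n_splits=5, shuffle=True),
                              scoring='accuracy', n_jobs=4)
        search.fit(pwl_dict['X_train'], y_train)

        if search.best_score_ > best['score']:
            best = {'score': search.best_score_, 'pwl_idx': idx,
                    'params': search.best_params_}

    # Refit the winning configuration on the corresponding training matrix.
    clf = clone(pipeline).set_params(**best['params'])
    clf.fit(pwl_list[best['pwl_idx']]['X_train'], y_train)

    return clf, {'pwl_idx': best['pwl_idx'], 'params': best['params']}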