def main():
    """Evaluate MetaLazyClassifier with 25 and 200 neighbours on every fold.

    Command-line arguments:
        -p  path to the directory with libsvm files (passed to DatasetReader).
        -l  directory that receives the per-run timing logs. Optional; the
            default is the location this script has always written to.

    For each fold and each neighbour count a classifier is fitted, the
    macro-F1 on the test split is stored, and the classifier's timing log is
    flushed. At the end the per-fold scores, their mean per neighbour count
    and the total wall-clock time are printed.
    """
    default_log_dir = '/home/lfmendes/data/mestrado/metalazy/results/tempos2'
    neighbour_counts = [25, 200]

    parser = argparse.ArgumentParser()
    parser.add_argument('-p', help='path to the directory with libsvm files')
    parser.add_argument(
        '-l', default=default_log_dir,
        help='directory where the timing logs are written')
    args = parser.parse_args()

    path = args.p
    log_dir = args.l

    dataset_reader = DatasetReader(path)

    fold = 0
    # Scores are keyed by the neighbour count rendered as a string.
    result = {str(n): [] for n in neighbour_counts}

    start = time.time()
    while dataset_reader.has_next():
        print('FOLD {}'.format(fold))

        # Load the regular data
        X_train, y_train, X_test, y_test = dataset_reader.get_next_fold()

        for N in neighbour_counts:
            print('{} Neighbours'.format(N))
            clf = MetaLazyClassifier(
                n_neighbors=N,
                select_features=False,
                weight_function='inverse',
                log_time_file='{}/logtimes{}_{}.json'.format(log_dir, N, fold))
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)

            result[str(N)].append(
                f1_score(y_true=y_test, y_pred=y_pred, average='macro'))

            clf.flush_log_time_file()

        fold = fold + 1

    print('\n\n ---------\n EXPERIMENT RESULT \n ---------')
    print(result)
    for key in result:
        print('{}: {}'.format(key, np.mean(np.array(result[key]))))

    end = time.time()
    print(end - start)
def main():
    """Grid-search an ExtraTreesClassifier on every fold and save the results.

    Command-line arguments:
        -p  path to the directory with libsvm files (passed to DatasetReader).
        -o  output directory (required; created when missing).
        -j  number of parallel jobs for the classifier, -1 for all (default -1).
        -g  accepted for command-line compatibility; this experiment does not
            use it.

    For each fold a 3-fold GridSearchCV (scoring 'f1_macro') picks the best
    estimator, which is then passed to the `fit` and `predict` helpers
    together with a per-fold timing dict. Macro/micro F1 and the estimator
    description are collected and written to `result.csv`; the timing dicts
    go to `times.csv`.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-p', help='path to the directory with libsvm files')
    # Required: without it os.makedirs / the CSV writes would fail on None.
    parser.add_argument('-o', required=True,
                        help='path to the output directory')
    parser.add_argument(
        '-j', type=int, default=-1,
        help='number of jobs to run in parallel. use -1 for all - Default:-1')
    parser.add_argument(
        '-g',
        help='Size of the sample to the hyperparameter search - Default-5000')
    args = parser.parse_args()

    output_path = args.o
    os.makedirs(output_path, exist_ok=True)

    path = args.p
    n_jobs = args.j

    dataset_reader = DatasetReader(path)

    fold = 0
    result = []
    times = []

    start = time.time()
    while dataset_reader.has_next():
        time_dic = {}
        print('FOLD {}'.format(fold))
        start_fold = time.time()

        # Load the regular data
        X_train, y_train, X_test, y_test = dataset_reader.get_next_fold()

        # Create the classifier; -j is honoured instead of a fixed -1.
        clf = ExtraTreesClassifier(n_jobs=n_jobs)

        tuned_parameters = [{
            'criterion': ['gini', 'entropy'],
            'max_features': ['log2', 'sqrt'],
            # None (no weighting), not the string 'None'.
            'class_weight': ['balanced', None],
            'n_estimators': [100, 200]
        }]

        print('GENERAL STARTING')
        start_grid = time.time()
        grid = GridSearchCV(clf, tuned_parameters, cv=3, scoring='f1_macro',
                            n_jobs=1)
        grid.fit(X_train, y_train)
        grid_elapsed = time.time() - start_grid
        best = grid.best_estimator_

        print('GENERAL - Total grid time: {}'.format(grid_elapsed))
        print('GENERAL - Best score was {} with \n {}'.format(
            grid.best_score_, best))

        # Fit the train data
        fit(best, X_train, y_train, time_dic)

        # Predict
        y_pred = predict(best, X_test, time_dic)

        # Only some estimators expose `weaker`; an ExtraTreesClassifier does
        # not, so fall back to None instead of raising AttributeError.
        weaker = str(getattr(best, 'weaker', None))
        print(str(best))
        print(weaker)

        macro = f1_score(y_true=y_test, y_pred=y_pred, average='macro')
        micro = f1_score(y_true=y_test, y_pred=y_pred, average='micro')

        # Save the result
        result.append({
            'macro': macro,
            'micro': micro,
            'config': str(best),
            'best_clf': weaker,
        })

        print('Macro: {}'.format(macro))
        print('Micro: {}'.format(micro))
        times.append(time_dic)
        fold = fold + 1

        end_fold = time.time()
        print('Total fold time: {}'.format((end_fold - start_fold)))
        print('train size {}'.format(X_train.shape))
        print('test size {}'.format(X_test.shape))
        print()

    print(result)

    end = time.time()
    print('Total time: {}'.format((end - start)))

    result_dataframe = pd.DataFrame(data=result)
    print(result_dataframe.head(10))
    result_dataframe.to_csv(output_path + '/result.csv', index=False)

    times_dataframe = pd.DataFrame(data=times)
    print(times_dataframe.head(10))
    times_dataframe.to_csv(output_path + '/times.csv', index=False)
def main():
    """Run the factorial MetaLazy experiment on every fold.

    Command-line arguments:
        -p  path to the directory with libsvm files (passed to DatasetReader).
        -o  output directory (required; created when missing).
        -j  number of parallel jobs, -1 for all (default -1).
        -g  sample size for the hyperparameter search (default 5000).

    For each fold, every combination of the 0/1 switches `specific`,
    `weight` and `cooccurrence` is turned into a parameter grid by
    `choose_tunning_parameters`, searched with a 3-fold GridSearchCV, and the
    best estimator is passed to the `fit` / `predict` helpers. Scores are
    written to `result_factorial.csv` and timings to `times.csv` after every
    fold and once more at the end.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-p', help='path to the directory with libsvm files')
    # Required: without it os.makedirs / the CSV writes would fail on None.
    parser.add_argument('-o', required=True,
                        help='path to the output directory')
    parser.add_argument(
        '-j', type=int, default=-1,
        help='number of jobs to run in parallel. use -1 for all - Default:-1')
    parser.add_argument(
        '-g', type=int, default=5000,
        help='Size of the sample to the hyperparameter search - Default-5000')
    args = parser.parse_args()

    output_path = args.o
    os.makedirs(output_path, exist_ok=True)

    path = args.p
    n_jobs = args.j
    grid_size = args.g

    dataset_reader = DatasetReader(path)

    fold = 0
    result = []
    times = []
    configurations = {
        'specific_classifier': [0, 1],
        'weight': [0, 1],
        'cooccurrence': [0, 1],
    }

    def save_results():
        """Dump the scores and timings gathered so far to the output dir."""
        result_dataframe = pd.DataFrame(data=result)
        print(result_dataframe.head(10))
        result_dataframe.to_csv(output_path + '/result_factorial.csv',
                                index=False)

        times_dataframe = pd.DataFrame(data=times)
        print(times_dataframe.head(10))
        times_dataframe.to_csv(output_path + '/times.csv', index=False)

    start = time.time()
    while dataset_reader.has_next():
        # NOTE(review): one dict is shared by every configuration of the fold
        # and appended once per configuration, so those rows of `times` all
        # reference the same object - confirm this is intended.
        time_dic = {}
        print('FOLD {}'.format(fold))

        # Load the regular data
        X_train, y_train, X_test, y_test = dataset_reader.get_next_fold()

        # Create the classifier
        clf = MetaLazyClassifier(select_features=False, n_jobs=n_jobs,
                                 grid_size=grid_size)

        # for each fold we vary weight function, number of co occurrences and
        # the choosing of the classifier
        for specific in configurations['specific_classifier']:
            for weight in configurations['weight']:
                for cooccurrence in configurations['cooccurrence']:
                    print(
                        'Running for specific {}, weight {} and cooccurrence {}'
                        .format(specific, weight, cooccurrence))
                    tuned_parameters = choose_tunning_parameters(
                        specific=specific, weight=weight,
                        coccurrence=cooccurrence)

                    print('GENERAL STARTING')
                    start_grid = time.time()
                    grid = GridSearchCV(clf, tuned_parameters, cv=3,
                                        scoring='f1_macro', n_jobs=1)
                    grid.fit(X_train, y_train)
                    grid_elapsed = time.time() - start_grid
                    best = grid.best_estimator_

                    print('GENERAL - Total grid time: {}'.format(grid_elapsed))
                    print('GENERAL - Best score was {} with \n {}'.format(
                        grid.best_score_, best))

                    # Fit the train data
                    fit(best, X_train, y_train, time_dic)

                    # Predict
                    y_pred = predict(best, X_test, time_dic)

                    print(str(best))
                    print(str(best.weaker))

                    macro = f1_score(y_true=y_test, y_pred=y_pred,
                                     average='macro')
                    micro = f1_score(y_true=y_test, y_pred=y_pred,
                                     average='micro')

                    # Save the result together with the switches that
                    # produced it.
                    result.append({
                        'macro': macro,
                        'micro': micro,
                        'config': str(best),
                        'best_clf': str(best.weaker),
                        'weight': weight,
                        'specific': specific,
                        'cooc': cooccurrence,
                    })

                    print('Macro: {}'.format(macro))
                    print('Micro: {}'.format(micro))
                    times.append(time_dic)

        fold = fold + 1

        # Checkpoint after every fold so partial results are kept on disk.
        save_results()

    print(result)

    end = time.time()
    print('Total time: {}'.format((end - start)))

    save_results()
def main():
    """Evaluate one fixed MetaLazyClassifier configuration on every fold.

    Command-line arguments:
        -p  path to the directory with libsvm files (passed to DatasetReader).
        -o  output directory (required; created when missing).
        -c  classifier name, passed as `specific_classifier`.
        -k  number of neighbours (required integer).
        -w  weight function name, passed as `weight_function`.
        -f  number of cooc features (default 10).
        -j  number of parallel jobs, -1 for all (default -1).
        -g  sample size for the hyperparameter search (default 5000).

    For each fold the classifier is passed to the `fit` / `predict` helpers
    with a per-fold timing dict; macro/micro F1 and the classifier
    description are written to `result.csv` and the timing dicts to
    `times.csv`.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-p', help='path to the directory with libsvm files')
    # -o and -k are required: the script cannot proceed with None for either.
    parser.add_argument('-o', required=True,
                        help='path to the output directory')
    parser.add_argument('-c', help='classifier')
    parser.add_argument('-k', type=int, required=True,
                        help='number of neighbours')
    parser.add_argument('-w', help='weight function')
    parser.add_argument('-f', type=int, default=10,
                        help='number of cooc features - default 10')
    parser.add_argument(
        '-j', type=int, default=-1,
        help='number of jobs to run in parallel. use -1 for all - Default:-1')
    parser.add_argument(
        '-g', type=int, default=5000,
        help='Size of the sample to the hyperparameter search - Default-5000')
    args = parser.parse_args()

    output_path = args.o
    os.makedirs(output_path, exist_ok=True)

    path = args.p
    k = args.k
    weight_function = args.w
    classifier_name = args.c
    n_cooc = args.f
    n_jobs = args.j
    grid_size = args.g

    dataset_reader = DatasetReader(path)

    fold = 0
    result = []
    times = []

    start = time.time()
    while dataset_reader.has_next():
        time_dic = {}
        print('FOLD {}'.format(fold))

        # Load the regular data
        X_train, y_train, X_test, y_test = dataset_reader.get_next_fold()

        # Create the classifier
        clf = MetaLazyClassifier(specific_classifier=classifier_name,
                                 n_neighbors=k,
                                 select_features=False,
                                 weight_function=weight_function,
                                 n_jobs=n_jobs,
                                 grid_size=grid_size,
                                 number_of_cooccurrences=n_cooc)

        # Fit the train data
        fit(clf, X_train, y_train, time_dic)

        # Predict
        y_pred = predict(clf, X_test, time_dic)

        print(str(clf))
        print(str(clf.weaker))

        macro = f1_score(y_true=y_test, y_pred=y_pred, average='macro')
        micro = f1_score(y_true=y_test, y_pred=y_pred, average='micro')

        # Save the result
        result.append({
            'macro': macro,
            'micro': micro,
            'config': str(clf),
            'best_clf': str(clf.weaker),
        })

        print('Macro: {}'.format(macro))
        print('Micro: {}'.format(micro))
        times.append(time_dic)
        fold = fold + 1

    print(result)

    end = time.time()
    print('Total time: {}'.format((end - start)))

    result_dataframe = pd.DataFrame(data=result)
    print(result_dataframe.head(10))
    result_dataframe.to_csv(output_path + '/result.csv', index=False)

    times_dataframe = pd.DataFrame(data=times)
    print(times_dataframe.head(10))
    times_dataframe.to_csv(output_path + '/times.csv', index=False)
def main():
    """Measure MetaLazy tuning and prediction on every fold.

    Command-line arguments:
        -p  path to the directory with libsvm files (passed to DatasetReader).
        -o  output directory (required; created when missing).
        -j  number of parallel jobs, -1 for all (default -1).
        -g  sample size for the hyperparameter search (default 5000).
        -d  dataset name; when given, the estimator comes from
            `get_best_version_for_each_dataset` and no grid search is run.
        -t  keep only the first N test instances of each fold.

    Without -d, a 3-fold GridSearchCV over the grid returned by
    `choose_tunning_parameters(specific=1, weight=1, coccurrence=1)` picks
    the estimator and the search time is stored under `time_dic['grid']`.
    The estimator is then passed to the `fit` / `predict` helpers. Scores go
    to `result_tunning_time.csv` and timings to `times.csv` after every fold
    and once more at the end.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-p', help='path to the directory with libsvm files')
    # Required: without it os.makedirs / the output paths would fail on None.
    parser.add_argument('-o', required=True,
                        help='path to the output directory')
    parser.add_argument(
        '-j', type=int, default=-1,
        help='number of jobs to run in parallel. use -1 for all - Default:-1')
    parser.add_argument(
        '-g', type=int, default=5000,
        help='Size of the sample to the hyperparameter search - Default-5000')
    parser.add_argument(
        '-d',
        help='Use the dataset default parameters.Dont use this parameter if '
             'you want to do grid search')
    parser.add_argument(
        '-t', type=int,
        help='Limit test size, for each fold only use this number of instances')
    args = parser.parse_args()

    output_path = args.o
    os.makedirs(output_path, exist_ok=True)

    path = args.p
    n_jobs = args.j
    grid_size = args.g
    # An empty -d value counts as "not given", so grid search still runs.
    dataset = args.d or None
    test_size_limit = args.t

    dataset_reader = DatasetReader(path)

    fold = 0
    result = []
    times = []

    def save_results():
        """Dump the scores and timings gathered so far to the output dir."""
        result_dataframe = pd.DataFrame(data=result)
        print(result_dataframe.head(10))
        result_dataframe.to_csv(output_path + '/result_tunning_time.csv',
                                index=False)

        times_dataframe = pd.DataFrame(data=times)
        print(times_dataframe.head(10))
        times_dataframe.to_csv(output_path + '/times.csv', index=False)

    start = time.time()
    while dataset_reader.has_next():
        time_dic = {}
        print('FOLD {}'.format(fold))

        # Load the regular data
        X_train, y_train, X_test, y_test = dataset_reader.get_next_fold()

        # A limit of 0 (or no -t) leaves the test split untouched.
        if test_size_limit:
            X_test = X_test[0:test_size_limit]
            y_test = y_test[0:test_size_limit]

        if dataset is None:
            # Create the classifier
            estimator = MetaLazyClassifier(select_features=False,
                                           n_jobs=n_jobs, grid_size=grid_size)
            tuned_parameters = choose_tunning_parameters(
                specific=1, weight=1, coccurrence=1)
            print(tuned_parameters)

            # first we find the best configuration in general
            print('GRID SEARCH FOR FOLD {}'.format(fold))
            start_grid = time.time()
            grid = GridSearchCV(estimator, tuned_parameters, cv=3,
                                scoring='f1_macro', n_jobs=1)
            grid.fit(X_train, y_train)
            grid_elapsed = time.time() - start_grid
            time_dic['grid'] = grid_elapsed

            print('GENERAL - Total grid time: {}'.format(grid_elapsed))
            print('GENERAL - Best score was {} with \n {}'.format(
                grid.best_score_, grid.best_estimator_))
            estimator = grid.best_estimator_
            print('GENERAL - Best param was {}\n'.format(grid.best_params_))
        else:
            print('Using default dataset parameters')
            estimator = get_best_version_for_each_dataset(
                dataset=dataset, n_jobs=n_jobs, grid_size=grid_size)

        print(estimator)
        estimator.log_time_file = (
            output_path + '/log_times_{}.json'.format(fold))

        # Fit the train data
        fit(estimator, X_train, y_train, time_dic)

        # Predict
        y_pred = predict(estimator, X_test, time_dic)

        print('\nWeaker Classifier used:')
        print(str(estimator.weaker))

        macro = f1_score(y_true=y_test, y_pred=y_pred, average='macro')
        micro = f1_score(y_true=y_test, y_pred=y_pred, average='micro')

        # Save the result
        result.append({
            'macro': macro,
            'micro': micro,
            'config': str(estimator),
            'best_clf': str(estimator.weaker),
            'fold': str(fold),
        })

        print('Macro: {}'.format(macro))
        print('Micro: {}'.format(micro))
        times.append(time_dic)
        fold = fold + 1

        # Checkpoint after every fold so partial results are kept on disk.
        save_results()

        estimator.flush_log_time_file()

    print(result)

    end = time.time()
    print('Total time: {}'.format((end - start)))

    save_results()
def main():
    """Grid-search a baseline classifier (SVC or MultinomialNB) per fold.

    Command-line arguments:
        -p  path to the directory with libsvm files (passed to DatasetReader).
        -o  output directory (required; created when missing).
        -j  accepted for command-line compatibility; not used here.
        -g  accepted for command-line compatibility; not used here.
        -c  which classifier to use: 'svc' or 'nb' (required).

    A 3-fold GridSearchCV (scoring 'f1_macro') picks the best estimator and
    the search time is stored under `time_dic['grid']`. The best estimator is
    passed to the `fit` / `predict` helpers, and macro/micro F1 are written
    to `result_tunning_time.csv` with timings in `times.csv`.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-p', help='path to the directory with libsvm files')
    # Required: without it os.makedirs / the CSV writes would fail on None.
    parser.add_argument('-o', required=True,
                        help='path to the output directory')
    parser.add_argument(
        '-j',
        help='number of jobs to run in parallel. use -1 for all - Default:-1')
    parser.add_argument(
        '-g',
        help='Size of the sample to the hyperparameter search - Default-5000')
    # Restricting the choices means an unknown name fails at parse time
    # instead of leaving the estimator and grid undefined inside the loop.
    parser.add_argument('-c', required=True, choices=('svc', 'nb'),
                        help='Which classifier to use (svc, nb)')
    args = parser.parse_args()

    output_path = args.o
    os.makedirs(output_path, exist_ok=True)

    path = args.p
    # Kept separate from the estimator so the name check works on every fold.
    classifier_name = args.c

    dataset_reader = DatasetReader(path)

    fold = 0
    result = []
    times = []

    def save_results():
        """Dump the scores and timings gathered so far to the output dir."""
        result_dataframe = pd.DataFrame(data=result)
        print(result_dataframe.head(10))
        result_dataframe.to_csv(output_path + '/result_tunning_time.csv',
                                index=False)

        times_dataframe = pd.DataFrame(data=times)
        print(times_dataframe.head(10))
        times_dataframe.to_csv(output_path + '/times.csv', index=False)

    start = time.time()
    while dataset_reader.has_next():
        time_dic = {}
        print('FOLD {}'.format(fold))

        # Load the regular data
        X_train, y_train, X_test, y_test = dataset_reader.get_next_fold()

        # Create the classifier
        if classifier_name == 'svc':
            # Settings are passed as keywords; a single positional dict would
            # be taken as one argument and never applied.
            # decision_function_shape is left at the library default: the old
            # dict asked for None - NOTE(review): confirm that value is still
            # accepted before reinstating it.
            clf = SVC(kernel='linear',
                      C=1,
                      verbose=False,
                      probability=False,
                      degree=3,
                      shrinking=True,
                      tol=0.001,
                      cache_size=25000,
                      coef0=0.0,
                      gamma='auto',
                      class_weight=None,
                      random_state=42)
            # Set the parameters by cross-validation
            tuned_parameters = [{
                'kernel': ['linear'],
                'C': np.append(2.0**np.arange(-5, 15, 2), 1),
                'class_weight': ['balanced', None],
                'probability': [False]
            }]
            print(tuned_parameters)
        else:
            # argparse restricts -c to 'svc' or 'nb', so this is 'nb'.
            clf = MultinomialNB()
            tuned_parameters = [{'alpha': [0.001, 0.01, 0.1, 1, 10, 100]}]

        # first we find the best configuration in general
        print('GRID SEARCH FOR FOLD {}'.format(fold))
        start_grid = time.time()
        grid = GridSearchCV(clf, tuned_parameters, cv=3, scoring='f1_macro',
                            n_jobs=1)
        grid.fit(X_train, y_train)
        grid_elapsed = time.time() - start_grid
        time_dic['grid'] = grid_elapsed

        print('GENERAL - Total grid time: {}'.format(grid_elapsed))
        print('GENERAL - Best score was {} with \n {}'.format(
            grid.best_score_, grid.best_estimator_))
        estimator = grid.best_estimator_
        print('GENERAL - Best param was {}\n'.format(grid.best_params_))

        # Fit the train data
        fit(estimator, X_train, y_train, time_dic)

        # Predict
        y_pred = predict(estimator, X_test, time_dic)

        print(str(estimator))

        macro = f1_score(y_true=y_test, y_pred=y_pred, average='macro')
        micro = f1_score(y_true=y_test, y_pred=y_pred, average='micro')

        # Save the result
        result.append({
            'macro': macro,
            'micro': micro,
            'config': str(estimator),
            'fold': str(fold),
        })

        print('Macro: {}'.format(macro))
        print('Micro: {}'.format(micro))
        times.append(time_dic)
        fold = fold + 1

        save_results()

        # NOTE(review): this stops after the first fold, so only fold 0 is
        # ever evaluated - confirm it is intentional and not a debug leftover.
        break

    print(result)

    end = time.time()
    print('Total time: {}'.format((end - start)))

    save_results()
def main():
    """Collect per-instance predictions of each specific classifier per fold.

    Command-line arguments:
        -p  path to the directory with libsvm files (passed to DatasetReader).
        -o  output directory (required; created when missing).
        -j  number of parallel jobs, -1 for all (default -1).
        -g  sample size for the hyperparameter search (default 5000).

    For each fold a MetaLazyClassifier is built for 'nb', 'logistic' and
    'extrarf' with `number_of_cooccurrences=0` and `weight_function='none'`,
    then passed to the `fit` / `predict` helpers. The predictions become one
    column per classifier next to a `y_test` column, written to
    `result_oracle_off_fold_<n>.csv`; timing dicts go to `times.csv`.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-p', help='path to the directory with libsvm files')
    # Required: without it os.makedirs / the CSV writes would fail on None.
    parser.add_argument('-o', required=True,
                        help='path to the output directory')
    parser.add_argument(
        '-j', type=int, default=-1,
        help='number of jobs to run in parallel. use -1 for all - Default:-1')
    parser.add_argument(
        '-g', type=int, default=5000,
        help='Size of the sample to the hyperparameter search - Default-5000')
    args = parser.parse_args()

    output_path = args.o
    os.makedirs(output_path, exist_ok=True)

    path = args.p
    n_jobs = args.j
    grid_size = args.g

    dataset_reader = DatasetReader(path)

    fold = 0
    times = []
    specific_classifier = ['nb', 'logistic', 'extrarf']

    def save_times():
        """Dump the timing dicts gathered so far to the output dir."""
        times_dataframe = pd.DataFrame(data=times)
        print(times_dataframe.head(10))
        times_dataframe.to_csv(output_path + '/times.csv', index=False)

    start = time.time()
    while dataset_reader.has_next():
        # NOTE(review): the three classifiers of a fold share this one dict -
        # confirm the helpers do not overwrite each other's entries.
        time_dic = {}
        print('FOLD {}'.format(fold))

        # Load the regular data
        X_train, y_train, X_test, y_test = dataset_reader.get_next_fold()

        result_df = pd.DataFrame()

        # for each fold we vary the specific classifier
        for specific in specific_classifier:
            print('Running for specific {}'.format(specific))

            # setting the 0 configurations (turn off cooc or weight)
            estimator = MetaLazyClassifier(specific_classifier=specific,
                                           select_features=False,
                                           n_jobs=n_jobs,
                                           number_of_cooccurrences=0,
                                           weight_function='none',
                                           grid_size=grid_size)
            print(estimator)

            # Fit the train data
            fit(estimator, X_train, y_train, time_dic)

            # Predict
            y_pred = predict(estimator, X_test, time_dic)

            # Save the result
            result_df[specific] = y_pred

        result_df['y_test'] = y_test

        times.append(time_dic)
        fold = fold + 1

        print(result_df.head(10))
        # NOTE(review): `fold` was already incremented, so file numbers are
        # one ahead of the 'FOLD n' banner; kept to preserve the file names.
        result_df.to_csv(
            output_path + '/result_oracle_off_fold_{}.csv'.format(fold),
            index=False)

        save_times()

    end = time.time()
    print('Total time: {}'.format((end - start)))

    save_times()