parser.add_argument('-i', '--input', type=str, required=True,
                    dest='infolder',
                    help='path to folder with pan dataset for a language')
parser.add_argument('-o', '--output', type=str, required=True,
                    dest='outfolder',
                    help='path to folder where predictions should be written')
parser.add_argument('-m', '--model', type=str, required=True,
                    dest='model',
                    help='path to learned model to use for predictions')
args = parser.parse_args()
model = args.model
infolder = args.infolder
outfolder = args.outfolder
dataset = ProfilingDataset(infolder)
print('Loaded {} users...\n'.format(len(dataset.entries)))
config = dataset.config
tasks = config.tasks
all_models = joblib.load(model)
# the loop below looks up a model for every configured task, so the
# model dict and the task list must match exactly
if set(all_models.keys()) != set(tasks):
    print("The trained models don't match the tasks in the config file.")
    print('Did you change the config file after training?')
    print('Exiting... try training again.')
    exit(1)
print('\n--------------- Thy time of Judgement ---------------')
for task in tasks:
    test_data(dataset, all_models[task], task)
# write predictions to file
dataset.write_data(outfolder)
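# --- Sketch: what the dict loaded into all_models above is expected to
# contain, given the training script at the end of this section: one
# fitted sklearn estimator per task. Printing the layout is a cheap
# sanity check; it is not part of the prediction flow. ---
for task_name, estimator in all_models.items():
    print('model for {}: {}'.format(task_name, type(estimator).__name__))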
parser.add_argument('-i', '--input', type=str, required=True,
                    dest='infolder',
                    help='path to folder with pan dataset for a language')
parser.add_argument('-n', '--numfolds', type=int, dest='num_folds',
                    default=4,
                    help='number of folds to use in cross validation')
args = parser.parse_args()
infolder = args.infolder
num_folds = args.num_folds
print('Loading dataset...')
dataset = ProfilingDataset(infolder)
print('Loaded {} users...\n'.format(len(dataset.entries)))
config = dataset.config
tasks = config.tasks
print('\n--------------- Thy time of Running ---------------')
for task in tasks:
    tictac = from_recipe(config.recipes[task])
    cross_val(dataset, task, tictac, num_folds)
# print collected results at the end
print('\n--------------- Thy time of Judgement ---------------')
for message in log:
    print(message)
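# --- Sketch: the kind of evaluation cross_val is assumed to run,
# written with the old-sklearn cross_val_score (cross_val itself is a
# project helper whose exact reporting, the `log` printed above, is not
# shown in this excerpt) ---
import numpy
from sklearn.cross_validation import cross_val_score

def sketch_cross_val(dataset, task, estimator, num_folds):
    # stratified k-fold accuracy for one task's pipeline
    X, y = dataset.get_data(task)
    scores = cross_val_score(estimator, X, y, cv=num_folds)
    print('{}: accuracy {:.3f} (+/- {:.3f})'.format(
        task, numpy.mean(scores), numpy.std(scores)))
    return scores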
def main_():
    infolder = "../DATA/pan16-author-profiling-training-dataset-2016-04-25/pan16-author-profiling-training-dataset-english-2016-02-29/"
    outfolder = "models/"
    print('Loading dataset -> grouping user texts.\n')
    dataset = ProfilingDataset(infolder)
    print('Loaded {} users...\n'.format(len(dataset.entries)))
    # get config
    config = dataset.config
    tasks = config.tasks
    print('\n--------------- Thy time of Running ---------------')
    for task in tasks:
        print('Learning to judge %s..' % task)
        # load data for the current task (hardcoded overrides that forced
        # 'age' or 'gender' data are kept here only as comments)
        X, y = dataset.get_data(task)
        # X, y = dataset.get_data('age')
        # X, y = dataset.get_data('gender')
        print(len(X))
        X = preprocess.preprocess(X)
        # base pipeline 1: word n-gram tf-idf (bigrams, despite the
        # '3grams' name) feeding an RBF SVM
        grams3 = TfidfVectorizer(analyzer='word', ngram_range=(2, 2),
                                 max_features=5000, stop_words='english')
        svm = SVC(kernel='rbf', C=10, gamma=1, class_weight='balanced',
                  probability=False)
        pipe = Pipeline([('3grams', grams3), ('svm', svm)])
        # base pipeline 2: second-order attribute (SOAC) features feeding
        # an RBF SVM
        soac = features.SOAC_Model2(max_df=1.0, min_df=1,
                                    tokenizer_var='sklearn',
                                    max_features=None)
        svm = SVC(kernel='rbf', C=1, gamma=1, class_weight='balanced',
                  probability=False)
        pipe1 = Pipeline([('soac', soac), ('svm', svm)])
        # base pipeline 3: LSI topic features feeding an RBF SVM
        LSImodel = LSI_Model(num_topics=100)
        svm = SVC(kernel='rbf', C=0.1, gamma=1, class_weight='balanced',
                  probability=False)
        pipe2 = Pipeline([('LSI', LSImodel), ('svm', svm)])
        # base models
        base_models = [pipe, pipe1, pipe2]
        base_model_names = ['3grams', 'soac', 'lsi']
        # meta voting models
        eclf = VotingClassifier(estimators=[('0', pipe), ('1', pipe1),
                                            ('2', pipe2)], voting='soft')
        eclfh = VotingClassifier(estimators=[('0', pipe), ('1', pipe1),
                                             ('2', pipe2)], voting='hard')
        voting_dic = {'votingf': eclf, 'votingh': eclfh}
        combinator_names = ['majority', 'weights', 'accuracy', 'optimal']
        # meta_models_names = ['votingf', 'votingh', 'space3', 'meta'] + combinator_names
        meta_models_names = ['space3'] + combinator_names
        # all models
        all_models_names = base_model_names + meta_models_names
        results = {'over': []}
        for name in all_models_names:
            results[name] = {'pred': [], 'conf': [], 'rep': [], 'acc': []}
        num_folds = 4
        train_split = 0.3
        meta_split = 0.5
        cv_rounds = 1
        t0 = time.time()
        t1 = t0
        for j in range(cv_rounds):
            X_train, X_cv, y_train, y_cv = train_test_split(
                X, y, test_size=train_split, stratify=y)
            # drop empty documents and their labels in one pass (the
            # original removed items from the lists while iterating over
            # them, which skips elements and misaligns X and y)
            keep = [i for i, x in enumerate(X_train) if len(x) > 0]
            X_train = [X_train[i] for i in keep]
            y_train = [y_train[i] for i in keep]
            keep = [i for i, x in enumerate(X_cv) if len(x) > 0]
            X_cv = [X_cv[i] for i in keep]
            y_cv = [y_cv[i] for i in keep]
            if meta_split > 0:
                X_meta, X_cv, y_meta, y_cv = train_test_split(
                    X_cv, y_cv, test_size=meta_split, stratify=y_cv)
                print(len(X_train), len(X_cv), len(X_meta),
                      len(X_cv) + len(X_train) + len(X_meta), len(X))
            else:
                print(len(X_train), len(X_cv),
                      len(X_cv) + len(X_train), len(X))
            trained_base_models = []
            predictions = []
            base_predictions_cv = []
            base_predictions_meta = []
            for i, model in enumerate(base_models):
                model.fit(X_train, y_train)
                trained_base_models.append(model)
                predict = model.predict(X_cv)
                predictions.append(predict)
                base_predictions_cv.append(predict)
                base_predictions_meta.append(model.predict(X_meta))
                results[base_model_names[i]]['pred'].append(predict)
                results[base_model_names[i]]['acc'].append(
                    accuracy_score(y_cv, predict))
                results[base_model_names[i]]['conf'].append(
                    confusion_matrix(y_cv, predict, labels=sorted(set(y))))
                results[base_model_names[i]]['rep'].append(
                    classification_report(y_cv, predict, labels=sorted(set(y))))
            trained_all_models = copy.deepcopy(trained_base_models)
            for name in meta_models_names:
                if name == 'votingf' or name == 'votingh':
                    model = voting_dic[name]
                    model.fit(X_train, y_train)
                    predict = model.predict(X_cv)
                if name == 'space':
                    models_for_space = {}
                    cv_scores = []
                    for i, base_trained_model in enumerate(trained_base_models):
                        models_for_space[base_model_names[i]] = base_trained_model
                        cv_scores.append(base_trained_model.score(X_meta, y_meta))
                    model = combinations.SubSpaceEnsemble4_2(
                        models_for_space, cv_scores, k=6,
                        weights=[0.65, 0.35, 0.32, 6], N_rand=10,
                        rand_split=0.6)
                    model.fit(X_meta, y_meta)
                    predict = model.predict(X_cv)
                if name == 'space3':
                    models_for_space = {}
                    for i, base_trained_model in enumerate(trained_base_models):
                        models_for_space[base_model_names[i]] = base_trained_model
                    model = SubSpaceEnsemble3(models_for_space, k=5,
                                              weights=[2, 1, 3, 0.6])
                    model.fit(X_train, y_train)
                    predict = model.predict(X_cv)
                if name == 'meta':
                    model_dic = {}
                    for i, base_trained_model in enumerate(trained_base_models):
                        model_dic[base_model_names[i]] = base_trained_model
                    model = Metaclassifier(models=model_dic, C=1.0,
                                           weights='balanced')
                    model.fit(X_meta, y_meta)
                    predict = model.predict(X_cv)
                if name in combinator_names:
                    model = combinations.Combinator(
                        scheme=name,
                        weights=[1 / float(len(base_predictions_meta))
                                 for _ in range(len(base_predictions_meta))])
                    model.fit(base_predictions_meta, y_meta)
                    predict = model.predict(base_predictions_cv)
                trained_all_models.append(model)
                predictions.append(predict)
                results[name]['pred'].append(predict)
                results[name]['acc'].append(accuracy_score(y_cv, predict))
                results[name]['conf'].append(
                    confusion_matrix(y_cv, predict, labels=sorted(set(y))))
                results[name]['rep'].append(
                    classification_report(y_cv, predict, labels=sorted(set(y))))
            print('Round %d took: %0.3f seconds' % (j, time.time() - t1))
            t1 = time.time()
        print('Total time: %0.3f seconds' % (time.time() - t0))
        for name in all_models_names:
            print('%%%%%%%%%%%%%%%% ' + name + ' %%%%%%%%%%%%%%%%')
            print('#################################')
            mean_acc = 0
            mean_prec = 0
            mean_rec = 0
            mean_f1 = 0
            # size the summed confusion matrix from the actual label set
            # instead of assuming five classes
            labels = sorted(set(y))
            conf = numpy.zeros([len(labels), len(labels)])
            for i in range(cv_rounds):
                mean_acc += results[name]['acc'][i]
                # fragile: pulls the averaged precision/recall/f1 out of
                # the classification_report string by position
                mean_prec += float(results[name]['rep'][i].split(' ')[-4][2:])
                mean_rec += float(results[name]['rep'][i].split(' ')[-3][2:])
                mean_f1 += float(results[name]['rep'][i].split(' ')[-2][2:])
                conf += results[name]['conf'][i]
            mean_acc /= float(cv_rounds)
            mean_prec /= float(cv_rounds)
            mean_rec /= float(cv_rounds)
            mean_f1 /= float(cv_rounds)
            conf /= float(cv_rounds)
            print('Accuracy : {}'.format(mean_acc))
            print('Precision : {}'.format(mean_prec))
            print('Recall : {}'.format(mean_rec))
            print('F1 : {}'.format(mean_f1))
            print('Confusion matrix :\n {}'.format(conf))
            print('#################################')
        print('~' * 77)
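# --- Sketch: a sturdier way to average precision/recall/f1 across
# cross-validation rounds than slicing the classification_report strings
# above; precision_recall_fscore_support returns the macro-averaged
# numbers directly. The argument lists mirror what the loop collects in
# results (true labels and predictions per round). ---
from sklearn.metrics import precision_recall_fscore_support

def mean_macro_scores(true_rounds, pred_rounds):
    # average macro precision, recall and f1 over all rounds
    triples = [precision_recall_fscore_support(y_true, y_pred,
                                               average='macro')[:3]
               for y_true, y_pred in zip(true_rounds, pred_rounds)]
    n = float(len(triples))
    return tuple(sum(vals) / n for vals in zip(*triples))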
                    help='path to folder with pan dataset for a language')
parser.add_argument('-f', '--feature', type=str, dest='feature',
                    default='gender',
                    help='feature to plot learning curves for')
parser.add_argument('-r', '--recipe', type=str, dest='recipe',
                    help='path to the recipe to use; if not specified, '
                         'the default recipe is used')
args = parser.parse_args()
infolder = args.infolder
task = args.feature
recipe = args.recipe
print('Loading dataset...')
data = ProfilingDataset(infolder)
print('Loaded {} users...\n'.format(len(data.entries)))
config = data.config
tasks = config.tasks
if task in tasks:
    print('Creating learning curves for %s task..' % task)
    if not recipe:
        recipe = config.recipes[task]
    else:
        print('Loading recipe from file %s..' % recipe)
    clf = from_recipe(recipe)
    X, y = data.get_data(task)
    # 5-fold cross validation with shuffling for smoother mean train and
    # test score curves (folds are drawn randomly but reproducibly)
    cv = cross_validation.KFold(len(X), n_folds=5, shuffle=True,
                                random_state=0)
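    # --- Sketch: computing the curves from clf and cv above with the
    # old-sklearn learning_curve helper. This excerpt stops before the
    # original script's plotting code, so the print-outs stand in for it. ---
    from sklearn.learning_curve import learning_curve
    import numpy

    sizes, train_scores, test_scores = learning_curve(
        clf, X, y, cv=cv, train_sizes=numpy.linspace(0.1, 1.0, 5))
    print('train sizes      : {}'.format(sizes))
    print('mean train scores: {}'.format(train_scores.mean(axis=1)))
    print('mean test scores : {}'.format(test_scores.mean(axis=1)))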
if __name__ == '__main__':
    parser = ArgumentParser(description='Train pan model on pan dataset')
    parser.add_argument('-i', '--input', type=str, required=True,
                        dest='infolder',
                        help='path to folder with pan dataset for a language')
    parser.add_argument('-o', '--output', type=str, required=True,
                        dest='outfolder',
                        help='path to folder where model should be written')
    args = parser.parse_args()
    infolder = args.infolder
    outfolder = args.outfolder
    dataset = ProfilingDataset(infolder)
    print('Loaded {} users...\n'.format(len(dataset.entries)))
    # get config
    config = dataset.config
    tasks = config.tasks
    print('\n--------------- Thy time of Running ---------------')
    all_models = {}
    for task in tasks:
        print('Learning to judge %s..' % task)
        # load data
        X, y = dataset.get_data(task)
        tictac = from_recipe(config.recipes[task])
        all_models[task] = tictac.fit(X, y)
    modelfile = os.path.join(outfolder, '%s.bin' % dataset.lang)
    print('Writing model to {}'.format(modelfile))
    joblib.dump(all_models, modelfile, compress=3)
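    # --- Sketch: an optional round-trip check that the dump restores one
    # fitted estimator per task; joblib and modelfile are the ones used
    # just above, nothing new is assumed ---
    restored = joblib.load(modelfile)
    for task_name in restored:
        print('restored {} model: {}'.format(
            task_name, type(restored[task_name]).__name__))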