def do_iteration(i):
    """Run cross-validation iteration *i*: stratify the dev set into folds,
    fit a multi-label CRF per fold, and return the merged validation and
    test Statistics containers for this iteration."""
    print("Iteration " + str(i+1))
    dev_df = train.copy(deep=True)
    test_df = test.copy(deep=True)
    # Seed directly through the constructor so each iteration is reproducible.
    rng = np.random.RandomState(seeds[i])
    folds_i = iterative_stratification(dev_df, labels, cv_folds, rng)

    # Fresh statistics containers for this iteration.
    validation_stats_i = Statistics()
    testing_stats_i = Statistics()

    for train_idx, valid_idx in cv_iterator(folds_i):
        training_fold = dev_df.loc[train_idx, ]
        validation_fold = dev_df.loc[valid_idx, ]

        # Shuffle both folds so training sees no ordering artifacts.
        training_fold = training_fold.reindex(rng.permutation(training_fold.index))
        validation_fold = validation_fold.reindex(rng.permutation(validation_fold.index))

        stats_valid, stats_test = multi_label_crf(
            labels=labels,
            df_train=training_fold,
            df_valid=validation_fold,
            df_test=test_df,
            binary=binary,
            connectivity='full',
            vectorizer_method=vectorizer_method
        )
        validation_stats_i.merge(stats_valid)
        testing_stats_i.merge(stats_test)

    log.write('Iteration {}\n'.format(i))
    validation_stats_i.write(log, 'a')
    testing_stats_i.write(log, 'a')
    return validation_stats_i, testing_stats_i
# Collapsed one-line fragment spanning several nesting levels whose enclosing
# headers are not visible here; kept byte-identical below. What it does:
#   1) Tail of the per-fold worker: stacks the per-label test targets and
#      predicted probabilities (np.vstack over zip(*...)), evaluates them at
#      `threshold` via multi_label_evaluate, and returns the three per-fold
#      Statistics objects (training, validation, testing).
#      NOTE(review): np.vstack over a bare zip iterator relies on NumPy
#      accepting a non-sequence; wrap in list(...) for NumPy >= 1.16 — confirm
#      before upgrading the NumPy dependency.
#   2) The folds are dispatched in parallel via
#      parallel_map(do_fold, range(cv_folds), n_jobs); each (train, val, test)
#      result is merged into the per-iteration containers, the three containers
#      are written to `log`, and the triple is appended to statistics_objects.
#   3) After the iteration loop (loop header not visible in this fragment),
#      the per-iteration triples are split into train/valid/test lists and the
#      training containers merged into the aggregate `training_stats`.
#      NOTE(review): only the training merge appears on this line — the valid
#      and test merges presumably follow; verify against the rest of the file.
y = np.vstack(zip(*label_y_test)) y_prob = np.vstack(zip(*label_pred_proba_test)) testing_stats_i_f.merge(multi_label_evaluate(y, y_prob, threshold)) return training_stats_i_f, validation_stats_i_f, testing_stats_i_f # For each iteration, batch the folds into parallel jobs statistics_objects_i = parallel_map(do_fold, range(cv_folds), n_jobs) for (train, val, test) in statistics_objects_i: training_stats_i.merge(train) validation_stats_i.merge(val) testing_stats_i.merge(test) log.write('Iteration {}\n'.format(i)) log.write('Training {}\n'.format(i)) training_stats_i.write(log, 'a') log.write('Validation {}\n'.format(i)) validation_stats_i.write(log, 'a') log.write('Testing {}\n'.format(i)) testing_stats_i.write(log, 'a') statistics_objects.append([training_stats_i, validation_stats_i, testing_stats_i]) # return training_stats_i, validation_stats_i, testing_stats_i # containers = parallel_map(do_iteration, range(iterations), n_jobs=n_jobs) train_containers = [statistics_objects[i][0] for i in range(iterations)] valid_containers = [statistics_objects[i][1] for i in range(iterations)] test_containers = [statistics_objects[i][2] for i in range(iterations)] for train in train_containers: training_stats.merge(train)
# Fold the per-iteration test containers into the aggregate test statistics.
for container in test_containers:
    testing_stats.merge(container)
# NOTE(review): the analogous merge for valid_containers was missing in the
# original even though validation_stats is pickled and reported below —
# restored here so the validation report is not empty.
for container in valid_containers:
    validation_stats.merge(container)

# --------------------- FINAL RESULTS ---------------------------- #
direc = tempfile.mkdtemp(prefix='LLDA-{}-'.format(date), dir='results/')

# pickle requires a binary-mode handle under Python 3 — open(..., 'w') makes
# pickle.dump raise TypeError; the with-block also closes the handle, which
# the original leaked.
with open(direc + '/LLDA-statistics.pkl', 'wb') as pkl_fp:
    pickle.dump((validation_stats, testing_stats, config), pkl_fp)

# Human-readable run summary; context manager guarantees flush + close.
with open(direc + '/LLDA-results.txt', 'w') as results:
    results.write("\nRun Settings: \n")
    results.write("\tDate: \t\t\t\t{0}\n".format(date))
    results.write("\tMethod: \t\t\t{0}\n".format('L-LDA'))
    results.write("\tBinary: \t\t\t{0}\n".format('NA'))
    results.write("\tBalanced: \t\t\t{0}\n".format(balanced))
    results.write("\tChained: \t\t\t{0}\n".format('NA'))
    results.write("\tInduced: \t\t\t{0}\n".format(induce))
    results.write("\tIterations:\t\t\t{0}\n".format(iterations))
    results.write("\tFolds:\t\t\t\t{0}\n".format(cv_folds))
    results.write("\tSelection:\t\t\t{0}\n".format(selection))
    results.write("\tVectorizer:\t\t\t{0}\n".format('NA'))

    # Computed but never written in the visible code — presumably intended
    # for the report; TODO confirm against the full file before removing.
    n_samples_train = len(train)
    n_samples_test = len(test)

    results.write("\nValidation performance:\n")
    validation_stats.write(results, mode='a')
    results.write("\nTest performance:\n")
    testing_stats.write(results, mode='a')

log.close()