def do_iteration(i):
    """Run one seeded cross-validation pass of the multi-label CRF.

    Reads the module-level names ``train``, ``test``, ``seeds``, ``labels``,
    ``cv_folds``, ``binary``, ``vectorizer_method`` and ``log``.

    Parameters
    ----------
    i
        Zero-based iteration index; selects ``seeds[i]`` as the RNG seed.

    Returns
    -------
    tuple
        ``(validation_stats, testing_stats)`` -- two ``Statistics`` objects
        into which the results of every fold have been merged.
    """
    print("Iteration " + str(i+1))

    # Work on deep copies so the module-level frames are left untouched.
    dev_frame = train.copy(deep=True)
    test_frame = test.copy(deep=True)

    # One generator per iteration: it drives both the stratification and
    # the row shuffles below.
    rng = np.random.RandomState(seeds[i])
    fold_assignment = iterative_stratification(dev_frame, labels, cv_folds, rng)

    # Accumulators for this iteration.
    validation_totals = Statistics()
    testing_totals = Statistics()

    for train_idx, valid_idx in cv_iterator(fold_assignment):
        # Shuffle row order; the training permutation is drawn before the
        # validation one, so the sequence of RNG draws is fixed.
        train_part = dev_frame.loc[train_idx, ]
        train_part = train_part.reindex(rng.permutation(train_part.index))
        valid_part = dev_frame.loc[valid_idx, ]
        valid_part = valid_part.reindex(rng.permutation(valid_part.index))

        fold_valid_stats, fold_test_stats = multi_label_crf(
            labels=labels,
            df_train=train_part,
            df_valid=valid_part,
            df_test=test_frame,
            binary=binary,
            connectivity='full',
            vectorizer_method=vectorizer_method,
        )
        validation_totals.merge(fold_valid_stats)
        testing_totals.merge(fold_test_stats)

    # NOTE(review): the log header uses the zero-based index while the
    # console message above prints i + 1 -- confirm this is intended.
    log.write('Iteration {}\n'.format(i))
    for totals in (validation_totals, testing_totals):
        totals.write(log, 'a')

    return validation_totals, testing_totals
def do_iteration(data):
    """Run one seeded cross-validation iteration of the external LLDA model.

    For every fold the training, validation and test frames are written as
    TSV files, the Java LLDA tool is run on them, and the probabilities it
    leaves under ``llda/results/`` are scored label by label with
    ``evaluate_model``.

    Reads the module-level names ``train``, ``test``, ``seeds``, ``labels``,
    ``cv_folds``, ``balanced`` and ``threshold``.

    Parameters
    ----------
    data : sequence
        Only ``data[0]`` is used: the zero-based iteration index. It selects
        ``seeds[i]`` and, as ``i + 1``, names the output directories/files.

    Returns
    -------
    tuple
        ``(validation_stats, testing_stats)`` -- two ``Statistics`` objects
        into which the results of every fold have been merged.
    """
    i = data[0]
    print("Iteration " + str(i+1))

    train_i = train.copy(deep=True)
    rng = np.random.RandomState(seeds[i])

    su_make_dir('llda/models/iteration-{}'.format(i+1))
    folds_i = iterative_stratification(train_i, labels, cv_folds, rng)

    # Every choice of (cv_folds - 1) folds is a training set; the single
    # fold left out is the validation set.
    fold_splits = itertools.combinations(range(0, cv_folds), cv_folds-1)

    # Accumulators for this iteration.
    validation_stats_i = Statistics()
    testing_stats_i = Statistics()

    for n, train_fold_ids in enumerate(fold_splits):
        su_make_dir('llda/models/iteration-{}/fold-{}'.format(i+1, n+1))
        file_path = 'llda/models/iteration-{}/fold-{}/'.format(i+1, n+1)

        training_folds_j = folds_i[list(train_fold_ids)]
        validation_fold_j = folds_i[
            [f for f in range(0, cv_folds) if f not in train_fold_ids]
        ]
        assert len(validation_fold_j) == 1

        # A single concat over all training folds replaces the pairwise
        # reduce(), which re-copied the growing frame on every step.
        training_fold = pd.concat(
            list(training_folds_j), ignore_index=True, copy=True
        )
        validation_fold = validation_fold_j[0]

        # NOTE(review): nesting reconstructed from whitespace-collapsed
        # code; confirm that only the pre-shuffle and bias reduction are
        # conditional on `balanced`.
        if balanced:
            training_fold = training_fold.reindex(
                rng.permutation(training_fold.index))
            training_fold = prep.reduce_label_bias(
                training_fold, labels, 'activation', 5, random_state=rng)

        # shuffle the folds
        training_fold = training_fold.reindex(
            rng.permutation(training_fold.index))
        validation_fold = validation_fold.reindex(
            rng.permutation(validation_fold.index))

        write_tsv(training_fold, file_path + '/train.tsv', test=False)
        write_tsv(validation_fold, file_path + '/valid.tsv', test=True)
        write_tsv(test, file_path + '/test.tsv', test=True)

        # ------------------ CALL JAVA TO RUN LLDA ON THIS FOLD ------------- #
        args = [
            'java', '-jar', '-Xmx2048m', 'llda/tmt-0.4.0.jar',
            'llda/llda.scala', file_path,
            '/train.tsv', '/valid.tsv', '/test.tsv',
            'model-{}-{}'.format(i+1, n+1), '{}-{}'.format(i+1, n+1)
        ]
        # The null-device handle is closed as soon as the child exits;
        # previously one handle was opened per fold and never closed.
        with open(os.devnull, 'w') as devnull:
            process = Popen(args, stdout=devnull, stderr=STDOUT)
            # NOTE(review): the exit status is not checked, so a failed run
            # only surfaces when the result files are read below.
            process.wait()

        # Perform evaluation. The first column of each result file is
        # dropped (presumably a row identifier -- TODO confirm).
        validation_proba = np.genfromtxt(
            'llda/results/validation-{}-{}.csv'.format(i+1, n+1),
            delimiter=','
        )[:, 1:]
        test_proba = np.genfromtxt(
            'llda/results/test-{}-{}.csv'.format(i+1, n+1),
            delimiter=','
        )[:, 1:]

        # Label order as stored with the trained model; column k of the
        # probability arrays is scored against labels_j[k].
        labels_j = get_labels_from_model(
            file_path + '/llda-cvb0-model-{}-{}'.format(1+i, n+1))

        validation_stats_i_j = Statistics()
        testing_stats_i_j = Statistics()

        for l_index, label in enumerate(labels_j):
            valid_scores = validation_proba[:, l_index]
            test_scores = test_proba[:, l_index]

            validation_stats_i_j = evaluate_model(
                y=list(validation_fold[label].values),
                y_pred=[int(s >= threshold) for s in valid_scores],
                y_pred_prob=[[1-s, s] for s in valid_scores],
                label=label,
                statistics=validation_stats_i_j,
                verbose=0
            )
            testing_stats_i_j = evaluate_model(
                y=list(test[label].values),
                y_pred=[int(s >= threshold) for s in test_scores],
                y_pred_prob=[[1-s, s] for s in test_scores],
                label=label,
                statistics=testing_stats_i_j,
                verbose=0
            )

        validation_stats_i.merge(validation_stats_i_j)
        testing_stats_i.merge(testing_stats_i_j)

    return validation_stats_i, testing_stats_i