Example #1
    def do_iteration(i):
        print("Iteration " + str(i+1))
        # Work on deep copies so the shared train/test DataFrames are never
        # mutated across iterations, and seed a per-iteration RNG so each
        # repetition is reproducible.
        dev_df = train.copy(deep=True)
        test_df = test.copy(deep=True)
        rng = np.random.RandomState(seeds[i])
        folds_i = iterative_stratification(dev_df, labels, cv_folds, rng)

        # Create the appropriate statistics container for this iteration.
        validation_stats_i = Statistics()
        testing_stats_i = Statistics()

        for train_idx, valid_idx in cv_iterator(folds_i):
            training_fold = dev_df.loc[train_idx]
            validation_fold = dev_df.loc[valid_idx]

            # Shuffle the rows within each fold so training order is random.
            training_fold = training_fold.reindex(rng.permutation(training_fold.index))
            validation_fold = validation_fold.reindex(rng.permutation(validation_fold.index))

            # Fit the CRF on the training fold, scoring both the held-out
            # validation fold and the untouched test set.
            stats_valid, stats_test = multi_label_crf(
                labels=labels,
                df_train=training_fold,
                df_valid=validation_fold,
                df_test=test_df,
                binary=binary,
                connectivity='full',
                vectorizer_method=vectorizer_method
            )

            validation_stats_i.merge(stats_valid)
            testing_stats_i.merge(stats_test)

        log.write('Iteration {}\n'.format(i + 1))
        validation_stats_i.write(log, 'a')
        testing_stats_i.write(log, 'a')
        return validation_stats_i, testing_stats_i
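
The helpers iterative_stratification and cv_iterator above are project-specific and not shown. As a rough sketch of the interface the loop assumes (one disjoint index array per fold in, (train, validation) index pairs out), a hypothetical stand-in for cv_iterator might look like this; it is an assumption, not the project's implementation:

import numpy as np

def cv_iterator(folds):
    # Yield (train_idx, valid_idx) pairs, holding out one fold at a time.
    # `folds` is assumed to be a list of disjoint index arrays, e.g. the
    # output of iterative_stratification in the example above.
    for k, valid_idx in enumerate(folds):
        train_idx = np.concatenate([f for j, f in enumerate(folds) if j != k])
        yield train_idx, valid_idx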
Example #2
            # Stack per-label outputs into (n_samples, n_labels) arrays.
            # np.vstack needs a concrete sequence, so materialise the zip
            # iterator with list() under Python 3.
            y = np.vstack(list(zip(*label_y_test)))
            y_prob = np.vstack(list(zip(*label_pred_proba_test)))
            testing_stats_i_f.merge(multi_label_evaluate(y, y_prob, threshold))

            return training_stats_i_f, validation_stats_i_f, testing_stats_i_f

        # For each iteration, batch the folds into parallel jobs
        statistics_objects_i = parallel_map(do_fold, range(cv_folds), n_jobs)
        for train_stats, val_stats, test_stats in statistics_objects_i:
            # Names chosen so as not to shadow the train/test DataFrames.
            training_stats_i.merge(train_stats)
            validation_stats_i.merge(val_stats)
            testing_stats_i.merge(test_stats)

        log.write('Iteration {}\n'.format(i))
        log.write('Training {}\n'.format(i))
        training_stats_i.write(log, 'a')
        log.write('Validation {}\n'.format(i))
        validation_stats_i.write(log, 'a')
        log.write('Testing {}\n'.format(i))
        testing_stats_i.write(log, 'a')

        statistics_objects.append([training_stats_i, validation_stats_i, testing_stats_i])
        # return training_stats_i, validation_stats_i, testing_stats_i

    # containers = parallel_map(do_iteration, range(iterations), n_jobs=n_jobs)
    train_containers = [statistics_objects[i][0] for i in range(iterations)]
    valid_containers = [statistics_objects[i][1] for i in range(iterations)]
    test_containers = [statistics_objects[i][2] for i in range(iterations)]

    for train in train_containers:
        training_stats.merge(train)
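
parallel_map in the example above is another project helper. Assuming the signature parallel_map(func, iterable, n_jobs) and order-preserving results, a minimal joblib-based equivalent could be:

from joblib import Parallel, delayed

def parallel_map(func, iterable, n_jobs=1):
    # Apply func to every item, fanning the calls out across n_jobs worker
    # processes; joblib returns the results in input order.
    return Parallel(n_jobs=n_jobs)(delayed(func)(item) for item in iterable)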
Example #3
    for container in test_containers:
        testing_stats.merge(container)

    # --------------------- FINAL RESULTS ---------------------------- #
    direc = tempfile.mkdtemp(prefix='LLDA-{}-'.format(date), dir='results/')
    # pickle requires binary mode ('wb'); text mode fails under Python 3.
    with open(direc + '/LLDA-statistics.pkl', 'wb') as handle:
        pickle.dump((validation_stats, testing_stats, config), handle)
    results = open(direc + '/LLDA-results.txt', 'w')

    results.write("\nRun Settings: \n")
    results.write("\tDate: \t\t\t\t{0}\n".format(date))
    results.write("\tMethod: \t\t\t{0}\n".format('L-LDA'))
    results.write("\tBinary: \t\t\t{0}\n".format('NA'))
    results.write("\tBalanced: \t\t\t{0}\n".format(balanced))
    results.write("\tChained: \t\t\t{0}\n".format('NA'))
    results.write("\tInduced: \t\t\t{0}\n".format(induce))
    results.write("\tIterations:\t\t\t{0}\n".format(iterations))
    results.write("\tFolds:\t\t\t\t{0}\n".format(cv_folds))
    results.write("\tSelection:\t\t\t{0}\n".format(selection))
    results.write("\tVectorizer:\t\t\t{0}\n".format('NA'))
    n_samples_train = len(train)
    n_samples_test = len(test)
    results.write("\tTraining samples:\t{0}\n".format(n_samples_train))
    results.write("\tTesting samples:\t{0}\n".format(n_samples_test))

    results.write("\nValidation performance:\n")
    validation_stats.write(results, mode='a')

    results.write("\nTest performance:\n")
    testing_stats.write(results, mode='a')
    results.close()
    log.close()
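
All three examples accumulate results in a Statistics container with merge and write methods whose implementation is not shown. Below is a minimal sketch consistent with the call sites above (merging per-fold scores and appending a summary to an already-open file handle); the internal layout is an assumption, not the project's class:

from collections import defaultdict
import numpy as np

class Statistics(object):
    # Hypothetical metric container: accumulates named per-fold scores.

    def __init__(self):
        self._scores = defaultdict(list)

    def update(self, name, value):
        self._scores[name].append(value)

    def merge(self, other):
        # Fold another container's scores into this one.
        for name, values in other._scores.items():
            self._scores[name].extend(values)

    def write(self, handle, mode='a'):
        # `mode` mirrors the call sites; the handle is already open, so the
        # argument is accepted for interface compatibility only.
        for name in sorted(self._scores):
            values = self._scores[name]
            handle.write('{}\t{:.4f} +/- {:.4f}\n'.format(
                name, np.mean(values), np.std(values)))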