Esempio n. 1
0
    def do_iteration(i):
        """Run cross-validation repetition ``i`` of the multi-label CRF.

        Deep-copies ``train`` and ``test``, splits the training copy into
        folds with ``iterative_stratification`` using a generator seeded from
        ``seeds[i]``, and calls ``multi_label_crf`` once per train/validation
        split produced by ``cv_iterator``. The per-split statistics are merged
        into one validation container and one testing container, which are
        then written to ``log``.

        Relies on names from the enclosing scope: ``train``, ``test``,
        ``seeds``, ``labels``, ``cv_folds``, ``binary``,
        ``vectorizer_method`` and ``log``.

        Args:
            i: Zero-based repetition index, used to look up ``seeds[i]``.

        Returns:
            A ``(validation_stats, testing_stats)`` tuple of ``Statistics``
            objects accumulated over all splits.
        """
        print("Iteration " + str(i+1))
        development = train.copy(deep=True)
        held_out = test.copy(deep=True)

        # Seeding at construction yields the same stream as constructing an
        # unseeded RandomState and then calling seed() on it.
        rng = np.random.RandomState(seeds[i])
        folds = iterative_stratification(development, labels, cv_folds, rng)

        def shuffled(frame):
            # Reorder the rows by a random permutation of the frame's own index.
            return frame.reindex(rng.permutation(frame.index))

        # One statistics container per data split for this repetition.
        validation_total = Statistics()
        testing_total = Statistics()

        for train_idx, valid_idx in cv_iterator(folds):
            fold_train = shuffled(development.loc[train_idx, ])
            fold_valid = shuffled(development.loc[valid_idx, ])

            fold_valid_stats, fold_test_stats = multi_label_crf(
                labels=labels,
                df_train=fold_train,
                df_valid=fold_valid,
                df_test=held_out,
                binary=binary,
                connectivity='full',
                vectorizer_method=vectorizer_method
            )

            validation_total.merge(fold_valid_stats)
            testing_total.merge(fold_test_stats)

        # NOTE(review): the console message above is 1-based while this log
        # header uses the zero-based index -- confirm whether that is intended.
        log.write('Iteration {}\n'.format(i))
        for collected in (validation_total, testing_total):
            collected.write(log, 'a')
        return validation_total, testing_total
Esempio n. 2
0
    def do_iteration(data):
        """Run one repetition of cross-validated LLDA training and evaluation.

        For every choice of ``cv_folds - 1`` training folds (the remaining
        fold is used for validation) this writes ``train.tsv``, ``valid.tsv``
        and ``test.tsv`` under ``llda/models/iteration-<i+1>/fold-<n+1>/``,
        runs ``llda/llda.scala`` through ``llda/tmt-0.4.0.jar`` in a Java
        subprocess, reads the resulting probability CSVs from
        ``llda/results/`` and scores each label with ``evaluate_model``.

        Relies on names from the enclosing scope: ``train``, ``test``,
        ``seeds``, ``labels``, ``cv_folds``, ``balanced`` and ``threshold``.

        Args:
            data: Indexable whose first element is the zero-based repetition
                index ``i``; only ``data[0]`` is read.

        Returns:
            A ``(validation_stats, testing_stats)`` tuple of ``Statistics``
            objects merged over all folds of this repetition.

        Raises:
            RuntimeError: If the Java subprocess exits with a non-zero status.
                Without this check a failed run could go unnoticed and result
                files left over from an earlier run would be evaluated.
        """
        i = data[0]
        print("Iteration " + str(i+1))
        train_i = train.copy(deep=True)
        rng = np.random.RandomState()
        rng.seed(seeds[i])
        su_make_dir('llda/models/iteration-{}'.format(i+1))
        folds_i = iterative_stratification(train_i, labels, cv_folds, rng)
        combinations = itertools.combinations(range(0, cv_folds), cv_folds-1)

        # Create the appropriate statistics container for this iteration.
        validation_stats_i = Statistics()
        testing_stats_i = Statistics()

        for n, j in enumerate(combinations):
            # Identifier shared by the model name and the result file names.
            run_id = '{}-{}'.format(i+1, n+1)
            su_make_dir('llda/models/iteration-{}/fold-{}'.format(i+1, n+1))
            file_path = 'llda/models/iteration-{}/fold-{}/'.format(i+1, n+1)

            # `j` holds the indices of the training folds; whatever index is
            # left over is the single validation fold.
            training_folds_j = folds_i[list(j)]
            validation_fold_j = folds_i[[f for f in range(0, cv_folds) if f not in j]]
            assert len(validation_fold_j) == 1

            training_fold = reduce(
                lambda x, y: pd.concat([x, y], ignore_index=True, copy=True),
                training_folds_j[1:],
                training_folds_j[0]
            )
            validation_fold = validation_fold_j[0]

            # Shuffle the folds. When balancing is requested the training data
            # is shuffled once before being passed to reduce_label_bias and
            # once more afterwards.
            if balanced:
                training_fold = training_fold.reindex(rng.permutation(training_fold.index))
                training_fold = prep.reduce_label_bias(training_fold, labels, 'activation', 5, random_state=rng)
            training_fold = training_fold.reindex(rng.permutation(training_fold.index))
            validation_fold = validation_fold.reindex(rng.permutation(validation_fold.index))

            write_tsv(training_fold, file_path + '/train.tsv', test=False)
            write_tsv(validation_fold, file_path + '/valid.tsv', test=True)
            write_tsv(test, file_path + '/test.tsv', test=True)

            # ------------------ Run LLDA in a Java subprocess ----------------- #
            args = [
                'java',
                '-jar',
                '-Xmx2048m',
                'llda/tmt-0.4.0.jar',
                'llda/llda.scala',
                file_path,
                '/train.tsv',
                '/valid.tsv',
                '/test.tsv',
                'model-{}'.format(run_id),
                run_id
            ]
            # The context manager closes the null device after every fold;
            # previously one file handle was leaked per fold.
            with open(os.devnull, 'w') as devnull:
                process = Popen(args, stdout=devnull, stderr=STDOUT)
                return_code = process.wait()
            if return_code != 0:
                raise RuntimeError(
                    'LLDA subprocess exited with status {} for run {}'.format(return_code, run_id)
                )

            # Perform evaluation. The first CSV column is dropped; the
            # remaining columns are indexed in the order of `labels_j` below.
            validation_proba = np.genfromtxt('llda/results/validation-{}.csv'.format(run_id), delimiter=',')[:, 1:]
            test_proba = np.genfromtxt('llda/results/test-{}.csv'.format(run_id), delimiter=',')[:, 1:]
            labels_j = get_labels_from_model(file_path + '/llda-cvb0-model-{}'.format(run_id))

            validation_stats_i_j = Statistics()
            testing_stats_i_j = Statistics()
            for label_index, label in enumerate(labels_j):
                validation_column = validation_proba[:, label_index]
                test_column = test_proba[:, label_index]

                y_validation = list(validation_fold[label].values)
                y_proba_validation = [[1-prob, prob] for prob in validation_column]
                y_pred_validation = [int(prob >= threshold) for prob in validation_column]

                y_hprd = list(test[label].values)
                y_proba_hprd = [[1-prob, prob] for prob in test_column]
                y_pred_hprd = [int(prob >= threshold) for prob in test_column]

                validation_stats_i_j = evaluate_model(
                    y=y_validation,
                    y_pred=y_pred_validation,
                    y_pred_prob=y_proba_validation,
                    label=label,
                    statistics=validation_stats_i_j,
                    verbose=0
                )

                testing_stats_i_j = evaluate_model(
                    y=y_hprd,
                    y_pred=y_pred_hprd,
                    y_pred_prob=y_proba_hprd,
                    label=label,
                    statistics=testing_stats_i_j,
                    verbose=0
                )
            validation_stats_i.merge(validation_stats_i_j)
            testing_stats_i.merge(testing_stats_i_j)

        return validation_stats_i, testing_stats_i