Example #1
    def xval(self, datafp):
        """Run a cross-validation test on the given data.

        Keyword Arguments
        :param datafp: the system filepath of the CSV dataset
        """
        logger.info(f'BAYESIAN SEARCH on the model "{self.model}" '
                    f'for {self.evals} parameter combinations '
                    f'with {self.repeat} repetition(s)')
        datafp = os.path.join(self.path, datafp)
        self.x_arys, self.y_arys = load_data(datafp,
                                             target='all',
                                             profile=False)
        self.x_arys_p, self.y_arys_p = load_data(datafp,
                                                 target='all',
                                                 profile=True)

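        # hyperopt TPE ("Bayesian") search: _run_xval is the objective to
        # minimise; it is evaluated max_evals times over the parameter space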
        trials = Trials()
        best = fmin(self._run_xval,
                    space=get_param_dists(self.model),
                    algo=tpe.suggest,
                    max_evals=self.evals,
                    trials=trials,
                    verbose=1)

        with open(f'{self.basename}.pkl', 'wb') as f:
            pickle.dump(trials, f)
        logger.info(f'the best parameters are: {best}')
        logger.info(f'f1 of the best trial: {trials.best_trial["result"]}')

    def fixedf(self, trainfp, testfp):
        """Run a standard train/test cycle on the given data.

        Keyword Arguments
        :param trainfp: the system filepath of the CSV training dataset
        :param testfp: the system filepath of the CSV testing dataset
        :return: macro-F score average and std-dev
        """
        logger.info(f'training {self.model} for {self.repeat} iterations')
        trainfp = os.path.join(self.path, trainfp)
        testfp = os.path.join(self.path, testfp)
        fmacro_list = []
        x_train_arys, y_train_arys = \
            load_data(trainfp, target=self.target, profile=self.profile)
        x_test_arys, y_test_arys = \
            load_data(testfp, target=self.target, profile=self.profile)
        for i in range(self.repeat):
            if self.repeat > 1:
                logger.info(f'iteration: {i+1}')
            fmacro = run_fixed_fast(self.model,
                                    x_train_arys, y_train_arys,
                                    x_test_arys, y_test_arys,
                                    self.wvfp, self.profile,
                                    params=self.params)
            fmacro_list.append(fmacro)
        average = np.average(fmacro_list)
        stdev = np.std(fmacro_list)
        logger.info(f'total iterations: {self.repeat}; '
                    f'fmacro average: {average:.4}; '
                    f'fmacro std dev: {stdev:.4}'
                    )
        return average, stdev
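
Both searches above hand hyperopt's fmin an objective (self._run_xval here, self._run_fixed in the next example) that is not shown on this page. For reference, a minimal sketch of the shape hyperopt expects such an objective to take; the score_params helper and the negated macro-F1 loss are assumptions, not part of the original code:

from hyperopt import STATUS_OK

def _run_xval(self, params):
    # hyperopt minimises its objective, so return the negated macro-F1 as loss
    f1 = score_params(self.model, self.x_arys, self.y_arys, params)  # hypothetical scorer
    return {'loss': -f1, 'status': STATUS_OK}
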
Example #3
    def fixed(self, trainfp, testfp):
        """Run a standard train/test cycle on the given data.

        Keyword Arguments
        :param trainfp: the system filepath of the CSV training dataset
        :param testfp: the system filepath of the CSV testing dataset
        """
        logger.info(f'BAYESIAN SEARCH on the model "{self.model}" '
                    f'for {self.evals} parameter combinations '
                    f'with {self.repeat} repetition(s)')

        trainfp = os.path.join(self.path, trainfp)
        testfp = os.path.join(self.path, testfp)
        self.x_train_arys, self.y_train_arys = \
            load_data(trainfp, target='all', profile=False)
        self.x_train_arys_p, self.y_train_arys_p = \
            load_data(trainfp, target='all', profile=True)
        self.x_test_arys, self.y_test_arys = \
            load_data(testfp, target='all', profile=False)
        self.x_test_arys_p, self.y_test_arys_p = \
            load_data(testfp, target='all', profile=True)

        trials = Trials()
        best = fmin(self._run_fixed,
                    space=get_param_dists(self.model),
                    algo=tpe.suggest,
                    max_evals=self.evals,
                    trials=trials,
                    verbose=1)

        with open(f'{self.basename}.pkl', 'wb') as f:
            pickle.dump(trials, f)
        logger.info(f'the best parameters are: {best}')
        logger.info(f'f1 of the best trial: {trials.best_trial["result"]}')

    def xval(self, datafp):
        """Run a cross-validation test on the given data.

        Keyword Arguments
        :param datafp: the system filepath of the CSV dataset
        """
        if self.rand:
            searchname = f'RANDOM SEARCH on the model "{self.model}" '
        else:
            searchname = f'Exhaustive GRID SEARCH on the model "{self.model}" '
        logger.info(
            searchname +
            f'for {len(self.param_grid)} parameter combinations '
            f'with {self.repeat} repetition(s)')
        datafp = os.path.join(self.path, datafp)
        x_arys, y_arys = load_data(
            datafp, target='all', profile=False)
        x_arys_p, y_arys_p = load_data(
            datafp, target='all', profile=True)
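        # sweep every combination in the pre-built parameter grid, skipping
        # any that filter_useless_combs flags as redundant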
        for params in self.param_grid:
            if filter_useless_combs(params):
                continue
            logger.info(str(params))
            profile = params.pop('profile', None)
            if profile:
                fmacro_list = run_xval(self.model, x_arys_p, y_arys_p,
                                       self.wvfp, profile,
                                       cv=self.cv, params=params)
            else:
                fmacro_list = run_xval(self.model, x_arys, y_arys,
                                       self.wvfp, profile,
                                       cv=self.cv, params=params)
            self._result2csvrow(params, fmacro_list)
        self._csvf.close()

    def xval(self, datafp, outfp):
        """Run a cross-validation test on the given data.

        Keyword Arguments
        :param datafp: the system filepath of the CSV dataset
        :param outfp: the system filepath to append the mean CV report to, in CSV format
        :return: macro-F score average and std-dev
        """
        logger.info(f'running {self.model} X-val for {self.repeat} iterations')
        datafp = os.path.join(self.path, datafp)
        x_arys, y_arys = load_data(
            datafp, target=self.target, profile=self.profile)
        fmacro_list, df_report = run_xval(self.model, x_arys, y_arys,
                                          self.wvfp, self.profile,
                                          cv=self.cv, params=self.params)
        average = np.average(fmacro_list)
        stdev = np.std(fmacro_list)
        logger.info(f'total CV iterations: {self.cv}; '
                    f'target-wise average: {average:.4}; '
                    f'target-wise std dev: {stdev:.4}')

        if outfp:
            df_mean = pd.DataFrame(df_report.mean()).transpose()
            with open(outfp, "a") as f:
                df_mean.to_csv(f, header=False, index=False)

        return average, stdev

    def fixed(self, trainfp, testfp):
        """Run a standard train/test cycle on the given data.

        Keyword Arguments
        :param trainfp: the system filepath of the CSV training dataset
        :param testfp: the system filepath of the CSV testing dataset
        """
        logger.info(
            f'Exhaustive GRID SEARCH on the model "{self.model}" '
            f'for {len(self.param_grid)} parameter combinations '
            f'with {self.repeat} repetition(s)')

        trainfp = os.path.join(self.path, trainfp)
        testfp = os.path.join(self.path, testfp)
        x_train_arys, y_train_arys = \
            load_data(trainfp, target='all', profile=False)
        x_train_arys_p, y_train_arys_p = \
            load_data(trainfp, target='all', profile=True)
        x_test_arys, y_test_arys = \
            load_data(testfp, target='all', profile=False)
        x_test_arys_p, y_test_arys_p = \
            load_data(testfp, target='all', profile=True)
        for params in self.param_grid:
            # TODO: error tolerance
            if filter_useless_combs(params):
                continue
            logger.info(str(params))
            profile = params.pop('profile', None)
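            # 'profile' is popped so it is not forwarded to the model as a
            # hyper-parameter; it only selects which pre-loaded arrays to use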
            fmacro_list: List[float] = []
            for i in range(self.repeat):
                if self.repeat > 1:
                    logger.info(f'iteration: {i+1}')
                if profile:
                    fmacro = run_fixed_fast(self.model,
                                            x_train_arys_p, y_train_arys_p,
                                            x_test_arys_p, y_test_arys_p,
                                            self.wvfp, profile,
                                            params=params)
                else:
                    fmacro = run_fixed_fast(self.model,
                                            x_train_arys, y_train_arys,
                                            x_test_arys, y_test_arys,
                                            self.wvfp, profile,
                                            params=params)
                fmacro_list.append(fmacro)
            self._result2csvrow(params, fmacro_list)
        self._csvf.close()

    def train(self, trainfp, testfp, outfp=None, combined=False):
        """Run a standard train/test cycle on the given data.

        Keyword Arguments
        :param trainfp: the system filepath of the CSV training dataset
        :param testfp: the system filepath of the CSV testing dataset
        :param outfp: the system filepath to output in CSV format
        :param combined: whether to combine different companies into the same training set
        :return: macro-F score average and std-dev
        """
        logger.info(f'training {self.model} for {self.repeat} iterations')
        trainfp = os.path.join(self.path, trainfp)
        testfp = os.path.join(self.path, testfp)
        fmacro_list = []
        x_train_arys, y_train_arys = \
            load_data(trainfp, target=self.target, profile=self.profile)
        x_test_arys, y_test_arys = \
            load_data(testfp, target=self.target, profile=self.profile)
        for i in range(self.repeat):
            if self.repeat > 1:
                logger.info(f'iteration: {i+1}')
            fmacro, df_report = run_train(self.model,
                                          x_train_arys, y_train_arys,
                                          x_test_arys, y_test_arys,
                                          self.wvfp, self.profile,
                                          params=self.params, combined=combined)
            fmacro_list.append(fmacro)
        average = np.average(fmacro_list)
        stdev = np.std(fmacro_list)
        logger.info(f'total iterations: {self.repeat}; '
                    f'fmacro average: {average:.4}; '
                    f'fmacro std dev: {stdev:.4}'
                    )

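        # attach the cross-iteration average to the last iteration's report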
        df_report['combined'] = pd.Series(average)
        if outfp:
            with open(outfp, "a") as f:
                df_report.to_csv(f, header=False, index=False)

        return average, stdev

# NOTE: assumes the project-local helpers load_data, emb_encode,
# one_hot_encode and the DynamicClassifier model class are in scope.
import time as t

import numpy as np
import tensorflow as tf  # TF 1.x graph-mode API


def main():
    vocabulary_input = "abcdefghijklmnopqrstuvwxyz"
    vocabulary_target = "0123"

    training_steps = 1000000
    display_step = 1000
    batch_size = 32
    vocab_size = 27
    input_embedded_size = 27
    output_classes = 4
    num_units = 1024
    num_layers = 2
    DROPOUT_KEEP_PROB_TRAIN = 1.0
    DROPOUT_KEEP_PROB_TEST = 1.0

    #file_name = "dataset_diff_lengths_test.txt"
    file_name = "datasetC.txt"
    #file_name = "dset.txt"
    ratio = np.array([0.8, 0.9])
    columns = np.array([1, 2])
    X_train, X_train_lengths, X_val, X_val_lengths, X_test, X_test_lengths, y_train, y_val, y_test, train_batch_sizes, test_batch_sizes = load_data(
        file_name, ratio, columns, batch_size)

    X_train_emb = emb_encode(X_train, vocabulary_input, train_batch_sizes,
                             batch_size)
    X_test_emb = emb_encode(X_test, vocabulary_input, test_batch_sizes,
                            batch_size)

    y_train_one_hot = one_hot_encode(y_train, vocabulary_target)
    y_train_one_hot = np.squeeze(y_train_one_hot, axis=1)
    y_test_one_hot = one_hot_encode(y_test, vocabulary_target)
    y_test_one_hot = np.squeeze(y_test_one_hot, axis=1)

    model = DynamicClassifier(vocab_size, input_embedded_size, output_classes,
                              num_units, num_layers)
    iteration = 0

    with tf.Session() as sess:

        sess.run(tf.global_variables_initializer())
        try:
            start = t.time()
            for step in range(1, training_steps + 1):
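                # slice the next minibatch out of the pre-embedded training
                # set; `iteration` is a rolling window over the whole array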
                batch_x = X_train_emb[batch_size * iteration:batch_size *
                                      (iteration + 1), :]
                batch_len_x = X_train_lengths[batch_size *
                                              iteration:batch_size *
                                              (iteration + 1)]
                batch_y = y_train_one_hot[batch_size * iteration:batch_size *
                                          (iteration + 1), :]
                iteration += 1
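                # wrap the window around once the training set is exhausted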
                if iteration >= (X_train_emb.shape[0] / batch_size) - 1:
                    iteration = 0

                err, _, acc = sess.run(
                    [model.error, model.train, model.accuracy],
                    feed_dict={
                        model.inputs: batch_x,
                        model.outputs: batch_y,
                        model.keep_prob: DROPOUT_KEEP_PROB_TRAIN,
                        model.sequence_length: batch_len_x
                    })

                if step % display_step == 0 or step == 1:

                    # Calculate batch loss and accuracy
                    err, acc = sess.run(
                        [model.error, model.accuracy],
                        feed_dict={
                            model.inputs: batch_x,
                            model.outputs: batch_y,
                            model.keep_prob: DROPOUT_KEEP_PROB_TRAIN,
                            model.sequence_length: batch_len_x
                        })
                    end = t.time()
                    print("Step " + str(step) + ", Minibatch Loss= " + \
                          "{:.4f}".format(err) + ", Training Accuracy= " + \
                          "{:.3f}".format(acc))
                    print("Epoch Time: ", end - start)
                    start = t.time()

        except KeyboardInterrupt:
            print('training interrupted')

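        # final evaluation on (at most) the first 5000 test sequences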
        batch_x = X_test_emb[0:5000, :]
        batch_len_x = X_test_lengths[0:5000]
        batch_y = y_test_one_hot[0:5000, :]
        acc = sess.run(model.accuracy,
                       feed_dict={
                           model.inputs: batch_x,
                           model.outputs: batch_y,
                           model.keep_prob: DROPOUT_KEEP_PROB_TEST,
                           model.sequence_length: batch_len_x
                       })
        print("Testing Accuracy:" + str(acc))