Example #1
import logging

from sklearn import svm
from sklearn.decomposition import TruncatedSVD
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
from sklearn.naive_bayes import GaussianNB

import utils


def question_j():
    logging.info("<Question J> Multiclass Classification")
    category = [
        'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'misc.forsale',
        'soc.religion.christian'
    ]
    train, test = utils.fetch_data(category)

    logging.info("Creating TFxIDF Vector Representations")
    train_idf = utils.model_data(train)
    test_idf = utils.model_data(test)

    logging.info("Performing LSI on TFxIDF Matrices")
    # apply LSI to the TFxIDF matrices: fit the SVD on the training data
    # only, then reuse the fitted transform on the test data
    svd = TruncatedSVD(n_components=50)
    train_lsi = svd.fit_transform(train_idf)
    test_lsi = svd.transform(test_idf)

    logging.info("TFxIDF Matrices Transformed")

    logging.info("Size of Transformed Training Dataset: {0}".format(
        train_lsi.shape))
    logging.info("Size of Transformed Testing Dataset: {0}".format(
        test_lsi.shape))

    clf_list = [
        OneVsOneClassifier(GaussianNB()),
        OneVsOneClassifier(svm.SVC(kernel='linear')),
        OneVsRestClassifier(GaussianNB()),
        OneVsRestClassifier(svm.SVC(kernel='linear'))
    ]
    clf_name = [
        'OneVsOneClassifier Naive Bayes', 'OneVsOneClassifier SVM',
        'OneVsRestClassifier Naive Bayes', 'OneVsRestClassifier SVM'
    ]

    # perform classification
    for clf, clf_n in zip(clf_list, clf_name):
        logging.info("Training {0} Classifier ".format(clf_n))
        clf.fit(train_lsi, train.target)
        logging.info("Testing {0} Classifier".format(clf_n))
        test_predicted = clf.predict(test_lsi)
        utils.calculate_stats(test.target, test_predicted)
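
utils.calculate_stats is project-local and not shown on this page; a minimal sketch of the (y_true, y_pred) classification variant used above, assuming it logs standard scikit-learn metrics (hypothetical, not the project's actual implementation):

import logging
from sklearn import metrics

def calculate_stats(y_true, y_pred):
    # hypothetical sketch: log accuracy, macro precision/recall, and the
    # confusion matrix for a multiclass prediction
    logging.info("Accuracy: {0}".format(metrics.accuracy_score(y_true, y_pred)))
    logging.info("Precision: {0}".format(
        metrics.precision_score(y_true, y_pred, average='macro')))
    logging.info("Recall: {0}".format(
        metrics.recall_score(y_true, y_pred, average='macro')))
    logging.info("Confusion Matrix:\n{0}".format(
        metrics.confusion_matrix(y_true, y_pred)))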
Example #2
def test_summary_rank(contest='558'):
    responses = utils.read_responses(f'{contest}-responses.csv.zip')
    utils.init_summary(responses)  # return value unused; the stored summary is read instead
    summary = utils.read_summary(f'{contest}_summary_LilUCB.csv')

    df = utils.calculate_stats(summary)

    # mean scores should be non-increasing in rank (ranks are 1-indexed)
    ranks = dict(zip(df['rank'], df['score']))
    for i in range(len(ranks) - 1):
        assert ranks[i + 1] >= ranks[i + 2]
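
For intuition, a toy stand-in for the calculate_stats output (hypothetical data) passes the same monotonicity check:

import pandas as pd

df = pd.DataFrame({'rank': [1, 2, 3], 'score': [0.9, 0.7, 0.7]})
ranks = dict(zip(df['rank'], df['score']))
for i in range(len(ranks) - 1):
    assert ranks[i + 1] >= ranks[i + 2]  # 0.9 >= 0.7 and 0.7 >= 0.7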
Example #3
import numpy as np


def test_calculate_stats(errors={
    'rank': 325,
    'score': 1e-10,
    'precision': 1e-10
}):
    # read the stored summary twice: `df` keeps the reference values, while
    # `summary` has the stats columns stripped and recomputed below
    df = utils.read_summary('536_summary_LilUCB.csv')
    summary = utils.read_summary('536_summary_LilUCB.csv')
    for key in errors:
        if key in summary:
            del summary[key]
    summary = utils.calculate_stats(summary)

    # each recomputed column should match the stored one within its tolerance
    for key, max_error in errors.items():
        error = np.abs(summary[key] - df[key])
        assert error.max() < max_error
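
The tolerance check reduces to an elementwise comparison; a toy version with hypothetical values:

import numpy as np

stored = np.array([0.50, 0.25, 0.10])
recomputed = np.array([0.50 + 5e-11, 0.25, 0.10 - 2e-11])
assert np.abs(recomputed - stored).max() < 1e-10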
Example #4
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, Normalizer

import utils


def main():
    # relabel the fine-grained newsgroup categories into six coarse classes
    classes = [
        computer_technologies, recreational_activity, science, miscellaneus,
        politics, religion
    ]
    all_categories = []
    i = 0
    rmap = {}
    for cnum, c in enumerate(classes):
        for category in c:
            all_categories.append(category)
            rmap[i] = cnum
            i += 1

    data = fetch_20newsgroups(subset='all', shuffle=True, random_state=42)
    data.target = list(map(lambda x: rmap[x], data.target))
    data_idf = utils.model_data(data, 'part6')

    # sweep the number of SVD components to find an effective dimensionality
    k = 6
    ds = range(2, 75)
    svd_metrics = []
    print("Varying Dimensions")
    for d in ds:
        print("Set d = ", d)
        svd = TruncatedSVD(n_components=d)
        poly = FunctionTransformer(np.log1p)
        normalizer = Normalizer(copy=False)
        svd_pipeline = make_pipeline(svd, poly, normalizer)
        X_SVD = svd_pipeline.fit_transform(data_idf)
        kmeans = KMeans(n_clusters=k).fit(X_SVD)
        svd_metrics.append(utils.calculate_stats(data.target, kmeans.labels_))

    metric_names = [
        'homogeneity_score', 'completeness_score', 'adjusted_rand_score',
        'adjusted_mutual_info_score'
    ]

    for i, metric_name in enumerate(metric_names):
        plt.plot(ds, list(map(lambda x: x[i], svd_metrics)), label=metric_name)
    plt.xlabel('Dimensions')
    plt.ylabel('Metric Value')
    plt.legend(loc='best')
    plt.savefig('plots/part6.png', format='png')
    plt.clf()
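
The four values collected per run line up with metric_names; a plausible sketch of this (labels_true, labels_pred) variant of utils.calculate_stats, assuming it wraps scikit-learn's clustering metrics (hypothetical):

from sklearn import metrics

def calculate_stats(labels_true, labels_pred):
    # hypothetical sketch returning the four metrics plotted above, in order
    return (metrics.homogeneity_score(labels_true, labels_pred),
            metrics.completeness_score(labels_true, labels_pred),
            metrics.adjusted_rand_score(labels_true, labels_pred),
            metrics.adjusted_mutual_info_score(labels_true, labels_pred))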
Example #5
    p_graph.calc_all_priorities()

    """
    bf = datetime.now()
    dcop2 = DcopAllocator(deepcopy(p_graph), logger)
    schedules2 = dcop2.allocate(deepcopy(robots), test=True)
    af = datetime.now()
    exec_time2 = (af - bf).total_seconds()
    
    ms, tt, st = utils.calculate_stats([schedules2])
    print "makespan: " + str(ms)
    print "time travelled: " + str(tt)
    print "tasks scheduled: " + str(st)
    print "exec time: " + str(exec_time2)
    utils.print_schedules([schedules2], 'DCOP2')
    """

    bf = datetime.now()
    dcop = DcopAllocator(deepcopy(p_graph), logger)
    schedules = dcop.allocate(deepcopy(robots))
    af = datetime.now()
    exec_time1 = (af - bf).total_seconds()

    ms, tt, st = utils.calculate_stats([schedules])
    print ("makespan: " + str(ms))
    print ("time travelled: " + str(tt))
    print ("tasks scheduled: " + str(st))
    print ("exec time: " + str(exec_time1))
    utils.print_schedules([schedules], 'DCOP')

    
def main(config):
    # create unique output directory for this model

    config['name'] = config['name'] + '-' + str(config['hidden_state_size'])
    if config['train_stride']:
        config['name'] = config['name'] + '-stride'
    if config['concat_labels']:
        config['name'] = config['name'] + '-concat_labels'
    if config['attention']:
        config['name'] = config['name'] + '-attention'
    if config['share_weights']:
        config['name'] = config['name'] + '-share_weights'

    config['name'] = config['name'] + '-' + config[
        'learning_rate_type'] + '-' + str(config['learning_rate'])

    timestamp = str(int(time.time()))
    config['model_dir'] = os.path.abspath(
        os.path.join(config['output_dir'], config['name'] + '-' + timestamp))
    os.makedirs(config['model_dir'])
    print('Writing checkpoints into {}'.format(config['model_dir']))

    # load the data, this requires that the *.npz files you downloaded from Kaggle be named `train.npz` and `valid.npz`
    data_train = load_data(config, 'train', config['train_stride'])
    data_valid = load_data(config, 'valid', config['eval_stride'])

    # TODO if you would like to do any preprocessing of the data, here would be a good opportunity
    stats = calculate_stats(data_train.input_)
    save_stats(stats)

    if config['normalize']:
        data_train.input_, _, _ = preprocess(data_train.input_)
        data_train.target, _, _ = preprocess(data_train.target)

        data_valid.input_, _, _ = preprocess(data_valid.input_)
        data_valid.target, _, _ = preprocess(data_valid.target)

        print('Post normalize samples shape: ', data_train.input_[0].shape)

    config['input_dim'] = data_train.input_[0].shape[-1]
    config['output_dim'] = data_train.target[0].shape[-1]

    # get input placeholders and get the model that we want to train
    seq2seq_model_class, placeholders = get_model_and_placeholders(config)

    # Create a variable that stores how many training iterations we performed.
    # This is useful for saving/storing the network
    global_step = tf.Variable(1, name='global_step', trainable=False)

    # create a training graph, this is the graph we will use to optimize the parameters
    with tf.name_scope('Training'):
        seq2seq_model = seq2seq_model_class(config,
                                            placeholders,
                                            mode='training')
        seq2seq_model.build_graph()
        print('created RNN model with {} parameters'.format(
            seq2seq_model.n_parameters))

        # configure learning rate
        if config['learning_rate_type'] == 'exponential':
            lr = tf.train.exponential_decay(
                config['learning_rate'],
                global_step=global_step,
                decay_steps=config['learning_rate_decay_steps'],
                decay_rate=config['learning_rate_decay_rate'],
                staircase=False)
            lr_decay_op = tf.identity(lr)
        elif config['learning_rate_type'] == 'linear':
            lr = tf.Variable(config['learning_rate'], trainable=False)
            lr_decay_op = lr.assign(
                tf.multiply(lr, config['learning_rate_decay_rate']))
        elif config['learning_rate_type'] == 'fixed':
            lr = config['learning_rate']
            lr_decay_op = tf.identity(lr)
        else:
            raise ValueError('learning rate type "{}" unknown.'.format(
                config['learning_rate_type']))
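        # Example (hypothetical values): with learning_rate=0.001,
        # decay_rate=0.95 and decay_steps=1000, the 'exponential' branch
        # decays the rate continuously every step (staircase=False), while
        # the 'linear' branch multiplies it by 0.95 each time the training
        # loop below runs lr_decay_op, i.e. every 1000 steps.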

        with tf.name_scope('Optimizer'):
            # TODO choose the optimizer you desire here and define `train_op`.
            # The loss is accessible through seq2seq_model.loss
            params = tf.trainable_variables()
            # use the decayed learning rate `lr` so the schedule above takes effect
            optimizer = tf.train.AdamOptimizer(lr)
            gradients = tf.gradients(seq2seq_model.loss, params)

            # clip the gradients to counter explosion
            clipped_gradients, _ = tf.clip_by_global_norm(
                gradients, config['gradient_clip'])

            # backprop
            train_op = optimizer.apply_gradients(zip(clipped_gradients,
                                                     params),
                                                 global_step=global_step)

    # create a graph for validation
    with tf.name_scope('Validation'):
        seq2seq_model_valid = seq2seq_model_class(config,
                                                  placeholders,
                                                  mode='validation')
        seq2seq_model_valid.build_graph()

    # Create summary ops for monitoring the training.
    # Each summary op annotates a node in the computational graph and collects data from it
    tf.summary.scalar('learning_rate', lr, collections=['training_summaries'])

    # Merge summaries used during training and reported after every step
    summaries_training = tf.summary.merge(
        tf.get_collection('training_summaries'))

    # create summary ops for monitoring the validation
    # caveat: we want to store the performance on the entire validation set, not just one validation batch.
    # TensorFlow does not directly support this, so we must process every batch independently and then
    # aggregate the results outside of the model. To do this, we create a placeholder through which we
    # can feed the aggregated result back into the model
    loss_valid_pl = tf.placeholder(tf.float32, name='loss_valid_pl')
    loss_valid_s = tf.summary.scalar('loss_valid',
                                     loss_valid_pl,
                                     collections=['validation_summaries'])

    # merge validation summaries
    summaries_valid = tf.summary.merge([loss_valid_s])

    # dump the config to the model directory in case we later want to see it
    export_config(config, os.path.join(config['model_dir'], 'config.txt'))

    with tf.Session() as sess:
        # Add the ops to initialize variables.
        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())
        # Actually initialize the variables
        sess.run(init_op)

        # create file writers to dump summaries onto disk so that we can look at them with tensorboard
        train_summary_dir = os.path.join(config['model_dir'], "summary",
                                         "train")
        train_summary_writer = tf.summary.FileWriter(train_summary_dir,
                                                     sess.graph)
        valid_summary_dir = os.path.join(config['model_dir'], "summary",
                                         "validation")
        valid_summary_writer = tf.summary.FileWriter(valid_summary_dir,
                                                     sess.graph)

        # create a saver for writing training checkpoints
        saver = tf.train.Saver(var_list=tf.trainable_variables(),
                               max_to_keep=config['n_keep_checkpoints'])

        # start training
        start_time = time.time()
        current_step = 0
        for e in range(config['n_epochs']):

            # reshuffle the batches
            data_train.reshuffle()

            # loop through all training batches
            for i, batch in enumerate(data_train.all_batches()):
                step = tf.train.global_step(sess, global_step)
                current_step += 1

                if (config['learning_rate_type'] == 'linear' and
                        current_step % config['learning_rate_decay_steps'] == 0):
                    sess.run(lr_decay_op)

                # we want to train, so must request at least the train_op
                fetches = {
                    'summaries': summaries_training,
                    'loss': seq2seq_model.loss,
                    'train_op': train_op
                }

                # get the feed dict for the current batch
                feed_dict = seq2seq_model.get_feed_dict(batch)

                # feed data into the model and run optimization
                training_out = sess.run(fetches, feed_dict)

                # write logs
                train_summary_writer.add_summary(training_out['summaries'],
                                                 global_step=step)

                # print training performance of this batch onto console
                time_delta = str(
                    datetime.timedelta(seconds=int(time.time() - start_time)))
                print('\rEpoch: {:3d} [{:4d}/{:4d}] time: {:>8} loss: {:.4f}'.
                      format(e + 1, i + 1, data_train.n_batches, time_delta,
                             training_out['loss']),
                      end='')

            # after every epoch evaluate the performance on the validation set
            total_valid_loss = 0.0
            n_valid_samples = 0
            for batch in data_valid.all_batches():
                fetches = {'loss': seq2seq_model_valid.loss}
                feed_dict = seq2seq_model_valid.get_feed_dict(batch)
                valid_out = sess.run(fetches, feed_dict)

                total_valid_loss += valid_out['loss'] * batch.batch_size
                n_valid_samples += batch.batch_size

            # write validation logs
            avg_valid_loss = total_valid_loss / n_valid_samples
            valid_summaries = sess.run(summaries_valid,
                                       {loss_valid_pl: avg_valid_loss})
            valid_summary_writer.add_summary(valid_summaries,
                                             global_step=tf.train.global_step(
                                                 sess, global_step))

            # print validation performance onto console
            print(' | validation loss: {:.6f}'.format(avg_valid_loss))

            # save this checkpoint if necessary
            if (e + 1) % config['save_checkpoints_every_epoch'] == 0:
                saver.save(sess, os.path.join(config['model_dir'], 'model'),
                           global_step)

            # stop early if training diverges
            if avg_valid_loss > 10 or math.isnan(avg_valid_loss) or np.isinf(
                    avg_valid_loss):
                break

        # Training finished, always save model before exiting
        print('Training finished')
        ckpt_path = saver.save(sess, os.path.join(config['model_dir'],
                                                  'model'), global_step)
        print('Model saved to file {}'.format(ckpt_path))

Example #6
        splits = utils.get_splits(dataset)

        for X_train, y_train, X_test, y_test in tqdm(splits):

            for random_state in config.random_states:

                utils.reset_random_state(random_state)

                algo = algorithm()

                algo.fit(X_train)

                y_train_pred = algo.predict(X_train)
                y_test_pred = algo.predict(X_test)

                stats = utils.calculate_stats(y_train, y_train_pred, y_test,
                                              y_test_pred)

                statslist.append(stats)

        scores[dataset["name"]][algorithm.name] = {}

        for k in statslist[0].keys():
            scores[dataset["name"]][algorithm.name][k] = 1.0 * sum(
                s[k] for s in statslist) / len(statslist)

        print(dataset["name"], algorithm.name,
              scores[dataset["name"]][algorithm.name]["train_auc"],
              scores[dataset["name"]][algorithm.name]["train_ap"],
              scores[dataset["name"]][algorithm.name]["test_auc"],
              scores[dataset["name"]][algorithm.name]["test_ap"])