def question_j():
    """Question J: multiclass classification on four newsgroup categories.

    Builds TFxIDF representations for train/test splits, reduces them to a
    50-dimensional LSI space, then evaluates one-vs-one and one-vs-rest
    wrappers around Gaussian Naive Bayes and a linear SVM.
    """
    logging.info("<Question J> Multiclass Classification")
    category = [
        'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'misc.forsale',
        'soc.religion.christian'
    ]
    train, test = utils.fetch_data(category)
    train_idf = utils.model_data(train)
    test_idf = utils.model_data(test)
    logging.info("Creating TFxIDF Vector Representations")
    logging.info("Performing LSI on TFxIDF Matrices")
    # apply LSI to TFxIDF matrices
    svd = TruncatedSVD(n_components=50)
    train_lsi = svd.fit_transform(train_idf)
    # BUG FIX: the original called fit_transform on the test matrix, refitting
    # the SVD on test data and projecting train/test into *different* latent
    # spaces. The test set must be projected with the SVD fitted on training
    # data only.
    test_lsi = svd.transform(test_idf)
    logging.info("TFxIDF Matrices Transformed")
    logging.info("Size of Transformed Training Dataset: {0}".format(
        train_lsi.shape))
    logging.info("Size of Transformed Testing Dataset: {0}".format(
        test_lsi.shape))
    clf_list = [
        OneVsOneClassifier(GaussianNB()),
        OneVsOneClassifier(svm.SVC(kernel='linear')),
        OneVsRestClassifier(GaussianNB()),
        OneVsRestClassifier(svm.SVC(kernel='linear'))
    ]
    clf_name = [
        'OneVsOneClassifier Naive Bayes', 'OneVsOneClassifier SVM',
        'OneVsRestClassifier Naive Bayes', 'OneVsRestClassifier SVM'
    ]
    # perform classification: train each wrapper on the LSI-reduced training
    # set, predict on the (consistently projected) test set, and report stats
    for clf, clf_n in zip(clf_list, clf_name):
        logging.info("Training {0} Classifier ".format(clf_n))
        clf.fit(train_lsi, train.target)
        logging.info("Testing {0} Classifier".format(clf_n))
        test_predicted = clf.predict(test_lsi)
        utils.calculate_stats(test.target, test_predicted)
def test_summary_rank(contest='558'):
    """Check that mean scores are non-increasing as rank increases.

    Ranks are assumed to be the integers 1..N in the summary produced by
    utils.calculate_stats.
    """
    responses = utils.read_responses(f'{contest}-responses.csv.zip')
    # NOTE(review): this result is immediately overwritten below; the call is
    # kept in case init_summary has side effects — confirm whether it is
    # still needed.
    summary = utils.init_summary(responses)
    summary = utils.read_summary(f'{contest}_summary_LilUCB.csv')
    df = utils.calculate_stats(summary)
    ranks = dict(zip(df['rank'], df['score']))
    # A better rank (smaller number) must never have a lower mean score.
    for rank in range(1, len(ranks)):
        assert ranks[rank] >= ranks[rank + 1]
def test_calculate_stats(errors=None):
    """Verify utils.calculate_stats reconstructs dropped summary columns.

    Each key in ``errors`` is removed from the summary, recomputed, and
    compared against the reference within the given tolerance. A tolerance of
    the string ``'allclose'`` falls back to numpy's default tolerances.

    FIX: the original used a mutable dict as the default argument, and its
    assertion was gated behind ``max_error == 'allclose'`` — so the numeric
    tolerances were never checked, and when the guard did match it compared a
    float against the string (TypeError on py3).
    """
    if errors is None:
        errors = {'rank': 325, 'score': 1e-10, 'precision': 1e-10}
    df = utils.read_summary('536_summary_LilUCB.csv')
    summary = utils.read_summary('536_summary_LilUCB.csv')
    # drop the columns under test so calculate_stats must rebuild them
    for key in errors:
        if key in summary:
            del summary[key]
    summary = utils.calculate_stats(summary)
    for key, max_error in errors.items():
        if max_error == 'allclose':
            assert np.allclose(summary[key], df[key])
        else:
            error = np.abs(summary[key] - df[key])
            assert error.max() < max_error
def main():
    """Relabel the 20-newsgroups data into 6 coarse classes, then sweep the
    LSI dimensionality and plot clustering quality metrics vs. dimension."""
    # Map each fine-grained category index to its coarse class index.
    classes = [
        computer_technologies, recreational_activity, science, miscellaneus,
        politics, religion
    ]
    all_categories = []
    rmap = {}
    for class_idx, members in enumerate(classes):
        for name in members:
            rmap[len(all_categories)] = class_idx
            all_categories.append(name)
    data = fetch_20newsgroups(subset='all', shuffle=True, random_state=42)
    data.target = [rmap[t] for t in data.target]
    data_idf = utils.model_data(data, 'part6')
    # Sweep the number of retained SVD components to find effective dimensions.
    k = 6
    dims = range(2, 75)
    svd_metrics = []
    print("Varying Dimensions")
    for d in dims:
        print("Set d = ", d)
        # SVD -> log1p -> row normalization, then k-means clustering.
        reducer = make_pipeline(
            TruncatedSVD(n_components=d),
            FunctionTransformer(np.log1p),
            Normalizer(copy=False),
        )
        reduced = reducer.fit_transform(data_idf)
        clustering = KMeans(n_clusters=k).fit(reduced)
        svd_metrics.append(utils.calculate_stats(data.target,
                                                 clustering.labels_))
    # One curve per clustering metric, across all swept dimensions.
    metric_names = [
        'homogeneity_score', 'completeness_score', 'adjusted_rand_score',
        'adjusted_mutual_info_score'
    ]
    for idx, metric_name in enumerate(metric_names):
        plt.plot(dims, [m[idx] for m in svd_metrics], label=metric_name)
    plt.xlabel('Dimensions')
    plt.ylabel('Metric Value')
    plt.legend(loc='best')
    plt.savefig('plots/part6.png', format='png')
    plt.clf()
# Recompute task priorities on the precedence graph before allocation.
p_graph.calc_all_priorities()
# NOTE(review): the triple-quoted block below is disabled legacy code (it even
# uses Python-2 print statements inside the string) that timed a second DCOP
# allocator run in test mode — consider deleting it outright.
"""
bf = datetime.now()
dcop2 = DcopAllocator(deepcopy(p_graph), logger)
schedules2 = dcop2.allocate(deepcopy(robots), test=True)
af = datetime.now()
exec_time2 = (af - bf).total_seconds()
ms, tt, st = utils.calculate_stats([schedules2])
print "makespan: " + str(ms)
print "time travelled: " + str(tt)
print "tasks scheduled: " + str(st)
print "exec time: " + str(exec_time2)
utils.print_schedules([schedules2], 'DCOP2')
"""
# Time the DCOP allocation end-to-end; deep copies keep the originals pristine
# for any subsequent runs.
bf = datetime.now()
dcop = DcopAllocator(deepcopy(p_graph), logger)
schedules = dcop.allocate(deepcopy(robots))
af = datetime.now()
exec_time1 = (af - bf).total_seconds()
# Summary metrics for the produced schedules: makespan, total travel time,
# and number of tasks scheduled (per utils.calculate_stats' 3-tuple return).
ms, tt, st = utils.calculate_stats([schedules])
print ("makespan: " + str(ms))
print ("time travelled: " + str(tt))
print ("tasks scheduled: " + str(st))
print ("exec time: " + str(exec_time1))
utils.print_schedules([schedules], 'DCOP')
def main(config):
    """Train a seq2seq RNN model (TF1 graph mode) as configured by ``config``.

    Builds a uniquely-named model directory, loads and optionally normalizes
    the data, constructs training/validation graphs, then runs the epoch loop
    with per-epoch validation, summaries, and checkpointing.

    FIX: the optimizer was constructed with the constant
    ``config['learning_rate']`` even though a decayed/decaying ``lr`` (and
    ``lr_decay_op``) was computed and logged — so the exponential/linear
    schedules had no effect on training. The optimizer now uses ``lr``.
    """
    # create unique output directory for this model
    config['name'] = config['name'] + '-' + str(config['hidden_state_size'])
    if config['train_stride']:
        config['name'] = config['name'] + '-stride'
    if config['concat_labels']:
        config['name'] = config['name'] + '-concat_labels'
    if config['attention']:
        config['name'] = config['name'] + '-attention'
    if config['share_weights']:
        config['name'] = config['name'] + '-share_weights'
    config['name'] = config['name'] + '-' + config[
        'learning_rate_type'] + '-' + str(config['learning_rate'])
    timestamp = str(int(time.time()))
    config['model_dir'] = os.path.abspath(
        os.path.join(config['output_dir'], config['name'] + '-' + timestamp))
    os.makedirs(config['model_dir'])
    print('Writing checkpoints into {}'.format(config['model_dir']))

    # load the data, this requires that the *.npz files you downloaded from
    # Kaggle be named `train.npz` and `valid.npz`
    data_train = load_data(config, 'train', config['train_stride'])
    data_valid = load_data(config, 'valid', config['eval_stride'])

    # persist dataset statistics; optionally normalize inputs and targets
    stats = calculate_stats(data_train.input_)
    save_stats(stats)
    if config['normalize']:
        data_train.input_, _, _ = preprocess(data_train.input_)
        data_train.target, _, _ = preprocess(data_train.target)
        data_valid.input_, _, _ = preprocess(data_valid.input_)
        data_valid.target, _, _ = preprocess(data_valid.target)
    print('Post normalize samples shape: ', data_train.input_[0].shape)
    # infer model dimensions from the (possibly normalized) data
    config['input_dim'] = data_train.input_[0].shape[-1]
    config['output_dim'] = data_train.target[0].shape[-1]

    # get input placeholders and get the model that we want to train
    seq2seq_model_class, placeholders = get_model_and_placeholders(config)

    # Counts training iterations; used for checkpoint naming and LR decay.
    global_step = tf.Variable(1, name='global_step', trainable=False)

    # create a training graph, this is the graph we will use to optimize the
    # parameters
    with tf.name_scope('Training'):
        seq2seq_model = seq2seq_model_class(config,
                                            placeholders,
                                            mode='training')
        seq2seq_model.build_graph()
        print('created RNN model with {} parameters'.format(
            seq2seq_model.n_parameters))

        # configure learning rate
        # NOTE(review): the original (whitespace-mangled) source made the
        # scope nesting here ambiguous; nesting only affects op *names*, not
        # behavior.
        if config['learning_rate_type'] == 'exponential':
            lr = tf.train.exponential_decay(
                config['learning_rate'],
                global_step=global_step,
                decay_steps=config['learning_rate_decay_steps'],
                decay_rate=config['learning_rate_decay_rate'],
                staircase=False)
            lr_decay_op = tf.identity(lr)
        elif config['learning_rate_type'] == 'linear':
            # decayed manually every `learning_rate_decay_steps` steps below
            lr = tf.Variable(config['learning_rate'], trainable=False)
            lr_decay_op = lr.assign(
                tf.multiply(lr, config['learning_rate_decay_rate']))
        elif config['learning_rate_type'] == 'fixed':
            lr = config['learning_rate']
            lr_decay_op = tf.identity(lr)
        else:
            raise ValueError('learning rate type "{}" unknown.'.format(
                config['learning_rate_type']))

        with tf.name_scope('Optimizer'):
            params = tf.trainable_variables()
            # BUG FIX: use the scheduled `lr`, not the constant initial value,
            # so exponential/linear decay actually affects optimization.
            optimizer = tf.train.AdamOptimizer(lr)
            gradients = tf.gradients(seq2seq_model.loss, params)
            # clip the gradients to counter explosion
            clipped_gradients, _ = tf.clip_by_global_norm(
                gradients, config['gradient_clip'])
            # backprop
            train_op = optimizer.apply_gradients(zip(clipped_gradients,
                                                     params),
                                                 global_step=global_step)

    # create a graph for validation
    with tf.name_scope('Validation'):
        seq2seq_model_valid = seq2seq_model_class(config,
                                                  placeholders,
                                                  mode='validation')
        seq2seq_model_valid.build_graph()

    # Create summary ops for monitoring the training
    # Each summary op annotates a node in the computational graph and collects
    # data from it
    tf.summary.scalar('learning_rate', lr, collections=['training_summaries'])
    # Merge summaries used during training and reported after every step
    summaries_training = tf.summary.merge(
        tf.get_collection('training_summaries'))

    # create summary ops for monitoring the validation
    # caveat: we want to store the performance on the entire validation set,
    # not just one validation batch. Tensorflow does not directly support
    # this, so we process every batch independently, aggregate the results
    # outside of the model, and feed the aggregate back in via a placeholder.
    loss_valid_pl = tf.placeholder(tf.float32, name='loss_valid_pl')
    loss_valid_s = tf.summary.scalar('loss_valid',
                                     loss_valid_pl,
                                     collections=['validation_summaries'])
    # merge validation summaries
    summaries_valid = tf.summary.merge([loss_valid_s])

    # dump the config to the model directory in case we later want to see it
    export_config(config, os.path.join(config['model_dir'], 'config.txt'))

    with tf.Session() as sess:
        # initialize all variables
        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())
        sess.run(init_op)

        # file writers to dump summaries onto disk for tensorboard
        train_summary_dir = os.path.join(config['model_dir'], "summary",
                                         "train")
        train_summary_writer = tf.summary.FileWriter(train_summary_dir,
                                                     sess.graph)
        valid_summary_dir = os.path.join(config['model_dir'], "summary",
                                         "validation")
        valid_summary_writer = tf.summary.FileWriter(valid_summary_dir,
                                                     sess.graph)

        # create a saver for writing training checkpoints
        saver = tf.train.Saver(var_list=tf.trainable_variables(),
                               max_to_keep=config['n_keep_checkpoints'])

        # start training
        start_time = time.time()
        current_step = 0
        for e in range(config['n_epochs']):
            # reshuffle the batches
            data_train.reshuffle()

            # loop through all training batches
            for i, batch in enumerate(data_train.all_batches()):
                step = tf.train.global_step(sess, global_step)
                current_step += 1
                # linear decay is applied manually on a fixed step schedule
                if config[
                        'learning_rate_type'] == 'linear' and current_step % config[
                            'learning_rate_decay_steps'] == 0:
                    sess.run(lr_decay_op)

                # we want to train, so must request at least the train_op
                fetches = {
                    'summaries': summaries_training,
                    'loss': seq2seq_model.loss,
                    'train_op': train_op
                }
                # feed the current batch into the model and run optimization
                feed_dict = seq2seq_model.get_feed_dict(batch)
                training_out = sess.run(fetches, feed_dict)

                # write logs
                train_summary_writer.add_summary(training_out['summaries'],
                                                 global_step=step)

                # print training performance of this batch onto console
                time_delta = str(
                    datetime.timedelta(seconds=int(time.time() - start_time)))
                print('\rEpoch: {:3d} [{:4d}/{:4d}] time: {:>8} loss: {:.4f}'.
                      format(e + 1, i + 1, data_train.n_batches, time_delta,
                             training_out['loss']),
                      end='')

            # after every epoch evaluate the performance on the validation
            # set; aggregate a sample-weighted mean loss across all batches
            total_valid_loss = 0.0
            n_valid_samples = 0
            for batch in data_valid.all_batches():
                fetches = {'loss': seq2seq_model_valid.loss}
                feed_dict = seq2seq_model_valid.get_feed_dict(batch)
                valid_out = sess.run(fetches, feed_dict)
                total_valid_loss += valid_out['loss'] * batch.batch_size
                n_valid_samples += batch.batch_size

            # write validation logs
            avg_valid_loss = total_valid_loss / n_valid_samples
            valid_summaries = sess.run(summaries_valid,
                                       {loss_valid_pl: avg_valid_loss})
            valid_summary_writer.add_summary(valid_summaries,
                                             global_step=tf.train.global_step(
                                                 sess, global_step))

            # print validation performance onto console
            print(' | validation loss: {:.6f}'.format(avg_valid_loss))

            # save this checkpoint if necessary
            if (e + 1) % config['save_checkpoints_every_epoch'] == 0:
                saver.save(sess, os.path.join(config['model_dir'], 'model'),
                           global_step)

            # abort on divergence (exploding or NaN/inf validation loss)
            if avg_valid_loss > 10 or math.isnan(avg_valid_loss) or np.isinf(
                    avg_valid_loss):
                break

        # Training finished, always save model before exiting
        print('Training finished')
        ckpt_path = saver.save(sess, os.path.join(config['model_dir'],
                                                  'model'), global_step)
        print('Model saved to file {}'.format(ckpt_path))
# Evaluate one algorithm on one dataset across CV splits and random seeds.
# NOTE(review): `dataset`, `algorithm`, `config`, `statslist`, and `scores`
# are defined outside this fragment — presumably in enclosing loops over
# datasets and algorithms, with `statslist` reset per (dataset, algorithm)
# pair; confirm against the surrounding code.
splits = utils.get_splits(dataset)
for X_train, y_train, X_test, y_test in tqdm(splits):
    for random_state in config.random_states:
        # fix all RNGs so each run is reproducible for this seed
        utils.reset_random_state(random_state)
        algo = algorithm()
        algo.fit(X_train)
        y_train_pred = algo.predict(X_train)
        y_test_pred = algo.predict(X_test)
        stats = utils.calculate_stats(y_train, y_train_pred, y_test,
                                      y_test_pred)
        statslist.append(stats)
# Average every collected metric over all (split, seed) runs.
# NOTE(review): assumes all entries of statslist share the keys of the first.
scores[dataset["name"]][algorithm.name] = {}
for k in statslist[0].keys():
    scores[dataset["name"]][algorithm.name][k] = 1.0 * sum(
        s[k] for s in statslist) / len(statslist)
print(dataset["name"], algorithm.name,
      scores[dataset["name"]][algorithm.name]["train_auc"],
      scores[dataset["name"]][algorithm.name]["train_ap"],
      scores[dataset["name"]][algorithm.name]["test_auc"],
      scores[dataset["name"]][algorithm.name]["test_ap"])