Example #1
def main():

    dataset_reader = DatasetReader("/scratch/cpillsb1/cs66/data/")

    # uncomment for cancer
    # X, y, X_final, y_final, dataset = dataset_reader.load_cancer()

    X, y, X_final, y_final, dataset = dataset_reader.load_higgs()

    skf = StratifiedKFold(y, n_folds=4, shuffle=True, random_state=42)

    ii = 0
    for train, test in skf:
        x_train = X[train]
        x_test = X[test]

        y_train = y[train]
        y_test = y[test]
        nums = [5, 10, 30, 50]
        layer = Layer(RandomForestClassifier, {
            "max_depth": 1,
            "n_estimators": nums[ii]
        }, x_train, y_train, 10)
        predictions = layer.predictAll(x_train)
        lr = Layer(LogisticRegression, {
            "n_jobs": -1,
            "max_iter": 1000
        }, predictions, y_train, 1)
        network = Network([layer, lr])

        evaluate_test(network, X_final, y_final, nums[ii], dataset)

        ii += 1
Example #2
def main():

    dataset_reader = DatasetReader("/scratch/cpillsb1/cs66/data/")

    # uncomment for cancer
    # X, y, X_final, y_final, dataset = dataset_reader.load_cancer()

    X, y, X_final, y_final, dataset = dataset_reader.load_higgs()

    input_s = (30,)
    batch_size = 25
    classes = 2

    num_nodes = [5,10,30,50]

    skf = StratifiedKFold(y, n_folds=4, shuffle = True, random_state=42)
    best_acc = 0
    ii = 0

    for train,test in skf:
        x_train = X[train]
        x_test = X[test]

        y_train = y[train]
        y_test = y[test]

        y_train = to_categorical(y_train, classes)
        y_test = to_categorical(y_test, classes)

        neural_net = Sequential()

        neural_net.add(Dense(num_nodes[ii], activation='sigmoid', input_shape = input_s, kernel_initializer="TruncatedNormal"))
        neural_net.add(Dropout(.01))
        neural_net.add(Dense(2, activation='softmax'))

        neural_net.compile(optimizer="RMSProp", loss = 'binary_crossentropy', metrics = ['accuracy'])

        neural_net.fit(x_train, y_train, batch_size = batch_size, epochs = 100, verbose = 0, validation_data = (x_test, y_test))

        predictions = neural_net.predict(x_test)
        predictions = [round(x[1]) for x in predictions]
        y_test = [x[1] for x in y_test]

        acc = 0.0
        for i, prediction in enumerate(predictions):
          if prediction == y_test[i]:
            acc += 1
        acc /= len(predictions)

        if acc > best_acc:
            best_classifier = neural_net
            best_num_nodes = num_nodes[ii]
            best_acc = acc

        ii += 1

    evaluate_test(best_classifier, X_final, y_final, best_num_nodes, dataset)
Example #3
def main():

    dataset_reader = DatasetReader("/scratch/cpillsb1/cs66/data/")

    X, y, X_final, y_final, dataset = dataset_reader.load_cancer()

    # uncomment for higgs
    # X, y, X_final, y_final, dataset = dataset_reader.load_higgs()

    skf = StratifiedKFold(y, n_folds=4, shuffle=True, random_state=42)

    dtree_params = [1, 5, 30, 50]

    ii = 0
    best_acc = 0

    for train, test in skf:
        x_train = X[train]
        x_test = X[test]

        y_train = y[train]
        y_test = y[test]

        clf = DecisionTreeClassifier(max_depth=dtree_params[ii],
                                     max_features=1.0,
                                     random_state=42)
        clf.fit(x_train, y_train)

        predictions = clf.predict(x_test)
        acc = 0.0
        for i, prediction in enumerate(predictions):
            if prediction == y_test[i]:
                acc += 1
        acc /= len(predictions)

        if acc > best_acc:
            best_classifier = clf
            best_depth = dtree_params[ii]
            best_acc = acc

        ii += 1

    evaluate_test(best_classifier, X_final, y_final, best_depth, dataset)
Example #4
def test(test_args):
    """Computes test perplexity for test data

  :param test_args: system args
  """

    start = time.time()

    data_reader = DatasetReader(test_args, train=False)
    test_data = data_reader.test_data

    # load hyperparameters and other flags
    with open(os.path.join(test_args.save_dir, 'config.pkl'), 'rb') as f:
        args = cPickle.load(f)

    assert test_data is not None, 'test data is not read!'

    args.vocab_size = data_reader.vocab_size
    print('vocab_size: {}'.format(args.vocab_size))

    print('Start testing...')

    with tf.Graph().as_default(), tf.Session(
            config=gpu_config if args.with_gpu else None) as sess:

        with tf.variable_scope('train_model', reuse=None):
            m_test = Model(args, is_training=False)

        saver = tf.train.Saver(tf.global_variables())
        tf.global_variables_initializer().run()
        ckpt = tf.train.get_checkpoint_state(args.save_dir)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)

        test_pp = run_epoch(sess, m_test, test_data, data_reader, tf.no_op())

        print('Test Perplexity: %.3f' % test_pp)
        print("Test time: %.0f min" % ((time.time() - start) / 60))
Example #5
class KnapsackBruteForce:
    def __init__(self, n, weight, profit, max_weight):
        self.n = n
        self.weight = weight
        self.profit = profit
        self.max_weight = max_weight

    def run(self):
        self.best_value = 0
        self.total_weight = 0
        self.s = []
        self.solve(self.n - 1, self.s, 0, 0)

    def solve(self, n, s, current_w, current_v):
        if n == -1 and current_w <= self.max_weight and current_v > self.best_value:
            self.best_value = current_v
            self.total_weight = current_w
            self.s = s.copy()

        if n == -1:
            return

        self.solve(n - 1, [0] + s, current_w, current_v)
        self.solve(n - 1, [1] + s, current_w + self.weight[n],
                   current_v + self.profit[n])


dataset = DatasetReader().read('p08')
kbf = KnapsackBruteForce(len(dataset[0]), dataset[0], dataset[1], dataset[2])
ExecutionLogger().run(kbf)
Example #6
def main(NetClass, key_name, scale):
    assert scale in [32, 64, 128]
    torch.set_grad_enabled(False)

    model_id = NetClass.model_id
    
    save_dir = '{}.{}'.format(simple_net_save_dir_prefix, scale)
    os.makedirs(save_dir, exist_ok=True)

    test_dataset_path = '{}/{}/test'.format(dataset_path, key_name)

    ck_name = '{}/model_{}_{}.pt'.format(save_dir, model_id, key_name)
    cm_test_name = '{}/cm_test_{}_{}.png'.format(save_dir, model_id, key_name)

    test_dataset = DatasetReader(test_dataset_path, target_hw=(scale, scale))

    net = NetClass(in_dim)

    net.load_state_dict(torch.load(ck_name, map_location='cpu'))

    net = net.to(device)

    net.eval()

    all_pred = []
    all_label = []

    for i in range(len(test_dataset)):
        ims, cls = test_dataset.get_im_patch_list_to_combind_predict(i, one_im=False)

        batch_im = torch.tensor(ims.astype(np.int32), dtype=torch.float) / 65535
        # batch_cls = torch.tensor([cls]).repeat(len(batch_im))

        batch_im = batch_im.permute(0, 3, 1, 2)

        batch_im = batch_im.to(device)
        # batch_cls = batch_cls.to(device)

        net_out = net(batch_im)
        out = torch.argmax(net_out, 1)
        all_label.append(cls)

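        # Label the whole image positive when the fraction of patches predicted
        # positive exceeds simple_thresh.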
        if out.sum(dtype=torch.float).item() > out.shape[0] * simple_thresh:
            all_pred.append(1)
        else:
            all_pred.append(0)

    _accuracy = accuracy_score(all_label, all_pred)
    _malignant_precision, _malignant_recall, _malignant_f1, _ = \
        precision_recall_fscore_support(all_label, all_pred, pos_label=1, average='binary')

    _benign_precision, _benign_recall, _benign_f1, _ = \
        precision_recall_fscore_support(all_label, all_pred, pos_label=0, average='binary')

    _accuracy = float(_accuracy)
    _malignant_precision = float(_malignant_precision)
    _malignant_recall = float(_malignant_recall)
    _malignant_f1 = float(_malignant_f1)
    _benign_precision = float(_benign_precision)
    _benign_recall = float(_benign_recall)
    _benign_f1 = float(_benign_f1)

    out_line = 'test acc: {:.3f} m_prec: {:.3f} m_rec: {:.3f} m_f1: {:.3f} '\
               'b_prec: {:.3f} b_rec: {:.3f} b_f1: {:.3f} model {}_{} x{}'.format(_accuracy,
                                            _malignant_precision, _malignant_recall, _malignant_f1,
                                            _benign_precision, _benign_recall, _benign_f1, model_id, key_name, scale)

    print(out_line)
    test_out.append(out_line)

    cm = confusion_matrix(all_label, all_pred)
    draw_confusion_matrix(cm, list(test_dataset.class2id.keys()), cm_test_name)
Example #7
def main(NetClass, key_name):
    torch.set_grad_enabled(False)

    target_hw=(256, 256)
    model_id = NetClass.model_id

    train_dataset_path = '{}/{}/train'.format(dataset_path, key_name)
    eval_dataset_path = '{}/{}/eval'.format(dataset_path, key_name)
    test_dataset_path = '{}/{}/test'.format(dataset_path, key_name)
    
    ck_name = '{}/model_{}_{}.pt'.format(seg_net_save_dir, model_id, key_name)

    train_dataset = DatasetReader(train_dataset_path, target_hw=target_hw)
    eval_dataset = DatasetReader(eval_dataset_path, target_hw=target_hw)
    test_dataset = DatasetReader(test_dataset_path, target_hw=target_hw)
        
    net = NetClass(in_dim)
    net.load_state_dict(torch.load(ck_name, map_location='cpu'))
    net = net.to(device)
    net.eval()

    for dataset_type_id, dataset in enumerate([train_dataset, eval_dataset, test_dataset]):
        dataset_type = ['train', 'eval', 'test'][dataset_type_id]

        if dataset_type not in big_dict:
            big_dict[dataset_type] = {}
        
        for i in range(len(dataset)):
            im_name, im, cm, cls = dataset.get_im_patch_list_to_combind_predict(i, need_im_name=True)
            
            im_id = os.path.splitext(im_name)[0]
            if im_id not in big_dict[dataset_type]:
                big_dict[dataset_type][im_id] = {}
            
            batch_im = torch.tensor([im], dtype=torch.float) / 65535
            # batch_cm = torch.tensor([cm])
    
            batch_im = batch_im.permute(0, 3, 1, 2)
    
            batch_im = batch_im.to(device)
            # batch_cm = batch_cm.to(device)
    
            net_out = net(batch_im)
            out = torch.argmax(net_out, 1)
    
            cls1_pixel_num = (out == 1).sum(dtype=torch.float).item()
            cls2_pixel_num = (out == 2).sum(dtype=torch.float).item()

            if cls1_pixel_num + cls2_pixel_num == 0:
                out_pred = 0
            else:
                out_pred = cls2_pixel_num / (cls1_pixel_num + cls2_pixel_num)
            
            assert not np.isnan(out_pred)
            
            if 'pred' not in big_dict[dataset_type][im_id]:
                big_dict[dataset_type][im_id]['pred'] = out_pred
            else:
                raise AssertionError('Error: pred was set more than once')
                
            if 'class' not in big_dict[dataset_type][im_id]:
                big_dict[dataset_type][im_id]['class'] = cls - 1
            else:
                assert big_dict[dataset_type][im_id]['class'] == cls - 1
Example #8
def main(NetClass, key_name, scale):
    assert scale in [32, 64, 128]
    target_hw = (scale, scale)
    model_id = NetClass.model_id

    train_dataset_path = '{}/{}/train'.format(dataset_path, key_name)
    eval_dataset_path = '{}/{}/eval'.format(dataset_path, key_name)
    test_dataset_path = '{}/{}/test'.format(dataset_path, key_name)

    ck_name = '{}.{}/model_{}_{}.pt'.format(simple_net_save_dir_prefix, scale,
                                            model_id, key_name)

    train_dataset = DatasetReader(train_dataset_path, target_hw=target_hw)
    eval_dataset = DatasetReader(eval_dataset_path, target_hw=target_hw)
    test_dataset = DatasetReader(test_dataset_path, target_hw=target_hw)

    net = NetClass(in_dim)

    net.load_state_dict(torch.load(ck_name, map_location='cpu'))

    net = net.to(device)

    net.eval()

    torch.set_grad_enabled(False)

    for dataset_type_id, dataset in enumerate(
        [train_dataset, eval_dataset, test_dataset]):
        dataset_type = ['train', 'eval', 'test'][dataset_type_id]

        if dataset_type not in big_dict:
            big_dict[dataset_type] = {}

        for i in range(len(dataset)):
            im_name, ims, cls = dataset.get_im_patch_list_to_combind_predict(
                i, one_im=False, need_im_name=True)

            im_id = os.path.splitext(im_name)[0]
            if im_id not in big_dict[dataset_type]:
                big_dict[dataset_type][im_id] = {}

            batch_im = torch.tensor(ims.astype(np.int32),
                                    dtype=torch.float) / 65535
            # batch_cls = torch.tensor([cls]).repeat(len(batch_im))

            batch_im = batch_im.permute(0, 3, 1, 2)

            batch_im = batch_im.to(device)
            # batch_cls = batch_cls.to(device)

            net_out = net(batch_im)
            net_out = torch.softmax(net_out, 1)
            net_out = net_out[:, 1]
            out_pred = torch.mean(net_out).item()

            if 'pred{}'.format(scale) not in big_dict[dataset_type][im_id]:
                big_dict[dataset_type][im_id]['pred{}'.format(
                    scale)] = out_pred
            else:
                raise AssertionError(
                    'Error: pred{} was set more than once'.format(scale))

            if 'class' not in big_dict[dataset_type][im_id]:
                big_dict[dataset_type][im_id]['class'] = cls
            else:
                assert big_dict[dataset_type][im_id]['class'] == cls
Example #9
class KnapsackBruteForceOpt:
    def __init__(self, n, weight, profit, max_weight):
        self.n = n
        self.weight = weight
        self.profit = profit
        self.max_weight = max_weight

    def run(self):
        self.best_value = 0
        self.total_weight = 0
        self.s = []
        self.solve(self.n - 1, self.s, 0, 0)

    def solve(self, n, s, current_w, current_v):
        if current_w > self.max_weight:
            return

        if n == -1:
            if current_w <= self.max_weight and current_v > self.best_value:
                self.best_value = current_v
                self.total_weight = current_w
                self.s = s.copy()
            return

        self.solve(n - 1, [0] + s, current_w, current_v)
        self.solve(n - 1, [1] + s, current_w + self.weight[n],
                   current_v + self.profit[n])


dataset = DatasetReader().read('c09')
kbfo = KnapsackBruteForceOpt(len(dataset[0]), dataset[0], dataset[1],
                             dataset[2])
ExecutionLogger().run(kbfo)
Example #10
def word2vec(
		files=[],
		directories=[],
		skip=[],
		save_dir=None,
		num_epochs=5,
		unigram_dictionary=None,
		noise_ratio=15,
		kernel=[1,2,3,4,5,5,4,3,2,1],
		t = 1.0e-5,
		batch_size = 1000,  # Number of *signal* examples per batch
		num_embedding_dimensions=500,
		word_embedding_init=Normal(),
		context_embedding_init=Normal(),
		learning_rate=0.1,
		momentum=0.9,
		num_processes=3,
		load_dictionary_dir=None,
		min_frequency=10,
		macrobatch_size = 100000,
		max_queue_size=0,
		verbose=True
	):

	'''
	Helper function that handles all concerns involved in training
	a word2vec model using the approach of Mikolov et al.  It surfaces
	all of the options.

	For customizations going beyond simply tweaking existing options and
	hyperparameters, replace this function with your own training
	routine using the provided classes.  This function would be a starting
	point for you.
	'''

	# Make a DatasetReader, passing through the parameters sent by the caller
	reader = DatasetReader(
		files=files,
		directories=directories,
		skip=skip,
		noise_ratio=noise_ratio,
		t=t,
		num_processes=num_processes,
		unigram_dictionary=unigram_dictionary,
		kernel=kernel,
		max_queue_size=max_queue_size,
		macrobatch_size=macrobatch_size,
		verbose=verbose
	)


	# Prepare the minibatch generator
	# (this produces the counter_sampler stats)
	if load_dictionary_dir is None and unigram_dictionary is None:
		if verbose:
			print 'preparing dictionaries...'
		reader.prepare(save_dir=save_dir)

	# If min_frequency was specified, prune the dictionaries
	if min_frequency is not None:
		if verbose:
			print 'pruning dictionaries...'
		reader.prune(min_frequency)

	# Make a symbolic minibatcher
	minibatcher = NoiseContrastiveTheanoMinibatcher(
		batch_size=batch_size,
		noise_ratio=noise_ratio,
		dtype="int32",
		num_dims=2
	)

	# Make a Word2VecEmbedder object, feed it the combined input.
	# Note that the full batch includes noise examples and signal_examples
	# so is larger than batch_size, which is the number of signal_examples
	# only per batch.
	full_batch_size = batch_size * (1 + noise_ratio)
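	# For example, with the defaults batch_size=1000 and noise_ratio=15 this
	# gives full_batch_size = 1000 * (1 + 15) = 16000 examples per batch.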
	embedder = Word2VecEmbedder(
		input_var=minibatcher.get_batch(),
		batch_size=full_batch_size,
		vocabulary_size=reader.get_vocab_size(),
		num_embedding_dimensions=num_embedding_dimensions,
		word_embedding_init=word_embedding_init,
		context_embedding_init=context_embedding_init
	)

	# Architecture is ready.  Make the loss function, and use it to create
	# the parameter updates responsible for learning
	loss = get_noise_contrastive_loss(embedder.get_output(), batch_size)
	updates = nesterov_momentum(
		loss, embedder.get_params(), learning_rate, momentum
	)

	# Include minibatcher updates, which cause the symbolic batch to move
	# through the dataset like a sliding window
	updates.update(minibatcher.get_updates())

	# Use the loss function and the updates to compile a training function.
	# Note that it takes no inputs because the dataset is fully loaded using
	# theano shared variables
	train = function([], loss, updates=updates)

	# Iterate through the dataset, training the embeddings
	for epoch in range(num_epochs):

		if verbose:
			print 'starting epoch %d' % epoch

		macrobatches = reader.generate_dataset_serial()
		macrobatch_num = 0
		for signal_macrobatch, noise_macrobatch in macrobatches:

			macrobatch_num += 1
			if verbose:
				print 'running macrobatch %d' % (macrobatch_num - 1)

			minibatcher.load_dataset(signal_macrobatch, noise_macrobatch)
			losses = []
			for batch_num in range(minibatcher.get_num_batches()):
				if verbose:
					print 'running minibatch', batch_num
				losses.append(train())
			if verbose:
				print '\taverage loss: %f' % np.mean(losses)

	# Save the model (the embeddings) if save_dir was provided
	if save_dir is not None:
		embedder.save(save_dir)

	# Return the trained embedder and the dictionary mapping tokens
	# to ids
	return embedder, reader
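A minimal call sketch (not part of the original listing): the directory and save
path below are placeholder values, but the keyword arguments and return values
match the signature defined above.

# Hypothetical usage; 'corpus/' and 'w2v_model/' are placeholder paths.
embedder, reader = word2vec(
	directories=['corpus/'],
	save_dir='w2v_model/',
	num_epochs=5,
	num_embedding_dimensions=300,
	verbose=True,
)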
Example #11
def train(args):
    """Train the data train corpus

  :param args: system args
  """

    start = time.time()
    save_dir = args.save_dir
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)

    with open(os.path.join(save_dir, 'config.pkl'), 'wb') as f:
        cPickle.dump(args, f)

    data_reader = DatasetReader(args)
    train_data = data_reader.train_data

    assert train_data is not None, 'training data is not read!'

    print('Number of train running words: {}'.format(len(train_data)))

    dev_data = data_reader.dev_data
    if dev_data:
        print('Number of dev set running words: {}'.format(len(dev_data)))

    out_file = os.path.join(args.save_dir, args.output)
    fout = codecs.open(out_file, "w", encoding="UTF-8")

    args.vocab_size = data_reader.vocab_size
    print('vocab size: {}'.format(args.vocab_size))
    fout.write('vocab size: {}\n'.format(str(args.vocab_size)))

    print('Start training....')

    with tf.Graph().as_default(), tf.Session(
            config=gpu_config if args.with_gpu else None) as sess:

        if args.init_scale:
            initializer = tf.random_uniform_initializer(
                -args.init_scale, +args.init_scale)
        else:
            initializer = tf.glorot_uniform_initializer()

        # build models
        with tf.variable_scope('train_model',
                               reuse=None,
                               initializer=initializer):
            m_train = Model(args)

        if dev_data:
            # reuse the same embedding matrix
            with tf.variable_scope('train_model',
                                   reuse=True,
                                   initializer=initializer):
                m_dev = Model(args, is_training=False)
        else:
            m_dev = None

        # save only the last model
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=1)
        tf.global_variables_initializer().run()

        best_pp = 10000000.0  # only used when we have dev

        e = 0
        decay_counter = 1
        lr = args.lr
        while e < args.num_epochs:
            # apply lr decay
            if e >= args.start_epoch_decay:
                lr_decay = args.decay_rate**decay_counter
                lr *= lr_decay
                decay_counter += 1

            print('Epoch: %d' % (e + 1))

            m_train.assign_lr(sess, lr)
            print('Learning rate: %.6f' % sess.run(m_train.lr))

            fout.write("Epoch: %d\n" % (e + 1))
            fout.write("Learning rate: %.3f\n" % sess.run(m_train.lr))

            train_pp = run_epoch(sess,
                                 m_train,
                                 train_data,
                                 data_reader,
                                 m_train.train_op,
                                 verbose=True)

            print('Train Perplexity: {}'.format(train_pp))
            fout.write("Train Perplexity: %.3f\n" % train_pp)

            if m_dev:
                dev_pp = run_epoch(sess, m_dev, dev_data, data_reader,
                                   tf.no_op())

                print("Valid Perplexity: %.3f\n" % dev_pp)
                fout.write("Valid Perplexity: %.3f\n" % dev_pp)

                if dev_pp < best_pp:
                    print("Achieve highest perplexity on dev set, save model.")
                    checkpoint_path = os.path.join(save_dir, 'model.ckpt')
                    saver.save(sess, checkpoint_path, global_step=e)
                    print("model saved to {}".format(checkpoint_path))
                    best_pp = dev_pp
                else:
                    break
            else:
                checkpoint_path = os.path.join(save_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=e)
                print("model saved to {}".format(checkpoint_path))

            fout.flush()

            e += 1

        print("Training time: %.0f min" % ((time.time() - start) / 60))
        fout.write("Training time: %.0f min\n" % ((time.time() - start) / 60))
        fout.flush()
Example #12
 if train_filename == test_filename:
     print('Dev run.')
     ignore_closest = True
 else:
     ignore_closest = False
     
 debug = False
 dummy = False
 
 print('Reading model..')
 if not dummy:
     model_reader = ModelReader(model_params_filename)
     model = model_reader.model
 else:
     model = DummyContextModel()
 dataset_reader = DatasetReader(model)
 
 print('Reading train dataset..')
 train_set, train_key2ind, train_ind2key = dataset_reader.read_dataset(train_filename, train_filename+'.key', True, isolate_target_sentence)
 knn = Knn(k, train_set, train_key2ind)
 
 print('Reading test dataset..')
 test_set, test_key2ind, test_ind2key = dataset_reader.read_dataset(test_filename, test_filename+'.key', False, isolate_target_sentence)
 
 print('Starting to classify test set:')
 with open(result_filename, 'w') as o:
     for ind, key_set in enumerate(test_set):
         key = test_ind2key[ind]
         if debug:
             print('KEY:', key)
             print()
Example #13
def main(NetClass, key_name):
    torch.set_grad_enabled(False)

    model_id = NetClass.model_id

    test_dataset_path = '{}/{}/test'.format(dataset_path, key_name)

    ck_32_name = '{}.32/model_{}_{}.pt'.format(simple_net_save_dir_prefix,
                                               model_id, key_name)
    ck_64_name = '{}.64/model_{}_{}.pt'.format(simple_net_save_dir_prefix,
                                               model_id, key_name)
    ck_128_name = '{}.128/model_{}_{}.pt'.format(simple_net_save_dir_prefix,
                                                 model_id, key_name)

    cm_net3_test_name = '{}_{}_{}.png'.format(
        simple_net_3_merge_test_cm_prefix, model_id, key_name)
    os.makedirs(os.path.split(cm_net3_test_name)[0], exist_ok=True)

    test_dataset_32 = DatasetReader(test_dataset_path, target_hw=(32, 32))
    test_dataset_64 = DatasetReader(test_dataset_path, target_hw=(64, 64))
    test_dataset_128 = DatasetReader(test_dataset_path, target_hw=(128, 128))

    net_32 = NetClass(in_dim)
    net_64 = NetClass(in_dim)
    net_128 = NetClass(in_dim)

    net_32.load_state_dict(
        torch.load(ck_32_name, map_location=torch.device('cpu')))
    net_64.load_state_dict(
        torch.load(ck_64_name, map_location=torch.device('cpu')))
    net_128.load_state_dict(
        torch.load(ck_128_name, map_location=torch.device('cpu')))

    net_32 = net_32.to(device)
    net_64 = net_64.to(device)
    net_128 = net_128.to(device)

    net_32.eval()
    net_64.eval()
    net_128.eval()

    all_pred = []
    all_label = []

    for i in range(len(test_dataset_32)):
        ims_32, cls_32 = test_dataset_32.get_im_patch_list_to_combind_predict(
            i, one_im=False)
        ims_64, cls_64 = test_dataset_64.get_im_patch_list_to_combind_predict(
            i, one_im=False)
        ims_128, cls_128 = test_dataset_128.get_im_patch_list_to_combind_predict(
            i, one_im=False)

        assert cls_32 == cls_64 == cls_128

        tmp_x = [[net_32, ims_32, cls_32], [net_64, ims_64, cls_64],
                 [net_128, ims_128, cls_128]]
        tmp_y = []

        for net, ims, cls in tmp_x:
            batch_im = torch.tensor(ims.astype(np.int32),
                                    dtype=torch.float) / 65535
            # batch_cls = torch.tensor([cls]).repeat(len(batch_im))

            batch_im = batch_im.permute(0, 3, 1, 2)

            batch_im = batch_im.to(device)
            # batch_cls = batch_cls.to(device)

            net_out = net(batch_im)
            out = torch.argmax(net_out, 1)

            if out.sum(
                    dtype=torch.float).item() > out.shape[0] * simple_thresh:
                tmp_y.append(1)
            else:
                tmp_y.append(0)

        all_label.append(tmp_x[0][-1])

        if np.sum(tmp_y) > simple_merge_thresh:
            all_pred.append(1)
        else:
            all_pred.append(0)

    _accuracy = accuracy_score(all_label, all_pred)
    _malignant_precision, _malignant_recall, _malignant_f1, _ = \
        precision_recall_fscore_support(all_label, all_pred, pos_label=1, average='binary')

    _benign_precision, _benign_recall, _benign_f1, _ = \
        precision_recall_fscore_support(all_label, all_pred, pos_label=0, average='binary')

    _accuracy = float(_accuracy)
    _malignant_precision = float(_malignant_precision)
    _malignant_recall = float(_malignant_recall)
    _malignant_f1 = float(_malignant_f1)
    _benign_precision = float(_benign_precision)
    _benign_recall = float(_benign_recall)
    _benign_f1 = float(_benign_f1)

    out_line = 'test acc: {:.3f} m_prec: {:.3f} m_rec: {:.3f} m_f1: {:.3f} '\
               'b_prec: {:.3f} b_rec: {:.3f} b_f1: {:.3f} model {}_{}'.format(_accuracy,
                                                    _malignant_precision, _malignant_recall, _malignant_f1,
                                                    _benign_precision, _benign_recall, _benign_f1, model_id, key_name)

    print(out_line)
    test_out.append(out_line)

    cm = confusion_matrix(all_label, all_pred)
    draw_confusion_matrix(cm, list(test_dataset_32.class2id.keys()),
                          cm_net3_test_name)
Example #14
def word2vec(
        files=[],
        directories=[],
        skip=[],
        save_dir=None,
        num_epochs=5,
        unigram_dictionary=None,
        noise_ratio=15,
        kernel=[1, 2, 3, 4, 5, 5, 4, 3, 2, 1],
        t=1.0e-5,
        batch_size=1000,  # Number of *signal* examples per batch
        num_embedding_dimensions=500,
        word_embedding_init=Normal(),
        context_embedding_init=Normal(),
        learning_rate=0.1,
        momentum=0.9,
        num_processes=3,
        load_dictionary_dir=None,
        min_frequency=10,
        macrobatch_size=100000,
        max_queue_size=0,
        verbose=True):
    '''
	Helper function that handles all concerns involved in training
	a word2vec model using the approach of Mikolov et al.  It surfaces
	all of the options.

	For customizations going beyond simply tweaking existing options and
	hyperparameters, replace this function with your own training
	routine using the provided classes.  This function would be a starting
	point for you.
	'''

    # Make a DatasetReader, passing through the parameters sent by the caller
    reader = DatasetReader(files=files,
                           directories=directories,
                           skip=skip,
                           noise_ratio=noise_ratio,
                           t=t,
                           num_processes=num_processes,
                           unigram_dictionary=unigram_dictionary,
                           kernel=kernel,
                           max_queue_size=max_queue_size,
                           macrobatch_size=macrobatch_size,
                           verbose=verbose)

    # Prepare the minibatch generator
    # (this produces the counter_sampler stats)
    if load_dictionary_dir is None and unigram_dictionary is None:
        if verbose:
            print 'preparing dictionaries...'
        reader.prepare(save_dir=save_dir)

    # If min_frequency was specified, prune the dictionaries
    if min_frequency is not None:
        if verbose:
            print 'pruning dictionaries...'
        reader.prune(min_frequency)

    # Make a symbolic minibatcher
    minibatcher = NoiseContrastiveTheanoMinibatcher(batch_size=batch_size,
                                                    noise_ratio=noise_ratio,
                                                    dtype="int32",
                                                    num_dims=2)

    # Make a Word2VecEmbedder object, feed it the combined input.
    # Note that the full batch includes noise examples and signal_examples
    # so is larger than batch_size, which is the number of signal_examples
    # only per batch.
    full_batch_size = batch_size * (1 + noise_ratio)
    embedder = Word2VecEmbedder(
        input_var=minibatcher.get_batch(),
        batch_size=full_batch_size,
        vocabulary_size=reader.get_vocab_size(),
        num_embedding_dimensions=num_embedding_dimensions,
        word_embedding_init=word_embedding_init,
        context_embedding_init=context_embedding_init)

    # Architecture is ready.  Make the loss function, and use it to create
    # the parameter updates responsible for learning
    loss = get_noise_contrastive_loss(embedder.get_output(), batch_size)
    updates = nesterov_momentum(loss, embedder.get_params(), learning_rate,
                                momentum)

    # Include minibatcher updates, which cause the symbolic batch to move
    # through the dataset like a sliding window
    updates.update(minibatcher.get_updates())

    # Use the loss function and the updates to compile a training function.
    # Note that it takes no inputs because the dataset is fully loaded using
    # theano shared variables
    train = function([], loss, updates=updates)

    # Iterate through the dataset, training the embeddings
    for epoch in range(num_epochs):

        if verbose:
            print 'starting epoch %d' % epoch

        macrobatches = reader.generate_dataset_serial()
        macrobatch_num = 0
        for signal_macrobatch, noise_macrobatch in macrobatches:

            macrobatch_num += 1
            if verbose:
                print 'running macrobatch %d' % (macrobatch_num - 1)

            minibatcher.load_dataset(signal_macrobatch, noise_macrobatch)
            losses = []
            for batch_num in range(minibatcher.get_num_batches()):
                if verbose:
                    print 'running minibatch', batch_num
                losses.append(train())
            if verbose:
                print '\taverage loss: %f' % np.mean(losses)

    # Save the model (the embeddings) if save_dir was provided
    if save_dir is not None:
        embedder.save(save_dir)

    # Return the trained embedder and the dictionary mapping tokens
    # to ids
    return embedder, reader
Example #15
def main(NetClass, key_name, dataset_type='test'):
    torch.set_grad_enabled(False)

    model_id = NetClass.model_id

    test_dataset_path = '{}/{}/{}'.format(dataset_path, key_name, dataset_type)
    test_dataset = DatasetReader(test_dataset_path)

    ck_name = '{}/model_{}_{}.pt'.format(seg_net_save_dir, model_id, key_name)
    cm_test_name = '{}/cm_{}_{}_{}.png'.format(seg_net_save_dir, dataset_type,
                                               model_id, key_name)

    net = NetClass(in_dim)
    net.load_state_dict(torch.load(ck_name))
    net = net.to(device)
    net.eval()

    all_pred = []
    all_label = []

    for i in range(len(test_dataset)):
        im, cm, cls = test_dataset.get_im_patch_list_to_combind_predict(i)

        batch_im = torch.tensor([im], dtype=torch.float) / 65535
        batch_im = batch_im.permute(0, 3, 1, 2)
        batch_im = batch_im.to(device)

        net_out = net(batch_im)
        out = torch.argmax(net_out, 1)
        all_label.append(cls)

        cls1_pixel_num = (out == 1).sum().item()
        cls2_pixel_num = (out == 2).sum().item()

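        # Decide malignant (2) vs benign (1) from the fraction of pixels the
        # segmentation net assigned to class 2, compared against seg_thresh.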
        if cls1_pixel_num + cls2_pixel_num == 0:
            all_pred.append(1)
        else:
            if cls2_pixel_num / (cls1_pixel_num + cls2_pixel_num) > seg_thresh:
                all_pred.append(2)
            else:
                all_pred.append(1)

    _accuracy = accuracy_score(all_label, all_pred)
    _malignant_precision, _malignant_recall, _malignant_f1, _ = \
        precision_recall_fscore_support(all_label, all_pred, pos_label=2, average='binary')

    _benign_precision, _benign_recall, _benign_f1, _ = \
        precision_recall_fscore_support(all_label, all_pred, pos_label=1, average='binary')

    _accuracy = float(_accuracy)
    _malignant_precision = float(_malignant_precision)
    _malignant_recall = float(_malignant_recall)
    _malignant_f1 = float(_malignant_f1)
    _benign_precision = float(_benign_precision)
    _benign_recall = float(_benign_recall)
    _benign_f1 = float(_benign_f1)

    out_line = '{} acc: {:.3f} m_prec: {:.3f} m_rec: {:.3f} m_f1: {:.3f} '\
               'b_prec: {:.3f} b_rec: {:.3f} b_f1: {:.3f} model {}_{}'.format(dataset_type, _accuracy,
                                                    _malignant_precision, _malignant_recall, _malignant_f1,
                                                    _benign_precision, _benign_recall, _benign_f1, model_id, key_name)

    print(out_line)
    test_out.append(out_line)

    cm = confusion_matrix(all_label, all_pred)
    draw_confusion_matrix(cm,
                          list(test_dataset.class2id.keys())[1:], cm_test_name)
Example #16
import numpy as np
import matplotlib.pyplot as plt
from dataset_reader import DatasetReader
from dim_reducer import DimReducer
from text_embedder import TextEmbedder
from utils import *


# Initialization
reader = DatasetReader()
embedder = TextEmbedder()
reducer = DimReducer(alg='umap')
article_list = []

# Get article_list
num_limit = 100
i = 0
for article in reader:
    article_list.append(article)
    i += 1
    if i == num_limit:
        break
corpus = get_corpus(article_list)

# Get Embeddings
embedding_list = embedder.fit_transform(corpus)
for i, embedding in enumerate(embedding_list):
    article_list[i].set_embedding(embedding)

# Getting Coordinates
embedding_list = np.array(embedding_list)
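The listing breaks off after the embeddings are stacked. A hedged sketch of how the
coordinate step might continue, assuming DimReducer exposes the same fit_transform
convention as TextEmbedder (an assumption, not confirmed by this snippet):

# Assumption: DimReducer follows the fit_transform convention used above.
coordinates = reducer.fit_transform(embedding_list)

# Plot the articles in the reduced 2-D space with the matplotlib import above.
plt.scatter(coordinates[:, 0], coordinates[:, 1], s=10)
plt.title('Article embeddings (umap)')
plt.show()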
Example #17
    if cMutantIndividual[nRandomChange] == "0":
        aMutantIndividual[nRandomChange] = "1"
    else:
        aMutantIndividual[nRandomChange] = "0"
    cNewIndividual = "".join(aMutantIndividual)

    '''
    # Random chromosome swap
    cNewIndividual = ""
    for cChromo in cMutantIndividual:
        nRandomChange = random.randint(0,1)
        # If it is even (a multiple of 2), flip the bit
        if nRandomChange == 1:
            if cChromo == "1":
                cNewIndividual += "0"
            else:
                cNewIndividual += "1"
        else:
            cNewIndividual += cChromo
    '''
    aNewPopulation.append(cNewIndividual)

    # Add the parents to the new population
    aNewPopulation.append(cFather1)
    aNewPopulation.append(cFather2)

    return aNewPopulation

dataset = DatasetReader().read('c11')
kbf = KnapsackGeneticAlgorithm(len(dataset[0]), dataset[0], dataset[1], dataset[2])
ExecutionLogger().run(kbf)
Example #18
def main(NetClass, key_name, scale=32):
    torch.set_grad_enabled(True)

    assert scale in [32, 64, 128]
    model_id = NetClass.model_id

    save_dir = '{}.{}'.format(simple_net_save_dir_prefix, scale)
    os.makedirs(save_dir, exist_ok=True)

    train_dataset_path = '{}/{}/train'.format(dataset_path, key_name)
    eval_dataset_path = '{}/{}/eval'.format(dataset_path, key_name)

    ck_name = '{}/model_{}_{}.pt'.format(save_dir, model_id, key_name)
    ck_extra_name = '{}/extra_{}_{}.yml'.format(save_dir, model_id, key_name)
    cm_name = '{}/cm_valid_{}_{}.png'.format(save_dir, model_id, key_name)

    logdir = '{}_{}_{}.{}'.format(simple_net_train_logs_dir_prefix, model_id,
                                  key_name, scale)
    sw = SummaryWriter(logdir)

    train_dataset = DatasetReader(train_dataset_path,
                                  is_require_cls_blance=True,
                                  target_hw=(scale, scale))
    eval_dataset = DatasetReader(eval_dataset_path, target_hw=(scale, scale))

    net = NetClass(in_dim)

    net = net.to(device)

    batch_count = train_dataset.get_batch_count(batch_size)

    optim = torch.optim.Adam(net.parameters(), 1e-3, eps=1e-8)
    optim_adjust = torch.optim.lr_scheduler.MultiStepLR(optim, [90, 180, 270],
                                                        gamma=0.1)

    max_valid_value = 0.

    class_weight_for_loss = torch.tensor([1, 1],
                                         dtype=torch.float,
                                         device=device)

    for e in range(epoch):
        net.train()
        optim_adjust.step(e)
        train_acc = 0
        train_loss = 0
        for b in range(batch_count):

            batch_im, batch_cls = train_dataset.get_batch(batch_size)

            batch_im = torch.tensor(batch_im.astype(np.int32),
                                    dtype=torch.float) / 65535
            # batch_im += (torch.rand_like(batch_im) * 0.1 - 0.05)
            batch_cls = torch.tensor(batch_cls, dtype=torch.long)

            batch_im = batch_im.permute(0, 3, 1, 2)

            batch_im = batch_im.to(device)
            batch_cls = batch_cls.to(device)

            net_out = net(batch_im)
            # net_out = net_train(batch_im)

            with torch.no_grad():
                out = torch.argmax(net_out, 1)
                acc = torch.eq(out,
                               batch_cls).sum(dtype=torch.float) / len(out)

            loss = F.cross_entropy(net_out, batch_cls, class_weight_for_loss)

            train_acc += acc.item()
            train_loss += loss.item()

            print('epoch: {} train acc: {:.3f} loss: {:.3f}'.format(
                e, acc.item(), loss.item()))
            optim.zero_grad()
            loss.backward()
            optim.step()

        train_acc = train_acc / batch_count
        train_loss = train_loss / batch_count

        sw.add_scalar('train_acc', train_acc, global_step=e)
        sw.add_scalar('train_loss', train_loss, global_step=e)

        # Evaluate on the validation set every 3 epochs
        if (e + 1) % 3 == 0:
            with torch.no_grad():
                net.eval()

                all_pred = []
                all_label = []

                for i in range(len(eval_dataset)):
                    ims, cls = eval_dataset.get_im_patch_list_to_combind_predict(
                        i, one_im=False)

                    batch_im = torch.tensor(ims.astype(np.int32),
                                            dtype=torch.float) / 65535
                    # batch_cls = torch.tensor([cls]).repeat(len(batch_im))

                    batch_im = batch_im.permute(0, 3, 1, 2)

                    batch_im = batch_im.to(device)
                    # batch_cls = batch_cls.to(device)

                    net_out = net(batch_im)
                    out = torch.argmax(net_out, 1)
                    all_label.append(cls)

                    if out.sum(dtype=torch.float).item(
                    ) > out.shape[0] * simple_thresh:
                        all_pred.append(1)
                    else:
                        all_pred.append(0)

                _accuracy = accuracy_score(all_label, all_pred)
                _malignant_precision, _malignant_recall, _malignant_f1, _ =\
                    precision_recall_fscore_support(all_label, all_pred, pos_label=1, average='binary')

                _benign_precision, _benign_recall, _benign_f1, _ =\
                    precision_recall_fscore_support(all_label, all_pred, pos_label=0, average='binary')

                _accuracy = float(_accuracy)
                _malignant_precision = float(_malignant_precision)
                _malignant_recall = float(_malignant_recall)
                _malignant_f1 = float(_malignant_f1)
                _benign_precision = float(_benign_precision)
                _benign_recall = float(_benign_recall)
                _benign_f1 = float(_benign_f1)

                sw.add_scalar('eval_acc', _accuracy, global_step=e)
                sw.add_scalar('eval_m_prec',
                              _malignant_precision,
                              global_step=e)
                sw.add_scalar('eval_m_recall',
                              _malignant_recall,
                              global_step=e)
                sw.add_scalar('eval_m_f1', _malignant_f1, global_step=e)
                sw.add_scalar('eval_b_prec', _benign_precision, global_step=e)
                sw.add_scalar('eval_b_recall', _benign_recall, global_step=e)
                sw.add_scalar('eval_b_f1', _benign_f1, global_step=e)

                print(
                    'epoch: {} eval acc: {:.3f} m_prec: {:.3f} m_rec: {:.3f} m_f1: {:.3f} '
                    'b_prec: {:.3f} b_rec: {:.3f} b_f1: {:.3f}'.format(
                        e, _accuracy, _malignant_precision, _malignant_recall,
                        _malignant_f1, _benign_precision, _benign_recall,
                        _benign_f1))

                avg_f1 = (_malignant_f1 + _benign_f1) / 2

                #if _benign_precision - _malignant_precision > 0.2:
                #    class_weight_for_loss[1] += 0.1

                if avg_f1 >= max_valid_value:
                    max_valid_value = avg_f1
                    torch.save(net.state_dict(), ck_name)
                    extra = {
                        'accuracy': _accuracy,
                        'm_precision': _malignant_precision,
                        'm_recall': _malignant_recall,
                        'm_f1': _malignant_f1,
                        'b_precision': _benign_precision,
                        'b_recall': _benign_recall,
                        'b_f1': _benign_f1,
                    }
                    yaml.safe_dump(extra, open(ck_extra_name, 'w'))
                    cm = confusion_matrix(all_label, all_pred)
                    draw_confusion_matrix(cm,
                                          list(eval_dataset.class2id.keys()),
                                          cm_name)

            # early exit
            if _accuracy == 1.:
                print('found valid acc == 1. , early exit')
                break

    sw.close()
Example #19
    if train_filename == test_filename:
        print 'Dev run.'
        ignore_closest = True
    else:
        ignore_closest = False

    debug = False
    dummy = False

    print 'Reading model..'
    if not dummy:
        model_reader = ModelReader(model_params_filename)
        model = model_reader.model
    else:
        model = DummyContextModel()
    dataset_reader = DatasetReader(model)

    print 'Reading train dataset..'
    train_set, train_key2ind, train_ind2key = dataset_reader.read_dataset(
        train_filename, train_filename + '.key', True, isolate_target_sentence)
    knn = Knn(k, train_set, train_key2ind)

    print 'Reading test dataset..'
    test_set, test_key2ind, test_ind2key = dataset_reader.read_dataset(
        test_filename, test_filename + '.key', False, isolate_target_sentence)

    print 'Starting to classify test set:'
    with open(result_filename, 'w') as o:
        for ind, key_set in enumerate(test_set):
            key = test_ind2key[ind]
            if debug:
Example #20
def main():
    if not (args.use_w1_w2_embeddings or args.use_paraphrase_vectors):
        raise ValueError(
            'At least one of "use_w1_w2_embeddings" or "use_paraphrase_vectors" should be set.'
        )

    # Load the datasets
    logger.info('Loading the datasets from {}'.format(args.dataset_prefix))
    train_set = DatasetReader(args.dataset_prefix + '/train.tsv')
    val_set = DatasetReader(args.dataset_prefix + '/val.tsv',
                            label2index=train_set.label2index)
    test_set = DatasetReader(args.dataset_prefix + '/test.tsv',
                             label2index=train_set.label2index)

    # Generate the feature vectors using the paraphrasing model
    logger.info('Generating feature vectors...')
    train_features, val_features, test_features = [], [], []

    if args.use_paraphrase_vectors:
        logger.info('Reading word embeddings from {}...'.format(
            args.word_embeddings_for_model))
        wv, model_words = load_binary_embeddings(
            args.word_embeddings_for_model)

        logger.info('Loading paraphrasing model from {}...'.format(
            args.paraphrase_model_dir))
        model = Model.load_model(args.language_model_dir, wv)

        model_words = ['[w1]', '[w2]', '[par]'] + model_words
        modelw2index = {w: i for i, w in enumerate(model_words)}
        UNK = modelw2index['unk']

    if args.use_w1_w2_embeddings:
        logger.info('Reading word embeddings from {}...'.format(
            args.word_embeddings_for_dist))
        wv, words = load_binary_embeddings(args.word_embeddings_for_dist)
        w2index = {w: i for i, w in enumerate(words)}
        UNK = w2index['unk']

        train_features.append(
            np.vstack([
                np.concatenate(
                    [wv[w2index.get(w1, UNK), :], wv[w2index.get(w2, UNK), :]])
                for (w1, w2) in train_set.noun_compounds
            ]))
        val_features.append(
            np.vstack([
                np.concatenate(
                    [wv[w2index.get(w1, UNK), :], wv[w2index.get(w2, UNK), :]])
                for (w1, w2) in val_set.noun_compounds
            ]))
        test_features.append(
            np.vstack([
                np.concatenate(
                    [wv[w2index.get(w1, UNK), :], wv[w2index.get(w2, UNK), :]])
                for (w1, w2) in test_set.noun_compounds
            ]))

    # Tune the hyper-parameters using the validation set
    logger.info('Classifying...')
    reg_values = [0.5, 1, 2, 5, 10]
    penalties = ['l2']
    k_values = [10, 15, 25, 50] if args.use_paraphrase_vectors else [0]
    classifiers = ['logistic', 'svm']
    f1_results = []
    descriptions = []
    models = []
    all_test_instances = []

    for k in k_values:
        # Copy the base feature lists so the += below does not accumulate
        # paraphrase vectors across successive values of k.
        curr_train_features, curr_val_features, curr_test_features = \
            list(train_features), list(val_features), list(test_features)
        if args.use_paraphrase_vectors:
            curr_train_features += [
                predict_paraphrases(model, train_set.noun_compounds,
                                    model_words, modelw2index, UNK, k)
            ]
            curr_val_features += [
                predict_paraphrases(model, val_set.noun_compounds, model_words,
                                    modelw2index, UNK, k)
            ]
            curr_test_features += [
                predict_paraphrases(model, test_set.noun_compounds,
                                    model_words, modelw2index, UNK, k)
            ]

        train_instances = [
            np.concatenate(list(f)) for f in zip(*curr_train_features)
        ]
        val_instances = [
            np.concatenate(list(f)) for f in zip(*curr_val_features)
        ]
        test_instances = [
            np.concatenate(list(f)) for f in zip(*curr_test_features)
        ]

        for cls in classifiers:
            for reg_c in reg_values:
                for penalty in penalties:
                    descriptions.append(
                        'K: {}, Classifier: {}, Penalty: {}, C: {:.2f}'.format(
                            k, cls, penalty, reg_c))

                    # Create the classifier
                    if cls == 'logistic':
                        classifier = LogisticRegression(
                            penalty=penalty,
                            C=reg_c,
                            multi_class='multinomial',
                            n_jobs=20,
                            solver='sag')
                    else:
                        classifier = LinearSVC(penalty=penalty,
                                               dual=False,
                                               C=reg_c)

                    logger.info(
                        'Training with classifier: {}, penalty: {}, c: {:.2f}...'
                        .format(cls, penalty, reg_c))
                    classifier.fit(train_instances, train_set.labels)
                    val_pred = classifier.predict(val_instances)
                    p, r, f1, _ = evaluate(val_set.labels,
                                           val_pred,
                                           val_set.index2label,
                                           do_full_reoprt=False)
                    logger.info(
                        'K: {}, Classifier: {}, penalty: {}, c: {:.2f}, precision: {:.3f}, recall: {:.3f}, F1: {:.3f}'
                        .format(k, cls, penalty, reg_c, p, r, f1))
                    f1_results.append(f1)
                    models.append(classifier)
                    all_test_instances.append(test_instances)

    best_index = np.argmax(f1_results)
    description = descriptions[best_index]
    classifier = models[best_index]
    logger.info('Best hyper-parameters: {}'.format(description))

    # Save the best model to a file
    logger.info('Copying the best model...')
    joblib.dump(classifier, '{}/best.pkl'.format(args.model_dir))

    # Evaluate on the test set
    logger.info('Evaluation:')
    test_instances = all_test_instances[best_index]
    test_pred = classifier.predict(test_instances)
    precision, recall, f1, support = evaluate(test_set.labels,
                                              test_pred,
                                              test_set.index2label,
                                              do_full_reoprt=True)
    logger.info('Precision: {:.3f}, Recall: {:.3f}, F1: {:.3f}'.format(
        precision, recall, f1))

    # Write the predictions to a file
    output_predictions(args.model_dir + '/predictions.tsv',
                       test_set.index2label, test_pred,
                       test_set.noun_compounds, test_set.labels)