Example #1
    def test_large_results_to_outputs(self):
        msg = ' '.join(map(str, range(DEFINED_ACTION_OUTPUTS_NUMBER + 1)))

        results = to_outputs(split(msg))

        assert results['length'] == '100'
        assert results['_99'] == '99 100'
Example #2
    def test_to_outputs(self):
        results = to_outputs(split('/release split v1'))

        expected = {
            'length': '3',
            '_0': '/release',
            '_1': 'split',
            '_2': 'v1',
        }
        assert results == expected
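The to_outputs helper exercised in Examples #1 and #2 is not shown on this page. A minimal sketch that satisfies both tests, assuming DEFINED_ACTION_OUTPUTS_NUMBER is 100 and that words beyond the last output slot are folded into it:

DEFINED_ACTION_OUTPUTS_NUMBER = 100  # assumed from Example #1

def to_outputs(words):
    """Map a list of words to string-valued outputs '_0', '_1', ...

    At most DEFINED_ACTION_OUTPUTS_NUMBER positional keys are produced;
    any words past the limit are joined into the last key, and 'length'
    reports how many positional keys exist.
    """
    limit = DEFINED_ACTION_OUTPUTS_NUMBER
    head, tail = words[:limit - 1], words[limit - 1:]
    outputs = {'_{}'.format(i): word for i, word in enumerate(head)}
    if tail:
        outputs['_{}'.format(limit - 1)] = ' '.join(tail)
    outputs['length'] = str(len(outputs))
    return outputs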
Example #3
    def test(self):
        self.assertEqual(main.split([1, 2, 3, 4, 5], 0), [[], [1, 2, 3, 4, 5]])
        self.assertEqual(main.split([1, 2, 3, 4, 5], 1), [[1], [2, 3, 4, 5]])
        self.assertEqual(main.split([1, 2, 3, 4, 5], 2), [[1, 2], [3, 4, 5]])
        self.assertEqual(main.split([1, 2, 3, 4, 5], 3), [[1, 2, 3], [4, 5]])
        self.assertEqual(main.split([1, 2, 3, 4, 5], 4), [[1, 2, 3, 4], [5]])
        self.assertEqual(main.split([1, 2, 3, 4, 5], 5), [[1, 2, 3, 4, 5], []])
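Example #3 tests a list-splitting main.split(items, index); its implementation is not included here. A minimal sketch consistent with the assertions above (note that the main.split(data, 0.5) calls in Examples #4 and #5 treat the second argument as a fraction, which this sketch does not cover):

def split(items, index):
    """Split a list into the first `index` items and the remainder."""
    return [items[:index], items[index:]]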
Example #4
def optimize_regularization(data):
    c_best, f1_best = 0, 0
    data_train, data_test = main.split(data, 0.5)
    for d in range(-10, 10):
        c = 2 ** d
        theta = train(data_train, c=c)
        res = testing(data_test, theta)
        f1_cur = res['f1']
        print("Current c: %.5f" % c)
        if f1_best < f1_cur:
            c_best, f1_best = c, f1_cur
            print("Now best F1: %.5f" % f1_best)
    return c_best
Example #5
def optimize_regularization(data):
    lymbda_best, f1_best = 0, 0
    data_train, data_test = main.split(data, 0.5)
    for d in range(-7, 10):
        lymbda = 2**d
        theta = train(data_train, lymbda=lymbda)
        res = testing(data_test, theta)
        f1_cur = res['f1']
        print("Current lymbda: %.5f" % lymbda)
        if f1_best < f1_cur:
            lymbda_best, f1_best = lymbda, f1_cur
            print("Now best F1: %.5f" % f1_best)

    return lymbda_best
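The main.split(data, 0.5) call in Examples #4 and #5 splits a dataset by fraction rather than by index, and neither its implementation nor train/testing is shown. A hypothetical sketch of such a splitter, with a shuffle added purely for illustration:

import random

def split(data, fraction, seed=None):
    """Shuffle `data` and split off the first `fraction` of the items
    as the training set; return (train, test)."""
    items = list(data)
    random.Random(seed).shuffle(items)
    cut = int(len(items) * fraction)
    return items[:cut], items[cut:]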
Example #6
def respell(data, dictionary):
	trie = Trie()
	split_data = split(data)
	
	for row in split_data:
		for word in row:
			if word in dictionary:
				trie.insert(word)
				
	for row in split_data:
		# correct single words
		for word in row:
			if word not in dictionary:
				corrected = trie.nearest_neighbor(word).next()
Example #7
def optimize_regularization(data):
    lymbda_best, f1_best = 0, 0
    data_train, data_test = main.split(data, 0.5)
    for d in range(-7, 10):
        lymbda = 2 ** d
        theta = train(data_train, lymbda=lymbda)
        res = testing(data_test, theta)
        f1_cur = res['f1']
        print("Current lymbda: %.5f" % lymbda)
        if f1_best < f1_cur:
            lymbda_best, f1_best = lymbda, f1_cur
            print("Now best F1: %.5f" % f1_best)

    return lymbda_best
Example #8
def respell(data, dictionary):
    trie = Trie()
    split_data = split(data)

    for row in split_data:
        for word in row:
            if word in dictionary:
                trie.insert(word)

    for row in split_data:
        # correct single words
        for word in row:
            if word not in dictionary:
                corrected = trie.nearest_neighbor(word).next()
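The Trie class used by respell (Examples #6 and #8) is not defined in these snippets; note also that trie.nearest_neighbor(word).next() is Python 2 syntax (Python 3 would use next(trie.nearest_neighbor(word))). Below is a hypothetical minimal stand-in for the interface the snippet relies on; for clarity it ranks candidates by brute-force edit distance instead of pruning the search through the trie:

class Trie:
    """Minimal stand-in: insert(word) plus a nearest_neighbor(word) generator."""

    def __init__(self):
        self._root = {}
        self._words = set()

    def insert(self, word):
        # Store the word in the trie and keep a flat set for the search below.
        node = self._root
        for ch in word:
            node = node.setdefault(ch, {})
        node['$'] = True  # end-of-word marker
        self._words.add(word)

    def nearest_neighbor(self, word):
        # Yield stored words ordered by Levenshtein distance to `word`.
        def distance(a, b):
            prev = list(range(len(b) + 1))
            for i, ca in enumerate(a, 1):
                cur = [i]
                for j, cb in enumerate(b, 1):
                    cur.append(min(prev[j] + 1,                # deletion
                                   cur[j - 1] + 1,             # insertion
                                   prev[j - 1] + (ca != cb)))  # substitution
                prev = cur
            return prev[-1]

        for candidate in sorted(self._words, key=lambda w: distance(word, w)):
            yield candidate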
Example #9
    def test_maxsplit_is_not_numeric(self):
        with pytest.raises(TypeError):
            split('must fail', maxsplit='a')
        with pytest.raises(TypeError):
            split('must fail', maxsplit=None)
Example #10
    def test_maxsplit_with_long_string(self, length, maxsplit, expected):
        msg = ' '.join(map(str, range(length)))
        results = split(msg, maxsplit=maxsplit)
        assert results[-1] == expected
Example #11
    def test_maxsplit(self, msg, maxsplit, expected):
        assert split(msg, maxsplit=maxsplit) == expected
Example #12
    def test_empty_separator(self):
        with pytest.raises(ValueError):
            split('must fail', sep='')
Example #13
    def test_separator(self, msg, sep, expected):
        assert split(msg, sep=sep) == expected
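The split function exercised by Examples #9 through #13 is not shown either. A minimal sketch consistent with those tests, delegating to str.split (which already raises ValueError for an empty separator) and rejecting non-integer maxsplit values explicitly:

def split(msg, sep=None, maxsplit=-1):
    """Split `msg` like str.split, but require maxsplit to be an integer."""
    if not isinstance(maxsplit, int):
        raise TypeError('maxsplit must be an integer')
    return msg.split(sep, maxsplit)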
Example #14
def train_and_visualize(num_hidden,
                        dropout,
                        *args,
                        training_steps=10,
                        batch_size=93,
                        **kwargs):

    #    full = kwargs['full']
    #    vocab_dict = kwargs['vocab_dict']
    #    embedding_matrix = kwargs['embedding_matrix']

    tf.reset_default_graph()  #resets graph

    #    full, vocab_dict, embedding_matrix = segment('train.csv','train_p.csv',size=embed_size)

    # Network Parameters
    num_input = 1
    time_step = full.data.shape[1]
    num_classes = 2

    # tf Graph input
    X = tf.placeholder(tf.int32, [None, time_step])
    X_length = tf.placeholder(tf.int32, [None])
    #embedding = tf.Variable(embedding_matrix)
    Y = tf.placeholder(tf.float16, [None, num_classes])

    # Define weights
    weights = {'out': tf.Variable(tf.random_normal([num_hidden, num_classes]))}
    biases = {'out': tf.Variable(tf.random_normal([num_classes]))}

    def RNN(x, x_length, weights, biases):
        """x: rank-1, x_length: rank-0, weights: rank-2, biases: rank-1
        rtype: rank-1"""
        batch_size_tmp = tf.shape(x)[0]
        embedding = tf.get_variable('embedding', [len(vocab_dict), embed_size])
        embed = [
            tf.nn.embedding_lookup(embedding, row)
            for row in tf.split(x, batch_size)
        ]
        embed = tf.reshape(embed, (batch_size_tmp, time_step, embed_size))
        embed = tf.unstack(embed, time_step, 1)

        lstm_cell = rnn.BasicLSTMCell(num_hidden)
        cell = tf.contrib.rnn.DropoutWrapper(lstm_cell,
                                             output_keep_prob=dropout)
        cell = rnn.MultiRNNCell([cell] * 1)

        #        pdb.set_trace()

        outputs, states = rnn.static_rnn(cell,
                                         dtype=tf.float32,
                                         sequence_length=x_length,
                                         inputs=embed)

        #        print(states)

        outputs = tf.stack(outputs)
        outputs = tf.transpose(outputs, [1, 0, 2])

        index = tf.range(0, batch_size_tmp) * \
            full.data.shape[1] + tf.reshape(x_length - 1, [batch_size_tmp])
        outputs = tf.gather(tf.reshape(outputs, [-1, num_hidden]), index)

        return tf.matmul(outputs,
                         weights['out']) + biases['out'], states, embed

    logits, states, embed = RNN(X, X_length, weights, biases)
    prediction = tf.nn.softmax(logits)
    #    tf.summary.histogram('logits', logits)

    loss_op = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=Y))

    optimizer = tf.train.AdamOptimizer()
    train_op = optimizer.minimize(loss_op)
    #    tf.summary.scalar('loss', loss_op)

    correct_pred = tf.equal(tf.argmax(prediction, 1), tf.argmax(Y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
    #    tf.summary.scalar('accuracy', accuracy)

    init = tf.global_variables_initializer()

    #    merged_summary = tf.summary.merge_all()
    #    states_summary=tf.summary.tensor_summary(states_node ,states)

    #    # Start training
    #    config = tf.ConfigProto()
    #    config.gpu_options.allow_growth = True

    fold = 7  #choose validation batch
    #    with tf.Session(config=config) as sess:
    with tf.Session() as sess:
        sess.run(init)
        train, validation = split(full, fold)
        previous_valid_acc = 0
        for step in range(1, training_steps + 1):
            for i in range(1,
                           train.data.shape[0] // batch_size + 1):  #stochastic
                batch = train.sample(batch_size)
                batch_x = batch.data
                batch_y = batch.target
                batch_x_length = batch.length
                batch_x_length = batch_x_length.reshape((-1))
                #                pdb.set_trace()
                #                summary, _,states = sess.run([merged_summary, train_op, states], feed_dict={
                #                        X: batch_x, X_length: batch_x_length, Y: batch_y})
                #                pdb.set_trace()
                _, state = sess.run([train_op, states],
                                    feed_dict={
                                        X: batch_x,
                                        X_length: batch_x_length,
                                        Y: batch_y
                                    })
            training_loss = []
            training_acc = []
            for i in range(1, train.data.shape[0] // batch_size + 1):
                batch = train.sample(batch_size)
                batch_x = batch.data
                batch_y = batch.target
                batch_x_length = batch.length
                batch_x_length = batch_x_length.reshape((-1))
                loss_tmp, acc_tmp = sess.run([loss_op, accuracy],
                                             feed_dict={
                                                 X: batch_x,
                                                 X_length: batch_x_length,
                                                 Y: batch_y
                                             })
                training_loss.append(loss_tmp)
                training_acc.append(acc_tmp)
            log.info("Step " + str(step) + ", Minibatch Loss= " +
                     "{:.4f}".format(np.mean(training_loss)) +
                     ", Training Accuracy= " +
                     "{:.3f}".format(np.mean(training_acc)))
            validation_loss = []
            validation_acc = []
            for i in range(1, validation.data.shape[0] // batch_size + 1):
                batch = validation.sample(batch_size)
                batch_x = batch.data
                batch_y = batch.target
                batch_x_length = batch.length
                batch_x_length = batch_x_length.reshape((-1))
                loss_tmp, acc_tmp = sess.run([loss_op, accuracy],
                                             feed_dict={
                                                 X: batch_x,
                                                 X_length: batch_x_length,
                                                 Y: batch_y
                                             })
                validation_loss.append(loss_tmp)
                validation_acc.append(acc_tmp)
            log.info("Step " + str(step) + ", num_hidden= " +
                     "{:.4f}".format(num_hidden) + ", dropout= " +
                     "{:.3f}".format(dropout) + ", Validation Loss= " +
                     "{:.4f}".format(np.mean(validation_loss)) +
                     ", Validation Accuracy= " +
                     "{:.3f}".format(np.mean(validation_acc)))
            if np.mean(validation_acc) < previous_valid_acc:
                break
            previous_valid_acc = np.mean(validation_acc)

        #visualize activation of each neuron at each word in 20 tweets
        dropout = 0
        result = []
        num_tweets = 10  #number of tweets to observe
        batch = validation.sample(
            num_tweets)  #change back to batch_size as needed
        batch_x = batch.data
        batch_y = batch.target
        batch_x_length = batch.length
        for i in range(num_tweets):
            batch_x_input = []
            batch_y_input = np.array([batch_y[i]] * 93)
            batch_x_length_input = []
            for j in range(1, 1 + batch_x.shape[1]):
                batch_x_input.append(
                    np.append(batch_x[i][:j], [0] * (batch_x.shape[1] - j)))
                batch_x_length_input.append(j)
            for j in range(batch_x.shape[1] + 1, 93 + 1):
                batch_x_input.append(np.array([0] * batch_x.shape[1]))
                batch_x_length_input.append(0)
            batch_x_input = np.array(batch_x_input)
            batch_x_length_input = np.array(batch_x_length_input).reshape((-1))
            #            state_list, embed_list = sess.run([states, embed], feed_dict={
            #                        X: batch_x_input, X_length: batch_x_length_input, Y: batch_y_input})
            #            pdb.set_trace()
            state_list, predicted = sess.run([states, prediction],
                                             feed_dict={
                                                 X: batch_x_input,
                                                 X_length:
                                                 batch_x_length_input,
                                                 Y: batch_y_input
                                             })
            #            loss_tmp, acc_tmp = sess.run([loss_op, accuracy], feed_dict={X: batch_x, X_length: batch_x_length,
            #                                                                                 Y: batch_y})
            #            print(acc_tmp)
            #            pdb.set_trace()
            res = {}  # key: three keys for each word in tweet
            for j in range(batch_x_length[i]):
                res[j, 'state'] = state_list[0][0][j]
                res[j, 'word'] = vocab_by_value[batch_x[i][j]]
                res[j, 'predicted'] = predicted[j]
            result.append(res)

    return previous_valid_acc, result, batch_y
Example #15
    if not os.path.exists(dest):
        print 'directory \'%s\' does not exist.' % (dest)
        sys.exit()
    elif not os.path.isdir(dest):
        print 'output destination \'%s\' should be a directory' % (dest)
        sys.exit()

    parts, istar = args.num, args.tar
    verbose = args.verbose

    base = osw.basename(target)
    path = osw.getpath(dest, base + '.fsplit')
    try:
        if verbose:
            print 'Splitting \'%s\' into \'%d\' parts' % (target, parts)
        main.split(target, num=parts, dest=dest)  # now split it
        if istar:
            if verbose:
                print 'Creating Tarfiles'
            tar.createTarAll(path)
            if verbose:
                print 'Tarfiles created'
        if verbose:
            print 'splits saved in \'%s\'' % (path)
            print 'Splitting Complete'
    except Exception, e:
        if os.path.isdir(path):
            shutil.rmtree(path)
        print e
    finally:
        print 'Exiting Now'
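The main.split(target, num=parts, dest=dest) call driven by this command-line snippet (Example #15) splits a file into a fixed number of parts under '<dest>/<basename>.fsplit'. Its implementation is not shown; a hypothetical sketch, with the part naming and chunked copy chosen purely for illustration:

import os

def split(target, num=2, dest='.', chunk=1024 * 1024):
    """Write `num` roughly equal parts of `target` into '<dest>/<base>.fsplit/'."""
    base = os.path.basename(target)
    outdir = os.path.join(dest, base + '.fsplit')
    if not os.path.isdir(outdir):
        os.makedirs(outdir)

    size = os.path.getsize(target)
    part_size = (size + num - 1) // num  # ceiling division

    with open(target, 'rb') as src:
        for i in range(num):
            part_path = os.path.join(outdir, '%s.%03d' % (base, i))
            with open(part_path, 'wb') as out:
                remaining = part_size
                while remaining > 0:
                    data = src.read(min(chunk, remaining))
                    if not data:
                        break
                    out.write(data)
                    remaining -= len(data)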