def test_large_results_to_outputs(self):
    msg = ' '.join(map(str, range(DEFINED_ACTION_OUTPUTS_NUMBER + 1)))
    results = to_outputs(split(msg))
    assert results['length'] == '100'
    assert results['_99'] == '99 100'
def test_to_outputs(self):
    results = to_outputs(split('/release split v1'))
    expected = {
        'length': '3',
        '_0': '/release',
        '_1': 'split',
        '_2': 'v1',
    }
    assert results == expected
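# A minimal sketch of the to_outputs helper the two tests above imply: it turns a
# list of words into string-valued outputs keyed '_0', '_1', ... plus a 'length'
# count, folding any words beyond the output cap into the last slot. The cap value
# and the implementation are assumptions reconstructed from the assertions, not
# the project's actual code.
DEFINED_ACTION_OUTPUTS_NUMBER = 100  # assumed cap, matching the '_99' assertion


def to_outputs(words):
    if len(words) > DEFINED_ACTION_OUTPUTS_NUMBER:
        # merge the overflow into the final output slot
        head = words[:DEFINED_ACTION_OUTPUTS_NUMBER - 1]
        tail = ' '.join(words[DEFINED_ACTION_OUTPUTS_NUMBER - 1:])
        words = head + [tail]
    outputs = {'length': str(len(words))}
    for i, word in enumerate(words):
        outputs['_%d' % i] = word
    return outputs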
def test(self):
    self.assertEqual(main.split([1, 2, 3, 4, 5], 0), [[], [1, 2, 3, 4, 5]])
    self.assertEqual(main.split([1, 2, 3, 4, 5], 1), [[1], [2, 3, 4, 5]])
    self.assertEqual(main.split([1, 2, 3, 4, 5], 2), [[1, 2], [3, 4, 5]])
    self.assertEqual(main.split([1, 2, 3, 4, 5], 3), [[1, 2, 3], [4, 5]])
    self.assertEqual(main.split([1, 2, 3, 4, 5], 4), [[1, 2, 3, 4], [5]])
    self.assertEqual(main.split([1, 2, 3, 4, 5], 5), [[1, 2, 3, 4, 5], []])
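# A minimal sketch of the main.split exercised by the test above: it cuts a list
# at the given index and returns both halves. This is a hypothetical
# reconstruction from the assertions, not the real main module (note that other
# snippets below use a different main.split that takes a fraction, not an index).
def split(items, index):
    return [items[:index], items[index:]]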
def optimize_regularization(data):
    c_best, f1_best = 0, 0
    data_train, data_test = main.split(data, 0.5)
    for d in range(-10, 10):
        c = 2 ** d
        theta = train(data_train, c=c)
        res = testing(data_test, theta)
        f1_cur = res['f1']
        print("Current c: %.5f" % c)
        if f1_best < f1_cur:
            c_best, f1_best = c, f1_cur
            print("Now best F1: %.5f" % f1_best)
    return c_best
def optimize_regularization(data):
    lymbda_best, f1_best = 0, 0
    data_train, data_test = main.split(data, 0.5)
    for d in range(-7, 10):
        lymbda = 2 ** d
        theta = train(data_train, lymbda=lymbda)
        res = testing(data_test, theta)
        f1_cur = res['f1']
        print("Current lymbda: %.5f" % lymbda)
        if f1_best < f1_cur:
            lymbda_best, f1_best = lymbda, f1_cur
            print("Now best F1: %.5f" % f1_best)
    return lymbda_best
def respell(data, dictionary):
    trie = Trie()
    split_data = split(data)
    for row in split_data:
        for word in row:
            if word in dictionary:
                trie.insert(word)
    for row in split_data:
        # correct single words
        for word in row:
            if word not in dictionary:
                corrected = trie.nearest_neighbor(word).next()
def test_maxsplit_is_not_numeric(self):
    with pytest.raises(TypeError):
        split('must fail', maxsplit='a')
    with pytest.raises(TypeError):
        split('must fail', maxsplit=None)
def test_maxsplit_with_long_string(self, length, maxsplit, expected):
    msg = ' '.join(map(str, range(length)))
    results = split(msg, maxsplit=maxsplit)
    assert results[-1] == expected
def test_maxsplit(self, msg, maxsplit, expected):
    assert split(msg, maxsplit=maxsplit) == expected
def test_empty_separator(self):
    with pytest.raises(ValueError):
        split('must fail', sep='')
def test_separator(self, msg, sep, expected):
    assert split(msg, sep=sep) == expected
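# A minimal sketch consistent with the split tests above: a thin wrapper around
# str.split with explicit validation of maxsplit and sep. This is an assumed
# reconstruction that only mirrors the error cases and keyword arguments the
# tests assert; the project's real split may do more.
def split(msg, sep=None, maxsplit=-1):
    if not isinstance(maxsplit, int) or isinstance(maxsplit, bool):
        raise TypeError('maxsplit must be an integer')
    if sep == '':
        raise ValueError('empty separator')
    return msg.split(sep, maxsplit)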
def train_and_visualize(num_hidden, dropout, *args, training_steps=10,
                        batch_size=93, **kwargs):
    # full = kwargs['full']
    # vocab_dict = kwargs['vocab_dict']
    # embedding_matrix = kwargs['embedding_matrix']
    tf.reset_default_graph()  # resets graph
    # full, vocab_dict, embedding_matrix = segment('train.csv', 'train_p.csv', size=embed_size)

    # Network Parameters
    num_input = 1
    time_step = full.data.shape[1]
    num_classes = 2

    # tf Graph input
    X = tf.placeholder(tf.int32, [None, time_step])
    X_length = tf.placeholder(tf.int32, [None])
    # embedding = tf.Variable(embedding_matrix)
    Y = tf.placeholder(tf.float16, [None, num_classes])

    # Define weights
    weights = {'out': tf.Variable(tf.random_normal([num_hidden, num_classes]))}
    biases = {'out': tf.Variable(tf.random_normal([num_classes]))}

    def RNN(x, x_length, weights, biases):
        """x: rank-1, x_length: rank-0, weights: rank-2, biases: rank-1
        rtype: rank-1"""
        batch_size_tmp = tf.shape(x)[0]
        embedding = tf.get_variable('embedding', [len(vocab_dict), embed_size])
        embed = [
            tf.nn.embedding_lookup(embedding, row)
            for row in tf.split(x, batch_size)
        ]
        embed = tf.reshape(embed, (batch_size_tmp, time_step, embed_size))
        embed = tf.unstack(embed, time_step, 1)
        lstm_cell = rnn.BasicLSTMCell(num_hidden)
        cell = tf.contrib.rnn.DropoutWrapper(lstm_cell, output_keep_prob=dropout)
        cell = rnn.MultiRNNCell([cell] * 1)
        # pdb.set_trace()
        outputs, states = rnn.static_rnn(cell,
                                         dtype=tf.float32,
                                         sequence_length=x_length,
                                         inputs=embed)
        # print(states)
        outputs = tf.stack(outputs)
        outputs = tf.transpose(outputs, [1, 0, 2])
        index = (tf.range(0, batch_size_tmp) * full.data.shape[1] +
                 tf.reshape(x_length - 1, [batch_size_tmp]))
        outputs = tf.gather(tf.reshape(outputs, [-1, num_hidden]), index)
        return tf.matmul(outputs, weights['out']) + biases['out'], states, embed

    logits, states, embed = RNN(X, X_length, weights, biases)
    prediction = tf.nn.softmax(logits)
    # tf.summary.histogram('logits', logits)

    loss_op = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=Y))
    optimizer = tf.train.AdamOptimizer()
    train_op = optimizer.minimize(loss_op)
    # tf.summary.scalar('loss', loss_op)

    correct_pred = tf.equal(tf.argmax(prediction, 1), tf.argmax(Y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
    # tf.summary.scalar('accuracy', accuracy)

    init = tf.global_variables_initializer()
    # merged_summary = tf.summary.merge_all()
    # states_summary = tf.summary.tensor_summary(states_node, states)

    # # Start training
    # config = tf.ConfigProto()
    # config.gpu_options.allow_growth = True
    fold = 7  # choose validation batch
    # with tf.Session(config=config) as sess:
    with tf.Session() as sess:
        sess.run(init)
        train, validation = split(full, fold)
        previous_valid_acc = 0
        for step in range(1, training_steps + 1):
            for i in range(1, train.data.shape[0] // batch_size + 1):  # stochastic
                batch = train.sample(batch_size)
                batch_x = batch.data
                batch_y = batch.target
                batch_x_length = batch.length
                batch_x_length = batch_x_length.reshape((-1))
                # pdb.set_trace()
                # summary, _, states = sess.run([merged_summary, train_op, states], feed_dict={
                #     X: batch_x, X_length: batch_x_length, Y: batch_y})
                # pdb.set_trace()
                _, state = sess.run([train_op, states],
                                    feed_dict={
                                        X: batch_x,
                                        X_length: batch_x_length,
                                        Y: batch_y
                                    })

            training_loss = []
            training_acc = []
            for i in range(1, train.data.shape[0] // batch_size + 1):
                batch = train.sample(batch_size)
                batch_x = batch.data
                batch_y = batch.target
                batch_x_length = batch.length
                batch_x_length = batch_x_length.reshape((-1))
                loss_tmp, acc_tmp = sess.run([loss_op, accuracy],
                                             feed_dict={
                                                 X: batch_x,
                                                 X_length: batch_x_length,
                                                 Y: batch_y
                                             })
                training_loss.append(loss_tmp)
                training_acc.append(acc_tmp)
            log.info("Step " + str(step) + ", Minibatch Loss= " +
                     "{:.4f}".format(np.mean(training_loss)) +
                     ", Training Accuracy= " +
                     "{:.3f}".format(np.mean(training_acc)))

            validation_loss = []
            validation_acc = []
            for i in range(1, validation.data.shape[0] // batch_size + 1):
                batch = validation.sample(batch_size)
                batch_x = batch.data
                batch_y = batch.target
                batch_x_length = batch.length
                batch_x_length = batch_x_length.reshape((-1))
                loss_tmp, acc_tmp = sess.run([loss_op, accuracy],
                                             feed_dict={
                                                 X: batch_x,
                                                 X_length: batch_x_length,
                                                 Y: batch_y
                                             })
                validation_loss.append(loss_tmp)
                validation_acc.append(acc_tmp)
            log.info("Step " + str(step) + ", num_hidden= " +
                     "{:.4f}".format(num_hidden) + ", dropout= " +
                     "{:.3f}".format(dropout) + ", Validation Loss= " +
                     "{:.4f}".format(np.mean(validation_loss)) +
                     ", Validation Accuracy= " +
                     "{:.3f}".format(np.mean(validation_acc)))
            if np.mean(validation_acc) < previous_valid_acc:
                break
            previous_valid_acc = np.mean(validation_acc)

        # visualize activation of each neuron at each word in 20 tweets
        dropout = 0
        result = []
        num_tweets = 10  # number of tweets to observe
        batch = validation.sample(num_tweets)  # change back to batch_size as needed
        batch_x = batch.data
        batch_y = batch.target
        batch_x_length = batch.length
        for i in range(num_tweets):
            batch_x_input = []
            batch_y_input = np.array([batch_y[i]] * 93)
            batch_x_length_input = []
            for j in range(1, 1 + batch_x.shape[1]):
                batch_x_input.append(
                    np.append(batch_x[i][:j], [0] * (batch_x.shape[1] - j)))
                batch_x_length_input.append(j)
            for j in range(batch_x.shape[1] + 1, 93 + 1):
                batch_x_input.append(np.array([0] * batch_x.shape[1]))
                batch_x_length_input.append(0)
            batch_x_input = np.array(batch_x_input)
            batch_x_length_input = np.array(batch_x_length_input).reshape((-1))
            # state_list, embed_list = sess.run([states, embed], feed_dict={
            #     X: batch_x_input, X_length: batch_x_length_input, Y: batch_y_input})
            # pdb.set_trace()
            state_list, predicted = sess.run([states, prediction],
                                             feed_dict={
                                                 X: batch_x_input,
                                                 X_length: batch_x_length_input,
                                                 Y: batch_y_input
                                             })
            # loss_tmp, acc_tmp = sess.run([loss_op, accuracy], feed_dict={
            #     X: batch_x, X_length: batch_x_length, Y: batch_y})
            # print(acc_tmp)
            # pdb.set_trace()
            res = {}  # key: three keys for each word in tweet
            for j in range(batch_x_length[i]):
                res[j, 'state'] = state_list[0][0][j]
                res[j, 'word'] = vocab_by_value[batch_x[i][j]]
                res[j, 'predicted'] = predicted[j]
            result.append(res)
    return previous_valid_acc, result, batch_y
if not os.path.exists(dest):
    print 'directory \'%s\' does not exist.' % (dest)
    sys.exit()
elif not os.path.isdir(dest):
    print 'output destination \'%s\' should be a directory' % (dest)
    sys.exit()

parts, istar = args.num, args.tar
verbose = args.verbose
base = osw.basename(target)
path = osw.getpath(dest, base + '.fsplit')

try:
    if verbose:
        print 'Splitting \'%s\' into \'%d\' parts' % (target, parts)
    main.split(target, num=parts, dest=dest)  # now split it
    if istar:
        if verbose:
            print 'Creating Tarfiles'
        tar.createTarAll(path)
        if verbose:
            print 'Tarfiles created'
    if verbose:
        print 'splits saved in \'%s\'' % (path)
    print 'Splitting Complete'
except Exception, e:
    if os.path.isdir(path):
        shutil.rmtree(path)
    print e
finally:
    print 'Exiting Now'