def test_to_json(self):
    """Round-trip a sequence through ``to_json`` and read it back with ``seq.json``.

    The same sequence of ``[key, value]`` pairs is written twice to one
    temporary path: once with the default arguments and once with
    ``root_array=False``. Each time the file is re-read and compared with
    the expected list / dict.
    """
    output_path = 'functional/test/data/tmp/output.txt'
    pairs = [[u'a', 1], [u'b', 2], [u'c', 3]]
    pair_sequence = seq(pairs)

    # Default call: the file must read back as the original list of pairs.
    pair_sequence.to_json(output_path)
    self.assertEqual(pairs, seq.json(output_path).to_list())

    # root_array=False: the file must read back as the equivalent mapping.
    expected_mapping = {u'a': 1, u'b': 2, u'c': 3}
    pair_sequence.to_json(output_path, root_array=False)
    self.assertEqual(expected_mapping, seq.json(output_path).to_dict())
def generate_blogfile():
    """Write a tab-separated word table for the blogs in ``datafiles/blogdata.json``.

    Every JSON entry is wrapped in ``feed``. A vocabulary of at most 500
    words is selected from the feeds' ``wordCount`` mappings and sorted
    alphabetically; its length is printed. The output file gets a header
    row (``Blog`` followed by the vocabulary) and then one chunk of text
    per feed, produced by ``output`` and ordered by feed title.
    """
    # Read the data in as JSON and then transform each entry into a feed.
    feeds = seq.json("datafiles/blogdata.json").map(feed).to_list()  # type: list[feed]

    # Selecting the vocabulary:
    #   1. yield every (word, count) pair of every feed
    #   2. keep the pairs whose count is > 10
    #   3. replace each count by 1 and sum per word, i.e. count in how many
    #      feeds the word passed step 2
    #   4. keep the words accepted by filter_fun (described by the original
    #      author as a "fake tfidf" criterion)
    #   5. order by descending total, take the first 500, keep only the word
    per_feed_counts = seq(feeds).flat_map(lambda f: list(f.wordCount.items()))
    frequent_pairs = per_feed_counts.filter(lambda wc: wc[1] > 10)
    feed_hits = frequent_pairs.map(lambda wc: (wc[0], 1)).reduce_by_key(add)
    ranked = feed_hits.filter(filter_fun).order_by(lambda wc: -wc[1])
    selected_words = ranked.take(500).map(lambda wc: wc[0]).to_list()

    # Sort alphabetically.
    top500 = sorted(selected_words)
    print(len(top500))

    # Write the result to file: header first, then the feeds in title order.
    with open("datafiles/blogtop500.txt", "w+") as out:
        out.write("Blog\t%s\n" % '\t'.join(top500))
        for entry in sorted(feeds, key=lambda f: f.title):
            out.write(output(entry, top500))
def _get_sales_start(file, from_date, headers):
    """Return the latest sale starts after ``from_date`` as rows of text values.

    The file is rewound and parsed as JSON. Records with a truthy
    ``sale_start`` that parses (via ``_strpdate``) to a value greater than
    ``from_date`` are kept. The 50 records with the greatest ``sale_start``
    are selected and returned in ascending ``sale_start`` order.

    :param file: seekable file object containing the JSON records.
    :param from_date: lower bound (exclusive) compared against the parsed
        ``sale_start`` of each record.
    :param headers: iterable of sequences whose first item is a record key
        to keep; presumably (key, label) pairs -- verify against the caller.
    :return: list of dicts restricted to the header keys, every value
        converted with ``unicode``.
    """
    # Build the wanted-key lookup once. The previous code rebuilt the list
    # ``[h[0] for h in headers]`` for every key of every record.
    wanted_keys = frozenset(h[0] for h in headers)

    file.seek(0)
    return seq.json(file)\
        .filter(lambda b: b.get('sale_start') and _strpdate(b['sale_start']) > from_date)\
        .sorted(key=lambda b: _strpdate(b['sale_start']), reverse=True)\
        .take(50)\
        .cache()\
        .reverse()\
        .map(lambda b: {
            k: unicode(v) for k, v in b.iteritems() if k in wanted_keys
        })\
        .to_list()
def generate_blogfile_stem():
    """Write a tab-separated stem table for the blogs in ``datafiles/fmeasure.json``.

    Every JSON entry is wrapped in ``feed(entry, True)``. The stems from the
    feeds' ``stemCount`` mappings are summed per stem, filtered with
    ``filter_fun`` and sorted alphabetically; the vocabulary size is
    printed. The output file gets a header row and then one chunk of text
    per feed, produced by ``output_stem`` and ordered by feed title.
    """
    # Stemmed counterpart of the non-stem generator.
    feeds = seq.json("datafiles/fmeasure.json").map(lambda fe: feed(fe, True)).to_list()  # type: list[feed]

    # Sum the per-feed stem counts into one total per stem.
    stem_pairs = seq(feeds).flat_map(lambda f: list(f.stemCount.items()))
    stem_totals = stem_pairs.map(lambda wc: (wc[0], wc[1])).reduce_by_key(add)

    # NOTE(review): despite the name ``top500`` nothing limits the result to
    # 500 entries here -- every stem accepted by filter_fun is kept.
    accepted = stem_totals.filter(filter_fun).order_by(lambda wc: -wc[1])
    top500 = sorted(accepted.map(lambda wc: wc[0]).to_list())
    print(len(top500))

    with open("datafiles/fmeasure_stemmed.txt", "w+") as out:
        out.write("Blog\t%s\n" % '\t'.join(top500))
        for entry in sorted(feeds, key=lambda f: f.title):
            out.write(output_stem(entry, top500))
def test_json(self):
    """``seq.json`` accepts paths and open files, and rejects other input.

    A JSON list fixture must come back as its elements and a JSON object
    fixture as its (key, value) items, whether the source is given as a
    path or as an open file handle. Passing an int must raise ValueError.
    """
    list_test_path = 'functional/test/data/test_list.json'
    dict_test_path = 'functional/test/data/test_dict.json'
    expected_elements = [1, 2, 3, 4, 5]
    expected_items = list(six.viewitems({u'a': 1, u'b': 2, u'c': 3}))

    # Sources given as path strings.
    self.assertEqual(expected_elements, seq.json(list_test_path).to_list())
    self.assertEqual(expected_items, seq.json(dict_test_path).to_list())

    # Sources given as already opened file handles, list fixture first.
    handle_cases = (
        (list_test_path, expected_elements),
        (dict_test_path, expected_items),
    )
    for fixture_path, expected in handle_cases:
        with open(fixture_path) as file_handle:
            self.assertEqual(expected, seq.json(file_handle).to_list())

    # Anything that is neither a path nor a file-like object is rejected.
    with self.assertRaises(ValueError):
        seq.json(1)
def test_seq_json_list(self):
    """``seq.json`` on a file object holding a JSON array compares equal to that list."""
    json_file = _make_str_file(u'''[1, 2, 3]''')
    expected = [1, 2, 3]
    assert seq.json(json_file) == expected
def test_seq_json_dict(self):
    """``seq.json`` on a file object holding a JSON object yields (key, value) pairs.

    The pairs are sorted before comparing so the check does not depend on
    the order in which the object's members are produced.
    """
    json_file = _make_str_file(u'''{"a": 1, "b": 2, "c": 3}''')
    sorted_pairs = seq.json(json_file).sorted()
    expected = [('a', 1), ('b', 2), ('c', 3)]
    assert sorted_pairs == expected
def train():
    """Train the tagging / intent model and evaluate it at every checkpoint.

    Steps, in order:

    1. Route the ``tensorflow`` logger to a timestamped file under
       ``FLAGS.log`` and to stdout.
    2. Prepare the data with ``data_utils.prepare_multi_task_data`` and load
       the word, tag and label vocabularies. Each vocabulary and its reverse
       mapping are cached as JSON under ``data_bak/``; the cache is
       (re)written whenever either file of a pair is missing.
    3. Bucket the train / dev / test data with ``read_data``.
    4. Run training steps until ``model.global_step`` reaches
       ``FLAGS.max_training_steps``. Every ``FLAGS.steps_per_checkpoint``
       steps the dev and test sets are evaluated; when the dev result
       improves, the tagging output file is renamed and a checkpoint is
       saved. Training stops early once the number of consecutive
       evaluations without dev improvement exceeds
       ``FLAGS.no_improve_per_step``.

    Relies on module-level names not defined in this function: ``FLAGS``,
    ``task``, ``_buckets``, ``create_model``, ``read_data``, ``conlleval``,
    ``data_utils`` and ``seq``.
    """
    tf.logging.info('Applying Parameters:')
    tf.logging.info("Preparing data in %s" % FLAGS.data_dir)
    nowTime = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
    tf.logging.set_verbosity(tf.logging.INFO)
    # Replace the tensorflow logger's handlers: one log file per run + stdout.
    handlers = [
        logging.FileHandler(os.path.join(FLAGS.log, nowTime + '.log')),
        logging.StreamHandler(sys.stdout)
    ]
    logging.getLogger('tensorflow').handlers = handlers

    date_set = data_utils.prepare_multi_task_data(FLAGS.data_dir)
    in_seq_train, out_seq_train, label_train = date_set[0]
    in_seq_dev, out_seq_dev, label_dev = date_set[1]
    in_seq_test, out_seq_test, label_test = date_set[2]
    vocab_path, tag_vocab_path, label_vocab_path = date_set[3]

    result_dir = FLAGS.train_dir + '/test_results'
    if not tf.gfile.IsDirectory(result_dir):
        tf.gfile.MakeDirs(result_dir)

    current_taging_valid_out_file = result_dir + '/tagging.valid.hyp.txt'
    current_taging_test_out_file = result_dir + '/tagging.test.hyp.txt'

    # Word vocabulary: build and cache it, or load it from the cache.
    if not tf.gfile.Exists('data_bak/vocab.json') or not tf.gfile.Exists(
            'data_bak/rev_vocab.json'):
        vocab, rev_vocab = data_utils.initialize_vocab(vocab_path)
        with tf.gfile.GFile('data_bak/vocab.json', 'w') as vocab_file, \
                tf.gfile.GFile('data_bak/rev_vocab.json', 'w') as rev_vocab_file:
            vocab_file.write(json.dumps(vocab, ensure_ascii=False, indent=4))
            rev_vocab_file.write(
                json.dumps(rev_vocab, ensure_ascii=False, indent=4))
    else:
        with tf.gfile.GFile('data_bak/vocab.json', 'r') as vocab_file, \
                tf.gfile.GFile('data_bak/rev_vocab.json', 'r') as rev_vocab_file:
            vocab = json.load(vocab_file)
            # The first item of each cached pair is converted back to int
            # (presumably the ids were stringified as JSON object keys).
            rev_vocab = seq.json(rev_vocab_file).map(
                lambda x: (int(x[0]), x[1])).to_dict()

    # Tag vocabulary: same caching scheme.
    if not tf.gfile.Exists('data_bak/tag_vocab.json') or not tf.gfile.Exists(
            'data_bak/rev_tag_vocab.json'):
        tag_vocab, rev_tag_vocab = data_utils.initialize_vocab(tag_vocab_path)
        with tf.gfile.GFile('data_bak/tag_vocab.json', 'w') as tag_vocab_file, \
                tf.gfile.GFile('data_bak/rev_tag_vocab.json', 'w') as rev_tag_vocab_file:
            tag_vocab_file.write(
                json.dumps(tag_vocab, ensure_ascii=False, indent=4))
            rev_tag_vocab_file.write(
                json.dumps(rev_tag_vocab, ensure_ascii=False, indent=4))
    else:
        with tf.gfile.GFile('data_bak/tag_vocab.json', 'r') as tag_vocab_file, \
                tf.gfile.GFile('data_bak/rev_tag_vocab.json', 'r') as rev_tag_vocab_file:
            tag_vocab = json.load(tag_vocab_file)
            rev_tag_vocab = seq.json(rev_tag_vocab_file).map(
                lambda x: (int(x[0]), x[1])).to_dict()

    # Label vocabulary: same caching scheme.
    if not tf.gfile.Exists('data_bak/label_vocab.json') or not tf.gfile.Exists(
            'data_bak/rev_label_vocab.json'):
        label_vocab, rev_label_vocab = data_utils.initialize_vocab(
            label_vocab_path)
        with tf.gfile.GFile('data_bak/label_vocab.json', 'w') as label_vocab_file, \
                tf.gfile.GFile('data_bak/rev_label_vocab.json', 'w') as rev_label_vocab_file:
            label_vocab_file.write(
                json.dumps(label_vocab, ensure_ascii=False, indent=4))
            rev_label_vocab_file.write(
                json.dumps(rev_label_vocab, ensure_ascii=False, indent=4))
    else:
        with tf.gfile.GFile('data_bak/label_vocab.json', 'r') as label_vocab_file, \
                tf.gfile.GFile('data_bak/rev_label_vocab.json', 'r') as rev_label_vocab_file:
            label_vocab = json.load(label_vocab_file)
            rev_label_vocab = seq.json(rev_label_vocab_file).map(
                lambda x: (int(x[0]), x[1])).to_dict()

    # Read data into buckets and compute their sizes.
    tf.logging.info("Reading train/valid/test data (training set limit: %d)."
                    % FLAGS.max_train_data_size)
    dev_set = read_data(in_seq_dev, out_seq_dev, label_dev)
    test_set = read_data(in_seq_test, out_seq_test, label_test)
    train_set = read_data(in_seq_train, out_seq_train, label_train)
    train_bucket_sizes = [len(train_set[b]) for b in xrange(len(_buckets))]
    train_total_size = float(sum(train_bucket_sizes))

    config = tf.ConfigProto(
        gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=0.23),
        # device_count = {'gpu': 2}
    )

    with tf.Session(config=config) as sess:
        # Create model.
        tf.logging.info("Max sequence length: %d." % _buckets[0][0])
        tf.logging.info("Creating %d layers of %d units."
                        % (FLAGS.num_layers, FLAGS.size))

        model, model_test = create_model(sess, len(vocab), len(tag_vocab),
                                         len(label_vocab))
        tf.logging.info("Creating model with " +
                        "source_vocab_size=%d, target_vocab_size=%d, label_vocab_size=%d."
                        % (len(vocab), len(tag_vocab), len(label_vocab)))

        # TensorBoard summaries, written to a per-run directory.
        tf.summary.scalar('loss', model.loss)
        tf.summary.scalar('dev_accuracy', model.best_dev_accuracy)
        tf.summary.scalar('dev_f1', model.best_dev_f1)
        tf.summary.scalar('test_accuracy', model.best_test_accuracy)
        tf.summary.scalar('test_f1', model.best_test_f1)
        model.merged = tf.summary.merge_all()
        model.writer = tf.summary.FileWriter(
            os.path.join(FLAGS.tensorboard, nowTime))
        model.writer.add_graph(graph=sess.graph)

        # Cumulative share of the training data held by buckets 0..i; used
        # to sample a bucket with probability proportional to its size.
        train_buckets_scale = [
            sum(train_bucket_sizes[:i + 1]) / train_total_size
            for i in xrange(len(train_bucket_sizes))
        ]

        # This is the training loop.
        step_time, loss = 0.0, 0.0
        current_step = 0
        no_improve_step = 0
        while model.global_step.eval() < FLAGS.max_training_steps:
            random_number_01 = np.random.random_sample()
            bucket_id = min([
                i for i in xrange(len(train_buckets_scale))
                if train_buckets_scale[i] > random_number_01
            ])

            # Get a batch and make a step.
            start_time = time.time()
            batch_data = model.get_batch(train_set, bucket_id)
            encoder_inputs, tags, tag_weights, batch_sequence_length, labels = batch_data
            if task['joint'] == 1:
                step_outputs = model.joint_step(sess, encoder_inputs, tags,
                                                tag_weights, labels,
                                                batch_sequence_length,
                                                bucket_id, False)
                _, step_loss, tagging_logits, class_logits = step_outputs
            elif task['tagging'] == 1:
                step_outputs = model.tagging_step(sess, encoder_inputs, tags,
                                                  tag_weights,
                                                  batch_sequence_length,
                                                  bucket_id, False)
                _, step_loss, tagging_logits = step_outputs
            elif task['intent'] == 1:
                step_outputs = model.classification_step(
                    sess, encoder_inputs, labels, batch_sequence_length,
                    bucket_id, False)
                _, step_loss, class_logits = step_outputs

            summary = sess.run(model.merged, model.input_feed)
            model.writer.add_summary(summary, model.global_step.eval())
            step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint
            loss += step_loss / FLAGS.steps_per_checkpoint
            current_step += 1

            # Once in a while, we save checkpoint, print statistics, and run evals.
            if current_step % FLAGS.steps_per_checkpoint == 0:
                perplexity = math.exp(loss) if loss < 300 else float('inf')
                tf.logging.info(
                    "global step %d step-time %.2f. Training perplexity %.2f"
                    % (model.global_step.eval(), step_time, perplexity))
                sys.stdout.flush()
                # Save checkpoint and zero timer and loss.
                checkpoint_path = os.path.join(FLAGS.train_dir, "model.ckpt")
                step_time, loss = 0.0, 0.0

                def run_valid_test(data_set, mode):  # mode: Eval, Test
                    # Run evals on development/test set and print the accuracy.
                    word_list = list()
                    ref_tag_list = list()
                    hyp_tag_list = list()
                    ref_label_list = list()
                    hyp_label_list = list()
                    correct_count = 0
                    accuracy = 0.0
                    tagging_eval_result = dict()
                    for bucket_id in xrange(len(_buckets)):
                        eval_loss = 0.0
                        count = 0
                        for i in xrange(len(data_set[bucket_id])):
                            count += 1
                            sample = model_test.get_one(data_set, bucket_id, i)
                            encoder_inputs, tags, tag_weights, sequence_length, labels = sample
                            tagging_logits = []
                            class_logits = []
                            if task['joint'] == 1:
                                step_outputs = model_test.joint_step(
                                    sess, encoder_inputs, tags, tag_weights,
                                    labels, sequence_length, bucket_id, True)
                                _, step_loss, tagging_logits, class_logits = step_outputs
                            elif task['tagging'] == 1:
                                step_outputs = model_test.tagging_step(
                                    sess, encoder_inputs, tags, tag_weights,
                                    sequence_length, bucket_id, True)
                                _, step_loss, tagging_logits = step_outputs
                            elif task['intent'] == 1:
                                step_outputs = model_test.classification_step(
                                    sess, encoder_inputs, labels,
                                    sequence_length, bucket_id, True)
                                _, step_loss, class_logits = step_outputs
                            eval_loss += step_loss / len(data_set[bucket_id])
                            hyp_label = None
                            if task['intent'] == 1:
                                ref_label_list.append(
                                    rev_label_vocab[labels[0][0]])
                                hyp_label = np.argmax(class_logits[0], 0)
                                hyp_label_list.append(
                                    rev_label_vocab[hyp_label])
                                if labels[0] == hyp_label:
                                    correct_count += 1
                            if task['tagging'] == 1:
                                # Map ids back to strings, truncated to the
                                # real sequence length of the sample.
                                word_list.append(
                                    [rev_vocab[x[0]] for x in
                                     encoder_inputs[:sequence_length[0]]])
                                ref_tag_list.append(
                                    [rev_tag_vocab[x[0]] for x in
                                     tags[:sequence_length[0]]])
                                hyp_tag_list.append(
                                    [rev_tag_vocab[np.argmax(x)] for x in
                                     tagging_logits[:sequence_length[0]]])

                    # NOTE(review): ``count`` is reset for every bucket while
                    # ``correct_count`` is not, so this is only a true
                    # accuracy with a single bucket -- confirm _buckets.
                    accuracy = float(correct_count) * 100 / count
                    if task['intent'] == 1:
                        tf.logging.info("\t%s accuracy: %.2f %d/%d"
                                        % (mode, accuracy, correct_count, count))
                        sys.stdout.flush()
                    if task['tagging'] == 1:
                        if mode == 'Eval':
                            taging_out_file = current_taging_valid_out_file
                        elif mode == 'Test':
                            taging_out_file = current_taging_test_out_file
                        tagging_eval_result = conlleval(
                            hyp_tag_list, ref_tag_list, word_list,
                            taging_out_file)
                        tf.logging.info("\t%s f1-score: %.2f"
                                        % (mode, tagging_eval_result['f1']))
                        sys.stdout.flush()
                    return accuracy, tagging_eval_result

                # The four ``mv`` calls below are made WITHOUT shell=True.
                # With shell=True and a list argument, POSIX runs only the
                # first item ('mv') as the command and hands the paths to
                # the shell itself, so the files were never renamed.
                # NOTE(review): the '.best_f1_' suffix uses the best value
                # read BEFORE the update (the previous best), not the score
                # that produced the file -- confirm this is intended.

                # valid
                valid_accuracy, valid_tagging_result = run_valid_test(
                    dev_set, 'Eval')
                if task['tagging'] == 1 and task['intent'] == 0:
                    best_dev_f1 = model.best_dev_f1.eval()
                    if valid_tagging_result['f1'] > best_dev_f1:
                        tf.assign(model.best_dev_f1,
                                  valid_tagging_result['f1']).eval()
                        # save the best output file
                        subprocess.call([
                            'mv', current_taging_valid_out_file,
                            current_taging_valid_out_file + '.best_f1_%.2f'
                            % best_dev_f1
                        ])
                        model.saver.save(sess, checkpoint_path,
                                         global_step=model.global_step)
                        no_improve_step = 0
                    else:
                        no_improve_step += 1
                if task['tagging'] == 1 and task['intent'] == 1:
                    best_dev_accuracy = model.best_dev_accuracy.eval()
                    best_dev_f1 = model.best_dev_f1.eval()
                    # Joint task: both metrics must improve to count.
                    if valid_accuracy > best_dev_accuracy and \
                            valid_tagging_result['f1'] > best_dev_f1:
                        tf.assign(model.best_dev_accuracy,
                                  valid_accuracy).eval()
                        tf.assign(model.best_dev_f1,
                                  valid_tagging_result['f1']).eval()
                        subprocess.call([
                            'mv', current_taging_valid_out_file,
                            current_taging_valid_out_file + '.best_f1_%.2f'
                            % best_dev_f1
                        ])
                        model.saver.save(sess, checkpoint_path,
                                         global_step=model.global_step)
                        no_improve_step = 0
                    else:
                        no_improve_step += 1

                # test, run test after each validation for development purpose.
                test_accuracy, test_tagging_result = run_valid_test(
                    test_set, 'Test')
                if task['tagging'] == 1 and task['intent'] == 0:
                    best_test_f1 = model.best_test_f1.eval()
                    if test_tagging_result['f1'] > best_test_f1:
                        tf.assign(model.best_test_f1,
                                  test_tagging_result['f1']).eval()
                        # save the best output file
                        subprocess.call([
                            'mv', current_taging_test_out_file,
                            current_taging_test_out_file + '.best_f1_%.2f'
                            % best_test_f1
                        ])
                if task['tagging'] == 1 and task['intent'] == 1:
                    best_test_accuracy = model.best_test_accuracy.eval()
                    best_test_f1 = model.best_test_f1.eval()
                    if test_accuracy > best_test_accuracy and \
                            test_tagging_result['f1'] > best_test_f1:
                        tf.assign(model.best_test_accuracy,
                                  test_accuracy).eval()
                        tf.assign(model.best_test_f1,
                                  test_tagging_result['f1']).eval()
                        subprocess.call([
                            'mv', current_taging_test_out_file,
                            current_taging_test_out_file + '.best_f1_%.2f'
                            % best_test_f1
                        ])

                # Early stopping on too many evaluations without improvement.
                if no_improve_step > FLAGS.no_improve_per_step:
                    tf.logging.info("continuous no improve per step " +
                                    str(FLAGS.no_improve_per_step) +
                                    ", auto stop...")
                    tf.logging.info("max accuracy is: " +
                                    str(model.best_dev_accuracy.eval()) +
                                    ", max f1 score is: " +
                                    str(model.best_dev_f1.eval()))
                    break
def _get_ratings(file):
    """Return the JSON records of ``file`` that have a truthy ``rate`` value.

    The file is rewound before parsing, so it can be read again after an
    earlier pass. The result is the filtered ``seq`` pipeline itself, not a
    materialised list.
    """
    def has_rate(record):
        # Missing, None, 0 and empty values are all dropped.
        return record.get('rate')

    file.seek(0)
    records = seq.json(file)
    return records.filter(has_rate)