    def test_to_json(self):
        tmp_path = 'functional/test/data/tmp/output.txt'
        elements = [[u'a', 1], [u'b', 2], [u'c', 3]]
        sequence = seq(elements)
        sequence.to_json(tmp_path)
        result = seq.json(tmp_path).to_list()
        self.assertEqual(elements, result)

        dict_expect = {u'a': 1, u'b': 2, u'c': 3}
        sequence.to_json(tmp_path, root_array=False)
        result = seq.json(tmp_path).to_dict()
        self.assertEqual(dict_expect, result)
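
For context, a minimal, self-contained round trip with PyFunctional (the temp path here is made up; root_array=False writes a JSON object instead of a JSON array):

import os, tempfile
from functional import seq

path = os.path.join(tempfile.mkdtemp(), 'output.json')  # hypothetical path
seq([['a', 1], ['b', 2]]).to_json(path)                 # writes [["a", 1], ["b", 2]]
assert seq.json(path).to_list() == [['a', 1], ['b', 2]]
seq([['a', 1], ['b', 2]]).to_json(path, root_array=False)  # writes {"a": 1, "b": 2}
assert seq.json(path).to_dict() == {'a': 1, 'b': 2}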
Example #3
def generate_blogfile():
    # read the data in as json and then transform to feeds
    # (assumes: from functional import seq, from operator import add, and the
    #  project helpers feed, filter_fun, and output are in scope)
    feedData = seq.json("datafiles/blogdata.json").map(lambda fe: feed(fe)).to_list()  # type: list[feed]
    '''
    get the top 500 words by document frequency over all feeds:
        for feed <- feeds, (word, count) <- feed.wordCount.items(): yield (word, count)
        keep all pairs with count > 10
        map each pair to (word, 1), then reduce by key with add:
            (word, 1), (word, 1), ... -> (word, numFeedsContainingWord)
        keep all pairs that pass the fake tf-idf filter (filter_fun)
        order by descending count and take the top 500
        transform (word, numFeeds) -> word and collect to a list
    '''
    top500 = seq(feedData).flat_map(lambda f: list(f.wordCount.items())) \
        .filter(lambda wc: wc[1] > 10) \
        .map(lambda wc: (wc[0], 1)) \
        .reduce_by_key(add) \
        .filter(filter_fun) \
        .order_by(lambda wc: -wc[1]) \
        .take(500) \
        .map(lambda wc: wc[0]) \
        .to_list()
    # sort alphabetically
    top500 = sorted(top500)
    print(len(top500))
    # write resultant to file
    with open("datafiles/blogtop500.txt", "w+") as out:
        out.write("Blog\t%s\n" % '\t'.join(top500))
        for tf in sorted(feedData, key=lambda f: f.title):
            out.write(output(tf, top500))
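
The grouping step above hinges on reduce_by_key; a minimal, self-contained sketch of that behavior in isolation (made-up pairs, PyFunctional plus operator.add):

from operator import add
from functional import seq

# (word, 1) pairs reduced by key yield a per-word document frequency
pairs = [('a', 1), ('b', 1), ('a', 1)]
assert seq(pairs).reduce_by_key(add).sorted().to_list() == [('a', 2), ('b', 1)]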
Example #4
def _get_sales_start(file, from_date, headers):
    # Python 2 code: note unicode() and dict.iteritems() in the projection below
    file.seek(0)
    return seq.json(file)\
        .filter(lambda b: b.get('sale_start') and _strpdate(b['sale_start']) > from_date)\
        .sorted(key=lambda b: _strpdate(b['sale_start']), reverse=True)\
        .take(50)\
        .cache()\
        .reverse()\
        .map(lambda b: {
            k: unicode(v) for k, v in b.iteritems()
            if k in [h[0] for h in headers]
        })\
        .to_list()
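
A hedged Python 3 sketch of the projection step above (assuming, as h[0] implies, that headers is a list of (column_name, ...) tuples):

def _project(record, headers):
    # keep only the columns named in headers, coercing values to str
    wanted = {h[0] for h in headers}
    return {k: str(v) for k, v in record.items() if k in wanted}

print(_project({'sale_start': '2020-01-01', 'x': 1}, [('sale_start', 'Sale start')]))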
Example #5
def generate_blogfile_stem():
    # same as non-stem except it uses stemmed data; it also sums the raw stem
    # counts (no count > 10 filter) and applies no take(500), so despite its
    # name top500 holds every stem that passes filter_fun
    feedData = seq.json("datafiles/fmeasure.json").map(lambda fe: feed(fe, True)).to_list()  # type: list[feed]
    top500 = seq(feedData).flat_map(lambda f: list(f.stemCount.items())) \
        .map(lambda wc: (wc[0], wc[1])) \
        .reduce_by_key(add) \
        .filter(filter_fun) \
        .order_by(lambda wc: -wc[1]) \
        .map(lambda wc: wc[0]) \
        .to_list()
    top500 = sorted(top500)
    print(len(top500))
    with open("datafiles/fmeasure_stemmed.txt", "w+") as out:
        out.write("Blog\t%s\n" % '\t'.join(top500))
        for tf in sorted(feedData, key=lambda f: f.title):
            out.write(output_stem(tf, top500))
Example #6
    def test_json(self):
        list_test_path = 'functional/test/data/test_list.json'
        dict_test_path = 'functional/test/data/test_dict.json'
        list_expect = [1, 2, 3, 4, 5]
        dict_expect = list(six.viewitems({u'a': 1, u'b': 2, u'c': 3}))

        result = seq.json(list_test_path).to_list()
        self.assertEqual(list_expect, result)
        result = seq.json(dict_test_path).to_list()
        self.assertEqual(dict_expect, result)

        with open(list_test_path) as file_handle:
            result = seq.json(file_handle).to_list()
            self.assertEqual(list_expect, result)
        with open(dict_test_path) as file_handle:
            result = seq.json(file_handle).to_list()
            self.assertEqual(dict_expect, result)

        with self.assertRaises(ValueError):
            seq.json(1)
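
As the final assertion shows, seq.json accepts either a path string or a file-like object; anything else raises ValueError. A minimal sketch:

from functional import seq

try:
    seq.json(1)  # neither a path nor a file-like object
except ValueError:
    print('seq.json rejects non-path, non-file input')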
Example #8
    def test_seq_json_list(self):
        f = _make_str_file(u'''[1, 2, 3]''')
        res = seq.json(f)
        assert res == [1, 2, 3]
Example #9
    def test_seq_json_dict(self):
        f = _make_str_file(u'''{"a": 1, "b": 2, "c": 3}''')
        res = seq.json(f).sorted()
        assert res == [('a', 1), ('b', 2), ('c', 3)]
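
Since a JSON object parses to (key, value) tuples, the same data can also be collected straight back into a dict; a minimal sketch using an in-memory file:

import io
from functional import seq

f = io.StringIO(u'{"a": 1, "b": 2, "c": 3}')
assert seq.json(f).to_dict() == {'a': 1, 'b': 2, 'c': 3}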
Example #10
def train():
    tf.logging.info('Applying Parameters:')
    tf.logging.info("Preparing data in %s" % FLAGS.data_dir)
    nowTime = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
    tf.logging.set_verbosity(tf.logging.INFO)
    handlers = [
        logging.FileHandler(os.path.join(FLAGS.log, nowTime + '.log')),
        logging.StreamHandler(sys.stdout)
    ]
    logging.getLogger('tensorflow').handlers = handlers

    data_set = data_utils.prepare_multi_task_data(FLAGS.data_dir)
    in_seq_train, out_seq_train, label_train = data_set[0]
    in_seq_dev, out_seq_dev, label_dev = data_set[1]
    in_seq_test, out_seq_test, label_test = data_set[2]
    vocab_path, tag_vocab_path, label_vocab_path = data_set[3]

    result_dir = FLAGS.train_dir + '/test_results'
    if not tf.gfile.IsDirectory(result_dir):
        tf.gfile.MakeDirs(result_dir)

    current_taging_valid_out_file = result_dir + '/tagging.valid.hyp.txt'
    current_taging_test_out_file = result_dir + '/tagging.test.hyp.txt'

    if not tf.gfile.Exists('data_bak/vocab.json') or not tf.gfile.Exists(
            'data_bak/rev_vocab.json'):
        vocab, rev_vocab = data_utils.initialize_vocab(vocab_path)
        with tf.gfile.GFile('data_bak/vocab.json',
                            'w') as vocab_file, tf.gfile.GFile(
                                'data_bak/rev_vocab.json',
                                'w') as rev_vocab_file:
            vocab_file.write(json.dumps(vocab, ensure_ascii=False, indent=4))
            rev_vocab_file.write(
                json.dumps(rev_vocab, ensure_ascii=False, indent=4))
    else:
        with tf.gfile.GFile('data_bak/vocab.json',
                            'r') as vocab_file, tf.gfile.GFile(
                                'data_bak/rev_vocab.json',
                                'r') as rev_vocab_file:
            vocab = json.load(vocab_file)
            rev_vocab = seq.json(rev_vocab_file).map(
                lambda x: (int(x[0]), x[1])).to_dict()

    if not tf.gfile.Exists('data_bak/tag_vocab.json') or not tf.gfile.Exists(
            'data_bak/rev_tag_vocab.json'):
        tag_vocab, rev_tag_vocab = data_utils.initialize_vocab(tag_vocab_path)
        with tf.gfile.GFile('data_bak/tag_vocab.json', 'w') as tag_vocab_file, \
                tf.gfile.GFile('data_bak/rev_tag_vocab.json', 'w') as rev_tag_vocab_file:
            tag_vocab_file.write(
                json.dumps(tag_vocab, ensure_ascii=False, indent=4))
            rev_tag_vocab_file.write(
                json.dumps(rev_tag_vocab, ensure_ascii=False, indent=4))
    else:
        with tf.gfile.GFile('data_bak/tag_vocab.json',
                            'r') as tag_vocab_file, tf.gfile.GFile(
                                'data_bak/rev_tag_vocab.json',
                                'r') as rev_tag_vocab_file:
            tag_vocab = json.load(tag_vocab_file)
            rev_tag_vocab = seq.json(rev_tag_vocab_file).map(
                lambda x: (int(x[0]), x[1])).to_dict()

    if not tf.gfile.Exists('data_bak/label_vocab.json') or not tf.gfile.Exists(
            'data_bak/rev_label_vocab.json'):
        label_vocab, rev_label_vocab = data_utils.initialize_vocab(
            label_vocab_path)
        with tf.gfile.GFile('data_bak/label_vocab.json', 'w') as label_vocab_file, \
                tf.gfile.GFile('data_bak/rev_label_vocab.json', 'w') as rev_label_vocab_file:
            label_vocab_file.write(
                json.dumps(label_vocab, ensure_ascii=False, indent=4))
            rev_label_vocab_file.write(
                json.dumps(rev_label_vocab, ensure_ascii=False, indent=4))
    else:
        with tf.gfile.GFile('data_bak/label_vocab.json',
                            'r') as label_vocab_file, tf.gfile.GFile(
                                'data_bak/rev_label_vocab.json',
                                'r') as rev_label_vocab_file:
            label_vocab = json.load(label_vocab_file)
            rev_label_vocab = seq.json(rev_label_vocab_file).map(
                lambda x: (int(x[0]), x[1])).to_dict()
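    # note: json.dumps turns integer dict keys into strings, so the three
    # reversed vocabularies above are re-parsed via seq.json with int(key)
    # to restore integer ids; a plain json.load would leave string keys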

    # Read data into buckets and compute their sizes.
    tf.logging.info("Reading train/valid/test data (training set limit: %d)." %
                    FLAGS.max_train_data_size)
    dev_set = read_data(in_seq_dev, out_seq_dev, label_dev)
    test_set = read_data(in_seq_test, out_seq_test, label_test)
    train_set = read_data(in_seq_train, out_seq_train, label_train)
    train_bucket_sizes = [len(train_set[b]) for b in xrange(len(_buckets))]
    train_total_size = float(sum(train_bucket_sizes))

    config = tf.ConfigProto(
        gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=0.23),
        # device_count = {'gpu': 2}
    )

    with tf.Session(config=config) as sess:
        # Create model.
        tf.logging.info("Max sequence length: %d." % _buckets[0][0])
        tf.logging.info("Creating %d layers of %d units." %
                        (FLAGS.num_layers, FLAGS.size))

        model, model_test = create_model(sess, len(vocab), len(tag_vocab),
                                         len(label_vocab))
        tf.logging.info("Creating model with " +
              "source_vocab_size=%d, target_vocab_size=%d, label_vocab_size=%d." \
              % (len(vocab), len(tag_vocab), len(label_vocab)))

        tf.summary.scalar('loss', model.loss)
        tf.summary.scalar('dev_accuracy', model.best_dev_accuracy)
        tf.summary.scalar('dev_f1', model.best_dev_f1)
        tf.summary.scalar('test_accuracy', model.best_test_accuracy)
        tf.summary.scalar('test_f1', model.best_test_f1)

        model.merged = tf.summary.merge_all()
        model.writer = tf.summary.FileWriter(
            os.path.join(FLAGS.tensorboard, nowTime))
        model.writer.add_graph(graph=sess.graph)

        train_buckets_scale = [
            sum(train_bucket_sizes[:i + 1]) / train_total_size
            for i in xrange(len(train_bucket_sizes))
        ]
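        # train_buckets_scale is the running CDF of bucket sizes; comparing a
        # uniform random draw against it below samples buckets in proportion
        # to how much training data each bucket holds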

        # This is the training loop.
        step_time, loss = 0.0, 0.0
        current_step = 0

        no_improve_step = 0
        while model.global_step.eval() < FLAGS.max_training_steps:
            random_number_01 = np.random.random_sample()
            bucket_id = min([
                i for i in xrange(len(train_buckets_scale))
                if train_buckets_scale[i] > random_number_01
            ])

            # Get a batch and make a step.
            start_time = time.time()
            batch_data = model.get_batch(train_set, bucket_id)
            encoder_inputs, tags, tag_weights, batch_sequence_length, labels = batch_data
            if task['joint'] == 1:
                step_outputs = model.joint_step(sess, encoder_inputs, tags,
                                                tag_weights, labels,
                                                batch_sequence_length,
                                                bucket_id, False)
                _, step_loss, tagging_logits, class_logits = step_outputs
            elif task['tagging'] == 1:
                step_outputs = model.tagging_step(sess, encoder_inputs, tags,
                                                  tag_weights,
                                                  batch_sequence_length,
                                                  bucket_id, False)
                _, step_loss, tagging_logits = step_outputs
            elif task['intent'] == 1:
                step_outputs = model.classification_step(
                    sess, encoder_inputs, labels, batch_sequence_length,
                    bucket_id, False)
                _, step_loss, class_logits = step_outputs

            summary = sess.run(model.merged, model.input_feed)
            model.writer.add_summary(summary, model.global_step.eval())

            step_time += (time.time() -
                          start_time) / FLAGS.steps_per_checkpoint
            loss += step_loss / FLAGS.steps_per_checkpoint
            current_step += 1

            # Once in a while, we save checkpoint, print statistics, and run evals.
            if current_step % FLAGS.steps_per_checkpoint == 0:
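                # perplexity = exp(mean loss over the checkpoint window),
                # capped at inf to avoid overflow in math.exp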
                perplexity = math.exp(loss) if loss < 300 else float('inf')
                tf.logging.info(
                    "global step %d step-time %.2f. Training perplexity %.2f" %
                    (model.global_step.eval(), step_time, perplexity))
                sys.stdout.flush()
                # Save checkpoint and zero timer and loss.
                checkpoint_path = os.path.join(FLAGS.train_dir, "model.ckpt")
                step_time, loss = 0.0, 0.0

                def run_valid_test(data_set, mode):  # mode: Eval, Test
                    # Run evals on development/test set and print the accuracy.
                    word_list = list()
                    ref_tag_list = list()
                    hyp_tag_list = list()
                    ref_label_list = list()
                    hyp_label_list = list()
                    correct_count = 0
                    accuracy = 0.0
                    tagging_eval_result = dict()
                    for bucket_id in xrange(len(_buckets)):
                        eval_loss = 0.0
                        # count resets per bucket while correct_count does not,
                        # so the accuracy computed below assumes a single bucket
                        count = 0
                        for i in xrange(len(data_set[bucket_id])):
                            count += 1
                            sample = model_test.get_one(data_set, bucket_id, i)
                            encoder_inputs, tags, tag_weights, sequence_length, labels = sample
                            tagging_logits = []
                            class_logits = []
                            if task['joint'] == 1:
                                step_outputs = model_test.joint_step(
                                    sess, encoder_inputs, tags, tag_weights,
                                    labels, sequence_length, bucket_id, True)
                                _, step_loss, tagging_logits, class_logits = step_outputs
                            elif task['tagging'] == 1:
                                step_outputs = model_test.tagging_step(
                                    sess, encoder_inputs, tags, tag_weights,
                                    sequence_length, bucket_id, True)
                                _, step_loss, tagging_logits = step_outputs
                            elif task['intent'] == 1:
                                step_outputs = model_test.classification_step(
                                    sess, encoder_inputs, labels,
                                    sequence_length, bucket_id, True)
                                _, step_loss, class_logits = step_outputs
                            eval_loss += step_loss / len(data_set[bucket_id])
                            hyp_label = None
                            if task['intent'] == 1:
                                ref_label_list.append(
                                    rev_label_vocab[labels[0][0]])
                                hyp_label = np.argmax(class_logits[0], 0)
                                hyp_label_list.append(
                                    rev_label_vocab[hyp_label])
                                if labels[0] == hyp_label:
                                    correct_count += 1
                            if task['tagging'] == 1:
                                word_list.append([rev_vocab[x[0]] for x in \
                                                  encoder_inputs[:sequence_length[0]]])
                                ref_tag_list.append([rev_tag_vocab[x[0]] for x in \
                                                     tags[:sequence_length[0]]])
                                hyp_tag_list.append(
                                    [rev_tag_vocab[np.argmax(x)] for x in \
                                     tagging_logits[:sequence_length[0]]])

                    accuracy = float(correct_count) * 100 / count
                    if task['intent'] == 1:
                        tf.logging.info("\t%s accuracy: %.2f %d/%d" \
                              % (mode, accuracy, correct_count, count))
                        sys.stdout.flush()
                    if task['tagging'] == 1:
                        if mode == 'Eval':
                            taging_out_file = current_taging_valid_out_file
                        elif mode == 'Test':
                            taging_out_file = current_taging_test_out_file
                        tagging_eval_result = conlleval(
                            hyp_tag_list, ref_tag_list, word_list,
                            taging_out_file)
                        tf.logging.info("\t%s f1-score: %.2f" %
                                        (mode, tagging_eval_result['f1']))
                        sys.stdout.flush()
                    return accuracy, tagging_eval_result

                # valid
                valid_accuracy, valid_tagging_result = run_valid_test(
                    dev_set, 'Eval')
                if task['tagging'] == 1 and task['intent'] == 0:
                    best_dev_f1 = model.best_dev_f1.eval()
                    if valid_tagging_result['f1'] > best_dev_f1:
                        tf.assign(model.best_dev_f1,
                                  valid_tagging_result['f1']).eval()
                        # save the best output file
                        # note: shell=True with an argument list would invoke a
                        # bare 'mv'; pass the list directly so the args reach mv
                        subprocess.call(['mv',
                                         current_taging_valid_out_file,
                                         current_taging_valid_out_file +
                                         '.best_f1_%.2f' % best_dev_f1])
                        model.saver.save(sess,
                                         checkpoint_path,
                                         global_step=model.global_step)
                        no_improve_step = 0
                    else:
                        no_improve_step += 1

                if task['tagging'] == 1 and task['intent'] == 1:
                    best_dev_accuracy = model.best_dev_accuracy.eval()
                    best_dev_f1 = model.best_dev_f1.eval()
                    if valid_accuracy > best_dev_accuracy and valid_tagging_result[
                            'f1'] > best_dev_f1:
                        tf.assign(model.best_dev_accuracy,
                                  valid_accuracy).eval()
                        tf.assign(model.best_dev_f1,
                                  valid_tagging_result['f1']).eval()
                        subprocess.call(['mv',
                                         current_taging_valid_out_file,
                                         current_taging_valid_out_file +
                                         '.best_f1_%.2f' % best_dev_f1])
                        model.saver.save(sess,
                                         checkpoint_path,
                                         global_step=model.global_step)
                        no_improve_step = 0
                    else:
                        no_improve_step += 1

                # test, run test after each validation for development purpose.
                test_accuracy, test_tagging_result = run_valid_test(
                    test_set, 'Test')
                if task['tagging'] == 1 and task['intent'] == 0:
                    best_test_f1 = model.best_test_f1.eval()
                    if test_tagging_result['f1'] > best_test_f1:
                        tf.assign(model.best_test_f1,
                                  test_tagging_result['f1']).eval()
                        # save the best output file (indented inside the if, as
                        # in the dev-set branch, so it only runs on improvement)
                        subprocess.call(['mv',
                                         current_taging_test_out_file,
                                         current_taging_test_out_file +
                                         '.best_f1_%.2f' % best_test_f1])

                if task['tagging'] == 1 and task['intent'] == 1:
                    best_test_accuracy = model.best_test_accuracy.eval()
                    best_test_f1 = model.best_test_f1.eval()
                    if test_accuracy > best_test_accuracy and test_tagging_result[
                            'f1'] > best_test_f1:
                        tf.assign(model.best_test_accuracy,
                                  test_accuracy).eval()
                        tf.assign(model.best_test_f1,
                                  test_tagging_result['f1']).eval()
                        subprocess.call(['mv',
                                         current_taging_test_out_file,
                                         current_taging_test_out_file +
                                         '.best_f1_%.2f' % best_test_f1])

                if no_improve_step > FLAGS.no_improve_per_step:
                    tf.logging.info("no improvement for " +
                                    str(FLAGS.no_improve_per_step) +
                                    " consecutive checkpoints, stopping early...")
                    tf.logging.info("best dev accuracy: " +
                                    str(model.best_dev_accuracy.eval()) +
                                    ", best dev f1: " +
                                    str(model.best_dev_f1.eval()))
                    break
Example #11
def _get_ratings(file):
    file.seek(0)
    return seq.json(file).filter(lambda b: b.get('rate'))
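
A hedged usage sketch for Example #11, assuming the file holds a JSON array of record objects (the data here is made up):

import io
from functional import seq

ratings_file = io.StringIO(u'[{"rate": 5}, {"title": "unrated"}]')
assert _get_ratings(ratings_file).to_list() == [{'rate': 5}]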