Example #1
def get_joint_datasets(args):
    vocab = data_utils.get_vocab()
    train_gen_list = []
    valid_gen_list = []
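    # Each entry below is a (source_name, data generator) pair; the train/valid lists are only populated in train mode.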
    if args.mode == 'train':
        if not args.remove_open and not args.only_crowd:
            train_gen_list.append(
                # ("open", get_data_gen('train/open*.json', 'train', args, vocab, "open")))
                ("open",
                 get_data_gen('/distant_supervision/headword_train.json',
                              'train', args, vocab, "open")))
            valid_gen_list.append(
                ("open",
                 get_data_gen('/distant_supervision/headword_dev.json', 'dev',
                              args, vocab, "open")))
        if not args.remove_el and not args.only_crowd:
            valid_gen_list.append(
                ("wiki",
                 get_data_gen('/distant_supervision/el_dev.json', 'dev', args,
                              vocab, "wiki" if args.multitask else "open")))
            train_gen_list.append(
                ("wiki",
                 get_data_gen('/distant_supervision/el_train.json', 'train',
                              args, vocab,
                              "wiki" if args.multitask else "open")))
            #get_data_gen('train/el_train.json', 'train', args, vocab, "wiki" if args.multitask else "open")))
        if args.add_crowd or args.only_crowd:
            train_gen_list.append(("open",
                                   get_data_gen('/crowd/train_m.json', 'train',
                                                args, vocab, "open")))
    crowd_dev_gen = get_data_gen('/crowd/dev.json', 'dev', args, vocab, "open")
    return train_gen_list, valid_gen_list, crowd_dev_gen
Example #2
def train(args):
    vocab = data_utils.get_vocab(vocab_file=args.vocab_file, min_freq=args.min_vocab_freq)
    # vocab = {}
    # with open(args.vocab_file, mode='r') as infile:
    #     for line in infile:
    #         w, w_id = line.split('\t')
    #         vocab[w] = int(w_id)

    print('Vocab loaded...')
    print('VOCAB SIZE = ', len(vocab))

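    # Dispatch on the requested architecture; each model wrapper runs its own training loop.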
    if args.model_type == 'transformer':
        transformer = Transformer(args=args, vocab=vocab)
        transformer.train_generator()
    elif args.model_type == 'rnn':
        rnn_params = {'rec_cell': 'lstm',
                      'encoder_dim': 800,
                      'decoder_dim': 800,
                      'num_encoder_layers': 2,
                      'num_decoder_layers': 2}
        rnn = RNNSeq2Seq(args=args, rnn_params=rnn_params, vocab=vocab)
        # rnn.train()
        rnn.train_keras()
    elif args.model_type == 'han_rnn':
        han_rnn = HanRnnSeq2Seq(args=args, vocab=vocab)
        han_rnn.train()
    elif args.model_type == 'cnn':
        cnn = ConvSeq2Seq(args=args, vocab=vocab)
        cnn.train_keras()

    return
Example #3
def decode():
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = FLAGS.beam_num  # during beam sampling, all beams are fed in together to produce the probabilities for the next word

        # Load vocabularies.
        q_vocab, r_vocab = data_utils.get_vocab(FLAGS.data_dir)
        r_vocab_reversed = data_utils.get_reverse_vocab_dict(r_vocab)

        # Decode from standard input.
        sys.stdout.write("> ")
        sys.stdout.flush()
        question = sys.stdin.readline()
        while question:
            if question.strip() == "quit":  # readline() keeps the trailing newline
                quit()
            # Get token-ids for the input sentence.
            src_i = data_utils.tokenize_sentence(q_vocab, question)
            response = model.generate_response(sess, src_i, _buckets, FLAGS.beam_num,
                                               FLAGS.samples_per_beam, FLAGS.segment_length)
            # Print out the response corresponding to the outputs.
            print(" ".join([tf.compat.as_str(r_vocab_reversed[response_word]) for response_word in response]))
            print("> ", end="")
            sys.stdout.flush()
            question = sys.stdin.readline()
Example #4
def get_datasets(data_lists, args):
    data_gen_list = []
    vocab_set = data_utils.get_vocab()
    for dataname, mode, goal in data_lists:
        data_gen_list.append(
            get_data_gen(dataname, mode, args, vocab_set, goal))
    return data_gen_list
Example #5
def get_datasets(data_lists, args, eval_epoch=1):
    data_gen_list = []
    vocab_set = data_utils.get_vocab(args.embed_source)
    for dataname, mode, goal in data_lists:
        data_gen_list.append(
            get_data_gen(dataname, mode, args, vocab_set, goal, eval_epoch))
    return data_gen_list
Example #6
def train(args):
    vocab = data_utils.get_vocab(vocab_file=args.vocab_file,
                                 min_freq=args.min_vocab_freq)

    print('Vocab loaded...')
    print('VOCAB SIZE = ', len(vocab))

    if args.model_type == 'rnn':
        print('Training RNN...')
        rnn = RNNSeq2Seq(args=args, vocab=vocab)
        rnn.train()
    elif args.model_type == 'han_rnn':
        print('Training HAN-RNN...')
        han_rnn = HanRnnSeq2Seq(args=args, vocab=vocab)
        han_rnn.train()
Example #7
def get_datasets(data_lists, args):
  data_gen_list = []
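  # Choose the embedding source: the ELMo and BERT paths use a dummy character dict, while the default path loads the word vocab.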
  if args.elmo:
    vocab = (constant.CHAR_DICT, None) # dummy empty dict
    elmo = data_utils.init_elmo()
    bert = None
  elif args.bert:
    vocab = (constant.CHAR_DICT, None) # dummy empty dict 
    elmo = None
    bert = None
  else:
    vocab = data_utils.get_vocab()
    elmo = None
    bert = None
  for dataname, mode, goal in data_lists:
    data_gen_list.append(get_data_gen(dataname, mode, args, vocab, goal, elmo=elmo, bert=bert))
  return data_gen_list, elmo
Example #8
def get_joint_datasets(args):

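  # Pick the embedding source: ELMo, BERT, or (by default) GloVe word vectors.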
  if args.elmo:
    vocab = (constant.CHAR_DICT, None) # dummy empty dict
    elmo = data_utils.init_elmo()
    bert = None
  elif args.bert:
    vocab = (constant.CHAR_DICT, None) # dummy empty dict 
    elmo = None
    bert = None
  else: # glove
    vocab = data_utils.get_vocab()
    elmo = None
    bert = None
  train_gen_list = []
  valid_gen_list = []
  if args.mode in ['train', 'train_labeler']:
    if not args.remove_open and not args.only_crowd:
      train_gen_list.append(
        ("open", get_data_gen('train_full/open_train_tree*.json', 'train', args, vocab, "open", elmo=elmo, bert=bert)))
        #("open", get_data_gen('distant_supervision/headword_train_tree.json', 'train', args, vocab, "open", elmo=elmo, bert=bert)))
      valid_gen_list.append(("open", get_data_gen('distant_supervision/headword_dev_tree.json', 'dev', args, vocab, "open", elmo=elmo, bert=bert)))
    if not args.remove_el and not args.only_crowd:
      valid_gen_list.append(
        ("wiki",
         get_data_gen('distant_supervision/el_dev_tree.json', 'dev', args, vocab, "wiki" if args.multitask else "open", elmo=elmo, bert=bert)))
      train_gen_list.append(
        ("wiki",
         #get_data_gen('distant_supervision/el_train_tree.json', 'train', args, vocab, "wiki" if args.multitask else "open", elmo=elmo, bert=bert)))
         get_data_gen('train_full/el_train_full_tree.json', 'train', args, vocab, "wiki" if args.multitask else "open", elmo=elmo, bert=bert)))
    if args.add_crowd or args.only_crowd:
      train_gen_list.append(
        ("open", get_data_gen('crowd/train_m_tree.json', 'train', args, vocab, "open", elmo=elmo, bert=bert)))
    if args.add_expanded_head:
      train_gen_list.append(
        ("open", get_data_gen('train_full/open_train_1m_cls_relabeled.json', 'train', args, vocab, "open", elmo=elmo, bert=bert)))
    if args.add_expanded_el:
      train_gen_list.append(
        ("wiki", get_data_gen('train_full/el_train_1m_cls_relabeled.json', 'train', args, vocab,  "wiki" if args.multitask else "open", elmo=elmo, bert=bert)))
  #crowd_dev_gen = get_data_gen('crowd/dev.json', 'dev', args, vocab, "open")
  crowd_dev_gen = None # get_data_gen('crowd/dev_tree.json', 'dev', args, vocab, "open", elmo=elmo, bert=bert)
  return train_gen_list, valid_gen_list, crowd_dev_gen, elmo, bert, vocab
Example #9
  FLAGS.use_subsampled_dataset = True

  # data directory (Ubuntu machine)
  FLAGS.raw_data_dir = "/home/usuario/datasets"

  if args.dataset=='eus':
    FLAGS.max_audio_length = 680 # obtained from sequence lengths histogram
    FLAGS.max_freq_length = 201
  elif args.dataset=='quz':
    FLAGS.max_audio_length = 100 # TBD
    FLAGS.max_freq_length = 100

  # set sentence, doc length to maximum
  output = open("tunning_"+FLAGS.data_mode+"/"+FLAGS.data_mode + "_hp_grid_tuning_%s.txt" % args.file_suffix,'w')

  vocab_dict, inverted_vocab = get_vocab()
  train_data = DataProcessor(vocab_dict, inverted_vocab, data_type="train")
  val_batch = batch_load_data(DataProcessor(vocab_dict, inverted_vocab, data_type="val"))
  
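  # Track every hyper-parameter setup and the best WER/CER found so far (initialized high; lower is better).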
  setup_by_id = {}
  results_by_id_wer = {}
  results_by_id_cer = {}
  setup_id = 0
  best_global_wer = 200
  best_global_cer = 200
  best_setup_id_wer = -1
  best_setup_id_cer = -1

  ## FLAGS.___ = ___ # set as constant so it doesn't clutter output
  #FLAGS.use_conv2d = True
  #FLAGS.use_dropout = False 
Example #10
def main():
    with open('config.yaml', 'r') as infile:
        config = yaml.safe_load(infile)

    dataset = get_nli_dataset(path=config['data']['nli_data_dir'])
    all_sentences = []
    for split in ['train', 'test', 'dev']:
        all_sentences += dataset[split]['s1'] + dataset[split]['s2']
    max_seq_len = max(len(s) for s in all_sentences)
    print('Max sentence length: {0}'.format(max_seq_len))
    vocab = get_vocab(all_sentences)
    word_embedding_matrix = get_embeddings_matrix(
        embeddings_path=config['data']['embeddings_path'], vocab=vocab)
    print('Embedding Matrix Shape: {0}'.format(word_embedding_matrix.shape))

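    # If a sample count in the config is not an integer, fall back to using the whole split.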
    try:
        num_train_samples = int(config['data']['num_train_samples'])
    except ValueError:
        num_train_samples = len(dataset['train']['s1']) + 1
    try:
        num_dev_samples = int(config['data']['num_dev_samples'])
    except ValueError:
        num_dev_samples = len(dataset['dev']['s1']) + 1
    try:
        num_test_samples = int(config['data']['num_test_samples'])
    except ValueError:
        num_test_samples = len(dataset['test']['s1']) + 1

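    # Convert each split's sentence pairs into index sequences (up to max_seq_len) and truncate to the configured sample counts.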
    s1_train_x = get_sequences(sentences=dataset['train']['s1'],
                               max_seq_len=max_seq_len,
                               vocab=vocab)[:num_train_samples]
    s2_train_x = get_sequences(sentences=dataset['train']['s2'],
                               max_seq_len=max_seq_len,
                               vocab=vocab)[:num_train_samples]
    s1_dev_x = get_sequences(sentences=dataset['dev']['s1'],
                             max_seq_len=max_seq_len,
                             vocab=vocab)[:num_dev_samples]
    s2_dev_x = get_sequences(sentences=dataset['dev']['s2'],
                             max_seq_len=max_seq_len,
                             vocab=vocab)[:num_dev_samples]
    s1_test_x = get_sequences(sentences=dataset['test']['s1'],
                              max_seq_len=max_seq_len,
                              vocab=vocab)[:num_test_samples]
    s2_test_x = get_sequences(sentences=dataset['test']['s2'],
                              max_seq_len=max_seq_len,
                              vocab=vocab)[:num_test_samples]

    print('Train Shape: S1: {0}, S2: {1}'.format(s1_train_x.shape,
                                                 s2_train_x.shape))
    print('Dev Shape: S1: {0}, S2: {1}'.format(s1_dev_x.shape, s2_dev_x.shape))
    print('Test Shape: S1: {0}, S2: {1}'.format(s1_test_x.shape,
                                                s2_test_x.shape))

    train_y = to_categorical(dataset['train']['target'],
                             num_classes=3)[:num_train_samples]
    dev_y = to_categorical(dataset['dev']['target'],
                           num_classes=3)[:num_dev_samples]
    test_y = to_categorical(dataset['test']['target'],
                            num_classes=3)[:num_test_samples]

    print('Y Train Shape: {0}'.format(train_y.shape))
    print('Y Dev Shape: {0}'.format(dev_y.shape))
    print('Y Test Shape: {0}'.format(test_y.shape))

    if not os.path.exists(config['model']['path']):
        os.makedirs(config['model']['path'])

    nli_classifier = NLIClassifier(config=config['model'],
                                   vocab_size=len(vocab),
                                   embedding_matrix=word_embedding_matrix,
                                   max_seq_len=max_seq_len)
    fit_start_time = time()
    history = nli_classifier.fit(s1_train_x=s1_train_x,
                                 s2_train_x=s2_train_x,
                                 s1_dev_x=s1_dev_x,
                                 s2_dev_x=s2_dev_x,
                                 train_y=train_y,
                                 dev_y=dev_y)
    fit_end_time = time()
    fit_time = fit_end_time - fit_start_time
    print('Fit time: {0}'.format(round(fit_time, 3)))

    make_plots(
        history=history,
        path='{0}/NLITraining_{1}.png'.format(
            config['model']['path'], config['model']['encoder']['type']),
        title='NLI Training - {0}'.format(config['model']['encoder']['type']),
        epochs=config['model']['training']['epochs'])

    pred_y = nli_classifier.predict(s1_x=s1_test_x, s2_x=s2_test_x)

    test_y = np.argmax(test_y, axis=1)
    pred_y = np.argmax(pred_y, axis=1)
    print()
    print('Accuracy - {0}'.format(round(accuracy_score(test_y, pred_y), 3)))
    print('Classification Report - ')
    print(classification_report(y_true=test_y, y_pred=pred_y))

    encode_start_time = time()
    encoded_s1 = nli_classifier.encode(s1_x=s1_dev_x)
    encode_end_time = time()
    print('Encoded S1 shape: {0}'.format(encoded_s1.shape))
    encoding_time = encode_end_time - encode_start_time
    print('Encoding time for Dev Set: {0}'.format(round(encoding_time, 3)))
    print('Encoding time per sample: {0}'.format(
        round(encoding_time / len(s1_dev_x), 3)))
Example #11
train_tgt = [
    line.strip().split()
    for line in codecs.open(data_path_train_tgt, 'r', encoding='utf-8')
]

dev_src = [
    line.strip().split()
    for line in codecs.open(data_path_dev_src, 'r', encoding='utf-8')
]
dev_tgt = [
    line.strip().split()
    for line in codecs.open(data_path_dev_tgt, 'r', encoding='utf-8')
]

# Get training and dev vocabularies
src_vocab, src_word2ind, src_ind2word = get_vocab(train_src)
tgt_vocab, tgt_word2ind, tgt_ind2word = get_vocab(train_tgt)

logging.info('Running experiment with seed %s ...' % (args.seed))

logging.info('Finished reading data ...')
logging.info('Number of training sentence-pairs : %d ' % (len(train_src)))
logging.info('Number of validation sentence-pairs : %d ' % (len(dev_src)))

# Create symbolic variables
src_inp = T.imatrix()
tgt_inp = T.imatrix()
tgt_op = T.imatrix()
src_lens = T.ivector()
tgt_mask = T.fmatrix()
src_mask = T.fmatrix()
Example #12
    weight_path = utils.mkpath('weight/{}/{}'.format(MODEL_NAME, p))
    last_weight, last_epoch = utils.get_last_epoch(weight_path)
    # move on to next prompt if epoch not greater than last one saved
    if args.epoch <= last_epoch:
        continue

    train_df = data_utils.load_data(p, 'train')
    val_df = data_utils.load_data(p, 'val')
    # test_df = data_utils.load_data(p, 'test')

    print(train_df.shape)
    print(val_df.shape)
    # print(test_df.shape)

    vocab = data_utils.get_vocab(p, train_df)
    glove_path = 'glove/glove.6B.50d.txt'
    emb_matrix = data_utils.load_glove_embedding(glove_path, vocab)

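    # Reset the Keras backend so each prompt starts from a fresh graph before the GloVe model is built.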
    K.clear_session()
    model = models.build_glove_model(p,
                                     len(vocab),
                                     emb_matrix,
                                     glove_trainable=args.ft,
                                     drop_rate=args.drop)

    if last_weight:
        print('Loading weight :', last_weight)
        model.load_weights(last_weight)

    train_gen = data_utils.gen(MODEL_NAME,
Example #13
    test_df = data_utils.load_data(p, 'test')

    print(test_df.shape)

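    # Reset the Keras backend and rebuild the model that matches MODEL_NAME (ELMo or GloVe) before loading its weights.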
    K.clear_session()
    if MODEL_NAME.startswith('elmo'):
        vocab = None
        model = models.build_elmo_model_full(p,
                                             elmo_trainable=args.ft,
                                             use_mask=args.mask,
                                             lstm_units=args.re,
                                             drop_rate=args.drop,
                                             summary=False)
    elif MODEL_NAME.startswith('glove'):
        vocab = data_utils.get_vocab(p)
        glove_path = 'glove/glove.6B.50d.txt'
        emb_matrix = data_utils.load_glove_embedding(glove_path, vocab)
        model = models.build_glove_model(p,
                                         len(vocab),
                                         emb_matrix,
                                         glove_trainable=args.ft,
                                         drop_rate=args.drop,
                                         summary=False)

    print('Loading weight :', weight)
    model.load_weights(weight)

    test_gen = data_utils.gen(MODEL_NAME,
                              p,
                              test_df,
Example #14
def train():
    print('Applying Parameters:')
    for k, v in FLAGS.__dict__['__flags'].items():
        print('%s: %s' % (k, str(v)))
    print('Preparing data in %s' % FLAGS.data_dir)
    in_seq_train, out_seq_train, in_seq_dev, in_seq_test, vocab_path = \
        data_utils.get_vocab(FLAGS.data_dir, FLAGS.in_vocab_size, FLAGS.out_vocab_size)

    result_dir = FLAGS.train_dir + 'train/test_result'
    if not os.path.isdir(result_dir):
        os.makedirs(result_dir)

    vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path)

    # fix random seeds so results are repeatable
    random.seed(42)
    np.random.seed(42)
    tf.set_random_seed(42)

    sess = tf.Session()
    sess.as_default()

    # Create model.
    print("Max sequence length: %d." % _buckets[0][0])
    print("Creating %d layers of %d units." % (FLAGS.num_layer, FLAGS.size))

    model, model_test = create_model(sess, rev_vocab, len(vocab))
    print("Creating model with source_vocab_size=%d" % len(rev_vocab))

    # Read data into buckets and compute their sizes.
    print("Reading train data (train set limit: %d)." %
          FLAGS.max_train_data_size)
    train_set = read_data(in_seq_train, out_seq_train)

    # Train Loop
    step_time, loss = 0.0, 0.0
    current_step = 0

    best_valid_score, best_test_score = 0, 0
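    # TensorBoard summary writers: one for training batches, one for validation.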
    train_writer = tf.summary.FileWriter(FLAGS.summary_dir + '/train',
                                         sess.graph)
    valid_writer = tf.summary.FileWriter(FLAGS.summary_dir + '/valid')

    while model.global_step.eval(sess) < FLAGS.max_traing_step:
        # Get a batch and make a step.
        start_time = time.time()
        encoder_inputs, decoder_inputs, batch_sequence_length = model.get_batch(
            train_set)
        _, step_loss, summary, decoder_logits = model.get_input_feed(
            sess, encoder_inputs, _buckets, decoder_inputs)

        train_writer.add_summary(summary, current_step)
        step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint
        loss += step_loss / FLAGS.steps_per_checkpoint
        current_step += 1

        # Once in a while, we save checkpoint, print statistics, and run evals.
        if current_step % FLAGS.steps_per_checkpoint == 0:
            checkpoint_path = os.path.join(FLAGS.train_dir, "model.ckpt")
            model.saver.save(sess, checkpoint_path)
            step_time, loss = 0.0, 0.0

    return sess, model_test, vocab
import pickle
"""
The problem now is that the special symbols defined in the dataset are different from the ones used in the original GNMT, so the vocabularies need to be updated.
"""


def update_dict(dict_ori, src_of_vocab):
    """
    Args:
        dict_ori: a dict that needs to be updated.
        src_of_vocab: str, "q" or "r".

    Builds an updated dictionary that adds the special tags and pickles it
    to <src_of_vocab>_train_vocab.pkl.
    """
    dict_new = {}
    dict_new[data_utils._PAD] = data_utils.PAD_ID  # 0
    dict_new[data_utils._GO] = data_utils.GO_ID  # 1
    dict_new[data_utils._EOS] = data_utils.EOS_ID  # 2
    dict_new[data_utils._UNK] = data_utils.UNK_ID  # 3
    for (k, v) in dict_ori.items():
        dict_new[k] = v + 3  # originally, values start from 1
    with open(src_of_vocab + "_train_vocab.pkl", 'wb') as fw:
        pickle.dump(dict_new, fw, pickle.HIGHEST_PROTOCOL)
    print("the %s dict contains %d words" %
          (src_of_vocab, len(dict_new.keys())))


q_dict, r_dict = data_utils.get_vocab("dialog_data_new")
update_dict(q_dict, "q")
update_dict(r_dict, "r")