def make_test_data():
    """Build segmentation test data: pair each tokenized sub-sentence with
    placeholder 'o' labels, one per token."""
    data4sentseg = []
    for i in range(test_df.shape[0]):
        # for i in range(1000, 2000):
        sent_tmp = test_df.loc[i, 'sub_sents_tokenized']
        single_sent = []
        for sent in sent_tmp:
            sent_id = sentence2id(sent, vocab2index)
            label = ['o'] * len(sent)  # dummy labels; test data has no gold tags
            single_sent.append((sent_id, label))
        data4sentseg.append(single_sent)
    return data4sentseg
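# The snippets here assume a sentence2id helper that is not shown. A minimal
# sketch, assuming vocab2index maps tokens to integer ids and reserves an
# '<UNK>' entry for out-of-vocabulary tokens (both are assumptions, not
# confirmed by the source):

def sentence2id(sent, vocab2index):
    """Map a list of tokens to vocabulary indices, falling back to <UNK>."""
    unk_id = vocab2index.get('<UNK>', 0)
    return [vocab2index.get(token, unk_id) for token in sent]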
target_batches = data_helper.get_target_batches()
init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    for epoch in range(max_epoch):
        all_preds = []
        epoch_loss = 0
        for input_batch, target_batch in zip(input_batches, target_batches):
            input_token_ids = []
            target_token_ids = []
            input_sentence_lengths = []
            # Convert each source sentence to padded token ids, keeping its
            # true length for the encoder's dynamic unrolling.
            for input_sentence in input_batch:
                input_sentence, sentence_length = data_helper.sentence2id(
                    input_sentence,
                    vocab=encoder_vocab,
                    max_sentence_length=encoder_sentence_length)
                input_token_ids.append(input_sentence)
                input_sentence_lengths.append(sentence_length)
            # Convert target sentences; is_target=True appends the
            # end-of-sequence marker.
            for target_sentence in target_batch:
                target_sentence = data_helper.sentence2id(
                    target_sentence,
                    vocab=decoder_vocab,
                    max_sentence_length=decoder_sentence_length,
                    is_target=True)
                target_token_ids.append(target_sentence)
            batch_preds, batch_loss, _ = sess.run(
                [predictions, loss, train_op],
                feed_dict={
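# The feed_dict above is truncated in the source, so the actual placeholder
# names are unknown. A self-contained toy sketch of the same feed pattern in
# TF1.x, with hypothetical placeholder names (enc_inputs, enc_lengths,
# dec_targets are assumptions, not the original graph's names):

import numpy as np
import tensorflow as tf

enc_inputs = tf.placeholder(tf.int32, [None, None], name='enc_inputs')
enc_lengths = tf.placeholder(tf.int32, [None], name='enc_lengths')
dec_targets = tf.placeholder(tf.int32, [None, None], name='dec_targets')
total = tf.reduce_sum(enc_lengths)  # stand-in for the real loss/predictions

with tf.Session() as sess:
    out = sess.run(total, feed_dict={
        enc_inputs: np.zeros((2, 5), dtype=np.int32),
        enc_lengths: np.array([5, 3], dtype=np.int32),
        dec_targets: np.zeros((2, 6), dtype=np.int32),
    })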
parser.add_argument(
    '--char_vec_path',
    type=str,
    default='../w2v/fasttext_char_vec/fasttext_cbow_char.model.vec',
    help='path to the fastText character-vector file (.vec)')
args = parser.parse_args()

## get char embeddings
vocab, vocab2index, embeddings = get_fasttext(args.char_vec_path)

## read corpus and get training data
# training model
if args.mode == 'train':
    # Hold out the last 10% of the corpus as a dev/test split.
    dev_percent = 0.1
    sent, tag = read_corpus(args.train_data)
    sent_ = [sentence2id(s, vocab2index) for s in sent]
    tag_ = [tag2label(l) for l in tag]
    data_ = list(zip(sent_, tag_))
    data_num = len(data_)
    dev_ind = -int(data_num * dev_percent)
    train_data = data_[:dev_ind]
    test_data = data_[dev_ind:]
    test_size = len(test_data)

    ## paths setting
    # Timestamped run directory so repeated runs do not overwrite each other.
    timestamp = time.asctime().replace(' ', '_').replace(':', '_')
    output_path = os.path.abspath(
        os.path.join(os.path.curdir, "runs", timestamp))
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    summary_path = os.path.join(output_path, "summaries")
    if not os.path.exists(summary_path):
        os.makedirs(summary_path)
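# get_fasttext is assumed to parse the word2vec-style text format that
# fastText .vec files use (header line "count dim", then one token and its
# vector per line). A minimal sketch under that assumption; the real helper
# may also insert special tokens such as <PAD>/<UNK>:

import numpy as np

def get_fasttext(vec_path):
    """Load fastText .vec embeddings; return (vocab, vocab2index, embeddings)."""
    vocab, vectors = [], []
    with open(vec_path, encoding='utf-8') as f:
        next(f)  # skip the "count dim" header line
        for line in f:
            parts = line.rstrip().split(' ')
            vocab.append(parts[0])
            vectors.append([float(x) for x in parts[1:]])
    vocab2index = {tok: i for i, tok in enumerate(vocab)}
    return vocab, vocab2index, np.array(vectors, dtype=np.float32)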
def convert_to_num(tokens_list, vocab_to_int):
    """Map every token sequence in tokens_list to vocabulary ids."""
    res = [helper.sentence2id(t, vocab_to_int) for t in tokens_list]
    assert len(res) == len(tokens_list)  # one id sequence per input sequence
    return res
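# Example usage, assuming helper.sentence2id behaves like the sketch shown
# earlier (a hypothetical toy vocabulary, not data from the source):
#
#     vocab_to_int = {'<UNK>': 0, 'hello': 1, 'world': 2}
#     convert_to_num([['hello', 'world'], ['hi']], vocab_to_int)
#     # -> [[1, 2], [0]]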