Code Example #1
def make_test_data():
    """Build (token_ids, placeholder labels) pairs for every row of test_df."""
    data4sentseg = []
    for i in range(test_df.shape[0]):
        # for i in range(1000, 2000):
        sent_tmp = test_df.loc[i, 'sub_sents_tokenized']
        single_sent = []
        for sent in sent_tmp:
            # Map the tokenized sentence to vocabulary indices; every token
            # gets the placeholder label 'o'.
            sent_id = sentence2id(sent, vocab2index)
            label = ['o'] * len(sent)
            single_sent.append((sent_id, label))
        data4sentseg.append(single_sent)
    return data4sentseg
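
The examples in this listing rely on a sentence2id helper whose definition is not shown. As a rough sketch of the simple two-argument form called in Examples #1, #3, and #4 (Example #2 uses a variant that also pads and returns the sentence length), assuming the vocabulary carries an '<UNK>' entry for out-of-vocabulary tokens:

def sentence2id(sent, vocab2index, unk_token='<UNK>'):
    # Hypothetical sketch: look up each token's index, falling back to the
    # unknown-token index when a token is missing from the vocabulary.
    unk_id = vocab2index.get(unk_token, 0)
    return [vocab2index.get(token, unk_id) for token in sent]
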
Code Example #2
target_batches = data_helper.get_target_batches()

init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    for epoch in range(max_epoch):
        all_preds = []
        epoch_loss = 0
        for input_batch, target_batch in zip(input_batches, target_batches):
            input_token_ids = []
            target_token_ids = []
            input_sentence_lengths = []

            for input_sentence in input_batch:
                # Convert the source sentence to token ids with the encoder
                # vocabulary and record its original length.
                input_sentence, sentence_length = data_helper.sentence2id(
                    input_sentence,
                    vocab=encoder_vocab,
                    max_sentence_length=encoder_sentence_length)
                input_token_ids.append(input_sentence)
                input_sentence_lengths.append(sentence_length)

            for target_sentence in target_batch:
                # Convert the target sentence with the decoder vocabulary
                # (is_target=True marks decoder-side preprocessing).
                target_sentence = data_helper.sentence2id(
                    target_sentence,
                    vocab=decoder_vocab,
                    max_sentence_length=decoder_sentence_length,
                    is_target=True)
                target_token_ids.append(target_sentence)

            # Run one training step; fetch this batch's predictions and loss.
            batch_preds, batch_loss, _ = sess.run(
                [predictions, loss, train_op],
                feed_dict={
Code Example #3
parser.add_argument(
    '--char_vec_path',
    type=str,
    default='../w2v/fasttext_char_vec/fasttext_cbow_char.model.vec',
    help='file for word vec in fasttext')
args = parser.parse_args()

## get char embeddings
vocab, vocab2index, embeddings = get_fasttext(args.char_vec_path)
## read corpus and get training data

# training model
if args.mode == 'train':
    dev_percent = 0.1
    sent, tag = read_corpus(args.train_data)
    sent_ = [sentence2id(s, vocab2index) for s in sent]
    tag_ = [tag2label(l) for l in tag]
    data_ = list(zip(sent_, tag_))
    data_num = len(data_)
    dev_ind = -int(data_num * dev_percent)
    train_data = data_[:dev_ind]
    test_data = data_[dev_ind:]
    test_size = len(test_data)

    ## paths setting
    timestamp = time.asctime().replace(' ', '_').replace(':', '_')
    output_path = os.path.abspath(
        os.path.join(os.path.curdir, "runs", timestamp))
    os.makedirs(output_path, exist_ok=True)
    summary_path = os.path.join(output_path, "summaries")
    os.makedirs(summary_path, exist_ok=True)
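
A minimal sketch of how the (sentence_ids, labels) pairs in train_data might then be fed to the model in mini-batches; the batch_yield name, batch size, and shuffling policy are assumptions rather than part of the original script:

import random

def batch_yield(data, batch_size=64, shuffle=True):
    # Hypothetical helper: yield parallel lists of sentence ids and label
    # sequences, one mini-batch at a time.
    if shuffle:
        data = data[:]
        random.shuffle(data)
    for start in range(0, len(data), batch_size):
        batch = data[start:start + batch_size]
        yield [sent for sent, _ in batch], [labels for _, labels in batch]
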
Code Example #4
def convert_to_num(tokens_list, vocab_to_int):
    # Map every tokenized sentence to its sequence of vocabulary indices.
    res = [helper.sentence2id(t, vocab_to_int) for t in tokens_list]
    assert len(res) == len(tokens_list)
    return res
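
For illustration, a self-contained usage sketch; the helper module is replaced by a hypothetical stand-in that does a plain dictionary lookup, which may not match the project's real sentence2id:

from types import SimpleNamespace

# Hypothetical stand-in for the helper module imported above.
helper = SimpleNamespace(
    sentence2id=lambda tokens, vocab: [vocab.get(t, vocab['<UNK>']) for t in tokens])

vocab_to_int = {'<UNK>': 0, 'hello': 1, 'world': 2}
print(convert_to_num([['hello', 'world'], ['hello', 'there']], vocab_to_int))
# -> [[1, 2], [1, 0]]  ('there' falls back to the <UNK> index)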