Code example #1
def delete(neg_filepath, pos_filepath, mode='train'):
    '''
    Divide sentences into attribute markers and contents.
    '''
    neg_sentences = load_sentences(neg_filepath)
    pos_sentences = load_sentences(pos_filepath)
    if not (os.path.exists(hp.data_path + '/tf_idf.0')
            and os.path.exists(hp.data_path + '/tf_idf.1')):
        print('making tf_idf dictionary...')
        # n-gram
        neg_ngram_dict = get_ngram(neg_sentences)
        pos_ngram_dict = get_ngram(pos_sentences)
        # relative frequency
        get_tf_idf(neg_ngram_dict, pos_ngram_dict)
    neg_ngrams = dict(
        x.split('\t')
        for x in codecs.open(hp.data_path +
                             '/tf_idf.0', 'r').read().splitlines())
    pos_ngrams = dict(
        x.split('\t')
        for x in codecs.open(hp.data_path +
                             '/tf_idf.1', 'r').read().splitlines())
    # divide sentence into attribute markers and contents
    # neg:0 pos:1
    print('dividing sentences...')
    for num in ['0', '1']:
        if num == '0':
            sentences = neg_sentences
            attribute_markers = neg_ngrams
        else:
            sentences = pos_sentences
            attribute_markers = pos_ngrams
        with codecs.open(hp.data_path + '/delete/delete.' + mode + '.' + num,
                         'w', 'utf-8') as fout:
            for sentence in sentences:
                words = sentence.split(' ')
                value_dict = {}
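                # collect every n-gram of the sentence that appears in the
                # pre-computed tf-idf dictionary, together with its score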
                for i in range(len(words)):
                    for n in range(hp.ngram - 1, 0, -1):
                        if i + n > len(words):
                            continue
                        tmp_attribute_marker = ' '.join(words[i:i + n])
                        if tmp_attribute_marker in attribute_markers:
                            # cast to float: the tf-idf scores were read from
                            # file as strings
                            value_dict[tmp_attribute_marker] = float(
                                attribute_markers[tmp_attribute_marker])
                # if a sentence has attribute_marker
                if len(value_dict) > 0:
                    attribute_marker = max(value_dict, key=value_dict.get)
                    content = sentence.replace(attribute_marker, '')
                    if len(content) > 2:
                        fout.write(sentence + '\t' + content + '\t' +
                                   attribute_marker + '\n')
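
get_ngram and get_tf_idf are not shown in this snippet. A minimal sketch of one plausible scoring scheme they might implement, counting n-grams per corpus and taking a smoothed relative-frequency ratio between the two sentiment corpora (the function names, max_n and smooth below are illustrative assumptions, not the project's actual code):

from collections import Counter

def ngram_counts(sentences, max_n=4):
    # count all n-grams up to max_n tokens over whitespace-tokenized sentences
    counts = Counter()
    for sentence in sentences:
        words = sentence.split(' ')
        for n in range(1, max_n + 1):
            for i in range(len(words) - n + 1):
                counts[' '.join(words[i:i + n])] += 1
    return counts

def salience(ngram, own_counts, other_counts, smooth=1.0):
    # smoothed ratio: how much more often the n-gram appears in its own corpus
    # than in the other one; high values mark attribute phrases
    return (own_counts[ngram] + smooth) / (other_counts[ngram] + smooth)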
Code example #2
def main():
	# load data
	# x_train, y_train, x_test, y_test = data.load_data('treebank')

	sentences, labels, n_classes = data.load_sentences()
	n_input = sentences.shape[0]
	time_steps = sentences.shape[1]

	embeddings = embed.embeddings()
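	# flatten to one long token sequence, embed every token, then restore the (n_input, time_steps, 100) shape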
	x_train = embeddings.train_embedding(sentences.reshape(n_input*time_steps))
	x_train = x_train.reshape(n_input, time_steps, 100)
	print(x_train.shape)


	# # print(x_train.shape)
	# #replace words with embeddings
	# # path = 'glove.840B.300d.txt'
	# # x_train, y_train = embeddings.embed(path, x_train, y_train)

	# # x_train_id, y_train_id, x_test_id, y_test_id = embeddings.make_dictionary(x_train, y_train, x_test, y_test)
	# x_train = x_train.reshape(batch_size, time_steps, 100)
	train_using_lstm(x_train, labels, 400, n_classes, time_steps)
Code example #3
vocab1 = build_vocab(v1, [FLAGS.train, FLAGS.test])
vocab2 = build_vocab(v2, [FLAGS.train, FLAGS.test])

embed1 = Word2VecModel(FLAGS.embed1, vocab1, FLAGS.unif)

print('Loaded word embeddings: ' + FLAGS.embed1)

if FLAGS.embed2 is None:
    print('No embed2 found, using embed1 for both')
    FLAGS.embed2 = FLAGS.embed1

embed2 = Word2VecModel(FLAGS.embed2, vocab2, FLAGS.unif)
print('Loaded word embeddings: ' + FLAGS.embed2)

ts = load_sentences(FLAGS.train, embed1.vocab, embed2.vocab, FLAGS.mxlen)
es = load_sentences(FLAGS.test, embed1.vocab, embed2.vocab, FLAGS.mxlen)
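# reverse lookup tables (index -> word) built from the two embedding vocabularies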
rlut1 = revlut(embed1.vocab)
rlut2 = revlut(embed2.vocab)

#with tf.device('/cpu:0'):
with tf.Graph().as_default():
    sess = tf.Session()
    with sess.as_default():

        if FLAGS.attn:
            seq2seq_creator_fn = (Seq2Seq.create_lstm_attn
                                  if FLAGS.rnntype.lower() == 'lstm'
                                  else Seq2Seq.create_gru_attn)
        else:
            seq2seq_creator_fn = (Seq2Seq.create_lstm
                                  if FLAGS.rnntype.lower() == 'lstm'
                                  else Seq2Seq.create_gru)
Code example #4
            else:
                next_value = beam_multinomial(SAMPLE_PRUNE_INIT, output)
            if next_value == EOS:
                break

        sent = lookup_sentence(rlut2, dst_i.squeeze())
        print('Guess: %s' % sent)
        print(
            '------------------------------------------------------------------------'
        )


f2i = {}

seq2seq = Seq2SeqModel()
BASE = 'seq2seq'

with tf.Graph().as_default():
    sess = tf.Session()
    with sess.as_default():
        seq2seq.restore(sess, FLAGS.indir, BASE, FLAGS.mxlen)
        rlut1 = revlut(seq2seq.vocab1)
        rlut2 = revlut(seq2seq.vocab2)
        es = load_sentences(FLAGS.test, seq2seq.vocab1, seq2seq.vocab2,
                            FLAGS.mxlen, FLAGS.batchsz)
        init = tf.global_variables_initializer()

        sess.run(init)
        show_batch(seq2seq, es, sess, rlut1, rlut2, seq2seq.vocab2,
                   FLAGS.sample)
Code example #5
import tensorflow as tf
import os
from tqdm import tqdm
import random
from data import load_sentences, create_data, load_vocab
from hyperparams import Hyperparams as hp
from graph import Graph
import argparse
import numpy as np

if __name__ == '__main__':
    if not os.path.exists(hp.logdir): os.makedirs(hp.logdir)

    # data load
    sents = load_sentences(hp.data_path + '/1-billion-word.train', False)
    valid_sents = load_sentences(hp.data_path + '/1-billion-word.dev', False)
    _, i2w = load_vocab()

    print('Creating data...')
    X, Y, _, _ = create_data(sents)
    valid_X, valid_Y, _, _ = create_data(valid_sents)

    # model
    g = Graph()
    data_size = X.shape[0]
    data_list = list(range(data_size))
    with g.graph.as_default():
        saver = tf.train.Saver()
        with tf.Session() as sess:
            # Initialize
            sess.run(tf.global_variables_initializer())
Code example #6
def inference(mode):
    # data load
    neg_lines = load_sentences(hp.data_path + '/delete/delete.test.0', False)
    pos_lines = load_sentences(hp.data_path + '/delete/delete.test.1', False)
    word_w2i, word_i2w = load_vocab()
    # mode
    if mode == 'delete_only':
        # assert(DeleteOnlyGraph(X, Y, attribute_labels))
        g = DeleteOnlyGraph()
    else:
        g = DeleteAndRetrieveGraph()
    with g.graph.as_default(), tf.Session() as sess:
        sv = tf.train.Saver()
        # Restore parameters
        print("Parameter Restoring...")
        sv.restore(sess, './logdir/' + mode + '/model.ckpt')

        # Inference
        if not os.path.exists(hp.data_path + '/generate'):
            os.mkdir(hp.data_path + '/generate')
        for num in ['0', '1']:
            if num == '0':
                X, Y, A, Attribute_labels, Sources, Targets, Attributes = create_data(
                    neg_lines, 1, mode='inference')
            else:
                X, Y, A, Attribute_labels, Sources, Targets, Attributes = create_data(
                    pos_lines, 0, mode='inference')
            with codecs.open(
                    hp.data_path + '/generate/' + mode + '.test.' + num, "w",
                    "utf-8") as fout:
                for i in range(len(X)):
                    x = X[i:i + 1]
                    attri_label = np.array(Attribute_labels[i:i + 1]).reshape(
                        (-1, 1))
                    a = A[i:i + 1]
                    sources = Sources[i:i + 1]
                    targets = Targets[i:i + 1]

                    preds = np.zeros((1, hp.max_len), np.int32)
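                    # greedy autoregressive decoding: predict one position at a
                    # time, feeding previous predictions back in as input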
                    for j in range(hp.max_len):
                        if mode == 'delete_only':
                            _preds = sess.run(g.pred, {
                                g.x: x,
                                g.y: preds,
                                g.attributes: attri_label
                            })
                        else:
                            _preds = sess.run(g.pred, {
                                g.x: x,
                                g.y: preds,
                                g.a: a
                            })
                        preds[:, j] = _preds[:, j]

                    for source, target, pred in zip(sources, targets,
                                                    preds):  # sentence-wise
                        got = " ".join(
                            word_i2w[idx]
                            for idx in pred).split("<END>")[0].strip()
                        fout.write("- expected: " + target + "\n")
                        fout.write("- got: " + got + "\n\n")
                        fout.flush()
Code example #7
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-mode',
                        action='store',
                        dest='mode',
                        type=str,
                        default='delete_only',
                        help='training mode to run')
    par_args = parser.parse_args()
    hp.neural_mode = par_args.mode
    if not os.path.exists('logdir/' + hp.neural_mode):
        os.makedirs('logdir/' + hp.neural_mode)

    # data load
    neg_lines = load_sentences(hp.data_path + '/delete/delete.train.0', False)
    pos_lines = load_sentences(hp.data_path + '/delete/delete.train.1', False)
    valid_neg_lines = load_sentences(hp.data_path + '/delete/delete.dev.0',
                                     False)
    valid_pos_lines = load_sentences(hp.data_path + '/delete/delete.dev.1',
                                     False)

    print('Creating data...')
    neg_X, neg_Y, neg_A, neg_attribute_labels, _, _, _ = create_data(
        neg_lines, 0)
    pos_X, pos_Y, pos_A, pos_attribute_labels, _, _, _ = create_data(
        pos_lines, 1)
    X = np.concatenate([neg_X, pos_X], axis=0)
    Y = np.concatenate([neg_Y, pos_Y], axis=0)
    A = np.concatenate([neg_A, pos_A], axis=0)
    a_labels = np.array(neg_attribute_labels + pos_attribute_labels).reshape(
        (-1, 1))  # shape assumed: the snippet is cut off here; matches the inference code
Code example #8
vocab1 = build_vocab(v1, {args.train, args.test})
vocab2 = build_vocab(v2, {args.train, args.test})

embed1 = Word2VecModel(args.embed1, vocab1, args.unif)

print('Loaded word embeddings: ' + args.embed1)

if args.embed2 is None:
    print('No embed2 found, using embed1 for both')
    args.embed2 = args.embed1

embed2 = Word2VecModel(args.embed2, vocab2, args.unif)
print('Loaded word embeddings: ' + args.embed2)

ts = load_sentences(args.train, embed1.vocab, embed2.vocab, args.mxlen,
                    args.batchsz, long_0_tensor_alloc)
es = load_sentences(args.test, embed1.vocab, embed2.vocab, args.mxlen,
                    args.batchsz, long_0_tensor_alloc)
rlut1 = revlut(embed1.vocab)
rlut2 = revlut(embed2.vocab)

Seq2SeqModelType = Seq2SeqAttnModel if args.attn else Seq2SeqModel
print(Seq2SeqModelType)
seq2seq = Seq2SeqModelType(embed1, embed2, args.mxlen, args.hsz, args.layers,
                           args.rnntype)

trainer = Trainer(gpu, seq2seq, args.optim, args.eta, args.mom)

# early-stopping bookkeeping: best (lowest) validation error seen so far and
# counters for epochs since the last improvement
err_min = 1
last_improved = 0
reset = 0
Code example #9
File: script.py  Project: howardh/scribe
    args = parser.parse_args()
    return args


if __name__ == '__main__':
    args = parse_arguments()

    data = load_data()
    sorted_indices = np.argsort([len(d) for d in data])

    data = [data[i] for i in sorted_indices]
    normalized_data, m, s = normalize_strokes(data)
    m = m.tolist()
    s = s.tolist()

    sentences = load_sentences()
    sentences = [sentences[i] for i in sorted_indices]
    alphabet, alphabet_dict = compute_alphabet(sentences)
    sentence_vars = [
        Variable(torch.from_numpy(sentence_to_vectors(
            s, alphabet_dict)).float().cuda(),
                 requires_grad=False)
        for s in tqdm(sentences, desc="Converting Sentences")
    ]

    # Create RNN
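    # hard-coded stroke mean/std (these override the values computed by
    # normalize_strokes above)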
    m = [0.41261038184165955, -0.006002499256283045]
    s = [2.0667049884796143, 1.8475052118301392]
    if args.unconditioned:
        rnn = GeneratorRNN(num_components=20, mean=m, std=s).cuda()
    elif args.conditioned:
Code example #10
from hyperparams import Hyperparams as hp
from data import load_sentences, make_dict
import os
from delete import delete

print("Make word dictionary...")
neg_lines = load_sentences(hp.data_path + '/sentiment.train.0', True)
pos_lines = load_sentences(hp.data_path + '/sentiment.train.1', True)
make_dict(neg_lines + pos_lines)

# delete file
print("Make delete file...")
if not os.path.exists(hp.data_path + '/delete'):
    os.makedirs(hp.data_path + '/delete')
    delete(hp.data_path + '/sentiment.train.0',
           hp.data_path + '/sentiment.train.1', mode='train')
    delete(hp.data_path + '/sentiment.dev.0',
           hp.data_path + '/sentiment.dev.1', mode='dev')
    delete(hp.data_path + '/sentiment.test.0',
           hp.data_path + '/sentiment.test.1', mode='test')
Code example #11
unif = 0 if FLAGS.static else FLAGS.unif
#w2vModel = Word2VecModel(FLAGS.glove_embed_file, vocab, unif)
w2vModel = models[int(sys.argv[1])](FLAGS.glove_embed_file, vocab, unif)

# Load data
print("Loading data...")
x_text, y = data_helpers.load_data_and_labels(FLAGS.data_file)

print(len(x_text))

# Build vocabulary
max_document_length = max([len(x.split(" ")) for x in x_text])
max_document_length = 100  # hard-coded override of the computed maximum
# vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
# x = np.array(list(vocab_processor.fit_transform(x_text)))
dataset = load_sentences([(y1, x) for x, y1 in zip(x_text, y)], w2vModel.vocab,
                         FLAGS.clean, FLAGS.chars, max_document_length)
x_train = dataset.x
print(dataset.x.shape)
y_train = dataset.y

x_text_test, y_test = data_helpers.load_data_and_labels(FLAGS.test_file)
dataset2 = load_sentences([(y1, x) for x, y1 in zip(x_text_test, y_test)],
                          w2vModel.vocab, FLAGS.clean, FLAGS.chars,
                          max_document_length)
x_test = dataset2.x
print(dataset2.x.shape)
y_test = dataset2.y

# Randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y_train)))
Code example #12
def template_based():
    if not os.path.exists(hp.data_path + '/generate'):
        os.makedirs(hp.data_path + '/generate')
    print('template_based...')
    for num in ['0', '1']:
        if num == '0':
            neg_lines = load_sentences(hp.data_path + '/delete/delete.test.0',
                                       False)
            pos_lines = load_sentences(hp.data_path + '/delete/delete.train.1',
                                       False)
            neg_sentences_contents_dict = dict(
                x.split('\t')[:2] for x in neg_lines)
            pos_sentences_contents_dict = dict(
                x.split('\t')[:2] for x in pos_lines)
            pos_sentences_marker_dict = dict(
                [x.split('\t')[0], x.split('\t')[2]] for x in pos_lines)
            sentences1 = neg_sentences_contents_dict
            sentences2 = pos_sentences_contents_dict
            marker2 = pos_sentences_marker_dict
        else:
            neg_lines = load_sentences(hp.data_path + '/delete/delete.train.0',
                                       False)
            pos_lines = load_sentences(hp.data_path + '/delete/delete.test.1',
                                       False)
            neg_sentences_contents_dict = dict(
                x.split('\t')[:2] for x in neg_lines)
            pos_sentences_contents_dict = dict(
                x.split('\t')[:2] for x in pos_lines)
            neg_sentences_marker_dict = dict(
                [x.split('\t')[0], x.split('\t')[2]] for x in neg_lines)
            sentences1 = pos_sentences_contents_dict
            sentences2 = neg_sentences_contents_dict
            marker2 = neg_sentences_marker_dict
        with codecs.open(hp.data_path + '/generate/template_based.test.' + num,
                         'w', 'utf-8') as fout:
            for sentence1 in sentences1:
                dist_dict = {}
                # Search up to hp.max_candidates randomly.
                # random.sample needs a sequence, not a dict view
                frag_sentences2 = random.sample(list(sentences2),
                                                hp.max_candidates)
                sentence1_content = sentences1[sentence1]
                for sentence2 in frag_sentences2:
                    # distance between pos_content and neg_content
                    dist_dict[sentence2] = levenshtein_distance(
                        sentence1_content, sentences2[sentence2])
                min_sentence = min(dist_dict, key=dist_dict.get)
                nearest_marker = marker2[min_sentence]
                sentence1_list = sentence1.split(' ')
                sentence1_content_list = sentences1[sentence1].split(' ')
                # Insert attribute markers in contents
                index = 0
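                # find the first position where the full sentence and its
                # content diverge, i.e. where the attribute marker was deleted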
                for idx in range(len(sentence1_list)):
                    if (idx >= len(sentence1_content_list)
                            or sentence1_list[idx] != sentence1_content_list[idx]):
                        index = idx
                        break
                generated_sentence = ' '.join(sentence1_content_list[:index]) + ' ' + \
                                     nearest_marker + ' ' + ' '.join(sentence1_content_list[index:])
                generated_sentence = generated_sentence.replace('  ', ' ')
                fout.write("- expected: " + sentence1 + "\n")
                fout.write("- got: " + generated_sentence + "\n\n")
                fout.flush()
Code example #13
def retrieve_only(dist_mode='levenshtein'):

    print('retrieve_only with ' + dist_mode + ' distance...')
    for num in ['0', '1']:
        if num == '0':
            neg_lines = load_sentences(hp.data_path + '/delete/delete.test.0',
                                       False)
            pos_lines = load_sentences(hp.data_path + '/delete/delete.train.1',
                                       False)
            neg_sentences_dict = dict(x.split('\t')[:2] for x in neg_lines)
            pos_sentences_dict = dict(x.split('\t')[:2] for x in pos_lines)
            sentences1 = neg_sentences_dict
            sentences2 = pos_sentences_dict
        else:
            neg_lines = load_sentences(hp.data_path + '/delete/delete.train.0',
                                       False)
            pos_lines = load_sentences(hp.data_path + '/delete/delete.test.1',
                                       False)
            neg_sentences_dict = dict(x.split('\t')[:2] for x in neg_lines)
            pos_sentences_dict = dict(x.split('\t')[:2] for x in pos_lines)
            sentences1 = pos_sentences_dict
            sentences2 = neg_sentences_dict
        with codecs.open(hp.data_path + '/generate/retrieve_only.test.' + num,
                         'w', 'utf-8') as fout:
            # Levenshtein distance
            if dist_mode == 'levenshtein':
                for sentence1 in sentences1:
                    dist_dict = {}
                    # Search up to hp.max_candidates randomly.
                    # random.sample needs a sequence, not a dict view
                    frag_sentences2 = random.sample(list(sentences2),
                                                    hp.max_candidates)
                    for sentence2 in frag_sentences2:
                        # distance between pos_content and neg_content
                        dist_dict[sentence2] = levenshtein_distance(
                            sentences1[sentence1], sentences2[sentence2])
                    nearest_sentence = min(dist_dict, key=dist_dict.get)
                    fout.write("- expected: " + sentence1 + "\n")
                    fout.write("- got: " + nearest_sentence + "\n\n")
                    fout.flush()

            # Embedding distance between sentence1 and sentence2 using the
            # "universal sentence encoder" [1], but it is slow and does not perform well.
            if dist_mode == 'embedding':
                embed = hub.Module(
                    "https://tfhub.dev/google/universal-sentence-encoder/1")
                with tf.Session() as session:
                    session.run([
                        tf.global_variables_initializer(),
                        tf.tables_initializer()
                    ])
                    embedded_sentences1 = session.run(
                        embed(list(sentences1.values())))
                    for sentence1, embedded_sentence1 in zip(
                            sentences1.keys(), embedded_sentences1):
                        dist_dict = {}
                        # Search up to hp.max_candidates randomly.
                        # random.sample needs a sequence, not a dict view
                        frag_sentences2 = random.sample(
                            list(sentences2), hp.max_candidates)
                        frag_contents2 = []
                        for frag_sentence2 in frag_sentences2:
                            frag_contents2.append(sentences2[frag_sentence2])
                        embedded_sentences2 = session.run(
                            embed(frag_contents2))
                        for idx, embedded_sentence2 in enumerate(
                                embedded_sentences2):
                            dist_dict[idx] = np.inner(embedded_sentence1,
                                                      embedded_sentence2)
                        nearest_idx = max(dist_dict, key=dist_dict.get)
                        nearest_sentence = frag_sentences2[nearest_idx]
                        fout.write("- expected: " + sentence1 + "\n")
                        fout.write("- got: " + nearest_sentence + "\n\n")
                        fout.flush()
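
template_based and retrieve_only above both rely on a levenshtein_distance helper that is not shown. A minimal sketch of the standard dynamic-programming edit distance (the project's version may operate on word lists rather than characters; this one accepts either):

def levenshtein_distance(a, b):
    # O(len(a) * len(b)) edit distance over two sequences (strings or token lists)
    prev = list(range(len(b) + 1))
    for i, x in enumerate(a, 1):
        curr = [i]
        for j, y in enumerate(b, 1):
            curr.append(min(prev[j] + 1,              # deletion
                            curr[j - 1] + 1,          # insertion
                            prev[j - 1] + (x != y)))  # substitution
        prev = curr
    return prev[-1]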
Code example #14
File: main.py  Project: arpit9295/ce7455
# Constants

# paths to files
# Path to the stored mapping file
mapping_file = './data/mapping.pkl'

# Path to the stored model
name = parameters['name']
model_name = models_path + name  # get_name(parameters)

if not os.path.exists(models_path):
    os.makedirs(models_path)

# ##### Load data and preprocess

train_sentences = load_sentences(parameters['train'], parameters['zeros'])
test_sentences = load_sentences(parameters['test'], parameters['zeros'])
dev_sentences = load_sentences(parameters['dev'], parameters['zeros'])

update_tag_scheme(train_sentences, parameters['tag_scheme'])
update_tag_scheme(dev_sentences, parameters['tag_scheme'])
update_tag_scheme(test_sentences, parameters['tag_scheme'])

print(train_sentences[0])
print(dev_sentences[0])
print(test_sentences[0])

dico_words, word_to_id, id_to_word = to_word_mapping(train_sentences,
                                                     parameters['lower'])
dico_chars, char_to_id, id_to_char = to_char_mapping(train_sentences)
dico_tags, tag_to_id, id_to_tag = to_tag_mapping(train_sentences)
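
to_word_mapping, to_char_mapping and to_tag_mapping are not shown in this snippet. A minimal sketch of the usual pattern behind such helpers, a frequency dictionary plus id lookups in both directions (hypothetical; the project's own implementation may add <UNK> handling, digit replacement, etc.):

from collections import Counter

def create_mapping(dico):
    # assign ids by decreasing frequency (ties broken alphabetically)
    sorted_items = sorted(dico.items(), key=lambda x: (-x[1], x[0]))
    id_to_item = {i: item for i, (item, _) in enumerate(sorted_items)}
    item_to_id = {item: i for i, item in id_to_item.items()}
    return item_to_id, id_to_item

def to_word_mapping_sketch(sentences, lower=False):
    # sentences: list of sentences, each a list of (token, ..., tag) rows
    dico = Counter(row[0].lower() if lower else row[0]
                   for sentence in sentences for row in sentence)
    word_to_id, id_to_word = create_mapping(dico)
    return dico, word_to_id, id_to_word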