Code example #1
import argparse

from corpus.textdata import TextData  # assumed location of TextData, matching the other examples in this file


def parse_args(args=None):
    """
    Parse the command-line arguments.
    Args:
        args (list<str>): List of arguments to parse. If None, the default sys.argv is parsed.
    """

    parser = argparse.ArgumentParser()
    parser.add_argument('--test', type=int, default=0, help='Test mode')
    parser.add_argument(
        '--maxLength',
        type=int,
        default=40,
        help=
        'Maximum length of the sentence (for input and output), define number of maximum step of the RNN'
    )
    parser.add_argument(
        '--filterVocab',
        type=int,
        default=1,
        help=
        'Remove rarely used words (by default words used only once). 0 to keep all words.'
    )
    parser.add_argument(
        '--vocabularySize',
        type=int,
        default=40000,
        help='Limit the number of words in the vocabulary (0 for unlimited)')
    parser.add_argument('--corpus',
                        choices=TextData.corpusChoices(),
                        default=TextData.corpusChoices()[0],
                        help='Corpus on which extract the dataset.')
    parser.add_argument('--rootDir',
                        type=str,
                        default='corpus',
                        help='Folder where to look for the models and data')
    parser.add_argument(
        '--datasetTag',
        type=str,
        default='',
        help=
        'Add a tag to the dataset (file where to load the vocabulary and the precomputed samples, not the original corpus). Useful to manage multiple versions. Also used to define the file used for the lightweight format.'
    )  # The samples are computed from the corpus if they do not already exist. They are saved in 'data/samples/'.
    parser.add_argument(
        '--skipLines',
        action='store_true',
        default=True,  # NOTE: with action='store_true' and default=True this flag is effectively always enabled
        help=
        'Generate training samples by only using even conversation lines as questions (and odd lines as answers). Useful to train the network on a particular person.'
    )
    return parser.parse_args(args)
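
A minimal usage sketch for the parser above; the attribute names simply mirror the flags defined in the example:

if __name__ == '__main__':
    # Parse sys.argv (or pass an explicit list of flags, e.g. ['--maxLength', '25']).
    args = parse_args()
    print(args.maxLength, args.filterVocab, args.corpus)
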
Code example #2
def main():
    # global text_data
    args = parse_args()
    text_data = TextData(args)
    try:
        if args.test:
            gen_test_interactive(text_data)
        else:
            # Step 1: Pre-train the Generator and get the GEN_0 model
            gen_pre_train(text_data)

            # Step 2: GEN model test
            # gen_test_interactive(text_data)

            # Step 3: Pre-train the Discriminator and get the DISC_0 model
            # disc_pre_train(text_data)

            # Step 4: Train the GEN model and DISC model using AL/RL
            # al_train(text_data)

            # Step 5: GEN model test
            # gen_test_interactive(text_data)

            # integration test
            # connection.start_server(text_data, True)
    except KeyboardInterrupt:
        pass
Code example #3
def main():
    # global text_data
    args = parse_args()
    text_data = TextData(args)
    try:
        if args.test:
            gen_test_interactive(text_data)
        else:
            #gen_pre_train(text_data)

            gen_test_interactive(text_data)

            #disc_pre_train(text_data)

            #al_train(text_data)

            gen_test_interactive(text_data)

            connection.start_server(text_data, True)
    except KeyboardInterrupt:
        pass
Code example #4
from corpus.textdata import TextData
import csv
from collections import Counter
import numpy as np
textdata = TextData("data/kvret_train_public.json",
                    "data/kvret_dev_public.json",
                    "data/kvret_test_public.json")

texts = []

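# Convert every (input, target) training pair back into whitespace-separated tokens for the embedding corpus.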
for i in textdata.trainingSamples:
    texts.append(textdata.sequence2str(i[0], clean=True).split(" "))
    texts.append(textdata.sequence2str(i[1], clean=True).split(" "))

with open("data/samples/emb_in.txt", "w") as output:
    writer = csv.writer(output,
                        lineterminator='\n',
                        delimiter=' ',
                        quotechar=',',
                        quoting=csv.QUOTE_MINIMAL)
    writer.writerows(texts)

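# Count how often each (subject, object) entity pair occurs in the KB triples attached to the training samples.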
comatrix = Counter()
for i in textdata.trainingSamples:
    for triple in i[2]:
        comatrix[(triple[0], triple[2])] += 1
kb = []
kb_comb = []
for (ei, ej) in comatrix:
    kb.append([textdata.id2word[ei], textdata.id2word[ej]])
ALPHA = 0.55
Code example #5
import matplotlib.pyplot as plt
import hypertools as hyp
import numpy as np
from sklearn.decomposition import PCA
from corpus.textdata import TextData

glove_filename = "data/samples/jointEmbedding.txt"

textdata = TextData("data/kvret_train_public.json",
                    "data/kvret_dev_public.json",
                    "data/kvret_test_public.json", glove_filename)

x = []
s = []

plotting_sentences = [
    "i have a doctor appointment next month on the 13 th at 11 am with tom please set a reminder",
    "navigate to my friend's home please do you know the address of friend's home actually i need to go home the quickest route please,the quickest route home is 4_miles away with heavy_traffic located at 5671_barringer_street",
    "what's the forecast in carson for this weekend,in carson it will be foggy on saturday and dew on sunday"
]

# plt.scatter(representation[0], representation[1])
# plt.annotate(word, xy=(representation[0], representation[1]), xytext=(5, 2),
#              textcoords='offset points', ha='right', va='bottom')

for sentence in plotting_sentences:
    for word in sentence.split(" "):
        x.append(textdata.word_to_embedding_dict[word])
        s.append(word)

# Plot the collected embeddings with hypertools, one point per word, labelled with the word itself.
hyp.plot(np.array(x), '.', labels=s)
Code example #6
def main(attention, attention_architecture):
    # sys.stdout = open('trained_model/log.txt', 'w')
    print(tf.__version__)
    steps_per_eval = 10

    if not attention_architecture:
        attention_architecture = "standard"  # standard || KVAttention

    train_file = 'data/kvret_train_public.json'
    valid_file = 'data/kvret_dev_public.json'
    test_file = 'data/kvret_test_public.json'
    model_dir="trained_model"

    textData = TextData(train_file, valid_file, test_file)
    voc_size = textData.getVocabularySize()

    batch_size = 256
    eos=1
    maxLengthEnco = textData.getInputMaxLength()
    maxLengthDeco = textData.getTargetMaxLength()
    print ("Max Decodder", textData.getTargetMaxLength())

    train_graph = tf.Graph()

    with train_graph.as_default(), tf.container("train"):

        model_device_fn = None
        with tf.device(model_device_fn):
            if attention_architecture == "standard":
                model = Seq2Seq(
                    200,
                    vocab_size=voc_size,
                    encoder_len=maxLengthEnco,
                    decoder_len=maxLengthDeco,
                    batch_size=batch_size,
                    stop_symbols=eos,
                    use_attn=attention
                )
                if attention:
                    model_dir = "trained_model/AttnSeq2Seq"
                else:
                    model_dir = "trained_model/Seq2Seq"
            elif attention_architecture == "KVAttention":
                model_creator = Seq2SeqKV  # NOTE: only stored here; `model` itself is built only for the "standard" architecture in this snippet

        #train_sess = tf.Session()
        config_proto = get_config_proto(
            log_device_placement=False,
            num_intra_threads=0,
            num_inter_threads=0)

        train_sess = tf.Session(target="", config=config_proto, graph=train_graph)

    if not os.path.exists(model_dir+"/stats.txt"):
        with open(model_dir+"/stats.txt", "a") as myfile:
            myfile.write("Training_Epoch\t Epoch_loss\t Epoch_Bleu\t Validation_loss\t Valid_Bleu\n")


    """Create translation model and initialize or load parameters in session."""
    latest_ckpt = tf.train.latest_checkpoint(model_dir)
    if latest_ckpt:
        model.saver.restore(train_sess, latest_ckpt)
        print("Model restored.")
        print("Current Global step", model.global_step.eval(train_sess))
        global_step = model.global_step.eval(train_sess)
    else:
        start_time = time.time()
        with train_graph.as_default():
            train_sess.run(tf.global_variables_initializer())
        global_step = 0


    n_epoch = 2000

    epoch_step = global_step
    loss_history = []
    valid_loss_history = []
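    # Main training loop: run up to n_epoch epochs over all batches, evaluating and checkpointing every steps_per_eval epochs.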
    while epoch_step < n_epoch:
        try:
            epoch_step += 1
            all_predicted = []
            target_batches = []
            epoch_loss = 0
            batches = textData.getBatches(batch_size)

            for current_step in range(0,len(batches)):
                nextBatch = batches[current_step]
                # Training pass
                feedDict = {}

                model.update_feed_dict(feedDict, nextBatch.encoderSeqs,
                                                    nextBatch.decoderSeqs, nextBatch.targetSeqs,
                                                    nextBatch.weights)

                [out, batch_predictions, batch_loss, _] = train_sess.run([model.outputs, model.predictions, model.total_loss, model.training_op], feed_dict=feedDict)

                loss_history.append(batch_loss)
                epoch_loss += batch_loss
                all_predicted.append(batch_predictions)
                target_batches.append(np.transpose(nextBatch.decoderSeqs))

            train_sess.run(model.increment_global_step)

            if epoch_step % steps_per_eval == 0:

                candidates, references = textData.get_candidates(target_batches, all_predicted)
                training_metric_score = nltk.translate.bleu_score.corpus_bleu(references, candidates)
                eval_loss, eval_metric_score = evaluate(model, textData, train_sess)

                with open(model_dir + "/stats.txt", "a") as myfile:
                    myfile.write(str(epoch_step)+"\t "+str(epoch_loss)+"\t "+str(training_metric_score)+"\t "+str(eval_loss)+"\t "+str(eval_metric_score)+"\n")
                # Save checkpoint
                model.saver.save(
                    train_sess,
                    os.path.join(model_dir, "translate.ckpt"),
                    global_step=epoch_step)

            print('Epoch', epoch_step)
            print('Training loss', epoch_loss)

        except (KeyboardInterrupt, SystemExit):  # the user pressed Ctrl+C while training was in progress
            print('Interruption detected, exiting the program...')
            break

    model.global_step = epoch_step
    # Save checkpoint
    model.saver.save(train_sess,
                os.path.join(model_dir, "translate.ckpt"),
                global_step=epoch_step)
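
A minimal sketch of how this training entry point could be invoked when run as a standalone script; the argument values below are illustrative:

if __name__ == "__main__":
    # Train the plain Seq2Seq model with attention enabled; pass "KVAttention" to select the key-value variant.
    main(attention=True, attention_architecture="standard")
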
Code example #7
File: test.py  Project: AndreiNosov/Chatbot_KVNN
from torch.autograd import Variable
import torch

from corpus.textdata import TextData

glove_filename = "data/samples/joint_model"

textdata = TextData("data/kvret_train_public.json", "data/kvret_dev_public.json",
                    "data/kvret_test_public.json")

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
# x = textdata.word_to_embedding_dict["parking_garage"].reshape(1, -1)
# y = textdata.word_to_embedding_dict["dish_parking"].reshape(1, -1)
# y2 = textdata.word_to_embedding_dict["clear_sky"].reshape(1, -1)


# print(cosine_similarity(x,y2))
batches = textdata.getBatches(120, valid=True, transpose=False)

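# For each batch, wrap the padded id sequences and their masks as Variables, transposed from batch-major to time-major layout.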
for batch in batches:
    input_batch = Variable(torch.LongTensor(batch.encoderSeqs)).transpose(0, 1)
    for i in batch.encoderMaskSeqs:
        print(len(i))
    target_batch = Variable(torch.LongTensor(batch.targetSeqs)).transpose(0, 1)

    input_batch_mask = Variable(torch.FloatTensor(batch.encoderMaskSeqs)).transpose(0, 1)
    target_batch_mask = Variable(torch.FloatTensor(batch.decoderMaskSeqs)).transpose(0, 1)