Example #1
def main(args):
    # Dataset functions
    vocab = Vocabulary('./data/vocabulary.json', padding=args.padding)
    kb_vocab = Vocabulary('./data/vocabulary.json', padding=4)
    print('Loading datasets.')
    training = Data(args.training_data, vocab, kb_vocab)
    validation = Data(args.validation_data, vocab, kb_vocab)
    training.load()
    validation.load()
    training.transform()
    training.kb_out()
    validation.transform()
    validation.kb_out()
    print('Datasets Loaded.')
    print('Compiling Model.')

    model = KVMMModel(pad_length=args.padding,
                      embedding_size=args.embedding,
                      vocab_size=vocab.size(),
                      batch_size=batch_size,
                      n_chars=vocab.size(),
                      n_labels=vocab.size(),
                      encoder_units=200,
                      decoder_units=200).to(device)

    print(model)
    model_optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss()

    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every
    print_every = 100
    start = time.time()
    n_iters = 500000

    iter = 0
    while iter < n_iters:
        training_data = training.generator(batch_size)
        input_tensors = training_data[0][0]
        target_tensors = training_data[1]
        kbs = training_data[0][1]
        iter += 1
        loss = train(input_tensors, target_tensors, kbs, model,
                     model_optimizer, criterion, vocab, kb_vocab)
        print_loss_total += loss
        plot_loss_total += loss
        if iter % print_every == 0:
            validation_data = validation.generator(batch_size)
            validation_inputs = validation_data[0][0]
            validation_kbs = validation_data[0][1]
            validation_targets = validation_data[1]
            accuracy = evaluate(model, validation_inputs, validation_targets,
                                validation_kbs)
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f - val_accuracy %f' %
                  (timeSince(start, iter / n_iters), iter,
                   iter / n_iters * 100, print_loss_avg, accuracy))
            torch.save(model.state_dict(), 'model_weights.pytorch')
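Note: `batch_size`, `device`, and the `train` helper used in this loop are not defined inside `main`; they are presumably module-level names elsewhere in the project. A minimal, hypothetical sketch of a compatible `train` step (the model call signature and tensor shapes are assumptions, not the project's actual code):

def train(input_tensors, target_tensors, kbs, model,
          model_optimizer, criterion, vocab, kb_vocab):
    # Hypothetical sketch only; the real train() lives elsewhere in the project.
    model.train()
    model_optimizer.zero_grad()
    # Assumed: the model maps (inputs, kbs) to per-token logits of shape
    # (batch, seq_len, vocab_size), and target_tensors hold token indices.
    logits = model(input_tensors, kbs)
    loss = criterion(logits.view(-1, logits.size(-1)),
                     target_tensors.view(-1))
    loss.backward()
    model_optimizer.step()
    return loss.item()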
Example #2
def main(args):
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"  # see issue #152
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    # Dataset functions
    vocab = Vocabulary('./data/vocabulary.json', padding=args.padding)
    kb_vocab = Vocabulary('./data/vocabulary.json', padding=4)
    print('Loading datasets.')
    training = Data(args.training_data, vocab, kb_vocab)
    validation = Data(args.validation_data, vocab, kb_vocab)
    training.load()
    validation.load()
    training.transform()
    training.kb_out()
    validation.transform()
    validation.kb_out()
    print('Datasets Loaded.')
    print('Compiling Model.')

    model = memnn(pad_length=args.padding,
                  embedding_size=args.embedding,
                  vocab_size=vocab.size(),
                  batch_size=args.batch_size,
                  n_chars=vocab.size(),
                  n_labels=vocab.size(),
                  embedding_learnable=True,
                  encoder_units=200,
                  decoder_units=200,
                  trainable=True)

    model.summary()
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=[
                      'accuracy',
                  ])
    print('Model Compiled.')
    print('Training. Ctrl+C to end early.')

    try:
        model.fit_generator(generator=training.generator(args.batch_size),
                            steps_per_epoch=300,
                            validation_data=validation.generator(
                                args.batch_size),
                            validation_steps=10,
                            workers=1,
                            verbose=1,
                            epochs=args.epochs)

    except KeyboardInterrupt:
        print('Model training stopped early.')
    model.save_weights("model_weights_nkbb.hdf5")

    print('Model training complete.')
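The batch structure `([input_batch, kb_batch], target_batch)` expected by `fit_generator` above matches how Examples #1 and #6 index batches from `training.generator` (`[0][0]` for inputs, `[0][1]` for KBs, `[1]` for targets). A tiny, hypothetical generator with that contract, for illustration only:

import numpy as np

def toy_generator(inputs, kbs, targets, batch_size):
    # Hypothetical sketch of the batch contract expected by the model above;
    # the project's real Data.generator also handles vocabulary lookup,
    # padding, and KB construction. Assumes NumPy arrays as inputs.
    n = len(inputs)
    while True:
        idx = np.random.choice(n, batch_size, replace=False)
        yield [inputs[idx], kbs[idx]], targets[idx]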
Example #3
        predicted.append(run_example(model, kbs, vocabulary, example))
        outdf['input'].append(example)
        outdf['output'].append(predicted[-1])
    return predicted


if __name__ == "__main__":
    no_unks = False
    dialog_type = "schedule"
    file_name = "-" + dialog_type + "-epoch-80"  # "-2409"
    pad_length = 20
    df = pd.read_csv("../data/test_data - " + dialog_type + ".csv",
                     delimiter=";")
    inputs = list(df["input"])
    outputs = list(df["output"])
    vocab = Vocabulary('../data/vocabulary-full.json', padding=pad_length)

    kb_vocabulary = Vocabulary('../data/vocabulary-full.json', padding=4)

    model = memnn(pad_length=20,
                  embedding_size=200,
                  batch_size=1,
                  vocab_size=vocab.size(),
                  n_chars=vocab.size(),
                  n_labels=vocab.size(),
                  embedding_learnable=True,
                  encoder_units=200,
                  decoder_units=200)
    weights_file = "../weights/model_weights_nkbb" + file_name + ".hdf5"
    model.load_weights(weights_file, by_name=True)
Example #4
def main_examples(dialog_type,
                  underscore,
                  kb,
                  iteration=500000,
                  delimiter=",",
                  final_folder="",
                  test_mode=""):
    if final_folder == "":
        final_folder = dialog_type[3:]
    try:
        df = pd.read_csv("data/test_data" + underscore + test_mode +
                         dialog_type + kb + ".csv",
                         encoding="ISO-8859-1",
                         delimiter=delimiter)
    except FileNotFoundError:
        df = pd.read_csv("../data/test_data" + underscore + test_mode +
                         dialog_type + kb + ".csv",
                         encoding="ISO-8859-1",
                         delimiter=delimiter)
    inputs = list(df["input"])
    outputs = list(df["output"])
    actual_clusters = list(df["actual_cluster"])
    try:
        vocab = Vocabulary('data/vocabulary-train' + dialog_type +
                           ' - perfect decomposition - preprocessedFinal.json',
                           padding=pad_length)
        kb_vocabulary = Vocabulary(
            'data/vocabulary-train' + dialog_type +
            ' - perfect decomposition - preprocessedFinal.json',
            padding=4)
    except FileNotFoundError:
        vocab = Vocabulary('../data/vocabulary-train' + dialog_type +
                           ' - perfect decomposition - preprocessedFinal.json',
                           padding=pad_length)
        kb_vocabulary = Vocabulary(
            '../data/vocabulary-train' + dialog_type +
            ' - perfect decomposition - preprocessedFinal.json',
            padding=4)

    model = KVMMModel(pad_length=20,
                      embedding_size=200,
                      batch_size=1,
                      vocab_size=vocab.size(),
                      n_chars=vocab.size(),
                      n_labels=vocab.size(),
                      encoder_units=200,
                      decoder_units=200).to(device)
    weights_file = "final-" + final_folder + "/model_weights_" + dialog_type[
        3:] + "_iter_" + str(iteration) + ".pytorch"
    try:
        try:
            model.load_state_dict(torch.load(weights_file))
        except RuntimeError:
            model.load_state_dict(torch.load(weights_file, map_location='cpu'))
    except FileNotFoundError:
        try:
            model.load_state_dict(torch.load("../" + weights_file))
        except RuntimeError:
            model.load_state_dict(
                torch.load("../" + weights_file, map_location='cpu'))

    kbfile = "data/normalised_kbtuples.csv"
    try:
        df = pd.read_csv(kbfile)
    except FileNotFoundError:
        df = pd.read_csv("../" + kbfile)
    kbs = list(df["subject"] + " " + df["relation"])
    # print(kbs[:3])
    kbs = np.array(list(map(kb_vocabulary.string_to_int, kbs)))
    kbs = np.repeat(kbs[np.newaxis, :, :], 1, axis=0)
    data = run_examples(model, kbs, vocab, inputs, outputs)
    df = pd.DataFrame(
        columns=["inputs", "outputs", "prediction", "actual_cluster"])
    d = {'outputs': [], 'inputs': [], 'predictions': [], "actual_cluster": []}
    for index, (i, o, p) in enumerate(zip(inputs, outputs, data)):
        d["outputs"].append(str(o))
        d["inputs"].append(str(i))
        d["predictions"].append(str(p))
        d["actual_cluster"].append(actual_clusters[index])
    df = pd.DataFrame(d)
    return df
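In the knowledge-base handling above, each `subject relation` string is mapped to a fixed-length index vector (`padding=4` for `kb_vocabulary`), and the resulting matrix then gets a leading batch axis; with a repeat count of 1, `np.repeat(kbs[np.newaxis, :, :], 1, axis=0)` is equivalent to `kbs[np.newaxis]`. A standalone illustration of the shape handling, using a hypothetical stand-in for `string_to_int`:

import numpy as np

def fake_string_to_int(s, pad=4):
    # Hypothetical stand-in for kb_vocabulary.string_to_int: token indices,
    # truncated or zero-padded to a fixed length of `pad`.
    ids = [hash(tok) % 1000 for tok in s.split()][:pad]
    return ids + [0] * (pad - len(ids))

kb_strings = ["pizza_hut address", "dentist_appointment time"]
kbs = np.array([fake_string_to_int(s) for s in kb_strings])
print(kbs.shape)                                   # (2, 4)
kbs = np.repeat(kbs[np.newaxis, :, :], 1, axis=0)
print(kbs.shape)                                   # (1, 2, 4): single batch axis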
Example #5
    """
        All Accuracy
        https://github.com/rasmusbergpalm/normalization/blob/master/train.py#L10
    """
    return K.mean(
        K.all(K.equal(K.max(y_true, axis=-1),
                      K.cast(K.argmax(y_pred, axis=-1), K.floatx())),
              axis=1))


# Configuration
training_data = './training.csv'
validation_data = './validation.csv'

# Dataset functions
input_vocab = Vocabulary('./human_vocab.json', padding=config.padding)
output_vocab = Vocabulary('./machine_vocab.json', padding=config.padding)

print('Loading datasets...')

training = Data(training_data, input_vocab, output_vocab)
validation = Data(validation_data, input_vocab, output_vocab)
training.load()
validation.load()
training.transform()
validation.transform()

print('Datasets Loaded.')


def build_models(pad_length=config.padding,
Example #6
def main(args):
    # Dataset functions
    vocab = Vocabulary(args.vocabulary_data, padding=args.padding)
    kb_vocab = Vocabulary(args.vocabulary_data, padding=4)  # 7
    print('Loading datasets.')
    # Callback.__init__(self)
    if args.training_data.find("schedule") != -1:
        train_file_name = "schedule"
    elif args.training_data.find("navigate") != -1:
        train_file_name = "navigate"
    elif args.training_data.find("weather") != -1:
        train_file_name = "weather"
    elif args.training_data.find("ubuntu") != -1:
        train_file_name = "ubuntu"
    elif args.training_data.find("original") != -1:
        train_file_name = "original"
    else:
        train_file_name = "unknown"
    if args.save_path == "default":
        args.save_path = "weights/model_weights_" + train_file_name
    training = Data(args.training_data, vocab, kb_vocab,
                    args.generated_training_data)
    validation = Data(args.validation_data, vocab, kb_vocab)
    training.load()
    validation.load()
    training.transform()
    training.kb_out()
    validation.transform()
    validation.kb_out()
    print('Datasets Loaded.')
    print('Compiling Model.')

    model = KVMMModel(pad_length=args.padding,
                      embedding_size=args.embedding,
                      vocab_size=vocab.size(),
                      batch_size=batch_size,
                      n_chars=vocab.size(),
                      n_labels=vocab.size(),
                      encoder_units=200,
                      decoder_units=200).to(device)

    print(model)
    # Training using Adam Optimizer
    model_optimizer = optim.Adam(model.parameters(), lr=0.001)
    # Training using cross-entropy loss
    criterion = nn.CrossEntropyLoss()

    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every
    print_every = 100
    save_every = 10000
    start = time.time()
    n_iters = 500000

    iter = 0
    while iter < n_iters:
        training_data = training.generator(batch_size)
        input_tensors = training_data[0][0]
        target_tensors = training_data[1]
        kbs = training_data[0][1]
        iter += 1
        loss = train(input_tensors, target_tensors, kbs, model,
                     model_optimizer, criterion, vocab, kb_vocab)
        print_loss_total += loss
        plot_loss_total += loss
        if iter % print_every == 0:
            validation_data = validation.generator(batch_size)
            validation_inputs = validation_data[0][0]
            validation_kbs = validation_data[0][1]
            validation_targets = validation_data[1]
            accuracy = evaluate(model, validation_inputs, validation_targets,
                                validation_kbs)
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f - val_accuracy %f' %
                  (timeSince(start, iter / n_iters), iter,
                   iter / n_iters * 100, print_loss_avg, accuracy))
            if iter % save_every == 0:
                torch.save(model.state_dict(),
                           args.save_path + "_iter_" + str(iter) + ".pytorch")
Example #7
            if len(sort) > k:
                ndf["u" + str(k + 1)].append(sort[k][0])
            else:
                ndf["u" + str(k + 1)].append("None")

    ndf = pd.DataFrame(ndf)
    ndf.to_csv("cl_reranked_with_lstmfk_kb_t1.csv")
    print("Saved to file")
            #tmp.append(res[0])
        #reranked.append(tmp)


if __name__ == "__main__":
    pad_length = 20
    df = pd.read_csv("../data/ranked_responsesfk_kb.csv",encoding="latin1")
    vocab = Vocabulary('../fkdata/vocabl_fk.json', padding=pad_length)
    model = dualenc(pad_length=20,
                    embedding_size=100,
                    batch_size=1,
                    vocab_size=vocab.size(),
                    n_chars=vocab.size(),
                    n_labels=vocab.size(),
                    encoder_units=256,
                    decoder_units=256)
    weights_file = "../cl_modellstm_weightsfk_kb.hdf5"
    model.load_weights(weights_file, by_name=True)
    run_examples(model, vocab, df)
    #print(data[:3])
    '''ndf = {"input": [], "response": [], "u1": [], "u2": [], "u3": [], "u4": [], "u5": [], "u6": [], "u7": [], "u8": [],"u9": [], "u10": []}
    ndf = {"input": [], "response": [], "ranked": []}
    for i,(inp,out) in enumerate(zip(df["input"],df["response"])):
Example #8
                            required=False, default=20, type=int)

    named_args.add_argument('-t', '--training-data', metavar='|',
                            help="""Location of training data""",
                            required=False, default='./data/train_data.csv')

    named_args.add_argument('-v', '--validation-data', metavar='|',
                            help="""Location of validation data""",
                            required=False, default='./data/val_data.csv')

    named_args.add_argument('-b', '--batch-size', metavar='|',
                            help="""Location of validation data""",
                            required=False, default=100, type=int)
    args = parser.parse_args()
    print(args)
    vocab = Vocabulary('./data/vocabulary.json',
                       padding=args.padding)
    model = memnn(pad_length=args.padding,
                  embedding_size=args.embedding,
                  vocab_size=vocab.size(),
                  batch_size=args.batch_size,
                  n_chars=vocab.size(),
                  n_labels=vocab.size(),
                  embedding_learnable=True,
                  encoder_units=200,
                  decoder_units=200,
                  trainable=True)
    model.load_weights("model_weights_nkbb.hdf5")

    test_data = load_test_data()
    model.predict(test_data)  # ("How is the weather in San Francisco today?")