Example #1
def load_data(config):
    """
    Load training examples and pretrained word embeddings from disk.
    Return training inputs, labels and pretrained embeddings.
    """
    # Load raw data
    wq_file = config["webquestions_examples_file"]
    n_neg_sample = config["num_classes"] - 1
    x_u, x_r, y, max_len = dh.get_training_examples_for_softmax(wq_file, n_neg_sample)
    # Pad sentences
    pad = lambda x: dh.pad_sentences(x, max_len)
    pad_lst = lambda x: list(map(pad, x))
    x_u = list(map(pad, x_u))
    x_r = list(map(pad_lst, x_r))
    # Load tokens and pretrained embeddings
    we_file = config["word_embeddings_file"]
    voc_size = config["vocabulary_size"]
    embedding_size = config["embedding_size"]
    tokens, U = dh.get_pretrained_wordvec_from_file(we_file, (voc_size, embedding_size))
    # Represent sentences as list(nparray) of ints
    dctize = lambda word: tokens[word] if word in tokens else tokens["pad"]
    dctizes = lambda words: list(map(dctize, words))
    dctizess = lambda wordss: list(map(dctizes, wordss))
    x_u_i = np.array(list(map(dctizes, x_u)))
    x_r_i = np.array(list(map(dctizess, x_r)))
    y = np.array(y)
    
    return (x_u_i, x_r_i, y, max_len, U)
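Every example on this page ultimately calls some variant of pad_sentences(sentences, max_len); the helper itself is never shown. As a rough reference only, a minimal sketch consistent with how it is called above (fixed-length padding with a "<PAD/>" token, returning the padded sentences together with the length used) could look like the following; the real data_helpers implementations in the sampled repositories differ in signature and padding token.

def pad_sentences(sentences, sequence_length=None, padding_word="<PAD/>"):
    """Pad (or truncate) each tokenized sentence to a common length (sketch, not the original helper)."""
    if sequence_length is None:
        sequence_length = max(len(s) for s in sentences)
    padded = []
    for sentence in sentences:
        trimmed = sentence[:sequence_length]
        padded.append(trimmed + [padding_word] * (sequence_length - len(trimmed)))
    return padded, sequence_length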
Example #2
def load_data(config):
    """
    Load training examples and pretrained word embeddings from disk.
    Return training inputs, labels and pretrained embeddings.
    """
    # Load raw data
    wq_file = config["webquestions_examples_file"]
    n_neg_sample = config["num_classes"] - 1
    x_u, x_r, y, max_len = dh.get_training_examples_for_softmax(
        wq_file, n_neg_sample)
    # Pad sentences
    pad = lambda x: dh.pad_sentences(x, max_len)
    pad_lst = lambda x: list(map(pad, x))
    x_u = list(map(pad, x_u))
    x_r = list(map(pad_lst, x_r))
    # Load tokens and pretrained embeddings
    we_file = config["word_embeddings_file"]
    voc_size = config["vocabulary_size"]
    embedding_size = config["embedding_size"]
    tokens, U = dh.get_pretrained_wordvec_from_file(we_file,
                                                    (voc_size, embedding_size))
    # Represent sentences as list(nparray) of ints
    dctize = lambda word: tokens[word] if word in tokens else tokens["pad"]
    dctizes = lambda words: list(map(dctize, words))
    dctizess = lambda wordss: list(map(dctizes, wordss))
    x_u_i = np.array(list(map(dctizes, x_u)))
    x_r_i = np.array(list(map(dctizess, x_r)))
    y = np.array(y)

    return (x_u_i, x_r_i, y, max_len, U)
Example #3
def prepare_sentences(sentences, vocabulary, max_length):
    print(sentences)
    sentences_processed = process_sentences(sentences)
    sentences_padded, _ = pad_sentences(sentences_processed,
                                        sequence_length=max_length)
    x, _ = build_input_data(sentences_padded, 0, vocabulary)
    return x
Example #4
def preencode(df):
    sentences = make_text_matrix(df)
    s = [x.split() for x in sentences['text'].values]
    l = sentences['target'].values
    sentences_padded = pad_sentences(s)
    vocabulary, vocabulary_inv = build_vocab(sentences_padded)
    x, y = build_input_data(sentences_padded, l, vocabulary)
    return x, y, vocabulary, vocabulary_inv
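Examples #3 and #4 also depend on build_vocab and build_input_data from the same data_helpers modules. A plausible minimal version, assumed from how they are called here (build_vocab returns a word-to-index dict plus the inverse list; build_input_data turns padded sentences and labels into numpy arrays), is sketched below; note that some later examples call build_input_data(sentences, vocabulary) without labels, so the signatures vary between repositories.

import numpy as np
from collections import Counter

def build_vocab(sentences):
    # Most frequent words get the smallest indices (sketch only).
    counts = Counter(word for sentence in sentences for word in sentence)
    vocabulary_inv = [word for word, _ in counts.most_common()]
    vocabulary = {word: i for i, word in enumerate(vocabulary_inv)}
    return vocabulary, vocabulary_inv

def build_input_data(sentences, labels, vocabulary):
    # Map every word to its index and stack the results into arrays.
    x = np.array([[vocabulary[word] for word in sentence] for sentence in sentences])
    y = np.array(labels)
    return x, y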
Example #5
def preprocess(model):
    dict_sentences = {}
    reverse_dict = {}
    match_dictionary = {}
    pair_list = []
    import sys
    i = 0
    k = 0
    maxlen = 0
    # this reads in one line at a time from stdin
    for line in sys.stdin:
        i+=1
        tokens = line.split("\t")
        sent1 = tokens[0]
        sent2 = tokens[1]

        if clean_sent_cond(sent1) or clean_sent_cond(sent2):
            continue
        else:
            k += 1

        if sent1 not in dict_sentences:
            dict_sentences[sent1] = len(dict_sentences) + 1
        if sent2 not in dict_sentences:
            dict_sentences[sent2] = len(dict_sentences) + 1
        index_1 = dict_sentences[sent1]
        index_2 = dict_sentences[sent2]

        if index_1 not in match_dictionary:
            match_dictionary[index_1] = []
        if index_2 not in match_dictionary:
            match_dictionary[index_2] = []
        match_dictionary[index_1].append(index_2)
        match_dictionary[index_2].append(index_1)
        pair_list.append((index_1, index_2))

        if i % 10000 == 0:
            print(str(k) + "/" + str(i))
        if k == 500000:
            break

    i = 0
    for entry in dict_sentences:
        simple_sent1 = list(filter(lambda x: len(x) > 1, data_helpers.clean_str(entry).split(" ")))
        sent1 = data_helpers.build_input_data(data_helpers.pad_sentences([simple_sent1], 40, padding_word="<PAD/>"),
                                              model.vocab)
        reverse_dict[dict_sentences[entry]] = sent1
        if i % 10000 == 0:
            print(i)
        i += 1

    random.shuffle(pair_list)
    pickle.dump(reverse_dict, open("sentences_small_x", "wb"), protocol=pickle.HIGHEST_PROTOCOL)
    print("writing sentences " + str(len(reverse_dict)))
    pickle.dump(match_dictionary, open("pairs_index_small_x", "wb"), protocol=pickle.HIGHEST_PROTOCOL)
    print("writing map " + str(len(match_dictionary)))
    pickle.dump(pair_list, open("pairs_list_small_x", "wb"), protocol=pickle.HIGHEST_PROTOCOL)
    print("pairs " + str(len(pair_list)))
Example #6
def transform_testdata(test_strs, vocabulary):
    test_strs = [
        Lemmatizer(data_helpers.clean_str(sent)) for sent in test_strs
    ]
    test_strs = [s.split(" ") for s in test_strs]

    test_strs_padded = data_helpers.pad_sentences(test_strs,
                                                  testStringLength=90)
    x = np.array([[
        vocabulary[word] if word in vocabulary else vocabulary["UNKNOWN_WORD"]
        for word in sentence
    ] for sentence in test_strs_padded])
    return x
Example #7
    def predict(self, x_raw):
        x_raw = list(x_raw)
        x_raw = [s.strip() for s in x_raw]
        x_raw = [list(s) for s in x_raw]
        x_pad, _ = data_helpers.pad_sentences(x_raw, sequence_length)
        x_test = np.array([[vocabulary.get(word, 0) for word in sentence]
                           for sentence in x_pad])

        # Get the placeholders from the graph by name
        input_x = self.graph.get_operation_by_name("input_x").outputs[0]
        # input_y = graph.get_operation_by_name("input_y").outputs[0]
        dropout_keep_prob = self.graph.get_operation_by_name(
            "dropout_keep_prob").outputs[0]

        # Tensors we want to evaluate
        predictions = self.graph.get_operation_by_name(
            "output/predictions").outputs[0]

        # Generate batches for one epoch
        batches = data_helpers.batch_iter(list(x_test),
                                          FLAGS.batch_size,
                                          1,
                                          shuffle=False)

        # Collect the predictions here
        all_predictions = []

        for x_test_batch in batches:
            batch_predictions = self.sess.run(predictions, {
                input_x: x_test_batch,
                dropout_keep_prob: 1.0
            })
            all_predictions = np.concatenate(
                [all_predictions, batch_predictions])

        return all_predictions
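Example #7 (and Example #17 further down) iterate over data_helpers.batch_iter(data, batch_size, num_epochs, shuffle=...). A minimal generator with that behavior, offered only as an assumption about what the helper does, could be:

import numpy as np

def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """Yield mini-batches of `data` for `num_epochs` passes (sketch, not the original helper)."""
    data = list(data)
    data_size = len(data)
    num_batches = (data_size + batch_size - 1) // batch_size
    for _ in range(num_epochs):
        order = np.random.permutation(data_size) if shuffle else np.arange(data_size)
        for batch_num in range(num_batches):
            start = batch_num * batch_size
            yield np.array([data[i] for i in order[start:start + batch_size]])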
Example #8
from data_helpers import get_labels
from data_helpers import load_data_and_labels  # assumed to live in data_helpers alongside get_labels
from data_helpers import pad_sentences
from data_helpers import plot_confusion_matrix
from data_helpers import multiclass_roc_auc_score
from sklearn.model_selection import KFold  # used by the CV harness below
import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt
from matplotlib import rcParams
rcParams.update({'figure.autolayout': True})


print('Loading annotated social text data')
x_text, y_class = load_data_and_labels()
y = get_labels()

sentences_padded, sequence_length = pad_sentences(x_text)

# global variables
embedding_dim = 200
num_filters = 512
drop = 0.5
epochs = 1
batch_size = 100

# define 10-fold cross validation test harness
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
cvscores = []
auc_scores = []


print('10 fold CV starting')
Example #9
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# Data Preparation
# ==================================================

# Load data
print("Loading data...")
x_text, y = data_helpers.load_data_and_labels(FLAGS.positive_data_file,
                                              FLAGS.negative_data_file)
x_eval = data_helpers.load_test_data(FLAGS.test_data_file)

# Pad sentences
sentences_padded_all, max_length = data_helpers.pad_sentences(x_text + x_eval)
sentences_padded, max_length = data_helpers.pad_sentences(x_text, max_length)

# Build vocabulary
vocabulary, vocabulary_inv = data_helpers.build_vocab(sentences_padded_all)
x, y = data_helpers.build_input_data(sentences_padded, y, vocabulary)

# Randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# Split train/test set
dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
Example #10
#======================== preprocess data ========================
#

# TODO: After completing all training, use argparse to store the params.
positive_data_file = "./data/rt-polaritydata/rt-polarity.pos"
negative_data_file = "./data/rt-polaritydata/rt-polarity.neg"

# Load data
print("Loading data...")
x_text, y = data_helpers.load_data_and_labels(positive_data_file,
                                              negative_data_file)

# Pad sentences
print("Padding sentences...")
x_text = data_helpers.pad_sentences(x_text)
print("The sequence length is: ", len(x_text[0]))

# Build vocabulary
vocabulary, vocabulary_inv = data_helpers.build_vocab(x_text)

# Represent each sentence as a sequence of word indices
x = data_helpers.build_index_sentence(x_text, vocabulary)
y = y.argmax(
    axis=1)  # y: [1, 1, 1, ...., 0, 0, 0]. 1 for positive, 0 for negative

# Shuffle data
np.random.seed(42)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]
Example #11
def main(argv):
    parser = argparse.ArgumentParser(
        description='CNN baseline for DSTC5 SAP Task')
    parser.add_argument('--trainset',
                        dest='trainset',
                        action='store',
                        metavar='TRAINSET',
                        required=True,
                        help='')
    parser.add_argument('--testset',
                        dest='testset',
                        action='store',
                        metavar='TESTSET',
                        required=True,
                        help='')
    parser.add_argument('--dataroot',
                        dest='dataroot',
                        action='store',
                        required=True,
                        metavar='PATH',
                        help='')

    args = parser.parse_args()

    # load parameters
    params = data_helpers.load_params("parameters/cnn.txt")
    pprint(params)
    ctx_len = int(params['context_length'])

    train_utters = []
    trainset = dataset_walker.dataset_walker(args.trainset,
                                             dataroot=args.dataroot,
                                             labels=True,
                                             translations=True)
    sys.stderr.write('Loading training instances ... ')
    transcript_contexts = []
    for call in trainset:
        for i, (log_utter, translations, label_utter) in enumerate(call):
            transcript = data_helpers.tokenize_and_lower(
                log_utter['transcript'])
            transcript_contexts += [transcript]

            speech_act = label_utter['speech_act']
            sa_label_list = []
            for sa in speech_act:
                sa_label_list += [
                    '%s_%s' % (sa['act'], attr) for attr in sa['attributes']
                ]
            sa_label_list = sorted(set(sa_label_list))
            # train_utters += [(transcript_contexts[max(0, i+1-ctx_len):i+1], log_utter['speaker'], sa_label_list, log_utter['utter_index'])]
            train_utters += [(transcript, log_utter['speaker'], sa_label_list,
                              log_utter['utter_index'])]

    sys.stderr.write('Done\n')

    test_utters = []
    testset = dataset_walker.dataset_walker(args.testset,
                                            dataroot=args.dataroot,
                                            labels=True,
                                            translations=True)
    sys.stderr.write('Loading testing instances ... ')
    transcript_contexts = []
    for call in testset:
        for i, (log_utter, translations, label_utter) in enumerate(call):
            try:
                translation = data_helpers.tokenize_and_lower(
                    translations['translated'][0]['hyp'])
            except:
                translation = ''
            transcript_contexts += [translation]

            speech_act = label_utter['speech_act']
            sa_label_list = []
            for sa in speech_act:
                sa_label_list += [
                    '%s_%s' % (sa['act'], attr) for attr in sa['attributes']
                ]
            sa_label_list = sorted(set(sa_label_list))
            # test_utters += [(transcript_contexts[max(0, i+1-ctx_len):i+1], log_utter['speaker'], sa_label_list, log_utter['utter_index'])]
            test_utters += [(translation, log_utter['speaker'], sa_label_list,
                             log_utter['utter_index'])]

    # pprint(train_utters[:2])
    # pprint(test_utters[:2])

    # dump_corpus(train_utters, "dstc5_train.txt")
    # dump_corpus(test_utters, "dstc5_test.txt")

    # build vocabulary
    utters = [utter[0].split(' ') for utter in train_utters]
    max_sent_len = int(params['max_sent_len'])
    pad_utters = data_helpers.pad_sentences(utters, max_sent_len)

    vocabulary, inv_vocabulary = data_helpers.build_vocab(pad_utters)
    print("vocabulary size: %d" % len(vocabulary))

    # build input
    train_inputs = data_helpers.build_input_data(pad_utters, vocabulary)

    utters = [utter[0].split(' ') for utter in test_utters]
    pad_utters = data_helpers.pad_sentences(utters, max_sent_len)
    test_inputs = data_helpers.build_input_data(pad_utters, vocabulary)

    # make windowed input data as context
    train_inputs = data_helpers.build_windowed_input(train_inputs, ctx_len)
    test_inputs = data_helpers.build_windowed_input(test_inputs, ctx_len)

    # build labels
    sa_train_labels = [utter[2] for utter in train_utters]
    sa_test_labels = [utter[2] for utter in test_utters]
    label_binarizer = preprocessing.MultiLabelBinarizer()
    label_binarizer.fit(sa_train_labels + sa_test_labels)

    train_labels = label_binarizer.transform(sa_train_labels)
    test_labels = label_binarizer.transform(sa_test_labels)

    # split speakers into two sets
    tourist_train_indices = [
        i for i, utter in enumerate(train_utters)
        if utter[1].lower() == 'tourist'
    ]
    guide_train_indices = [
        i for i, utter in enumerate(train_utters)
        if utter[1].lower() == 'guide'
    ]

    tourist_test_indices = [
        i for i, utter in enumerate(test_utters)
        if utter[1].lower() == 'tourist'
    ]
    guide_test_indices = [
        i for i, utter in enumerate(test_utters) if utter[1].lower() == 'guide'
    ]

    np.random.shuffle(tourist_train_indices)
    np.random.shuffle(guide_train_indices)
    # np.random.shuffle(tourist_test_indices)
    # np.random.shuffle(guide_test_indices)

    tourist_train_inputs = train_inputs[tourist_train_indices]
    tourist_train_labels = train_labels[tourist_train_indices]

    guide_train_inputs = train_inputs[guide_train_indices]
    guide_train_labels = train_labels[guide_train_indices]

    tourist_test_inputs = test_inputs[tourist_test_indices]
    tourist_test_labels = test_labels[tourist_test_indices]

    guide_test_inputs = test_inputs[guide_test_indices]
    guide_test_labels = test_labels[guide_test_indices]

    # load pre-trained word embeddings
    embedding_dim = int(params['embedding_dim'])
    embedding_matrix = data_helpers.load_embedding(
        vocabulary, embedding_dim=embedding_dim, embedding=params['embedding'])

    run_slu_sequence_task(embedding_matrix, vocabulary, label_binarizer,
                          tourist_train_inputs, tourist_train_labels,
                          tourist_test_inputs, tourist_test_labels)

    run_slu_sequence_task(embedding_matrix, vocabulary, label_binarizer,
                          guide_train_inputs, guide_train_labels,
                          guide_test_inputs, guide_test_labels)

    print("")
Example #12
    model_dir = 'word2vec_models'
    model_name = "{:d}features_{:d}minwords_{:d}context_{:s}".format(embedding_dim, min_word_count, context,
                                                                     model_variation)
    print ("Loading word2vec model: " + model_name)
    model_name = os.path.join(model_dir, model_name)
    eb = word2vec.Word2Vec.load(model_name)

    print ("Loading CNN")
    arch = 'imdb_' + model_variation + '7_arch.json'
    weights = 'imdb_' + model_variation + '7.h5'
    model = model_from_json(open(arch).read())
    model.load_weights(weights)

    print ("padding senetences")
    pad_size = model.input_shape[1]
    sentences = data_helpers.pad_sentences(sentences, sequence_length=pad_size)

    x = []
    sentence = sentences[0]
    for word in sentence:
        try:
            vect = eb[word]
        except KeyError:
            vect = eb['<PAD/>']
        x.append(vect)

    x = np.asarray(x)
    pred = model.predict_classes(x, batch_size=1)
    print(pred)
    # for sentence in sentences:
    #     temp = []
Example #13
def main(argv):
    parser = argparse.ArgumentParser(description='CNN baseline for DSTC5 SAP Task')
    parser.add_argument('--trainset', dest='trainset', action='store', metavar='TRAINSET', required=True, help='')
    parser.add_argument('--testset', dest='testset', action='store', metavar='TESTSET', required=True, help='')
    parser.add_argument('--dataroot', dest='dataroot', action='store', required=True, metavar='PATH', help='')
    parser.add_argument('--roletype', dest='roletype', action='store', choices=['guide', 'tourist'], required=True, help='speaker')

    args = parser.parse_args()
    threshold_predictor = None

    train_utters = []
    trainset = dataset_walker.dataset_walker(args.trainset, dataroot=args.dataroot, labels=True, translations=True)
    sys.stderr.write('Loading training instances ... ')
    for call in trainset:
        for (log_utter, translations, label_utter) in call:
            if log_utter['speaker'].lower() != args.roletype:
                continue
            transcript = data_helpers.tokenize_and_lower(log_utter['transcript'])

            speech_act = label_utter['speech_act']
            sa_label_list = []
            for sa in speech_act:
                sa_label_list += ['%s_%s' % (sa['act'], attr) for attr in sa['attributes']]
            sa_label_list = sorted(set(sa_label_list))
            train_utters += [(transcript, log_utter['speaker'], sa_label_list)]
    sys.stderr.write('Done\n')

    test_utters = []
    testset = dataset_walker.dataset_walker(args.testset, dataroot=args.dataroot, labels=True, translations=True)
    sys.stderr.write('Loading testing instances ... ')
    for call in testset:
        for (log_utter, translations, label_utter) in call:
            if log_utter['speaker'].lower() != args.roletype:
                continue
            try:
                translation = data_helpers.tokenize_and_lower(translations['translated'][0]['hyp'])
            except:
                translation = ''

            speech_act = label_utter['speech_act']
            sa_label_list = []
            for sa in speech_act:
                sa_label_list += ['%s_%s' % (sa['act'], attr) for attr in sa['attributes']]
            sa_label_list = sorted(set(sa_label_list))
            test_utters += [(translation, log_utter['speaker'], sa_label_list)]

    pprint(train_utters[:2])
    pprint(test_utters[:2])

    # load parameters
    params = data_helpers.load_params("parameters/cnn.txt")
    pprint(params)
    num_epochs = int(params['num_epochs'])
    validation_split = float(params['validation_split'])
    batch_size = int(params['batch_size'])
    multilabel = params['multilabel'] == "true"

    # build vocabulary
    sents = [utter[0].split(' ') for utter in train_utters]
    max_sent_len = int(params['max_sent_len'])
    pad_sents = data_helpers.pad_sentences(sents, max_sent_len)
    vocabulary, inv_vocabulary = data_helpers.build_vocab(pad_sents)
    print("vocabulary size: %d" % len(vocabulary))
    # params['max_sent_len'] = max_sent_len

    # build inputs
    train_inputs = data_helpers.build_input_data(pad_sents, vocabulary)

    test_sents = [utter[0].split(' ') for utter in test_utters]
    test_pad_sents = data_helpers.pad_sentences(test_sents, max_sent_len)
    test_inputs = data_helpers.build_input_data(test_pad_sents, vocabulary)

    # build labels
    sa_train_labels = [utter[2] for utter in train_utters]
    sa_test_labels = [utter[2] for utter in test_utters]
    label_binarizer = preprocessing.MultiLabelBinarizer()
    label_binarizer.fit(sa_train_labels + sa_test_labels)

    train_labels = label_binarizer.transform(sa_train_labels)
    test_labels = label_binarizer.transform(sa_test_labels)

    # split and shuffle data
    indices = np.arange(train_inputs.shape[0])
    np.random.shuffle(indices)
    train_inputs = train_inputs[indices]
    train_labels = train_labels[indices]
    num_validation = int(validation_split * train_inputs.shape[0])

    # x_train = train_inputs[:-num_validation]
    # y_train = train_labels[:-num_validation]
    # x_val = train_inputs[-num_validation:]
    # y_val = train_labels[-num_validation:]
    x_train = train_inputs
    y_train = train_labels

    x_test = test_inputs
    y_test = test_labels

    # construct a pytorch data_loader
    x_train = torch.from_numpy(x_train).long()
    y_train = torch.from_numpy(y_train).float()
    dataset_tensor = data_utils.TensorDataset(x_train, y_train)
    train_loader = data_utils.DataLoader(dataset_tensor, batch_size=batch_size, shuffle=True, num_workers=4,
                                         pin_memory=False)

    x_test = torch.from_numpy(x_test).long()
    y_test = torch.from_numpy(y_test).long()
    dataset_tensor = data_utils.TensorDataset(x_test, y_test)
    test_loader = data_utils.DataLoader(dataset_tensor, batch_size=batch_size, shuffle=False, num_workers=4,
                                         pin_memory=False)


    # load pre-trained word embeddings
    embedding_dim = int(params['embedding_dim'])
    embedding_matrix = data_helpers.load_embedding(vocabulary, embedding_dim=embedding_dim, embedding=params['embedding'])

    # load model
    model = SluConvNet(params, embedding_matrix, len(vocabulary), y_train.shape[1])

    if torch.cuda.is_available():
        model = model.cuda()
    learning_rate = float(params['learning_rate'])
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    loss_fn = nn.MultiLabelSoftMarginLoss()
    # loss_fn = nn.BCEWithLogitsLoss()

    for epoch in range(num_epochs):
        model.train()   # set the model to training mode (apply dropout etc)
        for i, (inputs, labels) in enumerate(train_loader):
            inputs, labels = autograd.Variable(inputs), autograd.Variable(labels)
            if torch.cuda.is_available():
                inputs, labels = inputs.cuda(), labels.cuda()

            preds = model(inputs)
            if torch.cuda.is_available():
                preds = preds.cuda()

            loss = loss_fn(preds, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if i % 100 == 0:
                print("current loss: %.4f" % loss)

        model.eval()        # set the model to evaluation mode
        # if threshold_predictor is None:
        threshold_predictor = train_threshold(model, train_loader, y_train.numpy())
        # count_predictor = train_count(model, train_loader, y_train.numpy())
        true_acts, pred_acts, metrics = evaluate(model, label_binarizer, test_loader, y_test, multilabel, threshold_predictor)
        # true_acts, pred_acts, metrics = evaluate_count(model, label_binarizer, test_loader, y_test, multilabel, count_predictor)
        print("Precision: %.4f\tRecall: %.4f\tF1-score: %.4f\n" % (metrics[0], metrics[1], metrics[2]))

    # end of training
    true_acts, pred_acts, metrics = evaluate(model, label_binarizer, test_loader, y_test, multilabel)
    print("Precision: %.4f\tRecall: %.4f\tF1-score: %.4f\n" % (metrics[0], metrics[1], metrics[2]))

    with open(("pred_result_%s.txt" % args.roletype), "w") as f:
        for pred_act, true_act in zip(pred_acts, true_acts):
            f.write("pred: %s\ntrue: %s\n\n" % (', '.join(pred_act), ', '.join(true_act)))
Example #14
def sent_embedding(sent):
    sent = remove_stop_word(sent)
    sent = split_sentence([sent])
    sent = pad_sentences(sent)
    sent_vec = sentence_word2vec(sent)
    return sent_vec
print("")

# CHANGE THIS: Load data. Load your own data here
x_raw, y_test = data_helpers.load_data_and_labels(FLAGS.positive_data_file,
                                                  FLAGS.negative_data_file)
y_test = np.argmax(y_test, axis=1)
vocabulary = pickle.load(
    open(
        os.path.abspath(os.path.join(FLAGS.checkpoint_dir, "..", "vocab.txt")),
        "rb"))
sequence_length = pickle.load(
    open(os.path.abspath(os.path.join(FLAGS.checkpoint_dir, "..", "len.txt")),
         "rb"))

# Map data into vocabulary
x_pad, _ = data_helpers.pad_sentences(x_raw, sequence_length)
x_test = np.array([[vocabulary.get(word, 0) for word in sentence]
                   for sentence in x_pad])
x_readable = np.array([[word.encode('utf-8') for word in sentence]
                       for sentence in x_raw])

print("\nEvaluating...\n")

# Evaluation
# ==================================================
checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
graph = tf.Graph()
with graph.as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,
        log_device_placement=FLAGS.log_device_placement)
Example #16
                num_correct += sum(batchY == np.argmax(m.cnn_exec.outputs[0].asnumpy(), axis=1))
                num_total += len(batchY)

            dev_acc = num_correct * 100 / float(num_total)
            print('Iter [%d] Train: Time: %.3fs, Training Accuracy: %.3f \
                    --- Dev Accuracy thus far: %.3f' % (iteration, train_time, train_acc, dev_acc), file=logs)


if __name__ == '__main__':
    mix_model = mix_cnn()
    print('Loading data...')
    # word2vec = data_helpers.load_google_word2vec('/Users/guo/TrainData/google300/GoogleNews-vectors-negative300.bin')

    word2vec = data_helpers.load_pretrained_word2vec('VecForMR_.txt')
    sentences, labels = data_helpers.load_data_and_labels()
    sentences_padded = data_helpers.pad_sentences(sentences)
    x, y = data_helpers.build_input_data_with_word2vec(sentences_padded, labels, word2vec)
    mix_model.dic = data_helpers.buildGram(sentences, min1=6, min2=7)
    mix_model.initTheta()
    x_sent, mix_model.idf = data_helpers.buildDocsTFIDF(mix_model.dic, sentences)
    x_sent = np.array(x_sent)
    # randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    cv = 1
    cv_length = len(y) // 10
    sample_test = shuffle_indices[cv_length * cv:cv_length * (cv + 1)]
    sample_train = np.concatenate((shuffle_indices[:cv_length * cv], shuffle_indices[cv_length * (cv + 1):]))

    # x_shuffled = x[shuffle_indices]
    # y_shuffled = y[shuffle_indices]
Example #17
def main(_):
    text, y_test = data_helpers.load_test_and_labels(FLAGS.test_file)
    with open('vocab_index.pkl', 'rb') as tr_file:
        train_int_to_vab = pickle.load(tr_file)
    #print (train_int_to_vab)
    train_to_int = {word: word_i for word_i, word in train_int_to_vab.items()}
    test_ids = [[
        train_to_int.get(term, train_to_int['<UNK>']) for term in line
    ] for line in text]
    x_test = data_helpers.pad_sentences(test_ids, 20)
    print(x_test[:3])
    print("\nEvaluating...\n")

    # Evaluation
    # ==================================================
    checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph(
                "{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)

            # Get the placeholders from the graph by name
            input_x = graph.get_operation_by_name("input_x").outputs[0]
            # input_y = graph.get_operation_by_name("input_y").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name(
                "dropout_keep_prob").outputs[0]

            # Tensors we want to evaluate
            predictions = graph.get_operation_by_name(
                "output/predictions").outputs[0]

            # Generate batches for one epoch
            batches = data_helpers.batch_iter(list(x_test), FLAGS.batch_size,
                                              1)

            # Collect the predictions here
            all_predictions = []
            batch_predictions = sess.run(predictions, {
                input_x: x_test,
                dropout_keep_prob: 1.0
            })

            all_predictions = np.concatenate(
                [all_predictions, batch_predictions])

            #for x_test_batch in batches:

    print(all_predictions)
    # Print accuracy if y_test is defined
    if y_test is not None:
        correct_predictions = float(sum(all_predictions == y_test))
        print("Total number of test examples: {}".format(len(y_test)))
        print("Accuracy: {:g}".format(correct_predictions /
                                      float(len(y_test))))

    # Save the evaluation to a csv
    predictions_human_readable = np.column_stack(
        (np.array(text), all_predictions))
    out_path = os.path.join(FLAGS.checkpoint_dir, "..", "prediction.csv")
    print("Saving evaluation to {0}".format(out_path))
    with open(out_path, 'w') as f:
        csv.writer(f).writerows(predictions_human_readable)
def main(_):
    # Load data
    print("Loading data...")
    x_, y = data_helpers.load_data_and_labels(FLAGS.train_file)
    train_int_to_vab, train_to_int = data_helpers.cret_dict(x_)
    # Save the mapping between words and their indices
    #print (train_int_to_vab)
    embeding_matric = data_helpers.word_matric(train_to_int)
    # Save the whole vocabulary to a file so it can be loaded at test time
    pickle.dump(train_int_to_vab, open('./vocab_index.pkl', 'wb'))
    #print(train_int_to_vab)

    train_ids = [[
        train_to_int.get(term, train_to_int['<UNK>']) for term in line
    ] for line in x_]
    x_ = data_helpers.pad_sentences(train_ids, 20)
    # Randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x_[shuffle_indices]
    y = np.array(y)
    y_shuffled = y[shuffle_indices]

    # Split train/test set
    # TODO: This is very crude, should use cross-validation
    dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
    x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[
        dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[
        dev_sample_index:]
    print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
    y_train = np_utils.to_categorical(y_train)
    y_dev = np_utils.to_categorical(y_dev)

    # Training
    # ==================================================

    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn = TextCNN(sequence_length=x_train.shape[1],
                          num_classes=y_train.shape[1],
                          vocab_size=len(train_int_to_vab),
                          embedding_size=FLAGS.embedding_dim,
                          filter_sizes=list(
                              map(int, FLAGS.filter_sizes.split(","))),
                          num_filters=FLAGS.num_filters,
                          word_embedding_matrix=embeding_matric,
                          l2_reg_lambda=FLAGS.l2_reg_lambda)

            # Define Training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdamOptimizer(1e-3)
            grads_and_vars = optimizer.compute_gradients(cnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)

            # Keep track of gradient values and sparsity (optional)
            grad_summaries = []
            for g, v in grads_and_vars:
                if g is not None:
                    grad_hist_summary = tf.summary.histogram(
                        "{}/grad/hist".format(v.name), g)
                    sparsity_summary = tf.summary.scalar(
                        "{}/grad/sparsity".format(v.name),
                        tf.nn.zero_fraction(g))
                    grad_summaries.append(grad_hist_summary)
                    grad_summaries.append(sparsity_summary)
            grad_summaries_merged = tf.summary.merge(grad_summaries)

            # Output directory for models and summaries
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(
                os.path.join(os.path.curdir, "runs", timestamp))
            print("Writing to {}\n".format(out_dir))

            # Summaries for loss and accuracy
            loss_summary = tf.summary.scalar("loss", cnn.loss)
            acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)

            # Train Summaries
            train_summary_op = tf.summary.merge(
                [loss_summary, acc_summary, grad_summaries_merged])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(
                train_summary_dir, sess.graph)

            # Dev summaries
            dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
            dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
            dev_summary_writer = tf.summary.FileWriter(dev_summary_dir,
                                                       sess.graph)

            # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
            checkpoint_dir = os.path.abspath(
                os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables(),
                                   max_to_keep=FLAGS.num_checkpoints)

            # Initialize all variables
            sess.run(tf.global_variables_initializer())

            def train_step(x_batch, y_batch):
                """
                A single training step
                """
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.dropout_keep_prob: FLAGS.dropout_keep_prob
                }
                _, step, summaries, loss, accuracy = sess.run([
                    train_op, global_step, train_summary_op, cnn.loss,
                    cnn.accuracy
                ], feed_dict)
                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}, acc {:g}".format(
                    time_str, step, loss, accuracy))
                train_summary_writer.add_summary(summaries, step)

            def dev_step(x_batch, y_batch, writer=None):
                """
                Evaluates model on a dev set
                """
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.dropout_keep_prob: 1.0
                }
                step, summaries, loss, accuracy = sess.run(
                    [global_step, dev_summary_op, cnn.loss, cnn.accuracy],
                    feed_dict)
                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}, acc {:g}".format(
                    time_str, step, loss, accuracy))
                if writer:
                    writer.add_summary(summaries, step)

            # Generate batches
            batches = data_helpers.batch_iter(list(zip(x_train, y_train)),
                                              FLAGS.batch_size,
                                              FLAGS.num_epochs)
            # Training loop. For each batch...
            for batch in batches:
                x_batch, y_batch = zip(*batch)
                train_step(x_batch, y_batch)
                current_step = tf.train.global_step(sess, global_step)
                if current_step % FLAGS.evaluate_every == 0:
                    print("\nEvaluation:")
                    dev_step(x_dev, y_dev, writer=dev_summary_writer)
                    print("")
                if current_step % FLAGS.checkpoint_every == 0:
                    path = saver.save(sess,
                                      checkpoint_prefix,
                                      global_step=current_step)
                    print("Saved model checkpoint to {}\n".format(path))
Example #19
    model_dir = 'word2vec_models'
    model_name = "{:d}features_{:d}minwords_{:d}context_{:s}".format(
        embedding_dim, min_word_count, context, model_variation)
    print("Loading word2vec model: " + model_name)
    model_name = os.path.join(model_dir, model_name)
    eb = word2vec.Word2Vec.load(model_name)

    print("Loading CNN")
    arch = 'imdb_' + model_variation + '7_arch.json'
    weights = 'imdb_' + model_variation + '7.h5'
    model = model_from_json(open(arch).read())
    model.load_weights(weights)

    print("padding senetences")
    pad_size = model.input_shape[1]
    sentences = data_helpers.pad_sentences(sentences, sequence_length=pad_size)

    x = []
    sentence = sentences[0]
    for word in sentence:
        try:
            vect = eb[word]
        except KeyError:
            vect = eb['<PAD/>']
        x.append(vect)

    x = np.asarray(x)
    pred = model.predict_classes(x, batch_size=1)
    print(pred)
    # for sentence in sentences:
    #     temp = []
Example #20
def main(argv):
    parser = argparse.ArgumentParser(
        description='CNN baseline for DSTC5 SAP Task')
    parser.add_argument('--trainset',
                        dest='trainset',
                        action='store',
                        metavar='TRAINSET',
                        required=True,
                        help='')
    parser.add_argument('--testset',
                        dest='testset',
                        action='store',
                        metavar='TESTSET',
                        required=True,
                        help='')
    parser.add_argument('--dataroot',
                        dest='dataroot',
                        action='store',
                        required=True,
                        metavar='PATH',
                        help='')

    args = parser.parse_args()

    train_utters = []
    trainset = dataset_walker.dataset_walker(args.trainset,
                                             dataroot=args.dataroot,
                                             labels=True,
                                             translations=True)
    sys.stderr.write('Loading training instances ... ')
    for call in trainset:
        context_utters = []
        context_utter_str = '<PAD/>'
        context_labels = []
        context_label = ['INI_OPENING']
        last_speaker = None
        for (log_utter, translations, label_utter) in call:
            transcript = data_helpers.tokenize_and_lower(
                log_utter['transcript'])
            speech_act = label_utter['speech_act']
            sa_label_list = []
            for sa in speech_act:
                sa_label_list += [
                    '%s_%s' % (sa['act'], attr) for attr in sa['attributes']
                ]
            sa_label_list = sorted(set(sa_label_list))

            if last_speaker is not None and log_utter[
                    'speaker'] != last_speaker:
                if len(context_utters) > 0:
                    context_utter_str = ' <pause> '.join(context_utters)
                    context_label = context_labels[-1]
                else:
                    context_utter_str = '<PAD/>'
                    context_label = ['INI_OPENING']

                context_utters = []
                context_labels = []
                last_speaker = None

            if last_speaker is None or log_utter['speaker'] == last_speaker:
                context_utters += [transcript]  # cumulate context utters
                context_labels += [sa_label_list]

            last_speaker = log_utter['speaker']
            train_utters += [
                (transcript, context_utter_str, log_utter['speaker'],
                 sa_label_list, log_utter['utter_index'], context_label)
            ]
            # train_utters += [(transcript, context_utter_str, log_utter['speaker'], sa_label_list, log_utter['utter_index'], sa_label_list)]

    sys.stderr.write('Done\n')

    test_utters = []
    testset = dataset_walker.dataset_walker(args.testset,
                                            dataroot=args.dataroot,
                                            labels=True,
                                            translations=True)
    sys.stderr.write('Loading testing instances ... ')
    for call in testset:
        context_utters = []
        context_utter_str = '<PAD/>'
        context_labels = []
        context_label = ['INI_OPENING']
        last_speaker = None
        for (log_utter, translations, label_utter) in call:
            try:
                translation = data_helpers.tokenize_and_lower(
                    translations['translated'][0]['hyp'])
            except:
                translation = ''

            speech_act = label_utter['speech_act']
            sa_label_list = []
            for sa in speech_act:
                sa_label_list += [
                    '%s_%s' % (sa['act'], attr) for attr in sa['attributes']
                ]
            sa_label_list = sorted(set(sa_label_list))

            if last_speaker is not None and log_utter[
                    'speaker'] != last_speaker:
                if len(context_utters) > 0:
                    context_utter_str = ' <pause> '.join(context_utters)
                    context_label = context_labels[-1]
                else:
                    context_utter_str = ''
                    context_label = ['INI_OPENING']

                context_utters = []
                context_labels = []
                last_speaker = None

            if last_speaker is None or log_utter['speaker'] == last_speaker:
                context_utters += [translation]  # cumulate context utters
                context_labels += [sa_label_list]

            last_speaker = log_utter['speaker']

            test_utters += [
                (translation, context_utter_str, log_utter['speaker'],
                 sa_label_list, log_utter['utter_index'], context_label)
            ]
            # test_utters += [(translation, context_utter_str, log_utter['speaker'], sa_label_list, log_utter['utter_index'], sa_label_list)]

    # pprint(train_utters[:2])
    # pprint(test_utters[:2])

    # dump_corpus(train_utters, "dstc5_train.txt")
    # dump_corpus(test_utters, "dstc5_test.txt")

    # load parameters
    params = data_helpers.load_params("parameters/cnn.txt")
    pprint(params)

    # build vocabulary
    utters = [utter[0].split(' ') for utter in train_utters]
    ctx_utters = [utter[1].split(' ') for utter in train_utters]
    print("max context utter length: %d " %
          max([len(ctx_utter) for ctx_utter in ctx_utters]))
    max_sent_len = int(params['max_sent_len'])
    pad_utters = data_helpers.pad_sentences(utters, max_sent_len)
    pad_ctx_utters = data_helpers.pad_sentences(ctx_utters, max_sent_len)

    vocabulary, inv_vocabulary = data_helpers.build_vocab(pad_ctx_utters)
    print("vocabulary size: %d" % len(vocabulary))

    # build input
    train_inputs = data_helpers.build_input_data(pad_utters, vocabulary)
    train_ctx_inputs = data_helpers.build_input_data(pad_ctx_utters,
                                                     vocabulary)

    utters = [utter[0].split(' ') for utter in test_utters]
    ctx_utters = [utter[1].split(' ') for utter in test_utters]
    pad_utters = data_helpers.pad_sentences(utters, max_sent_len)
    pad_ctx_utters = data_helpers.pad_sentences(ctx_utters, max_sent_len)
    test_inputs = data_helpers.build_input_data(pad_utters, vocabulary)
    test_ctx_inputs = data_helpers.build_input_data(pad_ctx_utters, vocabulary)

    # build labels
    sa_train_labels = [utter[3] for utter in train_utters]
    sa_test_labels = [utter[3] for utter in test_utters]
    sa_train_ctx_labels = [utter[5] for utter in train_utters]
    sa_test_ctx_labels = [utter[5] for utter in test_utters]

    label_binarizer = preprocessing.MultiLabelBinarizer()
    label_binarizer.fit(sa_train_labels + sa_test_labels)

    train_labels = label_binarizer.transform(sa_train_labels)
    test_labels = label_binarizer.transform(sa_test_labels)
    train_ctx_labels = label_binarizer.transform(sa_train_ctx_labels)
    test_ctx_labels = label_binarizer.transform(sa_test_ctx_labels)

    # split speakers into two sets
    tourist_train_indices = [
        i for i, utter in enumerate(train_utters)
        if utter[2].lower() == 'tourist'
    ]
    guide_train_indices = [
        i for i, utter in enumerate(train_utters)
        if utter[2].lower() == 'guide'
    ]

    tourist_test_indices = [
        i for i, utter in enumerate(test_utters)
        if utter[2].lower() == 'tourist'
    ]
    guide_test_indices = [
        i for i, utter in enumerate(test_utters) if utter[2].lower() == 'guide'
    ]

    np.random.shuffle(tourist_train_indices)
    np.random.shuffle(guide_train_indices)

    tourist_train_inputs = train_inputs[tourist_train_indices]
    tourist_train_ctx_inputs = train_ctx_inputs[tourist_train_indices]
    tourist_train_labels = train_labels[tourist_train_indices]
    tourist_train_ctx_labels = train_ctx_labels[tourist_train_indices]

    guide_train_inputs = train_inputs[guide_train_indices]
    guide_train_ctx_inputs = train_ctx_inputs[guide_train_indices]
    guide_train_labels = train_labels[guide_train_indices]
    guide_train_ctx_labels = train_ctx_labels[guide_train_indices]

    tourist_test_inputs = test_inputs[tourist_test_indices]
    tourist_test_ctx_inputs = test_ctx_inputs[tourist_test_indices]
    tourist_test_labels = test_labels[tourist_test_indices]
    tourist_test_ctx_labels = test_ctx_labels[tourist_test_indices]

    guide_test_inputs = test_inputs[guide_test_indices]
    guide_test_ctx_inputs = test_ctx_inputs[guide_test_indices]
    guide_test_labels = test_labels[guide_test_indices]
    guide_test_ctx_labels = test_ctx_labels[guide_test_indices]

    # load pre-trained word embeddings
    embedding_dim = int(params['embedding_dim'])
    embedding_matrix = data_helpers.load_embedding(
        vocabulary, embedding_dim=embedding_dim, embedding=params['embedding'])

    run_slu_task(embedding_matrix, vocabulary, label_binarizer,
                 tourist_train_inputs, tourist_train_ctx_inputs,
                 tourist_train_labels, tourist_train_ctx_labels,
                 tourist_test_inputs, tourist_test_ctx_inputs,
                 tourist_test_labels, tourist_test_ctx_labels)

    run_slu_task(embedding_matrix, vocabulary, label_binarizer,
                 guide_train_inputs, guide_train_ctx_inputs,
                 guide_train_labels, guide_train_ctx_labels, guide_test_inputs,
                 guide_test_ctx_inputs, guide_test_labels,
                 guide_test_ctx_labels)

    print("")
def main(_):
    # Load data
    print("Loading data...")

    x_, y = data_helpers.build_train_data(FLAGS.label_file, FLAGS.train_file)
    train_int_to_vab, train_to_int = data_helpers.cret_dict(x_)
    # Save the mapping between words and their indices

    # Save the whole vocabulary to a file so it can be loaded at test time
    pickle.dump(train_int_to_vab, open('./vocab_index.pkl', 'wb'))

    train_ids = [[
        train_to_int.get(term, train_to_int['<UNK>']) for term in line
    ] for line in x_]
    x_ = data_helpers.pad_sentences(train_ids, 20)
    # Randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x_[shuffle_indices]
    y = np.array(y)
    y_shuffled = y[shuffle_indices]
    folids_list = data_helpers.cross_validation_split_for_smp(
        x_shuffled, y_shuffled)
    for i in range(10):

        if not os.path.exists('save_model/' + str(i) + '/'):
            os.makedirs(os.path.join('save_model', str(i)))
        else:
            continue

    for i in range(10):
        best_acc = 0.0
        print(i)
        print('##################')
        x_train, y_train, x_dev, y_dev = folids_list[i]

        y_train = np_utils.to_categorical(y_train)
        y_dev = np_utils.to_categorical(y_dev)

        # ==================================================

        with tf.Graph().as_default():
            session_conf = tf.ConfigProto(
                allow_soft_placement=FLAGS.allow_soft_placement,
                log_device_placement=FLAGS.log_device_placement)
            sess = tf.Session(config=session_conf)
            with sess.as_default():
                rnn = TextRNN(sequence_length=x_train.shape[1],
                              num_classes=y_train.shape[1],
                              vocab_size=len(train_int_to_vab),
                              batch_size=FLAGS.batch_size,
                              embedding_size=FLAGS.embedding_dim,
                              hidden_size=FLAGS.hidden_size,
                              num_layers=FLAGS.num_layers
                              #word_embedding_matrix=embeding_matric
                              )

                # Define Training procedure
                global_step = tf.Variable(0,
                                          name="global_step",
                                          trainable=False)
                optimizer = tf.train.AdamOptimizer(1e-3)
                grads_and_vars = optimizer.compute_gradients(rnn.loss)
                train_op = optimizer.apply_gradients(grads_and_vars,
                                                     global_step=global_step)

                saver = tf.train.Saver(tf.global_variables(),
                                       max_to_keep=FLAGS.num_checkpoints)

                # Initialize all variables
                sess.run(tf.global_variables_initializer())

                def train_step(x_batch, y_batch):
                    """
                    A single training step
                    """
                    feed_dict = {
                        rnn.input_x: x_batch,
                        rnn.input_y: y_batch,
                        rnn.dropout_keep_prob: FLAGS.dropout_keep_prob
                    }
                    _, step, loss, accuracy = sess.run(
                        [train_op, global_step, rnn.loss, rnn.accuracy],
                        feed_dict)
                    return step, loss, accuracy

                def dev_step(x_batch, y_batch):
                    """
                    Evaluates model on a dev set
                    """
                    feed_dict = {
                        rnn.input_x: x_batch,
                        rnn.input_y: y_batch,
                        rnn.dropout_keep_prob: 1.0
                    }
                    step, loss, accuracy = sess.run(
                        [global_step, rnn.loss, rnn.accuracy], feed_dict)
                    time_str = datetime.datetime.now().isoformat()
                    print('dev')
                    print("{}: step {}, loss {:g}, acc {:g}".format(
                        time_str, step, loss, accuracy))
                    return accuracy

                def save_best_model(sess, path):
                    path = saver.save(sess, path)

                for epoch in range(FLAGS.num_epochs):
                    print('epoch', epoch)
                    # Generate batches
                    for batch_i, (x_batch, y_batch) in enumerate(
                            data_helpers.get_batches(y_train, x_train,
                                                     FLAGS.batch_size)):

                        step, train_loss, train_accuracy = train_step(
                            x_batch, y_batch)
                        #print('step',step)
                        if batch_i % FLAGS.evaluate_every == 0:
                            time_str = datetime.datetime.now().isoformat()
                            print("{}: step {}, loss {:g}, acc {:g}".format(
                                time_str, step, train_loss, train_accuracy))

                        #=====================
                    accuracy = dev_step(x_dev, y_dev)
                    if accuracy > best_acc:
                        best_acc = accuracy
                        print('save_model/' + str(i) + '/best_model.ckpt')
                        save_best_model(
                            sess, 'save_model/' + str(i) + '/best_model.ckpt')
Example #22
# Define Parameters
tf.flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)")
tf.flags.DEFINE_string("checkpoint_dir", "", "Checkpoint directory from training run")
tf.flags.DEFINE_string("sentence", "the movie was bad", "sentence to classify")

FLAGS = tf.flags.FLAGS

#######################################################################################################################
# process the raw sentence
new_review = data_helpers.clean_senetnce(FLAGS.sentence)

# load vocabulary
sentences, _ = data_helpers.load_data_and_labels()
sequence_length = max(len(x) for x in sentences)
sentences_padded = data_helpers.pad_sentences(sentences)
vocabulary, vocabulary_inv = data_helpers.build_vocab(sentences_padded)

num_padding = sequence_length - len(new_review)
new_sentence = new_review + ["<PAD/>"] * num_padding

# convert sentence to input matrix
array = []
for word in new_sentence:
    try:
        word_vector = vocabulary[word]
    except KeyError:
        word_vector = vocabulary["<PAD/>"]
    array.append(word_vector)
x = np.array([array])
Example #23
def main(argv):
    parser = argparse.ArgumentParser(
        description='CNN baseline for DSTC5 SAP Task')
    parser.add_argument('--trainset',
                        dest='trainset',
                        action='store',
                        metavar='TRAINSET',
                        required=True,
                        help='')
    parser.add_argument('--devset',
                        dest='devset',
                        action='store',
                        metavar='DEVSET',
                        required=True,
                        help='')
    parser.add_argument('--testset',
                        dest='testset',
                        action='store',
                        metavar='TESTSET',
                        required=True,
                        help='')
    parser.add_argument('--dataroot',
                        dest='dataroot',
                        action='store',
                        required=True,
                        metavar='PATH',
                        help='')

    args = parser.parse_args()

    # load parameters
    params = data_helpers.load_params("parameters/cnn.txt")
    pprint(params)

    trainset = dataset_walker.dataset_walker(args.trainset,
                                             dataroot=args.dataroot,
                                             labels=True,
                                             translations=True)
    devset = dataset_walker.dataset_walker(args.devset,
                                           dataroot=args.dataroot,
                                           labels=True,
                                           translations=True)
    testset = dataset_walker.dataset_walker(args.testset,
                                            dataroot=args.dataroot,
                                            labels=True,
                                            translations=True)
    train_utters, dev_utters, test_utters = data_helpers.load_dstc5_dataset(
        trainset, devset, testset)

    train_utters += dev_utters

    # pprint(train_utters[:2])
    # pprint(test_utters[:2])

    # dump_corpus(train_utters, "dstc5_train.txt")
    # dump_corpus(test_utters, "dstc5_test.txt")

    # build vocabulary
    utters = [[char for char in utter[0]] for utter in train_utters]
    max_sent_len = int(params['max_sent_len'])
    pad_utters = data_helpers.pad_sentences(utters, max_sent_len)

    vocabulary, inv_vocabulary = data_helpers.build_vocab(pad_utters)
    print("vocabulary size: %d" % len(vocabulary))

    # build input
    train_inputs = data_helpers.build_input_data(pad_utters, vocabulary)

    utters = [[char for char in utter[0]] for utter in test_utters]
    pad_utters = data_helpers.pad_sentences(utters, max_sent_len)
    test_inputs = data_helpers.build_input_data(pad_utters, vocabulary)

    # build labels
    sa_train_labels = [utter[2] for utter in train_utters]
    sa_test_labels = [utter[2] for utter in test_utters]
    label_binarizer = preprocessing.MultiLabelBinarizer()
    label_binarizer.fit(sa_train_labels + sa_test_labels)

    train_labels = label_binarizer.transform(sa_train_labels)
    test_labels = label_binarizer.transform(sa_test_labels)

    # split speakers into two sets
    tourist_train_indices = [
        i for i, utter in enumerate(train_utters)
        if utter[1].lower() == 'tourist'
    ]
    guide_train_indices = [
        i for i, utter in enumerate(train_utters)
        if utter[1].lower() == 'guide'
    ]

    tourist_test_indices = [
        i for i, utter in enumerate(test_utters)
        if utter[1].lower() == 'tourist'
    ]
    guide_test_indices = [
        i for i, utter in enumerate(test_utters) if utter[1].lower() == 'guide'
    ]

    np.random.shuffle(tourist_train_indices)
    np.random.shuffle(guide_train_indices)
    # np.random.shuffle(tourist_test_indices)
    # np.random.shuffle(guide_test_indices)

    tourist_train_inputs = train_inputs[tourist_train_indices]
    tourist_train_labels = train_labels[tourist_train_indices]

    guide_train_inputs = train_inputs[guide_train_indices]
    guide_train_labels = train_labels[guide_train_indices]

    tourist_test_inputs = test_inputs[tourist_test_indices]
    tourist_test_labels = test_labels[tourist_test_indices]

    guide_test_inputs = test_inputs[guide_test_indices]
    guide_test_labels = test_labels[guide_test_indices]

    # load pre-trained word embeddings
    embedding_dim = int(params['embedding_dim'])
    embedding_matrix = data_helpers.load_embedding(
        vocabulary, embedding_dim=embedding_dim, embedding=params['embedding'])

    run_slu_task(embedding_matrix, vocabulary, label_binarizer,
                 tourist_train_inputs, tourist_train_labels,
                 tourist_test_inputs, tourist_test_labels)

    run_slu_task(embedding_matrix, vocabulary, label_binarizer,
                 guide_train_inputs, guide_train_labels, guide_test_inputs,
                 guide_test_labels)

    print("")
Ejemplo n.º 24
0
def main(argv):
    parser = argparse.ArgumentParser(
        description='CNN baseline for DSTC5 SAP Task')
    parser.add_argument('--trainset',
                        dest='trainset',
                        action='store',
                        metavar='TRAINSET',
                        required=True,
                        help='name of the training dataset')
    parser.add_argument('--devset',
                        dest='devset',
                        action='store',
                        metavar='DEVSET',
                        required=True,
                        help='name of the development dataset')
    parser.add_argument('--testset',
                        dest='testset',
                        action='store',
                        metavar='TESTSET',
                        required=True,
                        help='name of the test dataset')
    parser.add_argument('--dataroot',
                        dest='dataroot',
                        action='store',
                        required=True,
                        metavar='PATH',
                        help='path to the data root directory')

    args = parser.parse_args()

    # load parameters
    params = data_helpers.load_params("parameters/cnn.txt")
    pprint(params)

    trainset = dataset_walker.dataset_walker(args.trainset,
                                             dataroot=args.dataroot,
                                             labels=True,
                                             translations=True)
    devset = dataset_walker.dataset_walker(args.devset,
                                           dataroot=args.dataroot,
                                           labels=True,
                                           translations=True)
    testset = dataset_walker.dataset_walker(args.testset,
                                            dataroot=args.dataroot,
                                            labels=True,
                                            translations=True)
    train_utters, dev_utters, test_utters = data_helpers.load_dstc5_dataset_multitask(
        trainset, devset, testset)

    train_utters += dev_utters

    context_case = 1
    # TODO: build the previous-labels context here!
    # 1) the previous N speech acts (without distinguishing the speaker)
    # 2) all speech acts of the other speaker's utterances in the previous turn (n of them)
    if context_case == 1:
        # case 1: previous N speech acts, speaker-agnostic (see the sketch after this example)
        pass
    else:
        # case 2: speech acts from the other speaker's previous turn
        pass

    # pprint(train_utters[:2])
    # pprint(test_utters[:2])

    # dump_corpus(train_utters, "dstc5_train.txt")
    # dump_corpus(test_utters, "dstc5_test.txt")

    # build vocabulary
    utters = [[char for char in utter[0]] for utter in train_utters]
    max_sent_len = int(params['max_sent_len'])
    pad_utters = data_helpers.pad_sentences(utters, max_sent_len)

    vocabulary, inv_vocabulary = data_helpers.build_vocab(pad_utters)
    print("vocabulary size: %d" % len(vocabulary))

    # build input
    train_inputs = data_helpers.build_input_data(pad_utters, vocabulary)

    utters = [[char for char in utter[0]] for utter in test_utters]
    pad_utters = data_helpers.pad_sentences(utters, max_sent_len)
    test_inputs = data_helpers.build_input_data(pad_utters, vocabulary)

    # build labels
    train_labels_category = [utter[3] for utter in train_utters]
    test_labels_category = [utter[3] for utter in test_utters]
    train_labels_attr = [utter[4] for utter in train_utters]
    test_labels_attr = [utter[4] for utter in test_utters]
    train_labels_sa = [utter[5] for utter in train_utters]
    test_labels_sa = [utter[5] for utter in test_utters]

    label_binarizer_category = preprocessing.MultiLabelBinarizer()
    label_binarizer_category.fit(train_labels_category + test_labels_category)

    label_binarizer_attr = preprocessing.MultiLabelBinarizer()
    label_binarizer_attr.fit(train_labels_attr + test_labels_attr)

    label_binarizer_sa = preprocessing.MultiLabelBinarizer()
    label_binarizer_sa.fit(train_labels_sa + test_labels_sa)

    train_labels_category = label_binarizer_category.transform(
        train_labels_category)
    test_labels_category = label_binarizer_category.transform(
        test_labels_category)
    train_labels_attr = label_binarizer_attr.transform(train_labels_attr)
    test_labels_attr = label_binarizer_attr.transform(test_labels_attr)
    train_labels_sa = label_binarizer_sa.transform(train_labels_sa)
    test_labels_sa = label_binarizer_sa.transform(test_labels_sa)

    # split speakers into two sets
    tourist_train_indices = [
        i for i, utter in enumerate(train_utters)
        if utter[1].lower() == 'tourist'
    ]
    guide_train_indices = [
        i for i, utter in enumerate(train_utters)
        if utter[1].lower() == 'guide'
    ]

    tourist_test_indices = [
        i for i, utter in enumerate(test_utters)
        if utter[1].lower() == 'tourist'
    ]
    guide_test_indices = [
        i for i, utter in enumerate(test_utters) if utter[1].lower() == 'guide'
    ]

    np.random.shuffle(tourist_train_indices)
    np.random.shuffle(guide_train_indices)
    # np.random.shuffle(tourist_test_indices)
    # np.random.shuffle(guide_test_indices)

    tourist_train_inputs = train_inputs[tourist_train_indices]
    tourist_train_labels_category = train_labels_category[
        tourist_train_indices]
    tourist_train_labels_attr = train_labels_attr[tourist_train_indices]
    tourist_train_labels_sa = train_labels_sa[tourist_train_indices]
    tourist_train_labels = (tourist_train_labels_category,
                            tourist_train_labels_attr, tourist_train_labels_sa)

    guide_train_inputs = train_inputs[guide_train_indices]
    guide_train_labels_category = train_labels_category[guide_train_indices]
    guide_train_labels_attr = train_labels_attr[guide_train_indices]
    guide_train_labels_sa = train_labels_sa[guide_train_indices]
    guide_train_labels = (guide_train_labels_category, guide_train_labels_attr,
                          guide_train_labels_sa)

    tourist_test_inputs = test_inputs[tourist_test_indices]
    tourist_test_labels_category = test_labels_category[tourist_test_indices]
    tourist_test_labels_attr = test_labels_attr[tourist_test_indices]
    tourist_test_labels_sa = test_labels_sa[tourist_test_indices]
    tourist_test_labels = (tourist_test_labels_category,
                           tourist_test_labels_attr, tourist_test_labels_sa)

    guide_test_inputs = test_inputs[guide_test_indices]
    guide_test_labels_category = test_labels_category[guide_test_indices]
    guide_test_labels_attr = test_labels_attr[guide_test_indices]
    guide_test_labels_sa = test_labels_sa[guide_test_indices]
    guide_test_labels = (guide_test_labels_category, guide_test_labels_attr,
                         guide_test_labels_sa)

    # load pre-trained word embeddings
    embedding_dim = int(params['embedding_dim'])
    embedding_matrix = data_helpers.load_embedding(
        vocabulary, embedding_dim=embedding_dim, embedding=params['embedding'])

    run_slu_task(embedding_matrix, vocabulary, label_binarizer_sa,
                 tourist_train_inputs, tourist_train_labels,
                 tourist_test_inputs, tourist_test_labels)

    run_slu_task(embedding_matrix, vocabulary, label_binarizer_sa,
                 guide_train_inputs, guide_train_labels, guide_test_inputs,
                 guide_test_labels)
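The TODO block near the top of this example describes two context schemes but leaves both branches empty. A rough sketch of case 1 (the speech-act labels of the previous N utterances, regardless of speaker), assuming the utterance list is ordered by dialogue position and that the speech-act labels sit at index 5 as above; the helper name, the window size, and the fact that dialogue boundaries are ignored are all simplifications for illustration:

def build_previous_label_context(utters, n_prev=3, label_index=5):
    """For each utterance, collect the speech-act labels of the n_prev
    preceding utterances, without distinguishing the speaker."""
    contexts = []
    for i in range(len(utters)):
        previous = utters[max(0, i - n_prev):i]
        contexts.append([label for utter in previous for label in utter[label_index]])
    return contexts

# e.g. train_label_contexts = build_previous_label_context(train_utters)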
Ejemplo n.º 25
0
FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()  # private tf.flags API; available in the older TensorFlow 1.x used here
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

x_raw = data_helpers.load_test_data(
    '/Users/Winnerineast/Documents/haodaifu/NewData/tobetrained.csv')

# Map data into vocabulary
vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab")
vocabulary, vocabulary_inv, max_length = data_helpers.restore_vocabulary(
    vocab_path)
sentences_padded, tmp_length = data_helpers.pad_sentences(x_raw, max_length)
x_test, y_test = data_helpers.build_input_data(sentences_padded, None,
                                               vocabulary)

print("\nEvaluating...\n")

# Evaluation
# ==================================================
checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
graph = tf.Graph()
with graph.as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,
        log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
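        # The original snippet is cut off here. What follows is a hedged
        # sketch of the usual continuation only: the tensor names and
        # FLAGS.batch_size below are assumptions, not taken from the original code.
        saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
        saver.restore(sess, checkpoint_file)

        # Assumed placeholder/output names from the training graph.
        input_x = graph.get_operation_by_name("input_x").outputs[0]
        dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]
        predictions = graph.get_operation_by_name("output/predictions").outputs[0]

        # Predict in batches and collect the results.
        all_predictions = []
        for start in range(0, len(x_test), FLAGS.batch_size):
            x_batch = x_test[start:start + FLAGS.batch_size]
            batch_predictions = sess.run(predictions,
                                         {input_x: x_batch, dropout_keep_prob: 1.0})
            all_predictions.extend(batch_predictions)
        print("Total number of test examples: {}".format(len(all_predictions)))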