Example #1
def get_data(keys=[], name="cross_domain"):
    if name == "cross_domain":
        vocab = ...  # vocabulary for this setting (value omitted in the original snippet)
    elif name == "mix_domain":
        vocab = ...  # vocabulary for this setting (value omitted in the original snippet)
    else:
        raise ValueError("Invalid dataset name")

    trustworthy_reviews   = get_reviews(review_type="trustworthy",   keys=keys)
    untrustworthy_reviews = get_reviews(review_type="untrustworthy", keys=keys)

    reviews = trustworthy_reviews + untrustworthy_reviews
    data = data_helpers.build_input_data(reviews, vocab)

    # generate labels
    labels_trustworthy   = [[1, 0] for _ in range(len(trustworthy_reviews))]
    labels_untrustworthy = [[0, 1] for _ in range(len(untrustworthy_reviews))]

    labels = np.array(labels_trustworthy + labels_untrustworthy)

    # data_helpers.build_vocab(TRUSTWORTHY_REVIEWS + UNTRUSTWORTHY_REVIEWS, vocab_size=30001)

    print("data len: ", data.shape[0])
    print("labels len: ", labels.shape[0])
        
    return data, labels
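The data_helpers functions that every example on this page relies on (pad_sentences, build_vocab, build_input_data) are not shown, and their exact signatures vary from snippet to snippet (some variants also take labels or a maximum length, or return the padded length as well). A minimal sketch of what such helpers commonly look like, assuming "<PAD/>" padding and index 0 for out-of-vocabulary words; this is an illustration of the idea, not the actual library code:

from collections import Counter

import numpy as np


def pad_sentences(sentences, sequence_length=None, padding_word="<PAD/>"):
    # Pad (or truncate) every tokenized sentence to a common length.
    if sequence_length is None:
        sequence_length = max(len(sent) for sent in sentences)
    return [sent[:sequence_length] + [padding_word] * max(0, sequence_length - len(sent))
            for sent in sentences]


def build_vocab(sentences):
    # Index words by frequency: vocabulary maps word -> index,
    # vocabulary_inv maps index -> word.
    counts = Counter(word for sent in sentences for word in sent)
    vocabulary_inv = [word for word, _ in counts.most_common()]
    vocabulary = {word: i for i, word in enumerate(vocabulary_inv)}
    return vocabulary, vocabulary_inv


def build_input_data(sentences, vocabulary):
    # Replace each word with its vocabulary index (0 if out of vocabulary).
    return np.array([[vocabulary.get(word, 0) for word in sent] for sent in sentences])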
Example #2
def prepare_sentences(sentences, vocabulary, max_length):
    print(sentences)
    sentences_processed = process_sentences(sentences)
    sentences_padded, _ = pad_sentences(sentences_processed,
                                        sequence_length=max_length)
    x, _ = build_input_data(sentences_padded, 0, vocabulary)
    return x
Example #3
def preencode(df):
    sentences = make_text_matrix(df)
    s = [x.split() for x in sentences['text'].values]
    l = sentences['target'].values
    sentences_padded = pad_sentences(s)
    vocabulary, vocabulary_inv = build_vocab(sentences_padded)
    x, y = build_input_data(sentences_padded, l, vocabulary)
    return x, y, vocabulary, vocabulary_inv
Example #4
def preprocess(model):
    dict_sentences = {}
    reverse_dict = {}
    match_dictionary = {}
    pair_list = []
    import sys
    i = 0
    k = 0
    maxlen = 0
    # this reads in one line at a time from stdin
    for line in sys.stdin:
        i+=1
        tokens = line.split("\t")
        sent1 = tokens[0]
        sent2 = tokens[1]

        if clean_sent_cond(sent1) or clean_sent_cond(sent2):
            continue
        else:
            k += 1

        if sent1 not in dict_sentences:
            dict_sentences[sent1] = len(dict_sentences) + 1
        if sent2 not in dict_sentences:
            dict_sentences[sent2] = len(dict_sentences) + 1
        index_1 = dict_sentences[sent1]
        index_2 = dict_sentences[sent2]

        if index_1 not in match_dictionary:
            match_dictionary[index_1] = []
        if index_2 not in match_dictionary:
            match_dictionary[index_2] = []
        match_dictionary[index_1].append(index_2)
        match_dictionary[index_2].append(index_1)
        pair_list.append((index_1, index_2))

        if i % 10000 == 0:
            print(str(k) + "/" + str(i))
        if k == 500000:
            break

    i = 0
    for entry in dict_sentences:
        simple_sent1 = list(filter(lambda x: len(x) > 1, data_helpers.clean_str(entry).split(" ")))
        sent1 = data_helpers.build_input_data(
            data_helpers.pad_sentences([simple_sent1], 40, padding_word="<PAD/>"),
            model.vocab)
        reverse_dict[dict_sentences[entry]] = sent1
        if i % 10000 == 0:
            print(i)
        i += 1

    random.shuffle(pair_list)
    pickle.dump(reverse_dict, open("sentences_small_x", "wb"), protocol=pickle.HIGHEST_PROTOCOL)
    print("writing sentences " + str(len(reverse_dict)))
    pickle.dump(match_dictionary, open("pairs_index_small_x", "wb"), protocol=pickle.HIGHEST_PROTOCOL)
    print("writing map " + str(len(match_dictionary)))
    pickle.dump(pair_list, open("pairs_list_small_x", "wb"), protocol=pickle.HIGHEST_PROTOCOL)
    print("pairs " + str(len(pair_list)))
Example #5
    def __init__(self,
                 positive_file=real_T_file_,
                 negative_file=real_U_file_,
                 fold=FOLD,
                 is_test_data=False):  # change real_U_file to fake_U_file
        super(Data, self).__init__()
        self.fold = fold

        ###### BEGIN #####
        # load data
        trustworthy_reviews_for_training, trustworthy_reviews_for_testing, untrustworthy_reviews_for_training, untrustworthy_reviews_for_testing = load_data(
            fold=fold)
        train = trustworthy_reviews_for_training + untrustworthy_reviews_for_training
        test = trustworthy_reviews_for_testing + untrustworthy_reviews_for_testing

        # generate labels
        train_labels1 = [[1, 0]
                         for _ in range(len(trustworthy_reviews_for_training))]
        train_labels0 = [
            [0, 1] for _ in range(len(untrustworthy_reviews_for_training))
        ]
        test_labels1 = [[1, 0]
                        for _ in range(len(trustworthy_reviews_for_testing))]
        test_labels0 = [[0, 1]
                        for _ in range(len(untrustworthy_reviews_for_testing))]

        train_labels = np.array(train_labels1 + train_labels0)
        test_labels = np.array(test_labels1 + test_labels0)

        # convert word2idx
        vocabulary, vocabulary_inv = data_helpers.build_vocab(train + test,
                                                              vocab_size=30001)
        train = torch.as_tensor(
            data_helpers.build_input_data(train, vocabulary))
        test = torch.as_tensor(data_helpers.build_input_data(test, vocabulary))

        assert train.shape[0] == train_labels.shape[0]
        assert test.shape[0] == test_labels.shape[0]

        if not is_test_data:
            self.data = train
            self.labels = train_labels
        else:
            self.data = test
            self.labels = test_labels
Example #6
    def preprocess(self, line):
        if line is None:
            return [[0]]

        if self.which_data == 'amazon':

            line = html.unescape(line)
            line = ' '.join(tkn(multi_occur_regex.sub('', line))).lower()
            sentences = [(tdh.clean_str(line).split(" "))]
            return (tdh.build_input_data(sentences, self.vocabulary))

        elif self.which_data == 'eurlex':
            pass
        elif self.which_data == 'tweets':
            line = ' '.join(tweet_tokenizer.tokenize(line))
            sentences = [(tdh.clean_str(line).split(" "))]
            return (tdh.build_input_data(sentences, self.vocabulary))

        return [[0]]
Example #7
    def __init__(self, keys=[]):  # change real_U_file to fake_U_file
        super(ReviewDataset, self).__init__()

        # convert word2idx
        vocabulary = get_vocab()

        trustworthy_reviews = get_reviews(review_type="trustworthy", keys=keys)
        untrustworthy_reviews = get_reviews(review_type="untrustworthy",
                                            keys=keys)

        reviews = trustworthy_reviews + untrustworthy_reviews
        self.data = torch.as_tensor(
            data_helpers.build_input_data(reviews, vocabulary))

        # generate labels
        labels_trustworthy = [[1, 0] for _ in range(len(trustworthy_reviews))]
        labels_untrustworthy = [[0, 1]
                                for _ in range(len(untrustworthy_reviews))]

        self.labels = np.array(labels_trustworthy + labels_untrustworthy)

        print("data len: ", self.data.shape[0])
        print("labels len: ", self.labels.shape[0])
Example #8
    def __init__(self,
                 train_keys=[],
                 test_keys=[]):  # change real_U_file to fake_U_file
        super(MixedDomainDataset, self).__init__()

        trustworthy_reviews, untrustworthy_reviews = load_data()
        reviews = trustworthy_reviews + untrustworthy_reviews

        # generate labels
        labels_trustworthy = [[1, 0] for _ in range(len(trustworthy_reviews))]
        labels_untrustworthy = [[0, 1]
                                for _ in range(len(untrustworthy_reviews))]

        self.labels = np.array(labels_trustworthy + labels_untrustworthy)

        # convert word2idx
        vocabulary, vocabulary_inv = data_helpers.build_vocab(
            trustworthy_reviews + untrustworthy_reviews, vocab_size=30001)
        self.data = torch.as_tensor(
            data_helpers.build_input_data(reviews, vocabulary))

        print("data len: ", self.data.shape[0])
        print("labels len: ", self.labels.shape[0])
Example #9
# Data Preparation
# ==================================================

# Load data
print("Loading data...")
x_text, y = data_helpers.load_data_and_labels(FLAGS.positive_data_file,
                                              FLAGS.negative_data_file)
x_eval = data_helpers.load_test_data(FLAGS.test_data_file)

# Pad sentences
sentences_padded_all, max_length = data_helpers.pad_sentences(x_text + x_eval)
sentences_padded, max_length = data_helpers.pad_sentences(x_text, max_length)

# Build vocabulary
vocabulary, vocabulary_inv = data_helpers.build_vocab(sentences_padded_all)
x, y = data_helpers.build_input_data(sentences_padded, y, vocabulary)

# Randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# Split train/test set
dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]
print("Vocabulary Size: {:d}".format(len(vocabulary)))
print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))

# Training
Example #10
    'text': ' '.join(row[4]),
    'catgy': row[5]
} for row in test_semeval.itertuples()]
val_tweet_list = [{
    'text': ' '.join(row[4]),
    'catgy': row[5]
} for row in val_semeval.itertuples()]

trn_sents, Y_trn = tdh.load_data_and_labels(train_tweet_list, num_labels)
tst_sents, Y_tst = tdh.load_data_and_labels(test_tweet_list, num_labels)
val_sents, Y_val = tdh.load_data_and_labels(val_tweet_list, num_labels)

embedding_file = '/home/cse/phd/csz178057/scratch/squad/data/glove.6B.300d.txt'
vocabs = tdh.get_vocabs_embeddings(trn_sents, embedding_file, num_features=300)
labels_inv = list(labels_map.emoji_desc)
vocabs['labels_inv'] = labels_inv

print('\n'.join([
    '{},{}'.format(x[0], x[1]) for x in vocabs['word_counts'].most_common(None)
]),
      file=open(word_count_txt_file, 'w'))

X_trn = tdh.build_input_data(trn_sents, vocabs['vocabulary'])
X_tst = tdh.build_input_data(tst_sents, vocabs['vocabulary'])
X_val = tdh.build_input_data(val_sents, vocabs['vocabulary'])

pickle.dump({'x': X_trn, 'y': Y_trn}, open(train_output_file, 'wb'))
pickle.dump({'x': X_tst, 'y': Y_tst}, open(test_output_file, 'wb'))
pickle.dump({'x': X_val, 'y': Y_val}, open(val_output_file, 'wb'))
pickle.dump(vocabs, open(vocab_output_file, 'wb'))
Example #11
def main(argv):
    parser = argparse.ArgumentParser(
        description='CNN baseline for DSTC5 SAP Task')
    parser.add_argument('--trainset',
                        dest='trainset',
                        action='store',
                        metavar='TRAINSET',
                        required=True,
                        help='')
    parser.add_argument('--testset',
                        dest='testset',
                        action='store',
                        metavar='TESTSET',
                        required=True,
                        help='')
    parser.add_argument('--dataroot',
                        dest='dataroot',
                        action='store',
                        required=True,
                        metavar='PATH',
                        help='')

    args = parser.parse_args()

    # load parameters
    params = data_helpers.load_params("parameters/cnn.txt")
    pprint(params)
    ctx_len = int(params['context_length'])

    train_utters = []
    trainset = dataset_walker.dataset_walker(args.trainset,
                                             dataroot=args.dataroot,
                                             labels=True,
                                             translations=True)
    sys.stderr.write('Loading training instances ... ')
    transcript_contexts = []
    for call in trainset:
        for i, (log_utter, translations, label_utter) in enumerate(call):
            transcript = data_helpers.tokenize_and_lower(
                log_utter['transcript'])
            transcript_contexts += [transcript]

            speech_act = label_utter['speech_act']
            sa_label_list = []
            for sa in speech_act:
                sa_label_list += [
                    '%s_%s' % (sa['act'], attr) for attr in sa['attributes']
                ]
            sa_label_list = sorted(set(sa_label_list))
            # train_utters += [(transcript_contexts[max(0, i+1-ctx_len):i+1], log_utter['speaker'], sa_label_list, log_utter['utter_index'])]
            train_utters += [(transcript, log_utter['speaker'], sa_label_list,
                              log_utter['utter_index'])]

    sys.stderr.write('Done\n')

    test_utters = []
    testset = dataset_walker.dataset_walker(args.testset,
                                            dataroot=args.dataroot,
                                            labels=True,
                                            translations=True)
    sys.stderr.write('Loading testing instances ... ')
    transcript_contexts = []
    for call in testset:
        for i, (log_utter, translations, label_utter) in enumerate(call):
            try:
                translation = data_helpers.tokenize_and_lower(
                    translations['translated'][0]['hyp'])
            except Exception:
                translation = ''
            transcript_contexts += [translation]

            speech_act = label_utter['speech_act']
            sa_label_list = []
            for sa in speech_act:
                sa_label_list += [
                    '%s_%s' % (sa['act'], attr) for attr in sa['attributes']
                ]
            sa_label_list = sorted(set(sa_label_list))
            # test_utters += [(transcript_contexts[max(0, i+1-ctx_len):i+1], log_utter['speaker'], sa_label_list, log_utter['utter_index'])]
            test_utters += [(translation, log_utter['speaker'], sa_label_list,
                             log_utter['utter_index'])]

    # pprint(train_utters[:2])
    # pprint(test_utters[:2])

    # dump_corpus(train_utters, "dstc5_train.txt")
    # dump_corpus(test_utters, "dstc5_test.txt")

    # build vocabulary
    utters = [utter[0].split(' ') for utter in train_utters]
    max_sent_len = int(params['max_sent_len'])
    pad_utters = data_helpers.pad_sentences(utters, max_sent_len)

    vocabulary, inv_vocabulary = data_helpers.build_vocab(pad_utters)
    print("vocabulary size: %d" % len(vocabulary))

    # build input
    train_inputs = data_helpers.build_input_data(pad_utters, vocabulary)

    utters = [utter[0].split(' ') for utter in test_utters]
    pad_utters = data_helpers.pad_sentences(utters, max_sent_len)
    test_inputs = data_helpers.build_input_data(pad_utters, vocabulary)

    # make windowed input data as context
    train_inputs = data_helpers.build_windowed_input(train_inputs, ctx_len)
    test_inputs = data_helpers.build_windowed_input(test_inputs, ctx_len)

    # build labels
    sa_train_labels = [utter[2] for utter in train_utters]
    sa_test_labels = [utter[2] for utter in test_utters]
    label_binarizer = preprocessing.MultiLabelBinarizer()
    label_binarizer.fit(sa_train_labels + sa_test_labels)

    train_labels = label_binarizer.transform(sa_train_labels)
    test_labels = label_binarizer.transform(sa_test_labels)

    # split speakers into two sets
    tourist_train_indices = [
        i for i, utter in enumerate(train_utters)
        if utter[1].lower() == 'tourist'
    ]
    guide_train_indices = [
        i for i, utter in enumerate(train_utters)
        if utter[1].lower() == 'guide'
    ]

    tourist_test_indices = [
        i for i, utter in enumerate(test_utters)
        if utter[1].lower() == 'tourist'
    ]
    guide_test_indices = [
        i for i, utter in enumerate(test_utters) if utter[1].lower() == 'guide'
    ]

    np.random.shuffle(tourist_train_indices)
    np.random.shuffle(guide_train_indices)
    # np.random.shuffle(tourist_test_indices)
    # np.random.shuffle(guide_test_indices)

    tourist_train_inputs = train_inputs[tourist_train_indices]
    tourist_train_labels = train_labels[tourist_train_indices]

    guide_train_inputs = train_inputs[guide_train_indices]
    guide_train_labels = train_labels[guide_train_indices]

    tourist_test_inputs = test_inputs[tourist_test_indices]
    tourist_test_labels = test_labels[tourist_test_indices]

    guide_test_inputs = test_inputs[guide_test_indices]
    guide_test_labels = test_labels[guide_test_indices]

    # load pre-trained word embeddings
    embedding_dim = int(params['embedding_dim'])
    embedding_matrix = data_helpers.load_embedding(
        vocabulary, embedding_dim=embedding_dim, embedding=params['embedding'])

    run_slu_sequence_task(embedding_matrix, vocabulary, label_binarizer,
                          tourist_train_inputs, tourist_train_labels,
                          tourist_test_inputs, tourist_test_labels)

    run_slu_sequence_task(embedding_matrix, vocabulary, label_binarizer,
                          guide_train_inputs, guide_train_labels,
                          guide_test_inputs, guide_test_labels)

    print("")
Example #12
def main(argv):
    parser = argparse.ArgumentParser(description='CNN baseline for DSTC5 SAP Task')
    parser.add_argument('--trainset', dest='trainset', action='store', metavar='TRAINSET', required=True, help='')
    parser.add_argument('--testset', dest='testset', action='store', metavar='TESTSET', required=True, help='')
    parser.add_argument('--dataroot', dest='dataroot', action='store', required=True, metavar='PATH',  help='')
    parser.add_argument('--roletype', dest='roletype', action='store', choices=['guide',  'tourist'], required=True,  help='speaker')

    args = parser.parse_args()
    threshold_predictor = None

    train_utters = []
    trainset = dataset_walker.dataset_walker(args.trainset, dataroot=args.dataroot, labels=True, translations=True)
    sys.stderr.write('Loading training instances ... ')
    for call in trainset:
        for (log_utter, translations, label_utter) in call:
            if log_utter['speaker'].lower() != args.roletype:
                continue
            transcript = data_helpers.tokenize_and_lower(log_utter['transcript'])

            speech_act = label_utter['speech_act']
            sa_label_list = []
            for sa in speech_act:
                sa_label_list += ['%s_%s' % (sa['act'], attr) for attr in sa['attributes']]
            sa_label_list = sorted(set(sa_label_list))
            train_utters += [(transcript, log_utter['speaker'], sa_label_list)]
    sys.stderr.write('Done\n')

    test_utters = []
    testset = dataset_walker.dataset_walker(args.testset, dataroot=args.dataroot, labels=True, translations=True)
    sys.stderr.write('Loading testing instances ... ')
    for call in testset:
        for (log_utter, translations, label_utter) in call:
            if log_utter['speaker'].lower() != args.roletype:
                continue
            try:
                translation = data_helpers.tokenize_and_lower(translations['translated'][0]['hyp'])
            except Exception:
                translation = ''

            speech_act = label_utter['speech_act']
            sa_label_list = []
            for sa in speech_act:
                sa_label_list += ['%s_%s' % (sa['act'], attr) for attr in sa['attributes']]
            sa_label_list = sorted(set(sa_label_list))
            test_utters += [(translation, log_utter['speaker'], sa_label_list)]

    pprint(train_utters[:2])
    pprint(test_utters[:2])

    # load parameters
    params = data_helpers.load_params("parameters/cnn.txt")
    pprint(params)
    num_epochs = int(params['num_epochs'])
    validation_split = float(params['validation_split'])
    batch_size = int(params['batch_size'])
    multilabel = params['multilabel'] == "true"

    # build vocabulary
    sents = [utter[0].split(' ') for utter in train_utters]
    max_sent_len = int(params['max_sent_len'])
    pad_sents = data_helpers.pad_sentences(sents, max_sent_len)
    vocabulary, inv_vocabulary = data_helpers.build_vocab(pad_sents)
    print("vocabulary size: %d" % len(vocabulary))
    # params['max_sent_len'] = max_sent_len

    # build inputs
    train_inputs = data_helpers.build_input_data(pad_sents, vocabulary)

    test_sents = [utter[0].split(' ') for utter in test_utters]
    test_pad_sents = data_helpers.pad_sentences(test_sents, max_sent_len)
    test_inputs = data_helpers.build_input_data(test_pad_sents, vocabulary)

    # build labels
    sa_train_labels = [utter[2] for utter in train_utters]
    sa_test_labels = [utter[2] for utter in test_utters]
    label_binarizer = preprocessing.MultiLabelBinarizer()
    label_binarizer.fit(sa_train_labels+sa_test_labels)

    train_labels = label_binarizer.transform(sa_train_labels)
    test_labels = label_binarizer.transform(sa_test_labels)

    # split and shuffle data
    indices = np.arange(train_inputs.shape[0])
    np.random.shuffle(indices)
    train_inputs = train_inputs[indices]
    train_labels = train_labels[indices]
    num_validation = int(validation_split * train_inputs.shape[0])

    # x_train = train_inputs[:-num_validation]
    # y_train = train_labels[:-num_validation]
    # x_val = train_inputs[-num_validation:]
    # y_val = train_labels[-num_validation:]
    x_train = train_inputs
    y_train = train_labels

    x_test = test_inputs
    y_test = test_labels

    # construct a pytorch data_loader
    x_train = torch.from_numpy(x_train).long()
    y_train = torch.from_numpy(y_train).float()
    dataset_tensor = data_utils.TensorDataset(x_train, y_train)
    train_loader = data_utils.DataLoader(dataset_tensor, batch_size=batch_size, shuffle=True, num_workers=4,
                                         pin_memory=False)

    x_test = torch.from_numpy(x_test).long()
    y_test = torch.from_numpy(y_test).long()
    dataset_tensor = data_utils.TensorDataset(x_test, y_test)
    test_loader = data_utils.DataLoader(dataset_tensor, batch_size=batch_size, shuffle=False, num_workers=4,
                                         pin_memory=False)


    # load pre-trained word embeddings
    embedding_dim = int(params['embedding_dim'])
    embedding_matrix = data_helpers.load_embedding(vocabulary, embedding_dim=embedding_dim, embedding=params['embedding'])

    # load model
    model = SluConvNet(params, embedding_matrix, len(vocabulary), y_train.shape[1])

    if torch.cuda.is_available():
        model = model.cuda()
    learning_rate = float(params['learning_rate'])
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    loss_fn = nn.MultiLabelSoftMarginLoss()
    # loss_fn = nn.BCEWithLogitsLoss()

    for epoch in range(num_epochs):
        model.train()   # set the model to training mode (apply dropout etc)
        for i, (inputs, labels) in enumerate(train_loader):
            inputs, labels = autograd.Variable(inputs), autograd.Variable(labels)
            if torch.cuda.is_available():
                inputs, labels = inputs.cuda(), labels.cuda()

            preds = model(inputs)
            if torch.cuda.is_available():
                preds = preds.cuda()

            loss = loss_fn(preds, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if i % 100 == 0:
                print("current loss: %.4f" % loss)

        model.eval()        # set the model to evaluation mode
        # if threshold_predictor is None:
        threshold_predictor = train_threshold(model, train_loader, y_train.numpy())
        # count_predictor = train_count(model, train_loader, y_train.numpy())
        true_acts, pred_acts, metrics = evaluate(model, label_binarizer, test_loader, y_test, multilabel, threshold_predictor)
        # true_acts, pred_acts, metrics = evaluate_count(model, label_binarizer, test_loader, y_test, multilabel, count_predictor)
        print("Precision: %.4f\tRecall: %.4f\tF1-score: %.4f\n" % (metrics[0], metrics[1], metrics[2]))

    # end of training
    true_acts, pred_acts, metrics = evaluate(model, label_binarizer, test_loader, y_test, multilabel)
    print("Precision: %.4f\tRecall: %.4f\tF1-score: %.4f\n" % (metrics[0], metrics[1], metrics[2]))

    with open(("pred_result_%s.txt" % args.roletype), "w") as f:
        for pred_act, true_act in zip(pred_acts, true_acts):
            f.write("pred: %s\ntrue: %s\n\n" % (', '.join(pred_act), ', '.join(true_act)))
Example #13
def main(argv):
    parser = argparse.ArgumentParser(
        description='CNN baseline for DSTC5 SAP Task')
    parser.add_argument('--trainset',
                        dest='trainset',
                        action='store',
                        metavar='TRAINSET',
                        required=True,
                        help='')
    parser.add_argument('--devset',
                        dest='devset',
                        action='store',
                        metavar='DEVSET',
                        required=True,
                        help='')
    parser.add_argument('--testset',
                        dest='testset',
                        action='store',
                        metavar='TESTSET',
                        required=True,
                        help='')
    parser.add_argument('--dataroot',
                        dest='dataroot',
                        action='store',
                        required=True,
                        metavar='PATH',
                        help='')

    args = parser.parse_args()

    # load parameters
    params = data_helpers.load_params("parameters/cnn.txt")
    pprint(params)

    trainset = dataset_walker.dataset_walker(args.trainset,
                                             dataroot=args.dataroot,
                                             labels=True,
                                             translations=True)
    devset = dataset_walker.dataset_walker(args.devset,
                                           dataroot=args.dataroot,
                                           labels=True,
                                           translations=True)
    testset = dataset_walker.dataset_walker(args.testset,
                                            dataroot=args.dataroot,
                                            labels=True,
                                            translations=True)
    train_utters, dev_utters, test_utters = data_helpers.load_dstc5_dataset(
        trainset, devset, testset)

    train_utters += dev_utters

    # pprint(train_utters[:2])
    # pprint(test_utters[:2])

    # dump_corpus(train_utters, "dstc5_train.txt")
    # dump_corpus(test_utters, "dstc5_test.txt")

    # build vocabulary
    utters = [[char for char in utter[0]] for utter in train_utters]
    max_sent_len = int(params['max_sent_len'])
    pad_utters = data_helpers.pad_sentences(utters, max_sent_len)

    vocabulary, inv_vocabulary = data_helpers.build_vocab(pad_utters)
    print("vocabulary size: %d" % len(vocabulary))

    # build input
    train_inputs = data_helpers.build_input_data(pad_utters, vocabulary)

    utters = [[char for char in utter[0]] for utter in test_utters]
    pad_utters = data_helpers.pad_sentences(utters, max_sent_len)
    test_inputs = data_helpers.build_input_data(pad_utters, vocabulary)

    # build labels
    sa_train_labels = [utter[2] for utter in train_utters]
    sa_test_labels = [utter[2] for utter in test_utters]
    label_binarizer = preprocessing.MultiLabelBinarizer()
    label_binarizer.fit(sa_train_labels + sa_test_labels)

    train_labels = label_binarizer.transform(sa_train_labels)
    test_labels = label_binarizer.transform(sa_test_labels)

    # split speakers into two sets
    tourist_train_indices = [
        i for i, utter in enumerate(train_utters)
        if utter[1].lower() == 'tourist'
    ]
    guide_train_indices = [
        i for i, utter in enumerate(train_utters)
        if utter[1].lower() == 'guide'
    ]

    tourist_test_indices = [
        i for i, utter in enumerate(test_utters)
        if utter[1].lower() == 'tourist'
    ]
    guide_test_indices = [
        i for i, utter in enumerate(test_utters) if utter[1].lower() == 'guide'
    ]

    np.random.shuffle(tourist_train_indices)
    np.random.shuffle(guide_train_indices)
    # np.random.shuffle(tourist_test_indices)
    # np.random.shuffle(guide_test_indices)

    tourist_train_inputs = train_inputs[tourist_train_indices]
    tourist_train_labels = train_labels[tourist_train_indices]

    guide_train_inputs = train_inputs[guide_train_indices]
    guide_train_labels = train_labels[guide_train_indices]

    tourist_test_inputs = test_inputs[tourist_test_indices]
    tourist_test_labels = test_labels[tourist_test_indices]

    guide_test_inputs = test_inputs[guide_test_indices]
    guide_test_labels = test_labels[guide_test_indices]

    # load pre-trained word embeddings
    embedding_dim = int(params['embedding_dim'])
    embedding_matrix = data_helpers.load_embedding(
        vocabulary, embedding_dim=embedding_dim, embedding=params['embedding'])

    run_slu_task(embedding_matrix, vocabulary, label_binarizer,
                 tourist_train_inputs, tourist_train_labels,
                 tourist_test_inputs, tourist_test_labels)

    run_slu_task(embedding_matrix, vocabulary, label_binarizer,
                 guide_train_inputs, guide_train_labels, guide_test_inputs,
                 guide_test_labels)

    print("")
Example #14
def main(argv):
    parser = argparse.ArgumentParser(
        description='CNN baseline for DSTC5 SAP Task')
    parser.add_argument('--trainset',
                        dest='trainset',
                        action='store',
                        metavar='TRAINSET',
                        required=True,
                        help='')
    parser.add_argument('--devset',
                        dest='devset',
                        action='store',
                        metavar='DEVSET',
                        required=True,
                        help='')
    parser.add_argument('--testset',
                        dest='testset',
                        action='store',
                        metavar='TESTSET',
                        required=True,
                        help='')
    parser.add_argument('--dataroot',
                        dest='dataroot',
                        action='store',
                        required=True,
                        metavar='PATH',
                        help='')

    args = parser.parse_args()

    # load parameters
    params = data_helpers.load_params("parameters/cnn.txt")
    pprint(params)

    trainset = dataset_walker.dataset_walker(args.trainset,
                                             dataroot=args.dataroot,
                                             labels=True,
                                             translations=True)
    devset = dataset_walker.dataset_walker(args.devset,
                                           dataroot=args.dataroot,
                                           labels=True,
                                           translations=True)
    testset = dataset_walker.dataset_walker(args.testset,
                                            dataroot=args.dataroot,
                                            labels=True,
                                            translations=True)
    train_utters, dev_utters, test_utters = data_helpers.load_dstc5_dataset_multitask(
        trainset, devset, testset)

    train_utters += dev_utters

    context_case = 1
    # TODO: write the code that builds the previous-labels context here!
    # 1) the previous N speech acts (regardless of speaker)
    # 2) all speech acts (n of them) of the other speaker's utterances in the previous turn
    if context_case == 1:

        pass

    else:
        pass

    # pprint(train_utters[:2])
    # pprint(test_utters[:2])

    # dump_corpus(train_utters, "dstc5_train.txt")
    # dump_corpus(test_utters, "dstc5_test.txt")

    # build vocabulary
    utters = [[char for char in utter[0]] for utter in train_utters]
    max_sent_len = int(params['max_sent_len'])
    pad_utters = data_helpers.pad_sentences(utters, max_sent_len)

    vocabulary, inv_vocabulary = data_helpers.build_vocab(pad_utters)
    print("vocabulary size: %d" % len(vocabulary))

    # build input
    train_inputs = data_helpers.build_input_data(pad_utters, vocabulary)

    utters = [[char for char in utter[0]] for utter in test_utters]
    pad_utters = data_helpers.pad_sentences(utters, max_sent_len)
    test_inputs = data_helpers.build_input_data(pad_utters, vocabulary)

    # build labels
    train_labels_category = [utter[3] for utter in train_utters]
    test_labels_category = [utter[3] for utter in test_utters]
    train_labels_attr = [utter[4] for utter in train_utters]
    test_labels_attr = [utter[4] for utter in test_utters]
    train_labels_sa = [utter[5] for utter in train_utters]
    test_labels_sa = [utter[5] for utter in test_utters]

    label_binarizer_category = preprocessing.MultiLabelBinarizer()
    label_binarizer_category.fit(train_labels_category + test_labels_category)

    label_binarizer_attr = preprocessing.MultiLabelBinarizer()
    label_binarizer_attr.fit(train_labels_attr + test_labels_attr)

    label_binarizer_sa = preprocessing.MultiLabelBinarizer()
    label_binarizer_sa.fit(train_labels_sa + test_labels_sa)

    train_labels_category = label_binarizer_category.transform(
        train_labels_category)
    test_labels_category = label_binarizer_category.transform(
        test_labels_category)
    train_labels_attr = label_binarizer_attr.transform(train_labels_attr)
    test_labels_attr = label_binarizer_attr.transform(test_labels_attr)
    train_labels_sa = label_binarizer_sa.transform(train_labels_sa)
    test_labels_sa = label_binarizer_sa.transform(test_labels_sa)

    # split speakers into two sets
    tourist_train_indices = [
        i for i, utter in enumerate(train_utters)
        if utter[1].lower() == 'tourist'
    ]
    guide_train_indices = [
        i for i, utter in enumerate(train_utters)
        if utter[1].lower() == 'guide'
    ]

    tourist_test_indices = [
        i for i, utter in enumerate(test_utters)
        if utter[1].lower() == 'tourist'
    ]
    guide_test_indices = [
        i for i, utter in enumerate(test_utters) if utter[1].lower() == 'guide'
    ]

    np.random.shuffle(tourist_train_indices)
    np.random.shuffle(guide_train_indices)
    # np.random.shuffle(tourist_test_indices)
    # np.random.shuffle(guide_test_indices)

    tourist_train_inputs = train_inputs[tourist_train_indices]
    tourist_train_labels_category = train_labels_category[
        tourist_train_indices]
    tourist_train_labels_attr = train_labels_attr[tourist_train_indices]
    tourist_train_labels_sa = train_labels_sa[tourist_train_indices]
    tourist_train_labels = (tourist_train_labels_category,
                            tourist_train_labels_attr, tourist_train_labels_sa)

    guide_train_inputs = train_inputs[guide_train_indices]
    guide_train_labels_category = train_labels_category[guide_train_indices]
    guide_train_labels_attr = train_labels_attr[guide_train_indices]
    guide_train_labels_sa = train_labels_sa[guide_train_indices]
    guide_train_labels = (guide_train_labels_category, guide_train_labels_attr,
                          guide_train_labels_sa)

    tourist_test_inputs = test_inputs[tourist_test_indices]
    tourist_test_labels_category = test_labels_category[tourist_test_indices]
    tourist_test_labels_attr = test_labels_attr[tourist_test_indices]
    tourist_test_labels_sa = test_labels_sa[tourist_test_indices]
    tourist_test_labels = (tourist_test_labels_category,
                           tourist_test_labels_attr, tourist_test_labels_sa)

    guide_test_inputs = test_inputs[guide_test_indices]
    guide_test_labels_category = test_labels_category[guide_test_indices]
    guide_test_labels_attr = test_labels_attr[guide_test_indices]
    guide_test_labels_sa = test_labels_sa[guide_test_indices]
    guide_test_labels = (guide_test_labels_category, guide_test_labels_attr,
                         guide_test_labels_sa)

    # load pre-trained word embeddings
    embedding_dim = int(params['embedding_dim'])
    embedding_matrix = data_helpers.load_embedding(
        vocabulary, embedding_dim=embedding_dim, embedding=params['embedding'])

    run_slu_task(embedding_matrix, vocabulary, label_binarizer_sa,
                 tourist_train_inputs, tourist_train_labels,
                 tourist_test_inputs, tourist_test_labels)

    run_slu_task(embedding_matrix, vocabulary, label_binarizer_sa,
                 guide_train_inputs, guide_train_labels, guide_test_inputs,
                 guide_test_labels)
Example #15
def main():
    global cuda
    cuda = torch.cuda.is_available()
    if cuda:
        train_sequence.cuda = cuda
        sequence_tagger.cuda = cuda
        utils.cuda = cuda
        train_sequence_crafted.cuda = cuda

    if args.crafted:
        this_train_sequence = train_sequence_crafted
    else:
        this_train_sequence = train_sequence

    utils.log('start reading ner file ')
    (token_list, tag_list,
     raw_token_list) = utils.prepare_data(args.input, True)
    vocabs = pickle.load(open(args.vocab_path, 'rb'))
    y = list(
        map(lambda x: np.array(list(map(lambda y: vocabs['y_dict'][y], x))),
            tag_list))

    x = tdh.build_input_data(token_list, vocabs['vocabulary'])

    #extract crafted features
    train_data = utils.get_data_with_pos_tag(raw_token_list, tag_list)
    features = utils.extract_features(train_data, vocabs['uptl'],
                                      vocabs['treatment_suffix'],
                                      vocabs['disease_suffix'], vocabs['dis'])

    ds_data = {'x': x, 'y': y, 'z': features}

    ds = sequence_dataset.sequence_dataset(
        '.',
        'test',
        ds_data,
        word_counts=vocabs['word_counts'],
        vocabulary_inv=vocabs['vocabulary_inv'],
        crafted_features=args.crafted)

    val_loader = DataLoader(ds,
                            batch_sampler=data_samplers.BatchSampler(
                                list(map(lambda x: min(999999, len(x[0])),
                                         ds)),
                                256,
                                shuffle=False),
                            num_workers=4)

    vocab_size = ds.vocab_size
    embedding_init = vocabs['embedding_init']
    embedding_init = embedding_init[:vocab_size]
    if args.model == 'bilstm':
        if args.crafted:
            model = sequence_tagger.BilstmSequenceTaggerCraftedFeatures(
                len(vocabs['y_dict']),
                vocab_size,
                embedding_size=embedding_init.shape[1],
                hidden_size=args.hidden_size,
                intermediate_size=args.intermediate_size,
                embedding_init=embedding_init,
                crafted_features_size=args.num_crafted)
            criterion = nn.CrossEntropyLoss()
            if cuda:
                criterion.cuda()
            #
            my_loss_fn = lambda x, y, z, m: utils.std_loss_fn_crafted(
                x, y, z, m, criterion)
        else:
            model = sequence_tagger.BilstmSequenceTagger(
                len(vocabs['y_dict']),
                vocab_size,
                embedding_size=embedding_init.shape[1],
                hidden_size=args.hidden_size,
                intermediate_size=args.intermediate_size,
                embedding_init=embedding_init)
            criterion = nn.CrossEntropyLoss()
            if cuda:
                criterion.cuda()
            #
            my_loss_fn = lambda x, y, m: utils.std_loss_fn(x, y, m, criterion)

    else:
        model = sequence_tagger.BilstmCRFSequenceTagger(
            len(vocabs['y_dict']),
            vocab_size,
            embedding_size=embedding_init.shape[1],
            hidden_size=args.hidden_size,
            intermediate_size=args.intermediate_size,
            embedding_init=embedding_init)
        my_loss_fn = utils.lstm_crf_neg_log_likelihood_loss1

    checkpoint = torch.load(args.checkpoint)
    model.load_state_dict(checkpoint['model'])
    rec, i, all_pred = this_train_sequence.compute_sequence(-1,
                                                            model,
                                                            my_loss_fn,
                                                            val_loader,
                                                            None,
                                                            'eval',
                                                            None,
                                                            None, [],
                                                            return_preds=True)
    utils.write_output(all_pred, raw_token_list, vocabs['y_dict_inv'],
                       args.output)
Example #16
                             model_name, num_features)

#### add titles to existing pickle files
test_title_file = '../data/amazon13k/test_titles.txt'
train_title_file = '../data/amazon13k/train_titles.txt'

vocabs = pickle.load(open(vocab_output_file, 'rb'))
train_data = pickle.load(open(train_output_file, 'rb'))
test_data = pickle.load(open(test_output_file, 'rb'))

train_titles = [x[:-1] for x in open(train_title_file, 'r').readlines()]
train_titles_sents = [tdh.clean_str(x).split() for x in train_titles]
test_titles = [x[:-1] for x in open(test_title_file, 'r').readlines()]
test_titles_sents = [tdh.clean_str(x).split() for x in test_titles]

X_trn_titles = tdh.build_input_data(train_titles_sents, vocabs['vocabulary'])
X_tst_titles = tdh.build_input_data(test_titles_sents, vocabs['vocabulary'])

train_data['titles'] = X_trn_titles
test_data['titles'] = X_tst_titles
pickle.dump(train_data, open(train_output_file, 'wb'))
pickle.dump(test_data, open(test_output_file, 'wb'))

#### add labels inv to existing vocabs
vocabs = pickle.load(open(vocab_output_file, 'rb'))
labels_inv = [
    x[:-1] for x in open(labels_path, 'r', errors='ignore').readlines()
]
vocabs['labels_inv'] = labels_inv
pickle.dump(vocabs, open(vocab_output_file, 'wb'))
Example #17
    X_train = list()
    X_test = list()
    for index in train:
        X_train.append(sentences_padded[index])
    for index in test:
        X_test.append(sentences_padded[index])
    y_train = y_class[train]
    y_test = y_class[test]

    # building vocabulary on train set
    print('building vocabulary on train set')
    vocabulary, vocabulary_inv = build_vocab(X_train)

    # Maps sentences to vectors based on vocabulary
    print('Mapping sentences to vectors based on vocabulary')
    X_train, y_train = build_input_data(X_train, y_train, vocabulary)
    # print(X_train.shape)
    X_test, y_test = build_input_data(X_test, y_test, vocabulary)
    # all x and y for predicting
    x, y_class = build_input_data(sentences_padded, y_class, vocabulary)
    # print(X_test.shape)
    vocabulary_size = len(vocabulary_inv)

    # building embedding matrix using GloVe word embeddings
    print('building embedding matrix using GloVe word embeddings')
    embedding_matrix = create_embedding_matrix('./dataset/myGloVe200d.txt', vocabulary, embedding_dim)

    # this returns a tensor
    print("Creating Model...")
    inputs = Input(shape=(sequence_length,), dtype='int32')
    embedding = Embedding(input_dim=vocabulary_size, output_dim=embedding_dim, weights=[embedding_matrix], input_length=sequence_length)(inputs)
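The example above stops right after the embedding layer. A minimal, hedged sketch of how such a Keras CNN text classifier is commonly completed (a single convolution block in the style of Kim, 2014); the filter count, kernel size, dropout rate and num_classes are assumptions, not part of the original code:

from keras.layers import Conv1D, GlobalMaxPooling1D, Dropout, Dense
from keras.models import Model

num_classes = 2  # placeholder; the original number of classes is not shown
conv = Conv1D(filters=128, kernel_size=5, activation='relu')(embedding)
pooled = GlobalMaxPooling1D()(conv)
dropped = Dropout(0.5)(pooled)
outputs = Dense(num_classes, activation='softmax')(dropped)

model = Model(inputs=inputs, outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])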
Example #18
def main(argv):
    parser = argparse.ArgumentParser(
        description='CNN baseline for DSTC5 SAP Task')
    parser.add_argument('--trainset',
                        dest='trainset',
                        action='store',
                        metavar='TRAINSET',
                        required=True,
                        help='')
    parser.add_argument('--testset',
                        dest='testset',
                        action='store',
                        metavar='TESTSET',
                        required=True,
                        help='')
    parser.add_argument('--dataroot',
                        dest='dataroot',
                        action='store',
                        required=True,
                        metavar='PATH',
                        help='')

    args = parser.parse_args()

    train_utters = []
    trainset = dataset_walker.dataset_walker(args.trainset,
                                             dataroot=args.dataroot,
                                             labels=True,
                                             translations=True)
    sys.stderr.write('Loading training instances ... ')
    for call in trainset:
        context_utters = []
        context_utter_str = '<PAD/>'
        context_labels = []
        context_label = ['INI_OPENING']
        last_speaker = None
        for (log_utter, translations, label_utter) in call:
            transcript = data_helpers.tokenize_and_lower(
                log_utter['transcript'])
            speech_act = label_utter['speech_act']
            sa_label_list = []
            for sa in speech_act:
                sa_label_list += [
                    '%s_%s' % (sa['act'], attr) for attr in sa['attributes']
                ]
            sa_label_list = sorted(set(sa_label_list))

            if last_speaker is not None and log_utter[
                    'speaker'] != last_speaker:
                if len(context_utters) > 0:
                    context_utter_str = ' <pause> '.join(context_utters)
                    context_label = context_labels[-1]
                else:
                    context_utter_str = '<PAD/>'
                    context_label = ['INI_OPENING']

                context_utters = []
                context_labels = []
                last_speaker = None

            if last_speaker is None or log_utter['speaker'] == last_speaker:
                context_utters += [transcript]  # cumulate context utters
                context_labels += [sa_label_list]

            last_speaker = log_utter['speaker']
            train_utters += [
                (transcript, context_utter_str, log_utter['speaker'],
                 sa_label_list, log_utter['utter_index'], context_label)
            ]
            # train_utters += [(transcript, context_utter_str, log_utter['speaker'], sa_label_list, log_utter['utter_index'], sa_label_list)]

    sys.stderr.write('Done\n')

    test_utters = []
    testset = dataset_walker.dataset_walker(args.testset,
                                            dataroot=args.dataroot,
                                            labels=True,
                                            translations=True)
    sys.stderr.write('Loading testing instances ... ')
    for call in testset:
        context_utters = []
        context_utter_str = '<PAD/>'
        context_labels = []
        context_label = ['INI_OPENING']
        last_speaker = None
        for (log_utter, translations, label_utter) in call:
            try:
                translation = data_helpers.tokenize_and_lower(
                    translations['translated'][0]['hyp'])
            except Exception:
                translation = ''

            speech_act = label_utter['speech_act']
            sa_label_list = []
            for sa in speech_act:
                sa_label_list += [
                    '%s_%s' % (sa['act'], attr) for attr in sa['attributes']
                ]
            sa_label_list = sorted(set(sa_label_list))

            if last_speaker is not None and log_utter[
                    'speaker'] != last_speaker:
                if len(context_utters) > 0:
                    context_utter_str = ' <pause> '.join(context_utters)
                    context_label = context_labels[-1]
                else:
                    context_utter_str = ''
                    context_label = ['INI_OPENING']

                context_utters = []
                context_labels = []
                last_speaker = None

            if last_speaker is None or log_utter['speaker'] == last_speaker:
                context_utters += [translation]  # cumulate context utters
                context_labels += [sa_label_list]

            last_speaker = log_utter['speaker']

            test_utters += [
                (translation, context_utter_str, log_utter['speaker'],
                 sa_label_list, log_utter['utter_index'], context_label)
            ]
            # test_utters += [(translation, context_utter_str, log_utter['speaker'], sa_label_list, log_utter['utter_index'], sa_label_list)]

    # pprint(train_utters[:2])
    # pprint(test_utters[:2])

    # dump_corpus(train_utters, "dstc5_train.txt")
    # dump_corpus(test_utters, "dstc5_test.txt")

    # load parameters
    params = data_helpers.load_params("parameters/cnn.txt")
    pprint(params)

    # build vocabulary
    utters = [utter[0].split(' ') for utter in train_utters]
    ctx_utters = [utter[1].split(' ') for utter in train_utters]
    print("max context utter length: %d " %
          max([len(ctx_utter) for ctx_utter in ctx_utters]))
    max_sent_len = int(params['max_sent_len'])
    pad_utters = data_helpers.pad_sentences(utters, max_sent_len)
    pad_ctx_utters = data_helpers.pad_sentences(ctx_utters, max_sent_len)

    vocabulary, inv_vocabulary = data_helpers.build_vocab(pad_ctx_utters)
    print("vocabulary size: %d" % len(vocabulary))

    # build input
    train_inputs = data_helpers.build_input_data(pad_utters, vocabulary)
    train_ctx_inputs = data_helpers.build_input_data(pad_ctx_utters,
                                                     vocabulary)

    utters = [utter[0].split(' ') for utter in test_utters]
    ctx_utters = [utter[1].split(' ') for utter in test_utters]
    pad_utters = data_helpers.pad_sentences(utters, max_sent_len)
    pad_ctx_utters = data_helpers.pad_sentences(ctx_utters, max_sent_len)
    test_inputs = data_helpers.build_input_data(pad_utters, vocabulary)
    test_ctx_inputs = data_helpers.build_input_data(pad_ctx_utters, vocabulary)

    # build labels
    sa_train_labels = [utter[3] for utter in train_utters]
    sa_test_labels = [utter[3] for utter in test_utters]
    sa_train_ctx_labels = [utter[5] for utter in train_utters]
    sa_test_ctx_labels = [utter[5] for utter in test_utters]

    label_binarizer = preprocessing.MultiLabelBinarizer()
    label_binarizer.fit(sa_train_labels + sa_test_labels)

    train_labels = label_binarizer.transform(sa_train_labels)
    test_labels = label_binarizer.transform(sa_test_labels)
    train_ctx_labels = label_binarizer.transform(sa_train_ctx_labels)
    test_ctx_labels = label_binarizer.transform(sa_test_ctx_labels)

    # split speakers into two sets
    tourist_train_indices = [
        i for i, utter in enumerate(train_utters)
        if utter[2].lower() == 'tourist'
    ]
    guide_train_indices = [
        i for i, utter in enumerate(train_utters)
        if utter[2].lower() == 'guide'
    ]

    tourist_test_indices = [
        i for i, utter in enumerate(test_utters)
        if utter[2].lower() == 'tourist'
    ]
    guide_test_indices = [
        i for i, utter in enumerate(test_utters) if utter[2].lower() == 'guide'
    ]

    np.random.shuffle(tourist_train_indices)
    np.random.shuffle(guide_train_indices)

    tourist_train_inputs = train_inputs[tourist_train_indices]
    tourist_train_ctx_inputs = train_ctx_inputs[tourist_train_indices]
    tourist_train_labels = train_labels[tourist_train_indices]
    tourist_train_ctx_labels = train_ctx_labels[tourist_train_indices]

    guide_train_inputs = train_inputs[guide_train_indices]
    guide_train_ctx_inputs = train_ctx_inputs[guide_train_indices]
    guide_train_labels = train_labels[guide_train_indices]
    guide_train_ctx_labels = train_ctx_labels[guide_train_indices]

    tourist_test_inputs = test_inputs[tourist_test_indices]
    tourist_test_ctx_inputs = test_ctx_inputs[tourist_test_indices]
    tourist_test_labels = test_labels[tourist_test_indices]
    tourist_test_ctx_labels = test_ctx_labels[tourist_test_indices]

    guide_test_inputs = test_inputs[guide_test_indices]
    guide_test_ctx_inputs = test_ctx_inputs[guide_test_indices]
    guide_test_labels = test_labels[guide_test_indices]
    guide_test_ctx_labels = test_ctx_labels[guide_test_indices]

    # load pre-trained word embeddings
    embedding_dim = int(params['embedding_dim'])
    embedding_matrix = data_helpers.load_embedding(
        vocabulary, embedding_dim=embedding_dim, embedding=params['embedding'])

    run_slu_task(embedding_matrix, vocabulary, label_binarizer,
                 tourist_train_inputs, tourist_train_ctx_inputs,
                 tourist_train_labels, tourist_train_ctx_labels,
                 tourist_test_inputs, tourist_test_ctx_inputs,
                 tourist_test_labels, tourist_test_ctx_labels)

    run_slu_task(embedding_matrix, vocabulary, label_binarizer,
                 guide_train_inputs, guide_train_ctx_inputs,
                 guide_train_labels, guide_train_ctx_labels, guide_test_inputs,
                 guide_test_ctx_inputs, guide_test_labels,
                 guide_test_ctx_labels)

    print("")
Example #19
#load word file
x_text_train_arg1_word, x_text_train_arg2_word, y_train = data_helpers.load_word_pkl('train_word_file')
x_text_dev_arg1_word, x_text_dev_arg2_word, y_dev = data_helpers.load_word_pkl('dev_word_file')


pos_vocab, pos_embd = data_helpers.build_pos_vocab_embd('fourway_data_wu_NOTUNK/pos_list.pkl')
word_vocab, word_embd = data_helpers.build_word_vocab_embd('fourway_data_wu_NOTUNK/word_list.pkl')


# file_pkl = open("./pos_vocab_embd_word_vocab_embd.pkl", "wb")
# pickle.dump([pos_vocab,pos_embd,word_vocab,word_embd],file_pkl)
# file_pkl.close()


x_train_arg1_pos = data_helpers.build_input_data(x_text_train_arg1_pos, pos_vocab, FLAGS.max_document_length)
x_train_arg2_pos = data_helpers.build_input_data(x_text_train_arg2_pos, pos_vocab, FLAGS.max_document_length)
x_dev_arg1_pos = data_helpers.build_input_data(x_text_dev_arg1_pos, pos_vocab, FLAGS.max_document_length)
x_dev_arg2_pos = data_helpers.build_input_data(x_text_dev_arg2_pos, pos_vocab, FLAGS.max_document_length)

x_train_arg1_word = data_helpers.build_input_data(x_text_train_arg1_word, word_vocab, FLAGS.max_document_length)
x_train_arg2_word = data_helpers.build_input_data(x_text_train_arg2_word, word_vocab, FLAGS.max_document_length)
x_dev_arg1_word = data_helpers.build_input_data(x_text_dev_arg1_word, word_vocab, FLAGS.max_document_length)
x_dev_arg2_word = data_helpers.build_input_data(x_text_dev_arg2_word, word_vocab, FLAGS.max_document_length)


with open("./vocab_embd.txt","w",encoding="utf-8") as write_object:
    for v in word_vocab:
        write_object.write(str(v)+"\n")

with open("./x_train_arg1.txt", "w") as write_object:
Example #20
import os
import pickle

import data_helpers as tdh
import numpy as np
import utils
train_file = '../data/ner.txt'
train_output_file = '../data/train1.pkl'
vocab_output_file = '../data/vocab1.pkl'
word_count_txt_file = '../data/vocab_freq.txt'

num_features = 300
model_name = os.path.join(
    '/home/yatin/phd/nlp/project/xmlcnn/theano_code/word2vec_models/',
    'glove.6B.%dd.txt' % (num_features))
#model_name = os.path.join('/home/cse/phd/csz178057/scratch/squad/data', 'glove.6B.%dd.txt' % (num_features))
ner_file = train_file

(token_list, tag_list, raw_token_list) = utils.prepare_data(ner_file, False)

tag_dict = {'D': 0, 'T': 1, 'O': 2}
tag_list1 = list(
    map(lambda x: np.array(list(map(lambda y: tag_dict[y], x))), tag_list))

vocabs = tdh.get_vocabs_embeddings(token_list, model_name, num_features)
vocabs['y_dict'] = tag_dict
vocabs['y_dict_inv'] = dict([(tag_dict[k], k) for k in tag_dict])

x_trn = tdh.build_input_data(token_list, vocabs['vocabulary'])
pickle.dump({'x': x_trn, 'y': tag_list1}, open(train_output_file, 'wb'))
pickle.dump(vocabs, open(vocab_output_file, 'wb'))
print('\n'.join(
    ['{},{}'.format(x[0], x[1]) for x in vocabs['word_counts'].most_common(None)]),
      file=open(word_count_txt_file, 'w'))
Example #21
FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

x_raw = data_helpers.load_test_data(
    '/Users/Winnerineast/Documents/haodaifu/NewData/tobetrained.csv')

# Map data into vocabulary
vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab")
vocabulary, vocabulary_inv, max_length = data_helpers.restore_vocabulary(
    vocab_path)
sentences_padded, tmp_length = data_helpers.pad_sentences(x_raw, max_length)
x_test, y_test = data_helpers.build_input_data(sentences_padded, None,
                                               vocabulary)

print("\nEvaluating...\n")

# Evaluation
# ==================================================
checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
graph = tf.Graph()
with graph.as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,
        log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        # Load the saved meta graph and restore variables
        saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))