Example #1
def save(self, filename):
    if hasattr(self, 'save_parm'):
        params = self.params + self.save_parm
    else:
        params = self.params
    ps = 'save: <\n'
    for p in params:
        ps += '{0}: {1}\n'.format(p.name, p.eval().shape)
    ps += '> to ... {}'.format(filename)
    # logger.info(ps)
    # the hdf5 module seems to work abnormally here
    # dd.io.save(filename, self.get_weights())
    serialize_to_file(self.get_weights(), filename)
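
serialize_to_file and deserialize_from_file are project helpers that are not shown in these excerpts. Assuming they are thin pickle wrappers (an assumption, not the project's actual implementation), a minimal sketch could look like this:

import pickle

def serialize_to_file(obj, path):
    # hypothetical stand-in: dump any picklable object to disk
    with open(path, 'wb') as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)

def deserialize_from_file(path):
    # hypothetical stand-in: load the object back
    with open(path, 'rb') as f:
        return pickle.load(f)
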
def load_additional_testing_data(testing_names, idx2word, word2idx, config, postagging=True, process_type=1):
    test_sets           = {}

    # rule out the ones that appear in the testing data
    for dataset_name in testing_names:

        test_set_path = config['path'] + '/dataset/keyphrase/' + config['data_process_name'] + dataset_name + '.testing.pkl'

        if os.path.exists(test_set_path):
            test_set = deserialize_from_file(test_set_path)
            print('Loading testing dataset %s from %s' % (dataset_name, test_set_path))
        else:
            print('Creating testing dataset %s: %s' % (dataset_name, test_set_path))
            dataloader          = testing_data_loader(dataset_name, kwargs=dict(basedir=config['path']))
            records             = dataloader.get_docs()
            records, pairs, _   = utils.load_pairs(records, process_type=process_type, do_filter=False)
            test_set            = utils.build_data(pairs, idx2word, word2idx)

            test_set['record']  = records

            if postagging:
                tagged_sources = get_postag_with_record(records, pairs)
                test_set['tagged_source']   = [[t[1] for t in s] for s in tagged_sources]

                if getattr(dataloader, 'text_postag_dir', None) is not None:
                    print('Exporting postagged data to %s' % (dataloader.text_postag_dir))
                    if not os.path.exists(dataloader.text_postag_dir):
                        os.makedirs(dataloader.text_postag_dir)
                    for r_, p_, s_ in zip(records, pairs, tagged_sources):
                        with open(os.path.join(dataloader.text_postag_dir, r_['name'] + '.txt'), 'w') as f:
                            output_str = ' '.join([w + '_' + t for w, t in s_])
                            f.write(output_str)
                else:
                    print('text_postag_dir not found, no export of postagged data')

            serialize_to_file(test_set, test_set_path)

        test_sets[dataset_name] = test_set

    return test_sets
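
A hedged usage sketch for load_additional_testing_data, assuming a config dict carrying the 'path' and 'data_process_name' keys used above and a vocabulary (idx2word, word2idx) already built elsewhere; the dataset names and paths below are placeholders:

config = {'path': '/path/to/project', 'data_process_name': 'processed.'}
test_sets = load_additional_testing_data(['inspec', 'semeval'],
                                         idx2word, word2idx, config,
                                         postagging=True, process_type=1)
for name, test_set in test_sets.items():
    print('%s: %d documents' % (name, len(test_set['record'])))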
Example #3
                    T += [ftr(v) for v in X]
                elif w == 'Y':
                    T += [ftr(v) for v in Y]
                else:
                    T += [w]

            A = [word2idx[w] for w in S]
            B = [word2idx[w] for w in T]
            C = [0 if w not in S else S.index(w) + Lmax for w in T]

            instance['x'] += [S]
            instance['y'] += [T]
            instance['source'] += [A]
            instance['target'] += [B]
            instance['target_c'] += [C]

            instance['rule_id'] += [k]
            instance['rule'] += [' '.join(source) + ' -> ' + ' '.join(target)]

    return instance


train_set = build_instance()
print 'build ok.'
test_set = build_instance()
print 'build ok.'

serialize_to_file([train_set, test_set, idx2word, word2idx],
                  '/home/thoma/Work/Dial-DRL/dataset/synthetic_data_c.pkl')
# serialize_to_file([train_set, test_set], '/home/thoma/Work/Dial-DRL/dataset/synthetic_data.pkl')
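
The target_c lists built in these examples (C = [0 if w not in S else S.index(w) + Lmax for w in T]) encode copy positions: a target token maps to 0 if it never occurs in the source, otherwise to its first source index shifted by Lmax so that copy pointers do not collide with ordinary word ids. A small self-contained illustration with toy values:

Lmax = 10                   # toy vocabulary size
S = ['the', 'cat', 'sat']   # source tokens
T = ['the', 'dog', 'sat']   # target tokens
C = [0 if w not in S else S.index(w) + Lmax for w in T]
print(C)                    # [10, 0, 12]: 'the' copied from position 0, 'dog' absent, 'sat' from position 2
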
                # 5. Save model
                if batch_id % 500 == 0 and batch_id > 1:
                    # save the weights every K rounds
                    agent.save(
                        config['path_experiment'] +
                        '/experiments.{0}.id={1}.epoch={2}.batch={3}.pkl'.
                        format(config['task_name'], config['timemark'], epoch,
                               batch_id))

                    # save the training progress in case of interruption
                    optimizer_config = agent.optimizer.get_config()
                    serialize_to_file(
                        [
                            name_ordering, batch_id, loss, valid_param,
                            optimizer_config
                        ], config['path_experiment'] +
                        '/save_training_status.id={0}.epoch={1}.batch={2}.pkl'.
                        format(config['timemark'], epoch, batch_id))
                    print(optimizer_config)
                    # agent.save_weight_json(config['path_experiment'] + '/weight.print.id={0}.epoch={1}.batch={2}.json'.format(config['timemark'], epoch, batch_id))

                # 6. Stop if exceed patience
                if valid_param['valids_not_improved'] >= valid_param[
                        'patience']:
                    print("Not improved for %s epochs. Stopping..." %
                          valid_param['valids_not_improved'])
                    valid_param['early_stop'] = True
                    break
    '''
    test accuracy and f-score at the end of each epoch
Example #5
# use character-based model [on]
# use word-based model     [off]


def build_data(data):
    instance = dict(text=[], summary=[], source=[], target=[], target_c=[])
    for pair in data:
        source, target = pair
        A = [word2idx[w] for w in source]
        B = [word2idx[w] for w in target]
        # C = np.asarray([[w == l for w in source] for l in target], dtype='float32')
        C = [0 if w not in source else source.index(w) + Lmax for w in target]

        instance['text'] += [source]
        instance['summary'] += [target]
        instance['source'] += [A]
        instance['target'] += [B]
        # instance['cc_matrix'] += [C]
        instance['target_c'] += [C]


    # print instance['target'][5000]
    # print instance['target_c'][5000]
    return instance

train_set = build_data(training)
test_set = build_data(testing)
serialize_to_file([train_set, test_set, idx2word, word2idx],
                  './dataset/geo880/data-word-full.pkl')
                    mir_vec = mir_doc['backward_encoding']

                if encoding_name == 'forward-backward':
                    iir_vec = np.concatenate([
                        iir_doc['forward_encoding'],
                        iir_doc['backward_encoding']
                    ])
                    mir_vec = np.concatenate([
                        mir_doc['forward_encoding'],
                        mir_doc['backward_encoding']
                    ])

                sim = cosine_similarity(iir_vec, mir_vec)
                similarity_matrix[iir_doc['name']].append(
                    (mir_doc['name'], sim))
                # print('%s vs %s = %f' % (iir_doc['name'], mir_doc['name'], sim))

        serialize_to_file(similarity_matrix, similarity_matrix_file)

    for k in [1, 3, 5]:
        ndcg_k = 0
        for testing_data_i in testing_data_list:
            '''
            each testing_data_i consists of a bunch of mappings
            '''
            # print(len(testing_data_i))
            ndcg_ = evaluate_ndcg_at_k(testing_data_i, k)
            ndcg_k += ndcg_
        print(
            'NDCG@%d = %f/%d = %f' % (k, ndcg_k, len(testing_data_list),
                                      float(ndcg_k) / len(testing_data_list)))
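
evaluate_ndcg_at_k is not shown in this excerpt. Under the assumption that it computes a standard NDCG@k over a ranked list of graded relevance scores (an assumption about its behavior, not the project's definition), a minimal sketch is:

import math

def ndcg_at_k(relevances, k):
    # relevances: graded relevance of retrieved items, in ranked order
    def dcg(scores):
        return sum(rel / math.log(i + 2, 2) for i, rel in enumerate(scores[:k]))
    ideal = dcg(sorted(relevances, reverse=True))
    return dcg(relevances) / ideal if ideal > 0 else 0.0

print(ndcg_at_k([3, 2, 3, 0, 1, 2], k=5))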
Example #7
    print idx2word[i].encode('utf-8')

# use character-based model [on]
# use word-based model     [off]


def build_data(data):
    instance = dict(text=[], summary=[], source=[], target=[], target_c=[])
    for pair in data:
        source, target = pair
        A = [word2idx[w] for w in source]
        B = [word2idx[w] for w in target]
        # C = np.asarray([[w == l for w in source] for l in target], dtype='float32')
        C = [0 if w not in source else source.index(w) + Lmax for w in target]

        instance['text']      += [source]
        instance['summary']   += [target]
        instance['source']    += [A]
        instance['target']    += [B]
        # instance['cc_matrix'] += [C]
        instance['target_c'] += [C]

    print instance['target'][5000]
    print instance['target_c'][5000]
    return instance


train_set = build_data(pairs[10000:])
test_set  = build_data(pairs[:10000])
serialize_to_file([train_set, test_set, idx2word, word2idx], './dataset/weibo_data-word-cooc.pkl')
Example #8
def obtain_dataset():
    rfile = open('/home/thoma/Work/Dial-DRL/dataset/BST_1M.txt', 'r')
    line  = rfile.readline()

    word2idx = dict()
    word2idx['<eol>'] = 0
    word2idx['<unk>'] = 1
    pairs    = []
    at       = 2
    lines    = 0
    while line:
        lines += 1
        line   = line.strip()
        source, target = line.split('->')
        source = source.split()
        target = target.split()

        for w in source:
            if w not in word2idx:
                word2idx[w] = at
                at += 1
        for w in target:
            if w not in word2idx:
                word2idx[w] = at
                at += 1
        pairs.append((source, target))
        if lines % 20000 == 0:
            print lines
        line = rfile.readline()
    rfile.close()

    idx2word = dict()
    for v, k in word2idx.items():
        idx2word[k] = v

    Lmax     = len(idx2word)
    print 'read dataset ok.'
    print Lmax
    for i in xrange(Lmax):
        print idx2word[i]

    def build_data(data):
        instance = dict(text=[], summary=[], source=[], target=[], target_c=[])
        for pair in data:
            source, target = pair
            A = [word2idx[w] for w in source]
            B = [word2idx[w] for w in target]
            # C = np.asarray([[w == l for w in source] for l in target], dtype='float32')
            C = [0 if w not in source else source.index(w) + Lmax for w in target]

            instance['text']      += [source]
            instance['summary']   += [target]
            instance['source']    += [A]
            instance['target']    += [B]
            # instance['cc_matrix'] += [C]
            instance['target_c'] += [C]

        print instance['target'][5000]
        print instance['target_c'][5000]
        return instance

    train_set = build_data(pairs[100000:])
    test_set  = build_data(pairs[:100000])
    serialize_to_file([train_set, test_set, idx2word, word2idx],
                      '/home/thoma/Work/Dial-DRL/dataset/BST_1M.data.pkl')
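
Once written, the pickle can presumably be restored with the matching deserialize_from_file helper used in the other examples; a hedged reload sketch (the path simply mirrors the save call above):

train_set, test_set, idx2word, word2idx = deserialize_from_file(
    '/home/thoma/Work/Dial-DRL/dataset/BST_1M.data.pkl')
print('%d train / %d test pairs' % (len(train_set['source']), len(test_set['source'])))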
Example #9
    else:
        return word


# prepare the vocabulary
data_clean = [[replace(w) for w in l] for l in data_rep]
idx2word2 = dict(enumerate(set([w for l in data_clean for w in l]), 1))
idx2word2[0] = '<eol>'
word2idx2 = {v: k for k, v in idx2word2.items()}
Lmax = len(idx2word2)

for k in xrange(len(idx2word2)):
    print k, '\t', idx2word2[k]
print 'Max: {}'.format(Lmax)

serialize_to_file([idx2word2, word2idx2, idx2word, word2idx],
                  './dataset/bAbI/voc-b.pkl')

# get ready for the dataset.
source = [[word2idx2[w] for w in l] for l in data_clean]
target = [[
    word2idx2[w] if w not in ['<person>', '<color>', '<shape>'] else it + Lmax
    for it, w in enumerate(l)
] for l in data_clean]


def print_str(data):
    for d in data:
        print ' '.join(str(w) for w in d)


print_str(data[10000:10005])
Example #10
    print idx2word[i].encode('utf-8')

# use character-based model [on]
# use word-based model     [off]


def build_data(data):
    instance = dict(text=[], summary=[], source=[], target=[], target_c=[])
    for pair in data:
        source, target = pair
        A = [word2idx[w] for w in source]
        B = [word2idx[w] for w in target]
        # C = np.asarray([[w == l for w in source] for l in target], dtype='float32')
        C = [0 if w not in source else source.index(w) + Lmax for w in target]

        instance['text']      += [source]
        instance['summary']   += [target]
        instance['source']    += [A]
        instance['target']    += [B]
        # instance['cc_matrix'] += [C]
        instance['target_c'] += [C]

    print instance['target'][5000]
    print instance['target_c'][5000]
    return instance


train_set = build_data(pairs)
test_set  = build_data(tests)
serialize_to_file([train_set, test_set, idx2word, word2idx], './dataset/lcsts_data-char-full.pkl')
Example #11
print pairs[0]


def build_data(data):
    instance = dict(text=[], summary=[], source=[], target=[], target_c=[])
    print len(data)
    for pair in data:
        source, target = pair
        A = [word2idx[w] for w in source]
        B = [word2idx[w] for w in target]
        # C = np.asarray([[w == l for w in source] for l in target], dtype='float32')
        C = [0 if w not in source else source.index(w) + Lmax for w in target]

        instance['text'] += [source]
        instance['summary'] += [target]
        instance['source'] += [A]
        instance['target'] += [B]
        # instance['cc_matrix'] += [C]
        instance['target_c'] += [C]

    print instance['source'][4000]
    print instance['target'][4000]
    print instance['target_c'][4000]
    return instance


train_set = build_data(pairs[10000:])
test_set = build_data(pairs[:10000])
serialize_to_file([train_set, test_set, idx2word, word2idx],
                  './dataset/movie_dialogue_data.pkl')
Example #12
print('Generating idx2word2 and word2idx2')

# Replace name/color/shape with <tag>
data_clean = [[replace(w) for w in l] for l in data_rep]
idx2word2 = dict(enumerate(set([w for l in data_clean for w in l]), 1))
idx2word2[0] = '<eol>'
word2idx2 = {v: k for k, v in idx2word2.items()}
Lmax = len(idx2word2)

for k in xrange(len(idx2word2)):
    print k, '\t', idx2word2[k]
print 'Max: {}'.format(Lmax)
# idx2word(3) and word2idx(4) are from source, size=132
# idx2word2(1) and word2idx2(2) are from target, replacing name/color/shape to <tag>, size=98
print('Exporting dicts to file')
serialize_to_file([idx2word2, word2idx2, idx2word, word2idx], config['voc'])

print('Generating source, target, origin')
# get ready for the dataset.
# source sequence is the sentence with all the real people/colors/shapes converted into tags '<person>', '<color>', '<shape>'
source = [[word2idx2[w] for w in l] for l in data_clean]
# in the target, keep a word's id unless it is a person/color/shape tag; replace a tag with its position it + Lmax (an id outside the vocabulary; see the toy illustration after this block)
target = [[
    word2idx2[w] if w not in ['<person>', '<color>', '<shape>'] else it + Lmax
    for it, w in enumerate(l)
] for l in data_clean]
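
A small self-contained illustration of the source/target construction above, using a toy vocabulary (word2idx2 and Lmax here are stand-ins for the real ones built earlier):

Lmax = 5
word2idx2 = {'<eol>': 0, '<person>': 1, 'paints': 2, 'the': 3, '<shape>': 4}
l = ['<person>', 'paints', 'the', '<shape>']
src = [word2idx2[w] for w in l]
tgt = [word2idx2[w] if w not in ['<person>', '<color>', '<shape>'] else it + Lmax
       for it, w in enumerate(l)]
print(src)  # [1, 2, 3, 4]
print(tgt)  # [5, 2, 3, 8]: tags become position + Lmax, real words keep their ids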


def print_str(data):
    for d in data:
        print ' '.join(str(w) for w in d)