Example no. 1
        file = expdir + "/" + file
        os.remove(file)
    os.removedirs(expdir)

    print('ERROR: expdir already exists')
    # exit(-1)

    # tf.set_random_seed(int(time.time() * 1000))
    tf.compat.v1.set_random_seed(int(time.time() * 1000))
params = helper.GetParams(args.params, 'train', args.expdir)

logging.basicConfig(filename=os.path.join(expdir, 'logfile.txt'),
                    level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler())

df = LoadData(args.data)
char_vocab = Vocab.MakeFromData(df.query_, min_count=10)
char_vocab.Save(os.path.join(args.expdir, 'char_vocab.pickle'))
params.vocab_size = len(char_vocab)
user_vocab = Vocab.MakeFromData([[u] for u in df.user], min_count=15)
user_vocab.Save(os.path.join(args.expdir, 'user_vocab.pickle'))
params.user_vocab_size = len(user_vocab)
dataset = Dataset(df,
                  char_vocab,
                  user_vocab,
                  max_len=params.max_len,
                  batch_size=params.batch_size)

val_df = LoadData(args.valdata)
valdata = Dataset(val_df,
                  char_vocab,
Example no. 2

if __name__ == '__main__':
    optimizer = {
        'sgd': tf.train.GradientDescentOptimizer,
        'adam': tf.train.AdamOptimizer,
        'ada': tf.train.AdagradOptimizer,
        'adadelta': tf.train.AdadeltaOptimizer
    }[args.optimizer]

    mLow = DynamicModel(args.expdir,
                        learning_rate=args.learning_rate,
                        threads=args.threads,
                        optimizer=optimizer)

    df = LoadData(args.data)
    users = df.groupby('user')
    avg_time = MovingAvg(0.95)

    stop = '</S>'  # decide if we stop at first space or not
    if args.partial:
        stop = ' '

    counter = 0
    for user, grp in users:
        grp = grp.sort_values('date')
        mLow.session.run(mLow.reset_user_embed)

        for i in range(len(grp)):
            row = grp.iloc[i]
            query = ''.join(row.query_[1:-1])
Example no. 3
def main():
    args = get_cmd_args()

    args.data_path = args.directory + 'PKL/{}_train_clean.pickle'.format(
        args.dataset)
    args.target_path = args.directory + 'PKL/{}_clean_data_indexs.pickle'.format(
        args.dataset)
    args.vocab_path = args.directory + 'PKL/{}_clean_vocab.pickle'.format(
        args.dataset)
    args.model_dir = args.directory + 'Model/char/torch/{}.model'.format(
        args.model_name)
    log_file = args.directory + 'log/' + args.model_name + '_debug.log'
    logging.basicConfig(filename=log_file, filemode='w', level=logging.DEBUG)

    # test

    loaddata = LoadData(args.batch, args.data_path, args.target_path,
                        args.vocab_path, args.tokenizer, args.n_features)
    args.inp_dim = args.out_dim = len(loaddata.vocab)
    args.max_len = loaddata.max_length
    args.vocab = loaddata.vocab
    task = Train(args.inp_dim, args.out_dim, args.embedding_dim,
                 args.enc_units, args.dec_units, args.dropout, args.dropout,
                 args.epoch, args.clip, args.sparse_max, args.tf, args.max_len,
                 args.vocab, args.batch, device)

    if args.mode == 'train':
        logging.info('start training...')
        results = task.start_train(loaddata.train, loaddata.valid,
                                   args.model_dir)

        for k, v in results.items():
            print('{0}: {1}'.format(k, v))
            logging.info('{0}: {1}'.format(k, v))

        extension = '_' + str(args.model_name)

        save_path = args.directory + 'results/'

        # plot accuracy
        plot('accuracy' + extension,
             'epochs',
             'accuracy',
             results['train_acc'],
             results['val_acc'],
             'train accuracy',
             'validation accuracy',
             save_path=save_path)

        # plot loss
        plot('loss' + extension,
             'epochs',
             'loss',
             results['train_loss'],
             results['val_loss'],
             'train loss',
             'validation loss',
             save_path=save_path)

        # plot wer
        plot('wer' + extension,
             'epochs',
             'wer',
             results['wer_ocr'],
             results['wer_after'],
             'wer ocr',
             'val wer',
             save_path=save_path)

        # plot cer
        plot('cer' + extension,
             'epochs',
             'cer',
             results['cer_ocr'],
             results['cer_after'],
             'cer ocr',
             'val cer',
             save_path=save_path)

    else:
        logging.info('start testing...')
        # sent_clean = 'Mohren plagen uns ohne aufhörlich'
        # sent_res = task.translate_sent(loaddata, sent_clean)
        sent_out = task.test(loaddata, loaddata.valid, args.model_dir)

        output_file = args.directory + 'log/' + args.model_name + '_output.txt'
        print('Saving to {0}\n'.format(output_file))
        with open(output_file, 'w', encoding='utf-8') as f:
            for sent_pair in sent_out:
                f.write(sent_pair[0] + ',' + sent_pair[1] + '\n')
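The plot helper called above is not defined in this snippet. Below is a minimal sketch of a two-series plotting function matching the argument order used in the calls (title, x-label, y-label, two series, two legend labels, save directory); it is an assumption based on those calls, not the project's actual implementation.

# Hypothetical sketch of the plot() helper assumed by the calls above.
import os
import matplotlib
matplotlib.use('Agg')  # write figures to disk without a display
import matplotlib.pyplot as plt


def plot(title, xlabel, ylabel, series_a, series_b,
         label_a, label_b, save_path='.'):
    """Plot two per-epoch series on one figure and save it as a PNG."""
    epochs = range(1, len(series_a) + 1)
    plt.figure()
    plt.plot(epochs, series_a, label=label_a)
    plt.plot(epochs, series_b, label=label_b)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.legend()
    plt.savefig(os.path.join(save_path, title + '.png'))
    plt.close()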
Example no. 4
import os
import pygtrie
import re
import sys
import numpy as np
from dataset import LoadData
from helper import GetPrefixLen

query_trie = pygtrie.CharTrie()

dirname = '../data'
filenames = [
    'queries01.train.txt.gz', 'queries02.train.txt.gz',
    'queries03.train.txt.gz', 'queries04.train.txt.gz',
    'queries05.train.txt.gz', 'queries06.train.txt.gz'
]
df = LoadData([os.path.join(dirname, f) for f in filenames], split=False)
z = df.query_.value_counts()
z = z[z > 2]

for q, count in zip(z.index.values, z):
    query_trie[q] = count

cache = {}


def GetTopK(prefix, k=100):
    if prefix in cache:
        return cache[prefix]
    results = query_trie.items(prefix)
    queries, counts = zip(*sorted(results, key=lambda x: -x[-1]))
    cache[prefix] = queries[:k]
    return cache[prefix]
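A short, hypothetical usage sketch for GetTopK (the prefix is made up; note that pygtrie's items() raises KeyError when no stored query starts with the given prefix):

# Hypothetical usage; 'how to' is an arbitrary example prefix.
try:
    for completion in GetTopK('how to', k=10):
        print(completion)
except KeyError:
    print('no stored query starts with this prefix')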
Example no. 5
                    dest='data',
                    default=[data_dir + "queries07.test.txt.gz"],
                    help='where to load the data')
parser.add_argument('--threads',
                    type=int,
                    default=12,
                    help='how many threads to use in tensorflow')
args = parser.parse_args()
expdir = args.expdir

# load the model
metamodel = MetaModel(expdir)
model = metamodel.model
metamodel.MakeSessionAndRestore(args.threads)
# load the data
df = LoadData(args.data)
dataset = Dataset(df,
                  metamodel.char_vocab,
                  metamodel.user_vocab,
                  max_len=metamodel.params.max_len)

total_word_count = 0
total_log_prob = 0
print(len(dataset.df), dataset.batch_size)  # 20999    24
for idx in range(0, int(len(dataset.df) / dataset.batch_size)):
    feed_dict = dataset.GetFeedDict(model)
    # this session is the one restored from the saved model
    c, words_in_batch = metamodel.session.run(
        [model.avg_loss, model.words_in_batch], feed_dict)
    # c is the total loss; words_in_batch is the number of words in this batch
    total_word_count += words_in_batch  # running total of words across batches
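The snippet is cut off before the accumulated totals are used. Below is a minimal, self-contained sketch of how such totals are usually turned into a perplexity; the variable names mirror the snippet, but the formula is an assumption, not the original code.

import math


def perplexity(total_log_prob, total_word_count):
    """Perplexity from a summed (not averaged) per-word log loss.

    In the loop above this would typically be fed by something like
    total_log_prob += c * words_in_batch, assuming c is the mean
    per-word loss for the batch.
    """
    return math.exp(total_log_prob / total_word_count)


# Example with made-up numbers: an average loss of 2.0 nats per word.
print(perplexity(total_log_prob=2.0 * 1000, total_word_count=1000))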
Example no. 6
def main():
    args = get_cmd_args()
    Train_groups, Test_groups = load_corpus(args)
    group_num = len(Test_groups)
    for flag in range(group_num):
        print('Experiment: {}'.format(flag))
        args.model_dir = args.directory + 'Model/{}{}.model'.format(
            args.model_name, flag)
        log_file = args.directory + 'log/{}{}.log'.format(
            args.model_name, flag)
        logging.basicConfig(filename=log_file,
                            filemode='w',
                            level=logging.DEBUG)
        Train_data = Train_groups[flag]
        Test_data = Test_groups[flag]
        loaddata = LoadData(args.tokenizer, args.n_features)
        loaddata.prepare_corpus(Train_data, Test_data)
        args.inp_dim = args.out_dim = len(loaddata.vocab)
        args.max_len = loaddata.max_length
        vocab = loaddata.vocab  # get white space index
        # vocab_reverse = loaddata.vocab_reverse
        print(args.inp_dim, args.max_len, vocab.get(' '))
        # print(type(list(vocab.keys())[10]))
        task = Train(args.inp_dim, args.out_dim, args.embedding_dim,
                     args.enc_units, args.dec_units, args.dropout,
                     args.dropout, args.epoch, args.clip, args.sparse_max,
                     args.tf, loaddata, args.batch, device, args.model_dir)
        if args.mode == 'train':
            logging.info('start training...')
            task.start_train(loaddata.train, loaddata.valid)

            # also test
            logging.info('start testing themselves: ')
            task.test_in_batch(loaddata.test)
            # test other books
            test_others = [i for i in range(group_num) if i != flag]

            for j in test_others:
                logging.info('start testing other books: {}'.format(j))
                Test_other = Test_groups[j]
                test_data = loaddata.prepare_other_corpus(Test_other)
                task.test_in_batch(test_data)
        else:
            logging.info('start testing...')
            task.test_in_batch(loaddata.test)
            test_others = [i for i in range(group_num) if i != flag]

            for j in test_others:
                logging.info('start testing other books: {}'.format(j))
                Test_other = Test_groups[j]
                test_data = loaddata.prepare_other_corpus(Test_other)
                task.test_in_batch(test_data)

            for test in Test_groups:
                test_inp = [t[0] for t in test]
                test_out = [t[1] for t in test]
                translation = task.translate_in_batch(test_inp)
                out = args.directory + 'log/test_text_model{}.txt'.format(flag)
                with open(out, 'a', encoding='utf8') as f:
                    for inp, pred, truth in zip(test_inp, translation,
                                                test_out):
                        f.write(inp.decode(errors='ignore'))
                        f.write('\n')
                        f.write(pred)
                        f.write('\n')
                        f.write(truth.decode(errors='ignore'))
                        f.write('\n\n')
Example no. 7
def loaddata(self, type):
    self.trainset = LoadData(type, transform=None)
    self.load_data = DataLoader(
        self.trainset, batch_size=self.batch_size, shuffle=True)
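A brief, self-contained sketch of how a DataLoader built this way is normally consumed; the stand-in TensorDataset below replaces LoadData, whose item format is not shown in the snippet.

import torch
from torch.utils.data import DataLoader, TensorDataset

# Stand-in dataset: 100 random feature vectors with binary labels.
dataset = TensorDataset(torch.randn(100, 8), torch.randint(0, 2, (100,)))
loader = DataLoader(dataset, batch_size=16, shuffle=True)

for inputs, targets in loader:
    # each iteration yields one shuffled mini-batch
    print(inputs.shape, targets.shape)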
Example no. 8
import os
import pygtrie
import sys
from dataset import LoadData
from helper import GetPrefixLen

import code

query_trie = pygtrie.CharTrie()

dirname = '/g/ssli/data/LowResourceLM/aol'
filenames = [
    'queries01.train.txt.gz', 'queries02.train.txt.gz',
    'queries03.train.txt.gz', 'queries04.train.txt.gz',
    'queries05.train.txt.gz', 'queries06.train.txt.gz'
]
df = LoadData([os.path.join(dirname, f) for f in filenames], split=False)
z = df.query_.value_counts()
z = z[z > 2]

for q, count in zip(z.index.values, z):
    query_trie[q] = count

cache = {}


def GetTopK(prefix, k=100):
    if prefix in cache:
        return cache[prefix]
    results = query_trie.items(prefix)
    queries, counts = zip(*sorted(results, key=lambda x: -x[-1]))
    cache[prefix] = queries[:k]
    return cache[prefix]