Example #1
File: snli.py Project: zyh2011/SACN
def main():
    Logger.GLOBAL_LOG_LEVEL = LogLevel.INFO
    #Config.backend = Backends.TENSORFLOW
    Config.backend = Backends.TORCH
    Config.cuda = True
    Config.dropout = 0.1
    Config.hidden_size = 128
    Config.embedding_size = 256
    Config.L2 = 0.00003

    do_process = False
    if do_process:
        preprocess_SNLI(delete_data=True)


    p = Pipeline('snli_example')
    vocab = p.state['vocab']
    vocab.load_from_disk()

    batch_size = 128
    if Config.backend == Backends.TENSORFLOW:
        from spodernet.backends.tfbackend import TensorFlowConfig
        TensorFlowConfig.init_batch_size(batch_size)
    train_batcher = StreamBatcher('snli_example', 'snli_train', batch_size, randomize=True, loader_threads=8)
    #train_batcher.subscribe_to_batch_prepared_event(SomeExpensivePreprocessing())
    dev_batcher = StreamBatcher('snli_example', 'snli_dev', batch_size)
    test_batcher  = StreamBatcher('snli_example', 'snli_test', batch_size)

    train_batcher.subscribe_to_events(AccuracyHook('Train', print_every_x_batches=1000))
    dev_batcher.subscribe_to_events(AccuracyHook('Dev', print_every_x_batches=1000))
    eta = ETAHook(print_every_x_batches=1000)
    train_batcher.subscribe_to_events(eta)
    train_batcher.subscribe_to_start_of_epoch_event(eta)

    model = Model()
    model.add(Embedding(128, vocab.num_embeddings))
    model.add(PairedBiDirectionalLSTM(128, hidden_size=256, variable_length=True, conditional_encoding=False))
    model.add(SoftmaxCrossEntropy(input_size=256*4, num_labels=3))


    t = Trainer(model)
    for i in range(10):
        t.train(train_batcher, epochs=1)
        t.evaluate(dev_batcher)
Example #2
def main(args, model_path):
    if args.preprocess: preprocess(args.data, delete_data=True)
    input_keys = ['e1', 'rel', 'rel_eval', 'e2', 'e2_multi1', 'e2_multi2']
    p = Pipeline(args.data, keys=input_keys)
    p.load_vocabs()
    vocab = p.state['vocab']

    num_entities = vocab['e1'].num_token

    train_batcher = StreamBatcher(args.data, 'train', args.batch_size, randomize=True, keys=input_keys, loader_threads=args.loader_threads)
    dev_rank_batcher = StreamBatcher(args.data, 'dev_ranking', args.test_batch_size, randomize=False, loader_threads=args.loader_threads, keys=input_keys)
    test_rank_batcher = StreamBatcher(args.data, 'test_ranking', args.test_batch_size, randomize=False, loader_threads=args.loader_threads, keys=input_keys)


    if args.model is None:
        model = ConvE(args, vocab['e1'].num_token, vocab['rel'].num_token)
    elif args.model == 'conve':
        model = ConvE(args, vocab['e1'].num_token, vocab['rel'].num_token)
    elif args.model == 'distmult':
        model = DistMult(args, vocab['e1'].num_token, vocab['rel'].num_token)
    elif args.model == 'complex':
        model = Complex(args, vocab['e1'].num_token, vocab['rel'].num_token)
    elif args.model == 'interacte':
        model = InteractE(args, vocab['e1'].num_token, vocab['rel'].num_token)
    else:
        log.info('Unknown model: {0}', args.model)
        raise Exception("Unknown model!")

    train_batcher.at_batch_prepared_observers.insert(1,TargetIdx2MultiTarget(num_entities, 'e2_multi1', 'e2_multi1_binary'))


    eta = ETAHook('train', print_every_x_batches=args.log_interval)
    train_batcher.subscribe_to_events(eta)
    train_batcher.subscribe_to_start_of_epoch_event(eta)
    train_batcher.subscribe_to_events(LossHook('train', print_every_x_batches=args.log_interval))

    model.cuda()
    if args.resume:
        model_params = torch.load(model_path)
        print(model)
        total_param_size = []
        params = [(key, value.size(), value.numel()) for key, value in model_params.items()]
        for key, size, count in params:
            total_param_size.append(count)
            print(key, size, count)
        print(np.array(total_param_size).sum())
        model.load_state_dict(model_params)
        model.eval()
        ranking_and_hits(model, test_rank_batcher, vocab, 'test_evaluation')
        ranking_and_hits(model, dev_rank_batcher, vocab, 'dev_evaluation')
    else:
        model.init()

    total_param_size = []
    params = [value.numel() for value in model.parameters()]
    print(params)
    print(np.sum(params))

    opt = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.l2)
    for epoch in range(args.epochs):
        model.train()
        for i, str2var in enumerate(train_batcher):
            opt.zero_grad()
            e1 = str2var['e1']
            rel = str2var['rel']
            e2_multi = str2var['e2_multi1_binary'].float()
            # label smoothing
            e2_multi = ((1.0-args.label_smoothing)*e2_multi) + (1.0/e2_multi.size(1))

            pred = model.forward(e1, rel)
            loss = model.loss(pred, e2_multi)
            loss.backward()
            opt.step()

            train_batcher.state.loss = loss.cpu()


        print('saving to {0}'.format(model_path))
        torch.save(model.state_dict(), model_path)

        model.eval()
        with torch.no_grad():
            if epoch % 5 == 0 and epoch > 0:
                ranking_and_hits(model, dev_rank_batcher, vocab, 'dev_evaluation')
                ranking_and_hits(model, test_rank_batcher, vocab, 'test_evaluation')
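The label smoothing line in the training loop above turns the hard 0/1 multi-label targets into softened values before the loss is computed. A minimal standalone sketch of the same arithmetic on a toy tensor (the numbers are illustrative only):

import torch

# Toy multi-hot target row over 10 candidate entities (illustrative values only).
e2_multi = torch.tensor([[0., 1., 0., 0., 0., 0., 0., 0., 0., 1.]])
label_smoothing = 0.1

# Same formula as in the loop above: scale the targets down by (1 - eps) and
# add a uniform 1/num_entities term to every entry.
smoothed = (1.0 - label_smoothing) * e2_multi + 1.0 / e2_multi.size(1)
# zeros become 0.10 and ones become 1.00 in this 10-entity toy case
print(smoothed)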
Example #3
def preprocess(dataset_name, delete_data=False):
    full_path = 'data/{0}/e1rel_to_e2_full.json'.format(dataset_name)
    train_path = 'data/{0}/e1rel_to_e2_train.json'.format(dataset_name)
    dev_ranking_path = 'data/{0}/e1rel_to_e2_ranking_dev.json'.format(dataset_name)
    test_ranking_path = 'data/{0}/e1rel_to_e2_ranking_test.json'.format(dataset_name)

    keys2keys = {}
    keys2keys['e1'] = 'e1' # entities
    keys2keys['rel'] = 'rel' # relations
    keys2keys['rel_eval'] = 'rel' # relations
    keys2keys['e2'] = 'e1' # entities
    keys2keys['e2_multi1'] = 'e1' # entity
    keys2keys['e2_multi2'] = 'e1' # entity
    input_keys = ['e1', 'rel', 'rel_eval', 'e2', 'e2_multi1', 'e2_multi2']
    d = DatasetStreamer(input_keys)
    d.add_stream_processor(JsonLoaderProcessors())
    d.add_stream_processor(DictKey2ListMapper(input_keys))

    # process full vocabulary and save it to disk
    d.set_path(full_path)
    p = Pipeline(dataset_name, delete_data, keys=input_keys, skip_transformation=True)
    p.add_sent_processor(ToLower())
    p.add_sent_processor(CustomTokenizer(lambda x: x.split(' ')),keys=['e2_multi1', 'e2_multi2'])
    p.add_token_processor(AddToVocab())
    p.execute(d)
    p.save_vocabs()


    # process train, dev and test sets and save them to hdf5
    p.skip_transformation = False
    for path, name in zip([train_path, dev_ranking_path, test_ranking_path], ['train', 'dev_ranking', 'test_ranking']):
        d.set_path(path)
        p.clear_processors()
        p.add_sent_processor(ToLower())
        p.add_sent_processor(CustomTokenizer(lambda x: x.split(' ')),keys=['e2_multi1', 'e2_multi2'])
        p.add_post_processor(ConvertTokenToIdx(keys2keys=keys2keys), keys=['e1', 'rel', 'rel_eval', 'e2', 'e2_multi1', 'e2_multi2'])
        p.add_post_processor(StreamToHDF5(name, samples_per_file=1000, keys=input_keys))
        p.execute(d)
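The DatasetStreamer above reads one JSON object per line and maps it onto the listed input_keys; e2_multi1 and e2_multi2 are later split on spaces, so they are expected to hold space-separated entity lists. A hypothetical example line, with field values that are an assumption for illustration only:

# One line of e1rel_to_e2_train.json might look like this (hypothetical values):
# {"e1": "q1", "rel": "located_in", "rel_eval": "located_in_reverse",
#  "e2": "q2", "e2_multi1": "q2 q3 q4", "e2_multi2": "q1"}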
Example #4
def main():
    if Config.process: preprocess(Config.dataset, delete_data=True)
    input_keys = ['e1', 'rel', 'e2', 'e2_multi1', 'e2_multi2']
    p = Pipeline(Config.dataset, keys=input_keys)
    p.load_vocabs()
    vocab = p.state['vocab']

    num_entities = vocab['e1'].num_token
    dict_tokentoid, dict_idtotoken = vocab['e1'].tokendicts()
    dict_reltoid, dict_idtorel = vocab['rel'].tokendicts()

    num_rel = vocab['rel'].num_token 
    train_batcher = StreamBatcher(Config.dataset, 'train', Config.batch_size, randomize=True, keys=input_keys)
    dev_rank_batcher = StreamBatcher(Config.dataset, 'dev_ranking', Config.batch_size, randomize=False, loader_threads=4, keys=input_keys, is_volatile=True)
    test_rank_batcher = StreamBatcher(Config.dataset, 'test_ranking', Config.batch_size, randomize=False, loader_threads=4, keys=input_keys, is_volatile=True)


    if Config.model_name is None:
        model = ConvE(vocab['e1'].num_token, vocab['rel'].num_token)
    elif Config.model_name == 'ConvE':
        model = ConvE(vocab['e1'].num_token, num_rel)
    elif Config.model_name == 'DistMult':
        model = DistMult(vocab['e1'].num_token, vocab['rel'].num_token)
    elif Config.model_name == 'ComplEx':
        model = Complex(vocab['e1'].num_token, vocab['rel'].num_token)
    else:
        log.info('Unknown model: {0}', Config.model_name)
        raise Exception("Unknown model!")

    train_batcher.at_batch_prepared_observers.insert(1,TargetIdx2MultiTarget(num_entities, 'e2_multi1', 'e2_multi1_binary'))


    eta = ETAHook('train', print_every_x_batches=100)
    train_batcher.subscribe_to_events(eta)
    train_batcher.subscribe_to_start_of_epoch_event(eta)
    train_batcher.subscribe_to_events(LossHook('train', print_every_x_batches=100))

    if Config.cuda:
        model.cuda()
    if load:
        model_params = torch.load(model_path)
        print(model)
        total_param_size = []
        params = [(key, value.size(), value.numel()) for key, value in model_params.items()]
        for key, size, count in params:
            total_param_size.append(count)
            print(key, size, count)
        print(np.array(total_param_size).sum())
        model.load_state_dict(model_params)
        model.eval()
        ranking_and_hits(model, test_rank_batcher, vocab, 'test_evaluation')
        ranking_and_hits(model, dev_rank_batcher, vocab, 'dev_evaluation')
    else:
        model.init()

    total_param_size = []
    params = [value.numel() for value in model.parameters()]
    print(params)
    print(np.sum(params))

################################################ loading
    model.load_state_dict(torch.load('embeddings/auto-embeddings.pt'))


    opt = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=Config.learning_rate, weight_decay=Config.L2)
    # One hot encoding buffer that you create out of the loop and just keep reusing
    y_onehot_e1 = torch.FloatTensor(Config.batch_size, num_entities)
    # One hot encoding buffer that you create out of the loop and just keep reusing
    y_onehot_r = torch.FloatTensor(Config.batch_size, num_rel)

    model.eval()
    train_data =[]
    with open('data/'+Config.dataset+'/train.txt', 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            e1, rel, e2 = line.split('\t')
            e1 = e1.strip()#.lower()
            e2 = e2.strip()#.lower()
            rel = rel.strip()#.lower()
            train_data += [[e1, rel, e2]]
    print(len(train_data))

    attack_list = []
    E2_list = []
    with open('data/'+Config.dataset+'/test.txt', 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            e1, rel, e2 = line.split('\t')
            e1 = e1.strip().lower()
            e2 = e2.strip().lower()
            rel = rel.strip().lower()
            attack_list += [[dict_tokentoid[e1], dict_reltoid[rel], dict_tokentoid[e2]]]
            E2_list += [e2]

    print(len(attack_list))
    E2_list = set(E2_list)
    E2_dict = {}
    for i in train_data:
        if i[2].lower() in E2_list:
            if dict_tokentoid[i[2].lower()] in E2_dict: 
                E2_dict[dict_tokentoid[i[2].lower()]] += [[dict_tokentoid[i[0].lower()], dict_reltoid[i[1].lower()]]]
            else:
                E2_dict[dict_tokentoid[i[2].lower()]] = [[dict_tokentoid[i[0].lower()], dict_reltoid[i[1].lower()]]]


    str_at = []
    embd_e = model.emb_e.weight.data.cpu().numpy()
    embd_rel = model.emb_rel.weight.data.cpu().numpy()

    n_t = 0
    for trip in attack_list:
        if n_t % 500 == 0:
            print('Number of processed triples:', n_t)

        n_t += 1
        e1 = trip[0]
        rel = trip[1]
        e2_or = trip[2]
        e1 = torch.cuda.LongTensor([e1])
        rel = torch.cuda.LongTensor([rel])
        e2 = torch.cuda.LongTensor([e2_or])
        pred = model.encoder(e1, rel)
        E2 = model.encoder_2(e2)
        
        A, B = find_best_at(-pred, E2)
        attack_ext = -A*pred+B*E2
        if e2_or in E2_dict:
            nei = E2_dict[e2_or]
            #attack = find_best_attack(E2.data.cpu().numpy(), pred.data.cpu().numpy(), nei, embd_e, embd_rel, attack_ext)
            #attack = torch.autograd.Variable(torch.from_numpy(attack)).cuda().float()
            attack = attack_ext

        else: 
            attack = attack_ext
        E1, R = model.decoder(attack)
        _, predicted_e1 = torch.max(E1, 1)
        _, predicted_R = torch.max(R, 1)

        str_at += [[str(dict_idtotoken[predicted_e1.data.cpu().numpy()[0]]), str(dict_idtorel[predicted_R.data.cpu().numpy()[0]]), str(dict_idtotoken[e2_or])]]


    new_train = str_at + train_data
    print(len(new_train))
    with open('data/new_'+Config.dataset+'/train.txt', 'w') as f:
        for item in new_train:
            f.write("%s\n" % "\t".join(map(str, item)))
Example #5
def main():
    if Config.process: preprocess(Config.dataset, delete_data=True)
    input_keys = ['e1', 'rel', 'rel_eval', 'e2', 'e2_multi1', 'e2_multi2']
    p = Pipeline(Config.dataset, keys=input_keys)
    p.load_vocabs()
    vocab = p.state['vocab']

    num_entities = vocab['e1'].num_token

    train_batcher = StreamBatcher(Config.dataset,
                                  'train',
                                  Config.batch_size,
                                  randomize=True,
                                  keys=input_keys)
    dev_rank_batcher = StreamBatcher(Config.dataset,
                                     'dev_ranking',
                                     Config.batch_size,
                                     randomize=False,
                                     loader_threads=4,
                                     keys=input_keys)
    test_rank_batcher = StreamBatcher(Config.dataset,
                                      'test_ranking',
                                      Config.batch_size,
                                      randomize=False,
                                      loader_threads=4,
                                      keys=input_keys)

    if Config.model_name is None:
        model = ConvE(vocab['e1'].num_token, vocab['rel'].num_token)
    elif Config.model_name == 'ConvE':
        model = ConvE(vocab['e1'].num_token, vocab['rel'].num_token)
    elif Config.model_name == 'DistMult':
        model = DistMult(vocab['e1'].num_token, vocab['rel'].num_token)
    elif Config.model_name == 'ComplEx':
        model = Complex(vocab['e1'].num_token, vocab['rel'].num_token)
    else:
        log.info('Unknown model: {0}', Config.model_name)
        raise Exception("Unknown model!")

    train_batcher.at_batch_prepared_observers.insert(
        1, TargetIdx2MultiTarget(num_entities, 'e2_multi1',
                                 'e2_multi1_binary'))

    eta = ETAHook('train', print_every_x_batches=100)
    train_batcher.subscribe_to_events(eta)
    train_batcher.subscribe_to_start_of_epoch_event(eta)
    train_batcher.subscribe_to_events(
        LossHook('train', print_every_x_batches=100))

    if Config.cuda:
        model.cuda()
    if load:
        model_params = torch.load(model_path)
        print(model)
        total_param_size = []
        params = [(key, value.size(), value.numel())
                  for key, value in model_params.items()]
        for key, size, count in params:
            total_param_size.append(count)
            print(key, size, count)
        print(np.array(total_param_size).sum())
        model.load_state_dict(model_params)
        model.eval()
        ranking_and_hits(model, test_rank_batcher, vocab, 'test_evaluation')
        ranking_and_hits(model, dev_rank_batcher, vocab, 'dev_evaluation')
    else:
        model.init()

    total_param_size = []
    params = [value.numel() for value in model.parameters()]
    print(params)
    print(np.sum(params))

    max_mrr = 0
    count = 0
    max_count = 3
    opt = torch.optim.Adam(model.parameters(),
                           lr=Config.learning_rate,
                           weight_decay=Config.L2)
    for epoch in range(1, epochs + 1):
        model.train()
        for i, str2var in enumerate(train_batcher):
            opt.zero_grad()
            e1 = str2var['e1']
            rel = str2var['rel']
            e2_multi = str2var['e2_multi1_binary'].float()
            # label smoothing
            e2_multi = ((1.0 - Config.label_smoothing_epsilon) *
                        e2_multi) + (1.0 / e2_multi.size(1))

            pred = model.forward(e1, rel)
            loss = model.loss(pred, e2_multi)
            loss.backward()
            opt.step()

            train_batcher.state.loss = loss.cpu()

        print('saving to {0}'.format(model_path))
        torch.save(model.state_dict(), model_path)

        model.eval()
        with torch.no_grad():
            # ranking_and_hits(model, dev_rank_batcher, vocab, 'dev_evaluation')
            if epoch % 15 == 0:
                mrr = ranking_and_hits(model, dev_rank_batcher, vocab,
                                       'dev_evaluation')
                if mrr <= max_mrr:
                    count += 1
                    if count > max_count:
                        break
                else:
                    count = 0
                    max_mrr = mrr
    mrr_test = ranking_and_hits(model, test_rank_batcher, vocab,
                                'test_evaluation')
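The max_mrr / count / max_count bookkeeping above is a simple patience scheme: evaluate the dev MRR every 15 epochs and stop once it has failed to improve max_count times in a row. A minimal standalone sketch of that logic, with stand-in callables in place of the training step and ranking_and_hits:

def train_with_patience(train_one_epoch, evaluate, num_epochs, eval_every=15, patience=3):
    """Stop once the dev metric has not improved `patience` evaluations in a row."""
    best = 0.0
    bad_evals = 0
    for epoch in range(1, num_epochs + 1):
        train_one_epoch()
        if epoch % eval_every == 0:
            metric = evaluate()      # e.g. dev MRR
            if metric <= best:
                bad_evals += 1
                if bad_evals > patience:
                    break            # no improvement for too long, give up
            else:
                bad_evals = 0
                best = metric
    return best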
Example #6
def main(args):

    if args.preprocess:
        print('start preprocessing', flush=True)
        preprocess(args, delete_data=True)
        print('finish preprocessing', flush=True)

    else:
        input_keys = ['e1', 'rel', 'rel_eval', 'e2', 'e2_multi1', 'e2_multi2']
        p = Pipeline(args.data, keys=input_keys)
        print(time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.localtime()) +
              ': start loading vocabs',
              flush=True)
        p.load_vocabs()
        print(time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.localtime()) +
              ': finish loading vocabs',
              flush=True)
        vocab = p.state['vocab']
        num_entities = vocab['e1'].num_token

        train_batcher = StreamBatcher(args.data,
                                      'train',
                                      args.batch_size,
                                      randomize=True,
                                      keys=input_keys,
                                      loader_threads=args.loader_threads)
        model = DistMult(args, vocab['e1'].num_token, vocab['rel'].num_token)
        train_batcher.at_batch_prepared_observers.insert(
            1,
            TargetIdx2MultiTarget(num_entities, 'e2_multi1',
                                  'e2_multi1_binary'))

        #         eta = ETAHook('train', print_every_x_batches=args.log_interval)
        #         train_batcher.subscribe_to_events(eta)
        #         train_batcher.subscribe_to_start_of_epoch_event(eta)
        #         train_batcher.subscribe_to_events(LossHook('train', print_every_x_batches=args.log_interval))

        model.cuda()
        model.init()

        total_param_size = []
        params = [value.numel() for value in model.parameters()]
        print(params, flush=True)
        print(np.sum(params), flush=True)

        opt = torch.optim.Adam(model.parameters(),
                               lr=args.lr,
                               weight_decay=args.l2)
        print(time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.localtime()) +
              f': start training with epochs = {args.epochs}',
              flush=True)
        for epoch in range(args.epochs):
            model.train()
            #             sampled_batches = set(np.random.choice(train_batcher.num_batches, args.num_batches, replace=False))
            #             print(time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.localtime()) + f': start epoch {epoch} with batches = {len(sampled_batches)} out of {train_batcher.num_batches}', flush=True)
            #             processed_count = 0
            for i, str2var in enumerate(train_batcher):
                #                 if i not in sampled_batches: continue
                #                 if processed_count%int(args.num_batches/1000)==0: print(time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.localtime()) + f': start epoch {epoch} batch {i} = {processed_count}', flush=True)
                #                 processed_count += 1
                opt.zero_grad()
                e1 = str2var['e1']
                rel = str2var['rel']
                e2_multi = str2var['e2_multi1_binary'].float()
                e2_multi = ((1.0 - args.label_smoothing) *
                            e2_multi) + (1.0 / e2_multi.size(1))

                pred = model.forward(e1, rel)
                loss = model.loss(pred, e2_multi)
                loss.backward()
                opt.step()


#                 train_batcher.state.loss = loss.cpu()

            print(time.strftime("%a, %d %b %Y %H:%M:%S +0000",
                                time.localtime()) +
                  f': finish training epoch {epoch}',
                  flush=True)

        model.eval()
        output(args, vocab['e1'], model.emb_e.weight.detach().cpu().numpy())
Example #7
def preprocess(args, delete_data=False):

    keys2keys = {}
    keys2keys['e1'] = 'e1'  # entities
    keys2keys['rel'] = 'rel'  # relations
    keys2keys['rel_eval'] = 'rel'  # relations
    keys2keys['e2'] = 'e1'  # entities
    keys2keys['e2_multi1'] = 'e1'  # entity
    keys2keys['e2_multi2'] = 'e1'  # entity
    input_keys = ['e1', 'rel', 'rel_eval', 'e2', 'e2_multi1', 'e2_multi2']
    print('create dataset streamer', flush=True)
    d = DatasetStreamer(input_keys)
    d.add_stream_processor(JsonLoaderProcessors())
    d.add_stream_processor(DictKey2ListMapper(input_keys))

    # process full vocabulary and save it to disk
    d.set_path(args.train_path)
    print('create pipeline', flush=True)
    p = Pipeline(args.data,
                 delete_data,
                 keys=input_keys,
                 skip_transformation=True)
    p.add_sent_processor(ToLower())
    p.add_sent_processor(CustomTokenizer(lambda x: x.split(' ')),
                         keys=['e2_multi1', 'e2_multi2'])
    p.add_token_processor(AddToVocab())
    print('execute full vocabs', flush=True)
    p.execute(d)
    print('save full vocabs', flush=True)
    p.save_vocabs()

    # process train sets and save them to hdf5
    p.skip_transformation = False
    d.set_path(args.train_path)
    p.clear_processors()
    p.add_sent_processor(ToLower())
    p.add_sent_processor(CustomTokenizer(lambda x: x.split(' ')),
                         keys=['e2_multi1', 'e2_multi2'])
    p.add_post_processor(
        ConvertTokenToIdx(keys2keys=keys2keys),
        keys=['e1', 'rel', 'rel_eval', 'e2', 'e2_multi1', 'e2_multi2'])
    p.add_post_processor(
        StreamToHDF5('train', samples_per_file=1000, keys=input_keys))
    print('execute and save train vocabs', flush=True)
    p.execute(d)
Example #8
def main():
    if do_process: preprocess(dataset_name, delete_data=True)
    input_keys = ['e1', 'rel', 'e2', 'e2_multi1', 'e2_multi2']
    p = Pipeline(dataset_name, keys=input_keys)
    p.load_vocabs()
    vocab = p.state['vocab']

    num_entities = vocab['e1'].num_token

    train_batcher = StreamBatcher(dataset_name,
                                  'train',
                                  Config.batch_size,
                                  randomize=True,
                                  keys=input_keys)
    dev_rank_batcher = StreamBatcher(dataset_name,
                                     'dev_ranking',
                                     Config.batch_size,
                                     randomize=False,
                                     loader_threads=4,
                                     keys=input_keys,
                                     is_volatile=True)
    test_rank_batcher = StreamBatcher(dataset_name,
                                      'test_ranking',
                                      Config.batch_size,
                                      randomize=False,
                                      loader_threads=4,
                                      keys=input_keys,
                                      is_volatile=True)

    #model = Complex(vocab['e1'].num_token, vocab['rel'].num_token)
    #model = DistMult(vocab['e1'].num_token, vocab['rel'].num_token)
    model = ConvE(vocab['e1'].num_token, vocab['rel'].num_token)

    train_batcher.at_batch_prepared_observers.insert(
        1, TargetIdx2MultiTarget(num_entities, 'e2_multi1',
                                 'e2_multi1_binary'))

    eta = ETAHook('train', print_every_x_batches=100)
    train_batcher.subscribe_to_events(eta)
    train_batcher.subscribe_to_start_of_epoch_event(eta)
    train_batcher.subscribe_to_events(
        LossHook('train', print_every_x_batches=100))

    if Config.cuda:
        model.cuda()
    if load:
        model_params = torch.load(model_path)
        print(model)
        print([(key, value.size()) for key, value in model_params.items()])
        model.load_state_dict(model_params)
        model.eval()
        ranking_and_hits(model, test_rank_batcher, vocab, 'test_evaluation')
        ranking_and_hits(model, dev_rank_batcher, vocab, 'dev_evaluation')
    else:
        model.init()

    opt = torch.optim.Adam(model.parameters(),
                           lr=Config.learning_rate,
                           weight_decay=Config.L2)
    for epoch in range(epochs):
        model.train()
        for i, str2var in enumerate(train_batcher):
            opt.zero_grad()
            e1 = str2var['e1']
            rel = str2var['rel']
            e2_multi = str2var['e2_multi1_binary'].float()
            # label smoothing
            e2_multi = ((1.0 - Config.label_smoothing_epsilon) *
                        e2_multi) + (1.0 / e2_multi.size(1))

            pred = model.forward(e1, rel)
            loss = model.loss(pred, e2_multi)
            loss.backward()
            opt.step()

            train_batcher.state.loss = loss

        print('saving to {0}'.format(model_path))
        torch.save(model.state_dict(), model_path)

        model.eval()
        ranking_and_hits(model, dev_rank_batcher, vocab, 'dev_evaluation')
        if epoch % 3 == 0:
            if epoch > 0:
                ranking_and_hits(model, test_rank_batcher, vocab,
                                 'test_evaluation')
Example #9
def main():
    if Config.process: preprocess(Config.dataset, delete_data=True)
    train_triples_path = path_root + 'data/{0}/train.txt'.format(
        Config.dataset)
    # dev_triples_path = 'data/{0}/valid.txt'.format(Config.dataset)  # used for development
    test_triples_path = path_root + 'data/{0}/test.txt'.format(Config.dataset)

    input_keys = ['e1', 'rel', 'rel_eval', 'e2', 'e2_multi1', 'e2_multi2']
    p = Pipeline(Config.dataset, keys=input_keys)
    p.load_vocabs()
    vocab = p.state['vocab']
    num_entities = vocab['e1'].num_token
    train_batcher = StreamBatcher(Config.dataset,
                                  'train',
                                  Config.batch_size,
                                  randomize=True,
                                  keys=input_keys)
    dev_rank_batcher = StreamBatcher(Config.dataset,
                                     'dev_ranking',
                                     Config.batch_size,
                                     randomize=False,
                                     loader_threads=4,
                                     keys=input_keys)
    test_rank_batcher = StreamBatcher(Config.dataset,
                                      'test_ranking',
                                      Config.batch_size,
                                      randomize=False,
                                      loader_threads=4,
                                      keys=input_keys)

    allRels = get_AllRels(vocab)
    allEntTokens = get_AllEntities(vocab)

    if Config.model_name is None:
        model = ConvE(vocab['e1'].num_token, vocab['rel'].num_token)
    elif Config.model_name == 'ConvE':

        if not test:
            model = ConvE(vocab['e1'].num_token, vocab['rel'].num_token,
                          allEntTokens, allRels)
        else:
            if testEntGraph:
                types2E, types2rel2idx, types2rels = read_graphs(
                    gpath, f_post_fix, featIdx, isCCG, lower)
                model = ConvE(vocab['e1'].num_token, vocab['rel'].num_token,
                              allEntTokens, allRels, types2E, types2rel2idx,
                              types2rels)
            else:
                model = ConvE(vocab['e1'].num_token, vocab['rel'].num_token,
                              allEntTokens, allRels)
    elif Config.model_name == 'DistMult':
        model = DistMult(vocab['e1'].num_token, vocab['rel'].num_token)
    elif Config.model_name == 'ComplEx':
        model = Complex(vocab['e1'].num_token, vocab['rel'].num_token)
    else:
        print('Unknown model: {0}'.format(Config.model_name))
        raise Exception("Unknown model!")

    train_batcher.at_batch_prepared_observers.insert(
        1, TargetIdx2MultiTarget(num_entities, 'e2_multi1',
                                 'e2_multi1_binary'))

    eta = ETAHook('train', print_every_x_batches=100)
    train_batcher.subscribe_to_events(eta)
    train_batcher.subscribe_to_start_of_epoch_event(eta)
    train_batcher.subscribe_to_events(
        LossHook('train', print_every_x_batches=100))

    if Config.cuda:
        model.cuda()
    if load:
        model_params = torch.load(model_path)

        print(model)
        total_param_size = []
        params = [(key, value.size(), value.numel())
                  for key, value in model_params.items()]
        for key, size, count in params:
            total_param_size.append(count)
            print(key, size, count)
        print(np.array(total_param_size).sum())
        model.load_state_dict(model_params)
        if test:
            if computeAllProbs:
                fout_probs = open(Config.probs_file_path, 'w')
                model.eval()
                with torch.no_grad():
                    compute_probs(model, test_rank_batcher, vocab,
                                  'test_probs', fout_probs, test_triples_path)
            else:
                model.eval()
                with torch.no_grad():
                    if testEntGraph:
                        # ranking_and_hits_entGraph(model, dev_rank_batcher, vocab, relW2idx, Config.model_name, 'dev_evaluation',train_triples_path, 20)
                        ranking_and_hits_entGraph(model, test_rank_batcher,
                                                  vocab, 'test_evaluation',
                                                  train_triples_path, 20)
                    else:
                        # ranking_and_hits(model, dev_rank_batcher, vocab, 'dev_evaluation', train_triples_path, 20)
                        ranking_and_hits(model, test_rank_batcher, vocab,
                                         'test_evaluation', train_triples_path,
                                         20)
            return

    else:
        model.init()

    params = [value.numel() for value in model.parameters()]

    print(params)
    print(np.sum(params))

    opt = torch.optim.Adam(model.parameters(),
                           lr=Config.learning_rate,
                           weight_decay=Config.L2)
    for epoch in range(epochs):
        model.train()
        for i, str2var in enumerate(train_batcher):
            opt.zero_grad()
            e1 = str2var['e1']
            rel = str2var['rel']
            e2_multi = str2var['e2_multi1_binary'].float()
            # label smoothing
            e2_multi = ((1.0 - Config.label_smoothing_epsilon) *
                        e2_multi) + (1.0 / e2_multi.size(1))
            pred = model.forward(e1, rel)
            loss = model.loss(pred, e2_multi)
            loss.backward()
            opt.step()

            train_batcher.state.loss = loss.cpu()

        if save:
            print('saving to {0}'.format(model_path))
            if not os.path.isdir(path_root + 'saved_models'):
                os.mkdir(path_root + 'saved_models')
            torch.save(model.state_dict(), model_path)

        model.eval()

        if epoch % 5 == 0:
            with torch.no_grad():
                ranking_and_hits(model, dev_rank_batcher, vocab,
                                 'dev_evaluation', train_triples_path, 20)
                if epoch % 10 == 0:  #This was 10
                    if epoch > 0:
                        ranking_and_hits(model, test_rank_batcher, vocab,
                                         'test_evaluation', train_triples_path,
                                         20)
                        # Let's write the rel embeddings!

        if model_name == "ConvE":
            fout = open(
                'ents2emb_tmp_' + Config.model_name + '_' + Config.dataset +
                '.txt', 'w')
            lookup_tensor = torch.tensor(
                [i for i in range(vocab['e1'].num_token)],
                dtype=torch.long).to('cuda')
            emb_e = model.emb_e(lookup_tensor).cpu().detach().numpy()
            for i in range(vocab['e1'].num_token):
                fout.write(vocab['e1'].idx2token[i] + '\t' + str(emb_e[i]) +
                           '\n')

            fout.close()

            fout = open(
                'rels2emb_' + Config.model_name + '_' + Config.dataset +
                '_tmp.txt', 'w')
            for i in range(vocab['rel'].num_token):
                if i in model.relIdx2Embed:
                    fout.write(vocab['rel'].idx2token[i] + '\t' +
                               str(model.relIdx2Embed[i]) + '\n')
            fout.close()

    if model_name == "ConvE":
        #Let's write the final rel embeddings!
        fout = open(
            'rels2emb_' + Config.model_name + '_' + Config.dataset + '.txt',
            'w')
        for i in range(vocab['rel'].num_token):
            if i in model.relIdx2Embed:
                fout.write(vocab['rel'].idx2token[i] + '\t' +
                           str(model.relIdx2Embed[i]) + '\n')
            else:
                print("doesn't have: ", vocab['rel'].idx2token[i])
Example #10
def main(args, model_path):
    if args.preprocess: preprocess(args.data, delete_data=True)
    input_keys = ['e1', 'rel', 'rel_eval', 'e2', 'e2_multi1', 'e2_multi2']
    p = Pipeline(args.data, keys=input_keys)
    p.load_vocabs()
    vocab = p.state['vocab']  # the data is stored as objects; this is the Vocab object from spodernet

    num_entities = vocab['e1'].num_token  # total number of entities (senses)
    # build the three batchers
    train_batcher = StreamBatcher(args.data,
                                  'train',
                                  args.batch_size,
                                  randomize=True,
                                  keys=input_keys,
                                  loader_threads=args.loader_threads)
    dev_rank_batcher = StreamBatcher(args.data,
                                     'dev_ranking',
                                     args.test_batch_size,
                                     randomize=False,
                                     loader_threads=args.loader_threads,
                                     keys=input_keys)
    test_rank_batcher = StreamBatcher(args.data,
                                      'test_ranking',
                                      args.test_batch_size,
                                      randomize=False,
                                      loader_threads=args.loader_threads,
                                      keys=input_keys)

    model = ConvE(args, vocab['e1'].num_token, vocab['rel'].num_token)

    train_batcher.at_batch_prepared_observers.insert(
        1, TargetIdx2MultiTarget(num_entities, 'e2_multi1',
                                 'e2_multi1_binary'))

    # this part presumably serves to register callbacks invoked during training
    eta = ETAHook('train', print_every_x_batches=args.log_interval)
    train_batcher.subscribe_to_events(eta)
    train_batcher.subscribe_to_start_of_epoch_event(eta)
    train_batcher.subscribe_to_events(
        LossHook('train', print_every_x_batches=args.log_interval))

    P = Preprocessor("../external/wordnet-mlj12")
    tokenidx_to_synset = vocab['e1'].idx2token

    encoder = DefinitionEncoder()
    encoder.cuda()
    model.cuda()
    if args.initialize:
        model_params = torch.load(args.initialize)
        print(model)
        total_param_size = []
        params = [(key, value.size(), value.numel())
                  for key, value in model_params.items()]
        for key, size, count in params:
            total_param_size.append(count)
            print(key, size, count)
        print(np.array(total_param_size).sum())
        model.load_state_dict(model_params)
        model.eval()
        ranking_and_hits(model, test_rank_batcher, vocab, 'test_evaluation')
        ranking_and_hits(model, dev_rank_batcher, vocab, 'dev_evaluation')
        # assign the definition encoder, although no 'encoder' attribute was found among the model's attributes
        model.encoder = encoder
        model.encoder.init()
    elif args.resume:
        model.encoder = encoder
        model_params = torch.load(model_path)
        print(model)
        total_param_size = []
        params = [(key, value.size(), value.numel())
                  for key, value in model_params.items()]
        for key, size, count in params:
            total_param_size.append(count)
            print(key, size, count)
        print(np.array(total_param_size).sum())
        model.load_state_dict(model_params)
        model.eval()
        ranking_and_hits(model, test_rank_batcher, vocab, 'test_evaluation',
                         tokenidx_to_synset, P.get_batch)
        ranking_and_hits(model, dev_rank_batcher, vocab, 'dev_evaluation',
                         tokenidx_to_synset, P.get_batch)
    else:
        model.encoder = encoder
        model.encoder.init()
        model.init()

    total_param_size = []
    params = [value.numel() for value in model.parameters()]
    print(params)
    print(np.sum(params))

    opt = torch.optim.Adam(model.parameters(),
                           lr=args.lr,
                           weight_decay=args.l2)
    best_dev_mrr = 0

    model.eval()
    dev_mrr = ranking_and_hits(model, dev_rank_batcher, vocab,
                               'dev_evaluation', tokenidx_to_synset,
                               P.get_batch)
    # prepare for training
    for epoch in range(args.epochs):
        model.train()
        for i, str2var in enumerate(train_batcher):
            opt.zero_grad()
            e1 = str2var['e1']
            rel = str2var['rel']

            e1_tokens = [
                tokenidx_to_synset[idx]
                for idx in e1.detach().cpu().numpy().ravel()
            ]
            batch, lengths = P.get_batch(e1_tokens)

            # e1_emb is the embedding obtained from the BiLSTM
            e1_emb = model.encoder((batch.cuda(), lengths))[0]

            e2_multi = str2var['e2_multi1_binary'].float()
            # label smoothing
            e2_multi = ((1.0 - args.label_smoothing) *
                        e2_multi) + (1.0 / e2_multi.size(1))

            # feed the encoded e1 embedding into the model
            pred = model.forward(e1_emb, rel, e1_encoded=True)
            loss = model.loss(pred, e2_multi)
            loss.backward()
            opt.step()

            train_batcher.state.loss = loss.cpu()

        #saving on improvement in dev score
        #print('saving to {0}'.format(model_path))
        #torch.save(model.state_dict(), model_path)

        model.eval()
        with torch.no_grad():
            if epoch % 5 == 0 and epoch > 0:
                dev_mrr = ranking_and_hits(model, dev_rank_batcher, vocab,
                                           'dev_evaluation',
                                           tokenidx_to_synset, P.get_batch)
                if dev_mrr > best_dev_mrr:
                    print('saving to {} MRR {}->{}'.format(
                        model_path, best_dev_mrr, dev_mrr))
                    best_dev_mrr = dev_mrr
                    torch.save(model.state_dict(), model_path)

            if epoch % 5 == 0:
                if epoch > 0:
                    ranking_and_hits(model, test_rank_batcher, vocab,
                                     'test_evaluation', tokenidx_to_synset,
                                     P.get_batch)

    if args.represent:
        P = Preprocessor()
        synsets = [P.idx_to_synset[idx] for idx in range(len(P.idx_to_synset))]
        embeddings = []
        embeddings_proj = []
        for i in tqdm(range(0, len(synsets), args.test_batch_size)):
            synsets_batch = synsets[i:i + args.test_batch_size]
            with torch.no_grad():
                batch, lengths = P.get_batch(synsets_batch)
                emb_proj, emb = model.encoder((batch.cuda(), lengths))
                embeddings_proj.append(emb_proj.detach().cpu())
                embeddings.append(emb.detach().cpu())
        embeddings = torch.cat(embeddings, 0).numpy()
        embeddings_proj = torch.cat(embeddings_proj, 0).numpy()
        print('embeddings', embeddings.shape, embeddings_proj.shape)
        basename, ext = os.path.splitext(args.represent)
        fname = args.represent
        np.savez_compressed(fname, embeddings=embeddings, synsets=synsets)
        fname = basename + '_projected' + ext
        np.savez_compressed(fname, embeddings=embeddings_proj, synsets=synsets)
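The two np.savez_compressed calls at the end store the plain and projected synset embeddings next to their synset identifiers. A short usage sketch for reading one of the archives back (the file name here is hypothetical; it is whatever was passed as args.represent):

import numpy as np

# Load an archive written by np.savez_compressed above; allow_pickle is only
# needed if the synsets were stored as Python objects rather than plain strings.
data = np.load('synset_embeddings.npz', allow_pickle=True)  # hypothetical file name
embeddings = data['embeddings']   # one row per synset
synsets = data['synsets']         # identifiers aligned with the rows of embeddings
print(embeddings.shape, len(synsets))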
Example #11
def main():
    if Config.process: preprocess(Config.dataset, delete_data=True)
    input_keys = ['e1', 'rel', 'e2', 'e2_multi1', 'e2_multi2']
    p = Pipeline(Config.dataset, keys=input_keys)
    p.load_vocabs()
    vocab = p.state['vocab']

    num_entities = vocab['e1'].num_token

    train_batcher = StreamBatcher(Config.dataset, 'train', Config.batch_size, randomize=True, keys=input_keys)
    dev_rank_batcher = StreamBatcher(Config.dataset, 'dev_ranking', Config.batch_size, randomize=False, loader_threads=4, keys=input_keys, is_volatile=True)
    test_rank_batcher = StreamBatcher(Config.dataset, 'test_ranking', Config.batch_size, randomize=False, loader_threads=4, keys=input_keys, is_volatile=True)

    # Load literals
    numerical_literals = np.load(f'data/{Config.dataset}/literals/numerical_literals.npy')

    # Initialize KBLN RBF parameters
    X_train = np.load(f'data/{Config.dataset}/bin/train.npy')
    h = X_train[:, 0]
    t = X_train[:, 2]

    n = numerical_literals[h, :] - numerical_literals[t, :]
    c = np.mean(n, axis=0).astype('float32')  # size: (n_literals)
    var = np.var(n, axis=0) + 1e-6  # size: (n_literals), added eps to avoid degenerate case

    # Get normalized literals (for LiteralE)
    max_lit, min_lit = np.max(numerical_literals, axis=0), np.min(numerical_literals, axis=0)
    numerical_literals_normalized = (numerical_literals - min_lit) / (max_lit - min_lit + 1e-8)

    # Load literal models
    if Config.model_name is None or Config.model_name == 'KBLN':
        model = KBLN(vocab['e1'].num_token, vocab['rel'].num_token, numerical_literals, c, var)
        print('Chosen model: KBLN')
    elif Config.model_name == 'LiteralE_KBLN':
        model = LiteralE_KBLN(vocab['e1'].num_token, vocab['rel'].num_token, numerical_literals, numerical_literals_normalized, c, var)
        print('Chosen model: LiteralE_KBLN')
    else:
        raise Exception('Unknown model: {0}'.format(Config.model_name))

    train_batcher.at_batch_prepared_observers.insert(1, TargetIdx2MultiTarget(num_entities, 'e2_multi1', 'e2_multi1_binary'))

    eta = ETAHook('train', print_every_x_batches=100)
    train_batcher.subscribe_to_events(eta)
    train_batcher.subscribe_to_start_of_epoch_event(eta)
    train_batcher.subscribe_to_events(LossHook('train', print_every_x_batches=100))

    if Config.cuda:
        model.cuda()
    if load:
        model_params = torch.load(model_path)
        print(model)
        total_param_size = []
        params = [(key, value.size(), value.numel()) for key, value in model_params.items()]
        for key, size, count in params:
            total_param_size.append(count)
            print(key, size, count)
        print(np.array(total_param_size).sum())
        model.load_state_dict(model_params)
        model.eval()
        ranking_and_hits(model, test_rank_batcher, vocab, 'test_evaluation')
        ranking_and_hits(model, dev_rank_batcher, vocab, 'dev_evaluation')
    else:
        model.init()

    total_param_size = []
    params = [value.numel() for value in model.parameters()]
    print(params)
    print(np.sum(params))

    opt = torch.optim.Adam(model.parameters(), lr=Config.learning_rate, weight_decay=Config.L2)
    for epoch in range(epochs):
        model.train()
        for i, str2var in enumerate(train_batcher):
            opt.zero_grad()
            e1 = str2var['e1']
            rel = str2var['rel']
            e2_multi = str2var['e2_multi1_binary'].float()
            # label smoothing
            e2_multi = ((1.0-Config.label_smoothing_epsilon)*e2_multi) + (1.0/e2_multi.size(1))

            pred = model.forward(e1, rel)
            loss = model.loss(pred, e2_multi)
            loss.backward()
            opt.step()

            train_batcher.state.loss = loss


        print('saving to {0}'.format(model_path))
        torch.save(model.state_dict(), model_path)

        model.eval()
        ranking_and_hits(model, dev_rank_batcher, vocab, 'dev_evaluation')
        if epoch % 3 == 0:
            if epoch > 0:
                ranking_and_hits(model, test_rank_batcher, vocab, 'test_evaluation')
Example #12
def main():
    if Config.process: preprocess(Config.dataset, delete_data=True)
    input_keys = ['e1', 'rel', 'rel_eval', 'e2', 'e2_multi1', 'e2_multi2']
    p = Pipeline(Config.dataset, keys=input_keys)
    p.load_vocabs()
    vocab = p.state['vocab']

    num_entities = vocab['e1'].num_token

    train_batcher = StreamBatcher(Config.dataset,
                                  'train',
                                  Config.batch_size,
                                  randomize=True,
                                  keys=input_keys)
    dev_rank_batcher = StreamBatcher(Config.dataset,
                                     'dev_ranking',
                                     Config.batch_size,
                                     randomize=False,
                                     loader_threads=4,
                                     keys=input_keys)
    test_rank_batcher = StreamBatcher(Config.dataset,
                                      'test_ranking',
                                      Config.batch_size,
                                      randomize=False,
                                      loader_threads=4,
                                      keys=input_keys)

    if Config.model_name is None:
        model = ConvE(vocab['e1'].num_token, vocab['rel'].num_token)
    elif Config.model_name == 'ConvE':
        model = ConvE(vocab['e1'].num_token, vocab['rel'].num_token)
    elif Config.model_name == 'DistMult':
        model = DistMult(vocab['e1'].num_token, vocab['rel'].num_token)
    elif Config.model_name == 'ComplEx':
        model = Complex(vocab['e1'].num_token, vocab['rel'].num_token)
    elif Config.model_name == 'RNNDist':
        model = RNNDist(vocab['e1'].num_token, vocab['rel'].num_token)
    else:
        log.info('Unknown model: {0}', Config.model_name)
        raise Exception("Unknown model!")

    train_batcher.at_batch_prepared_observers.insert(
        1, TargetIdx2MultiTarget(num_entities, 'e2_multi1',
                                 'e2_multi1_binary'))

    eta = ETAHook('train', print_every_x_batches=100)
    train_batcher.subscribe_to_events(eta)
    train_batcher.subscribe_to_start_of_epoch_event(eta)
    train_batcher.subscribe_to_events(
        LossHook('train', print_every_x_batches=100))
    if Config.dataset == 'ICEWS18':
        lengths = [
            1618, 956, 815, 1461, 1634, 1596, 1754, 1494, 800, 979, 1588, 1779,
            1831, 1762, 1566, 812, 820, 1707, 1988, 1845, 1670, 1695, 956, 930,
            1641, 1813, 1759, 1664, 1616, 1021, 998, 1668, 1589, 1720
        ]
    else:
        lengths = [
            1090, 730, 646, 939, 681, 783, 546, 526, 524, 586, 656, 741, 562,
            474, 493, 487, 474, 477, 460, 532, 348, 530, 402, 493, 503, 452,
            668, 512, 406, 467, 524, 563, 524, 418, 441, 487, 515, 475, 478,
            532, 387, 479, 485, 417, 542, 496, 487, 445, 504, 350, 432, 445,
            401, 570, 554, 504, 505, 483, 587, 441, 489, 501, 487, 513, 513,
            524, 655, 545, 599, 702, 734, 519, 603, 579, 537, 635, 437, 422,
            695, 575, 553, 485, 429, 663, 475, 673, 527, 559, 540, 591, 558,
            698, 422, 1145, 969, 1074, 888, 683, 677, 910, 902, 644, 777, 695,
            571, 656, 797, 576, 468, 676, 687, 549, 482, 1007, 778, 567, 813,
            788, 879, 557, 724, 850, 809, 685, 714, 554, 799, 727, 208, 946,
            979, 892, 859, 1092, 1038, 999, 1477, 1126, 1096, 1145, 955, 100,
            1264, 1287, 962, 1031, 1603, 1662, 1179, 1064, 1179, 1105, 1465,
            1176, 1219, 1137, 1112, 791, 829, 2347, 917, 913, 1107, 960, 850,
            1005, 1045, 871, 972, 921, 1019, 984, 1033, 848, 918, 699, 1627,
            1580, 1354, 1119, 1065, 1208, 1037, 1134, 980, 1249, 1031, 908,
            787, 819, 804, 764, 959, 1057, 770, 691, 816, 620, 788, 829, 895,
            1128, 1023, 1038, 1030, 1016, 991, 866, 878, 1013, 977, 914, 976,
            717, 740, 904, 912, 1043, 1117, 930, 1116, 1028, 946, 922, 1151,
            1092, 967, 1189, 1081, 1158, 943, 981, 1212, 1104, 941, 912, 1347,
            1241, 1479, 1188, 1152, 1164, 1167, 1173, 1280, 979, 142, 1458,
            910, 1126, 1053, 1083, 897, 1021, 1075, 881, 1054, 941, 927, 860,
            1081, 876, 1952, 1576, 1560, 1599, 1226, 1083, 964, 1059, 1179,
            982, 1032, 933, 877, 1032, 957, 884, 909, 846, 850, 798, 843, 1183,
            1108, 1185, 797, 915, 952, 1181, 744, 86, 889, 1151, 925, 1119,
            1115, 1036, 772, 1052, 837, 897, 1095, 926, 1034, 1031, 995, 907,
            969, 981, 1135, 915, 1161, 100, 1269, 1244, 1331, 1124, 1074, 1162,
            1159, 1078, 1311, 1210, 1308, 945, 1183, 1580, 1406, 1417, 1173,
            1348, 1274, 1179, 893, 1107, 950, 1028, 1055, 1059, 1244, 1082,
            1179, 1011, 955, 886, 865, 857
        ]
    if Config.cuda:
        model.cuda()
    if load:
        # if True:
        model_params = torch.load(model_path)
        print(model)
        total_param_size = []
        params = [(key, value.size(), value.numel())
                  for key, value in model_params.items()]
        for key, size, count in params:
            total_param_size.append(count)
            print(key, size, count)
        print(np.array(total_param_size).sum())
        model.load_state_dict(model_params)
        model.eval()
        ranks = ranking_and_hits(model, test_rank_batcher, vocab, 'test_evaluation')
        # ranks = ranking_and_hits2(model, test_rank_batcher, vocab, 'test_evaluation')
        print(len(ranks))

        mrr = []
        curr_step = 0
        for i in range(len(lengths)):
            rr = np.array(ranks[curr_step:curr_step + 2 * lengths[i]])
            mrr.append(np.mean(1 / rr))

            curr_step += 2 * lengths[i]
        with open(Config.dataset + 'mrr.txt', 'w') as f:
            for i, mr in enumerate(mrr):
                print("MRR (filtered) @ {}th day: {:.6f}".format(i, mr))
                f.write(str(mr) + '\n')
        h10 = []
        curr_step = 0
        for i in range(len(lengths)):
            rr = np.array(ranks[curr_step:curr_step + 2 * lengths[i]])
            h10.append(np.mean(rr <= 10))
            curr_step += 2 * lengths[i]
        with open(Config.dataset + 'h10.txt', 'w') as f:
            for i, mr in enumerate(h10):
                print("h10 (filtered) @ {}th day: {:.6f}".format(i, mr))
                f.write(str(mr) + '\n')
        h3 = []
        curr_step = 0
        for i in range(len(lengths)):
            rr = np.array(ranks[curr_step:curr_step + 2 * lengths[i]])
            h3.append(np.mean(rr <= 3))
            curr_step += 2 * lengths[i]
        with open(Config.dataset + 'h3.txt', 'w') as f:
            for i, mr in enumerate(h3):
                print("h3 (filtered) @ {}th day: {:.6f}".format(i, mr))
                f.write(str(mr) + '\n')

        h1 = []
        curr_step = 0
        for i in range(len(lengths)):
            rr = np.array(ranks[curr_step:curr_step + 2 * lengths[i]])
            h1.append(np.mean(rr <= 1))
            curr_step += 2 * lengths[i]
        with open(Config.dataset + 'h1.txt', 'w') as f:
            for i, mr in enumerate(h1):
                print("h1 (filtered) @ {}th day: {:.6f}".format(i, mr))
                f.write(str(mr) + '\n')
        print("length", len(ranks))
        print("length_2", 2 * sum(lengths))

        # ranking_and_hits(model, dev_rank_batcher, vocab, 'dev_evaluation')
    else:
        model.init()

    total_param_size = []
    params = [value.numel() for value in model.parameters()]
    print(params)
    print(np.sum(params))

    opt = torch.optim.Adam(model.parameters(),
                           lr=Config.learning_rate,
                           weight_decay=Config.L2)
    for epoch in range(epochs):
        # break
        model.train()
        for i, str2var in enumerate(train_batcher):
            opt.zero_grad()
            e1 = str2var['e1']
            rel = str2var['rel']
            e2_multi = str2var['e2_multi1_binary'].float()

            # label smoothing
            # e2_multi = ((1.0-Config.label_smoothing_epsilon)*e2_multi) + (1.0/e2_multi.size(1))
            # print("this",Config.label_smoothing_epsilon, e2_multi.size(1))

            pred = model.forward(e1, rel)
            # loss = model.loss(pred, e2_multi)
            # #
            loss = torch.zeros(1).cuda()
            for j in range(128):
                position = torch.nonzero(e2_multi[j])[0].cuda()
                label = torch.cat(
                    [torch.ones(len(position)),
                     torch.zeros(len(position))]).cuda()
                neg_position = torch.randint(e2_multi.shape[1],
                                             (len(position), )).long().cuda()
                position = torch.cat([position, neg_position])
                loss += model.loss(pred[j, position], label)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           1.0)  # clip gradients
            opt.step()

            train_batcher.state.loss = loss.cpu()

        print('saving to {0}'.format(model_path))
        torch.save(model.state_dict(), model_path)

        model.eval()
        with torch.no_grad():
            # ranking_and_hits(model, dev_rank_batcher, vocab, 'dev_evaluation')
            if epoch == 50:
                ranks = ranking_and_hits(model, test_rank_batcher, vocab,
                                         'test_evaluation')
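Rather than scoring every entity, the inner loop above evaluates the loss only on a sampled subset of positions per row: positive tails plus an equal number of random negatives. A hedged, self-contained sketch of that sampling (note: the original takes only the first positive via torch.nonzero(...)[0]; using all positives here is an assumption about the intended behaviour, and loss_fn stands in for model.loss):

import torch

def sampled_loss(pred, e2_multi, loss_fn):
    # For every row, gather its positive entity positions plus as many random
    # negatives, then evaluate the loss on just those entries.
    batch_size, num_entities = e2_multi.shape
    total = pred.new_zeros(1)
    for j in range(batch_size):
        pos = torch.nonzero(e2_multi[j], as_tuple=False).view(-1)
        neg = torch.randint(num_entities, (len(pos),), device=pred.device)
        idx = torch.cat([pos, neg])
        labels = torch.cat([torch.ones(len(pos)), torch.zeros(len(neg))]).to(pred.device)
        total = total + loss_fn(pred[j, idx], labels)
    return total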
Example #13
File: main.py Project: zzw-x/CPL
def main():
    if Config.process: preprocess(Config.dataset, delete_data=True)
    input_keys = ['e1', 'rel', 'rel_eval', 'e2', 'e2_multi1', 'e2_multi2']
    p = Pipeline(Config.dataset, keys=input_keys)
    p.load_vocabs()
    vocab = p.state['vocab']

    num_entities = vocab['e1'].num_token

    train_batcher = StreamBatcher(Config.dataset, 'train', Config.batch_size, randomize=True, keys=input_keys)
    dev_rank_batcher = StreamBatcher(Config.dataset, 'dev_ranking', Config.batch_size, randomize=False, loader_threads=4, keys=input_keys)
    test_rank_batcher = StreamBatcher(Config.dataset, 'test_ranking', Config.batch_size, randomize=False, loader_threads=4, keys=input_keys)


    if Config.model_name is None:
        model = ConvE(vocab['e1'].num_token, vocab['rel'].num_token)
    elif Config.model_name == 'ConvE':
        model = ConvE(vocab['e1'].num_token, vocab['rel'].num_token)
    elif Config.model_name == 'DistMult':
        model = DistMult(vocab['e1'].num_token, vocab['rel'].num_token)
    elif Config.model_name == 'ComplEx':
        model = Complex(vocab['e1'].num_token, vocab['rel'].num_token)
    else:
        log.info('Unknown model: {0}', Config.model_name)
        raise Exception("Unknown model!")

    train_batcher.at_batch_prepared_observers.insert(1,TargetIdx2MultiTarget(num_entities, 'e2_multi1', 'e2_multi1_binary'))


    eta = ETAHook('train', print_every_x_batches=100)
    train_batcher.subscribe_to_events(eta)
    train_batcher.subscribe_to_start_of_epoch_event(eta)
    train_batcher.subscribe_to_events(LossHook('train', print_every_x_batches=100))

    if Config.cuda:
        model.cuda()
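    # Note: load, model_path and epochs come from the enclosing script (not shown here).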
    if load:
        model_params = torch.load(model_path)
        print(model)
        total_param_size = []
        params = [(key, value.size(), value.numel()) for key, value in model_params.items()]
        for key, size, count in params:
            total_param_size.append(count)
            print(key, size, count)
        print(np.array(total_param_size).sum())
        model.load_state_dict(model_params)
        model.eval()
        ranking_and_hits(model, test_rank_batcher, vocab, 'test_evaluation', epochs, True)
        ranking_and_hits(model, dev_rank_batcher, vocab, 'dev_evaluation', epochs, False)
    else:
        model.init()

    total_param_size = []
    params = [value.numel() for value in model.parameters()]
    print(params)
    print(np.sum(params))

    opt = torch.optim.Adam(model.parameters(), lr=Config.learning_rate, weight_decay=Config.L2)
    for epoch in range(epochs):
        model.train()
        for i, str2var in tqdm(enumerate(train_batcher)):
            opt.zero_grad()
            e1 = str2var['e1']
            rel = str2var['rel']
            e2_multi = str2var['e2_multi1_binary'].float()
            # label smoothing omitted in this variant
            pred = model.forward(e1, rel)
            # Negative-sampling loss: first positive tail index plus an equal
            # number of randomly sampled entity indices as negatives.
            loss = torch.zeros(1).cuda()
            for j in range(e2_multi.shape[0]):  # iterate over the actual batch size
                position = torch.nonzero(e2_multi[j])[0].cuda()
                label = torch.cat([torch.ones(len(position)), torch.zeros(len(position))]).cuda()
                neg_position = torch.randint(e2_multi.shape[1], (len(position),)).long().cuda()
                position = torch.cat([position, neg_position])
                loss += model.loss(pred[j, position], label)
            loss.backward()
            opt.step()

            train_batcher.state.loss = loss.cpu()


        print('saving to {0}'.format(model_path))
        torch.save(model.state_dict(), model_path)

        model.eval()
        with torch.no_grad():
            if epoch % 100 == 0:
                if epoch > 0:
                    ranking_and_hits(model, test_rank_batcher, vocab, Config.dataset + "-" + Config.model_name, epoch, False)
            if epoch + 1 == epochs:
                ranking_and_hits(model, test_rank_batcher, vocab, Config.dataset, epoch, True)
Example #14
0
def main():
    if Config.process: preprocess(Config.dataset, delete_data=True)
    input_keys = ['e1', 'rel', 'e2', 'e2_multi1', 'e2_multi2']
    p = Pipeline(Config.dataset, keys=input_keys)
    p.load_vocabs()
    vocab = p.state['vocab']

    num_entities = vocab['e1'].num_token

    train_batcher = StreamBatcher(Config.dataset,
                                  'train',
                                  Config.batch_size,
                                  randomize=True,
                                  keys=input_keys)
    dev_rank_batcher = StreamBatcher(Config.dataset,
                                     'dev_ranking',
                                     Config.batch_size,
                                     randomize=False,
                                     loader_threads=4,
                                     keys=input_keys,
                                     is_volatile=True)
    test_rank_batcher = StreamBatcher(Config.dataset,
                                      'test_ranking',
                                      Config.batch_size,
                                      randomize=False,
                                      loader_threads=4,
                                      keys=input_keys,
                                      is_volatile=True)

    # Load literals
    numerical_literals = np.load(
        f'data/{Config.dataset}/literals/numerical_literals.npy')

    # Normalize literals
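    # Min-max scale each literal column to [0, 1]; the 1e-8 term guards
    # against division by zero for constant-valued columns.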
    max_lit, min_lit = np.max(numerical_literals,
                              axis=0), np.min(numerical_literals, axis=0)
    numerical_literals = (numerical_literals - min_lit) / (max_lit - min_lit +
                                                           1e-8)

    # Load Multitask models
    model = MTKGNN_DistMult(vocab['e1'].num_token, vocab['rel'].num_token,
                            numerical_literals)

    train_batcher.at_batch_prepared_observers.insert(
        1, TargetIdx2MultiTarget(num_entities, 'e2_multi1',
                                 'e2_multi1_binary'))

    eta = ETAHook('train', print_every_x_batches=100)
    train_batcher.subscribe_to_events(eta)
    train_batcher.subscribe_to_start_of_epoch_event(eta)
    train_batcher.subscribe_to_events(
        LossHook('train', print_every_x_batches=100))

    if Config.cuda:
        model.cuda()
    if load:
        model_params = torch.load(model_path)
        print(model)
        total_param_size = []
        params = [(key, value.size(), value.numel())
                  for key, value in model_params.items()]
        for key, size, count in params:
            total_param_size.append(count)
            print(key, size, count)
        print(np.array(total_param_size).sum())
        model.load_state_dict(model_params)
        model.eval()
        ranking_and_hits(model, test_rank_batcher, vocab, 'test_evaluation')
        ranking_and_hits(model, dev_rank_batcher, vocab, 'dev_evaluation')
    else:
        model.init()

    total_param_size = []
    params = [value.numel() for value in model.parameters()]
    print(params)
    print(np.sum(params))

    opt_rel = torch.optim.Adam(model.rel_params,
                               lr=Config.learning_rate,
                               weight_decay=Config.L2)
    opt_attr = torch.optim.Adam(model.attr_params,
                                lr=Config.learning_rate,
                                weight_decay=Config.L2)

    for epoch in range(epochs):
        model.train()

        for i, str2var in enumerate(train_batcher):
            opt_rel.zero_grad()
            opt_attr.zero_grad()

            e1 = str2var['e1']
            e2 = str2var['e2']
            rel = str2var['rel']
            e2_multi = str2var['e2_multi1_binary'].float()
            # label smoothing
            e2_multi = ((1.0 - Config.label_smoothing_epsilon) *
                        e2_multi) + (1.0 / e2_multi.size(1))
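            # e.g. with epsilon = 0.1 and 10,000 candidate entities, positives
            # become 0.9001 and every zero entry becomes 0.0001.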

            pred = model.forward(e1, rel)
            loss_rel = model.loss_rel(pred, e2_multi)
            loss_rel.backward()
            opt_rel.step()

            pred_left, target_left = model.forward_attr(e1, 'left')
            pred_right, target_right = model.forward_attr(e1, 'right')
            loss_attr_left = model.loss_attr(pred_left, target_left)
            loss_attr_right = model.loss_attr(pred_right, target_right)
            loss_attr = loss_attr_left + loss_attr_right
            loss_attr.backward()
            opt_attr.step()

            train_batcher.state.loss = loss_rel + loss_attr

        # Attribute Specific Training
        for k in range(4):
            opt_attr.zero_grad()  # reset gradients before each attribute-only step
            pred_left, pred_right, target = model.forward_AST()
            loss_AST = model.loss_attr(pred_left, target) + model.loss_attr(
                pred_right, target)
            loss_AST.backward()
            opt_attr.step()

        print('saving to {0}'.format(model_path))
        torch.save(model.state_dict(), model_path)

        model.eval()

        if epoch % 3 == 0:
            if epoch > 0:
                ranking_and_hits(model, dev_rank_batcher, vocab,
                                 'dev_evaluation')
                ranking_and_hits(model, test_rank_batcher, vocab,
                                 'test_evaluation')
Example #15
0
def preprocess_SNLI(delete_data=False):
    # load data
    #names, file_paths = snli2json()
    #train_path, dev_path, test_path = file_paths
    tokenizer = nltk.tokenize.WordPunctTokenizer()

    zip_path = join(get_data_path(), 'snli_1.0.zip', 'snli_1.0')
    file_paths = [
        'snli_1.0_train.jsonl', 'snli_1.0_dev.jsonl', 'snli_1.0_test.jsonl'
    ]

    not_t = []
    t = ['input', 'support', 'target']
    # tokenize and convert to hdf5
    # 1. Setup pipeline to save lengths and generate vocabulary
    p = Pipeline('snli_example', delete_data)
    p.add_path(join(zip_path, file_paths[0]))
    p.add_line_processor(JsonLoaderProcessors())
    p.add_line_processor(
        RemoveLineOnJsonValueCondition('gold_label',
                                       lambda label: label == '-'))
    p.add_line_processor(
        DictKey2ListMapper(['sentence1', 'sentence2', 'gold_label']))
    p.add_sent_processor(ToLower())
    p.add_sent_processor(Tokenizer(tokenizer.tokenize), t)
    #p.add_sent_processor(NaiveNCharTokenizer(3), not_t)
    p.add_token_processor(AddToVocab())
    p.add_post_processor(SaveLengthsToState())
    p.execute()
    p.clear_processors()
    p.state['vocab'].save_to_disk()

    # 2. Process the data further to stream it to hdf5
    p.add_sent_processor(ToLower())
    p.add_sent_processor(Tokenizer(tokenizer.tokenize), t)
    #p.add_sent_processor(NaiveNCharTokenizer(3), not_t)
    p.add_post_processor(ConvertTokenToIdx())
    p.add_post_processor(
        CreateBinsByNestedLength('snli_train', min_batch_size=128))
    state = p.execute()

    # dev and test data
    p2 = Pipeline('snli_example')
    p2.copy_vocab_from_pipeline(p)
    p2.add_path(join(zip_path, file_paths[1]))
    p2.add_line_processor(JsonLoaderProcessors())
    p2.add_line_processor(
        RemoveLineOnJsonValueCondition('gold_label',
                                       lambda label: label == '-'))
    p2.add_line_processor(
        DictKey2ListMapper(['sentence1', 'sentence2', 'gold_label']))
    p2.add_sent_processor(ToLower())
    p2.add_sent_processor(Tokenizer(tokenizer.tokenize), t)
    #p2.add_sent_processor(NaiveNCharTokenizer(3), not_t)
    p2.add_post_processor(SaveLengthsToState())
    p2.execute()

    p2.clear_processors()
    p2.add_sent_processor(ToLower())
    p2.add_sent_processor(Tokenizer(tokenizer.tokenize), t)
    #p2.add_sent_processor(NaiveNCharTokenizer(3), not_t)
    p2.add_post_processor(ConvertTokenToIdx())
    p2.add_post_processor(StreamToHDF5('snli_dev'))
    p2.execute()

    p3 = Pipeline('snli_example')
    p3.copy_vocab_from_pipeline(p)
    p3.add_path(join(zip_path, file_paths[2]))
    p3.add_line_processor(JsonLoaderProcessors())
    p3.add_line_processor(
        RemoveLineOnJsonValueCondition('gold_label',
                                       lambda label: label == '-'))
    p3.add_line_processor(
        DictKey2ListMapper(['sentence1', 'sentence2', 'gold_label']))
    p3.add_sent_processor(ToLower())
    p3.add_sent_processor(Tokenizer(tokenizer.tokenize), t)
    #p3.add_sent_processor(NaiveNCharTokenizer(3), not_t)
    p3.add_post_processor(SaveLengthsToState())
    p3.execute()

    p3.clear_processors()
    p3.add_sent_processor(ToLower())
    p3.add_sent_processor(Tokenizer(tokenizer.tokenize), t)
    #p3.add_sent_processor(NaiveNCharTokenizer(3), not_t)
    p3.add_post_processor(ConvertTokenToIdx())
    p3.add_post_processor(StreamToHDF5('snli_test'))
    p3.execute()
Example #16
0
def main():
    Logger.GLOBAL_LOG_LEVEL = LogLevel.INFO
    #Config.backend = Backends.TENSORFLOW
    Config.backend = Backends.TORCH
    Config.cuda = True
    Config.dropout = 0.1
    Config.hidden_size = 128
    Config.embedding_size = 256
    Config.L2 = 0.00003

    do_process = False
    if do_process:
        preprocess_SNLI(delete_data=True)

    p = Pipeline('snli_example')
    vocab = p.state['vocab']
    vocab.load_from_disk()

    batch_size = 128
    if Config.backend == Backends.TENSORFLOW:
        from spodernet.backends.tfbackend import TensorFlowConfig
        TensorFlowConfig.init_batch_size(batch_size)
    train_batcher = StreamBatcher('snli_example',
                                  'snli_train',
                                  batch_size,
                                  randomize=True,
                                  loader_threads=8)
    #train_batcher.subscribe_to_batch_prepared_event(SomeExpensivePreprocessing())
    dev_batcher = StreamBatcher('snli_example', 'snli_dev', batch_size)
    test_batcher = StreamBatcher('snli_example', 'snli_test', batch_size)

    #train_batcher.subscribe_to_events(AccuracyHook('Train', print_every_x_batches=1000))
    train_batcher.subscribe_to_events(
        LossHook('Train', print_every_x_batches=100))
    train_batcher.subscribe_to_events(
        AccuracyHook('Train', print_every_x_batches=100))
    dev_batcher.subscribe_to_events(
        AccuracyHook('Dev', print_every_x_batches=100))
    dev_batcher.subscribe_to_events(LossHook('Dev', print_every_x_batches=100))
    eta = ETAHook(print_every_x_batches=100)
    train_batcher.subscribe_to_events(eta)
    train_batcher.subscribe_to_start_of_epoch_event(eta)

    net = Net(vocab.num_embeddings, vocab.num_labels)
    if Config.cuda:
        net.cuda()

    epochs = 10
    opt = torch.optim.Adam(net.parameters(), lr=0.001)
    net.train()
    for epoch in range(epochs):
        for str2var in train_batcher:
            opt.zero_grad()
            loss, argmax = net(str2var)
            loss.backward()
            opt.step()
            train_batcher.state.loss = loss
            train_batcher.state.targets = str2var['target']
            train_batcher.state.argmax = argmax

    net.eval()
    for i, str2var in enumerate(dev_batcher):
        t = str2var['target']
        idx = str2var['index']
        loss, argmax = net(str2var)
        dev_batcher.state.loss = loss
        dev_batcher.state.targets = str2var['target']
        dev_batcher.state.argmax = argmax
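The dev loop above only hands the loss, targets, and argmax to the subscribed hooks; if you want the dev accuracy directly, a minimal sketch (assuming net(str2var) returns (loss, argmax) with 1-D integer tensors, as in the loop above) is:

net.eval()
correct, total = 0, 0
with torch.no_grad():  # no gradients needed for evaluation
    for str2var in dev_batcher:
        _, argmax = net(str2var)
        correct += (argmax == str2var['target']).sum().item()
        total += str2var['target'].numel()
print('dev accuracy: {0:.4f}'.format(correct / total))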
Example #17
0
def main():
    if Config.process: preprocess(Config.dataset, delete_data=True)
    input_keys = ['e1', 'rel', 'e2', 'e2_multi1', 'e2_multi2']
    p = Pipeline(Config.dataset, keys=input_keys)
    p.load_vocabs()
    vocab = p.state['vocab']

    num_entities = vocab['e1'].num_token

    train_batcher = StreamBatcher(Config.dataset,
                                  'train',
                                  Config.batch_size,
                                  randomize=True,
                                  keys=input_keys)
    dev_rank_batcher = StreamBatcher(Config.dataset,
                                     'dev_ranking',
                                     Config.batch_size,
                                     randomize=False,
                                     loader_threads=4,
                                     keys=input_keys,
                                     is_volatile=True)
    test_rank_batcher = StreamBatcher(Config.dataset,
                                      'test_ranking',
                                      Config.batch_size,
                                      randomize=False,
                                      loader_threads=4,
                                      keys=input_keys,
                                      is_volatile=True)

    # Load literals
    numerical_literals = np.load(
        f'data/{Config.dataset}/literals/numerical_literals.npy')

    # Normalize literals
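    # Column-wise min-max normalization to [0, 1]; 1e-8 avoids dividing by
    # zero when a literal column is constant.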
    max_lit, min_lit = np.max(numerical_literals,
                              axis=0), np.min(numerical_literals, axis=0)
    numerical_literals = (numerical_literals - min_lit) / (max_lit - min_lit +
                                                           1e-8)

    # Load literal models
    if Config.model_name is None:
        model = DistMultLiteral(vocab['e1'].num_token, vocab['rel'].num_token,
                                numerical_literals)
    elif Config.model_name == 'DistMultLiteral_highway':
        model = DistMultLiteral_highway(vocab['e1'].num_token,
                                        vocab['rel'].num_token,
                                        numerical_literals)
    elif Config.model_name == 'DistMultLiteral_gate':
        model = DistMultLiteral_gate(vocab['e1'].num_token,
                                     vocab['rel'].num_token,
                                     numerical_literals)
    elif Config.model_name == 'ComplEx':
        model = ComplexLiteral(vocab['e1'].num_token, vocab['rel'].num_token,
                               numerical_literals)
    elif Config.model_name == 'ConvE':
        model = ConvELiteral(vocab['e1'].num_token, vocab['rel'].num_token,
                             numerical_literals)
    elif Config.model_name == 'ConvEAlt':
        model = ConvELiteralAlt(vocab['e1'].num_token, vocab['rel'].num_token,
                                numerical_literals)
    elif Config.model_name == 'DistMultNN':
        model = DistMultLiteralNN(vocab['e1'].num_token,
                                  vocab['rel'].num_token, numerical_literals)
    elif Config.model_name == 'DistMultNN2':
        model = DistMultLiteralNN2(vocab['e1'].num_token,
                                   vocab['rel'].num_token, numerical_literals)
    else:
        log.info('Unknown model: {0}', Config.model_name)
        raise Exception("Unknown model!")

    train_batcher.at_batch_prepared_observers.insert(
        1, TargetIdx2MultiTarget(num_entities, 'e2_multi1',
                                 'e2_multi1_binary'))

    eta = ETAHook('train', print_every_x_batches=100)
    train_batcher.subscribe_to_events(eta)
    train_batcher.subscribe_to_start_of_epoch_event(eta)
    train_batcher.subscribe_to_events(
        LossHook('train', print_every_x_batches=100))

    if Config.cuda:
        model.cuda()
    if load:
        model_params = torch.load(model_path)
        print(model)
        total_param_size = []
        params = [(key, value.size(), value.numel())
                  for key, value in model_params.items()]
        for key, size, count in params:
            total_param_size.append(count)
            print(key, size, count)
        print(np.array(total_param_size).sum())
        model.load_state_dict(model_params)
        model.eval()
        ranking_and_hits(model, test_rank_batcher, vocab, 'test_evaluation')
        ranking_and_hits(model, dev_rank_batcher, vocab, 'dev_evaluation')
    else:
        model.init()

    total_param_size = []
    params = [value.numel() for value in model.parameters()]
    print(params)
    print(np.sum(params))

    opt = torch.optim.Adam(model.parameters(),
                           lr=Config.learning_rate,
                           weight_decay=Config.L2)
    for epoch in range(epochs):
        model.train()
        for i, str2var in enumerate(train_batcher):
            opt.zero_grad()
            e1 = str2var['e1']
            rel = str2var['rel']
            e2_multi = str2var['e2_multi1_binary'].float()
            # label smoothing

            #e2_multi = ((1.0-Config.label_smoothing_epsilon)*e2_multi) + (1.0/e2_multi.size(1))
            pred = model.forward(e1, rel)
            loss = model.loss(pred, e2_multi)
            loss.backward()
            opt.step()

            train_batcher.state.loss = loss

        print('saving to {0}'.format(model_path))
        torch.save(model.state_dict(), model_path)

        model.eval()
        ranking_and_hits(model, dev_rank_batcher, vocab, 'dev_evaluation')
        if epoch % 3 == 0:
            if epoch > 0:
                ranking_and_hits(model, test_rank_batcher, vocab,
                                 'test_evaluation')