Example #1
# Test Kaggle learntools
from learntools.core import binder
binder.bind(globals())
from learntools.python.ex1 import *
color = "blue"
q0.check()
print("learntools ok")

# PyTorch smoke test based on http://pytorch.org/tutorials/beginner/nlp/deep_learning_tutorial.html
import torch
import torch.nn as tnn
import torch.autograd as autograd
torch.manual_seed(31337)
linear_torch = tnn.Linear(5, 3)
data_torch = autograd.Variable(torch.randn(2, 5))
print(linear_torch(data_torch))
print("PyTorch ok")

import fastai
from fastai.io import get_data
print("fast.ai ok")

import numpy as np
print("Numpy imported ok")
print("Your lucky number is: " + str(np.random.randint(100)))

# Numpy must be linked to the MKL. (Occasionally, a third-party package will muck up the installation
# and numpy will be reinstalled with an OpenBLAS backing.)
from numpy.distutils.system_info import get_info
# This will throw an exception if the MKL is not linked correctly.
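# A minimal check (an assumption, not necessarily the original test's exact call): query
# the MKL BLAS info via get_info and fail loudly if nothing is found.
mkl_info = get_info("blas_mkl")
assert mkl_info, "numpy does not appear to be linked against MKL"
print("MKL ok")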
Example #2
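 # The "id - 1" in the lookup below suggests ids are 1-based upstream, while nn.Embedding
 # rows are 0-indexed; the .cuda() call assumes the embedding table lives on the GPU.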
 def look_up_embed(self, id):
     lookup_tensor = torch.LongTensor([id - 1]).cuda()
     return self.embeds(autograd.Variable(lookup_tensor))
Example #3
for i in range(len(parts)):  # assumed loop over the part directories, given the use of parts[i] below
    tmp_trial = [
        os.path.join(parts[i], trial) for trial in os.listdir(parts[i])
    ]
    trials += tmp_trial

running_loss = 0
while True:
    trial_order = np.random.permutation(trials)
    for itr in trial_order:
        gd.load_csv(os.path.join(itr, GAZE_NAME))

        inds = range(max(WINDOWS), len(gd.data) - 1, 1)
        order = np.random.permutation(inds)
        batches = load_batches(order, BATCH_SIZE, gd.data)
        for i in order:
            b1, b2, b3, lbl = next(batches)
            b1 = ag.Variable(b1)
            b2 = ag.Variable(b2)
            b3 = ag.Variable(b3)
            lbl = ag.Variable(lbl)

            optimizer.zero_grad()
            print('forward')
            out = gazenet(b1, b2, b3)
            print('loss')
            loss = criterion(out, lbl)
            print('backward')
            loss.backward()
            print('optimise')
            optimizer.step()
            print(loss.data[0])
            if loss.data[0] < lowest_loss:
Example #4
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    tensor = torch.LongTensor(idxs)
    return autograd.Variable(tensor)
def run_epoch(data_loader, train_model, model, gen, optimizer, step, args):
    '''
    Train model for one pass of train data, and return loss, accuracy
    '''
    eval_model = not train_model
    data_iter = data_loader.__iter__()

    losses = []
    obj_losses = []
    k_selection_losses = []
    k_continuity_losses = []
    preds = []
    golds = []

    if train_model:
        model.train()
        gen.train()
    else:
        gen.eval()
        model.eval()

    num_batches_per_epoch = len(data_iter)
    if train_model:
        num_batches_per_epoch = min(len(data_iter), 10000)

    for _ in tqdm.tqdm(range(num_batches_per_epoch)):
        batch = next(data_iter)
        if train_model:
            step += 1
            if step % 100 == 0 or args.debug_mode:
                args.gumbel_temprature = max(np.exp(-(step + 1) * args.gumbel_decay), .05)

        x_indx = utils.get_x_indx(batch, args, eval_model)
        text = batch['text']
        indices = batch['i']
        y = autograd.Variable(batch['y'], volatile=eval_model)

        if args.cuda:
            x_indx, y = x_indx.cuda(), y.cuda()

        if train_model:
            optimizer.zero_grad()

        if args.get_rationales:
            mask, z = gen(x_indx)
        else:
            mask = None

        logit, _ = model(x_indx, mask=mask)


        loss = get_loss(logit, y, args)
        obj_loss = loss

        if args.get_rationales:
            selection_cost, continuity_cost = gen.loss(mask, x_indx)

            loss += args.selection_lambda * selection_cost
            loss += args.continuity_lambda * continuity_cost

        if train_model:
            loss.backward()
            optimizer.step()

        if args.get_rationales:
            k_selection_losses.append(generic.tensor_to_numpy(selection_cost))
            k_continuity_losses.append(generic.tensor_to_numpy(continuity_cost))

        obj_losses.append(generic.tensor_to_numpy(obj_loss))
        losses.append(generic.tensor_to_numpy(loss))
        preds.extend(
            torch.max(logit.data,
                      1)[1].view(y.size()).cpu().numpy())  # Record predictions
        golds.extend(batch['y'].numpy())


    if args.objective in ['cross_entropy', 'margin']:
        metric = sklearn.metrics.accuracy_score(y_true=golds, y_pred=preds)
        confusion_matrix = sklearn.metrics.confusion_matrix(y_true=golds,y_pred=preds)
    elif args.objective == 'mse':
        metric = sklearn.metrics.mean_squared_error(y_true=golds, y_pred=preds)
        confusion_matrix = "NA"

    epoch_stat = {
        'loss': np.mean(losses),
        'obj_loss': np.mean(obj_losses),
        'metric': metric,
        'confusion_matrix': confusion_matrix
    }

    if args.get_rationales:
        epoch_stat['k_selection_loss'] = np.mean(k_selection_losses)
        epoch_stat['k_continuity_loss'] = np.mean(k_continuity_losses)

    return epoch_stat, step,  losses, preds, golds
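# Hedged sketch of how run_epoch is typically driven; train_loader, dev_loader and
# args.epochs are assumed names that are not defined in this snippet:
#   step = 0
#   for epoch in range(args.epochs):
#       train_stat, step, _, _, _ = run_epoch(train_loader, True, model, gen, optimizer, step, args)
#       dev_stat, step, _, _, _ = run_epoch(dev_loader, False, model, gen, optimizer, step, args)
#       print(epoch, train_stat['loss'], dev_stat['metric'])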
Example #6
    super(LSTMpred, self).__init__()

    self.hidden_dim = hidden_dim
    self.hidden_layers = hidden_layers
    self.input_size = input_size
    self.output_size = output_size

    self.lstm = nn.LSTM(input_size, self.hidden_dim, hidden_layers, batch_first=True)  # lstm = nn.LSTM(input_dim, hidden_dim, num_of_layers)
    self.hidden2out = nn.Linear(self.hidden_dim, output_size)

  def init_hidden(self, batch):

    return (autograd.Variable(torch.zeros(self.hidden_layers, batch, self.hidden_dim)),  # (num_layers * num_directions, batch, hidden_size)
            autograd.Variable(torch.zeros(self.hidden_layers, batch, self.hidden_dim)))


  def forward(self, batch_in, lengths):
#      print("inputs len", batch_in.size(),lengths)
      self.hidden = self.init_hidden(batch_in.size(0))
      pack = torch.nn.utils.rnn.pack_padded_sequence(batch_in, lengths, batch_first=True)
      packed_output, (ht, ct) = self.lstm(pack, self.hidden)
      unpacked, unpacked_len = torch.nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)
      final_out = self.hidden2out(unpacked)
      return final_out

def pad_seq(sequence):

  ordered = sorted(sequence, key=len, reverse=True)
  lengths = [len(x) for x in ordered]
Example #7
import sys
import csv
import pickle

import torch
import torch.autograd as autograd

# Get the arguments
# open the argument parsing results (discourse arguments)
if len(sys.argv) != 3:
    print("USAGE> Prepare_Label_Vecs.py [label_file] [dataset_name]")
    sys.exit(1)
ce_file = open(sys.argv[1], "r")
ce_csv = csv.reader(ce_file)

tweet_id = 0
ce_vec_seqs = []
ce_vec = []

next(ce_csv)  # skip the header
for line in ce_csv:
    causality_vec = []
    if tweet_id != int(line[0]):

        ce_vec_seqs.append(autograd.Variable(torch.LongTensor(ce_vec)))
        ce_vec = []

    ce_vec.append(int(line[2]))
    tweet_id = int(line[0])

ce_vec_seqs.append(autograd.Variable(
    torch.LongTensor(ce_vec)))  # input the final ce vec for the final tweet
del (ce_vec_seqs[0])  # delete the first empty element

pickle.dump(
    ce_vec_seqs,
    open("causal_explanation_da_labels_" + sys.argv[2] + ".list", "wb"))
Example #8
def make_context_vector(context, word_to_idx):
    idx = [word_to_idx[w] for w in context]
    tensor = torch.LongTensor(idx)
    return autograd.Variable(tensor)
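# Hedged usage sketch for make_context_vector; the toy vocabulary is illustrative only,
# and torch / torch.autograd are assumed to be imported as in the other examples.
import torch
import torch.autograd as autograd

word_to_idx = {"we": 0, "are": 1, "about": 2, "to": 3}
print(make_context_vector(["we", "are", "to"], word_to_idx))  # Variable wrapping LongTensor([0, 1, 3])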
Example #9
def make_target_vector(target, word_to_idx):
    idx = [word_to_idx[target]]
    tensor = torch.LongTensor(idx)
    return autograd.Variable(tensor)
Example #10
 def init_hidden(self, batch_size):
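     # Both zero tensors have shape (num_layers * num_directions, batch_size, hidden_size);
     # the leading dimension is hard-coded to 1 here, and .cuda() assumes a GPU is available.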
     h0 = autograd.Variable(torch.zeros(1, batch_size,
                                        self.hidden_size)).cuda()
     c0 = autograd.Variable(torch.zeros(1, batch_size,
                                        self.hidden_size)).cuda()
     return h0, c0
Example #11
def main():

    DIR = args.DIR
    embedding_file = args.embedding_dir

    best_network_file = "./model/network_model_pretrain.best.top.pair"
    print >> sys.stderr,"Read model from ",best_network_file
    best_network_model = torch.load(best_network_file)

    embedding_matrix = numpy.load(embedding_file)
    "Building torch model"
    network_model = network.Network(nnargs["pair_feature_dimention"],nnargs["mention_feature_dimention"],nnargs["word_embedding_dimention"],nnargs["span_dimention"],1000,nnargs["embedding_size"],nnargs["embedding_dimention"],embedding_matrix).cuda()
    net_copy(network_model,best_network_model)

    best_network_file = "./model/network_model_pretrain.best.top.ana"
    print >> sys.stderr,"Read model from ",best_network_file
    best_network_model = torch.load(best_network_file)

    ana_network = network.Network(nnargs["pair_feature_dimention"],nnargs["mention_feature_dimention"],nnargs["word_embedding_dimention"],nnargs["span_dimention"],1000,nnargs["embedding_size"],nnargs["embedding_dimention"],embedding_matrix).cuda()
    net_copy(ana_network,best_network_model)

    reduced=""
    if args.reduced == 1:
        reduced="_reduced"

    print >> sys.stderr,"prepare data for train ..."
    train_docs_iter = DataReader.DataGnerater("train"+reduced)
    print >> sys.stderr,"prepare data for dev and test ..."
    dev_docs_iter = DataReader.DataGnerater("dev"+reduced)
    test_docs_iter = DataReader.DataGnerater("test"+reduced)

    print "Performance after pretraining..."
    print "DEV"
    metric = performance.performance(dev_docs_iter,network_model,ana_network) 
    print "Average:",metric["average"]
    print "TEST"
    metric = performance.performance(test_docs_iter,network_model,ana_network) 
    print "Average:",metric["average"]
    print "***"
    print
    sys.stdout.flush()

    l2_lambda = 1e-6
    #lr = 0.00001
    #lr = 0.000005
    lr = 0.000002
    #lr = 0.0000009
    dropout_rate = 0.5
    shuffle = True
    times = 0

    reinforce = True

    model_save_dir = "./model/reinforce/"
    utils.mkdir(model_save_dir)

    score_softmax = nn.Softmax()
    optimizer = optim.RMSprop(network_model.parameters(), lr=lr, eps = 1e-6)
    ana_optimizer = optim.RMSprop(ana_network.parameters(), lr=lr, eps = 1e-6)

    scheduler = lr_scheduler.StepLR(optimizer, step_size=15, gamma=0.5)
    ana_scheduler = lr_scheduler.StepLR(ana_optimizer, step_size=15, gamma=0.5)
   
    for echo in range(30):

        start_time = timeit.default_timer()
        print "Pretrain Epoch:",echo

        scheduler.step()
        ana_scheduler.step()

        train_docs = utils.load_pickle(args.DOCUMENT + 'train_docs.pkl')

        docs_by_id = {doc.did: doc for doc in train_docs}
       
        print >> sys.stderr,"Link docs ..."
        tmp_data = []
        path = []
        for data in train_docs_iter.rl_case_generater(shuffle=True):
            mention_word_index, mention_span, candi_word_index,candi_span,feature_pair,pair_antecedents,pair_anaphors,\
            target,positive,negative,anaphoricity_word_indexs, anaphoricity_spans, anaphoricity_features, anaphoricity_target,rl,candi_ids_return = data

            mention_index = autograd.Variable(torch.from_numpy(mention_word_index).type(torch.cuda.LongTensor))
            mention_spans = autograd.Variable(torch.from_numpy(mention_span).type(torch.cuda.FloatTensor))
            candi_index = autograd.Variable(torch.from_numpy(candi_word_index).type(torch.cuda.LongTensor))
            candi_spans = autograd.Variable(torch.from_numpy(candi_span).type(torch.cuda.FloatTensor))
            pair_feature = autograd.Variable(torch.from_numpy(feature_pair).type(torch.cuda.FloatTensor))
            anaphors = autograd.Variable(torch.from_numpy(pair_anaphors).type(torch.cuda.LongTensor))
            antecedents = autograd.Variable(torch.from_numpy(pair_antecedents).type(torch.cuda.LongTensor))

            anaphoricity_index = autograd.Variable(torch.from_numpy(anaphoricity_word_indexs).type(torch.cuda.LongTensor))
            anaphoricity_span = autograd.Variable(torch.from_numpy(anaphoricity_spans).type(torch.cuda.FloatTensor))
            anaphoricity_feature = autograd.Variable(torch.from_numpy(anaphoricity_features).type(torch.cuda.FloatTensor))

            output, pair_score = network_model.forward_all_pair(nnargs["word_embedding_dimention"],mention_index,mention_spans,candi_index,candi_spans,pair_feature,anaphors,antecedents,0.0)
            ana_output, ana_score = ana_network.forward_anaphoricity(nnargs["word_embedding_dimention"], anaphoricity_index, anaphoricity_span, anaphoricity_feature, 0.0)
            ana_pair_output, ana_pair_score = ana_network.forward_all_pair(nnargs["word_embedding_dimention"],mention_index,mention_spans,candi_index,candi_spans,pair_feature,anaphors,antecedents, 0.0)

            reindex = autograd.Variable(torch.from_numpy(rl["reindex"]).type(torch.cuda.LongTensor))

            scores_reindex = torch.transpose(torch.cat((pair_score,ana_score),1),0,1)[reindex]
            ana_scores_reindex = torch.transpose(torch.cat((ana_pair_score,ana_score),1),0,1)[reindex]

            doc = docs_by_id[rl['did']]

            for s,e in zip(rl["starts"],rl["ends"]):
                score = score_softmax(torch.transpose(ana_scores_reindex[s:e],0,1)).data.cpu().numpy()[0]
                pair_score = score_softmax(torch.transpose(scores_reindex[s:e-1],0,1)).data.cpu().numpy()[0]

                ana_action = utils.sample_action(score)
                if ana_action == (e-s-1):
                    action = ana_action
                else:
                    pair_action = utils.sample_action(pair_score*score[:-1])
                    action = pair_action
                path.append(action)
                link = action
                m1, m2 = rl['ids'][s + link]
                doc.link(m1, m2)

            tmp_data.append((mention_word_index, mention_span, candi_word_index,candi_span,feature_pair,pair_antecedents,pair_anaphors,target,positive,negative,anaphoricity_word_indexs, anaphoricity_spans, anaphoricity_features, anaphoricity_target,rl,candi_ids_return))
                
            if rl["end"] == True:
                doc = docs_by_id[rl['did']]
                reward = doc.get_f1()
                inside_index = 0
                for mention_word_index, mention_span, candi_word_index,candi_span,feature_pair,pair_antecedents,pair_anaphors,target,positive,negative,anaphoricity_word_indexs, anaphoricity_spans, anaphoricity_features, anaphoricity_target,rl,candi_ids_return in tmp_data:

                    for (start, end) in zip(rl['starts'], rl['ends']):
                        ids = rl['ids'][start:end]
                        ana = ids[0, 1]
                        old_ant = doc.ana_to_ant[ana]
                        doc.unlink(ana)
                        costs = rl['costs'][start:end]
                        for ant_ind in range(end - start):
                            costs[ant_ind] = doc.link(ids[ant_ind, 0], ana, hypothetical=True, beta=1)
                        doc.link(old_ant, ana) 

                    cost = 0.0
                    mention_index = autograd.Variable(torch.from_numpy(mention_word_index).type(torch.cuda.LongTensor))
                    mention_spans = autograd.Variable(torch.from_numpy(mention_span).type(torch.cuda.FloatTensor))
                    candi_index = autograd.Variable(torch.from_numpy(candi_word_index).type(torch.cuda.LongTensor))
                    candi_spans = autograd.Variable(torch.from_numpy(candi_span).type(torch.cuda.FloatTensor))
                    pair_feature = autograd.Variable(torch.from_numpy(feature_pair).type(torch.cuda.FloatTensor))
                    anaphors = autograd.Variable(torch.from_numpy(pair_anaphors).type(torch.cuda.LongTensor))
                    antecedents = autograd.Variable(torch.from_numpy(pair_antecedents).type(torch.cuda.LongTensor))
                    anaphoricity_index = autograd.Variable(torch.from_numpy(anaphoricity_word_indexs).type(torch.cuda.LongTensor))
                    anaphoricity_span = autograd.Variable(torch.from_numpy(anaphoricity_spans).type(torch.cuda.FloatTensor))
                    anaphoricity_feature = autograd.Variable(torch.from_numpy(anaphoricity_features).type(torch.cuda.FloatTensor))
        
                    ana_output, ana_score = ana_network.forward_anaphoricity(nnargs["word_embedding_dimention"], anaphoricity_index, anaphoricity_span, anaphoricity_feature, dropout_rate)
                    ana_pair_output, ana_pair_score = ana_network.forward_all_pair(nnargs["word_embedding_dimention"],mention_index,mention_spans,candi_index,candi_spans,pair_feature,anaphors,antecedents,dropout_rate)
        
                    reindex = autograd.Variable(torch.from_numpy(rl["reindex"]).type(torch.cuda.LongTensor))
        
                    ana_scores_reindex = torch.transpose(torch.cat((ana_pair_score,ana_score),1),0,1)[reindex]
        
                    ana_optimizer.zero_grad()
                    ana_loss = None
                    i = inside_index
                    for s,e in zip(rl["starts"],rl["ends"]):
                        costs = rl["costs"][s:e]
                        costs = autograd.Variable(torch.from_numpy(costs).type(torch.cuda.FloatTensor))
                        score = torch.squeeze(score_softmax(torch.transpose(ana_scores_reindex[s:e],0,1)))
                        baseline = torch.sum(score*costs) 

                        action = path[i]
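                        # REINFORCE-style update: negative log-probability of the sampled
                        # action, scaled by (episode reward minus the expected-cost baseline).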
                        this_cost = torch.log(score[action])*-1.0*(reward-baseline)
                        
                        if ana_loss is None:
                            ana_loss = this_cost
                        else:
                            ana_loss += this_cost
                        i += 1
                    ana_loss.backward()
                    torch.nn.utils.clip_grad_norm(ana_network.parameters(), 5.0)
                    ana_optimizer.step()
        
                    mention_index = autograd.Variable(torch.from_numpy(mention_word_index).type(torch.cuda.LongTensor))
                    mention_spans = autograd.Variable(torch.from_numpy(mention_span).type(torch.cuda.FloatTensor))
                    candi_index = autograd.Variable(torch.from_numpy(candi_word_index).type(torch.cuda.LongTensor))
                    candi_spans = autograd.Variable(torch.from_numpy(candi_span).type(torch.cuda.FloatTensor))
                    pair_feature = autograd.Variable(torch.from_numpy(feature_pair).type(torch.cuda.FloatTensor))
                    anaphors = autograd.Variable(torch.from_numpy(pair_anaphors).type(torch.cuda.LongTensor))
                    antecedents = autograd.Variable(torch.from_numpy(pair_antecedents).type(torch.cuda.LongTensor))
        
                    anaphoricity_index = autograd.Variable(torch.from_numpy(anaphoricity_word_indexs).type(torch.cuda.LongTensor))
                    anaphoricity_span = autograd.Variable(torch.from_numpy(anaphoricity_spans).type(torch.cuda.FloatTensor))
                    anaphoricity_feature = autograd.Variable(torch.from_numpy(anaphoricity_features).type(torch.cuda.FloatTensor))
        
                    output, pair_score = network_model.forward_all_pair(nnargs["word_embedding_dimention"],mention_index,mention_spans,candi_index,candi_spans,pair_feature,anaphors,antecedents,dropout_rate)
        
                    ana_output, ana_score = ana_network.forward_anaphoricity(nnargs["word_embedding_dimention"], anaphoricity_index, anaphoricity_span, anaphoricity_feature, dropout_rate)
        
                    reindex = autograd.Variable(torch.from_numpy(rl["reindex"]).type(torch.cuda.LongTensor))
        
                    scores_reindex = torch.transpose(torch.cat((pair_score,ana_score),1),0,1)[reindex]
        
                    pair_loss = None
                    optimizer.zero_grad()
                    i = inside_index
                    index = 0
                    for s,e in zip(rl["starts"],rl["ends"]):
                        action = path[i]
                        if (not (action == (e-s-1))) and (anaphoricity_target[index] == 1):
                            costs = rl["costs"][s:e-1]
                            costs = autograd.Variable(torch.from_numpy(costs).type(torch.cuda.FloatTensor))
                            score = torch.squeeze(score_softmax(torch.transpose(scores_reindex[s:e-1],0,1)))
                            baseline = torch.sum(score*costs)
                            this_cost = torch.log(score[action])*-1.0*(reward-baseline)
                            if pair_loss is None:
                                pair_loss = this_cost
                            else:
                                pair_loss += this_cost
                        i += 1
                        index += 1
                    if pair_loss is not None:
                        pair_loss.backward()
                        torch.nn.utils.clip_grad_norm(network_model.parameters(), 5.0)
                        optimizer.step()
                    inside_index = i

                tmp_data = []
                path = []
                        
        end_time = timeit.default_timer()
        print >> sys.stderr, "TRAINING Use %.3f seconds"%(end_time-start_time)
        print >> sys.stderr, "cost:",cost
        print >> sys.stderr,"save model ..."
        torch.save(network_model, model_save_dir+"network_model_rl_worker.%d"%echo)
        torch.save(ana_network, model_save_dir+"network_model_rl_manager.%d"%echo)
        
        print "DEV"
        metric = performance.performance(dev_docs_iter,network_model,ana_network) 
        print "Average:",metric["average"]
        print "DEV Ana: ",metric["ana"]
        print "TEST"
        metric = performance.performance(test_docs_iter,network_model,ana_network) 
        print "Average:",metric["average"]
        print "TEST Ana: ",metric["ana"]
        print

        sys.stdout.flush()
Example #12
        if i % (len(train_data) // Division) == 0:
            # evaluate
            eval_result = evaluate(model, dev_data, dictionaries)
            accuracys.append(eval_result['accuracy'])
            precisions.append(eval_result['precision'])
            recalls.append(eval_result['recall'])
            FB1s.append(eval_result['FB1'])
            save_model_dictionaries('model', model, dictionaries, opts)

        # Step 1. Remember that Pytorch accumulates gradients.  We need to clear them out
        # before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is, turn them into Variables
        # of word indices.
        input_words = autograd.Variable(torch.LongTensor(train_data[index]['words']))
        targets = autograd.Variable(torch.LongTensor(train_data[index]['tags']))

        # Step 3. Run our forward pass. We combine this step with get_loss function
        #tag_scores = model(sentence_in)

        # Step 4. Compute the loss, gradients, and update the parameters by calling
        loss = model.get_loss(targets, input_words = input_words)

        epoch_costs.append(loss.data.numpy())
        loss.backward()
        nn.utils.clip_grad_norm(model.parameters(), opts.clip)
        optimizer.step()


    print("Epoch %i, cost average: %f" % (epoch, np.mean(epoch_costs)))
def main():

    envs = [make_env() for i in range(num_envs)]
    envs = SubprocVecEnv(envs)

    state_shape = envs.observation_space.shape
    num_actions = envs.action_space.n
    num_rewards = len(task_rewards[mode])

    full_rollout = True

    env_model     = EnvModel(envs.observation_space.shape, num_pixels, num_rewards)
    env_model.load_state_dict(torch.load("env_model_" + mode))

    distil_policy = ActorCritic(envs.observation_space.shape, envs.action_space.n)
    distil_optimizer = optim.Adam(distil_policy.parameters())

    imagination = ImaginationCore(1, state_shape, num_actions, num_rewards, env_model, distil_policy, full_rollout=full_rollout)

    actor_critic = I2A(state_shape, num_actions, num_rewards, 256, imagination, full_rollout=full_rollout)
    #rmsprop hyperparams:
    lr    = 7e-4
    eps   = 1e-5
    alpha = 0.99
    optimizer = optim.RMSprop(actor_critic.parameters(), lr, eps=eps, alpha=alpha)

    #if USE_CUDA:
    #    env_model     = env_model.cuda()
    #    distil_policy = distil_policy.cuda()
    #    actor_critic  = actor_critic.cuda()

    gamma = 0.99
    entropy_coef = 0.01
    value_loss_coef = 0.5
    max_grad_norm = 0.5
    num_steps = 5
    num_frames = int(10e5)

    rollout = RolloutStorage(num_steps, num_envs, envs.observation_space.shape)
    #rollout.cuda()

    all_rewards = []
    all_losses  = []

    state = envs.reset()
    current_state = torch.FloatTensor(np.float32(state))

    rollout.states[0].copy_(current_state)

    episode_rewards = torch.zeros(num_envs, 1)
    final_rewards   = torch.zeros(num_envs, 1)

    for i_update in tqdm(range(num_frames)):

        for step in range(num_steps):
            #if USE_CUDA:
            #    current_state = current_state.cuda()
            action = actor_critic.act(autograd.Variable(current_state))

            next_state, reward, done, _ = envs.step(action.squeeze(1).cpu().data.numpy())

            reward = torch.FloatTensor(reward).unsqueeze(1)
            episode_rewards += reward
            masks = torch.FloatTensor(1-np.array(done)).unsqueeze(1)
            final_rewards *= masks
            final_rewards += (1-masks) * episode_rewards
            episode_rewards *= masks

            #if USE_CUDA:
            #    masks = masks.cuda()

            current_state = torch.FloatTensor(np.float32(next_state))
            rollout.insert(step, current_state, action.data, reward, masks)


        _, next_value = actor_critic(autograd.Variable(rollout.states[-1], volatile=True))
        next_value = next_value.data

        returns = rollout.compute_returns(next_value, gamma)

        logit, action_log_probs, values, entropy = actor_critic.evaluate_actions(
            autograd.Variable(rollout.states[:-1]).view(-1, *state_shape),
            autograd.Variable(rollout.actions).view(-1, 1)
        )
    
        distil_logit, _, _, _ = distil_policy.evaluate_actions(
            autograd.Variable(rollout.states[:-1]).view(-1, *state_shape),
            autograd.Variable(rollout.actions).view(-1, 1)
        )
        
        distil_loss = 0.01 * (F.softmax(logit).detach() * F.log_softmax(distil_logit)).sum(1).mean()

        values = values.view(num_steps, num_envs, 1)
        action_log_probs = action_log_probs.view(num_steps, num_envs, 1)
        advantages = autograd.Variable(returns) - values

        value_loss = advantages.pow(2).mean()
        action_loss = -(autograd.Variable(advantages.data) * action_log_probs).mean()

        optimizer.zero_grad()
        loss = value_loss * value_loss_coef + action_loss - entropy * entropy_coef
        loss.backward()
        nn.utils.clip_grad_norm(actor_critic.parameters(), max_grad_norm)
        optimizer.step()
    
        distil_optimizer.zero_grad()
        distil_loss.backward()
        distil_optimizer.step()
    
        if i_update % 100 == 0:
            all_rewards.append(final_rewards.mean())
            all_losses.append(loss.item())
        
            #clear_output(True)
            plt.figure(figsize=(20,5))
            plt.subplot(131)
            plt.title('epoch %s. reward: %s' % (i_update, np.mean(all_rewards[-10:])))
            plt.plot(all_rewards)
            plt.subplot(132)
            plt.title('loss %s' % all_losses[-1])
            plt.plot(all_losses)
            plt.show()
        
        rollout.after_update()

    torch.save(actor_critic.state_dict(), "i2a_" + mode)
 def feature_size(self):
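     # Pushes an all-zeros dummy input through the feature extractor to infer the flattened
     # feature size, a common trick for sizing the first fully connected layer.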
     return self.features(autograd.Variable(torch.zeros(1, *self.in_shape))).view(1, -1).size(1)
Example #15
import math, random

import gym
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.autograd as autograd 
import torch.nn.functional as F

from IPython.display import clear_output
import matplotlib.pyplot as plt

USE_CUDA = torch.cuda.is_available()
Variable = lambda *args, **kwargs: autograd.Variable(*args, **kwargs).cuda() if USE_CUDA else autograd.Variable(*args, **kwargs)

from collections import deque

class ReplayBuffer(object):
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)
    
    def push(self, state, action, reward, next_state, done):
        state      = np.expand_dims(state, 0)
        next_state = np.expand_dims(next_state, 0)
            
        self.buffer.append((state, action, reward, next_state, done))
    
    def sample(self, batch_size):
        state, action, reward, next_state, done = zip(*random.sample(self.buffer, batch_size))
        return np.concatenate(state), action, reward, np.concatenate(next_state), done
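
# Hedged usage sketch for ReplayBuffer; the toy 4-dimensional states are illustrative only.
replay = ReplayBuffer(capacity=100)
for _ in range(5):
    replay.push(np.zeros(4), 0, 1.0, np.ones(4), False)
states, actions, rewards, next_states, dones = replay.sample(batch_size=2)
print(states.shape)  # (2, 4): the sampled states are concatenated along the first dimension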
Example #16
 def wrap_var(self, x, **kwargs):
     x = A.Variable(x, **kwargs)
     return x
Example #17
def train(model, dataloader, devloader,loss_function=nn.CrossEntropyLoss(),
          init_lr=0.1, epochs=100, lr_decay_epoch = 30,
          print_epoch = 10, gpu=False):

    # CUDA is not critical for this task with low-dimensional inputs
    if gpu and torch.cuda.is_available():
        model.cuda()
    losses = []
    train_accs_top10 = []
    dev_losses = []
    dev_accs_top10 = []
    for epoch in range(epochs):

        # learning rate decay
        div, mod = divmod(epoch, lr_decay_epoch)
        if mod == 0:
            optimizer = optim.SGD(model.parameters(), lr=init_lr*(0.1)**div)

        total_loss = torch.Tensor([0])
        total_dev_loss = torch.Tensor([0])

        # iterate the dataset to load context heroes(team) and center hero(target)
        for teams, targets in dataloader:

            if gpu and torch.cuda.is_available():
                teams = teams.cuda()
                targets = targets.cuda()

            # wrap the embeddings of the team and target center hero to Variable
            inputs = autograd.Variable(teams)
            targets = autograd.Variable(targets.view(-1))

            # zero out the accumulated gradients
            model.zero_grad()

            # Run the forward pass
            out = model(inputs)
            # Compute your loss function.
            loss = loss_function(out, targets)

            # backpropagate and update the embeddings
            loss.backward()
            optimizer.step()

            # record total loss in this epoch
            total_loss += loss.cpu().data
        # acc_train_top10 = accuracy_in_train(model,dataloader,batch_size=16)
        # train_accs_top10.append(acc_train_top10)
        print("total_loss is %s"%total_loss)
        # print("total_train_acc is %s"%acc_train_top10)

        for teams, targets in devloader:

            if gpu and torch.cuda.is_available():
                teams = teams.cuda()
                targets = targets.cuda()

            # wrap the embeddings of the team and target center hero to Variable
            inputs = autograd.Variable(teams)
            targets = autograd.Variable(targets.view(-1))

            # zero out the accumulated gradients
            # model.zero_grad()

            # Run the forward pass
            out = model(inputs)

            # Compute your loss function.
            dev_loss = loss_function(out, targets)
            # print("dev_loss is %s"%dev_loss)

            # # backpropagate and update the embeddings
            # loss.backward()
            # optimizer.step()

            # record total loss in this epoch
            total_dev_loss += dev_loss.cpu().data 
        print("total_dev_loss is %s"%total_dev_loss)
        # acc_dev_top10 = accuracy_in_train(model,devloader,batch_size=16)
        # print("total_dev_acc is %s"%acc_dev_top10)

        # dev_accs_top10.append(acc_dev_top10)
        if epoch % print_epoch == 0:
            print('epoch: %d, loss: %.3f' % (epoch, total_loss/len(dataloader)))
            print("dev loss:%s"%str(total_dev_loss/len(devloader)))
           

        losses.append(total_loss/len(dataloader))
        dev_losses.append(total_dev_loss/len(devloader))
    # return losses for plot
    return np.array(losses),np.array(dev_losses),np.array(train_accs_top10),np.array(dev_accs_top10)
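# Hedged usage sketch; model, train_loader and dev_loader are assumed to be built elsewhere
# (e.g. DataLoaders yielding (team, target) batches):
#   losses, dev_losses, _, _ = train(model, train_loader, dev_loader,
#                                    init_lr=0.1, epochs=100, lr_decay_epoch=30, gpu=True)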
Example #18
optimizer = optim.SGD(model.parameters(), lr=0.0008, weight_decay=1e-2)


n_train_samples = len(X_train)
# For n epochs...
for epoch in range(N_EPOCHS):
  total_loss = torch.Tensor([0])

  random_indices = np.random.permutation(n_train_samples)

  for index in random_indices:
    review = X_train[index]
    label = int(y_train[index]) # Why doesn't y.astype(int) work??

    # Initialize hidden layer
    hidden = autograd.Variable(torch.zeros((1, 128)))
    word_vector = autograd.Variable(torch.LongTensor(review))
    model.zero_grad()
    for w in range(word_vector.size()[0]):
      output, hidden = model(word_vector[w], hidden)
    
    loss = loss_function(output, autograd.Variable(torch.LongTensor([label])))
    loss.backward()
    torch.nn.utils.clip_grad_norm(model.parameters(), MAX_NORM)
    optimizer.step()
    total_loss += loss.data
  print(torch.norm(next(model.parameters()).grad))
  print("[epoch {}] {}".format(epoch, total_loss))

#print(losses)  # The loss decreased every iteration over the training data!
Example #19
  def init_hidden(self, batch):

    return (autograd.Variable(torch.zeros(self.hidden_layers, batch, self.hidden_dim)),  # (num_layers * num_directions, batch, hidden_size)
            autograd.Variable(torch.zeros(self.hidden_layers, batch, self.hidden_dim)))
Example #20
    def _viterbi_decode_nbest(self, feats, mask, nbest):
        """
            input:
                feats: (batch, seq_len, self.tag_size+2)
                mask: (batch, seq_len)
            output:
                decode_idx: (batch, seq_len, nbest) decoded sequences
                path_score: (batch, nbest) corresponding score for each sequence (to be implemented)
                nbest decoding for sentences with a single token is not well supported; to be optimized
        """
        batch_size = feats.size(0)
        seq_len = feats.size(1)
        tag_size = feats.size(2)
        assert (tag_size == self.tagset_size + 2)
        ## calculate sentence length for each sentence
        length_mask = torch.sum(mask.long(), dim=1).view(batch_size, 1).long()
        ## mask to (seq_len, batch_size)
        mask = mask.transpose(1, 0).contiguous()
        ins_num = seq_len * batch_size
        ## be careful with the view shape: it is .view(ins_num, 1, tag_size), not .view(ins_num, tag_size, 1)
        feats = feats.transpose(1, 0).contiguous().view(
            ins_num, 1, tag_size).expand(ins_num, tag_size, tag_size)
        ## need to consider start
        scores = feats + self.transitions.view(1, tag_size, tag_size).expand(
            ins_num, tag_size, tag_size)
        scores = scores.view(seq_len, batch_size, tag_size, tag_size)

        # build iter
        seq_iter = enumerate(scores)
        ## record the position of best score
        back_points = list()
        partition_history = list()
        ##  reverse mask (bug for mask = 1- mask, use this as alternative choice)
        # mask = 1 + (-1)*mask
        mask = (1 - mask.long()).bool()
        _, inivalues = next(
            seq_iter)  # bat_size * from_target_size * to_target_size
        # only need start from start_tag
        partition = inivalues[:, START_TAG, :].clone(
        )  # bat_size * to_target_size
        ## initial partition [batch_size, tag_size]
        partition_history.append(
            partition.view(batch_size, tag_size,
                           1).expand(batch_size, tag_size, nbest))
        # iter over last scores
        for idx, cur_values in seq_iter:
            if idx == 1:
                cur_values = cur_values.view(
                    batch_size, tag_size,
                    tag_size) + partition.contiguous().view(
                        batch_size, tag_size, 1).expand(
                            batch_size, tag_size, tag_size)
            else:
                # previous to_target is current from_target
                # partition: previous results log(exp(from_target)), #(batch_size * nbest * from_target)
                # cur_values: batch_size * from_target * to_target
                cur_values = cur_values.view(
                    batch_size, tag_size, 1, tag_size).expand(
                        batch_size, tag_size, nbest,
                        tag_size) + partition.contiguous().view(
                            batch_size, tag_size, nbest, 1).expand(
                                batch_size, tag_size, nbest, tag_size)
                ## compare all nbest and all from target
                cur_values = cur_values.view(batch_size, tag_size * nbest,
                                             tag_size)
                # print "cur size:",cur_values.size()
            partition, cur_bp = torch.topk(cur_values, nbest, 1)
            ## cur_bp/partition: [batch_size, nbest, tag_size]; ids must be normalized by nbest in the backtrace step below
            # print partition[:,0,:]
            # print cur_bp[:,0,:]
            # print "nbest, ",idx
            if idx == 1:
                cur_bp = cur_bp * nbest
            partition = partition.transpose(2, 1)
            cur_bp = cur_bp.transpose(2, 1)

            # print partition
            # exit(0)
            #partition: (batch_size * to_target * nbest)
            #cur_bp: (batch_size * to_target * nbest) Notice the cur_bp number is the whole position of tag_size*nbest, need to convert when decode
            partition_history.append(partition)
            ## cur_bp: (batch_size,nbest, tag_size) topn source score position in current tag
            ## set padded label as 0, which will be filtered in post processing
            ## mask[idx] ? mask[idx-1]
            cur_bp.masked_fill_(
                mask[idx].view(batch_size, 1,
                               1).expand(batch_size, tag_size, nbest), 0)
            # print cur_bp[0]
            back_points.append(cur_bp)
        ### add score to final STOP_TAG
        partition_history = torch.cat(partition_history, 0).view(
            seq_len, batch_size, tag_size, nbest).transpose(
                1, 0).contiguous()  ## (batch_size, seq_len, nbest, tag_size)
        ### get the last position for each sentence, and select the last partitions using gather()
        last_position = length_mask.view(batch_size, 1, 1, 1).expand(
            batch_size, 1, tag_size, nbest) - 1
        last_partition = torch.gather(partition_history, 1,
                                      last_position).view(
                                          batch_size, tag_size, nbest, 1)
        ### calculate the score from last partition to end state (and then select the STOP_TAG from it)
        last_values = last_partition.expand(
            batch_size, tag_size, nbest, tag_size) + self.transitions.view(
                1, tag_size, 1, tag_size).expand(batch_size, tag_size, nbest,
                                                 tag_size)
        last_values = last_values.view(batch_size, tag_size * nbest, tag_size)
        end_partition, end_bp = torch.topk(last_values, nbest, 1)
        ## end_partition: (batch, nbest, tag_size)
        end_bp = end_bp.transpose(2, 1)
        # end_bp: (batch, tag_size, nbest)
        pad_zero = autograd.Variable(torch.zeros(batch_size, tag_size,
                                                 nbest)).long()
        if self.gpu:
            pad_zero = pad_zero.cuda()
        back_points.append(pad_zero)
        back_points = torch.cat(back_points).view(seq_len, batch_size,
                                                  tag_size, nbest)

        ## select end ids in STOP_TAG
        pointer = end_bp[:, STOP_TAG, :]  ## (batch_size, nbest)
        insert_last = pointer.contiguous().view(
            batch_size, 1, 1, nbest).expand(batch_size, 1, tag_size, nbest)
        back_points = back_points.transpose(1, 0).contiguous()
        ## move the end ids(expand to tag_size) to the corresponding position of back_points to replace the 0 values
        # print "lp:",last_position
        # print "il:",insert_last[0]
        # exit(0)
        ## copy the ids of the last position (insert_last) into back_points, through the last_position index
        ## last_position includes the length of batch sentences
        # print "old:", back_points[9,0,:,:]
        back_points.scatter_(1, last_position, insert_last)
        ## back_points: [batch_size, seq_length, tag_size, nbest]
        # print "new:", back_points[9,0,:,:]
        # exit(0)
        # print pointer[2]
        '''
        back_points: in simple demonstratration
        x,x,x,x,x,x,x,x,x,7
        x,x,x,x,x,4,0,0,0,0
        x,x,6,0,0,0,0,0,0,0
        '''

        back_points = back_points.transpose(1, 0).contiguous()
        # print back_points[0]
        ## back_points: (seq_len, batch, tag_size, nbest)
        ## decode from the end, padded position ids are 0, which will be filtered in following evaluation
        decode_idx = autograd.Variable(
            torch.LongTensor(seq_len, batch_size, nbest))
        if self.gpu:
            decode_idx = decode_idx.cuda()
        decode_idx[-1] = pointer.data / nbest
        # print "pointer-1:",pointer[2]
        # exit(0)
        # use old mask, let 0 means has token
        for idx in range(len(back_points) - 2, -1, -1):
            # print "pointer: ",idx,  pointer[3]
            # print "back:",back_points[idx][3]
            # print "mask:",mask[idx+1,3]
            new_pointer = torch.gather(
                back_points[idx].view(batch_size, tag_size * nbest), 1,
                pointer.contiguous().view(batch_size, nbest))
            decode_idx[idx] = new_pointer.data / nbest
            # # use new pointer to remember the last end nbest ids for non longest
            pointer = new_pointer + pointer.contiguous().view(
                batch_size, nbest) * mask[idx].view(batch_size, 1).expand(
                    batch_size, nbest).long()

        # exit(0)
        path_score = None
        decode_idx = decode_idx.transpose(1, 0)
        ## decode_idx: [batch, seq_len, nbest]
        # print decode_idx[:,:,0]
        # print "nbest:",nbest
        # print "diff:", decode_idx[:,:,0]- decode_idx[:,:,4]
        # print decode_idx[:,0,:]
        # exit(0)

        ### calculate probability for each sequence
        scores = end_partition[:, :, STOP_TAG]
        ## scores: [batch_size, nbest]
        max_scores, _ = torch.max(scores, 1)
        minus_scores = scores - max_scores.view(batch_size, 1).expand(
            batch_size, nbest)
        path_score = F.softmax(minus_scores, 1)
        ## path_score: [batch_size, nbest]
        # exit(0)
        return path_score, decode_idx
Example #21
#Author: Zhi Zhong

import sys 
import torch 
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

## LSTM cell (input->3, output->3)
lstm = nn.LSTM(3,3)
inputs = [autograd.Variable(torch.randn((1,3))) for _ in range(5)]
print(inputs)


hidden = (autograd.Variable(torch.randn((1,1,3))),
          autograd.Variable(torch.randn((1,1,3))))

print(hidden) 


for i in inputs:
    out, hidden = lstm(i.view(1,1,-1), hidden)
    print(out)
    print(hidden)


 def init_hidden(self):
     # the first is the hidden h
     # the second is the cell  c
     return (autograd.Variable(torch.zeros(1, self.batch_size, self.hidden_dim).cuda()),
             autograd.Variable(torch.zeros(1, self.batch_size, self.hidden_dim).cuda()))
Example #23
    def _viterbi_decode(self, feats, mask):
        """
            input:
                feats: (batch, seq_len, self.tag_size+2)
                mask: (batch, seq_len)
            output:
                decode_idx: (batch, seq_len) decoded sequence
                path_score: (batch, 1) corresponding score for each sequence (to be implemented)
        """
        batch_size = feats.size(0)
        seq_len = feats.size(1)
        tag_size = feats.size(2)
        assert (tag_size == self.tagset_size + 2)
        ## calculate sentence length for each sentence
        length_mask = torch.sum(mask.long(), dim=1).view(batch_size, 1).long()
        ## mask to (seq_len, batch_size)
        mask = mask.transpose(1, 0).contiguous()
        ins_num = seq_len * batch_size
        ## be careful with the view shape: it is .view(ins_num, 1, tag_size), not .view(ins_num, tag_size, 1)
        feats = feats.transpose(1, 0).contiguous().view(
            ins_num, 1, tag_size).expand(ins_num, tag_size, tag_size)
        ## need to consider start
        scores = feats + self.transitions.view(1, tag_size, tag_size).expand(
            ins_num, tag_size, tag_size)
        scores = scores.view(seq_len, batch_size, tag_size, tag_size)

        # build iter
        seq_iter = enumerate(scores)
        ## record the position of best score
        back_points = list()
        partition_history = list()

        ##  reverse mask (bug for mask = 1- mask, use this as alternative choice)
        # mask = 1 + (-1)*mask
        mask = (1 - mask.long()).byte()
        _, inivalues = seq_iter.__next__(
        )  # bat_size * from_target_size * to_target_size
        # only need start from start_tag
        partition = inivalues[:, START_TAG, :].clone().view(
            batch_size, tag_size)  # bat_size * to_target_size
        partition_history.append(partition)
        # iter over last scores
        for idx, cur_values in seq_iter:
            # previous to_target is current from_target
            # partition: previous results log(exp(from_target)), #(batch_size * from_target)
            # cur_values: batch_size * from_target * to_target
            cur_values = cur_values + partition.contiguous().view(
                batch_size, tag_size, 1).expand(batch_size, tag_size, tag_size)
            ## forscores, cur_bp = torch.max(cur_values[:,:-2,:], 1) # do not consider START_TAG/STOP_TAG
            partition, cur_bp = torch.max(cur_values, 1)
            partition_history.append(partition)
            ## cur_bp: (batch_size, tag_size) max source score position in current tag
            ## set padded label as 0, which will be filtered in post processing
            cur_bp.masked_fill_(
                mask[idx].view(batch_size, 1).expand(batch_size, tag_size), 0)
            back_points.append(cur_bp)
        ### add score to final STOP_TAG
        partition_history = torch.cat(partition_history, 0).view(
            seq_len, batch_size,
            -1).transpose(1,
                          0).contiguous()  ## (batch_size, seq_len, tag_size)
        ### get the last position for each sentence, and select the last partitions using gather()
        last_position = length_mask.view(batch_size, 1, 1).expand(
            batch_size, 1, tag_size) - 1
        last_partition = torch.gather(partition_history, 1,
                                      last_position).view(
                                          batch_size, tag_size, 1)
        ### calculate the score from last partition to end state (and then select the STOP_TAG from it)
        last_values = last_partition.expand(
            batch_size, tag_size, tag_size) + self.transitions.view(
                1, tag_size, tag_size).expand(batch_size, tag_size, tag_size)
        _, last_bp = torch.max(last_values, 1)
        pad_zero = autograd.Variable(torch.zeros(batch_size, tag_size)).long()
        if self.gpu:
            pad_zero = pad_zero.cuda()
        back_points.append(pad_zero)
        back_points = torch.cat(back_points).view(seq_len, batch_size,
                                                  tag_size)

        ## select end ids in STOP_TAG
        pointer = last_bp[:, STOP_TAG]
        insert_last = pointer.contiguous().view(batch_size, 1, 1).expand(
            batch_size, 1, tag_size)
        back_points = back_points.transpose(1, 0).contiguous()
        ## move the end ids(expand to tag_size) to the corresponding position of back_points to replace the 0 values
        # print "lp:",last_position
        # print "il:",insert_last
        back_points.scatter_(1, last_position, insert_last)
        # print "bp:",back_points
        # exit(0)
        back_points = back_points.transpose(1, 0).contiguous()
        ## decode from the end; padded position ids are 0 and will be filtered in the following evaluation
        decode_idx = autograd.Variable(torch.LongTensor(seq_len, batch_size))
        if self.gpu:
            decode_idx = decode_idx.cuda()
        decode_idx[-1] = pointer.data
        for idx in range(len(back_points) - 2, -1, -1):
            pointer = torch.gather(back_points[idx], 1,
                                   pointer.contiguous().view(batch_size, 1))
            decode_idx[idx] = pointer.data
        path_score = None
        decode_idx = decode_idx.transpose(1, 0)
        return path_score, decode_idx
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    idxs = torch.LongTensor(idxs)
    #print(len(tensor))
    #tensor = tensor.view(batch_size,len(tensor)/batch_size)
    return autograd.Variable(idxs)
Example #25
def main():

    DIR = args.DIR
    embedding_file = args.embedding_dir

    #network_file = "./model/model.pkl"
    network_file = "./model/pretrain/network_model_pretrain.49"
    if os.path.isfile(network_file):
        print >> sys.stderr, "Read model from ./model/model.pkl"
        network_model = torch.load(network_file)
    else:
        embedding_matrix = numpy.load(embedding_file)
        #print len(embedding_matrix)

        "Building torch model"
        network_model = network.Network(pair_feature_dimention,
                                        mention_feature_dimention,
                                        word_embedding_dimention,
                                        span_dimention, 1000, embedding_size,
                                        embedding_dimention,
                                        embedding_matrix).cuda()
        print >> sys.stderr, "save model ..."
        torch.save(network_model, network_file)

    reduced = ""
    if args.reduced == 1:
        reduced = "_reduced"

    print >> sys.stderr, "prepare data for train ..."
    train_docs = DataReader.DataGnerater("train" + reduced)
    print >> sys.stderr, "prepare data for dev and test ..."
    dev_docs = DataReader.DataGnerater("dev" + reduced)
    test_docs = DataReader.DataGnerater("test" + reduced)

    l2_lambda = 1e-6
    lr = 0.00009
    dropout_rate = 0.5
    shuffle = True
    times = 0
    best_thres = 0.5

    model_save_dir = "./model/pretrain/"

    last_cost = 0.0
    all_best_results = {
        'thresh': 0.0,
        'accuracy': 0.0,
        'precision': 0.0,
        'recall': 0.0,
        'f1': 0.0
    }

    #for echo in range(30,200):
    for echo in range(50, 150):

        start_time = timeit.default_timer()
        print "Pretrain Epoch:", echo

        if echo == 100:
            lr = lr * 0.7
        #if echo == 150:
        #    lr = lr/2.0

        #optimizer = optim.RMSprop(filter(lambda p: p.requires_grad, network_model.parameters()), lr=lr, weight_decay=l2_lambda)
        #optimizer = optim.RMSprop(network_model.parameters(), lr=lr, weight_decay=l2_lambda)
        optimizer = optim.RMSprop(network_model.parameters(),
                                  lr=lr,
                                  eps=1e-5,
                                  weight_decay=l2_lambda)

        pair_cost_this_turn = 0.0
        ana_cost_this_turn = 0.0

        pair_nums = 0
        ana_nums = 0

        pos_num = 0
        neg_num = 0
        inside_time = 0.0

        for data in train_docs.train_generater(shuffle=shuffle):

            mention_word_index, mention_span, candi_word_index,candi_span,feature_pair,pair_antecedents,pair_anaphors,\
            target,positive,negative,anaphoricity_word_indexs, anaphoricity_spans, anaphoricity_features, anaphoricity_target = data
            mention_index = autograd.Variable(
                torch.from_numpy(mention_word_index).type(
                    torch.cuda.LongTensor))
            mention_span = autograd.Variable(
                torch.from_numpy(mention_span).type(torch.cuda.FloatTensor))
            candi_index = autograd.Variable(
                torch.from_numpy(candi_word_index).type(torch.cuda.LongTensor))
            candi_spans = autograd.Variable(
                torch.from_numpy(candi_span).type(torch.cuda.FloatTensor))
            pair_feature = autograd.Variable(
                torch.from_numpy(feature_pair).type(torch.cuda.FloatTensor))
            anaphors = autograd.Variable(
                torch.from_numpy(pair_anaphors).type(torch.cuda.LongTensor))
            antecedents = autograd.Variable(
                torch.from_numpy(pair_antecedents).type(torch.cuda.LongTensor))

            anaphoricity_index = autograd.Variable(
                torch.from_numpy(anaphoricity_word_indexs).type(
                    torch.cuda.LongTensor))
            anaphoricity_span = autograd.Variable(
                torch.from_numpy(anaphoricity_spans).type(
                    torch.cuda.FloatTensor))
            anaphoricity_feature = autograd.Variable(
                torch.from_numpy(anaphoricity_features).type(
                    torch.cuda.FloatTensor))

            gold = target.tolist()
            anaphoricity_gold = anaphoricity_target.tolist()

            pair_nums += len(gold)
            ana_nums += len(anaphoricity_gold)

            lable = autograd.Variable(torch.cuda.FloatTensor([gold]))
            ana_lable = autograd.Variable(
                torch.cuda.FloatTensor([anaphoricity_gold]))

            output, _ = network_model.forward_all_pair(
                word_embedding_dimention, mention_index, mention_span,
                candi_index, candi_spans, pair_feature, anaphors, antecedents,
                dropout_rate)
            ana_output, _ = network_model.forward_anaphoricity(
                word_embedding_dimention, anaphoricity_index,
                anaphoricity_span, anaphoricity_feature, dropout_rate)

            optimizer.zero_grad()

            #loss = get_pair_loss(output,positive,negative,train_docs.scale_factor)
            loss = F.binary_cross_entropy(
                output, lable, size_average=False) / train_docs.scale_factor
            ana_loss = F.binary_cross_entropy(
                ana_output, ana_lable,
                size_average=False) / train_docs.anaphoricity_scale_factor

            pair_cost_this_turn += loss.data[0] * train_docs.scale_factor
            ana_cost_this_turn += ana_loss.data[
                0] * train_docs.anaphoricity_scale_factor

            loss_all = loss + ana_loss
            loss_all.backward()
            optimizer.step()

        end_time = timeit.default_timer()
        print >> sys.stderr, "PreTrain epoch", echo, "Pair total cost:", pair_cost_this_turn / float(
            pair_nums), "Anaphoricity total cost", ana_cost_this_turn / float(
                ana_nums)
        print >> sys.stderr, "PreTRAINING Use %.3f seconds" % (end_time -
                                                               start_time)
        print >> sys.stderr, "Learning Rate", lr

        print >> sys.stderr, "save model ..."
        torch.save(network_model,
                   model_save_dir + "network_model_pretrain.%d" % echo)

        #if cost_this_turn > last_cost:
        #    lr = lr*0.7
        gold = []
        predict = []

        ana_gold = []
        ana_predict = []

        for data in dev_docs.train_generater(shuffle=False):

            mention_word_index, mention_span, candi_word_index,candi_span,feature_pair,pair_antecedents,pair_anaphors,\
            target,positive,negative, anaphoricity_word_indexs, anaphoricity_spans, anaphoricity_features, anaphoricity_target = data

            mention_index = autograd.Variable(
                torch.from_numpy(mention_word_index).type(
                    torch.cuda.LongTensor))
            mention_span = autograd.Variable(
                torch.from_numpy(mention_span).type(torch.cuda.FloatTensor))
            candi_index = autograd.Variable(
                torch.from_numpy(candi_word_index).type(torch.cuda.LongTensor))
            candi_spans = autograd.Variable(
                torch.from_numpy(candi_span).type(torch.cuda.FloatTensor))
            pair_feature = autograd.Variable(
                torch.from_numpy(feature_pair).type(torch.cuda.FloatTensor))
            anaphors = autograd.Variable(
                torch.from_numpy(pair_anaphors).type(torch.cuda.LongTensor))
            antecedents = autograd.Variable(
                torch.from_numpy(pair_antecedents).type(torch.cuda.LongTensor))

            anaphoricity_index = autograd.Variable(
                torch.from_numpy(anaphoricity_word_indexs).type(
                    torch.cuda.LongTensor))
            anaphoricity_span = autograd.Variable(
                torch.from_numpy(anaphoricity_spans).type(
                    torch.cuda.FloatTensor))
            anaphoricity_feature = autograd.Variable(
                torch.from_numpy(anaphoricity_features).type(
                    torch.cuda.FloatTensor))

            gold += target.tolist()
            ana_gold += anaphoricity_target.tolist()

            output, _ = network_model.forward_all_pair(
                word_embedding_dimention, mention_index, mention_span,
                candi_index, candi_spans, pair_feature, anaphors, antecedents,
                0.0)
            predict += output.data.cpu().numpy()[0].tolist()

            ana_output, _ = network_model.forward_anaphoricity(
                word_embedding_dimention, anaphoricity_index,
                anaphoricity_span, anaphoricity_feature, 0.0)
            ana_predict += ana_output.data.cpu().numpy()[0].tolist()

        gold = numpy.array(gold, dtype=numpy.int32)
        predict = numpy.array(predict)

        best_results = {
            'thresh': 0.0,
            'accuracy': 0.0,
            'precision': 0.0,
            'recall': 0.0,
            'f1': 0.0
        }

        thresh_list = [0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6]
        for thresh in thresh_list:
            evaluation_results = get_metrics(gold, predict, thresh)
            if evaluation_results["f1"] >= best_results["f1"]:
                best_results = evaluation_results

        print "Pair accuracy: %f and Fscore: %f with thresh: %f"\
                %(best_results["accuracy"],best_results["f1"],best_results["thresh"])
        sys.stdout.flush()

        if best_results["f1"] > all_best_results["f1"]:
            all_best_results = best_results
            print >> sys.stderr, "New High Result, Save Model"
            torch.save(network_model,
                       model_save_dir + "network_model_pretrain.best")

        ana_gold = numpy.array(ana_gold, dtype=numpy.int32)
        ana_predict = numpy.array(ana_predict)
        best_results = {
            'thresh': 0.0,
            'accuracy': 0.0,
            'precision': 0.0,
            'recall': 0.0,
            'f1': 0.0
        }
        for thresh in thresh_list:
            evaluation_results = get_metrics(ana_gold, ana_predict, thresh)
            if evaluation_results["f1"] >= best_results["f1"]:
                best_results = evaluation_results
        print "Anaphoricity accuracy: %f and Fscore: %f with thresh: %f"\
                %(best_results["accuracy"],best_results["f1"],best_results["thresh"])
        sys.stdout.flush()

        if (echo + 1) % 10 == 0:
            best_network_model = torch.load(model_save_dir +
                                            "network_model_pretrain.best")
            print "DEV:"
            performance.performance(dev_docs, best_network_model)
            print "TEST:"
            performance.performance(test_docs, best_network_model)

    ## output best
    print "In sum, anaphoricity accuracy: %f and Fscore: %f with thresh: %f"\
        %(best_results["accuracy"],best_results["f1"],best_results["thresh"])
    sys.stdout.flush()
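The dev-set evaluation above sweeps a small list of probability thresholds and keeps the one with the best F1. The real get_metrics implementation is not shown in this snippet, so the following is only a minimal sketch of what such a helper could look like, assuming it binarises the scores at the given threshold and returns the same dictionary keys the loop reads:

import numpy as np

def get_metrics(gold, predict, thresh):
    # Hypothetical re-implementation: threshold the scores, then compute
    # accuracy / precision / recall / F1 against the gold labels.
    pred = (np.asarray(predict) >= thresh).astype(np.int32)
    gold = np.asarray(gold, dtype=np.int32)
    tp = np.sum((pred == 1) & (gold == 1))
    fp = np.sum((pred == 1) & (gold == 0))
    fn = np.sum((pred == 0) & (gold == 1))
    accuracy = float(np.mean(pred == gold))
    precision = tp / float(tp + fp) if tp + fp > 0 else 0.0
    recall = tp / float(tp + fn) if tp + fn > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0
    return {'thresh': thresh, 'accuracy': accuracy, 'precision': precision,
            'recall': recall, 'f1': f1}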
Beispiel #26
0
    def forward(self, data):
        slow_feats = Variable(torch.cuda.FloatTensor(data[1]))
        moderate_feats = Variable(torch.cuda.FloatTensor(data[2]))
        fast_feats = Variable(torch.cuda.FloatTensor(data[3]))
        #slow_feats = Variable(torch.FloatTensor(data[1]))
        #moderate_feats = Variable(torch.FloatTensor(data[2]))
        #fast_feats = Variable(torch.FloatTensor(data[3]))

        slow_feats = slow_feats.unsqueeze(0)  # add a batch-size dimension
        moderate_feats = moderate_feats.unsqueeze(0)
        fast_feats = fast_feats.unsqueeze(0)

        # Forward passes

        #print(slow_feats.shape,moderate_feats.shape,fast_feats.shape)
        pad_attn_slow = self._forward(slow_feats, 'slow')
        pad_attn_moderate = self._forward(moderate_feats, 'moderate')
        pad_attn_fast = self._forward(fast_feats, 'fast')

        if self.lstm_output_type == 'same':
            if self.use_second_attention:
                new_tensor = torch.cuda.FloatTensor(1, 3, self.lstm_hidden_dim)
                #first_attns = torch.cat(1,(pad_attn_slow, pad_attn_moderate,pad_attn_fast)) #concat to be 1x3xhidden_dim
                new_tensor[:, 0, :] = pad_attn_slow
                new_tensor[:, 1, :] = pad_attn_moderate
                new_tensor[:, 2, :] = pad_attn_fast
                pad_attn = self.final_attn(
                    (new_tensor, autograd.Variable(torch.cuda.LongTensor(
                        [3]))))  #length of 3 always because fast,slow,moderate
            else:
                # Concatenate slow, moderate and fast
                pad_attn = torch.cat(
                    (pad_attn_slow, pad_attn_moderate, pad_attn_fast),
                    1)  #concat to be 1x3*hidden_dim
        elif self.lstm_output_type == 'different':
            if self.use_second_attention:
                #pad all with zeros
                new_tensor = torch.cuda.FloatTensor(1, 3, self.hidden_dim_slow)
                padded_moderate = F.pad(pad_attn_moderate,
                                        pad=(0, self.hidden_dim_slow -
                                             self.hidden_dim_moderate))
                padded_fast = F.pad(pad_attn_fast,
                                    pad=(0, self.hidden_dim_slow -
                                         self.hidden_dim_fast))
                new_tensor[:, 0, :] = pad_attn_slow
                new_tensor[:, 1, :] = padded_moderate
                new_tensor[:, 2, :] = padded_fast
                pad_attn = self.final_attn(
                    (new_tensor, autograd.Variable(torch.cuda.LongTensor(
                        [3]))))  #length of 3 always because fast,slow,moderate
            else:
                # Concatenate slow, moderate and fast
                pad_attn = torch.cat(
                    (pad_attn_slow, pad_attn_moderate, pad_attn_fast),
                    1)  #concat to be 1x3*hidden_dim

        # Pass through FC layer and Softmax
        tag_space = self.hidden2tag(pad_attn)
        tag_score = F.log_softmax(tag_space, dim=1)

        # Return predictions
        return tag_score
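When the three LSTM branches use different hidden sizes, the forward pass above right-pads the smaller attention vectors with zeros so all three can be stacked along a new "branch" dimension for the second attention layer, or simply concatenated otherwise. A standalone sketch of that padding step follows; the dimension values are illustrative and not taken from the original model:

import torch
import torch.nn.functional as F

# Hypothetical branch outputs with different hidden sizes.
hidden_dim_slow, hidden_dim_moderate, hidden_dim_fast = 64, 48, 32
attn_slow = torch.randn(1, hidden_dim_slow)
attn_moderate = torch.randn(1, hidden_dim_moderate)
attn_fast = torch.randn(1, hidden_dim_fast)

# Zero-pad the last dimension of the smaller vectors up to the largest size.
padded_moderate = F.pad(attn_moderate, pad=(0, hidden_dim_slow - hidden_dim_moderate))
padded_fast = F.pad(attn_fast, pad=(0, hidden_dim_slow - hidden_dim_fast))

# Stack into (batch=1, branches=3, hidden_dim_slow) for a second attention pass,
# or concatenate the unpadded vectors along the feature dimension instead.
stacked = torch.stack((attn_slow, padded_moderate, padded_fast), dim=1)
concatenated = torch.cat((attn_slow, attn_moderate, attn_fast), dim=1)
print(stacked.shape, concatenated.shape)  # torch.Size([1, 3, 64]) torch.Size([1, 144])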
Beispiel #27
0
 def init_hidden(self, batch_size):
     #        h,c shape: [num_layers * num_directions, batch, hidden_size]
     return (autograd.Variable(torch.randn(1, batch_size, self.hidden_dim)),
             autograd.Variable(torch.randn(1, batch_size, self.hidden_dim)))
                if teacher_forcing or True:
                    enc_loss += criterion(
                        pred_c,
                        autograd.Variable(torch.LongTensor([input_c_encoded])))
                prev_c = input_c_encoded
                # prev_c_encoded = autograd.Variable(
                #     torch.from_numpy(np.array([input_c_encoded], np.int32)).long().view(1, 1)
                # )
            if n <= 4 and epoch % print_every == 0:
                if n == 0:
                    encoder_debug += 'epoch %s encoder:\n' % epoch
                encoder_debug += '    [%s] => [%s]\n' % (input_sentence_verify,
                                                         sentence)
            return state, enc_loss

        state = autograd.Variable(torch.zeros(1, 1, hidden_size))
        state, enc_loss = encode(input_encoded, state)
        loss += enc_loss

        # decode
        if False:
            prev_c_encoded = autograd.Variable(
                torch.from_numpy(np.array([encoding.start_code],
                                          np.int32)).long().view(1, 1))

            output_sentence = ''
            for t, target_c_encoded in enumerate(target_encoded[1:]):
                # this is going to correspond approximately to
                # 'teacher forcing' in the seq2seq example
                # on the pytorch website
                prev_c_embedded = embedding(prev_c_encoded)
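The init_hidden method at the top of this example returns an (h_0, c_0) pair shaped [num_layers * num_directions, batch, hidden_size], which is exactly what nn.LSTM expects as its initial state. A minimal usage sketch with plain tensors (the layer sizes below are made up for illustration; on PyTorch >= 0.4 no autograd.Variable wrapper is needed):

import torch
import torch.nn as nn

hidden_dim, batch_size, seq_len, input_dim = 16, 4, 7, 10
lstm = nn.LSTM(input_dim, hidden_dim, num_layers=1, batch_first=False)

# Same shape as in init_hidden: [num_layers * num_directions, batch, hidden_size]
h0 = torch.randn(1, batch_size, hidden_dim)
c0 = torch.randn(1, batch_size, hidden_dim)

inputs = torch.randn(seq_len, batch_size, input_dim)
output, (hn, cn) = lstm(inputs, (h0, c0))
print(output.shape)  # torch.Size([7, 4, 16])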
Beispiel #29
0
def make_var(np_array, requires_grad=False):
    tensor = torch.from_numpy(np_array.astype(np.float32))
    return autograd.Variable(tensor, requires_grad=requires_grad)
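A short usage sketch for make_var as defined above: because requires_grad is forwarded to the Variable, gradients flow back to the converted NumPy data as usual (on PyTorch >= 0.4 the returned object behaves like an ordinary tensor):

import numpy as np

# Assumes make_var from the example above is in scope.
x = make_var(np.array([[1.0, 2.0], [3.0, 4.0]]), requires_grad=True)
y = (x * x).sum()
y.backward()
print(x.grad)  # gradient of sum(x^2) is 2*x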
Beispiel #30
0
def batchify_with_label(data, input_batch_list, input_batch_list_text, gpu):

    with torch.no_grad():  # feili, compatible with 0.4
        batch_size = len(input_batch_list)
        words = [sent[0] for sent in input_batch_list]
        if input_batch_list_text is None:
            chars = [sent[1] for sent in input_batch_list]
        if data.feat_config is not None:
            if len(input_batch_list[0]) > 3:
                labels = [sent[2] for sent in input_batch_list]
                features = [np.asarray(sent[3]) for sent in input_batch_list]
                feature_num = len(features[0][0])
            else:
                labels = None
                features = [np.asarray(sent[2]) for sent in input_batch_list]
                feature_num = len(features[0][0])

        else:
            if len(input_batch_list[0]) > 2:
                labels = [sent[2] for sent in input_batch_list]
            else:
                labels = None
        word_seq_lengths = torch.LongTensor(list(map(len, words)))

        if input_batch_list_text is not None:
            if labels:
                words_text = [sent[3] for sent in input_batch_list_text]
            else:
                words_text = [sent[2] for sent in input_batch_list_text]

        max_seq_len = word_seq_lengths.max().item()
        word_seq_tensor = autograd.Variable(
            torch.zeros((batch_size, max_seq_len), dtype=torch.long))
        label_seq_tensor = autograd.Variable(
            torch.zeros((batch_size, max_seq_len), dtype=torch.long))
        if data.feat_config is not None:
            feature_seq_tensors = []
            for idx in range(feature_num):
                feature_seq_tensors.append(
                    autograd.Variable(
                        torch.zeros((batch_size, max_seq_len),
                                    dtype=torch.long)))
        if input_batch_list_text is not None:
            words_text_tensor = [['<pad>' for col in range(max_seq_len)]
                                 for row in range(batch_size)]

        mask = autograd.Variable(
            torch.zeros((batch_size, max_seq_len), dtype=torch.uint8))
        if labels:
            for idx, (seq, label,
                      seqlen) in enumerate(zip(words, labels,
                                               word_seq_lengths)):
                word_seq_tensor[idx, :seqlen] = torch.LongTensor(seq)
                label_seq_tensor[idx, :seqlen] = torch.LongTensor(label)
                mask[idx, :seqlen] = torch.Tensor([1] * seqlen.item())
                if data.feat_config is not None:
                    for idy in range(feature_num):
                        feature_seq_tensors[idy][
                            idx, :seqlen] = torch.LongTensor(
                                features[idx][:, idy])
                if input_batch_list_text is not None:
                    words_text_tensor[idx][:seqlen] = words_text[idx]

        else:
            for idx, (seq, seqlen) in enumerate(zip(words, word_seq_lengths)):
                word_seq_tensor[idx, :seqlen] = torch.LongTensor(seq)
                mask[idx, :seqlen] = torch.Tensor([1] * seqlen.item())
                if data.feat_config is not None:
                    for idy in range(feature_num):
                        feature_seq_tensors[idy][
                            idx, :seqlen] = torch.LongTensor(
                                features[idx][:, idy])
                if input_batch_list_text is not None:
                    words_text_tensor[idx][:seqlen] = words_text[idx]

        word_seq_lengths, word_perm_idx = word_seq_lengths.sort(
            0, descending=True)
        word_seq_tensor = word_seq_tensor[word_perm_idx]
        if data.feat_config is not None:
            for idx in range(feature_num):
                feature_seq_tensors[idx] = feature_seq_tensors[idx][
                    word_perm_idx]

        if labels:
            label_seq_tensor = label_seq_tensor[word_perm_idx]
        mask = mask[word_perm_idx]

        if input_batch_list_text is not None:
            words_text_tensor_1 = []
            for i in range(batch_size):
                ii = word_perm_idx[i].item()
                words_text_tensor_1.append(words_text_tensor[ii])

            char_seq_tensor = None
            char_seq_lengths = None
            char_seq_recover = None
        else:
            words_text_tensor_1 = None
            ### deal with char
            # pad_chars (batch_size, max_seq_len)
            pad_chars = [
                chars[idx] + [[0]] * (max_seq_len - len(chars[idx]))
                for idx in range(len(chars))
            ]
            length_list = [list(map(len, pad_char)) for pad_char in pad_chars]
            max_word_len = max(list(map(max, length_list)))
            char_seq_tensor = autograd.Variable(
                torch.zeros((batch_size, max_seq_len, max_word_len),
                            dtype=torch.long))
            char_seq_lengths = torch.LongTensor(length_list)
            for idx, (seq,
                      seqlen) in enumerate(zip(pad_chars, char_seq_lengths)):
                for idy, (word, wordlen) in enumerate(zip(seq, seqlen)):
                    # print len(word), wordlen
                    char_seq_tensor[idx,
                                    idy, :wordlen] = torch.LongTensor(word)

            char_seq_tensor = char_seq_tensor[word_perm_idx].view(
                batch_size * max_seq_len, -1)
            char_seq_lengths = char_seq_lengths[word_perm_idx].view(
                batch_size * max_seq_len, )
            char_seq_lengths, char_perm_idx = char_seq_lengths.sort(
                0, descending=True)
            char_seq_tensor = char_seq_tensor[char_perm_idx]
            _, char_seq_recover = char_perm_idx.sort(0, descending=False)

        _, word_seq_recover = word_perm_idx.sort(0, descending=False)
        # Note: this reads the global opt.gpu flag, while the device index used below comes from the gpu argument.
        if opt.gpu >= 0 and torch.cuda.is_available():
            word_seq_tensor = word_seq_tensor.cuda(gpu)

            word_seq_lengths = word_seq_lengths.cuda(gpu)
            word_seq_recover = word_seq_recover.cuda(gpu)
            if labels:
                label_seq_tensor = label_seq_tensor.cuda(gpu)
            if data.feat_config is not None:
                for idx in range(feature_num):
                    feature_seq_tensors[idx] = feature_seq_tensors[idx].cuda(
                        gpu)
            if input_batch_list_text is None:
                char_seq_tensor = char_seq_tensor.cuda(gpu)
                char_seq_recover = char_seq_recover.cuda(gpu)
            mask = mask.cuda(gpu)

        if labels:
            if data.feat_config is not None:
                return word_seq_tensor, word_seq_lengths, word_seq_recover, char_seq_tensor, char_seq_lengths, char_seq_recover, label_seq_tensor, mask, feature_seq_tensors, words_text_tensor_1
            else:
                return word_seq_tensor, word_seq_lengths, word_seq_recover, char_seq_tensor, char_seq_lengths, char_seq_recover, label_seq_tensor, mask, None, words_text_tensor_1
        else:
            if data.feat_config is not None:
                return word_seq_tensor, word_seq_lengths, word_seq_recover, char_seq_tensor, char_seq_lengths, char_seq_recover, None, mask, feature_seq_tensors, words_text_tensor_1
            else:
                return word_seq_tensor, word_seq_lengths, word_seq_recover, char_seq_tensor, char_seq_lengths, char_seq_recover, None, mask, None, words_text_tensor_1
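The core of batchify_with_label is the usual pad-to-max-length / sort-by-length / remember-the-recover-order pattern used before pack_padded_sequence. A stripped-down sketch of just that pattern, with a toy batch and illustrative variable names:

import torch

# Toy batch: three "sentences" of word ids with different lengths.
words = [[5, 2, 9], [7, 1], [3, 8, 4, 6]]
batch_size = len(words)
seq_lengths = torch.LongTensor([len(s) for s in words])
max_seq_len = int(seq_lengths.max())

# Zero-pad into a (batch, max_seq_len) tensor and build a mask.
word_seq_tensor = torch.zeros(batch_size, max_seq_len, dtype=torch.long)
mask = torch.zeros(batch_size, max_seq_len, dtype=torch.uint8)
for idx, (seq, seqlen) in enumerate(zip(words, seq_lengths)):
    word_seq_tensor[idx, :seqlen] = torch.LongTensor(seq)
    mask[idx, :seqlen] = 1

# Sort by length (descending) for pack_padded_sequence, and keep the
# permutation needed to restore the original order afterwards.
seq_lengths, perm_idx = seq_lengths.sort(0, descending=True)
word_seq_tensor = word_seq_tensor[perm_idx]
mask = mask[perm_idx]
_, seq_recover = perm_idx.sort(0, descending=False)

print(word_seq_tensor)
print(seq_lengths)  # tensor([4, 3, 2])
print(seq_recover)  # indices that undo the sort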