def get_pc(data, We, weight4ind, params):
    """Compute the principal component(s) of the weighted-average sentence embeddings.

    Populates word embeddings for each example, builds the task-specific data
    matrices, forms weighted-average sentence vectors, and fits a truncated SVD
    to extract ``params.npc`` principal components.

    Parameters:
        data: list of examples; each item is a pair-like object whose elements
            support ``populate_embeddings``.
        We: word-embedding matrix, shape (vocab_size, dim).
        weight4ind: per-word-index weights used by ``data_io.seq2weight``.
        params: config object; fields used here: ``task``, ``weightfile``, ``npc``.

    Returns:
        ndarray of principal components, shape (params.npc, dim)
        (``TruncatedSVD.components_``).

    NOTE(review): this function reads a module-level ``words`` vocabulary that is
    not passed as an argument — confirm it is defined at module scope.
    """

    def get_weighted_average(We, x, w):
        """Compute weighted-average word vectors, one row per sample.

        Each row is w[i]·We[x[i]] divided by the number of nonzero weights in
        w[i] (an all-zero weight row would divide by zero — assumed not to occur).
        """
        n_samples = x.shape[0]
        emb = np.zeros((n_samples, We.shape[1]))
        for i in range(n_samples):
            emb[i, :] = w[i, :].dot(We[x[i, :], :]) / np.count_nonzero(w[i, :])
        return emb

    # Attach word-id sequences to every example; sentiment is single-sentence,
    # all other tasks are sentence pairs.
    for i in data:
        i[0].populate_embeddings(words)
        if not params.task == "sentiment":
            i[1].populate_embeddings(words)

    # Build the data matrices for the configured task; only the first sentence
    # (g1x/g1mask) is used for principal-component estimation.
    if params.task == "ent":
        (scores, g1x, g1mask, g2x, g2mask) = data_io.getDataEntailment(data)
        if params.weightfile:
            g1mask = data_io.seq2weight(g1x, g1mask, weight4ind)
    elif params.task == "sim":
        (scores, g1x, g1mask, g2x, g2mask) = data_io.getDataSim(data, -1)
        if params.weightfile:
            g1mask = data_io.seq2weight(g1x, g1mask, weight4ind)
    elif params.task == "sentiment":
        (scores, g1x, g1mask) = data_io.getDataSentiment(data)
        if params.weightfile:
            g1mask = data_io.seq2weight(g1x, g1mask, weight4ind)
    else:
        # Fix: previously an unrecognized task fell through silently, leaving
        # g1x/g1mask undefined and raising a confusing NameError below.
        raise ValueError('Task should be ent, sim, or sentiment.')

    emb = get_weighted_average(We, g1x, g1mask)
    svd = TruncatedSVD(n_components=params.npc, n_iter=7, random_state=0)
    svd.fit(emb)
    return svd.components_
def _train_on_batch(model, batch, words, params):
    """Run one gradient step on a minibatch and return its cost.

    Populates word embeddings, builds the task-specific data matrices,
    applies optional per-word weighting, calls the model's train function,
    then releases per-example embeddings to save RAM.
    """
    # Load word ids; sentiment is single-sentence, others are pairs.
    for i in batch:
        i[0].populate_embeddings(words)
        if not params.task == "sentiment":
            i[1].populate_embeddings(words)

    # Build the data matrices for the configured task.
    if params.task == "ent":
        (scores, g1x, g1mask, g2x, g2mask) = data_io.getDataEntailment(batch)
    elif params.task == "sim":
        (scores, g1x, g1mask, g2x, g2mask) = data_io.getDataSim(batch, model.nout)
    elif params.task == "sentiment":
        (scores, g1x, g1mask) = data_io.getDataSentiment(batch)
    else:
        # Fix: old message said "ent or sim" but sentiment is also valid.
        raise ValueError('Task should be ent, sim, or sentiment.')

    # Train; pair tasks weight both masks, sentiment only the first.
    if not params.task == "sentiment":
        if params.weightfile:
            g1mask = data_io.seq2weight(g1x, g1mask, params.weight4ind)
            g2mask = data_io.seq2weight(g2x, g2mask, params.weight4ind)
        cost = model.train_function(scores, g1x, g2x, g1mask, g2mask)
    else:
        if params.weightfile:
            g1mask = data_io.seq2weight(g1x, g1mask, params.weight4ind)
        cost = model.train_function(scores, g1x, g1mask)

    # Warn on a diverged step; training continues (matches original behavior).
    if np.isnan(cost) or np.isinf(cost):
        print('NaN detected')

    # Undo batch to save RAM.
    for i in batch:
        i[0].representation = None
        i[0].unpopulate_embeddings()
        if not params.task == "sentiment":
            i[1].representation = None
            i[1].unpopulate_embeddings()
    return cost


def _evaluate_epoch(model, words, dev, test, train, params):
    """Evaluate the model on dev/test/train splits and print the results."""
    if params.task == "sim":
        # Similarity reports (pearson, spearman) pairs per split.
        dp, ds = eval.supervised_evaluate(model, words, dev, params)
        tp, ts = eval.supervised_evaluate(model, words, test, params)
        rp, rs = eval.supervised_evaluate(model, words, train, params)
        print(("evaluation: ", dp, ds, tp, ts, rp, rs))
    elif params.task == "ent" or params.task == "sentiment":
        ds = eval.supervised_evaluate(model, words, dev, params)
        ts = eval.supervised_evaluate(model, words, test, params)
        rs = eval.supervised_evaluate(model, words, train, params)
        print(("evaluation: ", ds, ts, rs))
    else:
        # Fix: old message said "ent or sim" but sentiment is also valid.
        raise ValueError('Task should be ent, sim, or sentiment.')


def train_util(model, train_data, dev, test, train, words, params):
    """Utility function for training the model.

    Runs ``params.epochs`` epochs of shuffled minibatch training, evaluating
    on the dev/test/train splits after every epoch. A KeyboardInterrupt stops
    training cleanly; total wall time is printed at the end.

    Parameters:
        model: object exposing ``train_function`` and ``nout``.
        train_data: list of training examples.
        dev, test, train: evaluation splits passed to ``eval.supervised_evaluate``.
        words: vocabulary used by ``populate_embeddings``.
        params: config; fields used: ``epochs``, ``batchsize``, ``task``,
            ``weightfile``, ``weight4ind``.
    """
    start_time = time()
    try:
        for eidx in range(params.epochs):
            kf = data_io.get_minibatches_idx(len(train_data), params.batchsize,
                                             shuffle=True)
            # Fix: initialize so the epoch summary below cannot raise
            # NameError when an epoch has no minibatches.
            cost = None
            for _, train_index in kf:
                batch = [train_data[t] for t in train_index]
                cost = _train_on_batch(model, batch, words, params)
            _evaluate_epoch(model, words, dev, test, train, params)
            print(('Epoch ', (eidx + 1), 'Cost ', cost))
            sys.stdout.flush()
    except KeyboardInterrupt:
        # Fix: corrected "interupted" typo in the user-facing message.
        print("Training interrupted")
    end_time = time()
    print(("total time:", (end_time - start_time)))