def get_pc(data, We, weight4ind, params):
    "Compute the principal component"

    def get_weighted_average(We, x, w):
        "Compute the weighted average vectors"
        n_samples = x.shape[0]
        emb = np.zeros((n_samples, We.shape[1]))
        for i in range(n_samples):
            emb[i, :] = w[i, :].dot(We[x[i, :], :]) / np.count_nonzero(w[i, :])
        return emb

    # `words` (the word-to-index mapping) is expected to be defined in the enclosing scope
    for i in data:
        i[0].populate_embeddings(words)
        if not params.task == "sentiment":
            i[1].populate_embeddings(words)

    if params.task == "ent":
        (scores, g1x, g1mask, g2x, g2mask) = data_io.getDataEntailment(data)
        if params.weightfile:
            g1mask = data_io.seq2weight(g1x, g1mask, weight4ind)
    elif params.task == "sim":
        (scores, g1x, g1mask, g2x, g2mask) = data_io.getDataSim(data, -1)
        if params.weightfile:
            g1mask = data_io.seq2weight(g1x, g1mask, weight4ind)
    elif params.task == "sentiment":
        (scores, g1x, g1mask) = data_io.getDataSentiment(data)
        if params.weightfile:
            g1mask = data_io.seq2weight(g1x, g1mask, weight4ind)

    emb = get_weighted_average(We, g1x, g1mask)
    svd = TruncatedSVD(n_components=params.npc, n_iter=7, random_state=0)
    svd.fit(emb)
    return svd.components_
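# Illustrative sketch (not from the original file): how the components returned by
# get_pc are typically applied. Standard SIF post-processing projects the common
# component(s) out of the weighted-average sentence embeddings. The names
# `train_data` and `emb` are placeholders for data prepared as in the code above.
pc = get_pc(train_data, We, weight4ind, params)   # shape: (params.npc, dim)
emb = emb - emb.dot(pc.transpose()).dot(pc)       # remove the projection onto the principal component(s)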
def getAccSentiment(model, words, f, params=[]):
    f = open(f, 'r')
    lines = f.readlines()
    preds = []
    golds = []
    seq1 = []
    ct = 0
    for i in lines:
        i = i.split("\t")
        p1 = i[0]
        score = i[1]
        X1 = data_io.getSeq(p1, words)
        seq1.append(X1)
        ct += 1
        if ct % 100 == 0:
            x1, m1 = data_io.prepare_data(seq1)
            if params and params.weightfile:
                m1 = data_io.seq2weight(x1, m1, params.weight4ind)
            scores = model.scoring_function(x1, m1)
            scores = np.squeeze(scores)
            preds.extend(scores.tolist())
            seq1 = []
        golds.append(score)
    if len(seq1) > 0:
        x1, m1 = data_io.prepare_data(seq1)
        if params and params.weightfile:
            m1 = data_io.seq2weight(x1, m1, params.weight4ind)
        scores = model.scoring_function(x1, m1)
        scores = np.squeeze(scores)
        preds.extend(scores.tolist())
    return accSentiment(preds, golds)
def sim_getCorrelation(We, words, f, weight4ind, scoring_function, params):
    f = open(f, 'r')
    lines = f.readlines()
    golds = []
    seq1 = []
    seq2 = []
    for i in lines:
        i = i.split("\t")
        p1 = i[0]
        p2 = i[1]
        score = float(i[2])
        X1, X2 = data_io.getSeqs(p1, p2, words)
        seq1.append(X1)
        seq2.append(X2)
        golds.append(score)
    x1, m1 = data_io.prepare_data(seq1)
    x2, m2 = data_io.prepare_data(seq2)
    m1 = data_io.seq2weight(x1, m1, weight4ind)
    m2 = data_io.seq2weight(x2, m2, weight4ind)
    scores = scoring_function(We, x1, x2, m1, m2, params)
    preds = np.squeeze(scores)
    return pearsonr(preds, golds)[0], spearmanr(preds, golds)[0]
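# Hypothetical invocation of sim_getCorrelation (illustrative only). It assumes an
# STS-style file with one pair per line ("sentence1<TAB>sentence2<TAB>gold score")
# and a scoring function with the signature used above, e.g. an unsupervised
# weighted-average SIF scorer; both the file name and the scorer name are assumptions.
pearson, spearman = sim_getCorrelation(
    We, words, 'sts-benchmark.txt', weight4ind,
    weighted_average_sim_rmpc, params)
print('Pearson: %.4f  Spearman: %.4f' % (pearson, spearman))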
def embeding_sentence_cosine_similarity(s1, s2):
    # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    word_idx_seq_of_sentence, mask = data_io.sentences2idx([s1, s2], Word2Indx)
    print(s1, s2)
    print('word_idx_seq_of_sentence')
    print(word_idx_seq_of_sentence)
    print('mask')
    print(mask)
    # get word weights
    word_weight_of_sentence = data_io.seq2weight(word_idx_seq_of_sentence, mask, Index2Weight)
    # set parameters
    param = params.params()
    param.rmpc = rmpc
    embedding = SIF_embedding.SIF_embedding(Word2vector, word_idx_seq_of_sentence, word_weight_of_sentence, param)
    s1_embed = embedding[0]
    s2_embed = embedding[1]
    # note: scipy's distance.cosine returns the cosine *distance* (1 - cosine similarity)
    return distance.cosine(s1_embed, s2_embed)
def cosine_distance_by_sentence_vector(s1, s2):
    # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    word_idx_seq_of_sentence, mask = data_io.sentences2idx(
        [' '.join(s1), ' '.join(s2)], Word2Indx)
    # get word weights
    word_weight_of_sentence = data_io.seq2weight(
        word_idx_seq_of_sentence, mask, Index2Weight)
    # set parameters
    param = params.params()
    param.rmpc = rmpc
    embedding = SIF_embedding.SIF_embedding(Word2vector, word_idx_seq_of_sentence, word_weight_of_sentence, param)
    s1_embed = embedding[0]
    s2_embed = embedding[1]
    # sklearn's cosine_similarity expects 2D inputs, so reshape the 1D sentence vectors;
    # despite its name, this function returns the cosine similarity, not a distance
    return cosine_similarity(s1_embed.reshape(1, -1), s2_embed.reshape(1, -1))[0][0]
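# Illustrative comparison of the two helpers above (not in the original code). It
# assumes the module-level globals they rely on (Word2Indx, Index2Weight,
# Word2vector, rmpc) have already been loaded, e.g. as in the setup script below.
# The example sentences are arbitrary.
s1 = 'a man is playing a guitar'
s2 = 'someone is playing an instrument'
print(embeding_sentence_cosine_similarity(s1, s2))                  # cosine distance: smaller means more similar
print(cosine_distance_by_sentence_vector(s1.split(), s2.split()))   # cosine similarity: larger means more similar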
def train_util(model, train_data, dev, test, train, words, params):
    "utility function for training the model"
    start_time = time()
    try:
        for eidx in range(params.epochs):
            kf = data_io.get_minibatches_idx(len(train_data), params.batchsize, shuffle=True)
            uidx = 0
            for _, train_index in kf:
                uidx += 1
                batch = [train_data[t] for t in train_index]
                # load the word ids
                for i in batch:
                    i[0].populate_embeddings(words)
                    if not params.task == "sentiment":
                        i[1].populate_embeddings(words)
                # load the data
                if params.task == "ent":
                    (scores, g1x, g1mask, g2x, g2mask) = data_io.getDataEntailment(batch)
                elif params.task == "sim":
                    (scores, g1x, g1mask, g2x, g2mask) = data_io.getDataSim(batch, model.nout)
                elif params.task == "sentiment":
                    (scores, g1x, g1mask) = data_io.getDataSentiment(batch)
                else:
                    raise ValueError('Task should be ent, sim, or sentiment.')
                # train
                if not params.task == "sentiment":
                    if params.weightfile:
                        g1mask = data_io.seq2weight(g1x, g1mask, params.weight4ind)
                        g2mask = data_io.seq2weight(g2x, g2mask, params.weight4ind)
                    cost = model.train_function(scores, g1x, g2x, g1mask, g2mask)
                else:
                    if params.weightfile:
                        g1mask = data_io.seq2weight(g1x, g1mask, params.weight4ind)
                    cost = model.train_function(scores, g1x, g1mask)
                if np.isnan(cost) or np.isinf(cost):
                    print('NaN detected')
                # undo batch to save RAM
                for i in batch:
                    i[0].representation = None
                    i[0].unpopulate_embeddings()
                    if not params.task == "sentiment":
                        i[1].representation = None
                        i[1].unpopulate_embeddings()
            # evaluate once per epoch
            if params.task == "sim":
                dp, ds = eval.supervised_evaluate(model, words, dev, params)
                tp, ts = eval.supervised_evaluate(model, words, test, params)
                rp, rs = eval.supervised_evaluate(model, words, train, params)
                print("evaluation: ", dp, ds, tp, ts, rp, rs)
            elif params.task == "ent" or params.task == "sentiment":
                ds = eval.supervised_evaluate(model, words, dev, params)
                ts = eval.supervised_evaluate(model, words, test, params)
                rs = eval.supervised_evaluate(model, words, train, params)
                print("evaluation: ", ds, ts, rs)
            else:
                raise ValueError('Task should be ent, sim, or sentiment.')
            print('Epoch ', (eidx + 1), 'Cost ', cost)
            sys.stdout.flush()
    except KeyboardInterrupt:
        print("Training interrupted")
    end_time = time()
    print("total time:", (end_time - start_time))
import data_io, params, SIF_embedding

# input
wordfile = '../data/glove.840B.300d.txt'  # word vector file, can be downloaded from GloVe website
weightfile = '../auxiliary_data/enwiki_vocab_min200.txt'  # each line is a word and its frequency
weightpara = 1e-3  # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
rmpc = 1  # number of principal components to remove in SIF weighting scheme
sentences = [
    'this is an example sentence',
    'this is another sentence that is slightly longer'
]

# load word vectors
(words, We) = data_io.getWordmap(wordfile)
# load word weights
word2weight = data_io.getWordWeight(weightfile, weightpara)  # word2weight['str'] is the weight for the word 'str'
weight4ind = data_io.getWeight(words, word2weight)  # weight4ind[i] is the weight for the i-th word
# load sentences
x, m = data_io.sentences2idx(sentences, words)  # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
w = data_io.seq2weight(x, m, weight4ind)  # get word weights

# set parameters
params = params.params()
params.rmpc = rmpc
# get SIF embedding
embedding = SIF_embedding.SIF_embedding(We, x, w, params)  # embedding[i,:] is the embedding for sentence i
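# Illustrative follow-up (not part of the original example): compare the two example
# sentences using the SIF embeddings computed above. Only scipy is assumed in
# addition to the script's own dependencies.
from scipy.spatial import distance

sim = 1 - distance.cosine(embedding[0, :], embedding[1, :])
print('cosine similarity between the two example sentences: %.4f' % sim)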