def get_pc(data, We, weight4ind, params):
    "Compute the principal components of the weighted sentence embeddings"

    def get_weighted_average(We, x, w):
        "Compute the weighted average vectors"
        n_samples = x.shape[0]
        emb = np.zeros((n_samples, We.shape[1]))
        for i in range(n_samples):
            emb[i, :] = w[i, :].dot(We[x[i, :], :]) / np.count_nonzero(w[i, :])
        return emb

    for i in data:
        i[0].populate_embeddings(words)  # `words` is assumed to be available in the enclosing scope
        if not params.task == "sentiment":
            i[1].populate_embeddings(words)

    if params.task == "ent":
        (scores, g1x, g1mask, g2x, g2mask) = data_io.getDataEntailment(data)
        if params.weightfile:
            g1mask = data_io.seq2weight(g1x, g1mask, weight4ind)
    elif params.task == "sim":
        (scores, g1x, g1mask, g2x, g2mask) = data_io.getDataSim(data, -1)
        if params.weightfile:
            g1mask = data_io.seq2weight(g1x, g1mask, weight4ind)
    elif params.task == "sentiment":
        (scores, g1x, g1mask) = data_io.getDataSentiment(data)
        if params.weightfile:
            g1mask = data_io.seq2weight(g1x, g1mask, weight4ind)

    emb = get_weighted_average(We, g1x, g1mask)
    svd = TruncatedSVD(n_components=params.npc, n_iter=7, random_state=0)
    svd.fit(emb)
    return svd.components_
def getAccSentiment(model, words, f, params=[]):
    f = open(f, 'r')
    lines = f.readlines()
    preds = []
    golds = []
    seq1 = []
    ct = 0
    for i in lines:
        i = i.split("\t")
        p1 = i[0]
        score = i[1]
        X1 = data_io.getSeq(p1, words)
        seq1.append(X1)
        ct += 1
        if ct % 100 == 0:
            x1, m1 = data_io.prepare_data(seq1)
            if params and params.weightfile:
                m1 = data_io.seq2weight(x1, m1, params.weight4ind)
            scores = model.scoring_function(x1, m1)
            scores = np.squeeze(scores)
            preds.extend(scores.tolist())
            seq1 = []
        golds.append(score)
    if len(seq1) > 0:
        x1, m1 = data_io.prepare_data(seq1)
        if params and params.weightfile:
            m1 = data_io.seq2weight(x1, m1, params.weight4ind)
        scores = model.scoring_function(x1, m1)
        scores = np.squeeze(scores)
        preds.extend(scores.tolist())
    return accSentiment(preds, golds)
def sim_getCorrelation(We, words, f, weight4ind, scoring_function, params):
    f = open(f, 'r')
    lines = f.readlines()
    golds = []
    seq1 = []
    seq2 = []
    for i in lines:
        i = i.split("\t")
        p1 = i[0]
        p2 = i[1]
        score = float(i[2])
        X1, X2 = data_io.getSeqs(p1, p2, words)
        seq1.append(X1)
        seq2.append(X2)
        golds.append(score)
    x1, m1 = data_io.prepare_data(seq1)
    x2, m2 = data_io.prepare_data(seq2)
    m1 = data_io.seq2weight(x1, m1, weight4ind)
    m2 = data_io.seq2weight(x2, m2, weight4ind)
    scores = scoring_function(We, x1, x2, m1, m2, params)
    print(seq1[0])
    print(seq2[0])
    print(scores[0])
    preds = np.squeeze(scores)
    return pearsonr(preds, golds)[0], spearmanr(preds, golds)[0]
def return_sif(sentences, words, weight4ind, param, Weights):
    # x is the array of word indices, m is the binary mask indicating
    # whether there is a word in that location
    x, m = data_io.sentences2idx(sentences, words)
    w = data_io.seq2weight(x, m, weight4ind)  # get word weights
    # get SIF embedding; embeddings[i, :] is the embedding for sentence i
    embeddings = SIF_embedding.SIF_embedding(Weights, x, w, param)
    return embeddings
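# A minimal usage sketch for return_sif (not part of the original code), assuming the
# data_io / params modules used throughout this file are importable. The file paths
# below are placeholders taken from other snippets in this file, not verified inputs.
#
# wordfile = 'glove.6B.100d.txt'          # assumed word vector file
# weightfile = 'enwiki_vocab_min200.txt'  # assumed word frequency file
# (words, We) = data_io.getWordmap(wordfile)
# word2weight = data_io.getWordWeight(weightfile, 1e-3)
# weight4ind = data_io.getWeight(words, word2weight)
# param = params.params()
# param.rmpc = 1  # remove the first principal component
# emb = return_sif(["this is an example sentence"], words, weight4ind, param, We)
# print(emb.shape)  # (number of sentences, embedding dimension)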
def fit(self, sentences, We, lowercase_tokens, embeddings_format, embeddings_filepath, params, word_map, weight4ind):
    # store these off for pickling or extra transforms
    self.word_map = word_map
    self.weight4ind = weight4ind
    self.params = params
    self.lowercase_tokens = lowercase_tokens
    self.embeddings_format = embeddings_format
    self.embeddings_filepath = embeddings_filepath
    self.sentence_count = len(sentences)

    # x is the array of word indices, m is the binary mask indicating
    # whether there is a word in that location
    x, m = data_io.sentences2idx(sentences, self.word_map)
    w = data_io.seq2weight(x, m, self.weight4ind)  # get word weights

    # now let's do some of what happens in src/SIF_embedding.py,
    # but also keep some pieces along the way
    # weighted_emb = get_weighted_average(We, x, w)
    weighted_emb = get_weighted_average_alternate(We, x, w)
    self.compute_pc(weighted_emb)
    self.trained = True
    return self.remove_pc(weighted_emb)
def main(sentences, wordfile: str, weightfile: str, weightpara: float = 1e-3, rmpc: int = 1):
    # load word vectors
    (words, We) = data_io.getWordmap(wordfile)
    # load word weights
    word2weight = data_io.getWordWeight(weightfile, weightpara)  # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(words, word2weight)  # weight4ind[i] is the weight for the i-th word
    # load sentences
    # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    x, m, _ = data_io.sentences2idx(sentences, words)
    w = data_io.seq2weight(x, m, weight4ind)  # get word weights
    # set parameters (use a new local name so the params module is not shadowed)
    sif_params = params.params()
    sif_params.rmpc = rmpc
    # get SIF embedding; embedding[i, :] is the embedding for sentence i
    embedding = SIF_embedding.SIF_embedding(We, x, w, sif_params)
def getSIFscore(sentences: list, words, weight4ind, rmpc, We, params, sx: int, sy: int):
    # load sentences
    # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    x, m = data_io.sentences2idx(sentences, words)
    w = data_io.seq2weight(x, m, weight4ind)  # get word weights
    # print('load sentences finished')

    # set parameters (the `params` argument is expected to be the params module)
    params = params.params()
    params.rmpc = rmpc

    # get SIF embedding; embedding[i, :] is the embedding for sentence i
    embedding = SIF_embedding.SIF_embedding(We, x, w, params)
    embeddingSize = len(embedding)
    # print('embeddingSize= ', embeddingSize)

    emb = list()
    for i in range(embeddingSize):
        emb.append(embedding[i, :])

    # cosine similarity between sentences sx and sy
    emb1 = emb[sx]
    emb2 = emb[sy]
    inn = (emb1 * emb2).sum()
    emb1norm = numpy.sqrt((emb1 * emb1).sum())
    emb2norm = numpy.sqrt((emb2 * emb2).sum())
    score = inn / emb1norm / emb2norm
    # print(sentences[sx], '--------', sentences[sy], ' = ', score, '\n')
    return score
def compute_sif_emb(self, sentences):
    # load sentences
    x1, m = data_io.sentences2idx(sentences, self.words)
    w1 = data_io.seq2weight(x1, m, self.weight4ind)
    result = get_emb(self.We, x1, w1)
    return result
def get_embedding(sentence, words, weight4ind, params, We):
    # load sentences
    xx, mm = data_io.sentences2idx(sentence, words)
    ww = data_io.seq2weight(xx, mm, weight4ind)  # get word weights
    # get SIF embedding; em[i, :] is the embedding for sentence i
    em = SIF_embedding.SIF_embedding(We, xx, ww, params)
    return em
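# A hedged sketch (not in the original code) of how embeddings returned by
# get_embedding could be compared, mirroring the cosine-similarity arithmetic
# used in getSIFscore above; the variable names are illustrative.
import numpy as np

def cosine_similarity(emb1, emb2):
    # emb1, emb2: 1-D numpy vectors for two sentences
    inn = (emb1 * emb2).sum()
    return inn / (np.sqrt((emb1 * emb1).sum()) * np.sqrt((emb2 * emb2).sum()))

# usage (illustrative):
# em = get_embedding(["first sentence", "second sentence"], words, weight4ind, params, We)
# print(cosine_similarity(em[0, :], em[1, :]))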
def sim_badSents(We, words, weight4ind, scoring_function, params, fpc, sent1, sent2):
    seq1 = []
    seq2 = []
    X1, X2 = data_io.getSeqs(sent1, sent2, words)
    seq1.append(X1)
    seq2.append(X2)
    x1, m1 = data_io.prepare_data(seq1)
    x2, m2 = data_io.prepare_data(seq2)
    m1 = data_io.seq2weight(x1, m1, weight4ind)
    m2 = data_io.seq2weight(x2, m2, weight4ind)
    scores = scoring_function(We, x1, x2, m1, m2, params, fpc)
    preds = np.squeeze(scores)
    preds = preds * 2 + 3  # presumably rescales the score onto a wider (e.g. 1-5) similarity range
    return preds
def get_embeddings(words, We, word2weight, weight4ind, filename, params):
    # load sentences
    x, m, _ = data_io.sentiment2idx(filename, words)  # x is the array of word indices, m is a mask
    w = data_io.seq2weight(x, m, weight4ind)  # get word weights
    # get SIF embedding; embedding[i, :] is the embedding for sentence i
    embedding = SIF_embedding.SIF_embedding(We, x, w, params)
    return embedding
def sim_getCorrelation(We, words, f, weight4ind, scoring_function, params, fpc, test_name):
    f = open(f, 'r')
    lines = f.readlines()
    golds = []
    seq1 = []
    seq2 = []
    index = []
    idx = 0
    for i in lines:
        i = i.split("\t")
        p1 = i[0]
        p2 = i[1]
        score = float(i[2])
        X1, X2 = data_io.getSeqs(p1, p2, words)
        seq1.append(X1)
        seq2.append(X2)
        golds.append(score)
        index.append(idx)
        idx += 1
    x1, m1 = data_io.prepare_data(seq1)
    x2, m2 = data_io.prepare_data(seq2)
    m1 = data_io.seq2weight(x1, m1, weight4ind)
    m2 = data_io.seq2weight(x2, m2, weight4ind)
    golds = np.asarray(golds)
    scores = scoring_function(We, x1, x2, m1, m2, params, fpc)
    # scores = scoring_function(We, x1, x2, m1, m2, golds, params, fpc)
    # preds = np.squeeze(scores).reshape(-1, 1)
    preds = np.squeeze(scores)
    # print('the prediction list is {}'.format(preds))
    # add SVM predictor
    # clf = pickle.load(open('../score_predictor/model_svm', 'rb'))
    # clf.fit(preds, golds)
    # preds = clf.predict(preds)
    print(preds)
    # np.save(open("../pred_list", 'wb'), preds)
    # np.save(open("../gold_list", 'wb'), golds)
    # show_result_image(preds, golds, index, fpc, test_name)
    # find_bad_scores(preds.tolist(), lower_threshold=2.5, higher_threshold=3.8)
    rmse = sqrt(mean_squared_error(golds, preds))  # root-mean-squared error
    return pearsonr(preds, golds)[0], rmse
def get_embs(sentences, params):
    # load word vectors (wordfile, weightfile and weightpara are assumed to be defined globally)
    (words, We) = data_io.getWordmap(wordfile)
    # load word weights
    word2weight = data_io.getWordWeight(weightfile, weightpara)  # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(words, word2weight)  # weight4ind[i] is the weight for the i-th word
    # load sentences
    # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    x, m = data_io.sentences2idx(sentences, words)
    w = data_io.seq2weight(x, m, weight4ind)  # get word weights
    # get SIF embedding; embedding[i, :] is the embedding for sentence i
    embedding = SIF_embedding.SIF_embedding(We, x, w, params)
    return embedding
def prepare_first_pc(We, words, weight4ind, generation_function, params, fpc):
    print("reading file: {}.".format(fpc))
    # pre_calculate_first_pc(We, words, fpc, weight4ind, generation_function, params)
    file_name = fpc
    f = os.path.join("../data/", fpc)
    f = open(f, 'r')
    seq = []
    for i in f.readlines():
        X = data_io.getSeq(i, words)
        seq.append(X)
    x, m = data_io.prepare_data(seq)
    m = data_io.seq2weight(x, m, weight4ind)
    generation_function(We, x, m, params, file_name)
def getCorrelation(model, words, f, params=[]):
    f = open(f, 'r')
    lines = f.readlines()
    preds = []
    golds = []
    seq1 = []
    seq2 = []
    for i in lines:
        i = i.split("\t")
        p1 = i[0]
        p2 = i[1]
        score = float(i[2])
        X1, X2 = data_io.getSeqs(p1, p2, words)
        seq1.append(X1)
        seq2.append(X2)
        golds.append(score)
    x1, m1 = data_io.prepare_data(seq1)
    x2, m2 = data_io.prepare_data(seq2)
    if params and params.weightfile:
        m1 = data_io.seq2weight(x1, m1, params.weight4ind)
        m2 = data_io.seq2weight(x2, m2, params.weight4ind)
    scores = model.scoring_function(x1, x2, m1, m2)
    preds = np.squeeze(scores)
    return pearsonr(preds, golds)[0], spearmanr(preds, golds)[0]
def sim_getCorrelation1(We, words, file_index, weight4ind, scoring_function, params):
    f = open(file_index[0], 'r')
    line = f.readlines()
    lines = [lin for lin in line]
    f = open(file_index[1], 'r')
    score_line = f.readlines()
    score_lines = [score for score in score_line]
    golds = []
    seq1 = []
    seq2 = []
    for index in range(len(lines)):
        i = lines[index]
        j = score_lines[index]
        i = i.split("\t")
        # print(i)
        p1 = i[0].lower()
        p2 = i[1].lower()
        try:
            score = float(j)
            X1, X2 = data_io.getSeqs(p1, p2, words)
            seq1.append(X1)
            seq2.append(X2)
            golds.append(score)
        except Exception:
            # skip lines whose score cannot be parsed
            pass
    x1, m1 = data_io.prepare_data(seq1)
    x2, m2 = data_io.prepare_data(seq2)
    m1 = data_io.seq2weight(x1, m1, weight4ind)
    m2 = data_io.seq2weight(x2, m2, weight4ind)
    # print(x1.shape, x2.shape, m1.shape, m2.shape)
    scores = scoring_function(We, x1, x2, m1, m2, params)
    preds = np.squeeze(scores)
    return pearsonr(preds, golds)[0]
def generate_vecs(models, document):
    words, weight4ind, rmpc, We = models
    # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    x, m = data_io.sentences2idx(document, words)
    w = data_io.seq2weight(x, m, weight4ind)  # get word weights
    # set parameters
    param = params.params()
    param.rmpc = rmpc
    # get SIF embedding; embedding[i, :] is the embedding for sentence i
    embedding = SIF_embedding.SIF_embedding(We, x, w, param)
    return embedding
def SIFSentEmbedding(weighttxt, docfile, words, We, weight4ind, weightpara=1e-3, paramm=1):
    # weightpara: the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
    # paramm: number of principal components to remove in the SIF weighting scheme
    sentences = sent_tokenize(docfile)
    x, m = data_io.sentences2idx(sentences, words)
    w = data_io.seq2weight(x, m, weight4ind)  # get word weights
    # the original built a params object and then overwrote it with its LC attribute;
    # presumably the intent was to use paramm as the number of components to remove
    sif_params = params.params()
    sif_params.rmpc = paramm
    # get SIF embedding; embedding[i, :] is the embedding for sentence i
    embedding = SIF_embedding.SIF_embedding(We, x, w, sif_params)
    return embedding
def sif_embedding(sen):
    import sys
    # sys.path.append("../src")
    # sys.path.append("../data")
    import data_io
    import params
    import SIF_embedding

    # input
    wordfile = 'data/dic_files.txt'  # word vector file, can be downloaded from the GloVe website
    weightfile = 'data/dic_freq.txt'  # each line is a word and its frequency
    weightpara = 1e-3  # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
    rmpc = 1  # number of principal components to remove in SIF weighting scheme
    # sentences = ['这是一个例句', '这是一个更长一些的例句']
    # sentences = ['昨天天气不错', '这是一个更长一些的例句']
    sentences = sen
    # sentences = ['this is an example sentence', 'this is another sentence that is slightly longer']

    # load word vectors
    (words, We) = data_io.getWordmap(wordfile)  # words and their word vectors
    # load word weights
    word2weight = data_io.getWordWeight(weightfile, weightpara)  # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(words, word2weight)  # weight4ind[i] is the weight for the i-th word

    # load sentences
    # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    # x, m, _ = data_io.sentences2idx(sentences, words)
    x, m = data_io.sentences2idx(sentences, words)
    # print(x, m)
    w = data_io.seq2weight(x, m, weight4ind)  # get word weights
    # print('word weight:', w)

    # set parameters (a new local name avoids shadowing the params module)
    sif_params = params.params_all()
    sif_params.rmpc = rmpc

    # get SIF embedding; embedding[i, :] is the embedding for sentence i
    embedding = SIF_embedding.SIF_embedding(We, x, w, sif_params)
    return embedding
def vectorize_sif(filename):
    class params(object):
        def __init__(self):
            self.LW = 1e-5
            self.LC = 1e-5
            self.eta = 0.05

        def __str__(self):
            t = "LW", self.LW, ", LC", self.LC, ", eta", self.eta
            t = map(str, t)
            return ' '.join(t)

    # input
    wordfile = 'glove.6B.100d.txt'  # word vector file, can be downloaded from the GloVe website
    weightfile = 'enwiki_vocab_min200.txt'  # each line is a word and its frequency
    weightpara = 1e-3  # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
    rmpc = 1  # number of principal components to remove in SIF weighting scheme
    # sentiment_file = '../data/sentiment-test'  # sentiment data file
    # cleanfile = "2/D1026-A.M.100.E.10.segs.cl"
    # sentiment_file = '../data/clean-5.txt'

    # load word vectors
    (words, We) = data_io.getWordmap(wordfile)
    # load word weights
    word2weight = data_io.getWordWeight(weightfile, weightpara)  # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(words, word2weight)  # weight4ind[i] is the weight for the i-th word

    # load sentences (here use sentiment data as an example)
    # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    # x, m, _ = data_io.sentiment2idx(sentiment_file, words)
    x, m = data_io.sentiment2idx(filename, words)
    w = data_io.seq2weight(x, m, weight4ind)  # get word weights

    # parameters
    params = params()
    # params = params.params()
    params.rmpc = rmpc

    # get SIF embedding; embedding[i, :] is the embedding for sentence i
    embedding = SIF_embedding_lib.SIF_embedding(We, x, w, params)
    return embedding
def sentences2embeddings(sentences):
    """
    Input: sentences - a list of sentences
    Output: sentence_embeddings - a list of sentence embeddings (numpy vectors of shape (1, 300))
    """
    # load sentences
    # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    x, m = data_io.sentences2idx(sentences, words)
    w = data_io.seq2weight(x, m, weight4ind)  # get word weights
    # set parameters
    parameters = params.params()
    parameters.rmpc = rmpc
    # get SIF embedding; embedding[i, :] is the embedding for sentence i
    embedding = SIF_embedding.SIF_embedding(We, x, w, parameters)
    sentence_embeddings = []
    for i in range(len(sentences)):
        e = np.array(embedding[i, :]).reshape((1, 300))  # reshape to fit into the function semantic_similarity
        sentence_embeddings.append(e)
    return sentence_embeddings
def get_sent_vec(sentences):
    import params
    # see data_io.py for details; wordfile, weightfile, weightpara and rmpc are assumed to be defined globally
    (words, We) = data_io.getWordmap(wordfile)
    # see data_io.py for details
    word2weight = data_io.getWordWeight(weightfile, weightpara)
    weight4ind = data_io.getWeight(words, word2weight)
    # see data_io.py for details
    x, m = data_io.sentences2idx(sentences, words)
    w = data_io.seq2weight(x, m, weight4ind)
    # set parameters
    params = params.params()
    params.rmpc = rmpc
    # compute sentence vectors with the core SIF algorithm; see SIF_core for details
    embedding = SIF_core.SIF_embedding(We, x, w, params)
    get_sent_vec = {}
    for i in range(len(embedding)):
        get_sent_vec[sentences[i]] = embedding[i]
    return get_sent_vec
def get_sent_vec(sentences):
    '''
    Compute sentence vectors with the SIF algorithm.
    :param sentences: a list of sentences (each sentence is a string; no jieba.cut tokenization is required)
    :return: a dict whose keys are the sentence strings and whose values are the sentence vectors
    '''
    import params
    # see data_io.py for details; words, weight4ind and We are assumed to be defined globally
    x, m = data_io.sentences2idx(sentences, words)
    w = data_io.seq2weight(x, m, weight4ind)
    # set parameters
    rmpc = 1  # number of principal components to remove in SIF weighting scheme
    params = params.params()
    params.rmpc = rmpc
    # compute sentence vectors with the core SIF algorithm; see SIF_core for details
    embedding = SIF_core.SIF_embedding(We, x, w, params)
    get_sent_vec = {}
    for i in range(len(embedding)):
        get_sent_vec[sentences[i]] = embedding[i]
    return get_sent_vec
def sentences2vecs(sentences, We, words, weight4ind):
    x, m = sentences2idx(sentences, words)
    w = data_io.seq2weight(x, m, weight4ind)
    # weighted average of word vectors, without principal component removal
    return SIF_embedding.get_weighted_average(We, x, w)
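# A hedged sketch (not in the original code) of removing the first principal
# component from the weighted averages returned by sentences2vecs, mirroring the
# TruncatedSVD computation in get_pc above; the names below are illustrative.
import numpy as np
from sklearn.decomposition import TruncatedSVD

def remove_first_pc(emb):
    # emb: (n_sentences, dim) matrix of weighted-average sentence vectors
    svd = TruncatedSVD(n_components=1, n_iter=7, random_state=0)
    svd.fit(emb)
    pc = svd.components_  # shape (1, dim)
    # subtract each vector's projection onto the first principal component
    return emb - emb.dot(pc.T).dot(pc)

# usage (illustrative):
# emb = sentences2vecs(sentences, We, words, weight4ind)
# sif_emb = remove_first_pc(emb)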
def top_filtering(logits, words, weight4ind, We, tokenizer, history, args, params, embedding1,
                  top_k=0, top_p=0.0, threshold=-float('Inf'), filter_value=-float('Inf'), current_output=None):
    """ Filter a distribution of logits using top-k, top-p (nucleus) and/or threshold filtering
        Args:
            logits: logits distribution shape (vocabulary size)
            top_k: <=0: no filtering, >0: keep only top k tokens with highest probability.
            top_p: <=0.0: no filtering, >0.0: keep only a subset S of candidates, where S is the smallest subset
                whose total probability mass is greater than or equal to the threshold top_p.
                In practice, we select the highest probability tokens whose cumulative probability mass exceeds
                the threshold top_p.
            threshold: a minimal threshold to keep logits
    """
    if current_output is None:
        current_output = []
    assert logits.dim() == 1  # Only work for batch size 1 for now - could update but it would obfuscate a bit the code
    top_k = min(top_k, logits.size(-1))
    if top_k > 0:
        # Remove all tokens with a probability less than the last token in the top-k tokens
        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
        logits[indices_to_remove] = filter_value
    if top_p > 0.0:
        # Compute cumulative probabilities of sorted tokens
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probabilities = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
        # Remove tokens with cumulative probability above the threshold
        sorted_indices_to_remove = cumulative_probabilities > top_p
        # Shift the indices to the right to keep also the first token above the threshold
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0
        # Back to unsorted indices and set them to -infinity
        indices_to_remove = sorted_indices[sorted_indices_to_remove]
        indices_to_use = sorted_indices[~sorted_indices_to_remove]
        logits[indices_to_remove] = filter_value
        if len(indices_to_use) > 1:
            # boost each surviving candidate token by the SIF cosine similarity between
            # the candidate continuation and embedding1 (the embedding of the last utterance)
            cands = [current_output + [idx] for idx in indices_to_use.tolist()]
            raw_cands = [tokenizer.decode(cand, skip_special_tokens=True) for cand in cands]
            scores = []
            for i in raw_cands:
                sentences = [i]
                x, m = data_io.sentences2idx(sentences, words)
                w = data_io.seq2weight(x, m, weight4ind)
                embedding2 = SIF_embedding.SIF_embedding(We, x, w, params)
                inn = (embedding1 * embedding2).sum(axis=1)
                emb1norm = np.sqrt((embedding1 * embedding1).sum(axis=1))
                emb2norm = np.sqrt((embedding2 * embedding2).sum(axis=1))
                scores.append(inn / emb1norm / emb2norm)
                # print(sentences)
            for idx, sim in zip(indices_to_use, scores):
                logits[idx] += sim.item()
    # the following block is dead code kept as a string literal in the original
    """
    probs = F.softmax(logits, dim=-1)
    index = []
    for i in probs:
        if i > 0:
            index.append(i)
    prev = torch.topk(probs, 1)[1] if args.no_sample else torch.multinomial(probs, len(index))
    text = []
    last_utt = history[-1]
    last = tokenizer.decode(last_utt, skip_special_tokens=True)
    sentences = [last]
    # load sentences
    x, m = data_io.sentences2idx(sentences, words)  # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    w = data_io.seq2weight(x, m, weight4ind)  # get word weights
    # set parameters
    global params
    params.rmpc = rmpc
    # get SIF embedding
    embedding1 = SIF_embedding.SIF_embedding(We, x, w, params)  # embedding[i,:] is the embedding for sentence i
    for i in prev:
        text.append(i.item())
    for i in text:
        cand = current_output.copy()
        cand.append(i)
        indice = i
        raw_text = tokenizer.decode(cand, skip_special_tokens=True)
        sentences = [raw_text]
        x, m = data_io.sentences2idx(sentences, words)
        w = data_io.seq2weight(x, m, weight4ind)
        embedding2 = SIF_embedding.SIF_embedding(We, x, w, params)
        inn = (embedding1 * embedding2).sum(axis=1)
        emb1norm = np.sqrt((embedding1 * embedding1).sum(axis=1))
        emb2norm = np.sqrt((embedding2 * embedding2).sum(axis=1))
        scores = inn / emb1norm / emb2norm
        # print(scores)
        logits[indice] += scores.item()
        cand.clear()
    """
    indices_to_remove = logits < threshold
    logits[indices_to_remove] = filter_value
    return logits
def sample_sequence(personality, history, tokenizer, model, args, words, weight4ind, We, current_output=None):
    special_tokens_ids = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS)
    if current_output is None:
        current_output = []
    last_utt = history[-1]
    last = tokenizer.decode(last_utt, skip_special_tokens=True)
    sentences = [last]
    # load sentences
    # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    x, m = data_io.sentences2idx(sentences, words)
    w = data_io.seq2weight(x, m, weight4ind)  # get word weights
    rmpc = 1  # number of principal components to remove in SIF weighting scheme
    # set parameters
    global params
    params.rmpc = rmpc
    # get SIF embedding; embedding1[i, :] is the embedding for sentence i
    embedding1 = SIF_embedding.SIF_embedding(We, x, w, params)
    for i in range(args.max_length):
        instance, _ = build_input_from_segments(personality, history, current_output, tokenizer, with_eos=False)
        input_ids = torch.tensor(instance["input_ids"], device=args.device).unsqueeze(0)
        token_type_ids = torch.tensor(instance["token_type_ids"], device=args.device).unsqueeze(0)
        temperature = 1.0
        top_k = 0
        top_p = 0.9
        logits = model(input_ids, token_type_ids=token_type_ids)
        if isinstance(logits, tuple):  # for gpt2 and maybe others
            logits = logits[0]
        logits = logits[0, -1, :] / args.temperature
        logits = top_filtering(logits, words, weight4ind, We, tokenizer, history, args, params, embedding1,
                               top_k=top_k, top_p=top_p, current_output=current_output)
        probs = F.softmax(logits, dim=-1)
        prev = torch.topk(probs, 1)[1] if args.no_sample else torch.multinomial(probs, 1)
        if i < args.min_length and prev.item() in special_tokens_ids:
            while prev.item() in special_tokens_ids:
                if probs.max().item() == 1:
                    warnings.warn("Warning: model generating special token with probability 1.")
                    break  # avoid infinitely looping over special token
                prev = torch.multinomial(probs, num_samples=1)
        if prev.item() in special_tokens_ids:
            break
        current_output.append(prev.item())
    return current_output
def train_util(model, train_data, dev, test, train, words, params):
    "utility function for training the model"
    start_time = time()
    try:
        for eidx in range(params.epochs):
            kf = data_io.get_minibatches_idx(len(train_data), params.batchsize, shuffle=True)
            uidx = 0
            for _, train_index in kf:
                uidx += 1
                batch = [train_data[t] for t in train_index]
                # load the word ids
                for i in batch:
                    i[0].populate_embeddings(words)
                    if not params.task == "sentiment":
                        i[1].populate_embeddings(words)
                # load the data
                if params.task == "ent":
                    (scores, g1x, g1mask, g2x, g2mask) = data_io.getDataEntailment(batch)
                elif params.task == "sim":
                    (scores, g1x, g1mask, g2x, g2mask) = data_io.getDataSim(batch, model.nout)
                elif params.task == "sentiment":
                    (scores, g1x, g1mask) = data_io.getDataSentiment(batch)
                else:
                    raise ValueError('Task should be ent or sim.')
                # train
                if not params.task == "sentiment":
                    if params.weightfile:
                        g1mask = data_io.seq2weight(g1x, g1mask, params.weight4ind)
                        g2mask = data_io.seq2weight(g2x, g2mask, params.weight4ind)
                    cost = model.train_function(scores, g1x, g2x, g1mask, g2mask)
                else:
                    if params.weightfile:
                        g1mask = data_io.seq2weight(g1x, g1mask, params.weight4ind)
                    cost = model.train_function(scores, g1x, g1mask)
                if np.isnan(cost) or np.isinf(cost):
                    print('NaN detected')
                # undo batch to save RAM
                for i in batch:
                    i[0].representation = None
                    i[0].unpopulate_embeddings()
                    if not params.task == "sentiment":
                        i[1].representation = None
                        i[1].unpopulate_embeddings()
            # evaluate
            if params.task == "sim":
                dp, ds = eval.supervised_evaluate(model, words, dev, params)
                tp, ts = eval.supervised_evaluate(model, words, test, params)
                rp, rs = eval.supervised_evaluate(model, words, train, params)
                print(("evaluation: ", dp, ds, tp, ts, rp, rs))
            elif params.task == "ent" or params.task == "sentiment":
                ds = eval.supervised_evaluate(model, words, dev, params)
                ts = eval.supervised_evaluate(model, words, test, params)
                rs = eval.supervised_evaluate(model, words, train, params)
                print(("evaluation: ", ds, ts, rs))
            else:
                raise ValueError('Task should be ent or sim.')
            print(('Epoch ', (eidx + 1), 'Cost ', cost))
            sys.stdout.flush()
    except KeyboardInterrupt:
        print("Training interrupted")
    end_time = time()
    print(("total time:", (end_time - start_time)))
# in sentence_file or '5-15' in sentence_file:
batch_num = 0
with open(sentence_file, 'r', encoding='utf-8') as fr:
    print('Processing file', sentence_file, '...')
    p = hnswlib.Index(space='cosine', dim=dimension)
    p.init_index(max_elements=num_elements, ef_construction=2000, M=80)
    p.set_ef(1000)
    # Set number of threads used during batch search/construction
    # By default using all available cores
    p.set_num_threads(30)
    for n_lines in iter(lambda: tuple(islice(fr, batch_size)), ()):
        sents = list(map(str.strip, n_lines))
        sent_id = list(map(lambda x: int(x.split('\t')[0]), sents))
        sentences = list(map(lambda x: x.split('\t')[-1], sents))
        x, m = data_io.sentences2idx(sentences, words)
        w = data_io.seq2weight(x, m, weight4ind)
        # get SIF embedding; embedding[i, :] is the embedding for sentence i
        embedding = SIF_embedding.SIF_embedding(We, x, w, params)
        embeddings.normalize(embedding, ["unit", "center"])
        p.add_items(embedding, sent_id)
        print('Finished batch', batch_num, '.', end='\r')
        batch_num += 1
    print('\nFinished loading', sentence_file, '.')
    out_file = sentence_file + '.ann'
    p.save_index(out_file)
    print('Finished saving', out_file, '.')
    del p
(glove_words, We) = data_io.getWordmap(wordfile)
print("shape of Word embedding is: " + str(We.shape))

# load word weights
word2weight = data_io.getWordWeight(weightfile, weightpara)  # word2weight['str'] is the weight for the word 'str'
weight4ind = data_io.getWeight(glove_words, word2weight)  # weight4ind[i] is the weight for the i-th word

# set parameters
params = params.params()
params.rmpc = rmpc

# load sentences
print("reading the input sentences now & converting to indices .. \n")
sample_sents = read_NMT_data.read_data(sample_ara)

# AraSIF embedding for sample sentences
print("computing AraSIF embedding now ...\n")
# x is the array of word indices, m is the binary mask indicating whether there is a word in that location
x, m = data_io.sentences2idx(sample_sents, glove_words)
w = data_io.seq2weight(x, m, weight4ind)  # get word weights
sample_embedding = SIF_embedding.SIF_embedding(We, x, w, params)  # embedding[i, :] is the embedding for sentence i
print("shape of sample sentence embedding is: " + str(sample_embedding.shape))

# serialize for future use
numpy.save('sample_sentence_embedding.npy', sample_embedding)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--dataname', default='t6', help='dataset name', choices=['t6', 't26', '2C'])
    parser.add_argument('-c', '--classifiername', default='RF', help='which classifier to use',
                        choices=['GaussianNB', 'RF', 'SVM', 'KNN'])
    args = parser.parse_args()
    data_name = args.dataname  # t6 or t26, 2C, 4C
    clf_name = args.classifiername  # classifier

    # Original SIF paper used glove.840B.300d; we use the ones that were trained on twitter.
    embed_dims = [100]  # can add 25, 50, 200 dimension if needed
    wordfile_list = ['../data/glove.twitter.27B.{}d.txt'.format(dim) for dim in embed_dims]
    # each line is a word and its frequency
    weightfile = 'SIF-master/auxiliary_data/enwiki_vocab_min200.txt'
    # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
    weightpara = 1e-3
    # number of principal components to remove in SIF weighting scheme
    rmpc = 1

    for wordfile, dim in zip(wordfile_list, embed_dims):
        # load word vectors
        (words, We) = data_io.getWordmap(wordfile)
        # load word weights
        # word2weight['str'] is the weight for the word 'str'
        word2weight = data_io.getWordWeight(weightfile, weightpara)
        # weight4ind[i] is the weight for the i-th word
        weight4ind = data_io.getWeight(words, word2weight)

        data_path = "../data/"
        if data_name == "t6":
            file_path = data_path + "CrisisLexT6_cleaned/"
            disasters = ["sandy", "queensland", "boston", "west_texas", "oklahoma", "alberta"]
            test_list = ["{}_glove_token.csv.unique.csv".format(disaster) for disaster in disasters]
            train_list = ["{}_training.csv".format(disaster) for disaster in disasters]
        if data_name == "t26":
            file_path = data_path + "CrisisLexT26_cleaned/"
            disasters = ["2012_Colorado_wildfires", "2013_Queensland_floods", "2013_Boston_bombings",
                         "2013_West_Texas_explosion", "2013_Alberta_floods", "2013_Colorado_floods",
                         "2013_NY_train_crash"]
            test_list = ["{}-tweets_labeled.csv.unique.csv".format(disaster) for disaster in disasters]
            train_list = ["{}_training.csv".format(disaster) for disaster in disasters]
        if data_name == "2C":
            file_path = data_path + "2CTweets_cleaned/"
            disasters = ["Memphis", "Seattle", "NYC", "Chicago", "SanFrancisco", "Boston",
                         "Brisbane", "Dublin", "London", "Sydney"]
            test_list = ["{}2C.csv.token.csv.unique.csv".format(disaster) for disaster in disasters]
            train_list = ["{}2C_training.csv".format(disaster) for disaster in disasters]

        accu_list = []
        roc_list = []
        precision_list = []
        recall_list = []
        f1_list = []
        for train, test in zip(train_list, test_list):
            train_file = os.path.join(file_path, train)
            test_file = os.path.join(file_path, test)
            xtrain, ytrain = load_data(data_name, train_file)
            xtest, ytest = load_data(data_name, test_file)

            # load train
            # xtrain_windx is the array of word indices, m_train is the binary mask indicating whether there is a word in that location
            xtrain_windx, m_train = data_io.sentences2idx(xtrain, words)
            w_train = data_io.seq2weight(xtrain_windx, m_train, weight4ind)  # get word weights
            # set parameters
            paramss = params.params()
            paramss.rmpc = rmpc
            # get SIF embedding; train_embed[i, :] is the embedding for sentence i
            train_embed = SIF_embedding.SIF_embedding(We, xtrain_windx, w_train, paramss)

            # load target
            # xtest_windx is the array of word indices, m_test is the binary mask indicating whether there is a word in that location
            xtest_windx, m_test = data_io.sentences2idx(xtest, words)
            # get word weights
            w_test = data_io.seq2weight(xtest_windx, m_test, weight4ind)
            # set parameters
            paramsss = params.params()
            paramsss.rmpc = rmpc
            # get SIF embedding; test_embed[i, :] is the embedding for sentence i
            test_embed = SIF_embedding.SIF_embedding(We, xtest_windx, w_test, paramsss)

            print(test)
            accu, roc, precision, recall, f1 = run_classifier(train_embed, ytrain, test_embed, ytest, clf_name, 100)
            accu_list.append(accu)
            roc_list.append(roc)
            precision_list.append(precision)
            recall_list.append(recall)
            f1_list.append(f1)

        print("{}_SIF_{}_LOO_accuracy {}".format(data_name, clf_name + str(dim), accu_list))
        print("{}_SIF_{}_LOO_roc {}".format(data_name, clf_name + str(dim), roc_list))
        print("{}_SIF_{}_LOO_precision {}".format(data_name, clf_name + str(dim), precision_list))
        print("{}_SIF_{}_LOO_recall {}".format(data_name, clf_name + str(dim), recall_list))
        print("{}_SIF_{}_LOO_f1 {}".format(data_name, clf_name + str(dim), f1_list))
        print("{0}_SIF_LOO_{1} {2:.4f} + {3:.4f} {4:.4f} + {5:.4f} {6:.4f} + {7:.4f} {8:.4f} + {9:.4f} {10:.4f} + {11:.4f}"
              .format(data_name, clf_name + str(dim),
                      np.mean(accu_list), np.std(accu_list),
                      np.mean(roc_list), np.std(roc_list),
                      np.mean(f1_list), np.std(f1_list),
                      np.mean(precision_list), np.std(precision_list),
                      np.mean(recall_list), np.std(recall_list)))