def sim_getCorrelation(We, words, f, weight4ind, scoring_function, params):
    """Score the sentence pairs in a file and correlate them with gold scores.

    Every line of the file is split on tabs; field 0 and field 1 are the two
    sentences and field 2 is parsed as the float gold score. A line with fewer
    than three fields raises ``IndexError`` and a non-numeric score raises
    ``ValueError``.

    :param We: passed through unchanged to ``scoring_function``
    :param words: passed to ``data_io.getSeqs`` together with each sentence pair
    :param f: path of the tab-separated file to read
    :param weight4ind: passed to ``data_io.seq2weight`` to derive the weights
    :param scoring_function: callable invoked as
        ``scoring_function(We, x1, x2, m1, m2, params)``
    :param params: passed through unchanged to ``scoring_function``
    :return: ``(pearson, spearman)`` -- the first element of
        ``pearsonr(preds, golds)`` and of ``spearmanr(preds, golds)``
    """
    golds = []
    seq1 = []
    seq2 = []
    # The context manager closes the handle even when a line fails to parse;
    # the original left the file open for the lifetime of the process.
    with open(f, 'r') as handle:
        for line in handle:
            fields = line.split("\t")
            p1 = fields[0]
            p2 = fields[1]
            score = float(fields[2])
            X1, X2 = data_io.getSeqs(p1, p2, words)
            seq1.append(X1)
            seq2.append(X2)
            golds.append(score)

    x1, m1 = data_io.prepare_data(seq1)
    x2, m2 = data_io.prepare_data(seq2)
    # Replace the values returned by prepare_data with the weights computed
    # from weight4ind.
    m1 = data_io.seq2weight(x1, m1, weight4ind)
    m2 = data_io.seq2weight(x2, m2, weight4ind)
    scores = scoring_function(We, x1, x2, m1, m2, params)

    # Debug output of the first pair and its score. Single-argument print(...)
    # produces the same output under Python 2 and Python 3.
    print(seq1[0])
    print(seq2[0])
    print(scores[0])

    preds = np.squeeze(scores)
    return pearsonr(preds, golds)[0], spearmanr(preds, golds)[0]
def _score_sentiment_batch(model, seqs, params):
    """Score one batch of sequences and return the predictions as a flat list."""
    x1, m1 = data_io.prepare_data(seqs)
    if params and params.weightfile:
        m1 = data_io.seq2weight(x1, m1, params.weight4ind)
    scores = model.scoring_function(x1, m1)
    # np.squeeze turns a one-sentence batch into a 0-d array whose tolist()
    # is a bare scalar, which list.extend() rejects. atleast_1d keeps that
    # case a one-element list and leaves every other shape untouched.
    return np.atleast_1d(np.squeeze(scores)).tolist()


def getAccSentiment(model, words, f, params=None):
    """Predict a score for every sentence in a file and compute the accuracy.

    Every line of the file is split on tabs; field 0 is the sentence and
    field 1 is kept as the gold label exactly as read (a string, not
    converted or stripped). Sentences are scored in batches of 100, with a
    final partial batch for the remainder.

    :param model: object exposing ``scoring_function(x1, m1)``
    :param words: passed to ``data_io.getSeq`` together with each sentence
    :param f: path of the tab-separated file to read
    :param params: optional settings object; when it is truthy and its
        ``weightfile`` attribute is truthy, ``params.weight4ind`` is used to
        re-weight each batch via ``data_io.seq2weight``. ``None`` (the
        default) behaves like the former ``[]`` default: no re-weighting.
    :return: whatever ``accSentiment(preds, golds)`` returns
    """
    batch_size = 100
    preds = []
    golds = []
    batch = []
    # The context manager guarantees the handle is closed (the original
    # never closed it).
    with open(f, 'r') as handle:
        for line in handle:
            fields = line.split("\t")
            p1 = fields[0]
            score = fields[1]
            batch.append(data_io.getSeq(p1, words))
            if len(batch) == batch_size:
                preds.extend(_score_sentiment_batch(model, batch, params))
                batch = []
            golds.append(score)

    # Score whatever is left over after the last full batch.
    if batch:
        preds.extend(_score_sentiment_batch(model, batch, params))
    return accSentiment(preds, golds)
def sim_badSents(We, words, weight4ind, scoring_function, params, fpc, sent1, sent2):
    """Score a single sentence pair and map the result through ``x * 2 + 3``.

    :param We: forwarded unchanged to ``scoring_function``
    :param words: forwarded to ``data_io.getSeqs`` with the two sentences
    :param weight4ind: forwarded to ``data_io.seq2weight``
    :param scoring_function: callable invoked as
        ``scoring_function(We, x1, x2, m1, m2, params, fpc)``
    :param params: forwarded unchanged to ``scoring_function``
    :param fpc: forwarded unchanged to ``scoring_function``
    :param sent1: first sentence of the pair
    :param sent2: second sentence of the pair
    :return: the squeezed score multiplied by 2 and shifted by 3
    """
    first_seq, second_seq = data_io.getSeqs(sent1, sent2, words)

    # Each sentence is wrapped in a one-element batch.
    x1, m1 = data_io.prepare_data([first_seq])
    x2, m2 = data_io.prepare_data([second_seq])
    m1 = data_io.seq2weight(x1, m1, weight4ind)
    m2 = data_io.seq2weight(x2, m2, weight4ind)

    raw_scores = scoring_function(We, x1, x2, m1, m2, params, fpc)
    return np.squeeze(raw_scores) * 2 + 3
def sim_getCorrelation(We, words, f, weight4ind, scoring_function, params, fpc, test_name):
    """Score the sentence pairs in a file; return Pearson r and the RMSE.

    Every line of the file is split on tabs; field 0 and field 1 are the two
    sentences and field 2 is parsed as the float gold score. A line with fewer
    than three fields raises ``IndexError`` and a non-numeric score raises
    ``ValueError``. The squeezed predictions are printed before the metrics
    are computed.

    :param We: passed through unchanged to ``scoring_function``
    :param words: passed to ``data_io.getSeqs`` together with each sentence pair
    :param f: path of the tab-separated file to read
    :param weight4ind: passed to ``data_io.seq2weight`` to derive the weights
    :param scoring_function: callable invoked as
        ``scoring_function(We, x1, x2, m1, m2, params, fpc)``
    :param params: passed through unchanged to ``scoring_function``
    :param fpc: passed through unchanged to ``scoring_function``
    :param test_name: not used by this function; kept so existing callers
        keep working
    :return: ``(pearson, rmse)`` -- the first element of
        ``pearsonr(preds, golds)`` and
        ``sqrt(mean_squared_error(golds, preds))``. The second value is the
        root of the mean squared error, not the plain MSE.
    """
    golds = []
    seq1 = []
    seq2 = []
    # The context manager closes the handle even when a line fails to parse;
    # the original left the file open.
    with open(f, 'r') as handle:
        for line in handle:
            fields = line.split("\t")
            p1 = fields[0]
            p2 = fields[1]
            score = float(fields[2])
            X1, X2 = data_io.getSeqs(p1, p2, words)
            seq1.append(X1)
            seq2.append(X2)
            golds.append(score)

    x1, m1 = data_io.prepare_data(seq1)
    x2, m2 = data_io.prepare_data(seq2)
    m1 = data_io.seq2weight(x1, m1, weight4ind)
    m2 = data_io.seq2weight(x2, m2, weight4ind)

    golds = np.asarray(golds)
    scores = scoring_function(We, x1, x2, m1, m2, params, fpc)
    preds = np.squeeze(scores)
    print(preds)

    rmse = sqrt(mean_squared_error(golds, preds))
    return pearsonr(preds, golds)[0], rmse
def getCorrelation(model, words, f, params=None):
    """Score the sentence pairs in a file with ``model`` and correlate with gold.

    Every line of the file is split on tabs; field 0 and field 1 are the two
    sentences and field 2 is parsed as the float gold score. A line with fewer
    than three fields raises ``IndexError`` and a non-numeric score raises
    ``ValueError``.

    :param model: object exposing ``scoring_function(x1, x2, m1, m2)``
    :param words: passed to ``data_io.getSeqs`` together with each sentence pair
    :param f: path of the tab-separated file to read
    :param params: optional settings object; when it is truthy and its
        ``weightfile`` attribute is truthy, ``params.weight4ind`` is used to
        re-weight both sides via ``data_io.seq2weight``. ``None`` (the
        default) behaves like the former ``[]`` default: no re-weighting.
    :return: ``(pearson, spearman)`` -- the first element of
        ``pearsonr(preds, golds)`` and of ``spearmanr(preds, golds)``
    """
    golds = []
    seq1 = []
    seq2 = []
    # The context manager guarantees the handle is closed (the original
    # never closed it).
    with open(f, 'r') as handle:
        for line in handle:
            fields = line.split("\t")
            p1 = fields[0]
            p2 = fields[1]
            score = float(fields[2])
            X1, X2 = data_io.getSeqs(p1, p2, words)
            seq1.append(X1)
            seq2.append(X2)
            golds.append(score)

    x1, m1 = data_io.prepare_data(seq1)
    x2, m2 = data_io.prepare_data(seq2)
    if params and params.weightfile:
        m1 = data_io.seq2weight(x1, m1, params.weight4ind)
        m2 = data_io.seq2weight(x2, m2, params.weight4ind)

    scores = model.scoring_function(x1, x2, m1, m2)
    preds = np.squeeze(scores)
    return pearsonr(preds, golds)[0], spearmanr(preds, golds)[0]
def prepare_first_pc(We, words, weight4ind, generation_function, params, fpc):
    """Read ``../data/<fpc>`` and hand its weighted sequences to a generator.

    Every line of the file is converted with ``data_io.getSeq``; the lines are
    passed as read, without stripping. The resulting batch is prepared,
    weighted and forwarded to ``generation_function``, whose return value is
    discarded.

    :param We: passed through unchanged to ``generation_function``
    :param words: passed to ``data_io.getSeq`` together with each line
    :param weight4ind: passed to ``data_io.seq2weight`` to derive the weights
    :param generation_function: callable invoked as
        ``generation_function(We, x, m, params, fpc)``
    :param params: passed through unchanged to ``generation_function``
    :param fpc: file name, resolved relative to ``../data/`` and also passed
        as the last argument of ``generation_function``
    :return: None
    """
    print("reading file: {}.".format(fpc))
    path = os.path.join("../data/", fpc)
    # The context manager closes the handle; the original leaked it.
    with open(path, 'r') as handle:
        seq = [data_io.getSeq(line, words) for line in handle]

    x, m = data_io.prepare_data(seq)
    m = data_io.seq2weight(x, m, weight4ind)
    generation_function(We, x, m, params, fpc)
def sim_getCorrelation1(We, words, file_index, weight4ind, scoring_function, params):
    """Correlate predicted pair scores with gold scores stored in a second file.

    ``file_index[0]`` holds one tab-separated sentence pair per line and
    ``file_index[1]`` holds the gold score for the pair on the same line
    number. Both sentences are lower-cased. A pair is skipped when its score
    line cannot be parsed as a float or when ``data_io.getSeqs`` fails for it.

    :param We: passed through unchanged to ``scoring_function``
    :param words: passed to ``data_io.getSeqs`` together with each sentence pair
    :param file_index: indexable with the sentence-pair file path at ``[0]``
        and the score file path at ``[1]``
    :param weight4ind: passed to ``data_io.seq2weight`` to derive the weights
    :param scoring_function: callable invoked as
        ``scoring_function(We, x1, x2, m1, m2, params)``
    :param params: passed through unchanged to ``scoring_function``
    :return: the first element of ``pearsonr(preds, golds)``
    :raises IndexError: if a pair line has fewer than two tab-separated
        fields, or the score file has fewer lines than the pair file
    """
    # Context managers close both handles; the original leaked them.
    with open(file_index[0], 'r') as pair_file:
        lines = pair_file.readlines()
    with open(file_index[1], 'r') as score_file:
        score_lines = score_file.readlines()

    # The original indexed score_lines by position and raised IndexError when
    # it ran out; keep that failure mode instead of letting zip() silently
    # truncate the evaluation set.
    if len(score_lines) < len(lines):
        raise IndexError(
            "score file has {} lines but pair file has {}".format(
                len(score_lines), len(lines)))

    golds = []
    seq1 = []
    seq2 = []
    for pair_line, score_line in zip(lines, score_lines):
        fields = pair_line.split("\t")
        p1 = fields[0].lower()
        p2 = fields[1].lower()
        try:
            score = float(score_line)
            X1, X2 = data_io.getSeqs(p1, p2, words)
        except Exception:
            # Best-effort skip, as in the original, but no longer a bare
            # except: KeyboardInterrupt and SystemExit now propagate.
            continue
        seq1.append(X1)
        seq2.append(X2)
        golds.append(score)

    x1, m1 = data_io.prepare_data(seq1)
    x2, m2 = data_io.prepare_data(seq2)
    m1 = data_io.seq2weight(x1, m1, weight4ind)
    m2 = data_io.seq2weight(x2, m2, weight4ind)
    scores = scoring_function(We, x1, x2, m1, m2, params)
    preds = np.squeeze(scores)
    return pearsonr(preds, golds)[0]
def sentences2idx(texts, words):
    """
    Take in data, output array of word indices that can be fed into the algorithms.

    Each text is stripped of surrounding whitespace and double quotes, cleaned
    with ``utils.clean_text``, split on single spaces, filtered against the
    stopword collection and truncated to the first ``MAX_WORDS`` tokens before
    being converted with ``data_io.getSeq``.

    :param texts: List of texts
    :param words: passed to ``data_io.getSeq`` together with each cleaned text
    :return: x1, m1. x1[i, :] is the word indices in sentence i, m1[i,:] is the mask for sentence i (0 means no word at the location)
    """
    # Fetched once instead of once per text.
    # NOTE(review): assumes utils.get_stopwords() returns the same collection
    # on every call -- confirm.
    stopwords = utils.get_stopwords()

    seq = []
    for t in texts:
        text = t.strip().strip('"')
        text_clean = utils.clean_text(text)
        kept = [w for w in text_clean.split(" ") if w not in stopwords]
        kept = kept[0:MAX_WORDS]
        seq.append(data_io.getSeq(' '.join(kept), words))

    x1, m1 = data_io.prepare_data(seq)
    return x1, m1
sentences = [ 'the lion is the king of the jungle'.split(), 'tigers hunt alone at night'.split(), 'long live the emperor'.split(), 'we call him little bobby tables'.split() ] # import pickle # sentences = pickle.load(open('tiger.pd', 'rb')) db = data_io.setup_db() # weights = data_io.weights_from_file(weightfile, weightpara) # data_io.glove_to_db(wordfile, db, weights=weights) # load sentences idx_mat, weight_mat, data = data_io.prepare_data(sentences, db) # set parameters params = params.params() params.rmpc = rmpc # get SIF embedding embedding = SIF_embedding.SIF_embedding(idx_mat, weight_mat, data, params) with open('svdump.pd', 'wb') as f: pickle.dump(embedding, f) pprint.pprint(list(enumerate(sentences))) print("Cosine dist"), pprint.pprint( scipy.spatial.distance.squareform( scipy.spatial.distance.pdist(embedding, 'cosine')))