def __init__(self, queryFile, candidatePath, mu, corpusFile, sigma, lamda):
    """Initialize clustering state and the similarity/distance helpers.

    queryFile     -- accepted but not used here (NOTE(review): confirm callers
                     actually need this parameter).
    candidatePath -- directory containing the candidate tweet files.
    mu            -- smoothing parameter forwarded to Distance.
    corpusFile    -- corpus read by Distance at construction time.
    sigma         -- query-tweet similarity threshold.
    lamda         -- cluster-membership threshold.
    """
    self.query = {}                  # qid -> query content
    self.candidate = candidatePath   # base path for candidate files
    self.tweet = {}                  # wid -> tweet content
    self.mu = mu
    self.sigma = sigma               # similarity threshold
    self.lamda = lamda               # cluster threshold
    self.jaccInstance = Jaccard()
    self.klInstance = Distance(mu, corpusFile)
    # Fix: use the function-call form of print. With a single argument it
    # behaves identically under Python 2's print statement and is valid
    # Python 3, whereas the original `print "..."` is a Py3 syntax error.
    print("corpus read done!")
def __init__(self, train_path, test_path):
    """Prepare empty data holders, the preprocessing pipeline and the models.

    train_path/test_path -- directories that load() later reads STS files from.
    """
    self.train_path = train_path
    self.test_path = test_path
    # Input sentence pairs and gold-standard labels; populated by load().
    self.trn = pd.DataFrame(columns=Classifier._COLS)
    self.tst = pd.DataFrame(columns=Classifier._COLS)
    self.trn_gs = pd.DataFrame(columns=Classifier._GS_COLS)
    self.tst_gs = pd.DataFrame(columns=Classifier._GS_COLS)
    # Tokenized versions of the sentence pairs; filled during classify().
    self.tok_trn = []
    self.tok_tst = []
    # Processing helpers and the three regressors that get compared.
    self.preprocessor = Preprocessor()
    self.feature_extractor = FeatureExtractor()
    self.jaccard = Jaccard()
    self.rfr = RFR()
    self.nn = MLPRegressor(
        hidden_layer_sizes=(100, 30, 30),
        validation_fraction=0.3,
        alpha=0.3,
        warm_start=False,
        max_iter=1000,
        activation='logistic',
    )
class Classifier:
    """Semantic-textual-similarity classifier comparing RFR, an MLP, Jaccard
    and a simple average of RFR+MLP on sentence-pair data."""

    # Column name for the gold-standard similarity labels.
    _GS_COLS = ['labels']
    # Column names for the two sentences of each pair.
    _COLS = ['sentence0', 'sentence1']

    def __init__(self, train_path, test_path):
        """Set up empty frames, preprocessing helpers and the regressors.

        train_path/test_path -- directories that load() reads STS files from.
        """
        self.train_path = train_path
        self.test_path = test_path
        self.preprocessor = Preprocessor()
        self.trn = pd.DataFrame(columns=Classifier._COLS)  # Read data_frame
        self.tst = pd.DataFrame(columns=Classifier._COLS)  # Read data_frame
        self.trn_gs = pd.DataFrame(columns=Classifier._GS_COLS)  # Known labels
        self.tst_gs = pd.DataFrame(columns=Classifier._GS_COLS)  # Known labels
        # Tokenized sentence pairs; populated in classify().
        self.tok_trn = []
        self.tok_tst = []
        self.feature_extractor = FeatureExtractor()
        self.jaccard = Jaccard()
        self.rfr = RFR()
        self.nn = MLPRegressor(hidden_layer_sizes=(100, 30, 30),
                               validation_fraction=0.3,
                               alpha=0.3,
                               warm_start=False,
                               max_iter=1000,
                               activation='logistic')

    # -------------------------------------------------- CLASSIFY ▼ ----------------------------------------------------
    def classify(self):
        """Tokenize, build bag-of-words features, train RFR and the MLP,
        predict on train and test, and display all results."""
        print(self.trn.head())
        print('Preprocessing...')
        self.tok_trn = self.preprocessor.run(self.trn)
        self.tok_tst = self.preprocessor.run(self.tst)
        print(self.tok_trn.head())
        # NOTE(review): hard-coded row 483 looks like leftover debugging and
        # will raise IndexError on smaller datasets -- confirm before keeping.
        print(self.tok_trn['sentence0'].values[483])
        print(self.trn['sentence0'].values[483])
        # Features
        # NOTE(review): features are loaded from pre-computed pickle dumps;
        # the extraction calls below are commented out, so these dump files
        # must already exist on disk.
        fea_trn = pd.read_pickle('./dump/fea_trn4.dump')
        fea_tst = pd.read_pickle('./dump/fea_tst4.dump')
        #fea_trn = self.feature_extractor.extract(tok_trn)
        #fea_tst = self.feature_extractor.extract(tok_tst)
        #fea_trn.to_pickle('./dump/fea_trn4.dump')
        #fea_tst.to_pickle('./dump/fea_tst4.dump')
        print('Creating BOG...')
        bog = BOG()
        bog.train_dictionary(self.tok_trn)
        # Unscaled features feed the random forest; scaled ones feed the MLP.
        bog_extended_trn = bog.get_bog_extended(self.tok_trn, fea_trn)
        bog_extended_tst = bog.get_bog_extended(self.tok_tst, fea_tst)
        bog_extended_trn_scaled = bog.get_bog_extended(self.tok_trn, fea_trn, scale=True)
        bog_extended_tst_scaled = bog.get_bog_extended(self.tok_tst, fea_tst, scale=True)
        print('Training RFR...')
        self.rfr.fit(bog_extended_trn, self.trn_gs['labels'].values)
        self.rfr.print_feature_importance(bog_extended_trn)
        print('Training NN...')
        self.nn.fit(bog_extended_trn_scaled, self.trn_gs['labels'].values)
        print('Testing...')
        predict_nn_trn = self.nn.predict(bog_extended_trn_scaled)
        predict_nn_tst = self.nn.predict(bog_extended_tst_scaled)
        predict_rfr_trn = self.rfr.predict(bog_extended_trn)
        predict_rfr_tst = self.rfr.predict(bog_extended_tst)
        predict_jac_trn = self.jaccard.predict(self.tok_trn)
        predict_jac_tst = self.jaccard.predict(self.tok_tst)
        # "Voting" = plain 50/50 average of RFR and NN predictions.
        predict_vot_trn = self.average(predict_rfr_trn, predict_nn_trn)
        predict_vot_tst = self.average(predict_rfr_tst, predict_nn_tst)
        self.show_results(predict_rfr_trn, predict_rfr_tst, predict_jac_trn,
                          predict_jac_tst, predict_nn_trn, predict_nn_tst,
                          predict_vot_trn, predict_vot_tst)

    def average(self, predict_rfr, predict_nn):
        """Return the element-wise 50/50 average of two prediction sequences."""
        voted = []
        for rfr, nn in zip(predict_rfr, predict_nn):
            voted.append(0.5 * rfr + 0.5 * nn)
        return voted

    # ---------------------------------------------------- SHOW ▼ -----------------------------------------------------
    def __add_table(self, table, name, trn, tst):
        """Append one column to the results table: mean/std for train and
        test plus Pearson correlation against the gold-standard labels."""
        table.append_column(name, [
            '{:.2f} std: {:.1f}'.format(np.mean(trn), np.std(trn)),
            '{:.2f} std: {:.1f}'.format(np.mean(tst), np.std(tst)),
            '{:.4f}'.format(pearsonr(trn, self.trn_gs['labels'])[0]),
            '{:.4f}'.format(pearsonr(tst, self.tst_gs['labels'])[0])
        ])

    def show_results(self, rfr_trn, rfr_tst, jac_trn, jac_tst, nn_trn, nn_tst, vot_trn, vot_tst):
        """Print a summary table, show predicted-vs-real scatter plots for
        every model on train and test, and list worst/best test cases."""
        table = BeautifulTable()
        table.append_column('', ['Trn', 'Tst', 'Trn Pearson', 'Tst Pearson'])
        self.__add_table(table, 'Real', self.trn_gs['labels'], self.tst_gs['labels'])
        self.__add_table(table, 'RFR', rfr_trn, rfr_tst)
        self.__add_table(table, 'Jaccard', jac_trn, jac_tst)
        self.__add_table(table, 'NN', nn_trn, nn_tst)
        self.__add_table(table, 'Voting', vot_trn, vot_tst)
        # One blocking scatter plot per model/split (plt.show() pauses until
        # each window is closed).
        plt.scatter(nn_trn, self.trn_gs['labels'], c='Cyan')
        plt.xlabel('NN label')
        plt.ylabel('Real label')
        plt.show()
        plt.scatter(vot_trn, self.trn_gs['labels'], c='Blue')
        plt.xlabel('Averaging label')
        plt.ylabel('Real label')
        plt.show()
        plt.scatter(jac_trn, self.trn_gs['labels'], c='Green')
        plt.xlabel('Jaccard label')
        plt.ylabel('Real label')
        plt.show()
        plt.scatter(rfr_trn, self.trn_gs['labels'], c='Red')
        plt.xlabel('RFR label')
        plt.ylabel('Real label')
        plt.show()
        plt.scatter(nn_tst, self.tst_gs['labels'], c='Cyan')
        plt.xlabel('NN label')
        plt.ylabel('Real label')
        plt.show()
        plt.scatter(vot_tst, self.tst_gs['labels'], c='Blue')
        plt.xlabel('Averaging label')
        plt.ylabel('Real label')
        plt.show()
        plt.scatter(jac_tst, self.tst_gs['labels'], c='Green')
        plt.xlabel('Jaccard label')
        plt.ylabel('Real label')
        plt.show()
        plt.scatter(rfr_tst, self.tst_gs['labels'], c='Red')
        plt.xlabel('RFR label')
        plt.ylabel('Real label')
        plt.show()
        print(table)
        self.show_worst_test(vot_tst, rfr_tst, nn_tst, jac_tst)
        print()
        self.show_best_test(vot_tst, rfr_tst, nn_tst, jac_tst)
        print()

    def show_best_test(self, predicted, predicted_rfr, predicted_nn, predicted_jac, k=15):
        """Print the k test pairs with the SMALLEST absolute averaging error,
        together with each model's prediction and both sentences."""
        print('Best results in averaging:')
        err = np.abs(predicted - self.tst_gs['labels'].values)
        idx = np.argpartition(err, k)[:k]
        # NOTE(review): keying the dict by error value collapses ties (two
        # rows with identical error keep only one), and the loop variable
        # below shadows the `err` array -- works, but fragile.
        dic = {
            err[i]: i
            for i in idx
        }  # Create a dictionary with the errors as the key for sorting output
        for err in sorted(dic, reverse=True):
            i = dic[err]
            print(
                '\33[100m{:d} Predicted [Averaging: {:.2f} RFR: {:.2f} NN: {:.2f} Jaccard: {:.2f}] Target: {:.2f} Err: {:.2f}\033[0m\n{:s}\n{:s}'
                .format(
                    i, predicted[i], predicted_rfr[i], predicted_nn[i],
                    predicted_jac[i], self.tst_gs['labels'].values[i], err,
                    str(self.tst['sentence0'].values[i]).replace('\n', '').replace(
                        '\r', ''),
                    str(self.tst['sentence1'].values[i]).replace(
                        '\n', '').replace('\r', '')))

    def show_worst_test(self, predicted, predicted_rfr, predicted_nn, predicted_jac, k=15):
        """Print the k test pairs with the LARGEST absolute averaging error,
        together with each model's prediction and both sentences."""
        print('Worst results in averaging:')
        err = np.abs(predicted - self.tst_gs['labels'].values)
        idx = np.argpartition(err, -k)[-k:]
        # NOTE(review): same tie-collapsing / shadowing caveat as in
        # show_best_test.
        dic = {
            err[i]: i
            for i in idx
        }  # Create a dictionary with the errors as the key for sorting output
        for err in sorted(dic, reverse=True):
            i = dic[err]
            print(
                '\33[100m{:d} Predicted [Averaging: {:.2f} RFR: {:.2f} NN: {:.2f} Jaccard: {:.2f}] Target: {:.2f} Err: {:.2f}\033[0m\n{:s}\n{:s}'
                .format(
                    i, predicted[i], predicted_rfr[i], predicted_nn[i],
                    predicted_jac[i], self.tst_gs['labels'].values[i], err,
                    str(self.tst['sentence0'].values[i]).replace('\n', '').replace(
                        '\r', ''),
                    str(self.tst['sentence1'].values[i]).replace(
                        '\n', '').replace('\r', '')))

    # --------------------------------------------------- LOADING ▼ ---------------------------------------------------
    def load(self, use_dump=True):
        """Load train/test inputs and gold-standard labels from disk.

        use_dump is currently unused.
        """
        self.trn, self.trn_gs = self.__load_all(self.train_path)
        self.tst, self.tst_gs = self.__load_all(self.test_path)
        print('Train: {0} Test: {1}'.format(self.trn.shape, self.tst.shape))

    def __load_all(self, dir):
        """Read every STS.input* file in `dir` (and its matching gs file)
        into two aligned DataFrames: sentence pairs and labels."""
        print(dir)
        files = listdir(dir)
        input = pd.DataFrame(columns=['sentence0', 'sentence1'])
        label = pd.DataFrame(columns=['labels'])
        for file in files:
            path = pth.join(dir, file)
            # Gold-standard file shares the name with 'input' swapped for 'gs'.
            path_gs = path.replace('input', 'gs')
            if 'STS.input' in path:  # Only read input files
                input_df = pd.read_csv(path,
                                       sep='\t',
                                       lineterminator='\n',
                                       names=Classifier._COLS,
                                       header=None,
                                       quoting=csv.QUOTE_NONE)
                label_df = pd.read_csv(path_gs,
                                       sep='\t',
                                       lineterminator='\n',
                                       names=Classifier._GS_COLS,
                                       header=None,
                                       quoting=csv.QUOTE_NONE)
                # NOTE(review): DataFrame.append was deprecated in pandas 1.4
                # and removed in 2.0 -- pd.concat is the replacement.
                input = input.append(input_df)
                label = label.append(label_df)
        return \
            input.fillna('').reset_index(drop=True), \
            label.fillna('').reset_index(drop=True)
class Cluster:
    """Clusters candidate tweets per query using KL distance between tweets
    and writes TREC-style run files plus several score logs. (Python 2 code.)"""

    def __init__(self, queryFile, candidatePath, mu, corpusFile, sigma, lamda):
        # NOTE(review): queryFile is never used in this constructor --
        # confirm whether callers still need to pass it.
        self.query = {}                 # qid -> query content
        self.candidate = candidatePath  # directory with "<n>.res.content.all" files
        self.tweet = {}                 # wid -> tweet content
        self.mu = mu                    # smoothing parameter forwarded to Distance
        self.sigma = sigma #similarity threshold
        self.lamda = lamda #cluster threshold
        self.jaccInstance = Jaccard()
        self.klInstance = Distance(mu, corpusFile)
        print "corpus read done!"

    def write(self, writePath, alpha, yibuson):
        """Process candidate files 1..55: filter tweets by score and KL
        similarity, cluster them, rank one representative per cluster, and
        write the run file plus four diagnostic logs.

        writePath -- output path prefix for the result/log files.
        alpha     -- weight forwarded to Rank.combinedCov.
        yibuson   -- jaccard threshold; only used in output file names here
                     (the jaccard filter itself is commented out below).
        """
        # NOTE(review): none of these five files is ever closed or flushed.
        writeFile = writePath + "res.rmSW4.59S" + str(self.sigma) + ".L" + str(self.lamda) + ".A" + str(alpha) + ".NJ" + str(yibuson)
        result = open(writeFile, "w+")
        log = open(writePath + "log.rmSW4.59S" + str(self.sigma) + ".L" + str(self.lamda) + ".A" + str(alpha) + ".NJ" + str(yibuson), "w+")
        log.write("Qid\tclusterCount\ttweetCount\n")
        log1 = open(writePath + "qidWidKL.rmSW4.59S" + str(self.sigma) + ".L" + str(self.lamda) + ".A" + str(alpha) + ".NJ" + str(yibuson), "w+")
        log2 = open(writePath + "widWidKL.rmSW4.59S" + str(self.sigma) + ".L" + str(self.lamda) + ".A" + str(alpha) + ".NJ" + str(yibuson), "w+")
        log3 = open(writePath + "jaccScore.rmSW4.59S" + str(self.sigma) + ".L" + str(self.lamda) + ".A" + str(alpha) + ".NJ" + str(yibuson), "w+")
        # Candidate files are named 1..55 .res.content.all.
        num = 1
        files = []
        while(num <= 55):
            files.append(str(num) + ".res.content.all")
            num += 1
        for file in files:
            #remember to make them initial on each query
            # Per-query state: clusters, KL score maps and their min/max,
            # jaccard score map, and the selected representatives.
            self.curQid = -1
            self.cluster = []
            self.qidWidKL = {}
            self.qidWidMax = 0
            self.qidWidMin = 999
            self.widWidKL = {}
            self.widWidMax = 0
            self.widWidMin = 999
            self.widScore = {}
            self.jacc = {}
            self.jaccMax = 0
            self.jaccMin = 1
            self.resultList = []
            readPath = self.candidate + file
            with open(readPath, "r") as fin:
                for i, line in enumerate(fin):
                    # Each line: qid, Qid, wid, rank, score, runName, tweet content, query content.
                    qid, Qid, wid, rank, score, runName, wcontent, qcontent = line.strip().split("\t")
                    self.query[qid] = qcontent
                    #first time selection
                    # Stop at the first tweet whose retrieval score drops
                    # below the hard-coded 4.59 cutoff.
                    if(float(score) < 4.59):
                        if not self.cluster:
                            print "break out of 4.59, ", file, " , empty cluster!"
                            exit()
                        break
                    self.tweet[wid] = wcontent
                    self.curQid = qid
                    #calculate qidWidKL
                    similarity = self.klInstance.kl(self.query[qid], wcontent)
                    #calculate jaccard score
                    jaccScore = self.jaccInstance.jaccardScore(qcontent, wcontent)
                    #if similarity <= self.sigma and jaccScore >= yibuson:
                    if similarity <= self.sigma:
                        #set self.qidWidKL
                        self.qidWidKL[qid+"-"+wid] = similarity
                        if (similarity > self.qidWidMax):
                            self.qidWidMax = similarity
                        if (similarity < self.qidWidMin):
                            self.qidWidMin = similarity
                        #set self.jacc
                        self.jacc[qid+"-"+wid] = jaccScore
                        if self.jaccMax < jaccScore:
                            self.jaccMax = jaccScore
                        if self.jaccMin > jaccScore:
                            self.jaccMin = jaccScore
                        #calculate widWidKL
                        # First accepted tweet seeds the first cluster;
                        # subsequent tweets are assigned via __clustering.
                        if not self.cluster:
                            self.cluster.append([wid])
                            self.widWidKL[wid] = []
                        else:
                            self.__clustering(wid)
                    # Progress report every 100 lines.
                    if (i % 100) == 0:
                        print file, " => ", i
            # Dump query-tweet KL scores (min/max header, then one per line).
            log1.write(str(self.qidWidMin) + "\t" + str(self.qidWidMax) + "\n")
            for key in self.qidWidKL:
                log1.write(key + "\t" + str(self.qidWidKL[key]) + "\n")
            # Dump tweet-tweet KL scores.
            log2.write(str(self.widWidMin) + "\t" + str(self.widWidMax) + "\n")
            for key in self.widWidKL:
                for i in range(len(self.widWidKL[key])):
                    for widKey in self.widWidKL[key][i]:
                        log2.write(key + "-" + widKey + "\t" + str(self.widWidKL[key][i][widKey]) + "\n")
            # Dump jaccard scores.
            log3.write(str(self.jaccMin) + "\t" + str(self.jaccMax) + "\n")
            for key in self.jacc:
                log3.write(key + "\t" + str(self.jacc[key]) + "\n")
            # Score every tweet with the combined-coverage ranker.
            rankInstance = Rank(self.widWidKL, self.widWidMin, self.widWidMax)
            #self.widScore = rankInstance.textRank()
            self.widScore = rankInstance.combinedCov(alpha, self.cluster, self.tweet)
            #log info
            clusterCount = len(self.cluster)
            tweetCount = 0
            #select one wid from each cluster
            # NOTE(review): if every widScore in a cluster is <= 0, bestWid
            # stays -1 and an integer is appended among string wids.
            for i in range(len(self.cluster)):
                maxScore = 0
                bestWid = -1
                tweetCount += len(self.cluster[i])
                for wid in self.cluster[i]:
                    if self.widScore[wid] > maxScore:
                        #select min query-tweet kl score
                        #key = str(self.curQid) + "-" + wid
                        #if self.qidWidKL[key] > maxScore:
                        #maxScore = self.qidWidKL[key]
                        maxScore = self.widScore[wid]
                        bestWid = wid
                self.resultList.append(bestWid)
            #write log info
            log.write("MB" + self.curQid + "\t" + str(clusterCount) + "\t" + str(tweetCount) + "\n")
            #write result
            # TREC run format: topic, Q0, docid, rank, score, run tag.
            for wid in self.resultList:
                result.write("MB" + self.curQid + "\t" + "Q0\t" + wid + "\t1\t1\tYAO\n")

    def __clustering(self, wid):
        """Assign tweet `wid` to the existing cluster containing its most
        KL-similar tweet, or start a new cluster if even the best match
        exceeds the lamda threshold. Also records all pairwise KL scores."""
        minScore = 999
        index = -1
        wcontent = self.tweet[wid]
        for i in range(len(self.cluster)):
            for cwid in self.cluster[i]:
                ccontent = self.tweet[cwid]
                score = self.klInstance.kl(self.tweet[wid], self.tweet[cwid])
                #print i
                # Record the pairwise score in both directions.
                if wid in self.widWidKL:
                    self.widWidKL[wid].append({cwid: score})
                else:
                    self.widWidKL[wid] = [{cwid: score}]
                if cwid in self.widWidKL:
                    self.widWidKL[cwid].append({wid: score})
                else:
                    self.widWidKL[cwid] = [{wid: score}]
                #select miniScore, that is the most similar value
                if score < minScore:
                    minScore = score
                    index = i
                #record self.widWidMax & self.widWidMin
                if score < self.widWidMin:
                    self.widWidMin = score
                if score > self.widWidMax:
                    self.widWidMax = score
        #put wid into the cluster
        #a new cluster
        if minScore > self.lamda:
            self.cluster.append([wid])
            #print self.cluster
        #add to a highest similarity cluster
        else:
            self.cluster[index].append(wid)
class Cluster:
    """KL-distance based tweet clustering per query; emits a TREC run file
    and four score logs. (Python 2 code.)"""

    def __init__(self, queryFile, candidatePath, mu, corpusFile, sigma, lamda):
        # NOTE(review): queryFile parameter is unused here.
        self.query = {}                 # qid -> query content
        self.candidate = candidatePath  # directory with "<n>.res.content.all" files
        self.tweet = {}                 # wid -> tweet content
        self.mu = mu                    # smoothing parameter for Distance
        self.sigma = sigma #similarity threshold
        self.lamda = lamda #cluster threshold
        self.jaccInstance = Jaccard()
        self.klInstance = Distance(mu, corpusFile)
        print "corpus read done!"

    def write(self, writePath, alpha, yibuson):
        """For candidate files 1..55: filter tweets by retrieval score and KL
        similarity, cluster them, pick one representative per cluster, and
        write results plus diagnostic logs under `writePath`.

        alpha   -- weight passed through to Rank.combinedCov.
        yibuson -- jaccard threshold; appears only in output file names
                   (the jaccard filter condition is commented out).
        """
        # NOTE(review): the five opened files are never closed.
        writeFile = writePath + "res.rmSW4.59S" + str(self.sigma) + ".L" + str(
            self.lamda) + ".A" + str(alpha) + ".NJ" + str(yibuson)
        result = open(writeFile, "w+")
        log = open(
            writePath + "log.rmSW4.59S" + str(self.sigma) + ".L" +
            str(self.lamda) + ".A" + str(alpha) + ".NJ" + str(yibuson), "w+")
        log.write("Qid\tclusterCount\ttweetCount\n")
        log1 = open(
            writePath + "qidWidKL.rmSW4.59S" + str(self.sigma) + ".L" +
            str(self.lamda) + ".A" + str(alpha) + ".NJ" + str(yibuson), "w+")
        log2 = open(
            writePath + "widWidKL.rmSW4.59S" + str(self.sigma) + ".L" +
            str(self.lamda) + ".A" + str(alpha) + ".NJ" + str(yibuson), "w+")
        log3 = open(
            writePath + "jaccScore.rmSW4.59S" + str(self.sigma) + ".L" +
            str(self.lamda) + ".A" + str(alpha) + ".NJ" + str(yibuson), "w+")
        # Build the list of candidate file names 1..55.
        num = 1
        files = []
        while (num <= 55):
            files.append(str(num) + ".res.content.all")
            num += 1
        for file in files:
            #remember to make them initial on each query
            # Reset all per-query state before processing each file.
            self.curQid = -1
            self.cluster = []
            self.qidWidKL = {}
            self.qidWidMax = 0
            self.qidWidMin = 999
            self.widWidKL = {}
            self.widWidMax = 0
            self.widWidMin = 999
            self.widScore = {}
            self.jacc = {}
            self.jaccMax = 0
            self.jaccMin = 1
            self.resultList = []
            readPath = self.candidate + file
            with open(readPath, "r") as fin:
                for i, line in enumerate(fin):
                    # Tab-separated: qid, Qid, wid, rank, score, runName,
                    # tweet content, query content.
                    qid, Qid, wid, rank, score, runName, wcontent, qcontent = line.strip(
                    ).split("\t")
                    self.query[qid] = qcontent
                    #first time selection
                    # Stop reading once the retrieval score drops below 4.59.
                    if (float(score) < 4.59):
                        if not self.cluster:
                            print "break out of 4.59, ", file, " , empty cluster!"
                            exit()
                        break
                    self.tweet[wid] = wcontent
                    self.curQid = qid
                    #calculate qidWidKL
                    similarity = self.klInstance.kl(self.query[qid], wcontent)
                    #calculate jaccard score
                    jaccScore = self.jaccInstance.jaccardScore(
                        qcontent, wcontent)
                    #if similarity <= self.sigma and jaccScore >= yibuson:
                    if similarity <= self.sigma:
                        #set self.qidWidKL
                        self.qidWidKL[qid + "-" + wid] = similarity
                        if (similarity > self.qidWidMax):
                            self.qidWidMax = similarity
                        if (similarity < self.qidWidMin):
                            self.qidWidMin = similarity
                        #set self.jacc
                        self.jacc[qid + "-" + wid] = jaccScore
                        if self.jaccMax < jaccScore:
                            self.jaccMax = jaccScore
                        if self.jaccMin > jaccScore:
                            self.jaccMin = jaccScore
                        #calculate widWidKL
                        # The first accepted tweet seeds the first cluster.
                        if not self.cluster:
                            self.cluster.append([wid])
                            self.widWidKL[wid] = []
                        else:
                            self.__clustering(wid)
                    # Progress output every 100 input lines.
                    if (i % 100) == 0:
                        print file, " => ", i
            # Write query-tweet KL scores (min/max header first).
            log1.write(str(self.qidWidMin) + "\t" + str(self.qidWidMax) + "\n")
            for key in self.qidWidKL:
                log1.write(key + "\t" + str(self.qidWidKL[key]) + "\n")
            # Write tweet-tweet KL scores.
            log2.write(str(self.widWidMin) + "\t" + str(self.widWidMax) + "\n")
            for key in self.widWidKL:
                for i in range(len(self.widWidKL[key])):
                    for widKey in self.widWidKL[key][i]:
                        log2.write(key + "-" + widKey + "\t" +
                                   str(self.widWidKL[key][i][widKey]) + "\n")
            # Write jaccard scores.
            log3.write(str(self.jaccMin) + "\t" + str(self.jaccMax) + "\n")
            for key in self.jacc:
                log3.write(key + "\t" + str(self.jacc[key]) + "\n")
            # Rank tweets with combined coverage.
            rankInstance = Rank(self.widWidKL, self.widWidMin, self.widWidMax)
            #self.widScore = rankInstance.textRank()
            self.widScore = rankInstance.combinedCov(alpha, self.cluster,
                                                     self.tweet)
            #log info
            clusterCount = len(self.cluster)
            tweetCount = 0
            #select one wid from each cluster
            # NOTE(review): bestWid remains -1 (an int) if no score beats 0.
            for i in range(len(self.cluster)):
                maxScore = 0
                bestWid = -1
                tweetCount += len(self.cluster[i])
                for wid in self.cluster[i]:
                    if self.widScore[wid] > maxScore:
                        #select min query-tweet kl score
                        #key = str(self.curQid) + "-" + wid
                        #if self.qidWidKL[key] > maxScore:
                        #maxScore = self.qidWidKL[key]
                        maxScore = self.widScore[wid]
                        bestWid = wid
                self.resultList.append(bestWid)
            #write log info
            log.write("MB" + self.curQid + "\t" + str(clusterCount) + "\t" +
                      str(tweetCount) + "\n")
            #write result
            # One TREC-format line per selected representative tweet.
            for wid in self.resultList:
                result.write("MB" + self.curQid + "\t" + "Q0\t" + wid +
                             "\t1\t1\tYAO\n")

    def __clustering(self, wid):
        """Place `wid` into the cluster holding its most KL-similar member,
        or open a new cluster when the best score exceeds lamda; records all
        pairwise KL scores (in both directions) and the global min/max."""
        minScore = 999
        index = -1
        wcontent = self.tweet[wid]
        for i in range(len(self.cluster)):
            for cwid in self.cluster[i]:
                ccontent = self.tweet[cwid]
                score = self.klInstance.kl(self.tweet[wid], self.tweet[cwid])
                #print i
                # Store the score under both tweet ids.
                if wid in self.widWidKL:
                    self.widWidKL[wid].append({cwid: score})
                else:
                    self.widWidKL[wid] = [{cwid: score}]
                if cwid in self.widWidKL:
                    self.widWidKL[cwid].append({wid: score})
                else:
                    self.widWidKL[cwid] = [{wid: score}]
                #select miniScore, that is the most similar value
                if score < minScore:
                    minScore = score
                    index = i
                #record self.widWidMax & self.widWidMin
                if score < self.widWidMin:
                    self.widWidMin = score
                if score > self.widWidMax:
                    self.widWidMax = score
        #put wid into the cluster
        #a new cluster
        if minScore > self.lamda:
            self.cluster.append([wid])
            #print self.cluster
        #add to a highest similarity cluster
        else:
            self.cluster[index].append(wid)
def get_answer(question):
    """Retrieve candidate passages for `question` and return the one most
    similar to it under the Jaccard measure."""
    searcher = Pyserini(app.config.get('index'))
    candidates = searcher.ranked_passages(query_string=question,
                                          num_hits=30,
                                          k=20)
    return Jaccard().most_similar_passage(question, candidates)