def validate(self):
    """Rank the related questions of every dev query with two similarity
    scores: the embedding-based soft score (self.score) and the surface
    score (self.simple_score).

    Writes the soft ranking to <DATA_QUESTION_PATH>/softranking.txt and
    returns (softranking, simranking), each mapping query id to a list of
    (gold_label, score, related_question_id) tuples.
    """
    logging.info('Validating', extra=d)
    softranking, simranking = {}, {}

    total = len(self.devset)
    for idx, q1id in enumerate(self.devset):
        softranking[q1id] = []
        simranking[q1id] = []
        print('Progress: ', round(float(idx + 1) / total, 2), idx + 1,
              sep='\t', end='\r')

        query = self.devset[q1id]
        q1 = query['tokens_proc']
        # Token representation: word2vec vector concatenated with the
        # precomputed ELMo vector for the same position.
        q1_elmo = self.develmo.get(str(self.devidx[q1id]))
        q1_w2v = features.encode(q1, self.word2vec)
        q1emb = [np.concatenate([w, e]) for w, e in zip(q1_w2v, q1_elmo)]

        for duplicate in query['duplicates']:
            rel_question = duplicate['rel_question']
            q2id = rel_question['id']
            q2 = rel_question['tokens_proc']
            q2_elmo = self.develmo.get(str(self.devidx[q2id]))
            q2_w2v = features.encode(q2, self.word2vec)
            q2emb = [np.concatenate([w, e]) for w, e in zip(q2_w2v, q2_elmo)]

            simple_score = self.simple_score(q1, q2)
            soft_score = self.score(q1, q1emb, q2, q2emb)
            real_label = 1 if rel_question['relevance'] != 'Irrelevant' else 0
            simranking[q1id].append((real_label, simple_score, q2id))
            softranking[q1id].append((real_label, soft_score, q2id))

    # Dump the soft ranking in the trec-style "qid q2id 0 score label" format.
    with open(os.path.join(DATA_QUESTION_PATH, 'softranking.txt'), 'w') as f:
        for q1id, rows in softranking.items():
            for gold, score, q2id in rows:
                label = 'true' if gold == 1 else 'false'
                f.write('\t'.join([str(q1id), str(q2id), str(0),
                                   str(score), label, '\n']))

    logging.info('Finishing to validate.', extra=d)
    return softranking, simranking
def get_features(self, q1id, q1, q2id, q2, set='train'):
    """Return three soft-cosine similarity features between two questions,
    one per ELMo layer (bottom/middle/top), each layer concatenated with
    the word2vec token vectors.

    Args:
        q1id, q2id: question ids used to look up the precomputed ELMo
            embeddings in the train or dev index.
        q1, q2: tokenized questions.
        set: 'train' to read from the train ELMo store, anything else reads
            the dev store. NOTE(review): this keyword shadows the builtin
            ``set``; kept for backward compatibility with existing callers.

    Returns:
        list of three floats: simbow score at ELMo layers 0, 1 and 2.
    """
    if set == 'train':
        q1_elmo = self.trainelmo.get(str(self.trainidx[q1id]))
        q2_elmo = self.trainelmo.get(str(self.trainidx[q2id]))
    else:
        q1_elmo = self.develmo.get(str(self.devidx[q1id]))
        q2_elmo = self.develmo.get(str(self.devidx[q2id]))

    def _stack(w2v, elmo_layer):
        # One vector per token: word2vec embedding + ELMo layer embedding.
        return [np.concatenate([w2v[i], elmo_layer[i]])
                for i in range(len(w2v))]

    q1_w2v = features.encode(q1, self.word2vec)
    q2_w2v = features.encode(q2, self.word2vec)

    # One soft-cosine feature per ELMo layer (0=bottom, 1=middle, 2=top).
    X = []
    for layer in range(3):
        X.append(self.simbow.score(q1, _stack(q1_w2v, q1_elmo[layer]),
                                   q2, _stack(q2_w2v, q2_elmo[layer])))
    return X
def validate(self):
    """Rank dev related questions with six translation-model scores.

    For each (query, related question) pair it computes the surface
    translation scores (self.translation.score) and their embedding-based
    counterparts (score_embeddings). The trlm ranking is dumped to
    data/translationranking.txt.

    Returns six dicts — simplelm, simpletrm, simpletrlm, lm, trm, trlm —
    each mapping query id to (gold_label, probability, q2id) tuples.
    """
    logging.info('Validating', extra=d)
    simplelm, simpletrm, simpletrlm = {}, {}, {}
    lm, trm, trlm = {}, {}, {}
    rankings = (simplelm, simpletrm, simpletrlm, lm, trm, trlm)

    total = len(self.devset)
    for j, q1id in enumerate(self.devset):
        for ranking in rankings:
            ranking[q1id] = []
        print('Progress: ', round(float(j + 1) / total, 2), j + 1,
              sep='\t', end='\r')

        query = self.devset[q1id]
        q1 = query['tokens_proc']
        # Per-token word2vec + ELMo concatenation for the query.
        q1_elmo = self.develmo.get(str(self.devidx[q1id]))
        q1_w2v = features.encode(q1, self.word2vec)
        q1emb = [np.concatenate([w, e]) for w, e in zip(q1_w2v, q1_elmo)]

        for duplicate in query['duplicates']:
            rel_question = duplicate['rel_question']
            q2id = rel_question['id']
            q2 = rel_question['tokens_proc']
            q2_elmo = self.develmo.get(str(self.devidx[q2id]))
            q2_w2v = features.encode(q2, self.word2vec)
            q2emb = [np.concatenate([w, e]) for w, e in zip(q2_w2v, q2_elmo)]

            # Surface-form scores and embedding-based variants; the fourth
            # return value (processing time) is discarded.
            slmprob, strmprob, strlmprob, _ = self.translation.score(q1, q2)
            lmprob, trmprob, trlmprob, _ = self.translation.score_embeddings(
                q1, q1emb, q2, q2emb)

            real_label = 1 if rel_question['relevance'] != 'Irrelevant' else 0
            probs = (slmprob, strmprob, strlmprob, lmprob, trmprob, trlmprob)
            for ranking, prob in zip(rankings, probs):
                ranking[q1id].append((real_label, prob, q2id))

    with open('data/translationranking.txt', 'w') as f:
        for q1id, rows in trlm.items():
            for gold, prob, q2id in rows:
                label = 'true' if gold == 1 else 'false'
                f.write('\t'.join([str(q1id), str(q2id), str(0),
                                   str(prob), label, '\n']))

    logging.info('Finishing to validate.', extra=d)
    return simplelm, simpletrm, simpletrlm, lm, trm, trlm
def validate(self):
    """Validate the tree-kernel SVM on the dev set.

    For each (query, related question) pair, builds the precomputed-kernel
    row against every training pair — the normalized tree-kernel similarity
    of the first questions plus that of the second questions — and scores
    it with the trained SVM. Dumps the ranking to data/treeranking.txt.

    Returns:
        ranking: dict query id -> list of (gold_label, score, q2id).
        y_real: gold binary labels, one per related question.
        y_pred: predicted binary labels, one per related question.
    """
    logging.info('Validating tree svm.', extra=d)
    treekernel = features.TreeKernel()
    ranking = {}
    y_real, y_pred = [], []

    for i, q1id in enumerate(self.devset):
        ranking[q1id] = []
        percentage = round(float(i + 1) / len(self.devset), 2)

        query = self.devset[q1id]
        # Lemmatized binary parse tree + word2vec/ELMo token embeddings.
        q1_token2lemma = dict(zip(query['tokens'], query['lemmas']))
        q1_tree = utils.binarize(
            utils.parse_tree(query['tree'], q1_token2lemma))
        q1_w2v = features.encode(query['tokens'], self.word2vec)
        q1_elmo = self.fulldevelmo.get(str(self.fulldevidx[q1id]))
        q1_emb = [np.concatenate([w, e]) for w, e in zip(q1_w2v, q1_elmo)]

        for duplicate in query['duplicates']:
            rel_question = duplicate['rel_question']
            q2id = rel_question['id']
            # Parse tree of the related question.
            q2_token2lemma = dict(
                zip(rel_question['tokens'], rel_question['lemmas']))
            q2_tree = utils.binarize(
                utils.parse_tree(rel_question['tree'], q2_token2lemma))
            # Embeddings of the related question.
            q2_w2v = features.encode(rel_question['tokens'], self.word2vec)
            q2_elmo = self.fulldevelmo.get(str(self.fulldevidx[q2id]))
            q2_emb = [np.concatenate([w, e]) for w, e in zip(q2_w2v, q2_elmo)]
            # Mark shared terminals between the two trees before scoring.
            q1_tree, q2_tree = treekernel.similar_terminals(q1_tree, q2_tree)

            X = []
            for trainrow in self.traindata:
                c1id, c2id = trainrow['q1_id'], trainrow['q2_id']
                c1_token2lemma = dict(
                    zip(trainrow['q1_full'], trainrow['q1_lemmas']))
                c2_token2lemma = dict(
                    zip(trainrow['q2_full'], trainrow['q2_lemmas']))
                c1_tree = utils.binarize(
                    utils.parse_tree(trainrow['q1_tree'], c1_token2lemma))
                c2_tree = utils.binarize(
                    utils.parse_tree(trainrow['q2_tree'], c2_token2lemma))
                # Embeddings of the training pair.
                c1_w2v = features.encode(trainrow['q1_full'], self.word2vec)
                c1_elmo = self.fulltrainelmo.get(str(self.fulltrainidx[c1id]))
                c1_emb = [np.concatenate([w, e])
                          for w, e in zip(c1_w2v, c1_elmo)]
                c2_w2v = features.encode(trainrow['q2_full'], self.word2vec)
                c2_elmo = self.fulltrainelmo.get(str(self.fulltrainidx[c2id]))
                c2_emb = [np.concatenate([w, e])
                          for w, e in zip(c2_w2v, c2_elmo)]
                c1_tree, c2_tree = treekernel.similar_terminals(
                    c1_tree, c2_tree)

                # Normalized kernels: K(a, b) / sqrt(K(a, a) * K(b, b)).
                kq1 = self.memoize(q1id, q1_tree, q1_emb,
                                   q1id, q1_tree, q1_emb, treekernel)
                kc1 = self.memoize(c1id, c1_tree, c1_emb,
                                   c1id, c1_tree, c1_emb, treekernel)
                kq1c1 = float(self.memoize(q1id, q1_tree, q1_emb,
                                           c1id, c1_tree, c1_emb,
                                           treekernel)) / np.sqrt(kq1 * kc1)
                kq2 = self.memoize(q2id, q2_tree, q2_emb,
                                   q2id, q2_tree, q2_emb, treekernel)
                kc2 = self.memoize(c2id, c2_tree, c2_emb,
                                   c2id, c2_tree, c2_emb, treekernel)
                kq2c2 = float(self.memoize(q2id, q2_tree, q2_emb,
                                           c2id, c2_tree, c2_emb,
                                           treekernel)) / np.sqrt(kq2 * kc2)
                X.append(kq1c1 + kq2c2)
                print('Progress: ', percentage, i + 1, sep='\t', end='\r')

            score = self.model.decision_function([X])[0]
            pred_label = self.model.predict([X])[0]
            y_pred.append(pred_label)
            real_label = 1 if rel_question['relevance'] != 'Irrelevant' else 0
            y_real.append(real_label)
            ranking[q1id].append((real_label, score, q2id))

    with open('data/treeranking.txt', 'w') as f:
        for qid, rows in ranking.items():
            for gold, row_score, row_q2id in rows:
                label = 'true' if gold == 1 else 'false'
                f.write('\t'.join([str(qid), str(row_q2id), str(0),
                                   str(row_score), label, '\n']))

    logging.info('Finishing to validate tree svm.', extra=d)
    return ranking, y_real, y_pred
def train(self):
    """Train the tree-kernel SVM on a precomputed normalized kernel matrix.

    Each entry of the matrix combines the normalized tree-kernel similarity
    between the first questions of two training pairs and that between
    their second questions. The matrix is cached at KERNEL_PATH (pickle)
    and reused on later runs.

    Fix over previous revision: pickle file handles are now closed
    deterministically via ``with`` (they were leaked before).
    """
    logging.info('Training tree svm.', extra=d)
    treekernel = features.TreeKernel()
    if not os.path.exists(KERNEL_PATH):
        X, y = [], []
        for i, q in enumerate(self.traindata):
            percentage = round(float(i + 1) / len(self.traindata), 2)
            x = []
            q1id, q2id = q['q1_id'], q['q2_id']
            # Lemmatized binary parse trees of the pair.
            q1_token2lemma = dict(zip(q['q1_full'], q['q1_lemmas']))
            q2_token2lemma = dict(zip(q['q2_full'], q['q2_lemmas']))
            q1 = utils.binarize(
                utils.parse_tree(q['q1_tree'], q1_token2lemma))
            q2 = utils.binarize(
                utils.parse_tree(q['q2_tree'], q2_token2lemma))
            # word2vec + ELMo token embeddings.
            q1_w2v = features.encode(q['q1_full'], self.word2vec)
            q1_elmo = self.fulltrainelmo.get(str(self.fulltrainidx[q1id]))
            q1_emb = [np.concatenate([q1_w2v[k], q1_elmo[k]])
                      for k in range(len(q1_w2v))]
            q2_w2v = features.encode(q['q2_full'], self.word2vec)
            q2_elmo = self.fulltrainelmo.get(str(self.fulltrainidx[q2id]))
            q2_emb = [np.concatenate([q2_w2v[k], q2_elmo[k]])
                      for k in range(len(q2_w2v))]
            # Mark shared terminals before kernel computation.
            q1, q2 = treekernel.similar_terminals(q1, q2)

            for j, c in enumerate(self.traindata):
                c1id, c2id = c['q1_id'], c['q2_id']
                # Trees of the candidate pair.
                c1_token2lemma = dict(zip(c['q1_full'], c['q1_lemmas']))
                c2_token2lemma = dict(zip(c['q2_full'], c['q2_lemmas']))
                c1 = utils.binarize(
                    utils.parse_tree(c['q1_tree'], c1_token2lemma))
                c2 = utils.binarize(
                    utils.parse_tree(c['q2_tree'], c2_token2lemma))
                # Embeddings of the candidate pair.
                c1_w2v = features.encode(c['q1_full'], self.word2vec)
                c1_elmo = self.fulltrainelmo.get(str(self.fulltrainidx[c1id]))
                c1_emb = [np.concatenate([c1_w2v[k], c1_elmo[k]])
                          for k in range(len(c1_w2v))]
                c2_w2v = features.encode(c['q2_full'], self.word2vec)
                c2_elmo = self.fulltrainelmo.get(str(self.fulltrainidx[c2id]))
                c2_emb = [np.concatenate([c2_w2v[k], c2_elmo[k]])
                          for k in range(len(c2_w2v))]
                c1, c2 = treekernel.similar_terminals(c1, c2)

                # Normalized kernels: K(a, b) / sqrt(K(a, a) * K(b, b)).
                kq1 = self.memoize(q1id, q1, q1_emb, q1id, q1, q1_emb,
                                   treekernel)
                kc1 = self.memoize(c1id, c1, c1_emb, c1id, c1, c1_emb,
                                   treekernel)
                kq1c1 = float(self.memoize(q1id, q1, q1_emb,
                                           c1id, c1, c1_emb,
                                           treekernel)) / np.sqrt(kq1 * kc1)
                kq2 = self.memoize(q2id, q2, q2_emb, q2id, q2, q2_emb,
                                   treekernel)
                kc2 = self.memoize(c2id, c2, c2_emb, c2id, c2, c2_emb,
                                   treekernel)
                kq2c2 = float(self.memoize(q2id, q2, q2_emb,
                                           c2id, c2, c2_emb,
                                           treekernel)) / np.sqrt(kq2 * kc2)
                x.append(kq1c1 + kq2c2)
                print('Preparing kernel: ', percentage, i + 1, j + 1,
                      sep='\t', end='\r')
            X.append(x)
            y.append(q['label'])

        # Cache the kernel matrix; 'with' ensures the handle is closed.
        with open(KERNEL_PATH, 'wb') as fh:
            p.dump(list(zip(X, y)), fh)
        X = np.array(X)
    else:
        with open(KERNEL_PATH, 'rb') as fh:
            cached = p.load(fh)
        X = np.array([row[0] for row in cached])
        y = [row[1] for row in cached]

    self.model = self.train_svm(trainvectors=X,
                                labels=y,
                                c='search',
                                kernel='precomputed',
                                gamma='search',
                                jobs=4)
    logging.info('Finishing to train tree svm.', extra=d)
def validate(self):
    """Validate the feature-based SVM on the dev set.

    Feature vector per (query, related question) pair: the soft-cosine
    similarity between the questions, followed by one soft-cosine feature
    per related comment (0 when the comment is empty). The vector is scaled
    and feature-selected exactly as at train time, then scored with the
    trained SVM. Dumps the ranking to data/ranking.txt.

    Cleanup over previous revision: removed the unused TreeKernel instance
    and unused locals (all their uses were commented out).

    Returns:
        ranking: dict query id -> list of (gold_label, score, q2id).
        y_real: gold binary labels, one per related question.
        y_pred: predicted binary labels, one per related question.
    """
    logging.info('Validating svm.', extra=d)
    ranking = {}
    y_real, y_pred = [], []
    for i, q1id in enumerate(self.devset):
        ranking[q1id] = []
        percentage = round(float(i + 1) / len(self.devset), 2)
        print('Progress: ', percentage, i + 1, sep='\t', end='\r')

        query = self.devset[q1id]
        q1 = query['tokens_proc']
        # word2vec + ELMo token embeddings for the query.
        q1_elmo = self.develmo.get(str(self.devidx[q1id]))
        q1_w2v = features.encode(q1, self.word2vec)
        q1_emb = [np.concatenate([q1_w2v[k], q1_elmo[k]])
                  for k in range(len(q1_w2v))]

        for duplicate in query['duplicates']:
            rel_question = duplicate['rel_question']
            q2id = rel_question['id']
            q2 = rel_question['tokens_proc']

            q2_elmo = self.develmo.get(str(self.devidx[q2id]))
            q2_w2v = features.encode(q2, self.word2vec)
            q2_emb = [np.concatenate([q2_w2v[k], q2_elmo[k]])
                      for k in range(len(q2_w2v))]

            # Soft-cosine between query and related question.
            X = [self.simbow.score(q1, q1_emb, q2, q2_emb)]

            # One soft-cosine feature per related comment.
            for comment in duplicate['rel_comments']:
                q3 = comment['tokens_proc']
                simbow_q1q3 = 0
                if len(q3) > 0:
                    q3_elmo = self.develmo.get(
                        str(self.devidx[comment['id']]))
                    q3_w2v = features.encode(q3, self.word2vec)
                    q3_emb = [np.concatenate([q3_w2v[k], q3_elmo[k]])
                              for k in range(len(q3_w2v))]
                    simbow_q1q3 = self.simbow.score(q1, q1_emb, q3, q3_emb)
                X.append(simbow_q1q3)

            # Scale and feature-select with the fitted train-time transforms.
            X = self.scaler.transform([X])
            X = self.feat_selector.transform(X)
            score = self.model.decision_function(X)[0]
            pred_label = self.model.predict(X)[0]
            y_pred.append(pred_label)
            real_label = 1 if rel_question['relevance'] != 'Irrelevant' else 0
            y_real.append(real_label)
            ranking[q1id].append((real_label, score, q2id))

    with open('data/ranking.txt', 'w') as f:
        for q1id in ranking:
            for row in ranking[q1id]:
                label = 'true' if row[0] == 1 else 'false'
                f.write('\t'.join([str(q1id), str(row[2]), str(0),
                                   str(row[1]), label, '\n']))
    logging.info('Finishing to validate svm.', extra=d)
    return ranking, y_real, y_pred
def train(self):
    """Train the feature-based SVM.

    Builds one feature vector per training pair (soft-cosine between the
    two questions, plus one soft-cosine feature per comment), caches the
    vectors at FEATURE_PATH (pickle), scales them to [-1, 1], selects
    features via LassoCV coefficients, and grid-searches an SVM.

    Fixes over previous revision: pickle file handles are closed via
    ``with`` (they were leaked); the unused TreeKernel instance is removed;
    the final log call now passes ``extra=d`` like every other log call.
    """
    logging.info('Training svm.', extra=d)
    self.bm25_model, self.avg_idf, self.bm25_qid_index = features.init_bm25(
        traindata=self.trainset, devdata=self.devset, testdata=[])

    if not os.path.exists(FEATURE_PATH):
        X, y = [], []
        for i, query_question in enumerate(self.traindata):
            percentage = round(float(i + 1) / len(self.traindata), 2)
            print('Preparing traindata: ', percentage, i + 1,
                  sep='\t', end='\r')
            q1id = query_question['q1_id']
            q2id = query_question['q2_id']
            q1, q2 = query_question['q1'], query_question['q2']
            x = []
            # word2vec + ELMo token embeddings for both questions.
            q1_elmo = self.trainelmo.get(str(self.trainidx[q1id]))
            q1_w2v = features.encode(q1, self.word2vec)
            q1_emb = [np.concatenate([q1_w2v[k], q1_elmo[k]])
                      for k in range(len(q1_w2v))]
            q2_elmo = self.trainelmo.get(str(self.trainidx[q2id]))
            q2_w2v = features.encode(q2, self.word2vec)
            q2_emb = [np.concatenate([q2_w2v[k], q2_elmo[k]])
                      for k in range(len(q2_w2v))]

            # Soft-cosine between the two questions.
            x.append(self.simbow.score(q1, q1_emb, q2, q2_emb))

            # One soft-cosine feature per comment (0 when empty).
            for comment in query_question['comments']:
                q3id = comment['id']
                # NOTE(review): validate() reads 'tokens_proc' here while
                # train uses 'tokens' — confirm this asymmetry is intended.
                q3 = comment['tokens']
                simbow_q1q3 = 0
                if len(q3) > 0:
                    q3_elmo = self.trainelmo.get(str(self.trainidx[q3id]))
                    q3_w2v = features.encode(q3, self.word2vec)
                    q3_emb = [np.concatenate([q3_w2v[k], q3_elmo[k]])
                              for k in range(len(q3_w2v))]
                    simbow_q1q3 = self.simbow.score(q1, q1_emb, q3, q3_emb)
                x.append(simbow_q1q3)
            X.append(x)
            y.append(query_question['label'])

        # Cache features; 'with' ensures the handle is closed.
        with open(FEATURE_PATH, 'wb') as fh:
            p.dump(list(zip(X, y)), fh)
    else:
        with open(FEATURE_PATH, 'rb') as fh:
            cached = p.load(fh)
        X = [row[0] for row in cached]
        y = [row[1] for row in cached]

    # Scale features to [-1, 1] (the same scaler is reused in validate()).
    self.scaler = MinMaxScaler(feature_range=(-1, 1))
    self.scaler.fit(X)
    X = self.scaler.transform(X)

    # L1-based feature selection (the selector is reused in validate()).
    clf = LassoCV(cv=10)
    self.feat_selector = SelectFromModel(clf)
    self.feat_selector.fit(X, y)
    X = self.feat_selector.transform(X)

    self.model = self.train_svm(trainvectors=X,
                                labels=y,
                                c='search',
                                kernel='search',
                                gamma='search',
                                degree='search',
                                jobs=4)
    logging.info('Finishing to train svm.', extra=d)