class SemevalEnsemble:
    """Ensemble ranker combining BM25, translation, soft-cosine and tree-kernel scores.

    A regression classifier (``Model``) is trained on the BM25, translation
    and soft-cosine scores of the training pairs.  Its score is then linearly
    interpolated with the tree-kernel score using the weight ``theta``, which
    is selected on the development set.

    The ``stop``, ``lowercase``, ``punctuation`` and ``vector`` arguments are
    dictionaries indexed by component name (``'bm25'``, ``'translation'``,
    ``'softcosine'``, ``'kernel'``) as read by the ``train_*`` methods.
    Construction trains every component (or loads its cached scores).
    """

    def __init__(self, stop=None, lowercase=None, punctuation=None,
                 vector=None, scale=True, w2vdim=300, kernel_path='',
                 alpha=0.8, sigma=0.2):
        # ``None`` replaces the former mutable ``{}`` defaults, which were
        # shared between every instance created without these arguments.
        self.stop = {} if stop is None else stop
        self.lowercase = {} if lowercase is None else lowercase
        self.punctuation = {} if punctuation is None else punctuation
        self.vector = {} if vector is None else vector
        self.scale = scale
        self.alpha = alpha
        self.sigma = sigma
        self.kernel_path = kernel_path
        self.w2vdim = w2vdim
        # Interpolation weight; overwritten by the search in ``train``.
        self.theta = 0.9

        self.ensemble = Model()
        self.train()

    # ------------------------------------------------------------------
    # helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _load_or_build(path, build):
        """Return the pickled object at ``path``, building and caching it if absent.

        NOTE: the cache is read with pickle, so ``path`` must only ever point
        at files written by this code.
        """
        if os.path.exists(path):
            with open(path, 'rb') as f:
                return p.load(f)
        data = build()
        with open(path, 'wb') as f:
            p.dump(data, f)
        return data

    def _feature_vector(self, bm25, translation, softcosine, q1id, q2id):
        """Build the (optionally scaled) classifier input for one question pair."""
        x = [
            bm25[q1id][q2id][0],
            translation[q1id][q2id][0],
            softcosine[q1id][q2id][0],
        ]
        if self.scale:
            x = self.scaler.transform([x])[0]
        return x

    def format(self, ranking):
        """Turn ``{q1id: [(label, score, q2id), ...]}`` into ``{q1id: {q2id: (score, label)}}``."""
        new_ranking = {}
        for q1id in ranking:
            new_ranking[q1id] = {}
            for real_label, score, q2id in ranking[q1id]:
                new_ranking[q1id][q2id] = (score, real_label)
        return new_ranking

    # ------------------------------------------------------------------
    # training
    # ------------------------------------------------------------------
    def train(self):
        """Train all components, then pick ``theta`` on the development set.

        ``theta`` is set to the candidate in 0.0..1.0 (step 0.1) with the
        highest ``map_model`` returned by ``evaluate``; it keeps its initial
        value when no candidate scores above 0.0.
        """
        # self.train_feature()
        self.train_kernel()
        self.train_classifier()

        # Classifier and kernel scores do not depend on theta, so score each
        # development pair once instead of once per candidate theta.
        scored = {}
        for q1id in self.devkernel:
            scored[q1id] = []
            for q2id in self.devkernel[q1id]:
                x = self._feature_vector(self.devbm25, self.devtranslation,
                                         self.devsoftcosine, q1id, q2id)
                clfscore, pred_label = self.ensemble.score(x)
                kernelscore = self.devkernel[q1id][q2id][0]
                scored[q1id].append((pred_label, clfscore, kernelscore, q2id))

        # finding theta in development set
        thetas = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
        best_map = 0.0
        for theta in thetas:
            ranking = {}
            for q1id, rows in scored.items():
                ranking[q1id] = [
                    (pred_label,
                     (theta * clfscore) + ((1 - theta) * kernelscore),
                     q2id)
                    for pred_label, clfscore, kernelscore, q2id in rows
                ]

            map_baseline, map_model = evaluate(copy.copy(ranking),
                                               prepare_gold(DEV_GOLD_PATH))
            if map_model > best_map:
                best_map = map_model
                print('MAP baseline', map_baseline)
                print('MAP: ', map_model)
                print(10 * '-')
                self.theta = theta

    def train_kernel(self):
        """Compute (or load from cache) tree-kernel scores for every data split."""
        vector = self.vector['kernel']
        lowercase = self.lowercase['kernel']
        path = os.path.join(
            'ensemble', 'kernel.lower_' + str(lowercase) + '.vector_' +
            vector + '.vecdim_' + str(self.w2vdim))

        def build():
            self.kernel = SemevalTreeKernel(smoothed=True,
                                            vector=vector,
                                            lowercase=lowercase,
                                            tree='subj_tree',
                                            kernel_path=self.kernel_path,
                                            w2vdim=self.w2vdim)
            kernel = self.kernel
            train, _, _, _ = kernel.test(kernel.traindata, kernel.trainidx,
                                         kernel.trainelmo, test_='train')
            train = self.format(train)
            dev, _, _, _ = kernel.validate()
            dev = self.format(dev)
            test2016, _, _, _ = kernel.test(kernel.test2016data,
                                            kernel.test2016idx,
                                            kernel.test2016elmo,
                                            test_='test2016')
            test2016 = self.format(test2016)
            test2017, _, _, _ = kernel.test(kernel.test2017data,
                                            kernel.test2017idx,
                                            kernel.test2017elmo,
                                            test_='test2017')
            test2017 = self.format(test2017)
            return {
                'train': train,
                'dev': dev,
                'test2016': test2016,
                'test2017': test2017
            }

        data = self._load_or_build(path, build)
        self.trainkernel = data['train']
        self.devkernel = data['dev']
        self.test2016kernel = data['test2016']
        self.test2017kernel = data['test2017']

    def _bm25_scores(self):
        """Return cached-or-computed BM25 scores for all splits."""
        lowercase, stop, punctuation = self.lowercase['bm25'], self.stop[
            'bm25'], self.punctuation['bm25']
        path = os.path.join(
            'ensemble', 'bm25.lower_' + str(lowercase) + '.stop_' +
            str(stop) + '.punct_' + str(punctuation))

        def build():
            bm25 = SemevalBM25(stop=stop,
                               lowercase=lowercase,
                               punctuation=punctuation,
                               proctrain=True)
            return {
                'train': self.format(bm25.test(bm25.traindata)),
                'dev': self.format(bm25.validate()),
                'test2016': self.format(bm25.test(bm25.test2016data)),
                'test2017': self.format(bm25.test(bm25.test2017data))
            }

        return self._load_or_build(path, build)

    def _translation_scores(self):
        """Return cached-or-computed translation-model scores for all splits."""
        vector = self.vector['translation']
        lowercase, stop, punctuation = self.lowercase[
            'translation'], self.stop['translation'], self.punctuation[
                'translation']
        path = os.path.join(
            'ensemble', 'translation.lower_' + str(lowercase) + '.stop_' +
            str(stop) + '.punct_' + str(punctuation) + '.vector_' +
            str(vector) + '.vecdim_' + str(self.w2vdim))

        def build():
            model = SemevalTranslation(alpha=self.alpha,
                                       sigma=self.sigma,
                                       punctuation=punctuation,
                                       proctrain=True,
                                       vector=vector,
                                       stop=stop,
                                       lowercase=lowercase,
                                       w2vdim=self.w2vdim)
            return {
                'train': self.format(
                    model.test(model.traindata, model.trainidx,
                               model.trainelmo)),
                'dev': self.format(model.validate()),
                'test2016': self.format(
                    model.test(model.test2016data, model.test2016idx,
                               model.test2016elmo)),
                'test2017': self.format(
                    model.test(model.test2017data, model.test2017idx,
                               model.test2017elmo))
            }

        return self._load_or_build(path, build)

    def _softcosine_scores(self):
        """Return cached-or-computed soft-cosine scores for all splits."""
        vector = self.vector['softcosine']
        lowercase, stop, punctuation = self.lowercase['softcosine'], self.stop[
            'softcosine'], self.punctuation['softcosine']
        path = os.path.join(
            'ensemble', 'softcosine.lower_' + str(lowercase) + '.stop_' +
            str(stop) + '.punct_' + str(punctuation) + '.vector_' +
            str(vector) + '.vecdim_' + str(self.w2vdim))

        def build():
            model = SemevalSoftCosine(stop=stop,
                                      vector=vector,
                                      lowercase=lowercase,
                                      punctuation=punctuation,
                                      proctrain=True,
                                      w2vdim=self.w2vdim)
            return {
                'train': self.format(
                    model.test(model.traindata, model.trainidx,
                               model.trainelmo)),
                'dev': self.format(model.validate()),
                'test2016': self.format(
                    model.test(model.test2016data, model.test2016idx,
                               model.test2016elmo)),
                'test2017': self.format(
                    model.test(model.test2017data, model.test2017idx,
                               model.test2017elmo))
            }

        return self._load_or_build(path, build)

    def train_classifier(self):
        """Gather component scores for all splits and fit the regression ensemble."""
        data = self._bm25_scores()
        self.trainbm25 = data['train']
        self.devbm25 = data['dev']
        self.test2016bm25 = data['test2016']
        self.test2017bm25 = data['test2017']

        data = self._translation_scores()
        self.traintranslation = data['train']
        self.devtranslation = data['dev']
        self.test2016translation = data['test2016']
        self.test2017translation = data['test2017']

        data = self._softcosine_scores()
        self.trainsoftcosine = data['train']
        self.devsoftcosine = data['dev']
        self.test2016softcosine = data['test2016']
        self.test2017softcosine = data['test2017']

        self.X, self.y = [], []
        for q1id in self.trainbm25:
            for q2id in self.trainbm25[q1id]:
                self.X.append([
                    self.trainbm25[q1id][q2id][0],
                    self.traintranslation[q1id][q2id][0],
                    self.trainsoftcosine[q1id][q2id][0],
                ])
                self.y.append(self.trainbm25[q1id][q2id][1])

        if self.scale:
            self.scaler = MinMaxScaler(feature_range=(-1, 1))
            self.scaler.fit(self.X)
            self.X = self.scaler.transform(self.X)
        self.ensemble.train_regression(trainvectors=self.X,
                                       labels=self.y,
                                       c='search',
                                       penalty='search',
                                       tol='search',
                                       gridsearch='brutal',
                                       jobs=10)

    @staticmethod
    def _cosine_features(semeval, data, idx, elmo):
        """Cosine similarity between the mean embeddings of each question pair."""
        features = {}
        for q1id in data:
            features[q1id] = {}
            for q2id in data[q1id]:
                pair = data[q1id][q2id]
                q1, q2 = pair['q1'], pair['q2']
                q1emb = np.mean(semeval.encode(q1id, q1, idx, elmo), axis=0)
                q2emb = np.mean(semeval.encode(q2id, q2, idx, elmo), axis=0)
                cos = cosine_similarity([q1emb], [q2emb])[0][0]
                features[q1id][q2id] = (cos, 0)
        return features

    def train_feature(self):
        """Compute mean-embedding cosine features for every split.

        Currently not called by ``train`` (the call is commented out there).
        """
        lowercase, stop, punctuation = self.lowercase['softcosine'], self.stop[
            'softcosine'], self.punctuation['softcosine']
        semeval = SemevalSoftCosine(stop=stop,
                                    vector='word2vec',
                                    lowercase=lowercase,
                                    punctuation=punctuation,
                                    proctrain=True)

        self.trainfeature = self._cosine_features(
            semeval, semeval.traindata, semeval.trainidx, semeval.trainelmo)
        self.devfeature = self._cosine_features(
            semeval, semeval.devdata, semeval.devidx, semeval.develmo)
        self.test2016feature = self._cosine_features(
            semeval, semeval.test2016data, semeval.test2016idx,
            semeval.test2016elmo)
        self.test2017feature = self._cosine_features(
            semeval, semeval.test2017data, semeval.test2017idx,
            semeval.test2017elmo)

    # ------------------------------------------------------------------
    # inference
    # ------------------------------------------------------------------
    def test(self, set_='dev'):
        """Rank the pairs of one split with the interpolated ensemble score.

        ``set_`` is ``'dev'``, ``'train'`` or ``'test2016'``; any other value
        selects the 2017 test split.

        Returns ``(ranking, y_real, y_pred, parameter_settings)`` where
        ``ranking`` maps each q1id to a list of
        ``(pred_label, score, q2id)`` tuples.
        """
        # The 'train' branch used to read ``self.trainfeature``, which only
        # exists after ``train_feature()`` (not run by ``train``) and was never
        # used, so asking for the training split raised AttributeError.
        prefix = set_ if set_ in ('dev', 'train', 'test2016') else 'test2017'
        bm25 = getattr(self, prefix + 'bm25')
        translation = getattr(self, prefix + 'translation')
        softcosine = getattr(self, prefix + 'softcosine')
        kernel = getattr(self, prefix + 'kernel')

        ranking = {}
        y_real, y_pred = [], []
        for q1id in bm25:
            ranking[q1id] = []
            for q2id in bm25[q1id]:
                x = self._feature_vector(bm25, translation, softcosine,
                                         q1id, q2id)
                clfscore, pred_label = self.ensemble.score(x)
                y_pred.append(pred_label)

                real_label = 1 if bm25[q1id][q2id][1] == 'true' else 0
                y_real.append(real_label)

                kernelscore = kernel[q1id][q2id][0]
                score = (self.theta * clfscore) + (
                    (1 - self.theta) * kernelscore)
                ranking[q1id].append((pred_label, score, q2id))

        parameter_settings = self.ensemble.return_parameter_settings(
            clf='regression')
        # The interpolation weight theta is reported under the name 'gamma'.
        parameter_settings = parameter_settings + ',gamma=' + str(
            self.theta) + ',alpha=' + str(self.alpha) + ',sigma=' + str(
                self.sigma)

        return ranking, y_real, y_pred, parameter_settings

    def save(self, ranking, path, parameter_settings):
        """Write ``parameter_settings`` and one tab-separated line per ranked pair to ``path``."""
        with open(path, 'w') as f:
            f.write(parameter_settings)
            f.write('\n')
            for q1id in ranking:
                for row in ranking[q1id]:
                    label = 'true' if row[0] == 1 else 'false'
                    # The trailing '\n' is joined like a column, so each line
                    # ends with a tab; kept to leave the output unchanged.
                    f.write('\t'.join([
                        str(q1id),
                        str(row[2]),
                        str(0),
                        str(row[1]), label, '\n'
                    ]))
class SemevalTreeKernel(Semeval):
    """SVM over tree-kernel similarities to the training question pairs.

    Every question pair is represented by one value per training pair: the
    sum of the normalized tree kernels between the two first questions and
    between the two second questions.  These vectors are fed to an SVM with
    ``kernel='precomputed'``.  Feature dictionaries are cached with pickle
    under ``kernel/<split>/<kernel_path>``.
    """

    def __init__(self,
                 alpha=0,
                 decay=1,
                 ignore_leaves=True,
                 smoothed=True,
                 vector='word2vec',
                 w2vdim=300,
                 lowercase=True,
                 tree='tree',
                 kernel_path=KERNEL_PATH):
        Semeval.__init__(self,
                         vector=vector,
                         stop=False,
                         lowercase=lowercase,
                         punctuation=False,
                         w2vdim=w2vdim)
        self.path = kernel_path
        self.tree = tree
        # Cache of raw kernel values: memoization[id_a][id_b] == kernel(a, b).
        self.memoization = {}
        self.svm = Model()
        self.flat_traindata()
        self.treekernel = TreeKernel(alpha=alpha,
                                     decay=decay,
                                     ignore_leaves=ignore_leaves,
                                     smoothed=smoothed,
                                     lowercase=lowercase)
        self.train()

        # ``additional`` is not defined in this class; presumably set by
        # ``Semeval.__init__`` and dropped here to free memory (TODO confirm).
        del self.additional

    def memoize(self, q1id, q1, q1_emb, q1_token2lemma, q2id, q2, q2_emb,
                q2_token2lemma, alignments):
        """Return the tree kernel of two questions, cached symmetrically by id."""
        row = self.memoization.setdefault(q1id, {})
        if q2id in row:
            return row[q2id]

        column = self.memoization.setdefault(q2id, {})
        if q1id in column:
            return column[q1id]

        k = self.treekernel(q1, q1_emb, q1_token2lemma, q2, q2_emb,
                            q2_token2lemma, alignments)
        self.memoization[q1id][q2id] = k
        self.memoization[q2id][q1id] = k
        return k

    def flat_traindata(self):
        """Flatten ``self.traindata`` into ``self.flattraindata``, a list of pairs."""
        self.flattraindata = [
            self.traindata[q1id][q2id]
            for q1id in self.traindata
            for q2id in self.traindata[q1id]
        ]

    def get_alignment(self, c1, c2):
        """Return a ``len(c1)`` x ``len(c2)`` matrix of alignment weights.

        Pairs missing from ``self.alignments`` get weight 0.0.
        """
        alignments = []
        for w in c1:
            alignments_i = []
            for t in c2:
                try:
                    w_t = self.alignments[t[0]][t][w[0]][w]
                except (KeyError, IndexError, TypeError):
                    # Unknown pair or empty token: no alignment evidence.
                    # (Was a bare ``except:``, which also swallowed
                    # KeyboardInterrupt.)
                    w_t = 0.0
                alignments_i.append(w_t)
            alignments.append(alignments_i)
        return alignments

    def _question(self, pair, side, elmoidx, elmovec):
        """Collect what the kernel needs for one question of ``pair``.

        ``side`` is ``'q1'`` or ``'q2'``.  Returns
        ``(id, tokens, tree, embedding, token2lemma, self_kernel)`` where
        ``self_kernel`` is the kernel of the question with itself, used for
        normalization.
        """
        qid = pair[side + '_id']
        tokens = pair[side + '_full']
        tree = pair[side + '_tree'] if self.tree == 'tree' else pair[
            'subj_' + side + '_tree']
        emb = self.encode(qid, tokens, elmoidx, elmovec)
        token2lemma = dict(zip(tokens, pair[side + '_lemmas']))
        alignments = self.get_alignment(
            tokens, tokens) if self.vector == 'alignments' else []
        self_kernel = self.memoize(qid, tree, emb, token2lemma, qid, tree,
                                   emb, token2lemma, alignments)
        return qid, tokens, tree, emb, token2lemma, self_kernel

    def _normalized_kernel(self, a, b):
        """Kernel between two ``_question`` tuples divided by sqrt(k(a,a) * k(b,b))."""
        aid, atokens, atree, aemb, alemma, ak = a
        bid, btokens, btree, bemb, blemma, bk = b
        if ak == 0 or bk == 0:
            return 0.0
        alignments = self.get_alignment(
            atokens, btokens) if self.vector == 'alignments' else []
        raw = self.memoize(aid, atree, aemb, alemma, bid, btree, bemb,
                           blemma, alignments)
        return float(raw) / np.sqrt(ak * bk)  # normalized

    def extract_features(self, procdata, elmoidx, elmovec):
        """Compute the kernel feature vector of every pair in ``procdata``.

        Returns ``(feat, X, y)``: ``feat[q1id][q2id] == (x, label)``, and
        ``X`` / ``y`` hold the same vectors and labels as flat lists.
        """
        feat, X, y = {}, [], []
        for i, q1key in enumerate(procdata):
            feat[q1key] = {}
            percentage = round(float(i + 1) / len(procdata), 2)
            for q2key in procdata[q1key]:
                # The loop variables used to be overwritten with the ids
                # stored inside the pair; separate names keep the dictionary
                # lookups tied to the keys actually being iterated.
                q_pair = procdata[q1key][q2key]
                x = []
                query1 = self._question(q_pair, 'q1', elmoidx, elmovec)
                query2 = self._question(q_pair, 'q2', elmoidx, elmovec)

                if i % 10 == 0:
                    print('Path: ',
                          self.path,
                          'Progress: ',
                          percentage,
                          i + 1,
                          sep=10 * ' ',
                          end='\r')

                for c in self.flattraindata:
                    train1 = self._question(c, 'q1', self.trainidx,
                                            self.trainelmo)
                    train2 = self._question(c, 'q2', self.trainidx,
                                            self.trainelmo)
                    kq1c1 = self._normalized_kernel(query1, train1)
                    kq2c2 = self._normalized_kernel(query2, train2)
                    x.append(kq1c1 + kq2c2)

                y_ = q_pair['label']
                feat[q1key][q2key] = (x, y_)
                X.append(x)
                y.append(y_)
        return feat, X, y

    def _load_or_extract(self, path, data, elmoidx, elmovec):
        """Load cached features from ``path`` or extract and cache them.

        NOTE: the cache is read with pickle; only point it at files written
        by this code.
        """
        if os.path.exists(path):
            with open(path, 'rb') as f:
                return p.load(f)
        feat, _, _ = self.extract_features(data, elmoidx, elmovec)
        with open(path, 'wb') as f:
            p.dump(feat, f)
        return feat

    def _rank(self, feat):
        """Score every cached feature vector with the SVM."""
        ranking = {}
        y_real, y_pred = [], []
        for q1id in feat:
            ranking[q1id] = []
            for q2id in feat[q1id]:
                X = feat[q1id][q2id][0]
                score, pred_label = self.svm.score(X)
                y_pred.append(pred_label)

                real_label = feat[q1id][q2id][1]
                y_real.append(real_label)
                ranking[q1id].append((real_label, score, q2id))

        parameter_settings = self.svm.return_parameter_settings(clf='svm')
        return ranking, y_real, y_pred, parameter_settings

    def train(self):
        """Extract (or load) training features and fit the precomputed-kernel SVM."""
        path = os.path.join('kernel', 'train', self.path)
        feat = self._load_or_extract(path, self.traindata, self.trainidx,
                                     self.trainelmo)
        self.X, self.y = [], []
        for q1id in feat:
            for q2id in feat[q1id]:
                self.X.append(feat[q1id][q2id][0])
                self.y.append(feat[q1id][q2id][1])

        self.X = np.array(self.X)
        self.svm.train_svm(trainvectors=self.X,
                           labels=self.y,
                           c='search',
                           kernel='precomputed',
                           gamma='search',
                           jobs=10)

    def validate(self):
        """Rank the development set.

        Returns ``(ranking, y_real, y_pred, parameter_settings)``; ``ranking``
        maps q1id to a list of ``(real_label, score, q2id)``.
        """
        path = os.path.join('kernel', 'dev', self.path)
        feat = self._load_or_extract(path, self.devdata, self.devidx,
                                     self.develmo)
        return self._rank(feat)

    def test(self, testdata, elmoidx, elmovec, test_='test2016'):
        """Rank ``testdata``, caching features under the split named by ``test_``.

        ``test_`` is ``'test2016'``, ``'train'`` or ``'dev'``; any other value
        selects the ``test2017`` cache directory.  Returns the same tuple as
        ``validate``.
        """
        # 'dev' used to be mapped to the 'test2016' directory, so dev features
        # were read from / written over the 2016 test cache.
        split = test_ if test_ in ('test2016', 'train', 'dev') else 'test2017'
        path = os.path.join('kernel', split, self.path)

        self.testdata = testdata
        feat = self._load_or_extract(path, self.testdata, elmoidx, elmovec)
        return self._rank(feat)
class Rerank:
    """Re-rank retrieved question candidates with a regression ensemble.

    The ensemble is trained on BM25, translation and soft-cosine scores and
    applied to the candidates listed in the ``ranking`` file under
    ``SEMI_PATH``.  Constructing the object loads the data, trains, scores
    the candidates and pickles the result to ``SEMI_PATH/reranking``.
    """

    def __init__(self, stop=None, lowercase=None, punctuation=None,
                 vector=None, scale=True, alpha=0.9, sigma=0.1):
        # ``None`` replaces the former mutable ``{}`` defaults, which were
        # shared between every instance created without these arguments.
        self.stop = {} if stop is None else stop
        self.lowercase = {} if lowercase is None else lowercase
        self.punctuation = {} if punctuation is None else punctuation
        self.vector = {} if vector is None else vector
        self.scale = scale
        self.alpha = alpha
        self.sigma = sigma

        self.questions, self.ranking = self.load()

        self.ensemble = Model()
        self.train()
        ranking = self.test()
        with open(os.path.join(SEMI_PATH, 'reranking'), 'wb') as f:
            p.dump(ranking, f)

    @staticmethod
    def _load_or_build(path, build):
        """Return the pickled object at ``path``, building and caching it if absent.

        NOTE: the cache is read with pickle, so ``path`` must only ever point
        at files written by this code.
        """
        if os.path.exists(path):
            with open(path, 'rb') as f:
                return p.load(f)
        data = build()
        with open(path, 'wb') as f:
            p.dump(data, f)
        return data

    def _feature_vector(self, bm25, translation, softcosine, q1id, q2id):
        """Build the (optionally scaled) classifier input for one question pair."""
        x = [
            bm25[q1id][q2id][0],
            translation[q1id][q2id][0],
            softcosine[q1id][q2id][0],
        ]
        if self.scale:
            x = self.scaler.transform([x])[0]
        return x

    def load(self):
        """Read ``index.txt``, ``question.txt`` and ``ranking`` from ``SEMI_PATH``.

        Returns ``(questions, ranking)``: ``questions`` maps each index line
        to the token list of the question on the same line, and ``ranking``
        maps the first field of each ranking line to its remaining fields,
        each split on ``'-'``.
        """
        with open(os.path.join(SEMI_PATH, 'index.txt')) as f:
            indexes = f.read().split('\n')

        with open(os.path.join(SEMI_PATH, 'question.txt')) as f:
            questions = [
                text.replace('<SENTENCE>', ' ').split()
                for text in f.read().split('\n')
            ]

        with open(os.path.join(SEMI_PATH, 'ranking')) as f:
            # The last element (text after the final newline) is dropped.
            rows = [w.split() for w in f.read().split('\n')][:-1]

        ranking = {w[0]: w[1:] for w in rows}
        for qid in ranking:
            ranking[qid] = [w.split('-') for w in ranking[qid]]

        return dict(zip(indexes, questions)), ranking

    def format(self, ranking):
        """Turn ``{q1id: [(label, score, q2id), ...]}`` into ``{q1id: {q2id: (score, label)}}``."""
        new_ranking = {}
        for q1id in ranking:
            new_ranking[q1id] = {}
            for real_label, score, q2id in ranking[q1id]:
                new_ranking[q1id][q2id] = (score, real_label)
        return new_ranking

    def _bm25_scores(self):
        """Return cached-or-computed BM25 scores for train, dev and test."""
        lowercase, stop, punctuation = self.lowercase['bm25'], self.stop[
            'bm25'], self.punctuation['bm25']
        path = os.path.join(
            SEMI_PATH, 'bm25.lower_' + str(lowercase) + '.stop_' + str(stop) +
            '.punct_' + str(punctuation))

        def build():
            bm25 = SemiBM25(stop=stop,
                            lowercase=lowercase,
                            punctuation=punctuation)
            train = self.format(bm25.test(bm25.traindata))
            dev = self.format(bm25.validate())

            print('Testing BM25...')
            # Test scores come straight from the loaded ranking: entries 1..10
            # of each list, i.e. the first entry is skipped.
            test = {}
            for q1id in self.ranking:
                test[q1id] = {}
                for q2id, score in self.ranking[q1id][1:11]:
                    test[q1id][q2id] = (float(score), 0)
            del bm25
            return {'train': train, 'dev': dev, 'test': test}

        return self._load_or_build(path, build)

    def _translation_scores(self):
        """Return cached-or-computed translation scores for train, dev and test."""
        vector = self.vector['translation']
        lowercase, stop, punctuation = self.lowercase[
            'translation'], self.stop['translation'], self.punctuation[
                'translation']
        path = os.path.join(
            SEMI_PATH, 'translation.lower_' + str(lowercase) + '.stop_' +
            str(stop) + '.punct_' + str(punctuation) + '.vector_' +
            str(vector))

        def build():
            translation = SemiTranslation(alpha=self.alpha,
                                          sigma=self.sigma,
                                          punctuation=punctuation,
                                          stop=stop,
                                          lowercase=lowercase)
            train = self.format(translation.test(translation.traindata))
            dev = self.format(translation.validate())
            # Release the model before the threaded scoring run.
            del translation

            print('Testing Translation...')
            testdata = list(
                self.format_input(lowercase, stop, punctuation).items())
            test = run_translation_thread(lowercase=lowercase,
                                          stop=stop,
                                          punctuation=punctuation,
                                          alpha=self.alpha,
                                          sigma=self.sigma,
                                          testdata=testdata)
            return {'train': train, 'dev': dev, 'test': test}

        return self._load_or_build(path, build)

    def _softcosine_scores(self):
        """Return cached-or-computed soft-cosine scores for train, dev and test."""
        vector = self.vector['softcosine']
        lowercase, stop, punctuation = self.lowercase['softcosine'], self.stop[
            'softcosine'], self.punctuation['softcosine']
        path = os.path.join(
            SEMI_PATH, 'softcosine.lower_' + str(lowercase) + '.stop_' +
            str(stop) + '.punct_' + str(punctuation) + '.vector_' +
            str(vector))

        def build():
            softcosine = SemiSoftCosine(stop=stop,
                                        lowercase=lowercase,
                                        punctuation=punctuation)
            train = self.format(softcosine.test(softcosine.traindata))
            dev = self.format(softcosine.validate())
            # Release the model before the threaded scoring run.
            del softcosine

            print('Testing Softcosine...')
            testdata = list(
                self.format_input(lowercase, stop, punctuation).items())
            test = run_softcosine_thread(lowercase=lowercase,
                                         stop=stop,
                                         punctuation=punctuation,
                                         testdata=testdata)
            return {'train': train, 'dev': dev, 'test': test}

        return self._load_or_build(path, build)

    def train(self):
        """Gather component scores and fit the regression ensemble."""
        data = self._bm25_scores()
        self.trainbm25 = data['train']
        self.devbm25 = data['dev']
        self.testbm25 = data['test']

        data = self._translation_scores()
        self.traintranslation = data['train']
        self.devtranslation = data['dev']
        self.testtranslation = data['test']

        data = self._softcosine_scores()
        self.trainsoftcosine = data['train']
        self.devsoftcosine = data['dev']
        self.testsoftcosine = data['test']

        self.X, self.y = [], []
        for q1id in self.trainbm25:
            for q2id in self.trainbm25[q1id]:
                self.X.append([
                    self.trainbm25[q1id][q2id][0],
                    self.traintranslation[q1id][q2id][0],
                    self.trainsoftcosine[q1id][q2id][0],
                ])
                self.y.append(self.trainbm25[q1id][q2id][1])

        if self.scale:
            self.scaler = MinMaxScaler(feature_range=(-1, 1))
            self.scaler.fit(self.X)
            self.X = self.scaler.transform(self.X)
        self.ensemble.train_regression(trainvectors=self.X,
                                       labels=self.y,
                                       c='search',
                                       penalty='search',
                                       tol='search',
                                       gridsearch='brutal',
                                       jobs=10)

    def format_input(self, lowercase, stop, punctuation):
        """Preprocess each query and its candidates (ranking entries 1..10).

        Returns ``{q1id: {q2id: {'q1_id', 'q1', 'q2_id', 'q2', 'label'}}}``
        with the token lists lowercased, stripped of punctuation and of stop
        words according to the three flags.  ``label`` is always 0.
        """
        def remove_punctuation(tokens):
            return re.sub(r'[\W]+', ' ', ' '.join(tokens)).strip().split()

        def remove_stopwords(tokens):
            # ``stop_`` is not defined in this class; presumably a
            # module-level stop-word collection (TODO confirm).
            return [w for w in tokens if w.lower() not in stop_]

        def preprocess(tokens):
            tokens = [w.lower() for w in tokens] if lowercase else tokens
            tokens = remove_punctuation(tokens) if punctuation else tokens
            tokens = remove_stopwords(tokens) if stop else tokens
            return tokens

        procset = {}
        for i, q1id in enumerate(self.ranking):
            procset[q1id] = {}
            percentage = str(
                round((float(i + 1) / len(self.ranking)) * 100, 2)) + '%'
            print('Process: ', percentage, end='\r')

            q1 = preprocess(self.questions[q1id])
            for q2id, score in self.ranking[q1id][1:11]:
                q2 = preprocess(self.questions[q2id])
                procset[q1id][q2id] = {
                    'q1_id': q1id,
                    'q1': q1,
                    'q2_id': q2id,
                    'q2': q2,
                    'label': 0
                }
        return procset

    def test(self):
        """Score every test pair; returns ``{q1id: {q2id: {'score', 'label'}}}``."""
        bm25 = self.testbm25
        translation = self.testtranslation
        softcosine = self.testsoftcosine

        ranking = {}
        for q1id in bm25:
            ranking[q1id] = {}
            for q2id in bm25[q1id]:
                x = self._feature_vector(bm25, translation, softcosine,
                                         q1id, q2id)
                clfscore, pred_label = self.ensemble.score(x)
                ranking[q1id][q2id] = {'score': clfscore, 'label': pred_label}
        return ranking
class QuoraSVM(Quora):
    """Duplicate-question classifier over similarity features (Quora data).

    ``features`` is a comma-separated list drawn from ``bm25``,
    ``softcosine``, ``translation`` and ``cosine``.  Each enabled feature
    contributes one value per question pair; the vectors are min-max scaled
    to [-1, 1] and fed to an SVM (``model='svm'``) or, for any other value of
    ``model``, a regression classifier.  Construction trains the model.
    """

    def __init__(self,
                 model='svm',
                 features='bm25,',
                 comment_features='bm25,',
                 stop=True,
                 vector='word2vec',
                 path=FEATURES_PATH,
                 alpha=0.1,
                 sigma=0.9,
                 gridsearch='random'):
        Quora.__init__(self, stop=stop, vector=vector)
        self.path = path
        self.features = features.split(',')
        self.comment_features = comment_features.split(',')
        self.gridsearch = gridsearch
        self.svm = Model()

        self.model = model
        # Component models are only built when some feature list names them.
        wanted = self.features + self.comment_features
        self.bm25 = QuoraBM25(stop=stop) if 'bm25' in wanted else None
        self.cosine = QuoraCosine(stop=stop) if 'cosine' in wanted else None
        self.softcosine = QuoraSoftCosine(
            stop=stop, vector=vector) if 'softcosine' in wanted else None
        self.translation = QuoraTranslations(
            alpha=alpha, sigma=sigma, stop=stop,
            vector=self.vector) if 'translation' in wanted else None

        self.train()

    def extract_features(self, pairdata, elmoidx, elmovec, fullelmoidx,
                         fullelmovec):
        """Compute the feature vector and label of every pair in ``pairdata``.

        Pairs that raise during extraction are reported on stdout and left
        out of the result.  Returns ``(feat, X, y)`` where ``feat`` is a list
        of ``(x, label)`` tuples and ``X`` / ``y`` are the same data as
        separate lists.
        """
        # The ELMo tables to use depend only on ``self.stop``: choose once.
        if self.stop:
            idx, vec = elmoidx, elmovec
        else:
            idx, vec = fullelmoidx, fullelmovec

        X, y = [], []
        feat = []
        for i, pair in enumerate(pairdata):
            try:
                percentage = round(float(i + 1) / len(pairdata), 2)
                print('Extracting features: ',
                      percentage,
                      i + 1,
                      sep='\t',
                      end='\r')
                q1id = pair['qid1'] if 'qid1' in pair else str(i) + '1'
                q2id = pair['qid2'] if 'qid2' in pair else str(i) + '2'
                q1, q2 = pair['tokens_proc1'], pair['tokens_proc2']
                x = []

                q1_emb = self.encode(q1id, q1, idx, vec)
                # Encoded on first use and shared by softcosine and
                # translation instead of being encoded once for each.
                q2_emb = None

                # bm25
                if 'bm25' in self.features:
                    score = self.bm25.model(q1, q2id)
                    x.append(score)
                # softcosine
                if 'softcosine' in self.features:
                    q2_emb = self.encode(q2id, q2, idx, vec)
                    score = self.softcosine.model(q1, q1_emb, q2, q2_emb)
                    x.append(score)
                # translation
                if 'translation' in self.features:
                    if q2_emb is None:
                        q2_emb = self.encode(q2id, q2, idx, vec)
                    lmprob, trmprob, trlmprob, proctime = \
                        self.translation.model(q1, q1_emb, q2, q2_emb)
                    x.append(trlmprob)
                # cosine
                if 'cosine' in self.features:
                    score = self.cosine.model(q1, q2)
                    x.append(score)

                y_ = int(pair['is_duplicate'])
                feat.append((x, y_))
                X.append(x)
                y.append(y_)
            except Exception:
                # Best effort: skip malformed pairs.  ``Exception`` (rather
                # than a bare ``except``) lets Ctrl-C and SystemExit through.
                print('Error')
                print(pair)
        return feat, X, y

    def _load_or_extract(self, path, dataset, elmoidx, elmovec, fullelmoidx,
                         fullelmovec):
        """Load cached features from ``path`` or extract and cache them.

        NOTE: the cache is read with pickle; only point it at files written
        by this code.
        """
        if os.path.exists(path):
            with open(path, 'rb') as f:
                return p.load(f)
        feat, _, _ = self.extract_features(dataset, elmoidx, elmovec,
                                           fullelmoidx, fullelmovec)
        with open(path, 'wb') as f:
            p.dump(feat, f)
        return feat

    def train(self):
        """Extract (or load) training features, fit the scaler and the classifier."""
        path = os.path.join(FEATURES_PATH, 'train', self.path)
        feat = self._load_or_extract(path, self.trainset, self.trainidx,
                                     self.trainelmo, self.fulltrainidx,
                                     self.fulltrainelmo)
        self.X = [row[0] for row in feat]
        self.y = [row[1] for row in feat]

        self.scaler = MinMaxScaler(feature_range=(-1, 1))
        self.scaler.fit(self.X)
        self.X = self.scaler.transform(self.X)

        if self.model == 'svm':
            self.svm.train_svm(trainvectors=self.X,
                               labels=self.y,
                               c='search',
                               kernel='search',
                               gamma='search',
                               jobs=10,
                               gridsearch=self.gridsearch)
        else:
            self.svm.train_regression(trainvectors=self.X,
                                      labels=self.y,
                                      c='search',
                                      penalty='search',
                                      tol='search',
                                      gridsearch=self.gridsearch,
                                      jobs=10)

    def validate(self):
        """Predict the development set; returns ``(y_real, y_pred, parameter_settings)``."""
        path = os.path.join(FEATURES_PATH, 'dev', self.path)
        feat = self._load_or_extract(path, self.devset, self.devidx,
                                     self.develmo, self.fulldevidx,
                                     self.fulldevelmo)

        y_real, y_pred = [], []
        for pair in feat:
            X = self.scaler.transform([pair[0]])[0]
            score, pred_label = self.svm.score(X)
            y_pred.append(pred_label)
            y_real.append(pair[1])

        parameter_settings = self.svm.return_parameter_settings(
            clf=self.model)
        return y_real, y_pred, parameter_settings
class SemevalSVM(Semeval):
    """Question ranker over similarity features (SemEval data).

    ``features`` is a comma-separated list drawn from ``bm25``,
    ``softcosine``, ``translation`` and ``cosine``.  For every enabled
    feature a pair contributes the score of the query against the candidate
    question followed by one score per candidate comment (0 for empty
    comments).  Vectors are min-max scaled to [-1, 1] and fed to an SVM
    (``model='svm'``) or, otherwise, a regression classifier.  Construction
    trains the model.
    """

    # Order in which enabled features are appended to the vector.
    _FEATURE_ORDER = ('bm25', 'softcosine', 'translation', 'cosine')

    def __init__(self,
                 model='svm',
                 features='bm25,',
                 comment_features='bm25,',
                 stop=True,
                 vector='word2vec',
                 lowercase=True,
                 punctuation=True,
                 proctrain=True,
                 path=FEATURES_PATH,
                 alpha=0.1,
                 sigma=0.9,
                 gridsearch='random'):
        Semeval.__init__(self,
                         stop=stop,
                         vector=vector,
                         lowercase=lowercase,
                         punctuation=punctuation)
        self.path = path
        self.features = features.split(',')
        self.comment_features = comment_features.split(',')
        self.gridsearch = gridsearch
        self.svm = Model()

        self.model = model
        # Component models are only built when some feature list names them.
        wanted = self.features + self.comment_features
        self.bm25 = SemevalBM25(
            stop=stop,
            lowercase=lowercase,
            punctuation=punctuation,
            proctrain=proctrain) if 'bm25' in wanted else None
        self.cosine = SemevalCosine(
            stop=stop,
            lowercase=lowercase,
            punctuation=punctuation,
            proctrain=proctrain) if 'cosine' in wanted else None
        self.softcosine = SemevalSoftCosine(
            stop=stop,
            lowercase=lowercase,
            punctuation=punctuation,
            proctrain=proctrain,
            vector=vector) if 'softcosine' in wanted else None
        self.translation = SemevalTranslation(
            alpha=alpha,
            sigma=sigma,
            stop=stop,
            lowercase=lowercase,
            punctuation=punctuation,
            proctrain=proctrain,
            vector=self.vector) if 'translation' in wanted else None

        self.train()

    def _score(self, feature, q1, q1_emb, cand_id, cand, elmoidx, elmovec):
        """Score query ``q1`` against one candidate text for a single feature."""
        if feature == 'bm25':
            return self.bm25.model(q1, cand_id)

        if feature == 'softcosine':
            if self.vector == 'alignments':
                return self.softcosine.model.score(q1, cand, self.alignments)
            cand_emb = self.encode(cand_id, cand, elmoidx, elmovec)
            return self.softcosine.model(q1, q1_emb, cand, cand_emb)

        if feature == 'translation':
            if self.vector == 'alignments':
                lmprob, trmprob, trlmprob, proctime = \
                    self.translation.model.score(q1, cand)
            else:
                cand_emb = self.encode(cand_id, cand, elmoidx, elmovec)
                lmprob, trmprob, trlmprob, proctime = self.translation.model(
                    q1, q1_emb, cand, cand_emb)
            return trlmprob

        # cosine
        return self.cosine.model(q1, cand)

    def extract_features(self, procdata, elmoidx, elmovec):
        """Compute the feature vector and label of every pair in ``procdata``.

        Returns ``(feat, X, y)``: ``feat[q1id][q2id] == (x, label)``, and
        ``X`` / ``y`` hold the same vectors and labels as flat lists.

        Two defects of the previous version are fixed here:

        * the features were tested with an ``if/elif`` chain, so only the
          first enabled feature was ever extracted; every enabled feature is
          now appended, as ``QuoraSVM.extract_features`` does.  Vectors for
          single-feature configurations are unchanged, but cached feature
          files produced with several features enabled should be regenerated;
        * with ``vector == 'alignments'`` the soft-cosine score of a comment
          was computed on the candidate question ``q2`` instead of the
          comment tokens.
        """
        X, y = [], []
        feat = {}
        for i, q1id in enumerate(procdata):
            feat[q1id] = {}
            percentage = round(float(i + 1) / len(procdata), 2)
            print('Extracting features: ',
                  percentage,
                  i + 1,
                  sep='\t',
                  end='\r')
            for q2id in procdata[q1id]:
                query_question = procdata[q1id][q2id]
                q1, q2 = query_question['q1'], query_question['q2']
                x = []
                q1_emb = self.encode(q1id, q1, elmoidx, elmovec)

                for feature in self._FEATURE_ORDER:
                    if feature not in self.features:
                        continue
                    x.append(
                        self._score(feature, q1, q1_emb, q2id, q2, elmoidx,
                                    elmovec))
                    for comment in query_question['comments']:
                        q3id = comment['id']
                        q3 = comment['tokens']
                        if len(q3) > 0:
                            x.append(
                                self._score(feature, q1, q1_emb, q3id, q3,
                                            elmoidx, elmovec))
                        else:
                            # Empty comment: neutral score.
                            x.append(0)

                y_ = query_question['label']
                feat[q1id][q2id] = (x, y_)
                X.append(x)
                y.append(y_)
        return feat, X, y

    def _load_or_extract(self, path, data, elmoidx, elmovec):
        """Load cached features from ``path`` or extract and cache them.

        NOTE: the cache is read with pickle; only point it at files written
        by this code.
        """
        if os.path.exists(path):
            with open(path, 'rb') as f:
                return p.load(f)
        feat, _, _ = self.extract_features(data, elmoidx, elmovec)
        with open(path, 'wb') as f:
            p.dump(feat, f)
        return feat

    def _rank(self, feat):
        """Scale and score every cached feature vector with the classifier."""
        ranking = {}
        y_real, y_pred = [], []
        for q1id in feat:
            ranking[q1id] = []
            for q2id in feat[q1id]:
                X = feat[q1id][q2id][0]
                X = self.scaler.transform([X])[0]
                score, pred_label = self.svm.score(X)
                y_pred.append(pred_label)

                real_label = feat[q1id][q2id][1]
                y_real.append(real_label)
                ranking[q1id].append((real_label, score, q2id))

        parameter_settings = self.svm.return_parameter_settings(
            clf=self.model)
        return ranking, y_real, y_pred, parameter_settings

    def train(self):
        """Extract (or load) training features, fit the scaler and the classifier."""
        path = os.path.join('feature', 'train', self.path)
        feat = self._load_or_extract(path, self.traindata, self.trainidx,
                                     self.trainelmo)
        self.X, self.y = [], []
        for q1id in feat:
            for q2id in feat[q1id]:
                self.X.append(feat[q1id][q2id][0])
                self.y.append(feat[q1id][q2id][1])

        self.X = np.array(self.X)
        self.scaler = MinMaxScaler(feature_range=(-1, 1))
        self.scaler.fit(self.X)
        self.X = self.scaler.transform(self.X)

        if self.model == 'svm':
            self.svm.train_svm(trainvectors=self.X,
                               labels=self.y,
                               c='search',
                               kernel='search',
                               gamma='search',
                               jobs=10,
                               gridsearch=self.gridsearch)
        else:
            self.svm.train_regression(trainvectors=self.X,
                                      labels=self.y,
                                      c='search',
                                      penalty='search',
                                      tol='search',
                                      gridsearch=self.gridsearch,
                                      jobs=10)

    def validate(self):
        """Rank the development set.

        Returns ``(ranking, y_real, y_pred, parameter_settings)``; ``ranking``
        maps q1id to a list of ``(real_label, score, q2id)``.
        """
        path = os.path.join('feature', 'dev', self.path)
        feat = self._load_or_extract(path, self.devdata, self.devidx,
                                     self.develmo)
        return self._rank(feat)

    def test(self, testdata, elmoidx, elmovec, test_='test2016'):
        """Rank ``testdata``; features are cached under ``test2016`` or ``test2017``.

        Any ``test_`` other than ``'test2016'`` selects the ``test2017``
        cache directory.  Returns the same tuple as ``validate``.
        """
        split = 'test2016' if test_ == 'test2016' else 'test2017'
        path = os.path.join('feature', split, self.path)

        self.testdata = testdata
        feat = self._load_or_extract(path, self.testdata, elmoidx, elmovec)
        return self._rank(feat)