def __init__(self, model='svm', features='bm25,', comment_features='bm25,',
             stop=True, vector='word2vec', lowercase=True, punctuation=True,
             proctrain=True, path=FEATURES_PATH, alpha=0.1, sigma=0.9,
             gridsearch='random'):
    """Set up the feature extractors named in the feature lists and train.

    Args:
        model: stored as ``self.model``; not otherwise used in this block.
        features: comma-separated feature names for the related question.
        comment_features: comma-separated feature names for the comments.
        stop, lowercase, punctuation, proctrain: preprocessing switches
            forwarded to every sub-system that gets instantiated.
        vector: embedding type forwarded to ``Semeval.__init__`` and to the
            soft-cosine system.
        path: stored as ``self.path``.
        alpha, sigma: forwarded to the translation system.
        gridsearch: stored as ``self.gridsearch``.
    """
    Semeval.__init__(self, stop=stop, vector=vector, lowercase=lowercase,
                     punctuation=punctuation)

    self.path = path
    self.features = features.split(',')
    self.comment_features = comment_features.split(',')
    self.gridsearch = gridsearch
    self.svm = Model()
    self.model = model

    # A sub-system is only built when its name appears in either list.
    requested = self.features + self.comment_features
    preprocessing = {
        'stop': stop,
        'lowercase': lowercase,
        'punctuation': punctuation,
        'proctrain': proctrain,
    }

    if 'bm25' in requested:
        self.bm25 = SemevalBM25(**preprocessing)
    else:
        self.bm25 = None

    if 'cosine' in requested:
        self.cosine = SemevalCosine(**preprocessing)
    else:
        self.cosine = None

    if 'softcosine' in requested:
        self.softcosine = SemevalSoftCosine(vector=vector, **preprocessing)
    else:
        self.softcosine = None

    if 'translation' in requested:
        # NOTE(review): reads self.vector rather than the `vector` argument;
        # presumably Semeval.__init__ sets it -- confirm.
        self.translation = SemevalTranslation(alpha=alpha, sigma=sigma,
                                              vector=self.vector,
                                              **preprocessing)
    else:
        self.translation = None

    self.train()
class SemevalEnsemble:
    """Ensemble ranker over BM25, translation, soft-cosine and tree-kernel scores.

    The BM25, translation and soft-cosine scores of a question pair form the
    feature vector of a regression classifier (``self.ensemble``). The final
    ranking score interpolates the classifier score with the tree-kernel
    score: ``theta * clf + (1 - theta) * kernel``. ``theta`` is picked on the
    development set by ``train``.

    ``stop``, ``lowercase``, ``punctuation`` and ``vector`` are dictionaries
    indexed by system name ('bm25', 'translation', 'softcosine', 'kernel').
    """

    SPLITS = ('train', 'dev', 'test2016', 'test2017')

    def __init__(self, stop=None, lowercase=None, punctuation=None,
                 vector=None, scale=True, w2vdim=300, kernel_path='',
                 alpha=0.8, sigma=0.2):
        """Store the configuration and immediately train the ensemble.

        ``None`` for any of the per-system dictionaries means an empty dict
        (avoids sharing one mutable default between instances).
        """
        self.stop = {} if stop is None else stop
        self.lowercase = {} if lowercase is None else lowercase
        self.punctuation = {} if punctuation is None else punctuation
        self.scale = scale
        self.vector = {} if vector is None else vector
        self.alpha = alpha
        self.sigma = sigma
        self.kernel_path = kernel_path
        self.w2vdim = w2vdim
        self.theta = 0.9
        self.ensemble = Model()
        self.train()

    # ------------------------------------------------------------------
    # helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _load_or_build(path, build):
        """Return the pickled object at *path*, or build, pickle and return it.

        NOTE: pickle must only be used on trusted files; these caches are
        expected to be produced by this very code.
        """
        if os.path.exists(path):
            with open(path, 'rb') as f:
                return p.load(f)

        data = build()
        directory = os.path.dirname(path)
        if directory:
            # The cache folder may not exist on a first run.
            os.makedirs(directory, exist_ok=True)
        with open(path, 'wb') as f:
            p.dump(data, f)
        return data

    def _collect_rankings(self, system, use_elmo=True):
        """Run *system* on every split and return the formatted rankings.

        With ``use_elmo`` the split's ``idx`` and ``elmo`` attributes are
        passed to ``system.test`` next to its data; BM25 only takes the data.
        """
        def run(split):
            args = [getattr(system, split + 'data')]
            if use_elmo:
                args.append(getattr(system, split + 'idx'))
                args.append(getattr(system, split + 'elmo'))
            return self.format(system.test(*args))

        return {
            'train': run('train'),
            'dev': self.format(system.validate()),
            'test2016': run('test2016'),
            'test2017': run('test2017'),
        }

    def _store(self, name, data):
        """Expose *data* as ``self.<split><name>`` (e.g. ``self.devbm25``)."""
        for split in self.SPLITS:
            setattr(self, split + name, data[split])

    @staticmethod
    def _feature_vector(bm25, translation, softcosine, q1id, q2id):
        """Classifier input for one pair: [bm25, translation, softcosine]."""
        return [
            bm25[q1id][q2id][0],
            translation[q1id][q2id][0],
            softcosine[q1id][q2id][0],
        ]

    def _classify(self, X):
        """Scale *X* if configured and return ``(clfscore, pred_label)``."""
        if self.scale:
            X = self.scaler.transform([X])[0]
        return self.ensemble.score(X)

    # ------------------------------------------------------------------
    # public API
    # ------------------------------------------------------------------
    def format(self, ranking):
        """Turn ``{q1id: [(label, score, q2id), ...]}`` into
        ``{q1id: {q2id: (score, label)}}``."""
        new_ranking = {}
        for q1id in ranking:
            new_ranking[q1id] = {
                q2id: (score, real_label)
                for real_label, score, q2id in ranking[q1id]
            }
        return new_ranking

    def train(self):
        """Load or compute all system scores, fit the classifier, tune theta.

        theta is chosen as the candidate with the highest model MAP on the
        development set (first one wins on ties); it keeps its initial value
        when no candidate scores above 0.
        """
        # self.train_feature()
        self.train_kernel()
        self.train_classifier()

        # The classifier output does not depend on theta: score once.
        scored = {}
        for q1id in self.devkernel:
            scored[q1id] = []
            for q2id in self.devkernel[q1id]:
                X = self._feature_vector(self.devbm25, self.devtranslation,
                                         self.devsoftcosine, q1id, q2id)
                clfscore, pred_label = self._classify(X)
                kernelscore = self.devkernel[q1id][q2id][0]
                scored[q1id].append((pred_label, clfscore, kernelscore, q2id))

        # finding theta in development set
        thetas = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
        best_map = 0.0
        for theta in thetas:
            ranking = {}
            for q1id, rows in scored.items():
                ranking[q1id] = [
                    (pred_label,
                     (theta * clfscore) + ((1 - theta) * kernelscore),
                     q2id)
                    for pred_label, clfscore, kernelscore, q2id in rows
                ]

            map_baseline, map_model = evaluate(copy.copy(ranking),
                                               prepare_gold(DEV_GOLD_PATH))
            if map_model > best_map:
                best_map = map_model
                print('MAP baseline', map_baseline)
                print('MAP: ', map_model)
                print(10 * '-')
                self.theta = theta

    def train_kernel(self):
        """Load (or compute and cache) the tree-kernel scores of every split."""
        vector = self.vector['kernel']
        lowercase = self.lowercase['kernel']
        path = os.path.join(
            'ensemble', 'kernel.lower_' + str(lowercase) + '.vector_' +
            vector + '.vecdim_' + str(self.w2vdim))

        def build():
            # Unlike the other systems the kernel object stays on self.
            self.kernel = SemevalTreeKernel(smoothed=True,
                                            vector=vector,
                                            lowercase=lowercase,
                                            tree='subj_tree',
                                            kernel_path=self.kernel_path,
                                            w2vdim=self.w2vdim)
            kernel = self.kernel
            train, _, _, _ = kernel.test(kernel.traindata, kernel.trainidx,
                                         kernel.trainelmo, test_='train')
            dev, _, _, _ = kernel.validate()
            test2016, _, _, _ = kernel.test(kernel.test2016data,
                                            kernel.test2016idx,
                                            kernel.test2016elmo,
                                            test_='test2016')
            test2017, _, _, _ = kernel.test(kernel.test2017data,
                                            kernel.test2017idx,
                                            kernel.test2017elmo,
                                            test_='test2017')
            return {
                'train': self.format(train),
                'dev': self.format(dev),
                'test2016': self.format(test2016),
                'test2017': self.format(test2017),
            }

        self._store('kernel', self._load_or_build(path, build))

    def train_classifier(self):
        """Load or compute BM25/translation/soft-cosine scores and fit the
        regression classifier on the training split."""
        # BM25
        lowercase, stop, punctuation = (self.lowercase['bm25'],
                                        self.stop['bm25'],
                                        self.punctuation['bm25'])
        path = os.path.join(
            'ensemble', 'bm25.lower_' + str(lowercase) + '.stop_' +
            str(stop) + '.punct_' + str(punctuation))

        def build_bm25():
            system = SemevalBM25(stop=stop, lowercase=lowercase,
                                 punctuation=punctuation, proctrain=True)
            return self._collect_rankings(system, use_elmo=False)

        self._store('bm25', self._load_or_build(path, build_bm25))

        # Translation
        vector = self.vector['translation']
        lowercase, stop, punctuation = (self.lowercase['translation'],
                                        self.stop['translation'],
                                        self.punctuation['translation'])
        path = os.path.join(
            'ensemble', 'translation.lower_' + str(lowercase) + '.stop_' +
            str(stop) + '.punct_' + str(punctuation) + '.vector_' +
            str(vector) + '.vecdim_' + str(self.w2vdim))

        def build_translation():
            system = SemevalTranslation(alpha=self.alpha, sigma=self.sigma,
                                        punctuation=punctuation,
                                        proctrain=True, vector=vector,
                                        stop=stop, lowercase=lowercase,
                                        w2vdim=self.w2vdim)
            return self._collect_rankings(system)

        self._store('translation',
                    self._load_or_build(path, build_translation))

        # Soft-cosine
        vector = self.vector['softcosine']
        lowercase, stop, punctuation = (self.lowercase['softcosine'],
                                        self.stop['softcosine'],
                                        self.punctuation['softcosine'])
        path = os.path.join(
            'ensemble', 'softcosine.lower_' + str(lowercase) + '.stop_' +
            str(stop) + '.punct_' + str(punctuation) + '.vector_' +
            str(vector) + '.vecdim_' + str(self.w2vdim))

        def build_softcosine():
            system = SemevalSoftCosine(stop=stop, vector=vector,
                                       lowercase=lowercase,
                                       punctuation=punctuation,
                                       proctrain=True, w2vdim=self.w2vdim)
            return self._collect_rankings(system)

        self._store('softcosine',
                    self._load_or_build(path, build_softcosine))

        # Training matrix: one row per (q1id, q2id) pair of the train split.
        self.X, self.y = [], []
        for q1id in self.trainbm25:
            for q2id in self.trainbm25[q1id]:
                self.X.append(
                    self._feature_vector(self.trainbm25,
                                         self.traintranslation,
                                         self.trainsoftcosine, q1id, q2id))
                self.y.append(self.trainbm25[q1id][q2id][1])

        if self.scale:
            self.scaler = MinMaxScaler(feature_range=(-1, 1))
            self.scaler.fit(self.X)
            self.X = self.scaler.transform(self.X)
        self.ensemble.train_regression(trainvectors=self.X,
                                       labels=self.y,
                                       c='search',
                                       penalty='search',
                                       tol='search',
                                       gridsearch='brutal',
                                       jobs=10)

    def train_feature(self):
        """Compute the cosine between mean question embeddings for every
        pair of every split (``self.<split>feature``).

        Each entry is ``(cosine, 0)``. Not called by ``train`` at the moment.
        """
        lowercase, stop, punctuation = (self.lowercase['softcosine'],
                                        self.stop['softcosine'],
                                        self.punctuation['softcosine'])
        semeval = SemevalSoftCosine(stop=stop, vector='word2vec',
                                    lowercase=lowercase,
                                    punctuation=punctuation, proctrain=True)

        def cosines(data, idx, elmo):
            feature = {}
            for q1id in data:
                feature[q1id] = {}
                for q2id in data[q1id]:
                    pair = data[q1id][q2id]
                    q1, q2 = pair['q1'], pair['q2']
                    q1emb = np.mean(semeval.encode(q1id, q1, idx, elmo),
                                    axis=0)
                    q2emb = np.mean(semeval.encode(q2id, q2, idx, elmo),
                                    axis=0)
                    cos = cosine_similarity([q1emb], [q2emb])[0][0]
                    feature[q1id][q2id] = (cos, 0)
            return feature

        self.trainfeature = cosines(semeval.traindata, semeval.trainidx,
                                    semeval.trainelmo)
        self.devfeature = cosines(semeval.devdata, semeval.devidx,
                                  semeval.develmo)
        self.test2016feature = cosines(semeval.test2016data,
                                       semeval.test2016idx,
                                       semeval.test2016elmo)
        self.test2017feature = cosines(semeval.test2017data,
                                       semeval.test2017idx,
                                       semeval.test2017elmo)

    def test(self, set_='dev'):
        """Rank the pairs of one split with the tuned ensemble.

        Args:
            set_: 'dev', 'train' or 'test2016'; any other value selects the
                2017 test split.

        Returns:
            ``(ranking, y_real, y_pred, parameter_settings)`` where
            ``ranking`` maps q1id to a list of ``(pred_label, score, q2id)``.
        """
        prefix = set_ if set_ in ('dev', 'train', 'test2016') else 'test2017'
        # The 'train' split used to read self.trainfeature as well, which is
        # never set (train_feature is disabled) and was not used afterwards.
        bm25 = getattr(self, prefix + 'bm25')
        translation = getattr(self, prefix + 'translation')
        softcosine = getattr(self, prefix + 'softcosine')
        kernel = getattr(self, prefix + 'kernel')

        ranking = {}
        y_real, y_pred = [], []
        for q1id in bm25:
            ranking[q1id] = []
            for q2id in bm25[q1id]:
                X = self._feature_vector(bm25, translation, softcosine,
                                         q1id, q2id)
                clfscore, pred_label = self._classify(X)
                y_pred.append(pred_label)
                y_real.append(1 if bm25[q1id][q2id][1] == 'true' else 0)

                kernelscore = kernel[q1id][q2id][0]
                score = (self.theta * clfscore) + (
                    (1 - self.theta) * kernelscore)
                ranking[q1id].append((pred_label, score, q2id))

        parameter_settings = self.ensemble.return_parameter_settings(
            clf='regression')
        # The interpolation weight theta is reported under the key 'gamma'.
        parameter_settings = parameter_settings + ',gamma=' + str(
            self.theta) + ',alpha=' + str(self.alpha) + ',sigma=' + str(
                self.sigma)

        return ranking, y_real, y_pred, parameter_settings

    def save(self, ranking, path, parameter_settings):
        """Write the settings line followed by one tab-separated row per pair.

        Row layout: q1id, q2id, 0, score, 'true'/'false', then a trailing tab
        before the newline (kept for compatibility with existing outputs).
        """
        with open(path, 'w') as f:
            f.write(parameter_settings)
            f.write('\n')
            for q1id in ranking:
                for row in ranking[q1id]:
                    label = 'true' if row[0] == 1 else 'false'
                    f.write('\t'.join([
                        str(q1id),
                        str(row[2]),
                        str(0),
                        str(row[1]), label, '\n'
                    ]))
def train_feature(self):
    """Compute the cosine between mean question embeddings for every pair.

    Builds a soft-cosine system (always with word2vec vectors) to get access
    to the data splits and ``encode``; fills ``self.trainfeature``,
    ``self.devfeature``, ``self.test2016feature`` and
    ``self.test2017feature`` with ``{q1id: {q2id: (cosine, 0)}}``.
    """
    lowercase, stop, punctuation = (self.lowercase['softcosine'],
                                    self.stop['softcosine'],
                                    self.punctuation['softcosine'])
    semeval = SemevalSoftCosine(stop=stop,
                                vector='word2vec',
                                lowercase=lowercase,
                                punctuation=punctuation,
                                proctrain=True)

    def cosines(data, idx, elmo):
        # One pass over a split; the same loop was previously repeated
        # verbatim for each of the four splits.
        feature = {}
        for q1id in data:
            feature[q1id] = {}
            for q2id in data[q1id]:
                pair = data[q1id][q2id]
                q1, q2 = pair['q1'], pair['q2']
                q1emb = np.mean(semeval.encode(q1id, q1, idx, elmo), axis=0)
                q2emb = np.mean(semeval.encode(q2id, q2, idx, elmo), axis=0)
                cos = cosine_similarity([q1emb], [q2emb])[0][0]
                feature[q1id][q2id] = (cos, 0)
        return feature

    self.trainfeature = cosines(semeval.traindata, semeval.trainidx,
                                semeval.trainelmo)
    self.devfeature = cosines(semeval.devdata, semeval.devidx,
                              semeval.develmo)
    self.test2016feature = cosines(semeval.test2016data, semeval.test2016idx,
                                   semeval.test2016elmo)
    self.test2017feature = cosines(semeval.test2017data, semeval.test2017idx,
                                   semeval.test2017elmo)
def train_classifier(self):
    """Load or compute BM25/translation/soft-cosine scores, then fit the
    regression classifier on the training split.

    Scores of every system are cached as a pickle under ``ensemble/``; the
    file name encodes the preprocessing settings. Each cache holds the
    formatted rankings of the train, dev, test2016 and test2017 splits, which
    are exposed as ``self.<split><system>`` (e.g. ``self.devbm25``).
    """
    def cached(path, build):
        # Return the pickled object at `path`, or build, pickle and return it.
        # NOTE: pickle must only be used on trusted files; these caches are
        # expected to be produced by this very code.
        if os.path.exists(path):
            with open(path, 'rb') as f:
                return p.load(f)
        data = build()
        directory = os.path.dirname(path)
        if directory:
            # The cache folder may not exist on a first run.
            os.makedirs(directory, exist_ok=True)
        with open(path, 'wb') as f:
            p.dump(data, f)
        return data

    def rankings(system, use_elmo=True):
        # Run `system` on every split; BM25 only takes the data, the other
        # systems also take the split's idx/elmo attributes.
        def run(split):
            args = [getattr(system, split + 'data')]
            if use_elmo:
                args.append(getattr(system, split + 'idx'))
                args.append(getattr(system, split + 'elmo'))
            return self.format(system.test(*args))

        return {
            'train': run('train'),
            'dev': self.format(system.validate()),
            'test2016': run('test2016'),
            'test2017': run('test2017'),
        }

    # BM25
    lowercase, stop, punctuation = (self.lowercase['bm25'], self.stop['bm25'],
                                    self.punctuation['bm25'])
    path = os.path.join(
        'ensemble', 'bm25.lower_' + str(lowercase) + '.stop_' + str(stop) +
        '.punct_' + str(punctuation))
    data = cached(
        path, lambda: rankings(
            SemevalBM25(stop=stop, lowercase=lowercase,
                        punctuation=punctuation, proctrain=True),
            use_elmo=False))
    self.trainbm25 = data['train']
    self.devbm25 = data['dev']
    self.test2016bm25 = data['test2016']
    self.test2017bm25 = data['test2017']

    # Translation
    vector = self.vector['translation']
    lowercase, stop, punctuation = (self.lowercase['translation'],
                                    self.stop['translation'],
                                    self.punctuation['translation'])
    path = os.path.join(
        'ensemble', 'translation.lower_' + str(lowercase) + '.stop_' +
        str(stop) + '.punct_' + str(punctuation) + '.vector_' + str(vector) +
        '.vecdim_' + str(self.w2vdim))
    data = cached(
        path, lambda: rankings(
            SemevalTranslation(alpha=self.alpha, sigma=self.sigma,
                               punctuation=punctuation, proctrain=True,
                               vector=vector, stop=stop, lowercase=lowercase,
                               w2vdim=self.w2vdim)))
    self.traintranslation = data['train']
    self.devtranslation = data['dev']
    self.test2016translation = data['test2016']
    self.test2017translation = data['test2017']

    # Soft-cosine
    vector = self.vector['softcosine']
    lowercase, stop, punctuation = (self.lowercase['softcosine'],
                                    self.stop['softcosine'],
                                    self.punctuation['softcosine'])
    path = os.path.join(
        'ensemble', 'softcosine.lower_' + str(lowercase) + '.stop_' +
        str(stop) + '.punct_' + str(punctuation) + '.vector_' + str(vector) +
        '.vecdim_' + str(self.w2vdim))
    data = cached(
        path, lambda: rankings(
            SemevalSoftCosine(stop=stop, vector=vector, lowercase=lowercase,
                              punctuation=punctuation, proctrain=True,
                              w2vdim=self.w2vdim)))
    self.trainsoftcosine = data['train']
    self.devsoftcosine = data['dev']
    self.test2016softcosine = data['test2016']
    self.test2017softcosine = data['test2017']

    # Training matrix: [bm25, translation, softcosine] per (q1id, q2id) pair;
    # the label is the one stored with the BM25 ranking.
    self.X, self.y = [], []
    for q1id in self.trainbm25:
        for q2id in self.trainbm25[q1id]:
            self.X.append([
                self.trainbm25[q1id][q2id][0],
                self.traintranslation[q1id][q2id][0],
                self.trainsoftcosine[q1id][q2id][0],
            ])
            self.y.append(self.trainbm25[q1id][q2id][1])

    if self.scale:
        self.scaler = MinMaxScaler(feature_range=(-1, 1))
        self.scaler.fit(self.X)
        self.X = self.scaler.transform(self.X)
    self.ensemble.train_regression(trainvectors=self.X,
                                   labels=self.y,
                                   c='search',
                                   penalty='search',
                                   tol='search',
                                   gridsearch='brutal',
                                   jobs=10)
def train(self):
    """Load all system scores and fit/evaluate a LambdaMART ranker.

    BM25, translation and soft-cosine scores are loaded from (or computed and
    cached in) pickles under ``ensemble/``; the tree-kernel scores are loaded
    from their pickle, which must already exist. A ``pyltr`` LambdaMART model
    is then fitted on [bm25, softcosine] features of the training split,
    monitored on the dev split, and AP@10 is printed for dev, test 2016 and
    test 2017.
    """
    def cached(path, build):
        # Return the pickled object at `path`, or build, pickle and return it.
        # NOTE: pickle must only be used on trusted files; these caches are
        # expected to be produced by this very code.
        if os.path.exists(path):
            with open(path, 'rb') as f:
                return p.load(f)
        data = build()
        directory = os.path.dirname(path)
        if directory:
            # The cache folder may not exist on a first run.
            os.makedirs(directory, exist_ok=True)
        with open(path, 'wb') as f:
            p.dump(data, f)
        return data

    def rankings(system, use_elmo=True):
        # Run `system` on every split; BM25 only takes the data, the other
        # systems also take the split's idx/elmo attributes.
        def run(split):
            args = [getattr(system, split + 'data')]
            if use_elmo:
                args.append(getattr(system, split + 'idx'))
                args.append(getattr(system, split + 'elmo'))
            return self.format(system.test(*args))

        return {
            'train': run('train'),
            'dev': self.format(system.validate()),
            'test2016': run('test2016'),
            'test2017': run('test2017'),
        }

    def matrix(bm25, softcosine):
        # Feature rows, labels and query ids of one split. Translation and
        # kernel scores are loaded below but are not used as features.
        X, y, qids = [], [], []
        for q1id in bm25:
            for q2id in bm25[q1id]:
                qids.append(q1id)
                X.append([bm25[q1id][q2id][0], softcosine[q1id][q2id][0]])
                y.append(bm25[q1id][q2id][1])
        return X, y, qids

    print('Initializing BM25...')
    lowercase, stop, punctuation = (self.lowercase['bm25'], self.stop['bm25'],
                                    self.punctuation['bm25'])
    path = os.path.join(
        'ensemble', 'bm25.lower_' + str(lowercase) + '.stop_' + str(stop) +
        '.punct_' + str(punctuation))
    data = cached(
        path, lambda: rankings(
            SemevalBM25(stop=stop, lowercase=lowercase,
                        punctuation=punctuation, proctrain=True),
            use_elmo=False))
    self.trainbm25 = data['train']
    self.devbm25 = data['dev']
    self.test2016bm25 = data['test2016']
    self.test2017bm25 = data['test2017']

    print('Initializing Translation...')
    vector = self.vector['translation']
    lowercase, stop, punctuation = (self.lowercase['translation'],
                                    self.stop['translation'],
                                    self.punctuation['translation'])
    path = os.path.join(
        'ensemble', 'translation.lower_' + str(lowercase) + '.stop_' +
        str(stop) + '.punct_' + str(punctuation) + '.vector_' + str(vector) +
        '.vecdim_' + str(self.w2vdim))
    data = cached(
        path, lambda: rankings(
            SemevalTranslation(alpha=self.alpha, sigma=self.sigma,
                               punctuation=punctuation, proctrain=True,
                               vector=vector, stop=stop, lowercase=lowercase,
                               w2vdim=self.w2vdim)))
    self.traintranslation = data['train']
    self.devtranslation = data['dev']
    self.test2016translation = data['test2016']
    self.test2017translation = data['test2017']

    print('Initializing Softcosine...')
    vector = self.vector['softcosine']
    lowercase, stop, punctuation = (self.lowercase['softcosine'],
                                    self.stop['softcosine'],
                                    self.punctuation['softcosine'])
    path = os.path.join(
        'ensemble', 'softcosine.lower_' + str(lowercase) + '.stop_' +
        str(stop) + '.punct_' + str(punctuation) + '.vector_' + str(vector) +
        '.vecdim_' + str(self.w2vdim))
    data = cached(
        path, lambda: rankings(
            SemevalSoftCosine(stop=stop, vector=vector, lowercase=lowercase,
                              punctuation=punctuation, proctrain=True,
                              w2vdim=self.w2vdim)))
    self.trainsoftcosine = data['train']
    self.devsoftcosine = data['dev']
    self.test2016softcosine = data['test2016']
    self.test2017softcosine = data['test2017']

    # Kernel scores are only ever read here; the file must exist.
    # NOTE(review): this name has no '.vecdim_' suffix -- confirm it matches
    # the file written by the kernel training step.
    vector = self.vector['kernel']
    lowercase = self.lowercase['kernel']
    path = os.path.join(
        'ensemble', 'kernel.lower_' + str(lowercase) + '.vector_' + vector)
    with open(path, 'rb') as f:
        data = p.load(f)
    self.trainkernel = data['train']
    self.devkernel = data['dev']
    self.test2016kernel = data['test2016']
    self.test2017kernel = data['test2017']

    print('Initializing LambdaMART...')
    TX, Ty, Tqids = matrix(self.trainbm25, self.trainsoftcosine)
    VX, Vy, Vqids = matrix(self.devbm25, self.devsoftcosine)
    E2016X, E2016y, E2016qids = matrix(self.test2016bm25,
                                       self.test2016softcosine)
    E2017X, E2017y, E2017qids = matrix(self.test2017bm25,
                                       self.test2017softcosine)

    if self.scale:
        # Fit the scaler on the training split only, then apply it everywhere.
        self.scaler = MinMaxScaler(feature_range=(-1, 1))
        self.scaler.fit(TX)
        TX = self.scaler.transform(TX)
        VX = self.scaler.transform(VX)
        E2016X = self.scaler.transform(E2016X)
        E2017X = self.scaler.transform(E2017X)

    metric = pyltr.metrics.AP(k=10)

    monitor = pyltr.models.monitors.ValidationMonitor(VX,
                                                      Vy,
                                                      Vqids,
                                                      metric=metric,
                                                      stop_after=250)

    # NOTE(review): the fitted model is local and is not kept on self.
    model = pyltr.models.LambdaMART(
        metric=metric,
        n_estimators=1000,
        learning_rate=0.02,
        max_features=0.5,
        query_subsample=0.5,
        max_leaf_nodes=10,
        min_samples_leaf=64,
        verbose=1,
    )

    model.fit(TX, Ty, Tqids, monitor=monitor)

    Vpred = model.predict(VX)
    print('Dev:', metric.calc_mean(Vqids, np.array(Vy), Vpred))
    E2016pred = model.predict(E2016X)
    print('Test 2016:',
          metric.calc_mean(E2016qids, np.array(E2016y), E2016pred))
    E2017pred = model.predict(E2017X)
    print('Test 2017:',
          metric.calc_mean(E2017qids, np.array(E2017y), E2017pred))
class SemevalSVM(Semeval):
    """SVM / regression ranker over similarity scores of question pairs.

    For every (query, related question) pair the feature vector holds the
    score of the related question followed by one score per comment (0 for a
    comment without tokens). Extracted features are cached as pickles under
    ``feature/<split>/<self.path>``.
    """

    # Order in which extract_features looks for a feature name.
    FEATURE_PRIORITY = ('bm25', 'softcosine', 'translation', 'cosine')

    def __init__(self, model='svm', features='bm25,', comment_features='bm25,',
                 stop=True, vector='word2vec', lowercase=True,
                 punctuation=True, proctrain=True, path=FEATURES_PATH,
                 alpha=0.1, sigma=0.9, gridsearch='random'):
        """Build the sub-systems named in the feature lists and train.

        Args:
            model: 'svm' trains an SVM; any other value trains a regression.
            features, comment_features: comma-separated feature names.
            stop, lowercase, punctuation, proctrain: preprocessing switches
                forwarded to every sub-system.
            vector: embedding type.
            path: file name of the feature caches.
            alpha, sigma: forwarded to the translation system.
            gridsearch: search strategy forwarded to the trainer.
        """
        Semeval.__init__(self, stop=stop, vector=vector, lowercase=lowercase,
                         punctuation=punctuation)
        self.path = path
        self.features = features.split(',')
        self.comment_features = comment_features.split(',')
        self.gridsearch = gridsearch
        self.svm = Model()

        self.model = model
        requested = self.features + self.comment_features
        self.bm25 = SemevalBM25(
            stop=stop,
            lowercase=lowercase,
            punctuation=punctuation,
            proctrain=proctrain) if 'bm25' in requested else None
        self.cosine = SemevalCosine(
            stop=stop,
            lowercase=lowercase,
            punctuation=punctuation,
            proctrain=proctrain) if 'cosine' in requested else None
        self.softcosine = SemevalSoftCosine(
            stop=stop,
            lowercase=lowercase,
            punctuation=punctuation,
            proctrain=proctrain,
            vector=vector) if 'softcosine' in requested else None
        self.translation = SemevalTranslation(
            alpha=alpha,
            sigma=sigma,
            stop=stop,
            lowercase=lowercase,
            punctuation=punctuation,
            proctrain=proctrain,
            vector=self.vector) if 'translation' in requested else None

        self.train()

    # ------------------------------------------------------------------
    # helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _load_pickle(path):
        """Load a feature cache. Only use on files written by this code."""
        with open(path, 'rb') as f:
            return p.load(f)

    @staticmethod
    def _dump_pickle(obj, path):
        """Pickle *obj* to *path*, creating the parent folder if needed."""
        directory = os.path.dirname(path)
        if directory:
            os.makedirs(directory, exist_ok=True)
        with open(path, 'wb') as f:
            p.dump(obj, f)

    def _score_candidate(self, kind, q1, q1_emb, cid, tokens, elmoidx,
                         elmovec):
        """Score candidate (*cid*, *tokens*) against query *q1* with *kind*."""
        if kind == 'bm25':
            return self.bm25.model(q1, cid)
        if kind == 'cosine':
            return self.cosine.model(q1, tokens)
        if kind == 'softcosine':
            if self.vector == 'alignments':
                return self.softcosine.model.score(q1, tokens,
                                                   self.alignments)
            emb = self.encode(cid, tokens, elmoidx, elmovec)
            return self.softcosine.model(q1, q1_emb, tokens, emb)

        # translation: only the third returned value is used as feature.
        if self.vector == 'alignments':
            _, _, trlmprob, _ = self.translation.model.score(q1, tokens)
        else:
            emb = self.encode(cid, tokens, elmoidx, elmovec)
            _, _, trlmprob, _ = self.translation.model(q1, q1_emb, tokens,
                                                       emb)
        return trlmprob

    def _rank(self, feat):
        """Score every cached feature vector; return ranking, y_real, y_pred."""
        ranking = {}
        y_real, y_pred = [], []
        for q1id in feat:
            ranking[q1id] = []
            for q2id in feat[q1id]:
                X = self.scaler.transform([feat[q1id][q2id][0]])[0]
                score, pred_label = self.svm.score(X)
                y_pred.append(pred_label)

                real_label = feat[q1id][q2id][1]
                y_real.append(real_label)
                ranking[q1id].append((real_label, score, q2id))
        return ranking, y_real, y_pred

    # ------------------------------------------------------------------
    # public API
    # ------------------------------------------------------------------
    def extract_features(self, procdata, elmoidx, elmovec):
        """Build the feature vectors of every pair in *procdata*.

        Only the first name of ``FEATURE_PRIORITY`` found in
        ``self.features`` is extracted.
        NOTE(review): ``self.comment_features`` is not consulted here and
        listing several features still yields a single one -- confirm that
        this is intended.

        Returns:
            ``(feat, X, y)`` with ``feat[q1id][q2id] = (x, label)`` and X/y
            the flat lists of vectors and labels.
        """
        kind = next(
            (name for name in self.FEATURE_PRIORITY if name in self.features),
            None)

        X, y = [], []
        feat = {}
        total = len(procdata)
        for i, q1id in enumerate(procdata):
            feat[q1id] = {}
            percentage = round(float(i + 1) / total, 2)
            print('Extracting features: ', percentage, i + 1, sep='\t',
                  end='\r')
            for q2id in procdata[q1id]:
                query_question = procdata[q1id][q2id]
                q1, q2 = query_question['q1'], query_question['q2']
                x = []

                q1_emb = self.encode(q1id, q1, elmoidx, elmovec)

                if kind is not None:
                    x.append(
                        self._score_candidate(kind, q1, q1_emb, q2id, q2,
                                              elmoidx, elmovec))
                    for comment in query_question['comments']:
                        q3id = comment['id']
                        q3 = comment['tokens']
                        if len(q3) > 0:
                            # Each comment is scored on its own tokens (the
                            # soft-cosine 'alignments' case used to score the
                            # related question q2 again instead).
                            x.append(
                                self._score_candidate(kind, q1, q1_emb, q3id,
                                                      q3, elmoidx, elmovec))
                        else:
                            x.append(0)

                y_ = query_question['label']
                feat[q1id][q2id] = (x, y_)
                X.append(x)
                y.append(y_)
        return feat, X, y

    def train(self):
        """Load or extract the training features, fit the scaler and the model."""
        self.X, self.y = [], []
        path = os.path.join('feature', 'train', self.path)
        if not os.path.exists(path):
            feat, self.X, self.y = self.extract_features(
                self.traindata, self.trainidx, self.trainelmo)
            self._dump_pickle(feat, path)
        else:
            feat = self._load_pickle(path)
            for q1id in feat:
                for q2id in feat[q1id]:
                    self.X.append(feat[q1id][q2id][0])
                    self.y.append(feat[q1id][q2id][1])

        self.X = np.array(self.X)
        self.scaler = MinMaxScaler(feature_range=(-1, 1))
        self.scaler.fit(self.X)
        self.X = self.scaler.transform(self.X)

        if self.model == 'svm':
            self.svm.train_svm(trainvectors=self.X,
                               labels=self.y,
                               c='search',
                               kernel='search',
                               gamma='search',
                               jobs=10,
                               gridsearch=self.gridsearch)
        else:
            self.svm.train_regression(trainvectors=self.X,
                                      labels=self.y,
                                      c='search',
                                      penalty='search',
                                      tol='search',
                                      gridsearch=self.gridsearch,
                                      jobs=10)

    def validate(self):
        """Rank the development split.

        Returns:
            ``(ranking, y_real, y_pred, parameter_settings)`` where
            ``ranking`` maps q1id to a list of ``(real_label, score, q2id)``.
        """
        path = os.path.join('feature', 'dev', self.path)
        if not os.path.exists(path):
            feat, _, _ = self.extract_features(self.devdata, self.devidx,
                                               self.develmo)
            self._dump_pickle(feat, path)
        else:
            feat = self._load_pickle(path)

        ranking, y_real, y_pred = self._rank(feat)
        parameter_settings = self.svm.return_parameter_settings(
            clf=self.model)
        return ranking, y_real, y_pred, parameter_settings

    def test(self, testdata, elmoidx, elmovec, test_='test2016'):
        """Rank a test split.

        Args:
            testdata, elmoidx, elmovec: data and embeddings of the split.
            test_: 'test2016' selects the 2016 cache; any other value the
                2017 one.

        Returns:
            Same tuple as ``validate``.
        """
        split = 'test2016' if test_ == 'test2016' else 'test2017'
        path = os.path.join('feature', split, self.path)

        self.testdata = testdata
        if not os.path.exists(path):
            feat, _, _ = self.extract_features(self.testdata, elmoidx,
                                               elmovec)
            self._dump_pickle(feat, path)
        else:
            feat = self._load_pickle(path)

        ranking, y_real, y_pred = self._rank(feat)
        parameter_settings = self.svm.return_parameter_settings(
            clf=self.model)
        return ranking, y_real, y_pred, parameter_settings
def run_softcosine(stop, lowercase, punctuation, proctrain, vector,
                   evaluation_path):
    """Run the soft-cosine system on dev and both test sets and report MAP.

    The three rankings are written to ``evaluation_path`` inside the
    corresponding evaluation folders (with an empty settings line), then the
    dev ranking is evaluated against the dev gold file and the baseline/model
    MAP values are printed.
    """
    model = SemevalSoftCosine(stop=stop,
                              vector=vector,
                              lowercase=lowercase,
                              punctuation=punctuation,
                              proctrain=proctrain)

    dev_ranking = model.validate()
    ranking_2016 = model.test(model.test2016data, model.test2016idx,
                              model.test2016elmo)
    ranking_2017 = model.test(model.test2017data, model.test2017idx,
                              model.test2017elmo)

    # Written in this order: test 2016, test 2017, dev.
    outputs = (
        (ranking_2016, TEST2016_EVAL_PATH),
        (ranking_2017, TEST2017_EVAL_PATH),
        (dev_ranking, DEV_EVAL_PATH),
    )
    for ranking, folder in outputs:
        model.save(ranking=ranking,
                   path=os.path.join(folder, evaluation_path),
                   parameter_settings='')

    # evaluate receives a shallow copy of the dev ranking.
    gold = prepare_gold(DEV_GOLD_PATH)
    map_baseline, map_model = evaluate(copy.copy(dev_ranking), gold)

    print('Evaluation: ', evaluation_path)
    print('MAP baseline: ', map_baseline)
    print('MAP model: ', map_model)
    print(10 * '-')