Exemple #1
0
    def __init__(self,
                 model='svm',
                 features='bm25,',
                 comment_features='bm25,',
                 stop=True,
                 vector='word2vec',
                 lowercase=True,
                 punctuation=True,
                 proctrain=True,
                 path=FEATURES_PATH,
                 alpha=0.1,
                 sigma=0.9,
                 gridsearch='random'):
        """Set up the requested similarity systems, then call ``self.train()``.

        ``features`` and ``comment_features`` are comma-separated names. A
        component system ('bm25', 'cosine', 'softcosine', 'translation') is
        instantiated only when its name occurs in either list; otherwise the
        matching attribute is set to ``None``.

        ``stop``, ``lowercase``, ``punctuation`` and ``proctrain`` are
        forwarded unchanged to every instantiated component. ``alpha`` and
        ``sigma`` are forwarded only to the translation component.
        """
        Semeval.__init__(self,
                         stop=stop,
                         vector=vector,
                         lowercase=lowercase,
                         punctuation=punctuation)
        self.path = path
        self.features = features.split(',')
        self.comment_features = comment_features.split(',')
        self.gridsearch = gridsearch
        self.svm = Model()

        self.model = model

        # Every name asked for on either side (question or comment features).
        requested = self.features + self.comment_features
        # Preprocessing switches shared by all component systems.
        shared = {
            'stop': stop,
            'lowercase': lowercase,
            'punctuation': punctuation,
            'proctrain': proctrain,
        }

        if 'bm25' in requested:
            self.bm25 = SemevalBM25(**shared)
        else:
            self.bm25 = None

        if 'cosine' in requested:
            self.cosine = SemevalCosine(**shared)
        else:
            self.cosine = None

        if 'softcosine' in requested:
            self.softcosine = SemevalSoftCosine(vector=vector, **shared)
        else:
            self.softcosine = None

        if 'translation' in requested:
            # The translation system is given self.vector (an attribute not
            # assigned in this method), not the local ``vector`` argument.
            self.translation = SemevalTranslation(alpha=alpha,
                                                  sigma=sigma,
                                                  vector=self.vector,
                                                  **shared)
        else:
            self.translation = None

        self.train()
class SemevalEnsemble:
    """Rank question pairs by mixing a regression score with a kernel score.

    Three component systems (BM25, translation, soft-cosine) each produce a
    score per ``(q1id, q2id)`` pair. Those three scores are the features of
    a regression model (``self.ensemble``). The final ranking score is::

        theta * regression_score + (1 - theta) * tree_kernel_score

    where ``theta`` is picked on the development split in ``train()``.

    Component scores are cached as pickle files under ``ensemble/``. After
    training, the scores are available as ``self.<split><system>`` with
    ``<split>`` in ``train``/``dev``/``test2016``/``test2017`` and
    ``<system>`` in ``kernel``/``bm25``/``translation``/``softcosine``
    (e.g. ``self.devbm25``). Each is ``{q1id: {q2id: (score, label)}}``.

    ``stop``, ``lowercase``, ``punctuation`` and ``vector`` are dicts keyed
    by system name ('kernel', 'bm25', 'translation', 'softcosine').
    """

    # Data splits every component is scored on; also the keys of each cache
    # file.
    _SPLITS = ('train', 'dev', 'test2016', 'test2017')

    def __init__(self,
                 stop=None,
                 lowercase=None,
                 punctuation=None,
                 vector=None,
                 scale=True,
                 w2vdim=300,
                 kernel_path='',
                 alpha=0.8,
                 sigma=0.2):
        """Store the configuration and immediately run ``train()``.

        The dict arguments default to ``None`` (treated as an empty dict) so
        that instances never share one mutable default object.
        """
        self.stop = {} if stop is None else stop
        self.lowercase = {} if lowercase is None else lowercase
        self.punctuation = {} if punctuation is None else punctuation
        self.scale = scale
        self.vector = {} if vector is None else vector
        self.alpha = alpha
        self.sigma = sigma
        self.kernel_path = kernel_path
        self.w2vdim = w2vdim
        # Starting interpolation weight; train() overwrites it whenever a
        # candidate theta scores better on the development split.
        self.theta = 0.9

        self.ensemble = Model()
        self.train()

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    def _cached(self, path, compute):
        """Return the pickled object at ``path``, creating it on a miss.

        On a miss ``compute()`` is called, its result is pickled to ``path``
        (creating the parent directory if needed) and returned.
        """
        if os.path.exists(path):
            # NOTE(review): unpickling is only safe for cache files this
            # code wrote itself; never point it at untrusted files.
            with open(path, 'rb') as handle:
                return p.load(handle)

        data = compute()
        folder = os.path.dirname(path)
        if folder:
            os.makedirs(folder, exist_ok=True)
        with open(path, 'wb') as handle:
            p.dump(data, handle)
        return data

    def _publish(self, system_name, data):
        """Expose ``data[split]`` as ``self.<split><system_name>``."""
        for split in self._SPLITS:
            setattr(self, split + system_name, data[split])

    def _score_splits(self, system, indexed):
        """Run ``system`` on every split and return the formatted rankings.

        ``system.test`` is called with ``<split>data`` only, or also with
        ``<split>idx`` and ``<split>elmo`` when ``indexed`` is true. The dev
        split always goes through ``system.validate()``.
        """
        def rank(split):
            args = [getattr(system, split + 'data')]
            if indexed:
                args.append(getattr(system, split + 'idx'))
                args.append(getattr(system, split + 'elmo'))
            return self.format(system.test(*args))

        return {
            'train': rank('train'),
            'dev': self.format(system.validate()),
            'test2016': rank('test2016'),
            'test2017': rank('test2017'),
        }

    def _classify(self, bm25, translation, softcosine, q1id, q2id):
        """Return ``self.ensemble.score`` for one pair's three features."""
        X = [
            bm25[q1id][q2id][0],
            translation[q1id][q2id][0],
            softcosine[q1id][q2id][0],
        ]
        if self.scale:
            X = self.scaler.transform([X])[0]
        return self.ensemble.score(X)

    @staticmethod
    def _interpolate(theta, clfscore, kernelscore):
        """Linear mix of the regression score and the kernel score."""
        return (theta * clfscore) + ((1 - theta) * kernelscore)

    @staticmethod
    def _cosine_features(semeval, data, idx, elmo):
        """Cosine between mean encodings of each pair in ``data``.

        Returns ``{q1id: {q2id: (cosine, 0)}}``.
        """
        features = {}
        for q1id in data:
            features[q1id] = {}
            for q2id in data[q1id]:
                pair = data[q1id][q2id]
                q1emb = np.mean(semeval.encode(q1id, pair['q1'], idx, elmo),
                                axis=0)
                q2emb = np.mean(semeval.encode(q2id, pair['q2'], idx, elmo),
                                axis=0)
                cos = cosine_similarity([q1emb], [q2emb])[0][0]
                features[q1id][q2id] = (cos, 0)
        return features

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def format(self, ranking):
        """Turn ``{q1id: [(label, score, q2id), ...]}`` into
        ``{q1id: {q2id: (score, label)}}``."""
        return {
            q1id: {
                q2id: (score, real_label)
                for real_label, score, q2id in ranking[q1id]
            }
            for q1id in ranking
        }

    def train(self):
        """Train all components, then pick ``theta`` on the dev split.

        ``theta`` is set to the candidate in ``0, 0.1, ..., 1.0`` with the
        highest ``map_model`` returned by ``evaluate``. Ties keep the
        earlier candidate. If no candidate scores above 0, the initial
        value is kept.
        """
        # NOTE: train_feature() is not run as part of this pipeline.
        self.train_kernel()
        self.train_classifier()

        # Neither the regression score nor the kernel score depends on
        # theta, so compute them once instead of once per candidate.
        scored = {}
        for q1id in self.devkernel:
            scored[q1id] = []
            for q2id in self.devkernel[q1id]:
                clfscore, pred_label = self._classify(
                    self.devbm25, self.devtranslation, self.devsoftcosine,
                    q1id, q2id)
                kernelscore = self.devkernel[q1id][q2id][0]
                scored[q1id].append(
                    (pred_label, clfscore, kernelscore, q2id))

        # finding theta in development set
        thetas = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
        best_map = 0.0
        for theta in thetas:
            ranking = {
                q1id: [
                    (pred_label,
                     self._interpolate(theta, clfscore, kernelscore),
                     q2id)
                    for pred_label, clfscore, kernelscore, q2id in rows
                ]
                for q1id, rows in scored.items()
            }

            map_baseline, map_model = evaluate(copy.copy(ranking),
                                               prepare_gold(DEV_GOLD_PATH))
            if map_model > best_map:
                best_map = map_model
                print('MAP baseline', map_baseline)
                print('MAP: ', map_model)
                print(10 * '-')
                self.theta = theta

    def train_kernel(self):
        """Load or compute the tree-kernel scores for every split.

        On a cache miss the kernel system is built and kept as
        ``self.kernel``. On a hit ``self.kernel`` is not set.
        """
        vector = self.vector['kernel']
        lowercase = self.lowercase['kernel']
        path = os.path.join(
            'ensemble', 'kernel.lower_' + str(lowercase) + '.vector_' +
            vector + '.vecdim_' + str(self.w2vdim))

        def compute():
            self.kernel = SemevalTreeKernel(smoothed=True,
                                            vector=vector,
                                            lowercase=lowercase,
                                            tree='subj_tree',
                                            kernel_path=self.kernel_path,
                                            w2vdim=self.w2vdim)
            kernel = self.kernel
            # The kernel's test()/validate() return 4-tuples; only the first
            # element (the ranking) is used here.
            train, _, _, _ = kernel.test(kernel.traindata,
                                         kernel.trainidx,
                                         kernel.trainelmo,
                                         test_='train')
            train = self.format(train)

            dev, _, _, _ = kernel.validate()
            dev = self.format(dev)

            test2016, _, _, _ = kernel.test(kernel.test2016data,
                                            kernel.test2016idx,
                                            kernel.test2016elmo,
                                            test_='test2016')
            test2016 = self.format(test2016)

            test2017, _, _, _ = kernel.test(kernel.test2017data,
                                            kernel.test2017idx,
                                            kernel.test2017elmo,
                                            test_='test2017')
            test2017 = self.format(test2017)

            return {
                'train': train,
                'dev': dev,
                'test2016': test2016,
                'test2017': test2017,
            }

        self._publish('kernel', self._cached(path, compute))

    def train_classifier(self):
        """Load or compute the three component scores and fit the regression.

        Sets ``self.X`` / ``self.y`` (training features and labels) and,
        when ``self.scale`` is true, ``self.scaler`` (a ``MinMaxScaler`` to
        ``(-1, 1)`` fitted on the training features).
        """
        # --- BM25 ------------------------------------------------------
        lowercase = self.lowercase['bm25']
        stop = self.stop['bm25']
        punctuation = self.punctuation['bm25']
        path = os.path.join(
            'ensemble', 'bm25.lower_' + str(lowercase) + '.stop_' + str(stop) +
            '.punct_' + str(punctuation))

        def compute_bm25():
            system = SemevalBM25(stop=stop,
                                 lowercase=lowercase,
                                 punctuation=punctuation,
                                 proctrain=True)
            return self._score_splits(system, indexed=False)

        self._publish('bm25', self._cached(path, compute_bm25))

        # --- Translation -----------------------------------------------
        vector = self.vector['translation']
        lowercase = self.lowercase['translation']
        stop = self.stop['translation']
        punctuation = self.punctuation['translation']
        path = os.path.join(
            'ensemble', 'translation.lower_' + str(lowercase) + '.stop_' +
            str(stop) + '.punct_' + str(punctuation) + '.vector_' +
            str(vector) + '.vecdim_' + str(self.w2vdim))

        def compute_translation():
            system = SemevalTranslation(alpha=self.alpha,
                                        sigma=self.sigma,
                                        punctuation=punctuation,
                                        proctrain=True,
                                        vector=vector,
                                        stop=stop,
                                        lowercase=lowercase,
                                        w2vdim=self.w2vdim)
            return self._score_splits(system, indexed=True)

        self._publish('translation', self._cached(path, compute_translation))

        # --- Soft-cosine -----------------------------------------------
        vector = self.vector['softcosine']
        lowercase = self.lowercase['softcosine']
        stop = self.stop['softcosine']
        punctuation = self.punctuation['softcosine']
        path = os.path.join(
            'ensemble', 'softcosine.lower_' + str(lowercase) + '.stop_' +
            str(stop) + '.punct_' + str(punctuation) + '.vector_' +
            str(vector) + '.vecdim_' + str(self.w2vdim))

        def compute_softcosine():
            system = SemevalSoftCosine(stop=stop,
                                       vector=vector,
                                       lowercase=lowercase,
                                       punctuation=punctuation,
                                       proctrain=True,
                                       w2vdim=self.w2vdim)
            return self._score_splits(system, indexed=True)

        self._publish('softcosine', self._cached(path, compute_softcosine))

        # --- Regression over the three component scores ----------------
        self.X, self.y = [], []
        for q1id in self.trainbm25:
            for q2id in self.trainbm25[q1id]:
                self.X.append([
                    self.trainbm25[q1id][q2id][0],
                    self.traintranslation[q1id][q2id][0],
                    self.trainsoftcosine[q1id][q2id][0],
                ])
                # The label is taken from the BM25 ranking.
                self.y.append(self.trainbm25[q1id][q2id][1])

        if self.scale:
            self.scaler = MinMaxScaler(feature_range=(-1, 1))
            self.scaler.fit(self.X)
            self.X = self.scaler.transform(self.X)
        self.ensemble.train_regression(trainvectors=self.X,
                                       labels=self.y,
                                       c='search',
                                       penalty='search',
                                       tol='search',
                                       gridsearch='brutal',
                                       jobs=10)

    def train_feature(self):
        """Compute mean-embedding cosine features for every split.

        Sets ``self.trainfeature``, ``self.devfeature``,
        ``self.test2016feature`` and ``self.test2017feature``. Not called by
        ``train()``; nothing else in this class reads these attributes.
        """
        semeval = SemevalSoftCosine(stop=self.stop['softcosine'],
                                    vector='word2vec',
                                    lowercase=self.lowercase['softcosine'],
                                    punctuation=self.punctuation['softcosine'],
                                    proctrain=True)

        self.trainfeature = self._cosine_features(
            semeval, semeval.traindata, semeval.trainidx, semeval.trainelmo)
        self.devfeature = self._cosine_features(
            semeval, semeval.devdata, semeval.devidx, semeval.develmo)
        self.test2016feature = self._cosine_features(
            semeval, semeval.test2016data, semeval.test2016idx,
            semeval.test2016elmo)
        self.test2017feature = self._cosine_features(
            semeval, semeval.test2017data, semeval.test2017idx,
            semeval.test2017elmo)

    def test(self, set_='dev'):
        """Rank one split with the trained ensemble.

        ``set_`` is 'dev', 'train' or 'test2016'; any other value selects
        the 2017 test split.

        Returns ``(ranking, y_real, y_pred, parameter_settings)``:
        ``ranking`` is ``{q1id: [(pred_label, score, q2id), ...]}``,
        ``y_real`` holds 1 where the stored label equals ``'true'`` and 0
        otherwise, ``y_pred`` holds the classifier's predicted labels, and
        ``parameter_settings`` is the regression's settings string with
        theta (written as ``gamma``), alpha and sigma appended.
        """
        # The 'train' branch used to read self.trainfeature, which train()
        # never sets, so it raised AttributeError. The value was unused.
        split = set_ if set_ in ('dev', 'train', 'test2016') else 'test2017'
        bm25 = getattr(self, split + 'bm25')
        translation = getattr(self, split + 'translation')
        softcosine = getattr(self, split + 'softcosine')
        kernel = getattr(self, split + 'kernel')

        ranking = {}
        y_real, y_pred = [], []
        for q1id in bm25:
            ranking[q1id] = []
            for q2id in bm25[q1id]:
                clfscore, pred_label = self._classify(
                    bm25, translation, softcosine, q1id, q2id)
                y_pred.append(pred_label)
                y_real.append(1 if bm25[q1id][q2id][1] == 'true' else 0)

                score = self._interpolate(self.theta, clfscore,
                                          kernel[q1id][q2id][0])
                ranking[q1id].append((pred_label, score, q2id))

        parameter_settings = self.ensemble.return_parameter_settings(
            clf='regression')
        parameter_settings = parameter_settings + ',gamma=' + str(
            self.theta) + ',alpha=' + str(self.alpha) + ',sigma=' + str(
                self.sigma)
        return ranking, y_real, y_pred, parameter_settings

    def save(self, ranking, path, parameter_settings):
        """Write ``ranking`` to ``path`` as tab-separated text.

        The first line is ``parameter_settings``. Every following line is
        ``q1id, q2id, 0, score, label`` where ``label`` is 'true' when the
        predicted label equals 1 and 'false' otherwise. Each row ends with
        a trailing tab before the newline; that format is kept as is.
        """
        with open(path, 'w') as f:
            f.write(parameter_settings)
            f.write('\n')
            for q1id in ranking:
                for row in ranking[q1id]:
                    label = 'true' if row[0] == 1 else 'false'
                    f.write('\t'.join([
                        str(q1id),
                        str(row[2]),
                        str(0),
                        str(row[1]), label, '\n'
                    ]))
Exemple #5
0
    def train(self):
        """Prepare per-model scores and fit a LambdaMART ranker on them.

        For BM25, translation and soft-cosine the scores of the train, dev,
        test2016 and test2017 splits are read from a pickle under
        ``ensemble/`` when that file exists; otherwise the corresponding
        model is instantiated, the scores are computed, stored on ``self``
        and written to that file.  The kernel scores are always read from
        their pickle (there is no code path here that computes them).

        Every cached split is indexed as ``scores[q1id][q2id]``, where item
        ``0`` is used as the feature value and item ``1`` as the target.

        Only the BM25 and soft-cosine scores are used as ranking features;
        the translation and kernel scores are loaded but deliberately left
        out of the feature vector.  When ``self.scale`` is truthy a
        ``MinMaxScaler`` fitted on the training matrix is stored in
        ``self.scaler`` and applied to every split.

        The fitted LambdaMART model is local to this method (it is not kept
        on the instance); the AP@10 means for dev, test2016 and test2017 are
        printed.
        """

        def assemble(bm25scores, softcosinescores):
            """Return (X, y, qids) for one split, one row per (q1id, q2id)."""
            X, y, qids = [], [], []
            for q1id in bm25scores:
                for q2id in bm25scores[q1id]:
                    qids.append(q1id)
                    X.append([
                        bm25scores[q1id][q2id][0],
                        softcosinescores[q1id][q2id][0],
                    ])
                    # The target is read from the BM25 entry.
                    y.append(bm25scores[q1id][q2id][1])
            return X, y, qids

        print('Initializing BM25...')
        lowercase, stop, punctuation = self.lowercase['bm25'], self.stop[
            'bm25'], self.punctuation['bm25']
        path = os.path.join(
            'ensemble', 'bm25.lower_' + str(lowercase) + '.stop_' + str(stop) +
            '.punct_' + str(punctuation))
        if not os.path.exists(path):
            self.bm25 = SemevalBM25(stop=stop,
                                    lowercase=lowercase,
                                    punctuation=punctuation,
                                    proctrain=True)
            self.trainbm25 = self.format(self.bm25.test(self.bm25.traindata))
            self.devbm25 = self.format(self.bm25.validate())
            self.test2016bm25 = self.format(
                self.bm25.test(self.bm25.test2016data))
            self.test2017bm25 = self.format(
                self.bm25.test(self.bm25.test2017data))
            # The model is only needed to produce the scores.
            del self.bm25

            data = {
                'train': self.trainbm25,
                'dev': self.devbm25,
                'test2016': self.test2016bm25,
                'test2017': self.test2017bm25
            }
            # Context manager so the cache file is flushed and closed.
            with open(path, 'wb') as handle:
                p.dump(data, handle)
        else:
            with open(path, 'rb') as handle:
                data = p.load(handle)
            self.trainbm25 = data['train']
            self.devbm25 = data['dev']
            self.test2016bm25 = data['test2016']
            self.test2017bm25 = data['test2017']

        print('Initializing Translation...')
        vector = self.vector['translation']
        lowercase, stop, punctuation = self.lowercase[
            'translation'], self.stop['translation'], self.punctuation[
                'translation']
        path = os.path.join(
            'ensemble', 'translation.lower_' + str(lowercase) + '.stop_' +
            str(stop) + '.punct_' + str(punctuation) + '.vector_' +
            str(vector) + '.vecdim_' + str(self.w2vdim))
        if not os.path.exists(path):
            self.translation = SemevalTranslation(alpha=self.alpha,
                                                  sigma=self.sigma,
                                                  punctuation=punctuation,
                                                  proctrain=True,
                                                  vector=vector,
                                                  stop=stop,
                                                  lowercase=lowercase,
                                                  w2vdim=self.w2vdim)
            self.traintranslation = self.format(
                self.translation.test(self.translation.traindata,
                                      self.translation.trainidx,
                                      self.translation.trainelmo))
            self.devtranslation = self.format(self.translation.validate())
            self.test2016translation = self.format(
                self.translation.test(self.translation.test2016data,
                                      self.translation.test2016idx,
                                      self.translation.test2016elmo))
            self.test2017translation = self.format(
                self.translation.test(self.translation.test2017data,
                                      self.translation.test2017idx,
                                      self.translation.test2017elmo))
            del self.translation

            data = {
                'train': self.traintranslation,
                'dev': self.devtranslation,
                'test2016': self.test2016translation,
                'test2017': self.test2017translation
            }
            with open(path, 'wb') as handle:
                p.dump(data, handle)
        else:
            with open(path, 'rb') as handle:
                data = p.load(handle)
            self.traintranslation = data['train']
            self.devtranslation = data['dev']
            self.test2016translation = data['test2016']
            self.test2017translation = data['test2017']

        print('Initializing Softcosine...')
        vector = self.vector['softcosine']
        lowercase, stop, punctuation = self.lowercase['softcosine'], self.stop[
            'softcosine'], self.punctuation['softcosine']
        path = os.path.join(
            'ensemble', 'softcosine.lower_' + str(lowercase) + '.stop_' +
            str(stop) + '.punct_' + str(punctuation) + '.vector_' +
            str(vector) + '.vecdim_' + str(self.w2vdim))
        if not os.path.exists(path):
            self.softcosine = SemevalSoftCosine(stop=stop,
                                                vector=vector,
                                                lowercase=lowercase,
                                                punctuation=punctuation,
                                                proctrain=True,
                                                w2vdim=self.w2vdim)
            self.trainsoftcosine = self.format(
                self.softcosine.test(self.softcosine.traindata,
                                     self.softcosine.trainidx,
                                     self.softcosine.trainelmo))
            self.devsoftcosine = self.format(self.softcosine.validate())
            self.test2016softcosine = self.format(
                self.softcosine.test(self.softcosine.test2016data,
                                     self.softcosine.test2016idx,
                                     self.softcosine.test2016elmo))
            self.test2017softcosine = self.format(
                self.softcosine.test(self.softcosine.test2017data,
                                     self.softcosine.test2017idx,
                                     self.softcosine.test2017elmo))
            del self.softcosine

            data = {
                'train': self.trainsoftcosine,
                'dev': self.devsoftcosine,
                'test2016': self.test2016softcosine,
                'test2017': self.test2017softcosine
            }
            with open(path, 'wb') as handle:
                p.dump(data, handle)
        else:
            with open(path, 'rb') as handle:
                data = p.load(handle)
            self.trainsoftcosine = data['train']
            self.devsoftcosine = data['dev']
            self.test2016softcosine = data['test2016']
            self.test2017softcosine = data['test2017']

        # Kernel scores have no fallback: the pickle must already exist.
        vector = self.vector['kernel']
        lowercase = self.lowercase['kernel']
        path = os.path.join(
            'ensemble',
            'kernel.lower_' + str(lowercase) + '.vector_' + str(vector))
        with open(path, 'rb') as handle:
            data = p.load(handle)
        self.trainkernel = data['train']
        self.devkernel = data['dev']
        self.test2016kernel = data['test2016']
        self.test2017kernel = data['test2017']

        print('Initializing LambdaMART...')
        TX, Ty, Tqids = assemble(self.trainbm25, self.trainsoftcosine)
        if self.scale:
            # Fit on the training split only; reuse it for the other splits.
            self.scaler = MinMaxScaler(feature_range=(-1, 1))
            self.scaler.fit(TX)
            TX = self.scaler.transform(TX)

        VX, Vy, Vqids = assemble(self.devbm25, self.devsoftcosine)
        if self.scale:
            VX = self.scaler.transform(VX)

        E2016X, E2016y, E2016qids = assemble(self.test2016bm25,
                                             self.test2016softcosine)
        if self.scale:
            E2016X = self.scaler.transform(E2016X)

        E2017X, E2017y, E2017qids = assemble(self.test2017bm25,
                                             self.test2017softcosine)
        if self.scale:
            E2017X = self.scaler.transform(E2017X)

        metric = pyltr.metrics.AP(k=10)

        # The dev split is handed to the monitor that watches training.
        monitor = pyltr.models.monitors.ValidationMonitor(VX,
                                                          Vy,
                                                          Vqids,
                                                          metric=metric,
                                                          stop_after=250)

        model = pyltr.models.LambdaMART(
            metric=metric,
            n_estimators=1000,
            learning_rate=0.02,
            max_features=0.5,
            query_subsample=0.5,
            max_leaf_nodes=10,
            min_samples_leaf=64,
            verbose=1,
        )

        model.fit(TX, Ty, Tqids, monitor=monitor)

        Vpred = model.predict(VX)
        print('Dev:', metric.calc_mean(Vqids, np.array(Vy), Vpred))
        E2016pred = model.predict(E2016X)
        print('Test 2016:',
              metric.calc_mean(E2016qids, np.array(E2016y), E2016pred))
        E2017pred = model.predict(E2017X)
        print('Test 2017:',
              metric.calc_mean(E2017qids, np.array(E2017y), E2017pred))
Exemple #6
0
class SemevalSVM(Semeval):
    """Classifier over similarity scores between a query and its candidates.

    Every (query, candidate question) pair becomes a vector that holds, for
    each enabled feature family, the score against the candidate question
    followed by one score per candidate comment.  The vectors are min-max
    scaled to ``(-1, 1)`` and handed to ``Model``: an SVM when
    ``model == 'svm'``, a regression otherwise.  Extracted features are
    cached as pickles under ``feature/<split>/<path>``.
    """

    # Order in which the enabled feature families are laid out in a vector.
    _FEATURE_ORDER = ('bm25', 'softcosine', 'translation', 'cosine')

    def __init__(self,
                 model='svm',
                 features='bm25,',
                 comment_features='bm25,',
                 stop=True,
                 vector='word2vec',
                 lowercase=True,
                 punctuation=True,
                 proctrain=True,
                 path=FEATURES_PATH,
                 alpha=0.1,
                 sigma=0.9,
                 gridsearch='random'):
        """Instantiate the requested scorers and train immediately.

        Args:
            model: ``'svm'`` trains an SVM; any other value trains the
                regression model.
            features: comma-separated feature families to extract
                (``bm25``, ``softcosine``, ``translation``, ``cosine``).
            comment_features: comma-separated families; in this class it
                only influences which scorers get instantiated.
            stop, lowercase, punctuation, proctrain: forwarded to the base
                class and/or to the individual scorers.
            vector: forwarded to the base class and to the soft-cosine
                scorer; the value ``'alignments'`` switches the soft-cosine
                and translation scorers to their ``score`` entry point.
            path: file name of the feature cache inside each split folder.
            alpha, sigma: forwarded to ``SemevalTranslation``.
            gridsearch: forwarded to the ``Model`` training call.
        """
        Semeval.__init__(self,
                         stop=stop,
                         vector=vector,
                         lowercase=lowercase,
                         punctuation=punctuation)
        self.path = path
        self.features = features.split(',')
        self.comment_features = comment_features.split(',')
        self.gridsearch = gridsearch
        self.svm = Model()

        self.model = model
        requested = self.features + self.comment_features
        # A scorer is only built when some feature list names it.
        self.bm25 = SemevalBM25(
            stop=stop,
            lowercase=lowercase,
            punctuation=punctuation,
            proctrain=proctrain
        ) if 'bm25' in requested else None
        self.cosine = SemevalCosine(
            stop=stop,
            lowercase=lowercase,
            punctuation=punctuation,
            proctrain=proctrain
        ) if 'cosine' in requested else None
        self.softcosine = SemevalSoftCosine(
            stop=stop,
            lowercase=lowercase,
            punctuation=punctuation,
            proctrain=proctrain,
            vector=vector
        ) if 'softcosine' in requested else None
        self.translation = SemevalTranslation(
            alpha=alpha,
            sigma=sigma,
            stop=stop,
            lowercase=lowercase,
            punctuation=punctuation,
            proctrain=proctrain,
            vector=self.vector
        ) if 'translation' in requested else None

        self.train()

    @staticmethod
    def _load_pickle(path):
        """Read one pickled object from ``path`` and close the file."""
        with open(path, 'rb') as handle:
            return p.load(handle)

    @staticmethod
    def _dump_pickle(obj, path):
        """Pickle ``obj`` to ``path`` and close the file."""
        with open(path, 'wb') as handle:
            p.dump(obj, handle)

    def _score(self, feature, q1, q1_emb, other_id, other, elmoidx, elmovec):
        """Score ``q1`` against one text (question or comment) for a family.

        ``other_id`` / ``other`` are the id and tokens of the text being
        compared with the query.  For ``translation`` the third element of
        the scorer's result (``trlmprob``) is returned.
        """
        if feature == 'bm25':
            # BM25 is queried by the id of the other text, not its tokens.
            return self.bm25.model(q1, other_id)
        if feature == 'cosine':
            return self.cosine.model(q1, other)

        use_alignments = self.vector == 'alignments'
        if feature == 'softcosine':
            if use_alignments:
                return self.softcosine.model.score(q1, other, self.alignments)
            other_emb = self.encode(other_id, other, elmoidx, elmovec)
            return self.softcosine.model(q1, q1_emb, other, other_emb)

        # Remaining family: translation.
        if use_alignments:
            _, _, trlmprob, _ = self.translation.model.score(q1, other)
        else:
            other_emb = self.encode(other_id, other, elmoidx, elmovec)
            _, _, trlmprob, _ = self.translation.model(
                q1, q1_emb, other, other_emb)
        return trlmprob

    def extract_features(self, procdata, elmoidx, elmovec):
        """Turn every (q1id, q2id) pair of ``procdata`` into a feature vector.

        For each family listed in ``self.features`` (in ``_FEATURE_ORDER``)
        the vector receives the score against the candidate question and
        then one score per comment; a comment without tokens contributes 0.
        All listed families are extracted, and comments are scored against
        the comment text itself.

        Args:
            procdata: mapping ``q1id -> q2id -> pair``, where ``pair`` has
                the keys ``q1``, ``q2``, ``comments`` and ``label``.
            elmoidx, elmovec: passed through to ``self.encode``.

        Returns:
            ``(feat, X, y)`` where ``feat[q1id][q2id] == (x, label)`` and
            ``X`` / ``y`` list the same vectors and labels in iteration
            order.
        """
        X, y = [], []
        feat = {}
        total = len(procdata)
        active = [name for name in self._FEATURE_ORDER
                  if name in self.features]
        for i, q1id in enumerate(procdata):
            feat[q1id] = {}
            percentage = round(float(i + 1) / total, 2)
            print('Extracting features: ',
                  percentage,
                  i + 1,
                  sep='\t',
                  end='\r')
            for q2id in procdata[q1id]:
                query_question = procdata[q1id][q2id]
                q1, q2 = query_question['q1'], query_question['q2']
                x = []

                q1_emb = self.encode(q1id, q1, elmoidx, elmovec)

                for feature in active:
                    x.append(
                        self._score(feature, q1, q1_emb, q2id, q2, elmoidx,
                                    elmovec))

                    for comment in query_question['comments']:
                        q3id = comment['id']
                        q3 = comment['tokens']

                        if len(q3) > 0:
                            x.append(
                                self._score(feature, q1, q1_emb, q3id, q3,
                                            elmoidx, elmovec))
                        else:
                            # Keep the vector length fixed for empty comments.
                            x.append(0)

                y_ = query_question['label']
                feat[q1id][q2id] = (x, y_)
                X.append(x)
                y.append(y_)
        return feat, X, y

    def train(self):
        """Load or extract the training features, fit the scaler and model.

        The features come from ``feature/train/<path>`` when that file
        exists; otherwise they are extracted and written there.  The fitted
        ``MinMaxScaler`` is kept in ``self.scaler`` for later splits.
        """
        self.X, self.y = [], []
        path = os.path.join('feature', 'train', self.path)
        if not os.path.exists(path):
            feat, self.X, self.y = self.extract_features(
                self.traindata, self.trainidx, self.trainelmo)
            self._dump_pickle(feat, path)
        else:
            feat = self._load_pickle(path)
            for q1id in feat:
                for q2id in feat[q1id]:
                    self.X.append(feat[q1id][q2id][0])
                    self.y.append(feat[q1id][q2id][1])

        self.X = np.array(self.X)
        self.scaler = MinMaxScaler(feature_range=(-1, 1))
        self.scaler.fit(self.X)
        self.X = self.scaler.transform(self.X)

        if self.model == 'svm':
            self.svm.train_svm(trainvectors=self.X,
                               labels=self.y,
                               c='search',
                               kernel='search',
                               gamma='search',
                               jobs=10,
                               gridsearch=self.gridsearch)
        else:
            self.svm.train_regression(trainvectors=self.X,
                                      labels=self.y,
                                      c='search',
                                      penalty='search',
                                      tol='search',
                                      gridsearch=self.gridsearch,
                                      jobs=10)

    def _rank(self, feat):
        """Score every cached pair with the trained model.

        Returns:
            ``(ranking, y_real, y_pred, parameter_settings)`` where
            ``ranking[q1id]`` is a list of ``(real_label, score, q2id)``.
        """
        ranking = {}
        y_real, y_pred = [], []
        for q1id in feat:
            ranking[q1id] = []
            for q2id in feat[q1id]:
                x = feat[q1id][q2id][0]
                real_label = feat[q1id][q2id][1]

                # Apply the scaling learned on the training split.
                x = self.scaler.transform([x])[0]
                score, pred_label = self.svm.score(x)

                y_pred.append(pred_label)
                y_real.append(real_label)
                ranking[q1id].append((real_label, score, q2id))

        parameter_settings = self.svm.return_parameter_settings(clf=self.model)
        return ranking, y_real, y_pred, parameter_settings

    def validate(self):
        """Rank the dev split, using ``feature/dev/<path>`` as cache.

        Returns:
            ``(ranking, y_real, y_pred, parameter_settings)``.
        """
        path = os.path.join('feature', 'dev', self.path)
        if not os.path.exists(path):
            feat, _, _ = self.extract_features(self.devdata, self.devidx,
                                               self.develmo)
            self._dump_pickle(feat, path)
        else:
            feat = self._load_pickle(path)

        return self._rank(feat)

    def test(self, testdata, elmoidx, elmovec, test_='test2016'):
        """Rank a test split, caching its features on disk.

        Args:
            testdata: data to extract features from on a cache miss; it is
                also stored in ``self.testdata``.
            elmoidx, elmovec: passed through to ``extract_features``.
            test_: ``'test2016'`` selects ``feature/test2016``; any other
                value selects ``feature/test2017``.

        Returns:
            ``(ranking, y_real, y_pred, parameter_settings)``.
        """
        split = 'test2016' if test_ == 'test2016' else 'test2017'
        path = os.path.join('feature', split, self.path)

        self.testdata = testdata
        if not os.path.exists(path):
            feat, _, _ = self.extract_features(self.testdata, elmoidx, elmovec)
            self._dump_pickle(feat, path)
        else:
            feat = self._load_pickle(path)

        return self._rank(feat)
Exemple #7
0
def run_softcosine(stop, lowercase, punctuation, proctrain, vector,
                   evaluation_path):
    """Run the soft-cosine model on dev and both test sets and save results.

    A ``SemevalSoftCosine`` model is built from the given options, the dev
    set is ranked with ``validate()`` and the 2016 / 2017 test sets with
    ``test()``.  Each ranking is saved under its evaluation folder using
    ``evaluation_path`` as the file name, and the dev ranking is evaluated
    against the dev gold file; the resulting MAP values are printed.
    """
    softcosine = SemevalSoftCosine(stop=stop,
                                   vector=vector,
                                   lowercase=lowercase,
                                   punctuation=punctuation,
                                   proctrain=proctrain)

    dev_ranking = softcosine.validate()
    ranking_2016 = softcosine.test(softcosine.test2016data,
                                   softcosine.test2016idx,
                                   softcosine.test2016elmo)
    ranking_2017 = softcosine.test(softcosine.test2017data,
                                   softcosine.test2017idx,
                                   softcosine.test2017elmo)

    # Saved in the same order as before: test 2016, test 2017, then dev.
    outputs = (
        (ranking_2016, os.path.join(TEST2016_EVAL_PATH, evaluation_path)),
        (ranking_2017, os.path.join(TEST2017_EVAL_PATH, evaluation_path)),
        (dev_ranking, os.path.join(DEV_EVAL_PATH, evaluation_path)),
    )
    for ranking, target in outputs:
        softcosine.save(ranking=ranking, path=target, parameter_settings='')

    # Evaluate on a shallow copy of the dev ranking.
    gold = prepare_gold(DEV_GOLD_PATH)
    map_baseline, map_model = evaluate(copy.copy(dev_ranking), gold)

    print('Evaluation: ', evaluation_path)
    print('MAP baseline: ', map_baseline)
    print('MAP model: ', map_model)
    print('-' * 10)