Ejemplo n.º 1
0
    def __init__(self,
                 alpha=0,
                 decay=1,
                 ignore_leaves=True,
                 smoothed=True,
                 vector='word2vec',
                 w2vdim=300,
                 lowercase=True,
                 tree='tree',
                 kernel_path=KERNEL_PATH):
        """Configure the tree-kernel model and train it right away.

        The ``Semeval`` initialiser is invoked with ``stop=False`` and
        ``punctuation=False``; ``alpha``, ``decay``, ``ignore_leaves``,
        ``smoothed`` and ``lowercase`` are forwarded to ``TreeKernel``.
        ``tree`` and ``kernel_path`` are only stored (as ``self.tree`` and
        ``self.path``) for later use by other methods.

        Training (``self.train()``) runs before the constructor returns.
        """
        base_options = dict(vector=vector,
                            stop=False,
                            lowercase=lowercase,
                            punctuation=False,
                            w2vdim=w2vdim)
        Semeval.__init__(self, **base_options)

        self.path, self.tree = kernel_path, tree
        self.memoization = {}
        self.svm = Model()
        self.flat_traindata()

        kernel_options = dict(alpha=alpha,
                              decay=decay,
                              ignore_leaves=ignore_leaves,
                              smoothed=smoothed,
                              lowercase=lowercase)
        self.treekernel = TreeKernel(**kernel_options)

        self.train()

        # ``additional`` is not assigned in this method (presumably the base
        # initialiser provides it -- confirm); it is dropped after training.
        del self.additional
Ejemplo n.º 2
0
    def __init__(self,
                 model='svm',
                 features='bm25,',
                 comment_features='bm25,',
                 stop=True,
                 vector='word2vec',
                 lowercase=True,
                 punctuation=True,
                 proctrain=True,
                 path=FEATURES_PATH,
                 alpha=0.1,
                 sigma=0.9,
                 gridsearch='random'):
        """Build the feature extractors that were requested and train.

        ``features`` and ``comment_features`` are comma-separated names; they
        are split on ``','`` and stored as lists (a trailing comma therefore
        leaves an empty-string entry in the list).  An extractor (BM25,
        cosine, soft cosine, translation) is instantiated only when its name
        occurs in either list; otherwise the matching attribute is ``None``.

        ``self.train()`` is called at the end of construction.
        """
        Semeval.__init__(self,
                         stop=stop,
                         vector=vector,
                         lowercase=lowercase,
                         punctuation=punctuation)
        self.path = path
        self.features = features.split(',')
        self.comment_features = comment_features.split(',')
        self.gridsearch = gridsearch
        self.svm = Model()

        self.model = model

        # Names requested for either questions or comments.
        requested = self.features + self.comment_features
        # Preprocessing switches shared by every extractor.
        preprocessing = dict(stop=stop,
                             lowercase=lowercase,
                             punctuation=punctuation,
                             proctrain=proctrain)

        if 'bm25' in requested:
            self.bm25 = SemevalBM25(**preprocessing)
        else:
            self.bm25 = None

        if 'cosine' in requested:
            self.cosine = SemevalCosine(**preprocessing)
        else:
            self.cosine = None

        if 'softcosine' in requested:
            self.softcosine = SemevalSoftCosine(**preprocessing, vector=vector)
        else:
            self.softcosine = None

        if 'translation' in requested:
            # Uses the vector setting as stored on the instance, not the raw
            # argument, exactly as the other extractors are left untouched.
            self.translation = SemevalTranslation(alpha=alpha,
                                                  sigma=sigma,
                                                  **preprocessing,
                                                  vector=self.vector)
        else:
            self.translation = None

        self.train()
Ejemplo n.º 3
0
    def __init__(self, model='svm', features='bm25,', comment_features='bm25,', stop=True, vector='word2vec', path=FEATURES_PATH, alpha=0.1, sigma=0.9, gridsearch='random'):
        """Build the requested Quora feature extractors and train.

        ``features`` and ``comment_features`` are comma-separated names that
        are split on ``','`` and stored as lists.  Each extractor attribute
        (``bm25``, ``cosine``, ``softcosine``, ``translation``) is created
        only if its name appears in either list, and is ``None`` otherwise.
        ``self.train()`` runs before the constructor returns.
        """
        Quora.__init__(self, stop=stop, vector=vector)
        self.path = path
        self.features = features.split(',')
        self.comment_features = comment_features.split(',')
        self.gridsearch = gridsearch
        self.svm = Model()

        self.model = model

        # Names requested for either feature group.
        requested = self.features + self.comment_features

        if 'bm25' in requested:
            self.bm25 = QuoraBM25(stop=stop)
        else:
            self.bm25 = None

        if 'cosine' in requested:
            self.cosine = QuoraCosine(stop=stop)
        else:
            self.cosine = None

        if 'softcosine' in requested:
            self.softcosine = QuoraSoftCosine(stop=stop, vector=vector)
        else:
            self.softcosine = None

        if 'translation' in requested:
            self.translation = QuoraTranslations(alpha=alpha, sigma=sigma, stop=stop, vector=self.vector)
        else:
            self.translation = None

        self.train()
Ejemplo n.º 4
0
    def __init__(self, stop={}, lowercase={}, punctuation={}, vector={}, scale=True, alpha=0.9, sigma=0.1):
        """Load the data, train the ensemble and dump the test ranking.

        All arguments are stored unchanged on the instance.  After
        ``self.load()`` and ``self.train()`` have run, the value returned by
        ``self.test()`` is serialised with ``p.dump`` to the file
        ``reranking`` inside ``SEMI_PATH``.

        NOTE: the ``{}`` defaults are shared between calls; this method only
        stores them, so they must not be mutated through the instance.
        """
        self.stop = stop
        self.lowercase = lowercase
        self.punctuation = punctuation
        self.scale = scale
        self.vector = vector
        self.alpha = alpha
        self.sigma = sigma

        self.questions, self.ranking = self.load()
        self.ensemble = Model()
        self.train()

        ranking = self.test()
        # Use a context manager so the output file is flushed and closed
        # deterministically instead of relying on garbage collection.
        with open(os.path.join(SEMI_PATH, 'reranking'), 'wb') as handle:
            p.dump(ranking, handle)
Ejemplo n.º 5
0
    def __init__(self,
                 stop={},
                 lowercase={},
                 punctuation={},
                 vector={},
                 scale=True,
                 w2vdim=300,
                 kernel_path='',
                 alpha=0.8,
                 sigma=0.2):
        """Store the configuration, create the ensemble model and train.

        Every argument is kept unchanged as an attribute of the same name.
        ``self.theta`` starts at ``0.9`` and ``self.ensemble`` is a fresh
        ``Model``; ``self.train()`` is called before the constructor returns.
        """
        # Preprocessing / embedding settings.
        self.stop, self.lowercase, self.punctuation = stop, lowercase, punctuation
        self.vector, self.w2vdim = vector, w2vdim
        self.scale = scale
        self.kernel_path = kernel_path

        # Numeric hyper-parameters.
        self.alpha, self.sigma = alpha, sigma
        self.theta = 0.9

        self.ensemble = Model()
        self.train()
Ejemplo n.º 6
0
class SemevalEnsemble:
    def __init__(self,
                 stop={},
                 lowercase={},
                 punctuation={},
                 vector={},
                 scale=True,
                 w2vdim=300,
                 kernel_path='',
                 alpha=0.8,
                 sigma=0.2):
        """Store the configuration, create the ensemble model and train.

        ``stop``, ``lowercase``, ``punctuation`` and ``vector`` are indexed by
        component name elsewhere in the class (``'kernel'``, ``'bm25'``,
        ``'translation'``, ``'softcosine'``), so the empty defaults cannot be
        used as-is.  ``self.theta`` starts at ``0.9`` and may be replaced by
        ``train()``, which is called before the constructor returns.
        """
        # Preprocessing / embedding settings.
        self.stop, self.lowercase, self.punctuation = stop, lowercase, punctuation
        self.vector, self.w2vdim = vector, w2vdim
        self.scale = scale
        self.kernel_path = kernel_path

        # Numeric hyper-parameters.
        self.alpha, self.sigma = alpha, sigma
        self.theta = 0.9

        self.ensemble = Model()
        self.train()

    def format(self, ranking):
        new_ranking = {}
        for q1id in ranking:
            new_ranking[q1id] = {}
            for question in ranking[q1id]:
                real_label, score, q2id = question
                new_ranking[q1id][q2id] = (score, real_label)
        return new_ranking

    def train(self):
        """Train the components, then tune ``self.theta`` on the dev set.

        After the kernel and the classifier have been trained, every value in
        ``0, 0.1, ..., 1.0`` is tried as the interpolation weight between the
        classifier score and the kernel score.  The value that yields the
        highest MAP reported by ``evaluate`` (strictly above the best so far,
        starting from 0.0) is stored in ``self.theta``; if none does,
        ``self.theta`` is left untouched.
        """
        # self.train_feature()
        self.train_kernel()
        self.train_classifier()

        # The classifier and kernel scores do not depend on theta, so they
        # are computed once here instead of once per candidate theta.
        scored = {}
        for q1id in self.devkernel:
            scored[q1id] = []
            for q2id in self.devkernel[q1id]:
                X = [
                    self.devbm25[q1id][q2id][0],
                    self.devtranslation[q1id][q2id][0],
                    self.devsoftcosine[q1id][q2id][0],
                ]

                if self.scale:
                    X = self.scaler.transform([X])[0]
                clfscore, pred_label = self.ensemble.score(X)

                kernelscore = self.devkernel[q1id][q2id][0]
                scored[q1id].append((pred_label, clfscore, kernelscore, q2id))

        # finding theta in development set
        thetas = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
        best_map = 0.0
        for theta in thetas:
            ranking = {}
            for q1id in scored:
                ranking[q1id] = []
                for pred_label, clfscore, kernelscore, q2id in scored[q1id]:
                    score = (theta * clfscore) + ((1 - theta) * kernelscore)
                    ranking[q1id].append((pred_label, score, q2id))

            map_baseline, map_model = evaluate(copy.copy(ranking),
                                               prepare_gold(DEV_GOLD_PATH))
            if map_model > best_map:
                best_map = copy.copy(map_model)
                print('MAP baseline', map_baseline)
                print('MAP: ', map_model)
                print(10 * '-')
                self.theta = theta

    def train_kernel(self):
        vector = self.vector['kernel']
        lowercase = self.lowercase['kernel']
        path = os.path.join(
            'ensemble', 'kernel.lower_' + str(lowercase) + '.vector_' +
            vector + '.vecdim_' + str(self.w2vdim))
        if not os.path.exists(path):
            self.kernel = SemevalTreeKernel(smoothed=True,
                                            vector=vector,
                                            lowercase=lowercase,
                                            tree='subj_tree',
                                            kernel_path=self.kernel_path,
                                            w2vdim=self.w2vdim)
            self.trainkernel, _, _, _ = self.kernel.test(self.kernel.traindata,
                                                         self.kernel.trainidx,
                                                         self.kernel.trainelmo,
                                                         test_='train')
            self.trainkernel = self.format(self.trainkernel)

            self.devkernel, _, _, _ = self.kernel.validate()
            self.devkernel = self.format(self.devkernel)

            self.test2016kernel, _, _, _ = self.kernel.test(
                self.kernel.test2016data,
                self.kernel.test2016idx,
                self.kernel.test2016elmo,
                test_='test2016')
            self.test2016kernel = self.format(self.test2016kernel)

            self.test2017kernel, _, _, _ = self.kernel.test(
                self.kernel.test2017data,
                self.kernel.test2017idx,
                self.kernel.test2017elmo,
                test_='test2017')
            self.test2017kernel = self.format(self.test2017kernel)

            data = {
                'train': self.trainkernel,
                'dev': self.devkernel,
                'test2016': self.test2016kernel,
                'test2017': self.test2017kernel
            }
            p.dump(data, open(path, 'wb'))
        else:
            data = p.load(open(path, 'rb'))
            self.trainkernel = data['train']
            self.devkernel = data['dev']
            self.test2016kernel = data['test2016']
            self.test2017kernel = data['test2017']

    def train_classifier(self):
        lowercase, stop, punctuation = self.lowercase['bm25'], self.stop[
            'bm25'], self.punctuation['bm25']
        path = os.path.join(
            'ensemble', 'bm25.lower_' + str(lowercase) + '.stop_' + str(stop) +
            '.punct_' + str(punctuation))
        if not os.path.exists(path):
            self.bm25 = SemevalBM25(stop=stop,
                                    lowercase=lowercase,
                                    punctuation=punctuation,
                                    proctrain=True)
            self.trainbm25 = self.format(self.bm25.test(self.bm25.traindata))
            self.devbm25 = self.format(self.bm25.validate())
            self.test2016bm25 = self.format(
                self.bm25.test(self.bm25.test2016data))
            self.test2017bm25 = self.format(
                self.bm25.test(self.bm25.test2017data))
            del self.bm25

            data = {
                'train': self.trainbm25,
                'dev': self.devbm25,
                'test2016': self.test2016bm25,
                'test2017': self.test2017bm25
            }
            p.dump(data, open(path, 'wb'))
        else:
            data = p.load(open(path, 'rb'))
            self.trainbm25 = data['train']
            self.devbm25 = data['dev']
            self.test2016bm25 = data['test2016']
            self.test2017bm25 = data['test2017']

        vector = self.vector['translation']
        lowercase, stop, punctuation = self.lowercase[
            'translation'], self.stop['translation'], self.punctuation[
                'translation']
        path = os.path.join(
            'ensemble', 'translation.lower_' + str(lowercase) + '.stop_' +
            str(stop) + '.punct_' + str(punctuation) + '.vector_' +
            str(vector) + '.vecdim_' + str(self.w2vdim))
        if not os.path.exists(path):
            self.translation = SemevalTranslation(alpha=self.alpha,
                                                  sigma=self.sigma,
                                                  punctuation=punctuation,
                                                  proctrain=True,
                                                  vector=vector,
                                                  stop=stop,
                                                  lowercase=lowercase,
                                                  w2vdim=self.w2vdim)
            self.traintranslation = self.format(
                self.translation.test(self.translation.traindata,
                                      self.translation.trainidx,
                                      self.translation.trainelmo))
            self.devtranslation = self.format(self.translation.validate())
            self.test2016translation = self.format(
                self.translation.test(self.translation.test2016data,
                                      self.translation.test2016idx,
                                      self.translation.test2016elmo))
            self.test2017translation = self.format(
                self.translation.test(self.translation.test2017data,
                                      self.translation.test2017idx,
                                      self.translation.test2017elmo))
            del self.translation

            data = {
                'train': self.traintranslation,
                'dev': self.devtranslation,
                'test2016': self.test2016translation,
                'test2017': self.test2017translation
            }
            p.dump(data, open(path, 'wb'))
        else:
            data = p.load(open(path, 'rb'))
            self.traintranslation = data['train']
            self.devtranslation = data['dev']
            self.test2016translation = data['test2016']
            self.test2017translation = data['test2017']

        vector = self.vector['softcosine']
        lowercase, stop, punctuation = self.lowercase['softcosine'], self.stop[
            'softcosine'], self.punctuation['softcosine']
        path = os.path.join(
            'ensemble', 'softcosine.lower_' + str(lowercase) + '.stop_' +
            str(stop) + '.punct_' + str(punctuation) + '.vector_' +
            str(vector) + '.vecdim_' + str(self.w2vdim))
        if not os.path.exists(path):
            self.softcosine = SemevalSoftCosine(stop=stop,
                                                vector=vector,
                                                lowercase=lowercase,
                                                punctuation=punctuation,
                                                proctrain=True,
                                                w2vdim=self.w2vdim)
            self.trainsoftcosine = self.format(
                self.softcosine.test(self.softcosine.traindata,
                                     self.softcosine.trainidx,
                                     self.softcosine.trainelmo))
            self.devsoftcosine = self.format(self.softcosine.validate())
            self.test2016softcosine = self.format(
                self.softcosine.test(self.softcosine.test2016data,
                                     self.softcosine.test2016idx,
                                     self.softcosine.test2016elmo))
            self.test2017softcosine = self.format(
                self.softcosine.test(self.softcosine.test2017data,
                                     self.softcosine.test2017idx,
                                     self.softcosine.test2017elmo))
            del self.softcosine

            data = {
                'train': self.trainsoftcosine,
                'dev': self.devsoftcosine,
                'test2016': self.test2016softcosine,
                'test2017': self.test2017softcosine
            }
            p.dump(data, open(path, 'wb'))
        else:
            data = p.load(open(path, 'rb'))
            self.trainsoftcosine = data['train']
            self.devsoftcosine = data['dev']
            self.test2016softcosine = data['test2016']
            self.test2017softcosine = data['test2017']

        self.X, self.y = [], []

        for q1id in self.trainbm25:
            for q2id in self.trainbm25[q1id]:
                X = [self.trainbm25[q1id][q2id][0]]
                X.append(self.traintranslation[q1id][q2id][0])
                X.append(self.trainsoftcosine[q1id][q2id][0])
                self.X.append(X)
                self.y.append(self.trainbm25[q1id][q2id][1])

        if self.scale:
            self.scaler = MinMaxScaler(feature_range=(-1, 1))
            self.scaler.fit(self.X)
            self.X = self.scaler.transform(self.X)
        self.ensemble.train_regression(trainvectors=self.X,
                                       labels=self.y,
                                       c='search',
                                       penalty='search',
                                       tol='search',
                                       gridsearch='brutal',
                                       jobs=10)

    def train_feature(self):
        """Compute mean-embedding cosine features for all four splits.

        A ``SemevalSoftCosine`` model (always with ``vector='word2vec'`` and
        the ``'softcosine'`` preprocessing settings) supplies the data and the
        ``encode`` method.  For every pair the encodings of both questions
        are averaged over axis 0 and compared with ``cosine_similarity``; the
        result is stored as ``(cos, 0)`` under
        ``self.<split>feature[q1id][q2id]``.
        """
        lowercase, stop, punctuation = self.lowercase['softcosine'], self.stop[
            'softcosine'], self.punctuation['softcosine']
        semeval = SemevalSoftCosine(stop=stop,
                                    vector='word2vec',
                                    lowercase=lowercase,
                                    punctuation=punctuation,
                                    proctrain=True)

        def fill(table, data, idx, elmo):
            # Populate ``table`` in place with one (cosine, 0) entry per pair.
            for q1id in data:
                table[q1id] = {}

                for q2id in data[q1id]:
                    pair = data[q1id][q2id]
                    q1, q2 = pair['q1'], pair['q2']

                    q1emb = np.mean(semeval.encode(q1id, q1, idx, elmo),
                                    axis=0)
                    q2emb = np.mean(semeval.encode(q2id, q2, idx, elmo),
                                    axis=0)

                    cos = cosine_similarity([q1emb], [q2emb])[0][0]
                    table[q1id][q2id] = (cos, 0)

        self.trainfeature = {}
        fill(self.trainfeature, semeval.traindata, semeval.trainidx,
             semeval.trainelmo)

        self.devfeature = {}
        fill(self.devfeature, semeval.devdata, semeval.devidx,
             semeval.develmo)

        self.test2016feature = {}
        fill(self.test2016feature, semeval.test2016data, semeval.test2016idx,
             semeval.test2016elmo)

        self.test2017feature = {}
        fill(self.test2017feature, semeval.test2017data, semeval.test2017idx,
             semeval.test2017elmo)

    def test(self, set_='dev'):
        if set_ == 'dev':
            bm25 = self.devbm25
            translation = self.devtranslation
            softcosine = self.devsoftcosine
            kernel = self.devkernel
        elif set_ == 'train':
            bm25 = self.trainbm25
            translation = self.traintranslation
            softcosine = self.trainsoftcosine
            kernel = self.trainkernel
            feature = self.trainfeature
        elif set_ == 'test2016':
            bm25 = self.test2016bm25
            translation = self.test2016translation
            softcosine = self.test2016softcosine
            kernel = self.test2016kernel
        else:
            bm25 = self.test2017bm25
            translation = self.test2017translation
            softcosine = self.test2017softcosine
            kernel = self.test2017kernel

        ranking = {}
        y_real, y_pred = [], []
        for q1id in bm25:
            ranking[q1id] = []
            for q2id in bm25[q1id]:
                X = []
                X.append(bm25[q1id][q2id][0])
                X.append(translation[q1id][q2id][0])
                X.append(softcosine[q1id][q2id][0])

                if self.scale:
                    X = self.scaler.transform([X])[0]
                clfscore, pred_label = self.ensemble.score(X)
                y_pred.append(pred_label)

                real_label = 1 if bm25[q1id][q2id][1] == 'true' else 0
                y_real.append(real_label)

                kernelscore = kernel[q1id][q2id][0]
                score = (self.theta * clfscore) + (
                    (1 - self.theta) * kernelscore)

                ranking[q1id].append((pred_label, score, q2id))

        parameter_settings = self.ensemble.return_parameter_settings(
            clf='regression')
        parameter_settings = parameter_settings + ',gamma=' + str(
            self.theta) + ',alpha=' + str(self.alpha) + ',sigma=' + str(
                self.sigma)
        return ranking, y_real, y_pred, parameter_settings

    def save(self, ranking, path, parameter_settings):
        with open(path, 'w') as f:
            f.write(parameter_settings)
            f.write('\n')
            for q1id in ranking:
                for row in ranking[q1id]:
                    label = 'false'
                    if row[0] == 1:
                        label = 'true'
                    f.write('\t'.join([
                        str(q1id),
                        str(row[2]),
                        str(0),
                        str(row[1]), label, '\n'
                    ]))
Ejemplo n.º 7
0
class SemevalTreeKernel(Semeval):
    def __init__(self,
                 alpha=0,
                 decay=1,
                 ignore_leaves=True,
                 smoothed=True,
                 vector='word2vec',
                 w2vdim=300,
                 lowercase=True,
                 tree='tree',
                 kernel_path=KERNEL_PATH):
        """Configure the tree-kernel model and train it right away.

        The ``Semeval`` initialiser is invoked with ``stop=False`` and
        ``punctuation=False``; ``alpha``, ``decay``, ``ignore_leaves``,
        ``smoothed`` and ``lowercase`` are forwarded to ``TreeKernel``.
        ``tree`` selects which tree field of a pair is used (``'tree'`` for
        the plain trees, anything else for the ``subj_`` ones) and
        ``kernel_path`` is the cache file name used under ``kernel/<split>``.

        Training (``self.train()``) runs before the constructor returns.
        """
        base_options = dict(vector=vector,
                            stop=False,
                            lowercase=lowercase,
                            punctuation=False,
                            w2vdim=w2vdim)
        Semeval.__init__(self, **base_options)

        self.path, self.tree = kernel_path, tree
        self.memoization = {}
        self.svm = Model()
        self.flat_traindata()

        kernel_options = dict(alpha=alpha,
                              decay=decay,
                              ignore_leaves=ignore_leaves,
                              smoothed=smoothed,
                              lowercase=lowercase)
        self.treekernel = TreeKernel(**kernel_options)

        self.train()

        # ``additional`` is not assigned anywhere in this class (presumably
        # the base initialiser provides it -- confirm); dropped after training.
        del self.additional

    def memoize(self, q1id, q1, q1_emb, q1_token2lemma, q2id, q2, q2_emb,
                q2_token2lemma, alignments):
        """Return the tree-kernel value for a pair of ids, computing it once.

        Values are cached symmetrically in ``self.memoization`` under both
        ``[q1id][q2id]`` and ``[q2id][q1id]``.  On a miss the kernel is
        evaluated via ``self.treekernel`` with the given trees, embeddings,
        token-to-lemma maps and alignments.
        """
        cache = self.memoization

        # Row for q1id is created on first sight; a hit returns immediately.
        by_q1 = cache.setdefault(q1id, {})
        if q2id in by_q1:
            return by_q1[q2id]

        # Same check from the other side, since the kernel is stored both ways.
        by_q2 = cache.setdefault(q2id, {})
        if q1id in by_q2:
            return by_q2[q1id]

        k = self.treekernel(q1, q1_emb, q1_token2lemma, q2, q2_emb,
                            q2_token2lemma, alignments)
        cache[q1id][q2id] = k
        cache[q2id][q1id] = k
        return k

    def flat_traindata(self):
        """Flatten ``self.traindata`` into the list ``self.flattraindata``.

        The nested ``{q1id: {q2id: pair}}`` mapping is walked in iteration
        order and every pair is collected into one flat list.
        """
        self.flattraindata = [
            self.traindata[q1id][q2id]
            for q1id in self.traindata
            for q2id in self.traindata[q1id]
        ]

    def get_alignment(self, c1, c2):
        """Return the matrix of alignment weights between two token sequences.

        The result has one row per item ``w`` of ``c1`` and one column per
        item ``t`` of ``c2``.  Each cell is
        ``self.alignments[t[0]][t][w[0]][w]``; when that lookup fails the
        cell is ``0.0``.
        """
        alignments = []
        for w in c1:
            row = []
            for t in c2:
                try:
                    w_t = self.alignments[t[0]][t][w[0]][w]
                except (LookupError, TypeError):
                    # Missing entry (KeyError), empty token (IndexError) or a
                    # token that cannot be indexed/hashed: no alignment.  A
                    # bare ``except`` would also hide KeyboardInterrupt and
                    # SystemExit.
                    w_t = 0.0
                row.append(w_t)
            alignments.append(row)
        return alignments

    def extract_features(self, procdata, elmoidx, elmovec):
        """Build kernel feature vectors for every pair in ``procdata``.

        For each question pair, the feature vector ``x`` has one entry per
        training pair in ``self.flattraindata``: the sum of the normalized
        tree kernel between the two first questions and the normalized tree
        kernel between the two second questions.

        ``elmoidx`` and ``elmovec`` are passed through to ``self.encode`` for
        the questions of ``procdata``; training pairs are always encoded with
        ``self.trainidx`` / ``self.trainelmo``.

        Returns ``(feat, X, y)`` where ``feat[q1id][q2id] = (x, label)``,
        ``X`` is the list of all ``x`` and ``y`` the list of all labels.
        """
        feat, X, y = {}, [], []

        for i, q1id in enumerate(procdata):
            feat[q1id] = {}
            # Fraction of outer items processed; only used for the progress print.
            percentage = round(float(i + 1) / len(procdata), 2)
            for q2id in procdata[q1id]:
                q_pair = procdata[q1id][q2id]

                x = []
                # NOTE(review): this rebinds the loop variable q1id to the id
                # stored in the pair; later lookups in this outer iteration
                # (procdata[q1id], feat[q1id]) assume it equals the dict key.
                q1id = q_pair['q1_id']
                q1 = q_pair['q1_full']
                q1_tree = q_pair['q1_tree'] if self.tree == 'tree' else q_pair[
                    'subj_q1_tree']
                q1_emb = self.encode(q1id, q1, elmoidx, elmovec)
                q1_token2lemma = dict(zip(q1, q_pair['q1_lemmas']))
                alignments = self.get_alignment(
                    q1, q1) if self.vector == 'alignments' else []
                # Self-kernel of q1, used below as a normalization term.
                kq1 = self.memoize(q1id, q1_tree, q1_emb, q1_token2lemma, q1id,
                                   q1_tree, q1_emb, q1_token2lemma, alignments)

                # Same rebinding as for q1id above, now for the inner loop variable.
                q2id = q_pair['q2_id']
                q2 = q_pair['q2_full']
                q2_tree = q_pair['q2_tree'] if self.tree == 'tree' else q_pair[
                    'subj_q2_tree']
                q2_emb = self.encode(q2id, q2, elmoidx, elmovec)
                q2_token2lemma = dict(zip(q2, q_pair['q2_lemmas']))
                alignments = self.get_alignment(
                    q2, q2) if self.vector == 'alignments' else []
                # Self-kernel of q2, used below as a normalization term.
                kq2 = self.memoize(q2id, q2_tree, q2_emb, q2_token2lemma, q2id,
                                   q2_tree, q2_emb, q2_token2lemma, alignments)

                # Progress line, overwritten in place ('\r'), every 10th outer item.
                if i % 10 == 0:
                    print('Path: ',
                          self.path,
                          'Progress: ',
                          percentage,
                          i + 1,
                          sep=10 * ' ',
                          end='\r')
                # One feature per training pair.
                for j, c in enumerate(self.flattraindata):
                    c1id = c['q1_id']
                    c1 = c['q1_full']
                    c1_tree = c['q1_tree'] if self.tree == 'tree' else c[
                        'subj_q1_tree']
                    c1_emb = self.encode(c1id, c1, self.trainidx,
                                         self.trainelmo)
                    c1_token2lemma = dict(zip(c1, c['q1_lemmas']))
                    alignments = self.get_alignment(
                        c1, c1) if self.vector == 'alignments' else []
                    # Self-kernel of the training pair's first question.
                    kc1 = self.memoize(c1id, c1_tree, c1_emb, c1_token2lemma,
                                       c1id, c1_tree, c1_emb, c1_token2lemma,
                                       alignments)

                    c2id = c['q2_id']
                    c2 = c['q2_full']
                    c2_tree = c['q2_tree'] if self.tree == 'tree' else c[
                        'subj_q2_tree']
                    c2_emb = self.encode(c2id, c2, self.trainidx,
                                         self.trainelmo)
                    c2_token2lemma = dict(zip(c2, c['q2_lemmas']))
                    alignments = self.get_alignment(
                        c2, c2) if self.vector == 'alignments' else []
                    # Self-kernel of the training pair's second question.
                    kc2 = self.memoize(c2id, c2_tree, c2_emb, c2_token2lemma,
                                       c2id, c2_tree, c2_emb, c2_token2lemma,
                                       alignments)

                    # A zero self-kernel would make the normalization divide
                    # by zero, so the similarity is defined as 0.0 instead.
                    if kq1 == 0 or kc1 == 0:
                        kq1c1 = 0.0
                    else:
                        alignments = self.get_alignment(
                            q1, c1) if self.vector == 'alignments' else []
                        kq1c1 = float(
                            self.memoize(q1id, q1_tree, q1_emb, q1_token2lemma,
                                         c1id, c1_tree, c1_emb, c1_token2lemma,
                                         alignments)) / np.sqrt(
                                             kq1 * kc1)  # normalized

                    if kq2 == 0 or kc2 == 0:
                        kq2c2 = 0.0
                    else:
                        alignments = self.get_alignment(
                            q2, c2) if self.vector == 'alignments' else []
                        kq2c2 = float(
                            self.memoize(q2id, q2_tree, q2_emb, q2_token2lemma,
                                         c2id, c2_tree, c2_emb, c2_token2lemma,
                                         alignments)) / np.sqrt(
                                             kq2 * kc2)  # normalized

                    # Pair similarity: question-side plus candidate-side kernel.
                    k = kq1c1 + kq2c2
                    x.append(k)

                y_ = q_pair['label']
                # Stored under the ids taken from the pair (see the note above).
                feat[q1id][q2id] = (x, y_)
                X.append(x)
                y.append(y_)
        return feat, X, y

    def train(self):
        """Load or compute the training kernel matrix and fit the SVM.

        Features are cached at ``kernel/train/<self.path>``.  If the file is
        missing they are extracted from the training data and written there;
        otherwise they are loaded and unpacked into ``self.X`` / ``self.y``.
        ``self.X`` is then converted to a NumPy array and passed to
        ``train_svm`` with ``kernel='precomputed'``.
        """
        path = os.path.join('kernel', 'train', self.path)
        self.X, self.y = [], []
        if not os.path.exists(path):
            feat, self.X, self.y = self.extract_features(
                self.traindata, self.trainidx, self.trainelmo)

            # Context manager: the cache is flushed and closed right away.
            with open(path, 'wb') as handle:
                p.dump(feat, handle)
        else:
            with open(path, 'rb') as handle:
                feat = p.load(handle)
            for q1id in feat:
                for q2id in feat[q1id]:
                    self.X.append(feat[q1id][q2id][0])
                    self.y.append(feat[q1id][q2id][1])

        self.X = np.array(self.X)
        self.svm.train_svm(trainvectors=self.X,
                           labels=self.y,
                           c='search',
                           kernel='precomputed',
                           gamma='search',
                           jobs=10)

    def validate(self):
        """Score the development set with the trained SVM.

        Features are cached at ``kernel/dev/<self.path>``; they are extracted
        and written there when the file is missing, and loaded otherwise.

        Returns ``(ranking, y_real, y_pred, parameter_settings)`` where
        ``ranking[q1id]`` is a list of ``(real_label, score, q2id)``.
        """
        path = os.path.join('kernel', 'dev', self.path)
        if not os.path.exists(path):
            # Only the per-pair feature dict is needed here.
            feat, _, _ = self.extract_features(self.devdata, self.devidx,
                                               self.develmo)
            with open(path, 'wb') as handle:
                p.dump(feat, handle)
        else:
            with open(path, 'rb') as handle:
                feat = p.load(handle)

        ranking = {}
        y_real, y_pred = [], []
        for q1id in feat:
            ranking[q1id] = []
            for q2id in feat[q1id]:
                X = feat[q1id][q2id][0]

                score, pred_label = self.svm.score(X)
                y_pred.append(pred_label)

                real_label = feat[q1id][q2id][1]
                y_real.append(real_label)
                ranking[q1id].append((real_label, score, q2id))

        parameter_settings = self.svm.return_parameter_settings(clf='svm')

        return ranking, y_real, y_pred, parameter_settings

    def test(self, testdata, elmoidx, elmovec, test_='test2016'):
        """Score ``testdata`` with the trained SVM.

        ``test_`` names the split and thereby the cache directory
        ``kernel/<split>/<self.path>``: ``'test2016'``, ``'train'`` or
        ``'dev'``; any other value uses ``test2017``.  Features are extracted
        and cached when the file is missing, and loaded otherwise.
        ``testdata`` is also kept as ``self.testdata``.

        Returns ``(ranking, y_real, y_pred, parameter_settings)`` where
        ``ranking[q1id]`` is a list of ``(real_label, score, q2id)``.
        """
        # 'dev' previously pointed at the test2016 cache directory, so dev
        # runs would read or overwrite the test2016 features; each named
        # split now uses its own directory (matching validate()).
        split = test_ if test_ in ('test2016', 'train', 'dev') else 'test2017'
        path = os.path.join('kernel', split, self.path)

        self.testdata = testdata
        if not os.path.exists(path):
            # Only the per-pair feature dict is needed here.
            feat, _, _ = self.extract_features(self.testdata, elmoidx, elmovec)
            with open(path, 'wb') as handle:
                p.dump(feat, handle)
        else:
            with open(path, 'rb') as handle:
                feat = p.load(handle)

        ranking = {}
        y_real, y_pred = [], []
        for q1id in feat:
            ranking[q1id] = []
            for q2id in feat[q1id]:
                X = feat[q1id][q2id][0]

                score, pred_label = self.svm.score(X)
                y_pred.append(pred_label)

                real_label = feat[q1id][q2id][1]
                y_real.append(real_label)
                ranking[q1id].append((real_label, score, q2id))

        parameter_settings = self.svm.return_parameter_settings(clf='svm')

        return ranking, y_real, y_pred, parameter_settings
Ejemplo n.º 8
0
class QuoraSVM(Quora):
    """Classifier over similarity scores computed for question pairs.

    Every pair becomes a feature vector holding the scores of the similarity
    models named in ``features`` (any of ``bm25``, ``softcosine``,
    ``translation``, ``cosine``).  The vectors are min-max scaled to
    ``[-1, 1]`` and passed to ``Model`` for training, which happens directly
    in the constructor.
    """

    def __init__(self, model='svm', features='bm25,', comment_features='bm25,', stop=True, vector='word2vec', path=FEATURES_PATH, alpha=0.1, sigma=0.9, gridsearch='random'):
        """Instantiate the requested similarity models and train the classifier.

        Args:
            model: ``'svm'`` trains through ``Model.train_svm``; any other
                value trains through ``Model.train_regression``.
            features: comma-separated names of the scores to extract.
            comment_features: comma-separated names; in this class they only
                influence which similarity models get instantiated.
            stop: forwarded to ``Quora.__init__`` and the similarity models.
            vector: forwarded to ``Quora.__init__`` and ``QuoraSoftCosine``.
            path: file name of the feature cache, joined below
                ``FEATURES_PATH/<split>/``.
            alpha: forwarded to ``QuoraTranslations``.
            sigma: forwarded to ``QuoraTranslations``.
            gridsearch: forwarded to the ``Model`` training call.
        """
        Quora.__init__(self, stop=stop, vector=vector)
        self.path = path
        self.features = features.split(',')
        self.comment_features = comment_features.split(',')
        self.gridsearch = gridsearch
        self.svm = Model()

        self.model = model
        # Only build the (potentially expensive) models that are asked for.
        requested = self.features + self.comment_features
        self.bm25 = QuoraBM25(stop=stop) if 'bm25' in requested else None
        self.cosine = QuoraCosine(stop=stop) if 'cosine' in requested else None
        self.softcosine = QuoraSoftCosine(stop=stop, vector=vector) if 'softcosine' in requested else None
        self.translation = QuoraTranslations(alpha=alpha, sigma=sigma, stop=stop, vector=self.vector) if 'translation' in requested else None

        self.train()


    def extract_features(self, pairdata, elmoidx, elmovec, fullelmoidx, fullelmovec):
        """Compute the feature vector and label of every question pair.

        Args:
            pairdata: sequence of dicts providing ``tokens_proc1``,
                ``tokens_proc2`` and ``is_duplicate``; ``qid1`` / ``qid2`` are
                used when present, otherwise ids are derived from the
                position of the pair.
            elmoidx: passed to ``self.encode`` together with ``elmovec``
                when ``self.stop`` is truthy.
            elmovec: see ``elmoidx``.
            fullelmoidx: passed to ``self.encode`` together with
                ``fullelmovec`` when ``self.stop`` is falsy.
            fullelmovec: see ``fullelmoidx``.

        Returns:
            ``(feat, X, y)`` where ``feat`` is a list of ``(x, label)``
            tuples, ``X`` the list of feature vectors and ``y`` the list of
            integer labels.  Pairs whose processing raises are reported on
            stdout and left out of all three.
        """
        X, y = [], []
        feat = []

        # Both embedding-based scores need the encoding of the second
        # question; it is computed once per pair instead of once per score.
        needs_q2_emb = 'softcosine' in self.features or 'translation' in self.features

        for i, pair in enumerate(pairdata):
            try:
                percentage = round(float(i + 1) / len(pairdata), 2)
                print('Extracting features: ', percentage, i + 1, sep='\t', end = '\r')
                q1id = pair['qid1'] if 'qid1' in pair else str(i) + '1'
                q2id = pair['qid2'] if 'qid2' in pair else str(i) + '2'
                q1, q2 = pair['tokens_proc1'], pair['tokens_proc2']

                if self.stop:
                    idx, vec = elmoidx, elmovec
                else:
                    idx, vec = fullelmoidx, fullelmovec

                x = []
                q1_emb = self.encode(q1id, q1, idx, vec)
                q2_emb = self.encode(q2id, q2, idx, vec) if needs_q2_emb else None

                # bm25
                if 'bm25' in self.features:
                    x.append(self.bm25.model(q1, q2id))
                # softcosine
                if 'softcosine' in self.features:
                    x.append(self.softcosine.model(q1, q1_emb, q2, q2_emb))
                # translation
                if 'translation' in self.features:
                    lmprob, trmprob, trlmprob, proctime = self.translation.model(q1, q1_emb, q2, q2_emb)
                    x.append(trlmprob)
                # cosine
                if 'cosine' in self.features:
                    x.append(self.cosine.model(q1, q2))

                y_ = int(pair['is_duplicate'])
                feat.append((x, y_))
                X.append(x)
                y.append(y_)
            except Exception as err:
                # Best effort: skip the broken pair, but let KeyboardInterrupt
                # and SystemExit propagate and show what actually went wrong.
                print('Error')
                print(repr(err))
                print(pair)
        return feat, X, y


    def train(self):
        """Load or extract the training features, fit the scaler, train the model.

        Features are cached at ``FEATURES_PATH/train/<self.path>``.  Sets
        ``self.X`` (scaled vectors), ``self.y`` (labels) and ``self.scaler``.
        """
        path = os.path.join(FEATURES_PATH, 'train', self.path)
        self.X, self.y = [], []
        if not os.path.exists(path):
            feat, self.X, self.y = self.extract_features(self.trainset, self.trainidx, self.trainelmo, self.fulltrainidx, self.fulltrainelmo)

            with open(path, 'wb') as cache:
                p.dump(feat, cache)
        else:
            with open(path, 'rb') as cache:
                feat = p.load(cache)
            for row in feat:
                self.X.append(row[0])
                self.y.append(row[1])

        self.scaler = MinMaxScaler(feature_range=(-1, 1))
        self.scaler.fit(self.X)
        self.X = self.scaler.transform(self.X)

        if self.model == 'svm':
            self.svm.train_svm(
                trainvectors=self.X,
                labels=self.y,
                c='search',
                kernel='search',
                gamma='search',
                jobs=10,
                gridsearch=self.gridsearch
            )
        else:
            self.svm.train_regression(trainvectors=self.X, labels=self.y, c='search', penalty='search', tol='search', gridsearch=self.gridsearch, jobs=10)


    def validate(self):
        """Predict the development pairs.

        Features are cached at ``FEATURES_PATH/dev/<self.path>``.

        Returns:
            ``(y_real, y_pred, parameter_settings)``: gold labels, predicted
            labels and the result of
            ``self.svm.return_parameter_settings(clf=self.model)``.
        """
        path = os.path.join(FEATURES_PATH, 'dev', self.path)
        if not os.path.exists(path):
            feat, _, _ = self.extract_features(self.devset, self.devidx, self.develmo, self.fulldevidx, self.fulldevelmo)
            with open(path, 'wb') as cache:
                p.dump(feat, cache)
        else:
            with open(path, 'rb') as cache:
                feat = p.load(cache)

        y_real, y_pred = [], []
        for pair in feat:
            # Apply the scaling that was fitted on the training vectors.
            X = self.scaler.transform([pair[0]])[0]
            score, pred_label = self.svm.score(X)
            y_pred.append(pred_label)
            y_real.append(pair[1])

        parameter_settings = self.svm.return_parameter_settings(clf=self.model)

        return y_real, y_pred, parameter_settings
Ejemplo n.º 9
0
class Rerank:
    """Re-rank retrieved questions with a regression ensemble.

    The ensemble is trained on three scores per question pair (BM25,
    translation, soft-cosine) and then applied to the candidates listed in
    the ranking file below ``SEMI_PATH``.  Constructing an instance runs the
    whole pipeline and serializes the result to ``SEMI_PATH/reranking``.
    """

    def __init__(self, stop=None, lowercase=None, punctuation=None, vector=None, scale=True, alpha=0.9, sigma=0.1):
        """Load the data, train the ensemble and dump the re-ranking.

        Args:
            stop: dict of per-model stop-word flags; read with the keys
                ``'bm25'``, ``'translation'`` and ``'softcosine'``.
            lowercase: dict of per-model lowercasing flags (same keys).
            punctuation: dict of per-model punctuation flags (same keys).
            vector: dict read with the keys ``'translation'`` and
                ``'softcosine'``; the values only enter the cache file names.
            scale: when truthy, features are min-max scaled to ``[-1, 1]``.
            alpha: forwarded to the translation model.
            sigma: forwarded to the translation model.

        ``None`` for any of the dict arguments stands for an empty dict (the
        previous defaults were shared mutable ``{}`` objects).
        """
        self.stop = {} if stop is None else stop
        self.lowercase = {} if lowercase is None else lowercase
        self.punctuation = {} if punctuation is None else punctuation
        self.scale = scale
        self.vector = {} if vector is None else vector
        self.alpha = alpha
        self.sigma = sigma

        self.questions, self.ranking = self.load()
        self.ensemble = Model()
        self.train()

        ranking = self.test()
        with open(os.path.join(SEMI_PATH, 'reranking'), 'wb') as out:
            p.dump(ranking, out)


    def load(self):
        """Read the question index, the question texts and the initial ranking.

        Returns:
            ``(questions, ranking)``.  ``questions`` maps each line of
            ``index.txt`` to the whitespace tokens of the matching line of
            ``question.txt`` (``<SENTENCE>`` markers replaced by spaces).
            ``ranking`` maps the first token of each line of the ``ranking``
            file to the remaining tokens, each split on ``'-'``.
        """
        with open(os.path.join(SEMI_PATH, 'index.txt')) as f:
            indexes = f.read().split('\n')

        with open(os.path.join(SEMI_PATH, 'question.txt')) as f:
            questions = [text.replace('<SENTENCE>', ' ').split() for text in f.read().split('\n')]

        with open(os.path.join(SEMI_PATH, 'ranking')) as f:
            rows = [line.split() for line in f.read().split('\n')]

        # Skip blank lines instead of unconditionally dropping the last line:
        # a file without a trailing newline no longer loses its final entry
        # and a stray empty line no longer raises IndexError.
        ranking = {row[0]: [item.split('-') for item in row[1:]] for row in rows if row}

        return dict(zip(indexes, questions)), ranking


    def format(self, ranking):
        """Turn ``{q1id: [(real_label, score, q2id), ...]}`` into
        ``{q1id: {q2id: (score, real_label)}}``."""
        new_ranking = {}
        for q1id in ranking:
            new_ranking[q1id] = {
                q2id: (score, real_label)
                for real_label, score, q2id in ranking[q1id]
            }
        return new_ranking


    def _load_or_build(self, path, build, *args):
        """Return the object cached at ``path``; on a miss call
        ``build(*args)``, cache its result at ``path`` and return it."""
        if os.path.exists(path):
            with open(path, 'rb') as cache:
                return p.load(cache)
        data = build(*args)
        with open(path, 'wb') as cache:
            p.dump(data, cache)
        return data


    def _build_bm25(self, lowercase, stop, punctuation):
        """Compute the train/dev/test BM25 scores (cache-miss path)."""
        bm25 = SemiBM25(stop=stop, lowercase=lowercase, punctuation=punctuation)
        train = self.format(bm25.test(bm25.traindata))
        dev = self.format(bm25.validate())

        print('Testing BM25...')
        test = {}
        for q1id in self.ranking:
            # Test scores come straight from the ranking file; entry 0 is
            # skipped and the next ten kept (presumably entry 0 is the query
            # itself -- TODO confirm).  The label is a constant placeholder 0.
            test[q1id] = {
                q2id: (float(score), 0)
                for q2id, score in self.ranking[q1id][1:11]
            }
        del bm25

        return {'train': train, 'dev': dev, 'test': test}


    def _build_translation(self, lowercase, stop, punctuation):
        """Compute the train/dev/test translation scores (cache-miss path)."""
        translation = SemiTranslation(alpha=self.alpha, sigma=self.sigma, punctuation=punctuation, stop=stop, lowercase=lowercase)
        train = self.format(translation.test(translation.traindata))
        dev = self.format(translation.validate())
        del translation

        print('Testing Translation...')
        testdata = list(self.format_input(lowercase, stop, punctuation).items())
        test = run_translation_thread(lowercase=lowercase, stop=stop, punctuation=punctuation, alpha=self.alpha, sigma=self.sigma, testdata=testdata)

        return {'train': train, 'dev': dev, 'test': test}


    def _build_softcosine(self, lowercase, stop, punctuation):
        """Compute the train/dev/test soft-cosine scores (cache-miss path)."""
        softcosine = SemiSoftCosine(stop=stop, lowercase=lowercase, punctuation=punctuation)
        train = self.format(softcosine.test(softcosine.traindata))
        dev = self.format(softcosine.validate())
        del softcosine

        print('Testing Softcosine...')
        testdata = list(self.format_input(lowercase, stop, punctuation).items())
        test = run_softcosine_thread(lowercase=lowercase, stop=stop, punctuation=punctuation, testdata=testdata)

        return {'train': train, 'dev': dev, 'test': test}


    @staticmethod
    def _feature_vector(bm25, translation, softcosine, q1id, q2id):
        """Return ``[bm25, translation, softcosine]`` scores of one pair."""
        return [
            bm25[q1id][q2id][0],
            translation[q1id][q2id][0],
            softcosine[q1id][q2id][0],
        ]


    def train(self):
        """Collect the three score sets and fit the regression ensemble.

        Each score set is cached below ``SEMI_PATH`` in a file whose name
        encodes the preprocessing flags.  Sets ``self.train*`` / ``self.dev*``
        / ``self.test*`` for ``bm25``, ``translation`` and ``softcosine``, plus
        ``self.X``, ``self.y`` and (when ``self.scale``) ``self.scaler``.
        """
        lowercase, stop, punctuation = self.lowercase['bm25'], self.stop['bm25'], self.punctuation['bm25']
        path = os.path.join(SEMI_PATH, 'bm25.lower_' + str(lowercase) + '.stop_' + str(stop) + '.punct_' + str(punctuation))
        data = self._load_or_build(path, self._build_bm25, lowercase, stop, punctuation)
        self.trainbm25 = data['train']
        self.devbm25 = data['dev']
        self.testbm25 = data['test']

        # NOTE(review): ``vector`` only enters the cache file name here; it
        # is not passed on to the translation / soft-cosine models.
        vector = self.vector['translation']
        lowercase, stop, punctuation = self.lowercase['translation'], self.stop['translation'], self.punctuation['translation']
        path = os.path.join(SEMI_PATH, 'translation.lower_' + str(lowercase) + '.stop_' + str(stop) + '.punct_' + str(punctuation) + '.vector_' + str(vector))
        data = self._load_or_build(path, self._build_translation, lowercase, stop, punctuation)
        self.traintranslation = data['train']
        self.devtranslation = data['dev']
        self.testtranslation = data['test']

        vector = self.vector['softcosine']
        lowercase, stop, punctuation = self.lowercase['softcosine'], self.stop['softcosine'], self.punctuation['softcosine']
        path = os.path.join(SEMI_PATH, 'softcosine.lower_' + str(lowercase) + '.stop_' + str(stop) + '.punct_' + str(punctuation) + '.vector_' + str(vector))
        data = self._load_or_build(path, self._build_softcosine, lowercase, stop, punctuation)
        self.trainsoftcosine = data['train']
        self.devsoftcosine = data['dev']
        self.testsoftcosine = data['test']

        self.X, self.y = [], []

        for q1id in self.trainbm25:
            for q2id in self.trainbm25[q1id]:
                self.X.append(self._feature_vector(self.trainbm25, self.traintranslation, self.trainsoftcosine, q1id, q2id))
                # The gold label is stored next to the BM25 score.
                self.y.append(self.trainbm25[q1id][q2id][1])

        if self.scale:
            self.scaler = MinMaxScaler(feature_range=(-1, 1))
            self.scaler.fit(self.X)
            self.X = self.scaler.transform(self.X)
        self.ensemble.train_regression(trainvectors=self.X, labels=self.y, c='search', penalty='search', tol='search', gridsearch='brutal', jobs=10)


    def format_input(self, lowercase, stop, punctuation):
        """Build the preprocessed test pairs from the loaded ranking.

        Args:
            lowercase: when truthy, tokens are lowercased.
            stop: when truthy, tokens found in the module-level ``stop_``
                collection (compared lowercased) are removed.
            punctuation: when truthy, runs of non-word characters are
                replaced by a space and the text is re-tokenized.

        Returns:
            ``{q1id: {q2id: {'q1_id', 'q1', 'q2_id', 'q2', 'label'}}}``
            covering entries 1..10 of every ranking; ``label`` is always 0.
        """
        def remove_punctuation(tokens):
            return re.sub(r'[\W]+',' ', ' '.join(tokens)).strip().split()

        def remove_stopwords(tokens):
            return [w for w in tokens if w.lower() not in stop_]

        def preprocess(tokens):
            # Same order as always: lowercase, then punctuation, then stop words.
            tokens = [w.lower() for w in tokens] if lowercase else tokens
            tokens = remove_punctuation(tokens) if punctuation else tokens
            return remove_stopwords(tokens) if stop else tokens

        procset = {}
        total = len(self.ranking)

        for i, q1id in enumerate(self.ranking):
            procset[q1id] = {}
            percentage = str(round((float(i+1) / total) * 100, 2)) + '%'
            print('Process: ', percentage, end='\r')

            q1 = preprocess(self.questions[q1id])

            for row in self.ranking[q1id][1:11]:
                q2id, score = row
                q2 = preprocess(self.questions[q2id])

                label = 0
                procset[q1id][q2id] = {
                    'q1_id': q1id,
                    'q1': q1,
                    'q2_id': q2id,
                    'q2': q2,
                    'label':label
                }

        return procset


    def test(self):
        """Score every test pair with the trained ensemble.

        Returns:
            ``{q1id: {q2id: {'score': ..., 'label': ...}}}`` holding the two
            values returned by ``self.ensemble.score`` for each pair.
        """
        bm25 = self.testbm25
        translation = self.testtranslation
        softcosine = self.testsoftcosine

        ranking = {}
        for q1id in bm25:
            ranking[q1id] = {}
            for q2id in bm25[q1id]:
                X = self._feature_vector(bm25, translation, softcosine, q1id, q2id)

                if self.scale:
                    X = self.scaler.transform([X])[0]
                clfscore, pred_label = self.ensemble.score(X)
                ranking[q1id][q2id] = {'score':clfscore, 'label':pred_label}

        return ranking
Ejemplo n.º 10
0
class SemevalSVM(Semeval):
    """Classifier over similarity scores between a query and related questions.

    For every (query, related question) pair the feature vector holds, per
    enabled similarity model, the score against the related question followed
    by one score per comment of that question.  Vectors are min-max scaled to
    ``[-1, 1]`` and passed to ``Model``; training runs from the constructor.
    """

    def __init__(self,
                 model='svm',
                 features='bm25,',
                 comment_features='bm25,',
                 stop=True,
                 vector='word2vec',
                 lowercase=True,
                 punctuation=True,
                 proctrain=True,
                 path=FEATURES_PATH,
                 alpha=0.1,
                 sigma=0.9,
                 gridsearch='random'):
        """Instantiate the requested similarity models and train the classifier.

        Args:
            model: ``'svm'`` trains through ``Model.train_svm``; any other
                value trains through ``Model.train_regression``.
            features: comma-separated names of the scores to extract
                (``bm25``, ``softcosine``, ``translation``, ``cosine``).
            comment_features: comma-separated names; in this class they only
                influence which similarity models get instantiated.
            stop: preprocessing flag forwarded to ``Semeval.__init__`` and
                the similarity models.
            vector: forwarded to ``Semeval.__init__`` and
                ``SemevalSoftCosine``.
            lowercase: preprocessing flag, forwarded like ``stop``.
            punctuation: preprocessing flag, forwarded like ``stop``.
            proctrain: forwarded to the similarity models.
            path: file name of the feature cache, joined below
                ``feature/<split>/``.
            alpha: forwarded to ``SemevalTranslation``.
            sigma: forwarded to ``SemevalTranslation``.
            gridsearch: forwarded to the ``Model`` training call.
        """
        Semeval.__init__(self,
                         stop=stop,
                         vector=vector,
                         lowercase=lowercase,
                         punctuation=punctuation)
        self.path = path
        self.features = features.split(',')
        self.comment_features = comment_features.split(',')
        self.gridsearch = gridsearch
        self.svm = Model()

        self.model = model
        # Only build the models that some feature list actually names.
        requested = self.features + self.comment_features
        self.bm25 = SemevalBM25(
            stop=stop,
            lowercase=lowercase,
            punctuation=punctuation,
            proctrain=proctrain
        ) if 'bm25' in requested else None
        self.cosine = SemevalCosine(
            stop=stop,
            lowercase=lowercase,
            punctuation=punctuation,
            proctrain=proctrain
        ) if 'cosine' in requested else None
        self.softcosine = SemevalSoftCosine(
            stop=stop,
            lowercase=lowercase,
            punctuation=punctuation,
            proctrain=proctrain,
            vector=vector
        ) if 'softcosine' in requested else None
        self.translation = SemevalTranslation(
            alpha=alpha,
            sigma=sigma,
            stop=stop,
            lowercase=lowercase,
            punctuation=punctuation,
            proctrain=proctrain,
            vector=self.vector
        ) if 'translation' in requested else None

        self.train()

    def _score_bm25(self, q1, q1_emb, cand_id, cand, elmoidx, elmovec):
        """BM25 score of the query tokens against the candidate id."""
        return self.bm25.model(q1, cand_id)

    def _score_softcosine(self, q1, q1_emb, cand_id, cand, elmoidx, elmovec):
        """Soft-cosine score of the query against the candidate tokens."""
        if self.vector == 'alignments':
            return self.softcosine.model.score(q1, cand, self.alignments)
        cand_emb = self.encode(cand_id, cand, elmoidx, elmovec)
        return self.softcosine.model(q1, q1_emb, cand, cand_emb)

    def _score_translation(self, q1, q1_emb, cand_id, cand, elmoidx, elmovec):
        """Fourth-from-last... no: third value (``trlmprob``) returned by the
        translation model for the query against the candidate tokens."""
        if self.vector == 'alignments':
            lmprob, trmprob, trlmprob, proctime = self.translation.model.score(
                q1, cand)
        else:
            cand_emb = self.encode(cand_id, cand, elmoidx, elmovec)
            lmprob, trmprob, trlmprob, proctime = self.translation.model(
                q1, q1_emb, cand, cand_emb)
        return trlmprob

    def _score_cosine(self, q1, q1_emb, cand_id, cand, elmoidx, elmovec):
        """Cosine score of the query tokens against the candidate tokens."""
        return self.cosine.model(q1, cand)

    def extract_features(self, procdata, elmoidx, elmovec):
        """Compute the feature vector and label of every query/question pair.

        Every feature named in ``self.features`` contributes its scores, in
        the fixed order bm25, softcosine, translation, cosine.  (An earlier
        ``if/elif`` chain kept only the first enabled feature, and the
        soft-cosine ``alignments`` branch scored the related question again
        instead of each comment.)  Feature caches written before this change
        may therefore hold vectors of a different length.

        Args:
            procdata: ``{q1id: {q2id: pair}}`` where ``pair`` provides
                ``q1``, ``q2``, ``comments`` (items with ``id`` and
                ``tokens``) and ``label``.
            elmoidx: forwarded to ``self.encode``.
            elmovec: forwarded to ``self.encode``.

        Returns:
            ``(feat, X, y)``: ``feat[q1id][q2id] = (x, label)``, ``X`` the
            flat list of vectors and ``y`` the flat list of labels.
        """
        X, y = [], []
        feat = {}

        scorers = (
            ('bm25', self._score_bm25),
            ('softcosine', self._score_softcosine),
            ('translation', self._score_translation),
            ('cosine', self._score_cosine),
        )
        active = [scorer for name, scorer in scorers if name in self.features]

        for i, q1id in enumerate(procdata):
            feat[q1id] = {}
            percentage = round(float(i + 1) / len(procdata), 2)
            print('Extracting features: ',
                  percentage,
                  i + 1,
                  sep='\t',
                  end='\r')
            for q2id in procdata[q1id]:
                query_question = procdata[q1id][q2id]
                q1, q2 = query_question['q1'], query_question['q2']
                x = []

                q1_emb = self.encode(q1id, q1, elmoidx, elmovec)

                for scorer in active:
                    x.append(scorer(q1, q1_emb, q2id, q2, elmoidx, elmovec))

                    for comment in query_question['comments']:
                        q3id = comment['id']
                        q3 = comment['tokens']

                        if len(q3) > 0:
                            x.append(
                                scorer(q1, q1_emb, q3id, q3, elmoidx, elmovec))
                        else:
                            # Empty comments keep their slot so that every
                            # vector has one entry per comment.
                            x.append(0)

                y_ = query_question['label']
                feat[q1id][q2id] = (x, y_)
                X.append(x)
                y.append(y_)
        return feat, X, y

    def train(self):
        """Load or extract the training features, fit the scaler, train the model.

        Features are cached at ``feature/train/<self.path>``.  Sets
        ``self.X`` (scaled array), ``self.y`` (labels) and ``self.scaler``.
        """
        self.X, self.y = [], []
        path = os.path.join('feature', 'train', self.path)
        if not os.path.exists(path):
            feat, self.X, self.y = self.extract_features(
                self.traindata, self.trainidx, self.trainelmo)

            with open(path, 'wb') as cache:
                p.dump(feat, cache)
        else:
            with open(path, 'rb') as cache:
                feat = p.load(cache)
            for q1id in feat:
                for q2id in feat[q1id]:
                    self.X.append(feat[q1id][q2id][0])
                    self.y.append(feat[q1id][q2id][1])

        self.X = np.array(self.X)
        self.scaler = MinMaxScaler(feature_range=(-1, 1))
        self.scaler.fit(self.X)
        self.X = self.scaler.transform(self.X)

        if self.model == 'svm':
            self.svm.train_svm(trainvectors=self.X,
                               labels=self.y,
                               c='search',
                               kernel='search',
                               gamma='search',
                               jobs=10,
                               gridsearch=self.gridsearch)
        else:
            self.svm.train_regression(trainvectors=self.X,
                                      labels=self.y,
                                      c='search',
                                      penalty='search',
                                      tol='search',
                                      gridsearch=self.gridsearch,
                                      jobs=10)

    def _rank(self, feat):
        """Scale and score every cached pair; shared by validate() and test().

        Returns ``(ranking, y_real, y_pred, parameter_settings)`` where
        ``ranking[q1id]`` is a list of ``(real_label, score, q2id)`` tuples.
        """
        ranking = {}
        y_real, y_pred = [], []
        for q1id in feat:
            ranking[q1id] = []
            for q2id in feat[q1id]:
                X = feat[q1id][q2id][0]
                real_label = feat[q1id][q2id][1]

                # Apply the scaling that was fitted on the training vectors.
                X = self.scaler.transform([X])[0]
                score, pred_label = self.svm.score(X)
                y_pred.append(pred_label)
                y_real.append(real_label)
                ranking[q1id].append((real_label, score, q2id))

        parameter_settings = self.svm.return_parameter_settings(clf=self.model)

        return ranking, y_real, y_pred, parameter_settings

    def validate(self):
        """Rank and classify the development set.

        Features are cached at ``feature/dev/<self.path>``.

        Returns:
            ``(ranking, y_real, y_pred, parameter_settings)``.
        """
        path = os.path.join('feature', 'dev', self.path)
        if not os.path.exists(path):
            feat, _, _ = self.extract_features(self.devdata, self.devidx,
                                               self.develmo)
            with open(path, 'wb') as cache:
                p.dump(feat, cache)
        else:
            with open(path, 'rb') as cache:
                feat = p.load(cache)

        return self._rank(feat)

    def test(self, testdata, elmoidx, elmovec, test_='test2016'):
        """Rank and classify a test set.

        Args:
            testdata: data handed to ``self.extract_features``; it is also
                stored on ``self.testdata``.
            elmoidx: forwarded to ``self.extract_features``.
            elmovec: forwarded to ``self.extract_features``.
            test_: ``'test2016'`` caches below ``feature/test2016``; any
                other value caches below ``feature/test2017``.

        Returns:
            ``(ranking, y_real, y_pred, parameter_settings)``.
        """
        if test_ == 'test2016':
            path = os.path.join('feature', 'test2016', self.path)
        else:
            path = os.path.join('feature', 'test2017', self.path)

        self.testdata = testdata
        if not os.path.exists(path):
            feat, _, _ = self.extract_features(self.testdata, elmoidx, elmovec)
            with open(path, 'wb') as cache:
                p.dump(feat, cache)
        else:
            with open(path, 'rb') as cache:
                feat = p.load(cache)

        return self._rank(feat)