Code example #1
    def pre_process(self, text):
        # Normalize punctuation and tokenize; when a dictionary was built in
        # fit(), keep only tokens it knows and that are not in the stoplist
        if self.no_dictionary:
            x_processed = tu.normalize_punctuation(text).split()
        else:
            x_processed = [word for word in tu.normalize_punctuation(text).split()
                           if word in self.dictionary.token2id and word not in self.stoplist]

        return x_processed
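
The token filtering above relies on the gensim dictionary built in fit() below. As a minimal stand-alone sketch of the same idea, assuming made-up documents, thresholds and stoplist, and with tu.normalize_punctuation replaced by a plain split:

    from gensim import corpora

    docs = [["the", "cat", "sat"], ["the", "dog", "sat"], ["rare", "word"]]
    dictionary = corpora.Dictionary(docs)
    dictionary.filter_extremes(no_below=2, no_above=1.0)  # drop tokens seen in fewer than 2 docs

    stoplist = {"the"}
    tokens = "the cat sat".split()  # stands in for tu.normalize_punctuation(text).split()
    kept = [w for w in tokens if w in dictionary.token2id and w not in stoplist]
    print(kept)  # ['sat']: 'the' is stoplisted, 'cat' was filtered out of the dictionary
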
Code example #2
    def fit(self, X, y=None):

        logging.info("Loaded from file")

        # Text pre-processing
        x_clean = [tu.normalize_punctuation(text).split() for text in X]

        # Train a word2vec model if none was supplied
        if self.w2v_model is None:
            self.w2v_model = w2v_models.build_word2vec(x_clean, size=100, window=10, min_count=1, dataname="test")

        # With no_below == no_above == 1 the dictionary filter would keep
        # everything, so skip building the dictionary altogether
        if self.no_below == 1 and self.no_above == 1:
            self.no_dictionary = True
        else:
            self.no_dictionary = False
            self.dictionary = corpora.Dictionary(x_clean)
            self.dictionary.filter_extremes(no_above=self.no_above, no_below=self.no_below)

        # Feature coordinates: (start, stop) column ranges of each feature
        # block within the final feature matrix
        size = self.w2v_model.layer1_size
        self.feature_crd = {'00_avg': (0, size),
                            '01_std': (size, 2*size)}
        feature_cnt = 2
        start = 2*size
        l = size
        for i in range(1, self.diffmax0):
            # only the diff0_std block is used (the plain diff0 block was disabled)
            name = "%02d_diff0_std_%i" % (feature_cnt, i)
            feature_cnt += 1
            val = (start, start + l)
            self.feature_crd[name] = val
            start += l
        for i in range(1, self.diffmax1):
            # diff1 contributes two blocks per step: diff1 and diff1_std
            name = "%02d_diff1_%i" % (feature_cnt, i)
            feature_cnt += 1
            val = (start, start + l)
            self.feature_crd[name] = val
            start += l
            name = "%02d_diff1_std_%i" % (feature_cnt, i)
            feature_cnt += 1
            val = (start, start + l)
            self.feature_crd[name] = val
            start += l
        self.length = start
        logging.info("Total feature length %i" % self.length)
        logging.info("W2V: got a model %s " % (self.w2v_model,))
        return self
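
fit() lays the feature blocks out contiguously in feature_crd. A small sketch of the resulting (start, stop) ranges, assuming illustrative values size=4, diffmax0=3, diffmax1=2 (not taken from the source):

    size, diffmax0, diffmax1 = 4, 3, 2
    feature_crd = {'00_avg': (0, size), '01_std': (size, 2*size)}
    cnt, start, l = 2, 2*size, size
    for i in range(1, diffmax0):
        feature_crd["%02d_diff0_std_%i" % (cnt, i)] = (start, start + l)
        cnt += 1
        start += l
    for i in range(1, diffmax1):
        feature_crd["%02d_diff1_%i" % (cnt, i)] = (start, start + l)
        cnt += 1
        start += l
        feature_crd["%02d_diff1_std_%i" % (cnt, i)] = (start, start + l)
        cnt += 1
        start += l
    for name in sorted(feature_crd):
        print(name, feature_crd[name])
    print("total length", start)  # corresponds to self.length in fit()
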
Code example #3
    def transform(self, X):

        # Text pre-processing
        x_clean = [tu.normalize_punctuation(text).split() for text in X]
        logging.info("DPGGM: Text prepocessed")

        # Vectorize using W2V model
        if self.dpgmm is not None:
            logging.info("Vectorizing a corpus")
            size = self.w2v_model.layer1_size
            if len(X) > 0:
                vecs = np.concatenate([self.clusterize(z) for z in x_clean], axis=0)
            else:
                vecs = np.zeros(size).reshape((1, size))
            logging.info("DPGMM: returning pre-processed data of shape %s" % (vecs.shape, ))
        else:
            logging.info("DPGMM: no model was provided; returning zero vectors.")
            vecs = np.zeros((len(X), 1))

        return vecs
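
transform() stacks one row block per document and falls back to zeros when there is no input or no model. A sketch of just that stacking logic, with a stand-in clusterize() whose output shape is an assumption for illustration:

    import numpy as np

    size = 4
    def clusterize(tokens):
        # stand-in: in the real class this uses the DPGMM and the W2V model
        return np.zeros((1, size))

    x_clean = [["some", "tokens"], ["more", "tokens"]]
    if len(x_clean) > 0:
        vecs = np.concatenate([clusterize(z) for z in x_clean], axis=0)
    else:
        vecs = np.zeros(size).reshape((1, size))
    print(vecs.shape)  # (2, 4): one row per document
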