Python _document_frequencyの例、sklearn.feature_extraction.text._document_frequency Pythonの例

コード例 #1

0

ファイルを表示

ファイル: __init__.py プロジェクト: AegonITargaryen/sklearn-deltatfidf

    def fit(self, X_pos, X_neg, y):
        if not sp.issparse(X_pos):
            X_pos = sp.csc_matrix(X_pos)
        if not sp.issparse(X_neg):
            X_neg = sp.csc_matrix(X_neg)
        if self.use_idf:
            n_samples, n_features = X_pos.shape
            counter = Counter(y)
            n_pos_samples = counter[1]
            n_neg_samples = counter[-1]
            df_pos = _document_frequency(X_pos)
            df_neg = _document_frequency(X_neg)

            # perform idf smoothing if required
            df_pos += int(self.smooth_idf)
            df_neg += int(self.smooth_idf)
            n_samples += int(self.smooth_idf)
            n_pos_samples += int(self.smooth_idf)
            n_neg_samples += int(self.smooth_idf)

            # log+1 instead of log makes sure terms with zero idf don't get
            # suppressed entirely.
            idf = np.log(float(n_pos_samples) / df_pos) - np.log(float(n_neg_samples) / df_neg) + 1.0
            self._idf_diag = sp.spdiags(idf, diags=0, m=n_features,
                                        n=n_features, format='csr')

        return self

コード例 #2

0

ファイルを表示

ファイル: transformer.py プロジェクト: r-m-n/sklearn-deltatfidf

    def fit(self, X_pos, X_neg, y):
        if not sp.issparse(X_pos):
            X_pos = sp.csc_matrix(X_pos)
        if not sp.issparse(X_neg):
            X_neg = sp.csc_matrix(X_neg)
        if self.use_idf:
            n_samples, n_features = X_pos.shape
            counter = Counter(y)
            n_pos_samples = counter[1]
            n_neg_samples = counter[-1]
            df_pos = _document_frequency(X_pos)
            df_neg = _document_frequency(X_neg)

            # perform idf smoothing if required
            df_pos += int(self.smooth_idf)
            df_neg += int(self.smooth_idf)
            n_samples += int(self.smooth_idf)
            n_pos_samples += int(self.smooth_idf)
            n_neg_samples += int(self.smooth_idf)

            # log+1 instead of log makes sure terms with zero idf don't get
            # suppressed entirely.
            idf = np.log(float(n_pos_samples) / df_pos) - np.log(float(n_neg_samples) / df_neg) + 1.0
            self._idf_diag = sp.spdiags(idf, diags=0, m=n_features,
                                        n=n_features, format='csr')

        return self

コード例 #3

0

ファイルを表示

    def _limit_features(self, X, X_pos, X_neg, vocabulary, high=None, low=None,
                        limit=None):
        if high is None and low is None and limit is None:
            return X, set()

        # Calculate a mask based on document frequencies
        dfs = _document_frequency(X)
        tfs = np.asarray(X.sum(axis=0)).ravel()
        mask = np.ones(len(dfs), dtype=bool)
        if high is not None:
            mask &= dfs <= high
        if low is not None:
            mask &= dfs >= low
        if limit is not None and mask.sum() > limit:
            mask_inds = (-tfs[mask]).argsort()[:limit]
            new_mask = np.zeros(len(dfs), dtype=bool)
            new_mask[np.where(mask)[0][mask_inds]] = True
            mask = new_mask

        new_indices = np.cumsum(mask) - 1  # maps old indices to new
        removed_terms = set()
        for term, old_index in list(six.iteritems(vocabulary)):
            if mask[old_index]:
                vocabulary[term] = new_indices[old_index]
            else:
                del vocabulary[term]
                removed_terms.add(term)
        kept_indices = np.where(mask)[0]
        if len(kept_indices) == 0:
            raise ValueError("After pruning, no terms remain. Try a lower"
                             " min_df or a higher max_df.")
        return X[:, kept_indices], X_pos[:, kept_indices], X_neg[:, kept_indices], removed_terms

コード例 #4

0

ファイルを表示

    def fit(self, X, y=None):
        """Learn the idf vector (global term weights)

        Parameters
        ----------
        X : sparse matrix, [n_samples, n_features]
            a matrix of term/token counts
        """
        _X = X.toarray()
        self.avdl = _X.sum() / _X.shape[0]  # 句子的平均长度
        # print("原来的fit的数据：\n",X)

        # 计算每个词语的tf的值
        self.tf = _X.sum(0) / _X.sum()  # [M] #M表示总词语的数量
        self.tf = self.tf.reshape([1, self.tf.shape[0]])  # [1,M]
        # print("tf\n",self.tf)
        ##################以下是TFIDFtransform代码##########################
        if not sp.issparse(X):
            X = sp.csc_matrix(X)
        if self.use_idf:
            n_samples, n_features = X.shape
            df = _document_frequency(X)

            # perform idf smoothing if required
            df += int(self.smooth_idf)
            n_samples += int(self.smooth_idf)

            # log+1 instead of log makes sure terms with zero idf don't get
            # suppressed entirely.
            idf = np.log(float(n_samples) / df) + 1.0
            self._idf_diag = sp.spdiags(idf, diags=0, m=n_features,
                                        n=n_features, format='csr')

        return self

コード例 #5

0

ファイルを表示

    def fit(self, X, y=None):
        """Learn the idf vector (global term weights)

        Parameters
        ----------
        X : sparse matrix, [n_samples, n_features]
            a matrix of term/token counts
        """
        if not sp.issparse(X):
            X = sp.csc_matrix(X)
        if self.use_idf:
            n_samples, n_features = X.shape
            df = _document_frequency(X)

            # perform idf smoothing if required
            df += int(self.smooth_idf)
            n_samples += int(self.smooth_idf)

            # log+1 instead of log makes sure terms with zero idf don't get
            # suppressed entirely.
            idf = np.log(float(n_samples) / df) + 1.0
            self._idf_diag = sp.spdiags(idf,
                                        diags=0,
                                        m=n_features,
                                        n=n_features,
                                        format='csr')

        return self

コード例 #6

0

ファイルを表示

ファイル: my_text.py プロジェクト: hanmarookim/p3-mrc-gigoedokhae

    def fit(self, X, y=None):
        """Learn the idf vector (global term weights).

        Parameters
        ----------
        X : sparse matrix of shape n_samples, n_features)
            A matrix of term/token counts.
        """
        X = check_array(X, accept_sparse=('csr', 'csc'))
        if not sp.issparse(X):
            X = sp.csr_matrix(X)
        dtype = X.dtype if X.dtype in FLOAT_DTYPES else np.float64

        if self.use_idf:
            n_samples, n_features = X.shape
            df = _document_frequency(X)
            df = df.astype(dtype, **_astype_copy_false(df))

            # perform idf smoothing if required
            df += int(self.smooth_idf)
            n_samples += int(self.smooth_idf)

            # log+1 instead of log makes sure terms with zero idf don't get suppressed entirely.
            # original idf = np.log(n_samples / df) + 1
            # BM25 idf = np.log((n_samples - df + 0.5)/(df + 0.5) + 1)
            #          = np.log(n_samples - df + 0.5 + df + 0.5) - np.log(df + 0.5)
            idf = np.log(n_samples + 1) - np.log(df + 0.5)
            self._idf_diag = sp.diags(idf,
                                      offsets=0,
                                      shape=(n_features, n_features),
                                      format='csr',
                                      dtype=dtype)

        return self

コード例 #7

0

ファイルを表示

    def partial_refit_transform(self, raw_documents):
        """Update the exiting vocabulary dictionary and idf and return term-document matrix.

        This is equivalent to partial_refit followed by transform, but more efficiently
        implemented.

        Parameters
        ----------
        raw_documents : iterable
            An iterable which yields either str, unicode or file objects.

        Returns
        -------
        X : array, [n_samples, n_features]
            Document-term matrix.
        """
        logger.info("validate: checking {} records for new tokens".format(len(raw_documents)))
        current_vocabulary_size = len(self.vocabulary_)
        X = super().partial_refit_transform(raw_documents)
        vocabulary_size_change = len(self.vocabulary_) - current_vocabulary_size
        if vocabulary_size_change > 0:
            df = _document_frequency(X)
            self.n_features += vocabulary_size_change
            self.n_samples += X.shape[0]
            self.df = np.vstack((np.hstack((self.df, np.zeros(vocabulary_size_change))), df))
            self.df = self.df.sum(0)
            self._update_idf()

        return self._tfidf.transform(X)

コード例 #8

0

ファイルを表示

    def fit(self,
            X: scipy.sparse.csr_matrix,
            y: Any = None) -> "BM25Transformer":
        """Learn the idf vector (global term weights).

        Parameters
        ----------
        X : sparse matrix of shape n_samples, n_features)
            A matrix of term/token counts.
        """
        X = check_array(X, accept_sparse=("csr", "csc"))
        if not scipy.sparse.issparse(X):
            X = scipy.sparse.csr_matrix(X)
        dtype = X.dtype if X.dtype in FLOAT_DTYPES else np.float64

        if self.use_idf:
            n_samples, n_features = X.shape
            df = _document_frequency(X)
            df = df.astype(dtype, **_astype_copy_false(df))

            idf = np.log(1 + (n_samples - df + 0.5) / (df + 0.5))
            self._idf_diag = scipy.sparse.diags(idf,
                                                offsets=0,
                                                shape=(n_features, n_features),
                                                format="csr",
                                                dtype=dtype)
        return self

コード例 #9

0

ファイルを表示

 def fit(self, X):
     if not sp.issparse(X):
         X = sp.csc_matrix(X)
     if self.use_idf:
         n_samples, n_features = X.shape
         df = _document_frequency(X)
         idf = np.log((n_samples - df + 0.5) / (df + 0.5))
         self._idf_diag = sp.spdiags(idf, diags=0, m=n_features, n=n_features)
     return self

コード例 #10

0

ファイルを表示

 def fit_transform(self, raw_documents, y=None):
     """Standard TfidfVectorizer fit method plus storing some meta-data needed for partial_refit methods
     (document frequency vector and number of samples).
     """
     X = super().fit_transform(raw_documents)
     self.n_samples, self.n_features = X.shape
     self.df = _document_frequency(X)
     self.n_samples += int(self._tfidf.smooth_idf)
     self._update_idf()
     return X

コード例 #11

0

ファイルを表示

ファイル: bm25.py プロジェクト: wararaki718/scrapbox3

 def fit(self, X: Union[sps.csr_matrix, np.ndarray]) -> 'BM25Transformer':
     if not sps.issparse(X):
         X = sps.csr_matrix(X)
     
     if self.use_idf:
         n_samples, n_features = X.shape
         df = _document_frequency(X)
         idf = np.log((n_samples - df + 0.5) / (df + 0.5))
         self._idf_diag = sps.spdiags(idf, diags=0, m=n_features, n=n_features)
     return self

コード例 #12

0

ファイルを表示

ファイル: text_transformers.py プロジェクト: phamhoanxda/end-to-end-qna

    def fit(self, X):
        """
        Parameters
        ----------
        X : sparse matrix, [n_samples, n_features]
            document-term matrix
        """
        X = check_array(X, accept_sparse=("csr", "csc"))
        if not sp.issparse(X):
            X = sp.csc_matrix(X)
        if self.use_idf:
            n_samples, n_features = X.shape
            df = _document_frequency(X)
            idf = np.log((n_samples - df + 0.5) / (df + 0.5))
            if self.floor is not None:
                idf = idf * (idf > self.floor) + self.floor * (idf <
                                                               self.floor)
            self._idf_diag = sp.spdiags(idf,
                                        diags=0,
                                        m=n_features,
                                        n=n_features)

        # Create BM25 features

        # Document length (number of terms) in each row
        # Shape is (n_samples, 1)
        dl = X.sum(axis=1)
        # Number of non-zero elements in each row
        # Shape is (n_samples, )
        sz = X.indptr[1:] - X.indptr[0:-1]
        # In each row, repeat `dl` for `sz` times
        # Shape is (sum(sz), )
        # Example
        # -------
        # dl = [4, 5, 6]
        # sz = [1, 2, 3]
        # rep = [4, 5, 5, 6, 6, 6]
        rep = np.repeat(np.asarray(dl), sz)
        # Average document length
        # Scalar value
        avgdl = np.average(dl)
        # Compute BM25 score only for non-zero elements
        data = (X.data * (self.k1 + 1) / (X.data + self.k1 *
                                          (1 - self.b + self.b * rep / avgdl)))
        X = sp.csr_matrix((data, X.indices, X.indptr), shape=X.shape)

        if self.norm:
            X = normalize(X, norm=self.norm, copy=False)

        self._doc_matrix = X
        return self

コード例 #13

0

ファイルを表示

ファイル: bm25_tfidf.py プロジェクト: filrougestif2016/STIF_FILROUGE2016

 def fit(self, X):
     """
     Parameters
     ----------
     X : sparse matrix, [n_samples, n_features]
         document-term matrix
     """
     if not sp.issparse(X):
         X = sp.csc_matrix(X)
     if self.use_idf:
         n_samples, n_features = X.shape
         df = _document_frequency(X)
         idf = np.log((n_samples - df + 0.5) / (df + 0.5))
         self._idf_diag = sp.spdiags(idf, diags=0, m=n_features, n=n_features)
     return self

コード例 #14

0

ファイルを表示

    def fit(self, X, y):
        """
        Learn the idf vector (global term weights)
        don't care the specifict value in X
        :param X: sparse matrix, [n_samples, n_features]
                  a matrix of term counts
        :param y: class_label, [n_samples]
        :return: [n_class, n_features]
        """
        if self.use_idf:
            labelbin = LabelBinarizer()
            # 计算样本属于哪个类别 [n_samples, n_classes]
            Y = labelbin.fit_transform(y)
            # LabelBinarizer 对于二分类的返回结果跟多分类的返回结果有点不一样
            # so deal with binary
            if labelbin.y_type_ == "binary":
                Y = np.hstack((1 - Y, Y))
            self.classes_ = labelbin.classes_

            # 计算每个特征词属于每个类别的样本数 [n_classes, n_features]
            class_df_ = vectorize.class_df(X, Y)

            # 如果特征词所在的类别不确定或不知道时，用这个特征词出现的总样本数来代替
            unknow_class_df_ = np.sum(class_df_, axis=0).reshape(1, -1)
            class_df_ = np.concatenate((class_df_, unknow_class_df_), axis=0)
            self.classes_ = np.concatenate(
                (self.classes_, np.array(["unknow"])), axis=0)

            # smooth class_df_
            class_df_ += int(self.smooth_idf)

            n_samples, n_features = X.shape
            df = _document_frequency(X)

            # perform idf smoothing if required
            df += int(self.smooth_idf)
            n_samples += int(self.smooth_idf)

            # log+1 instead of log makes sure terms with zero idf don't get
            # suppressed entirely.
            idf = float(n_samples) / df
            idf_diag = sp.spdiags(idf, diags=0, m=n_features, n=n_features)

            # [n_classes, n_features]
            self._idf = np.log(safe_sparse_dot(class_df_, idf_diag)) + 1.0

        return self

コード例 #15

0

ファイルを表示

 def train(self, contexts, responses):
     """Fit the tf-idf transform and compute idf statistics."""
     with ignore_warnings():
         # Ignore deprecated `non_negative` warning.
         self._vectorizer = HashingVectorizer(non_negative=True)
     self._tfidf_transform = TfidfTransformer()
     count_matrix = self._tfidf_transform.fit_transform(
         self._vectorizer.transform(contexts + responses))
     n_samples, n_features = count_matrix.shape
     df = _document_frequency(count_matrix)
     idf = np.log((n_samples - df + 0.5) / (df + 0.5))
     self._idf_diag = sp.spdiags(
         idf, diags=0, m=n_features, n=n_features
     )
     document_lengths = count_matrix.sum(axis=1)
     self._average_document_length = np.mean(document_lengths)
     print(self._average_document_length)

コード例 #16

0

ファイルを表示

ファイル: improve_tf_idf.py プロジェクト: zqlhuanying/Image_Emotion

    def fit(self, X, y):
        """
        Learn the idf vector (global term weights)
        don't care the specifict value in X
        :param X: sparse matrix, [n_samples, n_features]
                  a matrix of term counts
        :param y: class_label, [n_samples]
        :return: [n_class, n_features]
        """
        if self.use_idf:
            labelbin = LabelBinarizer()
            # 计算样本属于哪个类别 [n_samples, n_classes]
            Y = labelbin.fit_transform(y)
            # LabelBinarizer 对于二分类的返回结果跟多分类的返回结果有点不一样
            # so deal with binary
            if labelbin.y_type_ == "binary":
                Y = np.hstack((1 - Y, Y))
            self.classes_ = labelbin.classes_

            # 计算每个特征词属于每个类别的样本数 [n_classes, n_features]
            class_df_ = vectorize.class_df(X, Y)

            # 如果特征词所在的类别不确定或不知道时，用这个特征词出现的总样本数来代替
            unknow_class_df_ = np.sum(class_df_, axis=0).reshape(1, -1)
            class_df_ = np.concatenate((class_df_, unknow_class_df_), axis=0)
            self.classes_ = np.concatenate((self.classes_, np.array(["unknow"])), axis=0)

            # smooth class_df_
            class_df_ += int(self.smooth_idf)

            n_samples, n_features = X.shape
            df = _document_frequency(X)

            # perform idf smoothing if required
            df += int(self.smooth_idf)
            n_samples += int(self.smooth_idf)

            # log+1 instead of log makes sure terms with zero idf don't get
            # suppressed entirely.
            idf = float(n_samples) / df
            idf_diag = sp.spdiags(idf, diags=0, m=n_features, n=n_features)

            # [n_classes, n_features]
            self._idf = np.log(safe_sparse_dot(class_df_, idf_diag)) + 1.0

        return self

コード例 #17

0

ファイルを表示

    def fit(self, X):
        """
        Parameters
        ----------
        X : sparse matrix, [n_samples, n_features] document-term matrix
        """
        if not sp.sparse.issparse(X):
            X = sp.sparse.csc_matrix(X)
        if self.use_idf:
            n_samples, n_features = X.shape
            df = _document_frequency(X)
            idf = np.log((n_samples - df + 0.5) / (df + 0.5))
            self._idf_diag = sp.sparse.spdiags(idf, diags=0, m=n_features, n=n_features)

        doc_len = X.sum(axis=1)
        self._average_document_len = np.average(doc_len)

        return self

コード例 #18

0

ファイルを表示

    def fit(self, X, y=None):
        """Learn the idf vector (global term weights)
		Parameters
		X : sparse matrix, [n_samples, n_features]
			a matrix of term/token counts
		"""
        if not sp.issparse(X):
            X = sp.csc_matrix(X)
        if self.use_idf:
            n_samples, n_features = X.shape
            df = _document_frequency(X)
            idf = np.log10(float(n_samples) / df)  #remove 1? should I add TF?
            self._idf_diag = sp.spdiags(idf,
                                        diags=0,
                                        m=n_features,
                                        n=n_features,
                                        format='csr')

            return self

コード例 #19

0

ファイルを表示

    def fit(self, X, y=None):
        """Learn the idf vector (global term weights)
        Parameters
        ----------
        X : sparse matrix, [n_samples, n_features]
            a matrix of term/token counts
        """
        if not sp.issparse(X):
            X = sp.csc_matrix(X)
        if self.use_idf:
            n_samples, n_features = X.shape

            df = _document_frequency(X)

            idf = np.log10((float(n_samples) - df + 0.5) / (df+0.5))
            self._avgdl = avgdl = np.average(X.sum(axis=1))
            #print self._avgdl
            self._idf_diag = sp.spdiags(idf, diags=0, m=n_features, n=n_features, format='csr')

            return self

コード例 #20

0

ファイルを表示

ファイル: BM25Vectorizer.py プロジェクト: yhzhunan/Bm25Vectorizer

    def fit(self, X, y=None):
        """Learn the idf vector (global term weights)

        Parameters
        ----------
        X : sparse matrix, [n_samples, n_features]
            a matrix of term/token counts
        """
        X = X.toarray()
        self.avdl = X.sum() / X.shape[0]  #句子的平均长度
        # print("原来的fit的数据：\n",X)

        #计算每个词语的tf的值
        self.tf = X.sum(0) / X.sum()  #[M] #M表示总词语的数量
        self.tf = self.tf.reshape([1, self.tf.shape[0]])  #[1,M]
        # print("tf\n",self.tf)
        ######       原来tfidf的代码  ######

        X = check_array(X, accept_sparse=('csr', 'csc'))
        if not sp.issparse(X):
            X = sp.csr_matrix(X)
        dtype = X.dtype if X.dtype in FLOAT_DTYPES else np.float64

        if self.use_idf:
            n_samples, n_features = X.shape
            df = _document_frequency(X).astype(dtype)

            # perform idf smoothing if required
            df += int(self.smooth_idf)
            n_samples += int(self.smooth_idf)

            # log+1 instead of log makes sure terms with zero idf don't get
            # suppressed entirely.
            idf = np.log(n_samples / df) + 1
            self._idf_diag = sp.diags(idf,
                                      offsets=0,
                                      shape=(n_features, n_features),
                                      format='csr',
                                      dtype=dtype)

        return self

コード例 #21

0

ファイルを表示

ファイル: recall.py プロジェクト: YaboSun/my_wsdm_doc_retrieval

    def fit(self, X):
        """
        TODO 用来计算相似度？
        X : sparse matrix, [n_samples, n_features]
            document-term matrix
        """
        if not sp.isspmatrix(X):
            X = sp.csc_matrix(X)
        if self.use_idf:
            n_samples, n_features = X.shape

            # _document_frequency计算某个词在文档中出现的次数
            # Count the number of non-zero values for each feature in sparse X.
            df = _document_frequency(X)
            # 逆文档频率
            idf = np.log((n_samples - df + 0.5) / (df + 0.5))
            self._idf_log = sp.spdiags(idf,
                                       diags=0,
                                       m=n_features,
                                       n=n_features)
        return self

コード例 #22

0

ファイルを表示

ファイル: DataDrivenRelation.py プロジェクト: pieterlukasse/data_pipeline-1

    def fit(self, X, y=None):
        """Learn the idf vector (global term weights)

        Parameters
        ----------
        X : sparse matrix, [n_samples, n_features]
            a matrix of term/token counts
        """
        if not sp.issparse(X):
            X = sp.csc_matrix(X)
        if self.use_idf:
            n_samples, n_features = X.shape
            n_samples=float(n_samples)
            df = _document_frequency(X)

            # log+1 instead of log makes sure terms with zero idf don't get
            # suppressed entirely.
            # idf = np.log(df / n_samples)
            idf = df / n_samples
            self._idf_diag = sp.spdiags(idf,
                diags=0, m=n_features, n=n_features)
        return self

コード例 #23

0

ファイルを表示

    def fit(self, X, y=None):
        if not sp.issparse(X):
            X = sp.csc_matrix(X)
        if self.use_idf:
            n_samples, n_features = X.shape
            df = _document_frequency(X)

            # perform idf smoothing if required
            #df += int(self.smooth_idf)
            #n_samples += int(self.smooth_idf)

            # log+1 instead of log makes sure terms with zero idf don't get
            # suppressed entirely.
            idf = np.log(1 + 1 / df)

            dfidf = df * idf
            self._dfidf = dfidf
            self._dfidf_diag = sp.spdiags(dfidf,
                                          diags=0,
                                          m=n_features,
                                          n=n_features,
                                          format='csr')
        return self

コード例 #24

0

ファイルを表示

def get_tf_idf_scores(msgs):
    count_vec = CountVectorizer(binary=False)
    count_df = count_vec.fit_transform(msgs)
    transformer = TfidfTransformer(use_idf=True, smooth_idf=False)
    x1 = transformer.fit_transform(count_df)
    posts_cnt = len(msgs)
    vals = [
        math.log(x) * math.log(posts_cnt / float(y)) for x, y in zip(
            count_df.sum(axis=0).tolist()[0], _document_frequency(x1))
    ]
    score_map = {k: vals[v] for k, v in count_vec.vocabulary_.items()}

    pattern = re.compile('[\W_]+', re.UNICODE)

    scores = []
    for msg in msgs:
        score = 0
        msg = pattern.sub(' ', msg).lower()
        for word in msg.split():
            if word in score_map:
                score += score_map[word]
        scores.append({"tf_idf_score": score})

    return scores

コード例 #25

0

ファイルを表示

ファイル: BM25Vectorizer.py プロジェクト: filrougestif2016/STIF_FILROUGE2016

    def fit(self, X, y=None):
        """Learn the bm25 vector (global term weights)
        Parameters
        ----------
        X : sparse matrix, [n_samples, n_features]
            a matrix of term/token counts
        """
        if not sp.issparse(X):
            X = sp.csc_matrix(X)
        if self.use_idf:
            n_samples, n_features = X.shape
            df = _document_frequency(X)

            # perform idf smoothing if required
            df += int(self.smooth_idf)
            n_samples += int(self.smooth_idf)

            # log+1 instead of log makes sure terms with zero idf don't get
            # suppressed entirely.
            idf = np.log((float(n_samples) - df + 0.5) / (df + 0.5))
            self._idf_diag = sp.spdiags(idf,
                                        diags=0, m=n_features, n=n_features)

        return self

コード例 #26

0

ファイルを表示

ファイル: text.py プロジェクト: HendryLi/sparkit-learn

 def mapper(X, use_idf=self.use_idf):
     if not sp.issparse(X):
         X = sp.csc_matrix(X)
     if use_idf:
         return _document_frequency(X)

コード例 #27

0

ファイルを表示

 def mapper(X, use_idf=self.use_idf):
     if not sp.issparse(X):
         X = sp.csc_matrix(X)
     if use_idf:
         return _document_frequency(X)

コード例 #28

0

ファイルを表示

ファイル: project_ex3.py プロジェクト: davidfrickert/PRI-19-20

def getBM25Score(dataset,
                 k1=1.2,
                 b=0.75,
                 mergetype='list',
                 min_df=2,
                 cands=None):
    if not cands:
        cands = getAllCandidates(dataset, deliver_as='sentences')
        ds = [list(itertools.chain.from_iterable(doc)) for doc in cands]
    else:
        ds = cands
    words = listOfTaggedToListOfWords(dataset)
    # documents = listOfTaggedToString(dataset)
    # stopW = set(nltk.corpus.stopwords.words('english'))

    vec_tf = TfidfVectorizer(tokenizer=lambda e: e,
                             lowercase=False,
                             use_idf=False)

    vec_tf.fit(ds)

    #
    vec_tf.ngram_range = (1, findBiggestGram(ds))
    # vec_tf.tokenizer = None
    # vec_tf.stop_words = stopW
    # vec_tf.min_df = 2

    terms = vec_tf.get_feature_names()

    X = vec_tf.transform(words)

    tf_arr = X.toarray()

    N = len(dataset)
    avgDL = getAvgDL(ds)
    DF_all = _document_frequency(X)  # .sum()
    score = []

    for i, doc in enumerate(dataset.values()):
        temp = []
        dl = len(list(itertools.chain.from_iterable(doc)))

        for j in range(len(terms)):
            DF = DF_all[j]
            tf = tf_arr[i][j]

            bm25_idf = log((N - DF + 0.5) / (DF + 0.5), 10)
            bm25_tf = (tf * (k1 + 1)) / (tf + k1 * (1 - b + (b *
                                                             (dl / avgDL))))

            bm25 = bm25_tf * (bm25_idf + 1.)
            if DF >= min_df:
                temp.append(bm25 * (len(terms[j]) / len(terms[j].split())))
            else:
                temp.append(0.)
        score.append(temp)

    if mergetype == 'dict':
        return mergeDict(dataset, terms, score)
    else:
        return merge(dataset, terms, score)