def fit(self, X_pos, X_neg, y):
    if not sp.issparse(X_pos):
        X_pos = sp.csc_matrix(X_pos)
    if not sp.issparse(X_neg):
        X_neg = sp.csc_matrix(X_neg)
    if self.use_idf:
        n_samples, n_features = X_pos.shape
        counter = Counter(y)
        n_pos_samples = counter[1]
        n_neg_samples = counter[-1]
        df_pos = _document_frequency(X_pos)
        df_neg = _document_frequency(X_neg)

        # perform idf smoothing if required
        df_pos += int(self.smooth_idf)
        df_neg += int(self.smooth_idf)
        n_samples += int(self.smooth_idf)
        n_pos_samples += int(self.smooth_idf)
        n_neg_samples += int(self.smooth_idf)

        # log+1 instead of log makes sure terms with zero idf don't get
        # suppressed entirely.
        idf = (np.log(float(n_pos_samples) / df_pos)
               - np.log(float(n_neg_samples) / df_neg) + 1.0)
        self._idf_diag = sp.spdiags(idf, diags=0, m=n_features,
                                    n=n_features, format='csr')
    return self
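# The snippet above computes a signed "delta idf": a term gets a positive
# weight when it is relatively more frequent in positive-class documents and
# a negative weight when it skews negative. A minimal self-contained sketch
# of the same idea on toy dense count matrices; the helper name `delta_idf`
# is hypothetical, not from the source:
import numpy as np

def delta_idf(X_pos, X_neg, smooth=1):
    # document frequency per term within each class (smoothed)
    df_pos = (X_pos > 0).sum(axis=0) + smooth
    df_neg = (X_neg > 0).sum(axis=0) + smooth
    n_pos = X_pos.shape[0] + smooth
    n_neg = X_neg.shape[0] + smooth
    # positive-skewed terms get weight > 1, negative-skewed terms < 1
    return np.log(n_pos / df_pos) - np.log(n_neg / df_neg) + 1.0

X_pos = np.array([[1, 0, 2], [1, 1, 0]])  # counts from positive documents
X_neg = np.array([[0, 3, 0], [0, 1, 1]])  # counts from negative documents
print(delta_idf(X_pos, X_neg))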
def _limit_features(self, X, X_pos, X_neg, vocabulary,
                    high=None, low=None, limit=None):
    if high is None and low is None and limit is None:
        # match the arity of the final return statement
        return X, X_pos, X_neg, set()

    # Calculate a mask based on document frequencies
    dfs = _document_frequency(X)
    tfs = np.asarray(X.sum(axis=0)).ravel()
    mask = np.ones(len(dfs), dtype=bool)
    if high is not None:
        mask &= dfs <= high
    if low is not None:
        mask &= dfs >= low
    if limit is not None and mask.sum() > limit:
        mask_inds = (-tfs[mask]).argsort()[:limit]
        new_mask = np.zeros(len(dfs), dtype=bool)
        new_mask[np.where(mask)[0][mask_inds]] = True
        mask = new_mask

    new_indices = np.cumsum(mask) - 1  # maps old indices to new
    removed_terms = set()
    for term, old_index in list(six.iteritems(vocabulary)):
        if mask[old_index]:
            vocabulary[term] = new_indices[old_index]
        else:
            del vocabulary[term]
            removed_terms.add(term)

    kept_indices = np.where(mask)[0]
    if len(kept_indices) == 0:
        raise ValueError("After pruning, no terms remain. Try a lower"
                         " min_df or a higher max_df.")
    return (X[:, kept_indices], X_pos[:, kept_indices],
            X_neg[:, kept_indices], removed_terms)
def fit(self, X, y=None):
    """Learn the idf vector (global term weights)

    Parameters
    ----------
    X : sparse matrix, [n_samples, n_features]
        a matrix of term/token counts
    """
    _X = X.toarray()
    self.avdl = _X.sum() / _X.shape[0]  # average document length
    # print("original input to fit:\n", _X)
    # compute the tf value of each term
    self.tf = _X.sum(0) / _X.sum()  # [M], where M is the vocabulary size
    self.tf = self.tf.reshape([1, self.tf.shape[0]])  # [1, M]
    # print("tf\n", self.tf)

    ################## below is the TFIDFtransform code ##########################
    if not sp.issparse(X):
        X = sp.csc_matrix(X)
    if self.use_idf:
        n_samples, n_features = X.shape
        df = _document_frequency(X)

        # perform idf smoothing if required
        df += int(self.smooth_idf)
        n_samples += int(self.smooth_idf)

        # log+1 instead of log makes sure terms with zero idf don't get
        # suppressed entirely.
        idf = np.log(float(n_samples) / df) + 1.0
        self._idf_diag = sp.spdiags(idf, diags=0, m=n_features,
                                    n=n_features, format='csr')
    return self
def fit(self, X, y=None):
    """Learn the idf vector (global term weights)

    Parameters
    ----------
    X : sparse matrix, [n_samples, n_features]
        a matrix of term/token counts
    """
    if not sp.issparse(X):
        X = sp.csc_matrix(X)
    if self.use_idf:
        n_samples, n_features = X.shape
        df = _document_frequency(X)

        # perform idf smoothing if required
        df += int(self.smooth_idf)
        n_samples += int(self.smooth_idf)

        # log+1 instead of log makes sure terms with zero idf don't get
        # suppressed entirely.
        idf = np.log(float(n_samples) / df) + 1.0
        self._idf_diag = sp.spdiags(idf, diags=0, m=n_features,
                                    n=n_features, format='csr')
    return self
def fit(self, X, y=None):
    """Learn the idf vector (global term weights).

    Parameters
    ----------
    X : sparse matrix of shape (n_samples, n_features)
        A matrix of term/token counts.
    """
    X = check_array(X, accept_sparse=('csr', 'csc'))
    if not sp.issparse(X):
        X = sp.csr_matrix(X)
    dtype = X.dtype if X.dtype in FLOAT_DTYPES else np.float64

    if self.use_idf:
        n_samples, n_features = X.shape
        df = _document_frequency(X)
        df = df.astype(dtype, **_astype_copy_false(df))

        # perform idf smoothing if required
        df += int(self.smooth_idf)
        n_samples += int(self.smooth_idf)

        # log+1 instead of log makes sure terms with zero idf don't get
        # suppressed entirely.
        # original: idf = np.log(n_samples / df) + 1
        # BM25:     idf = np.log((n_samples - df + 0.5) / (df + 0.5) + 1)
        #               = np.log(n_samples - df + 0.5 + df + 0.5) - np.log(df + 0.5)
        idf = np.log(n_samples + 1) - np.log(df + 0.5)
        self._idf_diag = sp.diags(idf, offsets=0,
                                  shape=(n_features, n_features),
                                  format='csr', dtype=dtype)
    return self
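# Sanity check of the algebra in the comment above: since
# (n - df + 0.5) / (df + 0.5) + 1 = (n + 1) / (df + 0.5), the smoothed BM25
# idf simplifies to log(n + 1) - log(df + 0.5). A quick numeric check with
# toy values:
import numpy as np

n = 100.0
df = np.array([1.0, 10.0, 50.0, 100.0])
lhs = np.log((n - df + 0.5) / (df + 0.5) + 1)
rhs = np.log(n + 1) - np.log(df + 0.5)
assert np.allclose(lhs, rhs)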
def partial_refit_transform(self, raw_documents):
    """Update the existing vocabulary dictionary and idf and return the
    term-document matrix.

    This is equivalent to partial_refit followed by transform, but more
    efficiently implemented.

    Parameters
    ----------
    raw_documents : iterable
        An iterable which yields either str, unicode or file objects.

    Returns
    -------
    X : array, [n_samples, n_features]
        Document-term matrix.
    """
    logger.info("validate: checking {} records for new tokens".format(len(raw_documents)))
    current_vocabulary_size = len(self.vocabulary_)
    X = super().partial_refit_transform(raw_documents)
    vocabulary_size_change = len(self.vocabulary_) - current_vocabulary_size

    if vocabulary_size_change > 0:
        df = _document_frequency(X)
        self.n_features += vocabulary_size_change
        self.n_samples += X.shape[0]
        # pad the stored df vector with zeros for the new tokens, then add
        # the document frequencies observed in this batch
        self.df = np.vstack((np.hstack((self.df, np.zeros(vocabulary_size_change))), df))
        self.df = self.df.sum(0)
        self._update_idf()

    return self._tfidf.transform(X)
def fit(self, X: scipy.sparse.csr_matrix, y: Any = None) -> "BM25Transformer":
    """Learn the idf vector (global term weights).

    Parameters
    ----------
    X : sparse matrix of shape (n_samples, n_features)
        A matrix of term/token counts.
    """
    X = check_array(X, accept_sparse=("csr", "csc"))
    if not scipy.sparse.issparse(X):
        X = scipy.sparse.csr_matrix(X)
    dtype = X.dtype if X.dtype in FLOAT_DTYPES else np.float64

    if self.use_idf:
        n_samples, n_features = X.shape
        df = _document_frequency(X)
        df = df.astype(dtype, **_astype_copy_false(df))
        idf = np.log(1 + (n_samples - df + 0.5) / (df + 0.5))
        self._idf_diag = scipy.sparse.diags(idf, offsets=0,
                                            shape=(n_features, n_features),
                                            format="csr", dtype=dtype)
    return self
def fit(self, X):
    if not sp.issparse(X):
        X = sp.csc_matrix(X)
    if self.use_idf:
        n_samples, n_features = X.shape
        df = _document_frequency(X)
        idf = np.log((n_samples - df + 0.5) / (df + 0.5))
        self._idf_diag = sp.spdiags(idf, diags=0, m=n_features, n=n_features)
    return self
def fit_transform(self, raw_documents, y=None):
    """Standard TfidfVectorizer fit_transform method plus storing some
    meta-data needed for partial_refit methods (document frequency vector
    and number of samples).
    """
    X = super().fit_transform(raw_documents)
    self.n_samples, self.n_features = X.shape
    self.df = _document_frequency(X)
    self.n_samples += int(self._tfidf.smooth_idf)
    self._update_idf()
    return X
def fit(self, X: Union[sps.csr_matrix, np.ndarray]) -> 'BM25Transformer':
    if not sps.issparse(X):
        X = sps.csr_matrix(X)
    if self.use_idf:
        n_samples, n_features = X.shape
        df = _document_frequency(X)
        idf = np.log((n_samples - df + 0.5) / (df + 0.5))
        self._idf_diag = sps.spdiags(idf, diags=0, m=n_features, n=n_features)
    return self
def fit(self, X):
    """
    Parameters
    ----------
    X : sparse matrix, [n_samples, n_features]
        document-term matrix
    """
    X = check_array(X, accept_sparse=("csr", "csc"))
    if not sp.issparse(X):
        X = sp.csr_matrix(X)
    # the per-row indptr arithmetic and csr_matrix reconstruction below
    # require CSR layout, so convert if a CSC matrix was passed in
    X = X.tocsr()
    if self.use_idf:
        n_samples, n_features = X.shape
        df = _document_frequency(X)
        idf = np.log((n_samples - df + 0.5) / (df + 0.5))
        if self.floor is not None:
            # clamp idf values from below at the floor
            idf = np.maximum(idf, self.floor)
        self._idf_diag = sp.spdiags(idf, diags=0, m=n_features, n=n_features)

    # Create BM25 features
    # Document length (number of terms) in each row
    # Shape is (n_samples, 1)
    dl = X.sum(axis=1)
    # Number of non-zero elements in each row
    # Shape is (n_samples, )
    sz = X.indptr[1:] - X.indptr[0:-1]
    # In each row, repeat `dl` for `sz` times
    # Shape is (sum(sz), )
    # Example
    # -------
    # dl = [4, 5, 6]
    # sz = [1, 2, 3]
    # rep = [4, 5, 5, 6, 6, 6]
    rep = np.repeat(np.asarray(dl), sz)
    # Average document length
    # Scalar value
    avgdl = np.average(dl)
    # Compute BM25 score only for non-zero elements
    data = (X.data * (self.k1 + 1)
            / (X.data + self.k1 * (1 - self.b + self.b * rep / avgdl)))
    X = sp.csr_matrix((data, X.indices, X.indptr), shape=X.shape)
    if self.norm:
        X = normalize(X, norm=self.norm, copy=False)
    self._doc_matrix = X
    return self
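# A small demonstration of the np.repeat trick used above: it expands the
# per-row document lengths `dl` to one value per stored non-zero, which
# lines up element-for-element with `X.data` in CSR order. A toy matrix
# makes this concrete:
import numpy as np
import scipy.sparse as sp

X = sp.csr_matrix(np.array([[2, 0, 1],
                            [0, 4, 0],
                            [1, 2, 3]]))
dl = np.asarray(X.sum(axis=1)).ravel()  # terms per document: [3, 4, 6]
sz = np.diff(X.indptr)                  # non-zeros per row:   [2, 1, 3]
print(np.repeat(dl, sz))                # [3 3 4 6 6 6]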
def fit(self, X):
    """
    Parameters
    ----------
    X : sparse matrix, [n_samples, n_features]
        document-term matrix
    """
    if not sp.issparse(X):
        X = sp.csc_matrix(X)
    if self.use_idf:
        n_samples, n_features = X.shape
        df = _document_frequency(X)
        idf = np.log((n_samples - df + 0.5) / (df + 0.5))
        self._idf_diag = sp.spdiags(idf, diags=0, m=n_features, n=n_features)
    return self
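# Note that the unsmoothed BM25 idf used above turns negative as soon as a
# term appears in more than half of the documents, which down-weights very
# common terms aggressively. A quick check with toy numbers:
import numpy as np

n_samples = 10
df = np.array([1, 5, 9])
idf = np.log((n_samples - df + 0.5) / (df + 0.5))
print(idf)  # approx [ 1.85, 0.0, -1.85 ]: rare terms up, common terms down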
def fit(self, X, y):
    """
    Learn the idf vector (global term weights); the specific values in X
    don't matter, only which entries are non-zero.
    :param X: sparse matrix, [n_samples, n_features] a matrix of term counts
    :param y: class_label, [n_samples]
    :return: [n_class, n_features]
    """
    if self.use_idf:
        labelbin = LabelBinarizer()
        # which class each sample belongs to, [n_samples, n_classes]
        Y = labelbin.fit_transform(y)
        # LabelBinarizer returns a single column for binary problems,
        # unlike the multiclass case, so expand it to two columns
        if labelbin.y_type_ == "binary":
            Y = np.hstack((1 - Y, Y))
        self.classes_ = labelbin.classes_

        # number of samples of each class containing each feature term,
        # [n_classes, n_features]
        class_df_ = vectorize.class_df(X, Y)

        # when the class of a feature term is uncertain or unknown, fall
        # back to the total number of samples containing the term
        unknow_class_df_ = np.sum(class_df_, axis=0).reshape(1, -1)
        class_df_ = np.concatenate((class_df_, unknow_class_df_), axis=0)
        self.classes_ = np.concatenate(
            (self.classes_, np.array(["unknow"])), axis=0)

        # smooth class_df_
        class_df_ += int(self.smooth_idf)

        n_samples, n_features = X.shape
        df = _document_frequency(X)

        # perform idf smoothing if required
        df += int(self.smooth_idf)
        n_samples += int(self.smooth_idf)

        # log+1 instead of log makes sure terms with zero idf don't get
        # suppressed entirely.
        idf = float(n_samples) / df
        idf_diag = sp.spdiags(idf, diags=0, m=n_features, n=n_features)

        # [n_classes, n_features]
        self._idf = np.log(safe_sparse_dot(class_df_, idf_diag)) + 1.0
    return self
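# The binary special case above exists because scikit-learn's LabelBinarizer
# returns a single indicator column for two-class problems, unlike the
# multiclass case. A quick illustration with toy labels:
import numpy as np
from sklearn.preprocessing import LabelBinarizer

lb = LabelBinarizer()
Y = lb.fit_transform(["pos", "neg", "pos"])  # shape (3, 1) for binary labels
Y = np.hstack((1 - Y, Y))                    # expand to one column per class
print(Y)  # [[0 1]
          #  [1 0]
          #  [0 1]]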
def train(self, contexts, responses):
    """Fit the tf-idf transform and compute idf statistics."""
    with ignore_warnings():
        # Ignore deprecated `non_negative` warning.
        self._vectorizer = HashingVectorizer(non_negative=True)
        self._tfidf_transform = TfidfTransformer()
        count_matrix = self._tfidf_transform.fit_transform(
            self._vectorizer.transform(contexts + responses))
    n_samples, n_features = count_matrix.shape
    df = _document_frequency(count_matrix)
    idf = np.log((n_samples - df + 0.5) / (df + 0.5))
    self._idf_diag = sp.spdiags(
        idf, diags=0, m=n_features, n=n_features
    )
    document_lengths = count_matrix.sum(axis=1)
    self._average_document_length = np.mean(document_lengths)
    print(self._average_document_length)
def fit(self, X):
    """
    Parameters
    ----------
    X : sparse matrix, [n_samples, n_features]
        document-term matrix
    """
    if not sp.sparse.issparse(X):
        X = sp.sparse.csc_matrix(X)
    if self.use_idf:
        n_samples, n_features = X.shape
        df = _document_frequency(X)
        idf = np.log((n_samples - df + 0.5) / (df + 0.5))
        self._idf_diag = sp.sparse.spdiags(idf, diags=0, m=n_features,
                                           n=n_features)
        doc_len = X.sum(axis=1)
        self._average_document_len = np.average(doc_len)
    return self
def fit(self, X, y=None):
    """Learn the idf vector (global term weights)

    Parameters
    ----------
    X : sparse matrix, [n_samples, n_features]
        a matrix of term/token counts
    """
    if not sp.issparse(X):
        X = sp.csc_matrix(X)
    if self.use_idf:
        n_samples, n_features = X.shape
        df = _document_frequency(X)
        idf = np.log10(float(n_samples) / df)  # remove 1? should I add TF?
        self._idf_diag = sp.spdiags(idf, diags=0, m=n_features,
                                    n=n_features, format='csr')
    return self
def fit(self, X, y=None):
    """Learn the idf vector (global term weights)

    Parameters
    ----------
    X : sparse matrix, [n_samples, n_features]
        a matrix of term/token counts
    """
    if not sp.issparse(X):
        X = sp.csc_matrix(X)
    if self.use_idf:
        n_samples, n_features = X.shape
        df = _document_frequency(X)
        idf = np.log10((float(n_samples) - df + 0.5) / (df + 0.5))
        self._avgdl = np.average(X.sum(axis=1))
        self._idf_diag = sp.spdiags(idf, diags=0, m=n_features,
                                    n=n_features, format='csr')
    return self
def fit(self, X, y=None):
    """Learn the idf vector (global term weights)

    Parameters
    ----------
    X : sparse matrix, [n_samples, n_features]
        a matrix of term/token counts
    """
    X = X.toarray()
    self.avdl = X.sum() / X.shape[0]  # average document length
    # print("original input to fit:\n", X)
    # compute the tf value of each term
    self.tf = X.sum(0) / X.sum()  # [M], where M is the vocabulary size
    self.tf = self.tf.reshape([1, self.tf.shape[0]])  # [1, M]
    # print("tf\n", self.tf)

    ###### original tfidf code ######
    X = check_array(X, accept_sparse=('csr', 'csc'))
    if not sp.issparse(X):
        X = sp.csr_matrix(X)
    dtype = X.dtype if X.dtype in FLOAT_DTYPES else np.float64

    if self.use_idf:
        n_samples, n_features = X.shape
        df = _document_frequency(X).astype(dtype)

        # perform idf smoothing if required
        df += int(self.smooth_idf)
        n_samples += int(self.smooth_idf)

        # log+1 instead of log makes sure terms with zero idf don't get
        # suppressed entirely.
        idf = np.log(n_samples / df) + 1
        self._idf_diag = sp.diags(idf, offsets=0,
                                  shape=(n_features, n_features),
                                  format='csr', dtype=dtype)
    return self
def fit(self, X):
    """
    TODO: used for computing similarity?
    X : sparse matrix, [n_samples, n_features]
        document-term matrix
    """
    if not sp.isspmatrix(X):
        X = sp.csc_matrix(X)
    if self.use_idf:
        n_samples, n_features = X.shape
        # _document_frequency counts how many documents each term occurs in,
        # i.e. the number of non-zero values per feature in sparse X.
        df = _document_frequency(X)
        # inverse document frequency (BM25 variant)
        idf = np.log((n_samples - df + 0.5) / (df + 0.5))
        self._idf_log = sp.spdiags(idf, diags=0, m=n_features, n=n_features)
    return self
def fit(self, X, y=None):
    """Learn the idf vector (global term weights)

    Parameters
    ----------
    X : sparse matrix, [n_samples, n_features]
        a matrix of term/token counts
    """
    if not sp.issparse(X):
        X = sp.csc_matrix(X)
    if self.use_idf:
        n_samples, n_features = X.shape
        n_samples = float(n_samples)
        df = _document_frequency(X)
        # log+1 instead of log makes sure terms with zero idf don't get
        # suppressed entirely.
        # idf = np.log(df / n_samples)
        idf = df / n_samples
        self._idf_diag = sp.spdiags(idf, diags=0, m=n_features, n=n_features)
    return self
def fit(self, X, y=None):
    if not sp.issparse(X):
        X = sp.csc_matrix(X)
    if self.use_idf:
        n_samples, n_features = X.shape
        df = _document_frequency(X)

        # perform idf smoothing if required
        # df += int(self.smooth_idf)
        # n_samples += int(self.smooth_idf)

        # log+1 instead of log makes sure terms with zero idf don't get
        # suppressed entirely.
        idf = np.log(1 + 1 / df)
        dfidf = df * idf
        self._dfidf = dfidf
        self._dfidf_diag = sp.spdiags(dfidf, diags=0, m=n_features,
                                      n=n_features, format='csr')
    return self
def get_tf_idf_scores(msgs):
    count_vec = CountVectorizer(binary=False)
    count_df = count_vec.fit_transform(msgs)
    transformer = TfidfTransformer(use_idf=True, smooth_idf=False)
    x1 = transformer.fit_transform(count_df)

    posts_cnt = len(msgs)
    # score each term by log(total term count) * log(N / document frequency)
    vals = [
        math.log(x) * math.log(posts_cnt / float(y))
        for x, y in zip(
            count_df.sum(axis=0).tolist()[0], _document_frequency(x1))
    ]
    score_map = {k: vals[v] for k, v in count_vec.vocabulary_.items()}

    pattern = re.compile(r'[\W_]+', re.UNICODE)
    scores = []
    for msg in msgs:
        score = 0
        msg = pattern.sub(' ', msg).lower()
        for word in msg.split():
            if word in score_map:
                score += score_map[word]
        scores.append({"tf_idf_score": score})
    return scores
def fit(self, X, y=None):
    """Learn the bm25 vector (global term weights)

    Parameters
    ----------
    X : sparse matrix, [n_samples, n_features]
        a matrix of term/token counts
    """
    if not sp.issparse(X):
        X = sp.csc_matrix(X)
    if self.use_idf:
        n_samples, n_features = X.shape
        df = _document_frequency(X)

        # perform idf smoothing if required
        df += int(self.smooth_idf)
        n_samples += int(self.smooth_idf)

        # log+1 instead of log makes sure terms with zero idf don't get
        # suppressed entirely.
        idf = np.log((float(n_samples) - df + 0.5) / (df + 0.5))
        self._idf_diag = sp.spdiags(idf, diags=0, m=n_features, n=n_features)
    return self
def mapper(X, use_idf=self.use_idf):
    if not sp.issparse(X):
        X = sp.csc_matrix(X)
    if use_idf:
        return _document_frequency(X)
def getBM25Score(dataset, k1=1.2, b=0.75, mergetype='list',
                 min_df=2, cands=None):
    if not cands:
        cands = getAllCandidates(dataset, deliver_as='sentences')
        ds = [list(itertools.chain.from_iterable(doc)) for doc in cands]
    else:
        ds = cands

    words = listOfTaggedToListOfWords(dataset)
    # documents = listOfTaggedToString(dataset)
    # stopW = set(nltk.corpus.stopwords.words('english'))
    vec_tf = TfidfVectorizer(tokenizer=lambda e: e, lowercase=False,
                             use_idf=False)
    vec_tf.fit(ds)
    # vec_tf.ngram_range = (1, findBiggestGram(ds))
    # vec_tf.tokenizer = None
    # vec_tf.stop_words = stopW
    # vec_tf.min_df = 2
    terms = vec_tf.get_feature_names()
    X = vec_tf.transform(words)
    tf_arr = X.toarray()

    N = len(dataset)
    avgDL = getAvgDL(ds)
    DF_all = _document_frequency(X)  # .sum()

    score = []
    for i, doc in enumerate(dataset.values()):
        temp = []
        dl = len(list(itertools.chain.from_iterable(doc)))
        for j in range(len(terms)):
            DF = DF_all[j]
            tf = tf_arr[i][j]
            bm25_idf = log((N - DF + 0.5) / (DF + 0.5), 10)
            bm25_tf = (tf * (k1 + 1)) / (tf + k1 * (1 - b + (b * (dl / avgDL))))
            bm25 = bm25_tf * (bm25_idf + 1.)
            if DF >= min_df:
                temp.append(bm25 * (len(terms[j]) / len(terms[j].split())))
            else:
                temp.append(0.)
        score.append(temp)

    if mergetype == 'dict':
        return mergeDict(dataset, terms, score)
    else:
        return merge(dataset, terms, score)
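# A vectorized sketch of the inner scoring loop above, assuming a dense row
# of term frequencies and precomputed DF_all, N, dl and avgDL; the helper
# name `bm25_row` is hypothetical, not from the source:
import numpy as np

def bm25_row(tf_row, DF_all, N, dl, avgDL, k1=1.2, b=0.75):
    # base-10 idf with the +1 shift used above to keep scores positive
    idf = np.log10((N - DF_all + 0.5) / (DF_all + 0.5))
    # saturating tf component with document-length normalization
    tf_part = tf_row * (k1 + 1) / (tf_row + k1 * (1 - b + b * dl / avgDL))
    return tf_part * (idf + 1.0)

print(bm25_row(np.array([3.0, 0.0, 1.0]), np.array([2, 5, 1]),
               N=10, dl=4, avgDL=5.0))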