import numpy as np
import networkx as nx
from nltk.tokenize import PunktSentenceTokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

def TextRank(document):
    # Split the document into sentences
    sentence_tokenizer = PunktSentenceTokenizer()
    sentences = sentence_tokenizer.tokenize(document)
    # Build the term-frequency (bag-of-words) matrix
    c = CountVectorizer()
    bow_matrix = c.fit_transform(sentences)
    # Compute tf-idf
    normalized = TfidfTransformer().fit_transform(bow_matrix)
    # All words in the bag-of-words model
    all_words = c.get_feature_names()
    # index -> word mapping
    index2words = {v: k for k, v in c.vocabulary_.items()}
    # Indices of the top-3 keywords per sentence by tf-idf
    top_n_idx = np.argsort(normalized.todense())[:, -3:]
    # Look up the top-n keywords themselves
    top_n_words = np.vectorize(index2words.get)(top_n_idx)
    # Sentence similarity matrix (dot products of l2-normalised tf-idf rows)
    similarity_graph = normalized * normalized.T
    # Build the graph and run PageRank (TextRank)
    nx_graph = nx.from_scipy_sparse_matrix(similarity_graph)
    scores = nx.pagerank(nx_graph)
    return sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
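# Hedged usage sketch (an addition, not from the original source): rank the sentences
# of a short document and print the two most central ones as an extractive summary.
# Assumes the imports above; `document` here is just toy text.
if __name__ == '__main__':
    document = ("TextRank builds a graph whose nodes are sentences. "
                "Edges are weighted by tf-idf cosine similarity. "
                "PageRank scores then pick the most central sentences.")
    for score, sentence in TextRank(document)[:2]:
        print(round(score, 4), sentence)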
def get_matrix_topics_for_dec(self):
    from sklearn.feature_extraction.text import TfidfTransformer
    matrix, topics = self.get_matrix_topics(using='tf')
    topics = np.array(au.reindex(topics))
    # Re-weight term frequencies with sublinear tf-idf and l2 normalisation
    matrix = TfidfTransformer(norm='l2', sublinear_tf=True).fit_transform(matrix)
    matrix = matrix.astype(np.float32)
    print(matrix.shape, matrix.dtype, matrix.size)
    # Densify and rescale by the square root of the feature dimension
    matrix = np.asarray(matrix.todense()) * np.sqrt(matrix.shape[1])
    print('todense succeed')
    # Shuffle samples and labels with the same permutation
    p = np.random.permutation(matrix.shape[0])
    matrix = matrix[p]
    topics = topics[p]
    print('permutation finished')
    assert matrix.shape[0] == topics.shape[0]
    return matrix, topics
import collections
from sklearn.cluster import KMeans, DBSCAN
from sklearn.feature_extraction.text import TfidfTransformer

def facet_clustering(vectors, number_of_clusters, mode='kmeans'):
    # Re-weight the count vectors with tf-idf, then densify for clustering
    vectors = TfidfTransformer().fit_transform(X=vectors)
    vectors = vectors.todense().tolist()
    if mode == 'kmeans':
        clusters = KMeans(n_clusters=number_of_clusters, random_state=0).fit(vectors)
    elif mode == 'dbscan':
        clusters = DBSCAN(min_samples=3).fit(vectors)
    labels = clusters.labels_
    # Group vectors by cluster label; DBSCAN noise points (-1) keep their own bucket
    labeled_vectors = collections.defaultdict(list)
    for label, vector in zip(labels, vectors):
        labeled_vectors[label + 1 if label != -1 else -1].append(vector)
    return labeled_vectors
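# Hedged usage sketch (an addition, not from the original source): feed the function a
# document-term count matrix from CountVectorizer; the documents below are toy data.
from sklearn.feature_extraction.text import CountVectorizer

docs = ["cheap flights to rome", "hotel deals in rome",
        "python tfidf tutorial", "sklearn clustering tutorial"]
counts = CountVectorizer().fit_transform(docs)
grouped = facet_clustering(counts, number_of_clusters=2, mode='kmeans')
for cluster_id, members in grouped.items():
    print(cluster_id, len(members))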
def main():
    df = pd.read_csv(args.uf_csv)
    feature_cols = df.columns.drop(RESERVED_COLS)

    logging.info(u"Running TF-IDF")
    tfidf = TfidfTransformer().fit_transform(df[feature_cols])
    m = np.ascontiguousarray(tfidf.todense()).astype('float32')

    logging.info(u"Clustering via K-Means")
    _, cluster_labels = smart_kmeans_clustering(m, df.code, args.n_clusters,
                                                args.min_props_per_cluster)

    logging.info(u"Dumping data to: %s", args.output_path)
    with open(args.output_path, "w") as f:
        cnt_per_cluster = pd.Series(cluster_labels).value_counts()
        f.write("*** BEGIN INFO ***\n")
        for k, v in cnt_per_cluster.describe().items():
            f.write("%s: %s\n" % (k, v))
        f.write("*** END INFO ***\n")
        for cl_id in range(args.n_clusters):
            cluster = df[cluster_labels == cl_id]
            # Mean usage of each explanatory feature within the cluster,
            # normalised by the user's booking count
            explanation = cluster[feature_cols].apply(
                lambda x: x / cluster.booking_cnt).mean()
            f.write("Cluster #%s [%s]\n" % (cl_id, cluster.shape[0]))
            f.write("Explanation:\n")
            for k, v in explanation[explanation > FEATURE_THRESHOLD].items():
                f.write("-> %s: %.3f\n" % (k, v))
            f.write("Users: %s\n" % ", ".join(cluster.code.tolist()))
            f.write("---\n")
    logging.info(u"Finish")
def main():
    # os.chdir('/datasets')
    with open('SMSSpamCollection', 'r', encoding='utf-8') as f:
        data = f.readlines()
    print('the length of data:', len(data))
    label = binary_label(extract_label(data))
    corpus = extract_corpus(data)

    # Use scikit-learn to extract features: unigram and bigram bag-of-words
    vectorizers = []
    vectorizer = CountVectorizer(stop_words='english')
    vectorizers.append(vectorizer)
    bigram_vectorizer = CountVectorizer(ngram_range=(2, 2), stop_words='english')
    vectorizers.append(bigram_vectorizer)

    for i, j in enumerate(vectorizers):
        X = j.fit_transform(corpus)
        X = X.toarray()
        label = np.array(label).reshape(X.shape[0], 1)
        print(i, X.shape)
        print(i, X[0, :])
        data = np.concatenate((X, label), axis=1)
        if i == 1:
            np.savetxt('SMSSpamCollection_bigram-{}.csv'.format(i), data, delimiter=',')
        # Apply TF-IDF on top of the raw counts
        X = TfidfTransformer().fit_transform(X)
        X = np.array(X.todense())
        data = np.concatenate((X, label), axis=1)
        np.savetxt('SMSSpamCollection(TF_IDF)_bigram-{}.csv'.format(i), data, delimiter=',')
class GenomeDataset_v2(Dataset):
    '''Metagenomics dataset for reading simulated data in fasta format (.fna).'''

    HASH_PATTERN = r'\([a-f0-9]{40}\)'

    def __init__(self, fna_file, feature_type='bow', k_mer=4, return_raw=False,
                 use_tfidf=True, not_return_label=False):
        '''
        Args:
            fna_file: path to the fna file (fasta format).
            k_mer: number of nucleotides to combine into a word.
            return_raw: True to return raw tokenized gene strings instead of vectors.
            use_tfidf: True to re-weight the count matrix with tf-idf.
        '''
        assert os.path.exists(fna_file), '{} does not exist'.format(fna_file)
        self.data = []
        self.label = []
        self.is_raw = return_raw
        self.vocab = generate_k_mer_corpus(k_mer)
        self._len = 0
        with open(fna_file, 'r') as g_file:
            lines = [line.strip() for line in g_file.readlines()]
        gene_str = ''
        hash_label = ''
        for line in lines:
            # A '>' header starts a new sequence
            if line[0] == '>':
                # Store the previous gene string under its hash label
                if hash_label != '':
                    gene_str = ensure_gene_length(k_mer, gene_str)
                    gene_str = self.tokensize_gene_str(gene_str)
                    self.data.append(gene_str)
                    self.label.append(hash_label)
                    # Track the number of genes
                    self._len += 1
                # Reset for reading the new sequence
                hash_label = ''
                gene_str = ''
                dot_pos = line.find('.')
                # seq_flag indicates 1st or 2nd sequence
                seq_flag = int(line[dot_pos + 1])
                # 1st sequence: read the hash value (it encodes the label)
                if seq_flag == 1:
                    hash_pattern = re.search(self.HASH_PATTERN, line)
                    if hash_pattern is not None:
                        hash_label = hash_pattern.group(0)
                        # Remove the brackets
                        hash_label = hash_label.replace('(', '')
                        hash_label = hash_label.replace(')', '')
                else:
                    pass  # Ignore 2nd sequence for now
            # Gene string: accumulate the sequence body
            else:
                gene_str = gene_str + line
        count_vectorizer = CountVectorizer()
        self.numeric_data = count_vectorizer.fit_transform(self.data)
        if use_tfidf:
            self.numeric_data = TfidfTransformer(
                norm='l2', sublinear_tf=True).fit_transform(self.numeric_data)
            print('Finished TFIDF.')
        # Densify, rescale by sqrt of the feature dimension, and l2-normalise
        self.numeric_data = np.asarray(self.numeric_data.todense()) * np.sqrt(
            self.numeric_data.shape[1])
        self.numeric_data = normalize(self.numeric_data, norm='l2')
        self.numeric_data = self.numeric_data.astype('float32')
        self.lb_mapping = self.to_onehot_mapping_2(set(self.label))
        self.not_return_label = not_return_label

    def tokensize_gene_str(self, x: str):
        # Split the genome string into overlapping k-mer "words" separated by spaces
        res_str = ''
        for i in range(len(x) - 4):
            sub_k_mer_str = x[i:i + 4]
            res_str += (' ' + sub_k_mer_str)
        return res_str[1:]

    def to_onehot_mapping_2(self, lb_list):
        # Map each distinct label to an integer index
        lb_mapping = dict()
        for i, lb in enumerate(lb_list):
            lb_mapping[lb] = i
        return lb_mapping

    def __len__(self):
        # Length of the dataset in number of gene strings
        return self._len

    def __getitem__(self, idx):
        data = self.data[idx] if self.is_raw else self.numeric_data[idx]
        raw_lb = self.label[idx]
        lb = self.lb_mapping[raw_lb]
        if self.not_return_label:
            return (data, data)
        return (data, lb)
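# Hedged usage sketch (an addition, not from the original source): wrap the dataset in a
# PyTorch DataLoader and iterate over (tf-idf vector, label index) batches.
# 'reads.fna' is a hypothetical path to a simulated fasta file.
from torch.utils.data import DataLoader

dataset = GenomeDataset_v2('reads.fna', k_mer=4, use_tfidf=True)
loader = DataLoader(dataset, batch_size=32, shuffle=True)
for vectors, labels in loader:
    print(vectors.shape, labels.shape)
    break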
bagOfWords[x][y] = ''
print("Filtering: ", bagOfWords)
for i in range(0, len(bagOfWords)):
    bagOfWords[i] = list(filter(bool, bagOfWords[i]))
    dataSet[i] = ' '.join(bagOfWords[i])
print("Clean words: ", dataSet)

# VSM & TFIDF #
VSM = CountVectorizer().fit_transform(dataSet)
TFIDF = TfidfTransformer().fit_transform(VSM)
print("VSM: ", VSM)
print("", VSM.todense())
print("TFIDF: ", TFIDF)
print(TFIDF.todense())

# LABEL CONVERSION #
# Pendidikan = 0, RPL = 1, TKJ = 2, MM = 3 #
label_manual = [
    1, 1, 1, 2, 3, 3, 1, 1, 0, 2, 3,
    else:
        genreMat2.append(np.hstack([[genre.name], np.zeros(k)]))
genreMat2 = np.vstack(genreMat2)
print(genreMat2)

index = filmsbygenre['Action']
E = y[index, :]

### PCA ######################
ans = input("Start PCA with Scikit ? ")
if ans != "y":
    exit()
from sklearn.decomposition import PCA
pca = PCA(n_components=k, whiten=True)
y = pca.fit_transform(X.todense())
# Top-10 weighted terms for each principal component
topics3 = [[(pca.components_[l][i], feature_names[i])
            for i in np.argsort(-np.abs(pca.components_[l]))[:10]]
           for l in range(k)]
print(topics3)
genreMat3 = []
for genre in Genre.objects.all():
    index = filmsbygenre[genre.name]
    if index != []:
        E = y[index, :]
        genreMat3.append(np.hstack([[genre.name], np.mean(E, axis=0)]))
    else:
        genreMat3.append(np.hstack([[genre.name], np.zeros(k)]))
genreMat3 = np.vstack(genreMat3)
print(genreMat3)
## Calculating Cosine Similarities ##
trainVectorizerArray = vectorizerone.fit_transform(docsX, docsY).toarray()
testVectorizerArray = vectorizertest.fit_transform(docstest, docstestone).toarray()
print('Fit Vectorizer to train set', trainVectorizerArray)
print('Transform Vectorizer to test set', testVectorizerArray)

transformer = TfidfTransformer()
transformer.fit(trainVectorizerArray)
print()
print(transformer.transform(trainVectorizerArray).toarray())

transformer.fit(testVectorizerArray)
print()
tfidf = transformer.transform(testVectorizerArray)
print(tfidf.todense())
tfidf[0:1]

from sklearn.metrics.pairwise import linear_kernel
cosine_similarities = linear_kernel(tfidf[0:1], tfidf).flatten()
cosine_similarities
related_docs_indices = cosine_similarities.argsort()[:-2000:-1]
related_docs_indices
cosine_similarities[related_docs_indices]

## I tried my best to predict same_security for the test set using this approach, but I was not able to do so. Hence, I tried another approach (code below). ##

## %md Another Approach ##
import nltk
import numpy as np
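# Hedged sketch (an addition, not from the original source): to compare test rows against
# train rows they must share one vocabulary, so fit a single CountVectorizer and
# TfidfTransformer on the training docs and only transform the test docs.
# The documents below are toy stand-ins.
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics.pairwise import linear_kernel

train_docs = ["us treasury bond 2030", "apple inc common stock"]
test_docs = ["apple common stock", "treasury bond maturing 2030"]
cv = CountVectorizer().fit(train_docs)
tf = TfidfTransformer().fit(cv.transform(train_docs))
train_tfidf = tf.transform(cv.transform(train_docs))
test_tfidf = tf.transform(cv.transform(test_docs))
sims = linear_kernel(test_tfidf, train_tfidf)  # cosine similarity on l2-normalised tf-idf
best_match = sims.argmax(axis=1)               # most similar training doc per test doc
print(best_match)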
join the segmented words with spaces
import the packages
convert to a term-frequency matrix
compute TF-IDF
'''
import pandas as pd
import jieba

diao = pd.read_csv(
    r'C:\Users\Administrator\Documents\Tencent Files\1521131720\FileRecv\射雕_chapter.csv',
    engine='python', encoding='utf-8', index_col=0)
diao.head()

# Stop words / user dictionary for jieba segmentation
s1 = jieba.load_userdict(r'D:\PythonSpyder\NLP\jieba分词.txt')
jieba.lcut(diao)

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Convert to a term-frequency matrix
vectorizer = CountVectorizer()
# Words corresponding to each column
vectorizer.get_feature_names()
# Term dictionary
vectorizer.vocabulary_
# Compute TF-IDF; fit_transform(...).todense() converts the result to a dense matrix
diao_tfidf = TfidfTransformer()
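# Minimal sketch of the workflow described above (an addition with toy data, not the
# original CSV): segment Chinese text with jieba, join the tokens with spaces, build a
# term-frequency matrix with CountVectorizer, then compute TF-IDF and densify it.
import jieba
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

chapters = ["郭靖初到江南遇见黄蓉", "黄蓉与郭靖同行闯荡江湖"]  # toy stand-ins for the chapters
corpus = [' '.join(jieba.lcut(text)) for text in chapters]     # join segmented words with spaces
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(corpus)                      # term-frequency matrix
tfidf = TfidfTransformer().fit_transform(counts).todense()     # TF-IDF as a dense matrix
print(vectorizer.vocabulary_)                                  # term dictionary
print(tfidf)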
arquivos_fora_padrao.append(nome_arquivo)

# Perform stemming and store the result in the resolucoes_stem attribute
cn.stem()

for macrotema in cn.macrotema_por_norma:
    # Build a list containing only the norms of this specific macro-theme
    resolucoes_stem_macrotema = list(cn.df_resolucoes_macrotemas[
        cn.df_resolucoes_macrotemas['macrotema'] == macrotema]['norma'])

    # Vectorize and apply tf-idf
    vec = CountVectorizer()
    bag_palavras = vec.fit_transform(resolucoes_stem_macrotema)
    feature_names = vec.get_feature_names()
    base_tfidf = TfidfTransformer().fit_transform(bag_palavras)
    base_tfidf = base_tfidf.todense()

    # Reduce dimensionality
    base_tfidf_reduced = cn.SVD(600, base_tfidf)

    # Clustering
    print('Clustering started.')
    t = time.time()
    clusters_por_cosseno = hierarchy.linkage(
        base_tfidf_reduced, "average", metric="cosine")  # metric="euclidean" can also be tried
    plt.figure()
    dn = hierarchy.dendrogram(clusters_por_cosseno)
    plt.savefig('dendogram.jpg')
    limite_dissimilaridade = 0.92
    id_clusters = hierarchy.fcluster(clusters_por_cosseno,
assuntos = getting_data_subject(attr='classe_process')
l_docs, l_target = cut_data(assuntos, -1)
for d in l_docs:
    if type(d) == list:
        print(d)
for i, d in enumerate(l_docs):
    if type(d) == list:
        l_docs[i] = ""

vectorizer = CountVectorizer(strip_accents="unicode", max_df=0.8,
                             stop_words=get_stop_words())
counts = vectorizer.fit_transform(l_docs)
tfidf_transformer = TfidfTransformer().fit_transform(counts)
l_target_en = target_encode(l_target)

centers = [[1, 1], [-1, -1], [1, -1]]
X = StandardScaler().fit_transform(tfidf_transformer.todense())

# #############################################################################
# Compute DBSCAN
db = DBSCAN(eps=0.3, min_samples=10).fit(X)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

print('Estimated number of clusters: %d' % n_clusters_)
print("Homogeneity: %0.3f" % metrics.homogeneity_score(l_target_en, labels))
print("Completeness: %0.3f" % metrics.completeness_score(l_target_en, labels))
print("V-measure: %0.3f" % metrics.v_measure_score(l_target_en, labels))