Example #1
0
def TextRank(document):

    # Split the document into sentences
    sentence_tokenizer = PunktSentenceTokenizer()
    sentences = sentence_tokenizer.tokenize(document)

    # Build a term-frequency (bag-of-words) counter
    c = CountVectorizer()

    # Compute TF-IDF
    bow_matrix = c.fit_transform(sentences)
    normalized = TfidfTransformer().fit_transform(bow_matrix)

    # Get all words in the bag-of-words vocabulary
    all_words = c.get_feature_names()

    #index2word
    index2words = {v: k for k, v in c.vocabulary_.items()}

    # Indices of the top-3 keywords per sentence by TF-IDF
    top_n_idx = np.argsort(normalized.todense())[:, -3:]
    #print(top_n_idx)

    # Top-n keywords by TF-IDF
    top_n_words = np.vectorize(index2words.get)(top_n_idx)
    #print (top_n_words)

    # Sentence-to-sentence similarity (dot products of l2-normalized TF-IDF rows)
    similarity_graph = normalized * normalized.T

    # Build the similarity graph and compute TextRank (PageRank) scores
    nx_graph = nx.from_scipy_sparse_matrix(similarity_graph)
    scores = nx.pagerank(nx_graph)
    return sorted(((scores[i], s) for i, s in enumerate(sentences)),
                  reverse=True)
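
Note on library versions: CountVectorizer.get_feature_names() and nx.from_scipy_sparse_matrix() used above have been removed in recent scikit-learn and networkx releases in favour of get_feature_names_out() and nx.from_scipy_sparse_array(). A minimal sketch of the same vectorize-and-rank steps with the newer names (assuming scikit-learn >= 1.0 and networkx >= 3.0; sentence splitting is unchanged):

# Sketch only: the TextRank steps above with current API names.
import networkx as nx
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

def textrank_scores(sentences):
    counts = CountVectorizer().fit_transform(sentences)
    normalized = TfidfTransformer().fit_transform(counts)
    similarity_graph = normalized @ normalized.T             # cosine similarity of l2-normalized rows
    nx_graph = nx.from_scipy_sparse_array(similarity_graph)  # replaces from_scipy_sparse_matrix
    scores = nx.pagerank(nx_graph)
    return sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)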
Example #2
0
def get_matrix_topics_for_dec(self):
    from sklearn.feature_extraction.text import TfidfTransformer
    matrix, topics = self.get_matrix_topics(using='tf')
    topics = np.array(au.reindex(topics))
    # TF-IDF weighting of the term-frequency matrix
    matrix = TfidfTransformer(norm='l2', sublinear_tf=True).fit_transform(matrix)
    matrix = matrix.astype(np.float32)
    print(matrix.shape, matrix.dtype, matrix.size)
    # Densify and rescale each row by sqrt(n_features)
    matrix = np.asarray(matrix.todense()) * np.sqrt(matrix.shape[1])
    print('todense succeed')
    # Shuffle samples and labels with the same permutation
    p = np.random.permutation(matrix.shape[0])
    matrix = matrix[p]
    topics = topics[p]
    print('permutation finished')
    assert matrix.shape[0] == topics.shape[0]
    return matrix, topics
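
get_matrix_topics already returns a term-frequency matrix, which is why TfidfTransformer is applied to it directly; when starting from raw text, TfidfVectorizer combines the counting and the weighting in one step. A minimal sketch on a toy corpus (corpus and parameters are illustrative only):

# Sketch only: TfidfTransformer on precomputed counts vs. TfidfVectorizer on raw text.
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

toy_docs = ["example document one", "another example document"]

counts = CountVectorizer().fit_transform(toy_docs)
tfidf_from_counts = TfidfTransformer(norm='l2', sublinear_tf=True).fit_transform(counts)

tfidf_from_text = TfidfVectorizer(norm='l2', sublinear_tf=True).fit_transform(toy_docs)
# Both matrices hold the same values: TfidfVectorizer is CountVectorizer followed by TfidfTransformer.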
Example #3
0
def facet_clustering(vectors, number_of_clusters, mode='kmeans'):

    vectors = TfidfTransformer().fit_transform(X=vectors)
    vectors = vectors.todense().tolist()
    if mode == 'kmeans':
        clusters = KMeans(n_clusters=number_of_clusters,
                          random_state=0).fit(vectors)
    elif mode == 'dbscan':
        clusters = DBSCAN(min_samples=3).fit(vectors)
    else:
        raise ValueError("mode must be 'kmeans' or 'dbscan'")
    labels = clusters.labels_
    #labels = k.labels_
    #n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    #n_noise_ = list(labels).count(-1)

    #print('Estimated number of clusters: %d' % n_clusters_)
    #print('Estimated number of noise points: %d' % n_noise_)

    labeled_vectors = collections.defaultdict(list)
    for label, vector in zip(labels, vectors):
        labeled_vectors[label + 1 if label != -1 else -1].append(vector)

    return labeled_vectors
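
A hypothetical call to facet_clustering, starting from raw documents turned into a count matrix with CountVectorizer; the toy corpus and cluster count are made up for illustration, and the function applies the TF-IDF weighting itself:

# Sketch only: clustering a small, made-up corpus with the function above.
from sklearn.feature_extraction.text import CountVectorizer

toy_docs = ["red apple pie", "green apple tart", "blue evening sky", "grey morning sky"]
count_matrix = CountVectorizer().fit_transform(toy_docs)

labeled = facet_clustering(count_matrix, number_of_clusters=2, mode='kmeans')
for label, members in labeled.items():
    print(label, len(members))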
Example #4
0
def main():
    df = pd.read_csv(args.uf_csv)
    feature_cols = df.columns.drop(RESERVED_COLS)

    logging.info(u"Running TF-IDF")
    tfidf = TfidfTransformer().fit_transform(df[feature_cols])
    m = np.ascontiguousarray(tfidf.todense()).astype('float32')

    logging.info(u"Clustering via K-Means")
    _, cluster_labels = smart_kmeans_clustering(m, df.code, args.n_clusters,
                                                args.min_props_per_cluster)

    logging.info(u"Dumping data to: %s", args.output_path)
    with open(args.output_path, "w") as f:
        cnt_per_cluster = pd.Series(cluster_labels).value_counts()

        f.write("*** BEGIN INFO ***\n")
        for k, v in cnt_per_cluster.describe().iteritems():
            f.write("%s: %s\n" % (k, v))
        f.write("*** END INFO ***\n")

        for cl_id in range(args.n_clusters):
            cluster = df[cluster_labels == cl_id]

            # mean normalized usage (per booking) of explanatory features in the cluster
            explanation = cluster[feature_cols].apply(
                lambda x: x / cluster.booking_cnt).mean()

            f.write("Cluster #%s [%s]\n" % (cl_id, cluster.shape[0]))
            f.write("Explanation:\n")

            for k, v in explanation[
                    explanation > FEATURE_THRESHOLD].iteritems():
                f.write("-> %s: %.3f\n" % (k, v))

            f.write("Users: %s\n" % ", ".join(cluster.code.tolist()))
            f.write("---\n")
    logging.info(u"Finish")
Example #5
0
def main():
    # os.chdir('/datasets')
    with open('SMSSpamCollection', 'r', encoding='utf-8') as f:
        data = f.readlines()
        f.close()
    print('the length of data:', len(data))
    label = binary_label(extract_label(data))
    corpus = extract_corpus(data)

    # sklearn to extract the feature
    vectorizers = []
    vectorizer = CountVectorizer(stop_words='english')
    vectorizers.append(vectorizer)
    # build a bigram bag-of-words feature set
    bigram_vectorizer = CountVectorizer(ngram_range=(2, 2),
                                        stop_words='english')
    vectorizers.append(bigram_vectorizer)
    for i, j in enumerate(vectorizers):
        X = j.fit_transform(corpus)
        X = X.toarray()
        label = np.array(label).reshape(X.shape[0], 1)
        print(i, X.shape)
        print(i, X[0, :])
        data = np.concatenate((X, label), axis=1)

        if i == 1:
            np.savetxt('SMSSpamCollection_bigram-{}.csv'.format(i),
                       data,
                       delimiter=',')

        # apply TF-IDF for further feature extraction
        X = TfidfTransformer().fit_transform(X)
        X = np.array(X.todense())
        data = np.concatenate((X, label), axis=1)

        np.savetxt('SMSSpamCollection(TF_IDF)_bigram-{}.csv'.format(i),
                   data,
                   delimiter=',')
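
The CountVectorizer -> TfidfTransformer chain in the loop above can also be expressed as a single scikit-learn pipeline. A minimal sketch with the same bigram settings (corpus is the list built by extract_corpus above):

# Sketch only: the bigram bag-of-words + TF-IDF steps as one pipeline.
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import make_pipeline

tfidf_bigrams = make_pipeline(
    CountVectorizer(ngram_range=(2, 2), stop_words='english'),
    TfidfTransformer(),
)
X_tfidf = tfidf_bigrams.fit_transform(corpus)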
Example #6
0
class GenomeDataset_v2(Dataset):
    '''
    Metagenomics dataset for reading simulated data in fasta format (.fna)
    '''
    HASH_PATTERN = r'\([a-f0-9]{40}\)'

    def __init__(self,
                 fna_file,
                 feature_type='bow',
                 k_mer=4,
                 return_raw=False,
                 use_tfidf=True,
                 not_return_label=False):
        '''
        Args:
            fna_file: path to the fna file (fasta format).
            feature_type: feature representation to build ('bow' by default).
            k_mer: number of nucleotides to combine into a word.
            return_raw: True to return the raw tokenized gene string instead of numeric features.
            use_tfidf: True to apply TF-IDF weighting to the k-mer count matrix.
            not_return_label: True to return (data, data) pairs instead of (data, label).
        '''
        assert os.path.exists(fna_file), '{} does not exist'.format(fna_file)
        self.data = []
        self.label = []
        self.is_raw = return_raw
        self.vocab = generate_k_mer_corpus(k_mer)
        self._len = 0
        with open(fna_file, 'r') as g_file:
            lines = g_file.readlines()
            lines = [line.strip() for line in lines]
            gene_str = ''
            hash_label = ''
            for line in lines:
                # Catch new sequence
                if line[0] == '>':

                    # Store the accumulated gene string under its hash label
                    if hash_label != '':
                        # self.match_dict[hash_label].append(ensure_gene_length(k_mer, gene_str))
                        gene_str = ensure_gene_length(k_mer, gene_str)
                        gene_str = self.tokensize_gene_str(gene_str)
                        self.data.append(gene_str)
                        self.label.append(hash_label)

                        # Track the number of genes
                        self._len += 1

                    # Reset hash_label for reading new sequence
                    hash_label = ''
                    gene_str = ''
                    dot_pos = line.find('.')
                    # seq_flag indicates whether this is the 1st or 2nd sequence
                    seq_flag = int(line[dot_pos + 1])

                    # 1st sequence: read the hash value (it indicates the label)
                    if seq_flag == 1:
                        hash_pattern = re.search(GenomeDataset_v2.HASH_PATTERN,
                                                 line)
                        if hash_pattern is not None:
                            # for res in hash_pattern:
                            hash_label = hash_pattern.group(0)

                            # Remove the brackets
                            hash_label = hash_label.replace('(', '')
                            hash_label = hash_label.replace(')', '')
                    else:
                        pass  # Ignore 2nd sequence for now
                # Gene string
                else:
                    gene_str = gene_str + line

        # Build the k-mer count matrix from the tokenized gene strings
        count_vectorizer = CountVectorizer()
        self.numeric_data = count_vectorizer.fit_transform(self.data)

        if use_tfidf:
            self.numeric_data = TfidfTransformer(
                norm='l2', sublinear_tf=True).fit_transform(self.numeric_data)
            print('Finished TFIDF.')

        self.numeric_data = np.asarray(self.numeric_data.todense()) * np.sqrt(
            self.numeric_data.shape[1])
        self.numeric_data = normalize(self.numeric_data, norm='l2')
        self.numeric_data = self.numeric_data.astype('float32')

        self.lb_mapping = self.to_onehot_mapping_2(set(self.label))
        self.not_return_label = not_return_label

    def tokensize_gene_str(self, x: str):
        # Split the gene string into space-separated k-mers (k is hard-coded to 4 here)
        res_str = ''
        for i in range(len(x) - 4 + 1):  # + 1 so the final k-mer is included
            sub_k_mer_str = x[i:i + 4]
            res_str += (' ' + sub_k_mer_str)

        return res_str[1:]

    def to_onehot_mapping_2(self, lb_list):
        lb_mapping = dict()
        for i, lb in enumerate(lb_list):
            lb_mapping[lb] = i

        return lb_mapping

    def __len__(self):
        # Return len of dataset in number of gene strings
        return self._len

    def __getitem__(self, idx):
        data = self.data[idx] if self.is_raw else self.numeric_data[idx]
        raw_lb = self.label[idx]
        lb = self.lb_mapping[raw_lb]

        if self.not_return_label:
            return (data, data)
        return (data, lb)
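
A hypothetical way to consume GenomeDataset_v2 with a PyTorch DataLoader; the fna path and batch size are placeholders, and the helpers the class depends on (generate_k_mer_corpus, ensure_gene_length, normalize) are assumed to be importable:

# Sketch only: batching the dataset with torch.utils.data.DataLoader.
from torch.utils.data import DataLoader

dataset = GenomeDataset_v2('simulated_reads.fna', k_mer=4, use_tfidf=True)  # placeholder path
loader = DataLoader(dataset, batch_size=32, shuffle=True)

for features, labels in loader:
    print(features.shape, labels.shape)  # (batch, vocab_size), (batch,)
    break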
Example #7
0
                bagOfWords[x][y] = ''
print("Filtering: ", bagOfWords)

for i in range(0, len(bagOfWords)):
    bagOfWords[i] = list(filter(bool, bagOfWords[i]))
    dataSet[i] = ' '.join(bagOfWords[i])
print("Clean words: ", dataSet)

#VSM & TFIDF#
VSM = CountVectorizer().fit_transform(dataSet)
TFIDF = TfidfTransformer().fit_transform(VSM)
#print (CountVectorizer().vocabulary)
print("VSM: ", VSM)
print("", VSM.todense())
print("TFIDF: ", TFIDF)
print(TFIDF.todense())

#LABEL CONVERSION#
#Pendidikan = 0, RPL = 1, TKJ = 2, MM = 3#
label_manual = [
    1,
    1,
    1,
    2,
    3,
    3,
    1,
    1,
    0,
    2,
    3,
Example #8
0
    else:
        genreMat2.append( np.hstack([ [genre.name] , np.zeros(k) ] ))
genreMat2 = np.vstack(genreMat2)
print genreMat2

index = filmsbygenre['Action']
E = y[index, :]

### PCA ######################
ans = raw_input("Start PCA with Scikit ? ")
if ans != "y":
    exit()

from sklearn.decomposition import PCA
pca = PCA(n_components = k, whiten=True)
y = pca.fit_transform(X.todense())

topics3 = [[(pca.components_[l][i], feature_names[i]) for i in np.argsort(-np.abs(pca.components_[l]))[:10]] for l in range(k)]
print topics3

genreMat3 = []
for genre in Genre.objects.all():
    index = filmsbygenre[genre.name]
    if index != []:
        E = y[index, :]
        genreMat3.append( np.hstack([ [genre.name] , np.mean(E, axis = 0)]) )
    else:
        genreMat3.append( np.hstack([ [genre.name] , np.zeros(k) ] ))
genreMat3 = np.vstack(genreMat3)
print genreMat3
Example #9
0

## Calculating Cosine Similarities ##

trainVectorizerArray = vectorizerone.fit_transform(docsX,docsY).toarray()
testVectorizerArray = vectorizertest.fit_transform(docstest,docstestone).toarray()
print ('Fit Vectorizer to train set', trainVectorizerArray)
print ('Transform Vectorizer to test set', testVectorizerArray)
transformer = TfidfTransformer()
transformer.fit(trainVectorizerArray)
print()
print(transformer.transform(trainVectorizerArray).toarray())
transformer.fit(testVectorizerArray)
print()
tfidf = transformer.transform(testVectorizerArray)
print(tfidf.todense())
tfidf[0:1]
from sklearn.metrics.pairwise import linear_kernel
cosine_similarities = linear_kernel(tfidf[0:1] , tfidf).flatten()
cosine_similarities
related_docs_indices = cosine_similarities.argsort()[:-2000:-1]
related_docs_indices
cosine_similarities[related_docs_indices]

## I tried my best to predict the same_security for the test set using this approach, but I was not able to do so. Hence, I tried another approach (code below). ##
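
One likely reason this approach stalls is that the train and test documents are vectorized with two separate vocabularies, so their TF-IDF columns do not line up. A minimal sketch of the usual shared-vocabulary setup, reusing docsX and docstest from above:

# Sketch only: fit one vocabulary on the training documents and reuse it for the test documents.
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics.pairwise import linear_kernel

shared_vectorizer = CountVectorizer()
train_counts = shared_vectorizer.fit_transform(docsX)
test_counts = shared_vectorizer.transform(docstest)

shared_transformer = TfidfTransformer().fit(train_counts)
train_tfidf = shared_transformer.transform(train_counts)
test_tfidf = shared_transformer.transform(test_counts)

similarities = linear_kernel(test_tfidf, train_tfidf)  # one row of train similarities per test document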


## %md Another Approach ##

import nltk
import numpy as np
Example #10
0
Join tokens with spaces
Import the packages
Convert to a term-frequency matrix
Compute TF-IDF
'''
import jieba
import pandas as pd

diao = pd.read_csv(
    r'C:\Users\Administrator\Documents\Tencent Files\1521131720\FileRecv\射雕_chapter.csv',
    engine='python',
    encoding='utf-8',
    index_col=0)
diao.head()
# Load a custom user dictionary for jieba segmentation
s1 = jieba.load_userdict(r'D:\PythonSpyder\NLP\jieba分词.txt')
# Segment each chapter with jieba and re-join the tokens with spaces
# (assumes the chapter text sits in the first column of the CSV)
corpus = [' '.join(jieba.lcut(str(text))) for text in diao.iloc[:, 0]]

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Convert the segmented chapters to a term-frequency matrix
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(corpus)

# Words corresponding to each column
vectorizer.get_feature_names()
# Vocabulary dictionary
vectorizer.vocabulary_

# Compute TF-IDF and densify to a matrix
diao_tfidf = TfidfTransformer().fit_transform(counts)
diao_tfidf.todense()
Example #11
0
    else:
        genreMat2.append(np.hstack([[genre.name], np.zeros(k)]))
genreMat2 = np.vstack(genreMat2)
print genreMat2

index = filmsbygenre['Action']
E = y[index, :]

### PCA ######################
ans = raw_input("Start PCA with Scikit ? ")
if ans != "y":
    exit()

from sklearn.decomposition import PCA
pca = PCA(n_components=k, whiten=True)
y = pca.fit_transform(X.todense())

topics3 = [[(pca.components_[l][i], feature_names[i])
            for i in np.argsort(-np.abs(pca.components_[l]))[:10]]
           for l in range(k)]
print topics3

genreMat3 = []
for genre in Genre.objects.all():
    index = filmsbygenre[genre.name]
    if index != []:
        E = y[index, :]
        genreMat3.append(np.hstack([[genre.name], np.mean(E, axis=0)]))
    else:
        genreMat3.append(np.hstack([[genre.name], np.zeros(k)]))
genreMat3 = np.vstack(genreMat3)
Example #12
0
        arquivos_fora_padrao.append(nome_arquivo)

# Apply stemming and store the result in the resolucoes_stem attribute
cn.stem()

for macrotema in cn.macrotema_por_norma:
    # Build a list containing only the norms of this specific macro-theme
    resolucoes_stem_macrotema = list(cn.df_resolucoes_macrotemas[
        cn.df_resolucoes_macrotemas['macrotema'] == macrotema]['norma'])

    # Vectorize and apply TF-IDF
    vec = CountVectorizer()
    bag_palavras = vec.fit_transform(resolucoes_stem_macrotema)
    feature_names = vec.get_feature_names()
    base_tfidf = TfidfTransformer().fit_transform(bag_palavras)
    base_tfidf = base_tfidf.todense()

    # Reduce dimensionality
    base_tfidf_reduced = cn.SVD(600, base_tfidf)

    #Clustering
    print('Clustering started.')
    t = time.time()
    clusters_por_cosseno = hierarchy.linkage(
        base_tfidf_reduced, "average",
        metric="cosine")  # metric="euclidean" can also be tried
    plt.figure()
    dn = hierarchy.dendrogram(clusters_por_cosseno)
    plt.savefig('dendogram.jpg')
    limite_dissimilaridade = 0.92
    id_clusters = hierarchy.fcluster(clusters_por_cosseno,
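
The fcluster call is cut off in this snippet; the usual pattern passes the dissimilarity threshold defined just above together with a distance criterion. A sketch of that pattern only, since the exact arguments in the original are unknown:

# Sketch only: flat cluster ids from the cosine-linkage tree at the chosen threshold.
id_clusters = hierarchy.fcluster(clusters_por_cosseno,
                                 limite_dissimilaridade,
                                 criterion='distance')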
Example #13
0
    assuntos = getting_data_subject(attr='classe_process')
    l_docs, l_target = cut_data(assuntos, -1)
    for d in l_docs:
        if type(d) == list:
            print(d)
    for i, d in enumerate(l_docs):
        if type(d) == list:
            l_docs[i] = ""

    vectorizer = CountVectorizer(strip_accents="unicode", max_df=0.8, stop_words=get_stop_words())
    counts = vectorizer.fit_transform(l_docs)
    tfidf_transformer = TfidfTransformer().fit_transform(counts)
    l_target_en = target_encode(l_target)
    centers = [[1, 1], [-1, -1], [1, -1]]

    X = StandardScaler().fit_transform(tfidf_transformer.todense())

    # #############################################################################
    # Compute DBSCAN
    db = DBSCAN(eps=0.3, min_samples=10).fit(X)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_

    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

    print('Estimated number of clusters: %d' % n_clusters_)
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(l_target_en, labels))
    print("Completeness: %0.3f" % metrics.completeness_score(l_target_en, labels))
    print("V-measure: %0.3f" % metrics.v_measure_score(l_target_en, labels))