def _word_tfidf_dist(documents):
    words_tfidf = {}
    if len(documents) > 0:
        if _check_is_sentence(documents):  # if document contains only 1 or 2 or 3 chars --> acronyms
            try:
                text_analyzer = Vectorizer(ngram_range=(1, 2), max_features=50)
                matrix = text_analyzer.fit_transform(documents).todense()
                for word, indice in text_analyzer.vocabulary_.items():
                    words_tfidf[word] = score_tfidf_freq(matrix[:, indice])
            except ValueError:
                return {}
        else:
            return _freqdist(documents)
    return words_tfidf
def vectorize_videos(fpath, use_idf=False):
    '''
    Converts a YouTube tag file to a weighted sparse matrix. Weights can be
    based on IDF if specified.

    Arguments
    ---------
    fpath: a path to a file
        Each line is a song, tags are separated by spaces
    use_idf: bool (optional, defaults to False)
        Indicates whether to use IDF.
    '''
    # Vectorizes to TF-IDF
    vectorizer = Vectorizer(analyzer=NoopAnalyzer(), use_idf=use_idf)
    sparse_matrix = vectorizer.fit_transform(clean_up(fpath, bottom_filter=0))
    vocabulary = vectorizer.vocabulary
    return sparse_matrix, vocabulary
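A minimal usage sketch for vectorize_videos above, assuming a hypothetical tags.txt in the documented format (one song per line, tags separated by spaces); clean_up and NoopAnalyzer are helpers from the surrounding project:

# tags.txt (hypothetical input), one song per line:
#   rock indie 90s
#   pop dance
#   rock metal live
sparse_matrix, vocabulary = vectorize_videos('tags.txt', use_idf=True)
print sparse_matrix.shape    # (number of songs, number of distinct tags)
print sorted(vocabulary)     # the tags that made it into the vocabulary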
def get_20newsgroups_data_info_for_categories(categories):
    data = fetch_20newsgroups(subset='all', categories=categories,
                              shuffle=False)
    vectorizer = Vectorizer()
    t0 = time()
    tfidf = vectorizer.fit_transform(data.data)
    pairwise_similarity = (tfidf * tfidf.T).todense().tolist()
    print "done in %fs" % (time() - t0)
    labels = [data.target_names[i] for i in data.target]
    payloads = [os.sep.join(e.split(os.sep)[-3:]) for e in data.filenames]
    # Similarity is from zero to one, so (1 - s) gives a distance from 0 to 1.
    distances = [[(1 - s) for s in row[:col_to + 1]]
                 for (col_to, row) in enumerate(pairwise_similarity)]
    # Fix the very slight offset introduced by precision conversion
    for row in distances:
        row[-1] = 0
    pcd_tuples = zip(payloads, labels, distances)
    di = DataInfo.deserialize_pcd_tuples(pcd_tuples)
    return di
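A minimal sketch of why tfidf * tfidf.T in the snippet above gives pairwise similarities: assuming the vectorizer L2-normalizes each row (the usual TF-IDF default), the dot product of two rows is their cosine similarity, and 1 - s converts it to a distance in [0, 1].

import numpy as np

# Two toy, already unit-length TF-IDF rows (not taken from the dataset)
a = np.array([0.6, 0.8, 0.0])
b = np.array([1.0, 0.0, 0.0])

similarity = np.dot(a, b)    # cosine similarity, here 0.6
distance = 1.0 - similarity  # cosine distance, here 0.4
print distance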
documents = data_train.data + data_test.data
target_names = set(data_train.target_names + data_test.target_names)

print "%d documents" % len(documents)
print "%d categories" % len(target_names)
print

# split a training set and a test set
labels = np.concatenate((data_train.target, data_test.target))
true_k = np.unique(labels).shape[0]

print "Extracting features from the training dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer(max_features=10000)
X = vectorizer.fit_transform(documents)
X = Normalizer(norm="l2", copy=False).transform(X)

print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X.shape
print

###############################################################################
# Now sparse MiniBatchKmeans

mbkm = MiniBatchKMeans(init="random", k=true_k, max_iter=10, random_state=13,
                       chunk_size=1000)
print "Clustering sparse data with %s" % str(mbkm)
t0 = time()
mbkm.fit(X)
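A tiny sketch of what the Normalizer(norm="l2") step above does: every row is rescaled to unit Euclidean length, so later distance computations on X behave like cosine distances.

import numpy as np
from sklearn.preprocessing import Normalizer

row = np.array([[3.0, 4.0]])
print Normalizer(norm="l2").transform(row)    # [[ 0.6  0.8]] -- a unit-length row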
def test_vectorizer():
    # raw documents as an iterator
    train_data = iter(ALL_FOOD_DOCS[:-1])
    test_data = [ALL_FOOD_DOCS[-1]]
    n_train = len(ALL_FOOD_DOCS) - 1

    # test without vocabulary
    v1 = CountVectorizer(max_df=0.5)
    counts_train = v1.fit_transform(train_data)
    if hasattr(counts_train, 'tocsr'):
        counts_train = counts_train.tocsr()
    assert_equal(counts_train[0, v1.vocabulary[u"pizza"]], 2)

    # build a vectorizer v2 with the same vocabulary as the one fitted by v1
    v2 = CountVectorizer(vocabulary=v1.vocabulary)

    # check that the two vectorizers give the same output on the test sample
    for v in (v1, v2):
        counts_test = v.transform(test_data)
        if hasattr(counts_test, 'tocsr'):
            counts_test = counts_test.tocsr()

        assert_equal(counts_test[0, v.vocabulary[u"salad"]], 1)
        assert_equal(counts_test[0, v.vocabulary[u"tomato"]], 1)
        assert_equal(counts_test[0, v.vocabulary[u"water"]], 1)

        # stop word from the fixed list
        assert_false(u"the" in v.vocabulary)

        # stop word found automatically by the vectorizer DF thresholding;
        # words that are highly frequent across the complete corpus are likely
        # to be uninformative (either real stop words or extraction artifacts)
        assert_false(u"copyright" in v.vocabulary)

        # not present in the sample
        assert_equal(counts_test[0, v.vocabulary[u"coke"]], 0)
        assert_equal(counts_test[0, v.vocabulary[u"burger"]], 0)
        assert_equal(counts_test[0, v.vocabulary[u"beer"]], 0)
        assert_equal(counts_test[0, v.vocabulary[u"pizza"]], 0)

    # test tf-idf
    t1 = TfidfTransformer(norm='l1')
    tfidf = toarray(t1.fit(counts_train).transform(counts_train))
    assert_equal(len(t1.idf_), len(v1.vocabulary))
    assert_equal(tfidf.shape, (n_train, len(v1.vocabulary)))

    # test tf-idf with new data
    tfidf_test = toarray(t1.transform(counts_test))
    assert_equal(tfidf_test.shape, (len(test_data), len(v1.vocabulary)))

    # test tf alone
    t2 = TfidfTransformer(norm='l1', use_idf=False)
    tf = toarray(t2.fit(counts_train).transform(counts_train))
    assert_equal(t2.idf_, None)

    # L1-normalized term frequencies sum to one
    assert_array_almost_equal(np.sum(tf, axis=1), [1.0] * n_train)

    # test the direct tfidf vectorizer
    # (equivalent to term count vectorizer + tfidf transformer)
    train_data = iter(ALL_FOOD_DOCS[:-1])
    tv = Vectorizer(norm='l1')
    tv.tc.max_df = v1.max_df
    tfidf2 = toarray(tv.fit_transform(train_data))
    assert_array_almost_equal(tfidf, tfidf2)

    # test the direct tfidf vectorizer with new data
    tfidf_test2 = toarray(tv.transform(test_data))
    assert_array_almost_equal(tfidf_test, tfidf_test2)

    # test empty vocabulary
    v3 = CountVectorizer(vocabulary=None)
    assert_raises(ValueError, v3.transform, train_data)
print 'data loaded'

categories = data_train.target_names    # for case categories == None

print "%d documents (training set)" % len(data_train.data)
print "%d documents (testing set)" % len(data_test.data)
print "%d categories" % len(categories)
print

# split a training set and a test set
y_train, y_test = data_train.target, data_test.target

print "Extracting features from the training dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer()
X_train = vectorizer.fit_transform(data_train.data)
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_train.shape
print

print "Extracting features from the test dataset using the same vectorizer"
t0 = time()
X_test = vectorizer.transform(data_test.data)
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_test.shape
print

if opts.select_chi2:
    print ("Extracting %d best features by a chi-squared test" %
           opts.select_chi2)
    t0 = time()
filenames = np.concatenate((data_train.filenames, data_test.filenames))
target_names = set(data_train.target_names + data_test.target_names)

print "%d documents" % len(filenames)
print "%d categories" % len(target_names)
print

# split a training set and a test set
labels = np.concatenate((data_train.target, data_test.target))
true_k = np.unique(labels).shape[0]

print "Extracting features from the training dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer(max_features=10000)
X = vectorizer.fit_transform((open(f).read() for f in filenames))
X = Normalizer(norm="l2", copy=False).transform(X)
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X.shape
print

###############################################################################
# Now sparse MiniBatchKmeans

print "_" * 80
mbkm = MiniBatchKMeans(init="random", k=true_k, max_iter=10, random_state=13,
                       chunk_size=1000, tol=0.0, n_init=1)
def main(tcu_fpath):
    data = tcu_io.load_untreated_csv_to_numpy(tcu_fpath)

    # We only want accepted data
    data = data[data['Situacao'] == 'Aceito e Habilitado']

    # Get invalid lines
    invalids = invalid(data)

    # Transforms descriptions to base strings
    desc_column = data['Descricao']
    des_cmp_column = data['DescricaoComplementar']
    unidade_column = data['UnidadeFornecimento']
    qtd_column = [str(qtd) for qtd in data['Quantidade']]

    as_docs = []
    for as_text in zip(desc_column, des_cmp_column, unidade_column,
                       qtd_column):
        doc = " ".join(as_text)
        as_docs.append(doc)

    # Vectorizes to TF-IDF
    vectorizer = Vectorizer()
    doc_sparse_matrix = vectorizer.fit_transform(as_docs)

    # Run K-Means
    num_clusters = 7
    mbkm = MiniBatchKMeans(num_clusters, init='random')
    mbkm.fit(doc_sparse_matrix)

    # New labels column, replaces both Descricao columns
    labels_column = mbkm.labels_

    # Old columns to keep
    chave_column = data['ChavePregao']
    uasg_column = data['UASG']
    pregoeiro_column = data['PregoeiroOficial']
    aceito_column = data['AceitoPara_CNPJ']
    lance_column = data['PeloMenorLance']
    ref_column = data['ValordeReferencia']
    ganho_column = data['GanhoPregao']

    # And a new column: Superfaturamento (overpricing)
    super_faturamento = np.ndarray(shape=len(labels_column), dtype='S12')
    for i, ganho in enumerate(ganho_column):
        if ganho >= -50:  # up to 50% over the price is accepted
            super_faturamento[i] = 'OK'
        elif ganho < -50 and ganho > -500:  # more than that is overpriced
            super_faturamento[i] = 'Super'
        elif ganho < -500:  # more than 5x the value is extreme
            super_faturamento[i] = 'SuperPlus'

    for i in xrange(len(labels_column)):
        if i not in invalids:
            print(labels_column[i], end=',')
            print(chave_column[i], end=',')
            print(uasg_column[i], end=',')
            print(pregoeiro_column[i], end=',')
            print(aceito_column[i], end=',')
            print(lance_column[i], end=',')
            print(ref_column[i], end=',')
            print(ganho_column[i], end=',')
            print(super_faturamento[i])
input_data = csv.reader(open('descriptions_100.csv', 'rb'))

dataset_data = []
dataset_target = []
for row in input_data:
    dataset_data.append(row[1])
    dataset_target.append(row[0])

labels = dataset_target
true_k = np.unique(labels).shape[0]

print "Extracting features from the training dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer(max_df=0.95, max_features=10000)
X = vectorizer.fit_transform(dataset_data)
print X

print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X.shape

###############################################################################
# Do the actual clustering

km = MiniBatchKMeans(k=true_k, init='k-means++', n_init=1, init_size=1000,
                     batch_size=1000, verbose=1)

print "Clustering with %s" % km
t0 = time()
km.fit(X)
print "done in %0.3fs\n" % (time() - t0)
def main(tcu_fpath):
    data = tcu_io.load_untreated_csv_to_numpy(tcu_fpath)
    data = data[data['Situacao'] == 'Aceito e Habilitado']

    desc_column = data['Descricao']
    des_cmp_column = data['DescricaoComplementar']
    unidade_column = data['UnidadeFornecimento']
    qtd_column = [str(qtd) for qtd in data['Quantidade']]

    # Transforms descriptions to base strings
    as_docs = []
    for as_text in zip(desc_column, des_cmp_column, unidade_column,
                       qtd_column):
        doc = " ".join(as_text)
        as_docs.append(doc)

    # Vectorizes to TF-IDF
    vectorizer = Vectorizer()
    doc_sparse_matrix = vectorizer.fit_transform(as_docs)

    # Compute clusters
    inter = {}
    intra = {}
    n_runs = 20
    k_vals = range(2, 16)

    for i in xrange(n_runs):
        for k in k_vals:
            # Each K has n_runs clusterings
            inter_array = inter.setdefault(k, np.zeros(n_runs))
            intra_array = intra.setdefault(k, np.zeros(n_runs))

            # Run K-Means
            mbkm = MiniBatchKMeans(k, init='random')
            mbkm.fit(doc_sparse_matrix)

            centers = mbkm.cluster_centers_
            labels = mbkm.labels_

            # Inter-cluster distance. We use min because the idea is to
            # maximize this; min serves as a penalty for the worst case.
            dist_centers = pairwise.euclidean_distances(centers)
            min_dist_between_centers = \
                np.min(dist_centers[dist_centers > 0])
            inter_array[i] = min_dist_between_centers

            # Intra-cluster distance
            dist_all_centers = mbkm.transform(doc_sparse_matrix)
            intra_dists = []
            for doc_id, cluster in enumerate(labels):
                dist = dist_all_centers[doc_id, cluster]
                intra_dists.append(dist)
            intra_array[i] = np.mean(intra_dists)

            # Prints num elements per cluster
            print('Run %d ; k = %d' % (i, k))
            counter = Counter(labels)
            for cluster, population in counter.items():
                print('\tK = %d; Pop = %d' % (cluster, population))
            print()

    x = inter.keys()
    y = []
    c = []
    for k in x:
        div = inter[k] / intra[k]
        y.append(np.mean(div))
        c.append(half_confidence_interval_size(div, 0.90))

    # hack so that the zero appears on the plot
    x = [0] + x
    y = [0] + y
    c = [0] + c

    ax = plt.gca()
    ax.set_yscale('log')
    ax.set_xticks(range(0, 16))

    plt.ylabel('InterCluster/IntraCluster Ratio')
    plt.xlabel('Number of clusters')
    plt.errorbar(x, y, yerr=c, fmt='bo', markersize=8, elinewidth=2)
    plt.show()
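half_confidence_interval_size is defined elsewhere in that project; the sketch below is only an assumed implementation, guessing that it returns half the width of a two-sided Student-t confidence interval for the mean at the given confidence level.

import numpy as np
from scipy import stats

def half_confidence_interval_size(values, confidence):
    # Assumed implementation (not the project's original helper):
    # half-width of a two-sided Student-t confidence interval for the mean,
    # i.e. t * s / sqrt(n).
    values = np.asarray(values, dtype=float)
    n = len(values)
    std_err = np.std(values, ddof=1) / np.sqrt(n)
    t = stats.t.ppf(0.5 + confidence / 2.0, n - 1)
    return t * std_err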
if 'MLCOMP_DATASETS_HOME' not in os.environ:
    print "MLCOMP_DATASETS_HOME not set; please follow the above instructions"
    sys.exit(0)

# Load the training set
print "Loading 20 newsgroups training set... "
news_train = load_mlcomp('20news-18828', 'train')
print news_train.DESCR
print "%d documents" % len(news_train.filenames)
print "%d categories" % len(news_train.target_names)

print "Extracting features from the dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer()
X_train = vectorizer.fit_transform((open(f).read()
                                    for f in news_train.filenames))
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_train.shape
assert sp.issparse(X_train)
y_train = news_train.target

print "Loading 20 newsgroups test set... "
news_test = load_mlcomp('20news-18828', 'test')
t0 = time()
print "done in %fs" % (time() - t0)

print "Predicting the labels of the test set..."
print "%d documents" % len(news_test.filenames)
print "%d categories" % len(news_test.target_names)

print "Extracting features from the dataset using the same vectorizer"
iter = ShuffleSplit(num_posts, n_iterations=1, test_fraction=0.15,
                    indices=False)

for (iter_no, (train_index, test_index)) in enumerate(iter):
    print 'Iteration no. %d' % (iter_no + 1)

    y_train = np.array([x for (x, y) in zip(all_data['target'], train_index)
                        if y])
    y_test = np.array([x for (x, y) in zip(all_data['target'], test_index)
                       if y])
    print 'Sampled %d training and %d test posts' % (len(y_train), len(y_test))

    print "Extracting features from the training dataset using a sparse vectorizer"
    t0 = time()
    title_vectorizer = Vectorizer(
        analyzer=WordNGramAnalyzer(
            charset='utf-8',
            stop_words=set(['a', 'an', 'and', 'in', 'is', 'of', 'on',
                            'the', 'to']),
        )
    )
    title_train = title_vectorizer.fit_transform(
        [x for (x, y) in zip(all_data['title'], train_index) if y])

    domain_vectorizer = extract.SimpleVectorizer()
    domain_train = domain_vectorizer.fit_transform(
        [x for (x, y) in zip(all_data['domain'], train_index) if y])

    X_train = title_train
    print "done in %fs" % (time() - t0)
    print "n_samples: %d, n_features: %d" % X_train.shape
    print

    print "Extracting features from the test dataset using the same vectorizer"
    t0 = time()
    title_test = title_vectorizer.transform(
        [x for (x, y) in zip(all_data['title'], test_index) if y])
    domain_test = domain_vectorizer.transform(
        [x for (x, y) in zip(all_data['domain'], test_index) if y])
    X_test = domain_test
    print "done in %fs" % (time() - t0)
    print "n_samples: %d, n_features: %d" % X_test.shape
dataset = fetch_20newsgroups(subset='all', categories=categories,
                             shuffle=True, random_state=42)

print "%d documents" % len(dataset.data)
print "%d categories" % len(dataset.target_names)
print

labels = dataset.target
true_k = np.unique(labels).shape[0]

print "Extracting features from the training dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer(max_df=0.95, max_features=10000)
X = vectorizer.fit_transform(dataset.data)

print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X.shape
print

###############################################################################
# Do the actual clustering

if opts.minibatch:
    km = MiniBatchKMeans(k=true_k, init='k-means++', n_init=1,
                         init_size=1000, batch_size=1000, verbose=1)
              'SafeHarbor', 'Truste', 'Change', 'Location', 'Children',
              'Contact', 'Process', 'Retention']

# Load data
print "Loading privacy policy dataset for categories:"
print categories if categories else "all"
data_set = load_files('Privacypolicy_balance/raw', categories=categories,
                      shuffle=True, random_state=42)
print 'data loaded'
print

# Extract features
print "Extracting features from the training dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer(max_features=10000)
X = vectorizer.fit_transform(data_set.data)
X = Normalizer(norm="l2", copy=False).transform(X)
y = data_set.target

# feature selection
ch2 = SelectKBest(chi2, k=1800)
X = ch2.fit_transform(X, y)

X = X.toarray()

n_samples, n_features = X.shape
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % (n_samples, n_features)
print
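A small, self-contained sketch of what the SelectKBest(chi2, ...) step above does: each feature column is scored against the labels with a chi-squared test and only the k highest-scoring columns are kept (toy counts and k=2 here instead of the TF-IDF matrix and k=1800).

import numpy as np
from sklearn.feature_selection import SelectKBest, chi2

# 4 documents x 3 non-negative features, 2 classes
X_toy = np.array([[5, 0, 1],
                  [4, 1, 0],
                  [0, 5, 1],
                  [1, 4, 0]])
y_toy = np.array([0, 0, 1, 1])

selector = SelectKBest(chi2, k=2)
X_reduced = selector.fit_transform(X_toy, y_toy)
print X_reduced.shape    # (4, 2): only the two most informative columns remain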
data_train = load_files('./Train', categories=train_category,
                        shuffle=True, random_state=42)
data_test = load_files('./Test/Unlabeled', categories=test_category,
                       shuffle=True, random_state=42)
print 'data loaded'
print len(data_train.data)
print len(data_test.data)
print

# Extract features
print "Extracting features from the training dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer(max_features=10000)
X_test = vectorizer.fit_transform(data_test.data)
X_test = Normalizer(norm="l2", copy=False).transform(X_test)
X = vectorizer.transform(data_train.data)
X = Normalizer(norm="l2", copy=False).transform(X)

X = X.toarray()
X_test = X_test.toarray()

n_samples, n_features = X.shape
test_samples, test_features = X_test.shape
print "done in %fs" % (time() - t0)
print "Train set - n_samples: %d, n_features: %d" % (n_samples, n_features)
print "Test set - n_samples: %d, n_features: %d" % (test_samples, test_features)
print