class TF_Transformer(base.BaseEstimator, base.TransformerMixin):
    def __init__(self):
        self.cv_bi = CountVectorizer(min_df=2, max_df=0.7, ngram_range=(1, 2))
        self.tfidf_trans = TfidfTransformer()
        self.SVD_trans = TruncatedSVD(n_components=300)

    # X is a list of Fit_Review named tuples, y is None
    def fit(self, X, y=None):
        texts = [review.text for review in X]
        counts = self.cv_bi.fit_transform(texts)
        counts_tfidf = self.tfidf_trans.fit_transform(counts)
        self.SVD_trans.fit(counts_tfidf)
        return self

    # X is a list of either Fit_Review or Prod_Corpus named tuples
    def transform(self, X):
        texts = [review.text for review in X]
        counts = self.cv_bi.transform(texts)
        counts_tfidf = self.tfidf_trans.transform(counts)
        counts_trunc = self.SVD_trans.transform(counts_tfidf)
        return counts_trunc
def init_model(self, model, n_topics=10, **kwargs):
    if model == 'nmf':
        self.model = NMF(
            n_components=n_topics,
            alpha=kwargs.get('alpha', 0.1),
            l1_ratio=kwargs.get('l1_ratio', 0.5),
            max_iter=kwargs.get('max_iter', 200),
            random_state=kwargs.get('random_state', 1),
            shuffle=kwargs.get('shuffle', False))
    elif model == 'lda':
        self.model = LatentDirichletAllocation(
            n_topics=n_topics,
            max_iter=kwargs.get('max_iter', 10),
            random_state=kwargs.get('random_state', 1),
            learning_method=kwargs.get('learning_method', 'online'),
            learning_offset=kwargs.get('learning_offset', 10.0),
            batch_size=kwargs.get('batch_size', 128),
            n_jobs=kwargs.get('n_jobs', 1))
    elif model == 'lsa':
        self.model = TruncatedSVD(
            n_components=n_topics,
            algorithm=kwargs.get('algorithm', 'randomized'),
            n_iter=kwargs.get('n_iter', 5),
            random_state=kwargs.get('random_state', 1))
    else:
        msg = 'model "{}" invalid; must be {}'.format(model, {'nmf', 'lda', 'lsa'})
        raise ValueError(msg)
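# Not part of the original snippet: a minimal, hedged usage sketch of how an
# estimator built by init_model is typically used -- fit_transform gives
# document-topic weights and components_ gives topic-term weights. The toy
# corpus and variable names are invented; the 'lsa' branch is shown with the
# same defaults as above.
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer

docs = ["the cat sat on the mat",
        "dogs and cats play outside",
        "the dog chased the cat"]
cv = CountVectorizer()
counts = cv.fit_transform(docs)

lsa = TruncatedSVD(n_components=2, algorithm='randomized', n_iter=5, random_state=1)
doc_topic = lsa.fit_transform(counts)   # shape: (n_docs, n_topics)
topic_term = lsa.components_            # shape: (n_topics, n_terms)
print(doc_topic.shape, topic_term.shape)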
def embed_two_dimensions(data, vectorizer, size=10, n_components=5, colormap='YlOrRd'):
    if hasattr(data, '__iter__'):
        iterable = data
    else:
        raise Exception('ERROR: Input must be iterable')
    import itertools
    iterable_1, iterable_2 = itertools.tee(iterable)
    # get labels
    labels = []
    for graph in iterable_2:
        label = graph.graph.get('id', None)
        if label:
            labels.append(label)
    # transform iterable into sparse vectors
    data_matrix = vectorizer.transform(iterable_1)
    # embed high dimensional sparse vectors in 2D
    from sklearn import metrics
    distance_matrix = metrics.pairwise.pairwise_distances(data_matrix)
    from sklearn.manifold import MDS
    feature_map = MDS(n_components=n_components, dissimilarity='precomputed')
    explicit_data_matrix = feature_map.fit_transform(distance_matrix)
    from sklearn.decomposition import TruncatedSVD
    pca = TruncatedSVD(n_components=2)
    low_dimension_data_matrix = pca.fit_transform(explicit_data_matrix)
    plt.figure(figsize=(size, size))
    embed_dat_matrix_two_dimensions(low_dimension_data_matrix,
                                    labels=labels,
                                    density_colormap=colormap)
    plt.show()
def main():
    svd = TruncatedSVD()
    Z = svd.fit_transform(X)
    plt.scatter(Z[:, 0], Z[:, 1])
    for i in xrange(D):
        plt.annotate(s=index_word_map[i], xy=(Z[i, 0], Z[i, 1]))
    plt.show()
def tfIDFeats(ids, data):
    # the infamous tfidf vectorizer (Do you remember this one?)
    tfv = TfidfVectorizer(min_df=3, max_features=None,
                          strip_accents='unicode', analyzer='word',
                          token_pattern=r'\w{1,}', ngram_range=(1, 5),
                          use_idf=1, smooth_idf=1, sublinear_tf=1,
                          stop_words='english')
    # Fit TFIDF
    tfv.fit(data)
    X = tfv.transform(data)
    # Initialize SVD
    svd = TruncatedSVD(n_components=350)
    # Initialize the standard scaler
    scl = StandardScaler(with_mean=False)
    if X.shape[1] > 350:
        X = svd.fit_transform(X)
        X = scl.fit_transform(X, ids)
    if plotData:
        X = PCA(n_components=2).fit_transform(X)
    return (X, ids)
def test_feature_union():
    # basic sanity check for feature union
    iris = load_iris()
    X = iris.data
    X -= X.mean(axis=0)
    y = iris.target
    svd = TruncatedSVD(n_components=2, random_state=0)
    select = SelectKBest(k=1)
    fs = FeatureUnion([("svd", svd), ("select", select)])
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    assert_equal(X_transformed.shape, (X.shape[0], 3))

    # check if it does the expected thing
    assert_array_almost_equal(X_transformed[:, :-1], svd.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())

    # test if it also works for sparse input
    # We use a different svd object to control the random_state stream
    fs = FeatureUnion([("svd", svd), ("select", select)])
    X_sp = sparse.csr_matrix(X)
    X_sp_transformed = fs.fit_transform(X_sp, y)
    assert_array_almost_equal(X_transformed, X_sp_transformed.toarray())

    # test setting parameters
    fs.set_params(select__k=2)
    assert_equal(fs.fit_transform(X, y).shape, (X.shape[0], 4))

    # test it works with transformers missing fit_transform
    fs = FeatureUnion([("mock", TransfT()), ("svd", svd), ("select", select)])
    X_transformed = fs.fit_transform(X, y)
    assert_equal(X_transformed.shape, (X.shape[0], 8))
def train(): with open("../data/f_hashtag_prediction/train_data_tweets_processed_0_to_500K.txt") as ftrain: with open("../data/f_hashtag_prediction/test_data_tweets_processed_2K.txt") as ftest: test_set = ftest.read().splitlines() train_set = ftrain.read().splitlines() # vectorizer = CountVectorizer() vectorizer = TfidfVectorizer(min_df=5, max_df=500, max_features=None, strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1, 4), use_idf=1, smooth_idf=1, sublinear_tf=1, stop_words='english') # vectorizer = TfidfVectorizer() tfidf_matrix = vectorizer.fit_transform(train_set) print tfidf_matrix.shape # print tfidf_matrix # print vectorizer.fixed_vocabulary_ smatrix = vectorizer.transform(test_set) print smatrix.shape joblib.dump(smatrix, "test_tfidf_matrix.o") joblib.dump(tfidf_matrix, "train_tfidf_matrix.o") svd = TruncatedSVD(n_components=500, random_state=42) svd.fit(tfidf_matrix) truncated_train_svd = svd.transform(tfidf_matrix) truncated_test_svd = svd.transform(smatrix) print truncated_train_svd.shape print truncated_test_svd.shape joblib.dump(truncated_train_svd, "truncated_train_svd.o") joblib.dump(truncated_test_svd, "truncated_test_svd.o") print "TEST SET: " test_index = 0
def find_k(self, rank=None, max_clusters=1, vertline=None):
    if rank != None:
        svd = TruncatedSVD(rank)
        self.X = svd.fit_transform(self.X)
        self.X = Normalizer(copy=False).fit_transform(self.X)
    k_range = range(1, max_clusters)
    clusters = [KMeans(n_clusters=k).fit(self.X) for k in k_range]
    centroids = [cluster.cluster_centers_ for cluster in clusters]
    k_cosine = [cdist(self.X, cent, metric='cosine') for cent in centroids]
    dist = [np.min(k_cos, axis=1) for k_cos in k_cosine]
    wcss = [sum(d[np.isnan(d) == False]**2) for d in dist]  # Within cluster sum of squares
    tss = sum(pdist(self.X)**2) / self.X.shape[0]            # Total sum of squares
    bss = tss - wcss                                         # Explained variance

    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(10, 3)
    plt.tight_layout()
    ax1.set_title('BSS')
    ax1.plot(np.arange(1, len(bss) + 1), bss)
    ax1.scatter(np.arange(1, len(bss) + 1), bss)
    ax2.set_title('WCSS')
    ax2.plot(np.arange(1, len(wcss) + 1), wcss)
    ax2.scatter(np.arange(1, len(wcss) + 1), wcss)
    plt.axvline(vertline, c='red', alpha=0.75) if vertline != None else None
    plt.show()
def reduce_dim(sparse_matrix, raw_data, unigrams, n: int, filename_prefix: str):
    """
    Applies truncated SVD to the given sparse matrix and "clusters" each word
    according to the component that "leans" most in its direction, i.e. for
    each word, find which principal component has the maximum absolute value
    in its direction and assign the word to that component. After doing this
    for all words and summing up the counts, the components become
    "super-words".

    :param sparse_matrix: feature matrix to be reduced
    :param unigrams: unigrams that correspond to columns in sparse_matrix.
        These are used to create a mapping file from word to super-word.
    :param n: number of components
    :param filename_prefix: the assignment vector is saved with this prefix
    :return: reduced feature matrix where each column is a new "super-word"
    """
    svd = TruncatedSVD(n_components=n)
    svd.fit(sparse_matrix)
    maximums = np.argmax(np.abs(svd.components_), axis=0)
    unigram_feat_map = dict([(unigrams[i], maximums[i]) for i in range(len(maximums))])
    reduced = get_sparse_occur_matrix(raw_data, unigram_feat_map)[:, 0:n]

    # num_points, _ = sparse_matrix.shape
    # counts = sparse.csc_matrix((num_points, n), dtype=int)
    #
    # for feat_index, target_component in enumerate(maximums):
    #     counts[:, target_component] += sparse_matrix[:, feat_index]
    #
    with open(filename_prefix + ".svdfeatmap", "wb") as svdfeatmap:
        pickle.dump(unigram_feat_map, svdfeatmap)
    return reduced
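# Not part of the original project: a minimal sketch (toy data, invented
# names) of the assignment step used above -- each column/word goes to the
# component whose loading has the largest absolute value for that word.
import numpy as np
from scipy import sparse
from sklearn.decomposition import TruncatedSVD

rng = np.random.RandomState(0)
toy_matrix = sparse.csr_matrix(rng.poisson(1.0, size=(20, 8)).astype(float))
toy_unigrams = ['w{}'.format(j) for j in range(toy_matrix.shape[1])]

svd = TruncatedSVD(n_components=3, random_state=0)
svd.fit(toy_matrix)                                      # components_: (3, 8)
assignment = np.argmax(np.abs(svd.components_), axis=0)  # one component id per word
word_to_super_word = dict(zip(toy_unigrams, assignment))
print(word_to_super_word)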
def train_manual():
    with open("../data/f_hashtag_prediction/train_data_tweets_processed_0_to_500K.txt") as ftrain:
        with open("../data/f_hashtag_prediction/test_data_tagged_processed_manual.txt") as ftest:
            test_set = ftest.read().splitlines()
            train_set = ftrain.read().splitlines()

            # vectorizer = CountVectorizer()
            vectorizer = TfidfVectorizer(min_df=5, max_df=500, max_features=None,
                                         strip_accents='unicode', analyzer='word',
                                         token_pattern=r'\w{1,}', ngram_range=(1, 4),
                                         use_idf=1, smooth_idf=1, sublinear_tf=1,
                                         stop_words='english')
            # vectorizer = TfidfVectorizer()
            tfidf_matrix = vectorizer.fit_transform(train_set)
            print tfidf_matrix.shape
            smatrix = vectorizer.transform(test_set)
            print smatrix.shape

            svd = TruncatedSVD(n_components=500, random_state=42)
            svd.fit(tfidf_matrix)
            truncated_train_svd = svd.transform(tfidf_matrix)
            truncated_test_svd = svd.transform(smatrix)
            print truncated_train_svd.shape
            print truncated_test_svd.shape

            cosine = cosine_similarity(truncated_test_svd[0], truncated_train_svd)
            print cosine
            print "TEST SET: "
def solve(self, X, missing_mask):
    observed_mask = ~missing_mask
    X_filled = X
    for i in range(self.max_iters):
        # deviation from original svdImpute algorithm:
        # gradually increase the rank of our approximation
        if self.gradual_rank_increase:
            curr_rank = min(2 ** i, self.rank)
        else:
            curr_rank = self.rank
        tsvd = TruncatedSVD(curr_rank, algorithm=self.svd_algorithm)
        X_reduced = tsvd.fit_transform(X_filled)
        X_reconstructed = tsvd.inverse_transform(X_reduced)
        X_reconstructed = self.clip(X_reconstructed)
        mae = masked_mae(
            X_true=X,
            X_pred=X_reconstructed,
            mask=observed_mask)
        if self.verbose:
            print(
                "[IterativeSVD] Iter %d: observed MAE=%0.6f" % (i + 1, mae))
        converged = self._converged(
            X_old=X_filled,
            X_new=X_reconstructed,
            missing_mask=missing_mask)
        X_filled[missing_mask] = X_reconstructed[missing_mask]
        if converged:
            break
    return X_filled
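# Not the class above: a self-contained sketch (toy data, fixed rank) of the
# same idea -- repeatedly project the filled-in matrix onto a low-rank SVD
# approximation and copy the reconstruction back into the missing cells.
import numpy as np
from sklearn.decomposition import TruncatedSVD

rng = np.random.RandomState(0)
X_true = rng.rand(50, 8) @ rng.rand(8, 20)              # low-rank ground truth
missing_mask = rng.rand(*X_true.shape) < 0.2
X_filled = X_true.copy()
X_filled[missing_mask] = X_true[~missing_mask].mean()   # crude initial fill

tsvd = TruncatedSVD(n_components=8, random_state=0)
for _ in range(10):
    X_reconstructed = tsvd.inverse_transform(tsvd.fit_transform(X_filled))
    X_filled[missing_mask] = X_reconstructed[missing_mask]

print(np.abs(X_filled - X_true)[missing_mask].mean())   # shrinks toward 0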
def main():
    infile = open(sys.argv[1])
    outfile = sys.argv[2]  # needs to be a string
    vocabfile = open(sys.argv[3])
    vocab = json.load(vocabfile)
    F = sparse.lil_matrix((len(vdict), 4 * len(vdict)), dtype=np.int32)
    corpus_size = 0
    lc = 0
    for line in infile:
        lc += 1
        if lc % 10000 == 0:
            print('processing line ' + str(lc) + ' at ' + str(datetime.datetime.now()))
        words = line.split()
        num_words = len(words)
        corpus_size += num_words
        if num_words < 5:
            process_short_line(num_words, words, F, vocab)
        else:
            F[vocab[words[0]], 4 * vocab[words[1]] + 2] += 1
            F[vocab[words[0]], 4 * vocab[words[2]] + 3] += 1
            F[vocab[words[1]], 4 * vocab[words[0]] + 1] += 1
            F[vocab[words[1]], 4 * vocab[words[2]] + 2] += 1
            F[vocab[words[1]], 4 * vocab[words[3]] + 3] += 1
            F[vocab[words[-2]], 4 * vocab[words[-4]] + 0] += 1
            F[vocab[words[-2]], 4 * vocab[words[-3]] + 1] += 1
            F[vocab[words[-2]], 4 * vocab[words[-1]] + 2] += 1
            F[vocab[words[-1]], 4 * vocab[words[-3]] + 0] += 1
            F[vocab[words[-1]], 4 * vocab[words[-2]] + 1] += 1
            # interior words: start=2 so that i indexes into words and the
            # +/-2 window around each word lines up with the boundary cases above
            for i, word in enumerate(words[2:-2], start=2):
                F[vocab[word], 4 * vocab[words[i - 2]] + 0] += 1
                F[vocab[word], 4 * vocab[words[i - 1]] + 1] += 1
                F[vocab[word], 4 * vocab[words[i + 1]] + 2] += 1
                F[vocab[word], 4 * vocab[words[i + 2]] + 3] += 1

    # compute PMI
    Fc = F.tocoo()
    word_freqs = Fc.sum(1)
    context_freqs = Fc.sum(0)
    word_freqs = word_freqs.A1
    context_freqs = context_freqs.A1
    for i, j, v in zip(Fc.row, Fc.col, Fc.data):
        F[i, j] = max(
            math.log((v * corpus_size) / (word_freqs[i] * context_freqs[j])), 0)

    # compute TruncatedSVD
    svd = TruncatedSVD(n_components=200)
    Fred = svd.fit_transform(F)
    np.savetxt(outfile, Fred, delimiter=',')
    infile.close()
    vocabfile.close()
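# Hedged toy sketch of the same pipeline on an in-memory corpus: count
# word/context co-occurrences, convert to positive PMI, then factorize with
# TruncatedSVD to get dense word vectors. Window handling is simplified here
# to a symmetric +/-1 window rather than the positional buckets used above.
import math
import numpy as np
from scipy import sparse
from sklearn.decomposition import TruncatedSVD

corpus = ["the cat sat on the mat", "the dog sat on the log"]
vocab = {w: i for i, w in enumerate(sorted({w for s in corpus for w in s.split()}))}
V = len(vocab)

F = sparse.lil_matrix((V, V))
for sent in corpus:
    words = sent.split()
    for i, w in enumerate(words):
        for j in (i - 1, i + 1):
            if 0 <= j < len(words):
                F[vocab[w], vocab[words[j]]] += 1

F = F.tocsr()
total = F.sum()
row = np.asarray(F.sum(axis=1)).ravel()
col = np.asarray(F.sum(axis=0)).ravel()
Fc = F.tocoo()
ppmi = sparse.lil_matrix((V, V))
for i, j, v in zip(Fc.row, Fc.col, Fc.data):
    ppmi[i, j] = max(math.log((v * total) / (row[i] * col[j])), 0.0)

vectors = TruncatedSVD(n_components=2, random_state=0).fit_transform(ppmi.tocsr())
print(vectors.shape)  # (V, 2) dense word vectors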
def get_lsa(x, t):
    print "LSA"
    lsa = SVD(n_components=600, algorithm="arpack")
    lsa.fit(x)
    x = lsa.transform(x)
    t = lsa.transform(t)
    return x, t
def cook():
    x, y, weights = load_data()
    n_components = 200
    svd = TruncatedSVD(n_components, random_state=42)
    x_unweighted = svd.fit_transform(x)
    x_weighted = svd.fit_transform(weighted(x, weights))

    for i in range(9):
        frac = 1 - (i * 0.01 + 0.01)
        print frac

        x_train, x_test, y_train, y_test = train_test_split(x_unweighted, y, test_size=frac)
        classifier = AdaBoostClassifier(n_estimators=100)
        classifier.fit(x_train, y_train)
        print "Unweighted: ", classifier.score(x_test, y_test)

        x_train, x_test, y_train, y_test = train_test_split(x_weighted, y, test_size=frac)
        classifier = AdaBoostClassifier(n_estimators=100)
        classifier.fit(x_train, y_train)
        print "Weighted: ", classifier.score(x_test, y_test)
        print '--------------------------'
def benchmark(k, epochs):
    print("*" * 80)
    print("k: %d, epochs: %d\n" % (k, epochs))

    # select = SelectKBest(score_func=chi2, k=k)
    select = TruncatedSVD(n_components=k)
    X_train_trunc = select.fit_transform(X_train, Y_train)
    X_test_trunc = select.transform(X_test)
    print('done truncating')

    parameters = {'C': [1, 10, 100, 1000, 10000],
                  'class_weight': ['auto', None],
                  'tol': [0.001, 0.0001]}
    clf = LinearSVC(C=100000)
    # clf = grid_search.GridSearchCV(svc, parameters)
    clf.fit(X_train_trunc, Y_train)
    pred = clf.predict(X_test_trunc)

    if CREATE_SUBMISSION:
        X_submit_trunc = select.transform(X_submit)
        pred_submit = clf.predict(X_submit_trunc)
        dump_csv(pred_submit, k, epochs)

    score = metrics.f1_score(Y_test, pred)
    print("f1-score: %0.3f" % score)
    print("classification report:")
    print(metrics.classification_report(Y_test, pred))
    print("confusion matrix:")
    print(metrics.confusion_matrix(Y_test, pred))
def truncatedSVD(data, labels, new_dimension):
    print "start truncatedSVD..."
    start = time.time()
    pca = TruncatedSVD(n_components=new_dimension)
    reduced = pca.fit_transform(data)
    end = time.time()
    return (reduced, end - start)
def preprocess(data, n_components, use_tf_idf=True):
    """
    Preprocess the data for clustering by running SVD and normalizing the
    results. This process is also known as LSA.

    arguments:
    data -- Dataset; if use_tf_idf is True the object must contain a tf_idf
            table alongside a raw frequencies dataframe.
    n_components -- int, the number of components to use for the SVD;
            a minimum of 100 is recommended.
    use_tf_idf -- bool, whether to use the tf-idf frequencies for the
            preprocessing.

    returns:
    e -- float, a measure of variance explained by the SVD.
    X -- np.array, an array with the data reduced to n_components.
    """
    if use_tf_idf:
        d = data.tf_idf.as_matrix()
    else:
        d = data.df.as_matrix()
    svd = TruncatedSVD(n_components=n_components)
    X = svd.fit_transform(d)
    norm = Normalizer()
    # Record a measure of explained variance
    e = svd.explained_variance_ratio_.sum() * 100
    # Normalize the SVD output (not the raw matrix), matching the docstring
    return e, norm.fit_transform(X)
def benchmark(k, epochs):
    print("*" * 80)
    print("k: %d, epochs: %d\n" % (k, epochs))

    # select = SelectKBest(score_func=chi2, k=k)
    select = TruncatedSVD(n_components=k)
    X_train_trunc = select.fit_transform(X_train, Y_train)
    X_test_trunc = select.transform(X_test)
    print('done truncating')

    clf = DBN([X_train_trunc.shape[1], k, 4],
              learn_rates=0.3,
              learn_rate_decays=0.9,
              epochs=epochs,
              verbose=1)
    clf.fit(X_train_trunc, Y_train)
    pred = clf.predict(X_test_trunc)

    if CREATE_SUBMISSION:
        X_submit_trunc = select.transform(X_submit)
        pred_submit = clf.predict(X_submit_trunc)
        dump_csv(pred_submit, k, epochs)

    score = metrics.f1_score(Y_test, pred)
    print("f1-score: %0.3f" % score)
    print("classification report:")
    print(metrics.classification_report(Y_test, pred))
    print("confusion matrix:")
    print(metrics.confusion_matrix(Y_test, pred))
def cluster_DBSCAN(args):
    """
    Clustering with DBSCAN: groups together densely packed instances and
    marks low-density instances as noise.
    """
    # load data
    g_it = node_link_data.node_link_data_to_eden(input=args.input_file, input_type="file")
    vec = graph.Vectorizer(r=args.radius, d=args.distance, nbits=args.nbits)
    logger.info('Vectorizer: %s' % vec)

    X = vec.transform(g_it, n_jobs=args.n_jobs)
    logger.info('Instances: %d Features: %d with an avg of %d features per instance' %
                (X.shape[0], X.shape[1], X.getnnz() / X.shape[0]))

    # project to a lower dimensional space to use clustering algorithms
    transformer = TruncatedSVD(n_components=args.n_components)
    X_dense = transformer.fit_transform(X)

    # log statistics on data
    logger.info('Dimensionality reduction Instances: %d Features: %d with an avg of %d features per instance' %
                (X_dense.shape[0], X_dense.shape[1], X.getnnz() / X.shape[0]))

    # clustering
    clustering_algo = DBSCAN(eps=args.eps)
    y = clustering_algo.fit_predict(X_dense)
    msg = 'Predictions statistics: '
    msg += util.report_base_statistics(y)
    logger.info(msg)

    # save model for vectorizer
    out_file_name = "vectorizer"
    eden_io.dump(vec, output_dir_path=args.output_dir_path, out_file_name=out_file_name)
    logger.info("Written file: %s/%s", args.output_dir_path, out_file_name)

    # save result
    out_file_name = "labels"
    eden_io.store_matrix(matrix=y, output_dir_path=args.output_dir_path,
                         out_file_name=out_file_name, output_format="text")
    logger.info("Written file: %s/%s", args.output_dir_path, out_file_name)
def train_pca_svm(learning_data, pca_dims, probability=True, cache_size=3000, **svm_kwargs):
    (X_train, y_train, train_ids), (X_test, y_test, test_ids) = learning_data
    pca = TruncatedSVD(n_components=pca_dims)
    n_symbols = max(np.max(X_train) + 1, np.max(X_test) + 1)

    logger.info("Forming CSR Matrices")
    x_train, x_test = create_csr_matrix(X_train, n_symbols), create_csr_matrix(X_test, n_symbols)

    logger.info("Starting PCA")
    # pseudo-supervised PCA: fit on positive class only
    pca = pca.fit(x_train[y_train > 0])
    x_train_pca = pca.transform(x_train)
    x_test_pca = pca.transform(x_test)

    logger.info("Starting SVM")
    svc = SVC(probability=probability, cache_size=cache_size, **svm_kwargs)
    svc.fit(x_train_pca, y_train)

    logger.info("Scoring SVM")
    score = svc.score(x_test_pca, y_test)
    logger.info(score)
    svc.test_score = score
    pca.n_symbols = n_symbols
    return svc, pca, x_train_pca, x_test_pca
def test_inverse_transform(algo):
    # We need a lot of components for the reconstruction to be "almost
    # equal" in all positions. XXX Test means or sums instead?
    tsvd = TruncatedSVD(n_components=52, random_state=42, algorithm=algo)
    Xt = tsvd.fit_transform(X)
    Xinv = tsvd.inverse_transform(Xt)
    assert_array_almost_equal(Xinv, Xdense, decimal=1)
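# Standalone sketch (random low-rank data, invented here) of what the test
# asserts: with enough components, inverse_transform recovers the original
# matrix almost exactly.
import numpy as np
from sklearn.decomposition import TruncatedSVD

rng = np.random.RandomState(42)
X_lowrank = rng.rand(60, 10) @ rng.rand(10, 55)   # rank <= 10

tsvd = TruncatedSVD(n_components=10, random_state=42)
X_back = tsvd.inverse_transform(tsvd.fit_transform(X_lowrank))
print(np.max(np.abs(X_back - X_lowrank)))         # tiny reconstruction error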
def lsa(BP, lentrain, n_components=16, preproc=True, fit_area='test', min_df=3):
    """
    aka Latent semantic analysis
    """
    if preproc:
        print "pre-processing data"
        traindata = []
        for observation in BP:
            traindata.append(preprocess_pipeline(observation, "english",
                                                 "WordNetLemmatizer", True, True, False))
        BP = traindata

    print "fitting TfidfVectorizer"
    tfv = TfidfVectorizer(min_df=min_df, max_features=None,
                          strip_accents='unicode', analyzer='word',
                          token_pattern=r'\w{1,}', ngram_range=(1, 2),
                          use_idf=1, smooth_idf=1, sublinear_tf=1, norm='l2')
    if fit_area == 'test':
        tfv.fit(BP[lentrain:])
    elif fit_area == 'train':
        tfv.fit(BP[:lentrain])
    else:
        tfv.fit(BP)

    print "transforming data"
    BP = tfv.transform(BP)
    print "BP(post):", BP.shape

    if 1:  # svd here
        print "use svd"
        svd = TruncatedSVD(n_components=n_components, random_state=1)
        BP = svd.fit_transform(BP)
    return BP
def kfold(agetext, k, model, k2):
    import collections
    out = []
    for i in range(k):
        print "iteration: " + str(i)
        agetext = shuffle(agetext)
        datatb = agetext.iloc[:, 1:]
        label = agetext["agegroup"].tolist()
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(
            datatb, label, test_size=0.15, random_state=i * 6)
        data = X_train.values
        counter = collections.Counter(y_train)
        print counter
        testdata = X_test.values

        lsa = TruncatedSVD(k2, algorithm='arpack')
        normalizer = Normalizer(copy=False)
        X = lsa.fit_transform(data)
        X = normalizer.fit_transform(X)
        X_test = lsa.transform(testdata)
        X_test = normalizer.transform(X_test)

        model.fit(X, y_train)
        pred = model.predict(X_test)
        counter = collections.Counter(y_test)
        print counter
        counter = collections.Counter(pred)
        print counter
        out.append(round(accuracy_score(y_test, pred), 5))
    print str(out)
    print np.mean(out)
def lsa_summarizer(text, num_sen=5):
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    sentenceTokens = sent_detector.tokenize(text.strip())

    tfvectorizer = TfidfVectorizer(tokenizer=tokenizeText)
    sparse = tfvectorizer.fit_transform(sentenceTokens).A

    lsa = TruncatedSVD(n_components=1)
    concept = lsa.fit_transform(sparse)

    pos = np.array(list(range(len(sentenceTokens))))
    listlist = [list(x) for x in zip(sentenceTokens, concept, pos)]
    listlist.sort(key=lambda x: x[1], reverse=True)
    summarysentences = listlist[0:num_sen]
    summarysentences.sort(key=lambda x: x[2], reverse=False)

    summary = ""
    for n in range(num_sen):
        summary += ' ' + summarysentences[n][0]
    summary = " ".join(summary.replace(u"\xa0", u" ").strip().split())
    return summary
def SVD_CV(counts, scores, n_comp=range(10, 611, 100)):
    n_avg = 16
    avg_err = []
    for n in range(0, n_avg):
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(
            counts, scores, test_size=0.2, random_state=n)
        test_err = []
        # use a separate loop variable so the outer split index n is not shadowed
        for n_c in n_comp:
            TruncTrans = TruncatedSVD(n_components=n_c)
            X_trunc_train = TruncTrans.fit_transform(X_train, scores)
            regr = linear_model(X_trunc_train, y_train)
            X_trunc_test = TruncTrans.transform(X_test)
            y_pred = regr.predict(X_trunc_test) * 10**(-12) + 3
            test_err.append(metrics.mean_squared_error(y_test, y_pred))
        if not avg_err:
            avg_err = test_err
        else:
            avg_err = [avg_err[i] + (test_err[i] * (1.0 / n_avg))
                       for i in range(0, len(test_err))]

    plt.plot(n_comp, avg_err, label='Out-of-Sample Error')
    plt.xlabel('n components')
    plt.ylabel('MSE')
    plt.show()
def fit_document_matrix(self, X):
    """
    Reduce the dimension of sparse matrix X using Latent Semantic Analysis
    and build a nearest-neighbor model.

    Parameters
    ----------
    X: sparse csr matrix, sparse term frequency matrix or other
        weighting matrix from documents
    """
    n_components = self.n_components
    n_iter = self.n_iter
    algorithm = self.algorithm
    lsa_model = TruncatedSVD(n_components=n_components,
                             n_iter=n_iter,
                             algorithm=algorithm)
    # reduce dimension using Latent Semantic Analysis
    vectors = lsa_model.fit_transform(X)
    self.vectors = vectors

    # build nearest neighbor model
    nbrs_model = build_nearest_neighbors(vectors, n_recommend=self.n_recommend)
    self.nbrs_model = nbrs_model
    return self
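# Minimal sketch of the same pattern with plain scikit-learn parts
# (TfidfVectorizer, TruncatedSVD, NearestNeighbors); build_nearest_neighbors
# above is project-specific, so NearestNeighbors stands in for it and the
# toy documents are invented.
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

docs = ["topic models reduce dimensions",
        "nearest neighbors recommend documents",
        "svd reduces sparse tfidf matrices",
        "recommendations from document similarity"]
X = TfidfVectorizer().fit_transform(docs)

vectors = TruncatedSVD(n_components=2, random_state=0).fit_transform(X)
nbrs = NearestNeighbors(n_neighbors=2).fit(vectors)
distances, indices = nbrs.kneighbors(vectors[:1])
print(indices)  # the document itself plus its closest neighbor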
def compute_svd(Xs):
    # compute 1st principal component
    svd = TruncatedSVD(n_components=1, n_iter=20, random_state=0)
    svd.fit(Xs)
    pc = svd.components_
    print(pc.shape, svd.explained_variance_ratio_)
    return pc
def buildKB16(n_comp=200, seed_value=123):
    ## data
    # read the training/test data
    print('Importing Data')
    xtrain = pd.read_csv('../input/xtrain_kb6099.csv')
    xtest = pd.read_csv('../input/xtest_kb6099.csv')

    # separate
    id_train = xtrain.ID
    xtrain.drop('ID', axis=1, inplace=True)
    ytrain = xtrain.target
    xtrain.drop('target', axis=1, inplace=True)
    id_test = xtest.ID
    xtest.drop('ID', axis=1, inplace=True)

    # fit SVD
    svd = TruncatedSVD(n_components=n_comp, n_iter=5, random_state=seed_value)
    svd.fit(xtrain)
    xtrain = svd.transform(xtrain)
    xtest = svd.transform(xtest)

    xtrain = pd.DataFrame(xtrain)
    xtest = pd.DataFrame(xtest)

    ## store the results
    # add indices etc
    xtrain = pd.DataFrame(xtrain)
    xtrain['ID'] = id_train
    xtrain['target'] = ytrain

    xtest = pd.DataFrame(xtest)
    xtest['ID'] = id_test

    # save the files
    xtrain.to_csv('../input/xtrain_kb16c' + str(n_comp) + '.csv', index=False, header=True)
    xtest.to_csv('../input/xtest_kb16c' + str(n_comp) + '.csv', index=False, header=True)

    return
def basic_lsi(df, n_components=200, max_df=0.5, min_df=5):
    '''
    Basic LSI model for album recommendations

    Args:
        df: dataframe with Pitchfork reviews
        n_components: number of lsi dimensions
        max_df: max_df in TfidfVectorizer
        min_df: min_df in TfidfVectorizer
    Returns:
        tfidf: sklearn fitted TfidfVectorizer
        tfidf_trans: sparse matrix with tfidf transformed data
        svd: sklearn fitted TruncatedSVD
        svd_trans: dense array with lsi transformed data
    '''
    X = df['review']
    stopwords = nltk.corpus.stopwords.words('english')
    tfidf = TfidfVectorizer(stop_words=stopwords, max_df=max_df, min_df=min_df)
    tfidf_trans = tfidf.fit_transform(X)
    svd = TruncatedSVD(n_components=n_components)
    svd_trans = svd.fit_transform(tfidf_trans)
    return tfidf, tfidf_trans, svd, svd_trans
def test_sparse_formats(fmt):
    Xfmt = Xdense if fmt == "dense" else getattr(X, "to" + fmt)()
    tsvd = TruncatedSVD(n_components=11)
    Xtrans = tsvd.fit_transform(Xfmt)
    assert_equal(Xtrans.shape, (n_samples, 11))
    Xtrans = tsvd.transform(Xfmt)
    assert_equal(Xtrans.shape, (n_samples, 11))
def train(n_components, demean, n_samples): print("Loading data...") movie_titles, ratings, rating_indices, n_users, n_items = get_netflix_data( n_samples=n_samples) print("number of users with ratings: {}".format( len(np.unique(rating_indices[:, 0])))) print("number of movies with ratings: {}".format( len(np.unique(rating_indices[:, 1])))) n_splits = 5 kf = KFold(n_splits=n_splits, shuffle=True) kf.get_n_splits(rating_indices) if not n_components: components = [5, 10, 15, 20, 30, 50] components_loss_path = np.zeros((len(components), n_splits)) print("Finding optimal number of components...") for n, n_components in enumerate(components): print("n_components: {}".format(n_components)) for k, (train_index, test_index) in enumerate(kf.split(rating_indices)): mean = None print("Fold {}".format(k)) test_indices = rating_indices[test_index] test_indices = test_indices[:, 0], test_indices[:, 1], test_indices[:, 2] if demean: print("De-mean training data...") train_indices = rating_indices[train_index] mean = np.mean(train_indices[:, 2]) train_indices = train_indices[:, 0], train_indices[:, 1], train_indices[:, 2] - mean data_train = scipy.sparse.csr_matrix( (train_indices[2], (train_indices[0], train_indices[1])), shape=(n_users, n_items)) else: user_test_indices, item_test_indices = test_indices[ 0], test_indices[1] data_train = scipy.sparse.lil_matrix(ratings) data_train[user_test_indices, item_test_indices] = 0 data_train = scipy.sparse.csr_matrix(ratings) print("Finished de-meaning.") start = time.time() print("Fitting...") svd = TruncatedSVD(n_components=n_components) P = svd.fit_transform(data_train) Q = svd.components_ acc, loss = evaluate(P, Q, test_indices, mean=mean) print("Elapsed time: {:.1f}s".format(time.time() - start)) print("loss: {:.4f} - acc: {:.4f}".format(loss, acc)) components_loss_path[n, k] = loss mean_loss = np.mean(components_loss_path, axis=1) best_k = components[np.argmin(mean_loss)] best_loss = np.amin(mean_loss) print("best k: {}, best loss: {:.4f}".format(best_k, best_loss)) else: print("Performing cross validation...") mean_acc = 0.0 mean_loss = 0.0 for k, (train_index, test_index) in enumerate(kf.split(rating_indices)): mean = None print("Fold {}".format(k)) test_indices = rating_indices[test_index] test_indices = test_indices[:, 0], test_indices[:, 1], test_indices[:, 2] if demean: print("De-mean training data...") train_indices = rating_indices[train_index] mean = np.mean(train_indices[:, 2]) train_indices = train_indices[:, 0], train_indices[:, 1], train_indices[:, 2] - mean data_train = scipy.sparse.csr_matrix( (train_indices[2], (train_indices[0], train_indices[1])), shape=(n_users, n_items)) print("Finished de-meaning.") else: user_test_indices, item_test_indices = test_indices[ 0], test_indices[1] data_train = scipy.sparse.lil_matrix(ratings) data_train[user_test_indices, item_test_indices] = 0 data_train = scipy.sparse.csr_matrix(ratings) start = time.time() print("fitting...") svd = TruncatedSVD(n_components=n_components) P = svd.fit_transform(data_train) Q = svd.components_ acc, loss = evaluate(P, Q, test_indices, mean=mean) print("Elapsed time: {:.4f}".format(time.time() - start)) print("loss: {:.4f} - acc: {:.4f}".format(loss, acc)) mean_acc = (mean_acc * k + acc) / (k + 1) mean_loss = (mean_loss * k + loss) / (k + 1) print("mean loss: {:.4f} - mean acc: {:.4f}".format( mean_loss, mean_acc))
if __name__ == "__main__": print 'loading x_tr...' t0 = time.time() x_tr = load_csr_matrix_from_npz('../data/processed/tf_idf_transformation/train/matrix.npz') print 'loading finished, time = {0}'.format(time.time()-t0) print 'loading y_tr...' t0 = time.time() y_tr = numpy.loadtxt('../data/processed/tf_idf_transformation/train/labels.csv', dtype='int') print 'loading finished, time = {0}'.format(time.time()-t0) print 'running TruncatedSVD...' t0 = time.time() from sklearn.decomposition import TruncatedSVD svd = TruncatedSVD(n_components=100) x_tr_new = svd.fit_transform(x_tr, y_tr) print 'running TruncatedSVD finished, x_new.shape = {0}, time = {1}'.format(x_tr_new.shape, time.time()-t0) #delete x_tr del x_tr print 'fitting model...' t0 = time.time() from sklearn.multiclass import OneVsRestClassifier from sklearn.linear_model import LogisticRegression clf = OneVsRestClassifier(LogisticRegression()) clf.fit(x_tr_new, y_tr) print 'fitting finished, time = {0}'.format(time.time()-t0) #delete x_tr_new, y_tr
# Load train data
X_origin = pd.read_csv("train_isot.csv", ",")
Y = X_origin['label'].values
X_origin = X_origin['text'].values
print("Train set read.")

stopwords = set(ENGLISH_STOP_WORDS)
svm_vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.73, stop_words=stopwords)
X = svm_vectorizer.fit_transform(X_origin)
print("Vectorized.")

svd = TruncatedSVD(n_components=200, algorithm='arpack', random_state=42)
print("SVD prepared.")
X = svd.fit_transform(X)
print("SVD finished.")

# tprs = []
# aucs = []
# mean_fpr = np.linspace(0, 1, 100)
# fig, ax = plt.subplots()
score_f = 0
score_a = 0
kf = KFold(n_splits=5, random_state=42, shuffle=True)
for i, (train, test) in enumerate(kf.split(X)):
    X_train = X[train]
print "generate %s feat" % feat_name # tfidf tfv = getTFV(ngram_range=ngram_range) X_tfidf_train = tfv.fit_transform( dfTrain.iloc[trainInd][column_name]) X_tfidf_valid = tfv.transform( dfTrain.iloc[validInd][column_name]) with open("%s/train.%s.feat.pkl" % (path, feat_name), "wb") as f: cPickle.dump(X_tfidf_train, f, -1) with open("%s/valid.%s.feat.pkl" % (path, feat_name), "wb") as f: cPickle.dump(X_tfidf_valid, f, -1) # svd svd = TruncatedSVD(n_components=svd_n_components, n_iter=15) X_svd_train = svd.fit_transform(X_tfidf_train) X_svd_test = svd.transform(X_tfidf_valid) with open( "%s/train.%s_individual_svd%d.feat.pkl" % (path, feat_name, svd_n_components), "wb") as f: cPickle.dump(X_svd_train, f, -1) with open( "%s/valid.%s_individual_svd%d.feat.pkl" % (path, feat_name, svd_n_components), "wb") as f: cPickle.dump(X_svd_test, f, -1) print("Done.") # Re-training print("For training and testing...")
def gen_plan_feas(data): n = data.shape[0] mode_list_feas = np.zeros((n, 12)) max_dist, min_dist, mean_dist, std_dist = np.zeros((n, )), np.zeros( (n, )), np.zeros((n, )), np.zeros((n, )) max_price, min_price, mean_price, std_price = np.zeros((n, )), np.zeros( (n, )), np.zeros((n, )), np.zeros((n, )) max_eta, min_eta, mean_eta, std_eta = np.zeros((n, )), np.zeros( (n, )), np.zeros((n, )), np.zeros((n, )) min_dist_mode, max_dist_mode, min_price_mode, max_price_mode, min_eta_mode, max_eta_mode, first_mode = np.zeros( (n, )), np.zeros((n, )), np.zeros((n, )), np.zeros((n, )), np.zeros( (n, )), np.zeros((n, )), np.zeros((n, )) mode_texts = [] for i, plan in tqdm(enumerate(data['plans'].values)): try: cur_plan_list = json.loads(plan) except: cur_plan_list = [] if len(cur_plan_list) == 0: mode_list_feas[i, 0] = 1 first_mode[i] = 0 max_dist[i] = -1 min_dist[i] = -1 mean_dist[i] = -1 std_dist[i] = -1 max_price[i] = -1 min_price[i] = -1 mean_price[i] = -1 std_price[i] = -1 max_eta[i] = -1 min_eta[i] = -1 mean_eta[i] = -1 std_eta[i] = -1 min_dist_mode[i] = -1 max_dist_mode[i] = -1 min_price_mode[i] = -1 max_price_mode[i] = -1 min_eta_mode[i] = -1 max_eta_mode[i] = -1 mode_texts.append('word_null') else: distance_list = [] price_list = [] eta_list = [] mode_list = [] for tmp_dit in cur_plan_list: distance_list.append(int(tmp_dit['distance'])) if tmp_dit['price'] == '': price_list.append(0) else: price_list.append(int(tmp_dit['price'])) eta_list.append(int(tmp_dit['eta'])) mode_list.append(int(tmp_dit['transport_mode'])) mode_texts.append(' '.join( ['word_{}'.format(mode) for mode in mode_list])) distance_list = np.array(distance_list) price_list = np.array(price_list) eta_list = np.array(eta_list) mode_list = np.array(mode_list, dtype='int') mode_list_feas[i, mode_list] = 1 distance_sort_idx = np.argsort(distance_list) price_sort_idx = np.argsort(price_list) eta_sort_idx = np.argsort(eta_list) max_dist[i] = distance_list[distance_sort_idx[-1]] min_dist[i] = distance_list[distance_sort_idx[0]] mean_dist[i] = np.mean(distance_list) std_dist[i] = np.std(distance_list) max_price[i] = price_list[price_sort_idx[-1]] min_price[i] = price_list[price_sort_idx[0]] mean_price[i] = np.mean(price_list) std_price[i] = np.std(price_list) max_eta[i] = eta_list[eta_sort_idx[-1]] min_eta[i] = eta_list[eta_sort_idx[0]] mean_eta[i] = np.mean(eta_list) std_eta[i] = np.std(eta_list) first_mode[i] = mode_list[0] max_dist_mode[i] = mode_list[distance_sort_idx[-1]] min_dist_mode[i] = mode_list[distance_sort_idx[0]] max_price_mode[i] = mode_list[price_sort_idx[-1]] min_price_mode[i] = mode_list[price_sort_idx[0]] max_eta_mode[i] = mode_list[eta_sort_idx[-1]] min_eta_mode[i] = mode_list[eta_sort_idx[0]] feature_data = pd.DataFrame(mode_list_feas) feature_data.columns = ['mode_feas_{}'.format(i) for i in range(12)] feature_data['max_dist'] = max_dist feature_data['min_dist'] = min_dist feature_data['mean_dist'] = mean_dist feature_data['std_dist'] = std_dist feature_data['max_price'] = max_price feature_data['min_price'] = min_price feature_data['mean_price'] = mean_price feature_data['std_price'] = std_price feature_data['max_eta'] = max_eta feature_data['min_eta'] = min_eta feature_data['mean_eta'] = mean_eta feature_data['std_eta'] = std_eta feature_data['max_dist_mode'] = max_dist_mode feature_data['min_dist_mode'] = min_dist_mode feature_data['max_price_mode'] = max_price_mode feature_data['min_price_mode'] = min_price_mode feature_data['max_eta_mode'] = max_eta_mode feature_data['min_eta_mode'] = min_eta_mode 
    feature_data['first_mode'] = first_mode

    print('mode tfidf...')
    tfidf_enc = TfidfVectorizer(ngram_range=(1, 2))
    tfidf_vec = tfidf_enc.fit_transform(mode_texts)
    svd_enc = TruncatedSVD(n_components=10, n_iter=20, random_state=2019)
    mode_svd = svd_enc.fit_transform(tfidf_vec)
    mode_svd = pd.DataFrame(mode_svd)
    mode_svd.columns = ['svd_mode_{}'.format(i) for i in range(10)]

    data = pd.concat([data, feature_data, mode_svd], axis=1)
    data = data.drop(['plans'], axis=1)
    return data
def hac(self, n_clusters, verbose=False, tfidf=None, n_dimensions=None): """ Apply Hierarchical Agglomerative Clustering on a document collection. This method generates a hierarchical clustering tree for the collection. The leaves of the tree are clusters consisting of single documents. The tree is then saved by saving the list of merges in a file. Each entry of this list contains the two tree nodes that were merged to create a new node and the new node's id. Node ids less than the number of leaves represent leaves, while node ids greater than the number of leaves indicate internal nodes. Args: self.corpus (:obj:'Corpus'): The Corpus object of the document collection. Defaults to None. Only used when no pre-computed Tf/Idf matrix is given. tfidf_path (str): The path to the file containing the Tf/Idf matrix .pkl file. Defaults to None and in this case the Tf/Idf matrix is calculated. verbose (bool): When True additional information will be printed. Defaults to False. Returns: hac_model (:obj:'AgglomerativeClustering'): The HAC model fitted on the document collection. """ # Compute or load Tf/Idf matrix. if tfidf is None: tfidf = self.extract_tfidf(self.corpus) print(tfidf.shape) # Apply latent semantic analysis. if n_dimensions is not None: print('Performing latent semantic analysis') svd = TruncatedSVD(n_dimensions) # Normalize SVD results for better clustering results. lsa = make_pipeline(svd, Normalizer(copy=False)) tfidf = lsa.fit_transform(tfidf) print(tfidf.shape) # Calculate documente distance matrix from Tf/Idf matrix print('Constructing distance matrix...') dist = 1 - cosine_similarity(tfidf) start_time = time.time() print('Clustering...') # Generate HAC model. hac_model = AgglomerativeClustering(linkage='ward', n_clusters=n_clusters) # Fit the model on the distance matrix. hac_model.fit(dist) end_time = time.time() pickle.dump(hac_model, open('hac.pkl', 'wb')) if verbose: # Visualize cluster model children = hac_model.children_ merges = [{ 'node_id': node_id + len(dist), 'right': children[node_id, 0], 'left': children[node_id, 1] } for node_id in range(0, len(children))] pickle.dump(merges, open('merges.pkl', 'wb')) pickle.dump(children, open('children.pkl', 'wb')) for merge_entry in enumerate(merges): print(merge_entry[1]) print('Clustering completed after ' + str(round((end_time - start_time) / 60)) + "' " + str(round((end_time - start_time) % 60)) + "''") return hac_model
def kmeans(self, n_clusters, tfidf=None, n_dimensions=None, verbose=False): """ Applies kmeans clustering on a document collection. Args: self.corpus (:obj:'Corpus'): The Corpus object of the document collection. Defaults to None. Only used when no pre-computed Tf/Idf matrix is given. tfidf_path (str): The path to the file containing the Tf/Idf matrix .pkl file. Defaults to None and in this case the Tf/Idf matrix is calculated. verbose (bool): When True additional information will be printed. Defaults to False. Returns: kmodel (:obj:'Kmeans'): Scikit KMeans clustering model. """ # Compute or load Tf/Idf matrix. if tfidf is None: tfidf = self.extract_tfidf(self.corpus) print(tfidf.shape) # Apply latent semantic analysis. if n_dimensions is not None: print('Performing latent semantic analysis...') svd = TruncatedSVD(n_dimensions) # Normalize SVD results for better clustering results. lsa = make_pipeline(svd, Normalizer(copy=False)) tfidf = lsa.fit_transform(tfidf) print(tfidf.shape) # Do the clustering. start_time = time.time() print('Clustering...') kmodel = MiniBatchKMeans(n_clusters=n_clusters, init='k-means++', n_init=1, max_iter=10, verbose=True) kmodel.fit(tfidf) end_time = time.time() # Create a matching of the clusters and the ids of the documents # they contain. cluster_doc = pd.Series() for i in range(kmodel.n_clusters): ids = [] for docid, cluster in enumerate(kmodel.labels_): if cluster == i: ids.append(docid) cluster_doc.loc[i] = ids if verbose: # Print some info. print("Top terms per cluster:") if n_dimensions is not None: original_space_centroids = svd.inverse_transform( kmodel.cluster_centers_) order_centroids = original_space_centroids.argsort()[:, ::-1] else: order_centroids = kmodel.cluster_centers_.argsort()[:, ::-1] features = pickle.load(open('features.pkl', 'rb')) cluster_word = pd.Series() for i in range(n_clusters): cluster_features = [] print("Cluster %d:" % i) for ind in order_centroids[i, :100]: cluster_features.append(features[ind]) cluster_word.loc[i] = cluster_features pickle.dump(kmodel, open('kmodel.pkl', 'wb')) pickle.dump(kmodel.cluster_centers_, open('centers.pkl', 'wb')) pickle.dump(cluster_doc, open('cluster_doc.pkl', 'wb')) pickle.dump(cluster_word, open('cluster_word.pkl', 'wb')) print('Clustering completed after ' + str(round((end_time - start_time) / 60)) + "' " + str(round((end_time - start_time) % 60)) + "''") return kmodel
]

dataset = fetch_20newsgroups(subset='all', categories=categories,
                             shuffle=True, random_state=42)
labels_true = dataset.target
true_k = np.unique(labels_true).shape[0]

# t0 = time()
vectorizer = TfidfVectorizer(max_df=0.5, max_features=10000,
                             min_df=2, stop_words='english', use_idf=True)
X = vectorizer.fit_transform(dataset.data)

svd = TruncatedSVD(2)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
X = lsa.fit_transform(X)
# explained_variance = svd.explained_variance_ratio_.sum()

Wardhierarchial = AgglomerativeClustering(affinity='euclidean',
                                          compute_full_tree='auto',
                                          connectivity=None, linkage='ward',
                                          memory=None, n_clusters=2,
                                          pooling_func='deprecated').fit(X)
labels = Wardhierarchial.labels_

print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
    d2 = pdt_ttl_vec[i, :]
    dst_srch_ttl1[i] = cosine_similarity(d1, d2)

dst_srch_desc1 = np.zeros(srch_vec.shape[0])
for i in range(srch_vec.shape[0]):
    d1 = srch_vec[i, :]
    d2 = pdt_desc_vec[i, :]
    dst_srch_desc1[i] = cosine_similarity(d1, d2)

dst_ttl_desc1 = np.zeros(srch_vec.shape[0])
for i in range(srch_vec.shape[0]):
    d1 = pdt_ttl_vec[i, :]
    d2 = pdt_desc_vec[i, :]
    # store title/description similarity in its own array
    # (the original overwrote dst_srch_desc1 here)
    dst_ttl_desc1[i] = cosine_similarity(d1, d2)

svd = TruncatedSVD(n_components=30, random_state=2016)
srch_vec = svd.fit_transform(srch_vec)
pdt_ttl_vec = svd.fit_transform(pdt_ttl_vec)
pdt_desc_vec = svd.fit_transform(pdt_desc_vec)

srch_vec = pd.DataFrame(
    srch_vec,
    columns=['srch_vec_' + str(i) for i in range(srch_vec.shape[1])])
pdt_ttl_vec = pd.DataFrame(
    pdt_ttl_vec,
    columns=['ttl_vec_' + str(i) for i in range(pdt_ttl_vec.shape[1])])
pdt_desc_vec = pd.DataFrame(
    pdt_desc_vec,
    columns=['desc_vec_' + str(i) for i in range(pdt_desc_vec.shape[1])])

id = list(df_all['id'])
transformer_list=[
    # Pipeline for pulling features from the post's title line
    ('title', Pipeline([
        ('selector', ItemSelector(key='title')),
        ('tfidf', TfidfVectorizer(min_df=50, stop_words='english')),
    ])),

    # Pipeline for standard bag-of-words model for abstract
    ('abstract_bow', Pipeline([
        ('selector', ItemSelector(key='abstract')),
        ('tfidf', TfidfVectorizer(stop_words='english')),
        ('best', TruncatedSVD(n_components=50)),
    ])),

    # Pipeline for pulling ad hoc features from post's abstract
    ('abstract_stats', Pipeline([
        ('selector', ItemSelector(key='abstract')),
        ('stats', TextStats()),        # returns a list of dicts
        ('vect', DictVectorizer()),    # list of dicts -> feature matrix
    ])),
],

# weight components in FeatureUnion
transformer_weights={
le = preprocessing.LabelEncoder()
le.fit(df["Category"])
Y_train = le.transform(df["Category"])

X_train1 = df['Content']
X_train2 = []
for i in range(len(X_train1)):
    X_train2.append(10 * df['Title'][i] + df['Content'][i])
X_train = np.array(X_train2)

# read test file
df_test = pd.read_csv("test_set.csv", sep="\t")

vectorizer = CountVectorizer(stop_words='english')
transformer = TfidfTransformer()
svd = TruncatedSVD(n_components=200, random_state=42)

pipeline_test = Pipeline([
    ('vect', vectorizer),
    ('tfidf', transformer),
    ('svd', svd),
])

# My method---Voting Classifier
clf1 = BernoulliNB(fit_prior=False)
clf2 = KNeighborsClassifier(weights='distance', n_jobs=-1)
clf3 = RandomForestClassifier(n_estimators=500, n_jobs=-1)
clf = VotingClassifier(estimators=[('bnb', clf1), ('knn', clf2), ('rf', clf3)],
                       voting='hard')

pipeline = Pipeline([
    ('vect', vectorizer),
    ('tfidf', transformer),
    ('svd', svd),
    ('clf', clf)
# print titles
for sentence in test_data['Content']:
    temp_title = ''
    for j in range(10):
        temp_title = titles2[i] + ' ' + temp_title
    sentences2.append(temp_title + PorterStemmer().stem_sentence(sentence))
    i = i + 1

# Vectorizing-LSI-Classifier
X_train = np.array(sentences)
X_test = np.array(sentences2)

clf = Pipeline([('tfidf', TfidfVectorizer(stop_words=stopw)),
                ('svd', TruncatedSVD(n_components=1000)),
                ('clf', svm.SVC(C=10, gamma=0.0001, kernel='linear',
                                class_weight='balanced')),
                ])
clf.fit(X_train, y)
predicted = clf.predict(X_test)

# Print Results
categories = le.inverse_transform(predicted)
i = 0
CsvData2 = [['Id', 'Category']]
for t in test_data['Id']:
    CsvData2.append([t, categories[i]])
    i = i + 1
def svd(*args, **kwargs):
    return TruncatedSVD(*args, **kwargs)
def test_pipeline_column_transformer(self): iris = datasets.load_iris() X = iris.data[:, :3] y = iris.target X_train = pandas.DataFrame(X, columns=["vA", "vB", "vC"]) X_train["vcat"] = X_train["vA"].apply(lambda x: "cat1" if x > 0.5 else "cat2") X_train["vcat2"] = X_train["vB"].apply(lambda x: "cat3" if x > 0.5 else "cat4") y_train = y % 2 numeric_features = [0, 1, 2] # ["vA", "vB", "vC"] categorical_features = [3, 4] # ["vcat", "vcat2"] classifier = LogisticRegression( C=0.01, class_weight=dict(zip([False, True], [0.2, 0.8])), n_jobs=1, max_iter=10, solver="lbfgs", tol=1e-3, ) numeric_transformer = Pipeline(steps=[ ("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler()), ]) categorical_transformer = Pipeline(steps=[ ( "onehot", OneHotEncoder(sparse=True, handle_unknown="ignore"), ), ( "tsvd", TruncatedSVD(n_components=1, algorithm="arpack", tol=1e-4), ), ]) preprocessor = ColumnTransformer(transformers=[ ("num", numeric_transformer, numeric_features), ("cat", categorical_transformer, categorical_features), ]) model = Pipeline(steps=[("precprocessor", preprocessor), ("classifier", classifier)]) model.fit(X_train, y_train) initial_type = [ ("numfeat", FloatTensorType([None, 3])), ("strfeat", StringTensorType([None, 2])), ] X_train = X_train[:11] model_onnx = convert_sklearn(model, initial_types=initial_type) dump_data_and_model( X_train, model, model_onnx, basename="SklearnPipelineColumnTransformerPipeliner", allow_failure="StrictVersion(onnx.__version__)" " < StrictVersion('1.3') or " "StrictVersion(onnxruntime.__version__)" " <= StrictVersion('0.4.0')", ) if __name__ == "__main__": from onnx.tools.net_drawer import GetPydotGraph, GetOpNodeProducer pydot_graph = GetPydotGraph( model_onnx.graph, name=model_onnx.graph.name, rankdir="TP", node_producer=GetOpNodeProducer("docstring"), ) pydot_graph.write_dot("graph.dot") import os os.system("dot -O -G=300 -Tpng graph.dot")
    'adenoma', 'neurocitoma', 'cervello', 'glioma', 'glioblastoma', 'glia',
    'lipoma', 'liposarcoma', 'adiposo'
]

pairs = {
    'b': [('fibroma', 'fibrosarcoma'), ('lipoma', 'liposarcoma'),
          ('osteoma', 'osteosarcoma'), ('papilloma', 'carcinoma'),
          ('adenoma', 'adenocarcinoma'), ('glioma', 'glioblastoma'),
          ('neurocitoma', 'neuroblastoma')],
    'r': [('fibrosarcoma', 'connettivo'), ('liposarcoma', 'adiposo'),
          ('linfoma', 'linfonodo'), ('osteosarcoma', 'osso'),
          ('mesotelioma', 'mesotelio'), ('glioblastoma', 'glia'),
          ('neuroblastoma', 'neuroni')]
}

proj = Projector(n_components=2, random_state=0)
np.set_printoptions(suppress=True)
Y = proj.fit_transform(vectors[[index[k] for k in subset], :])
labels = words[[index[k] for k in subset]]

plt.scatter(Y[:, 0], Y[:, 1])
for label, x, y in zip(labels, Y[:, 0], Y[:, 1]):
    plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')

for color in pairs:
    for pair in pairs[color]:
        plt.plot([Y[subset.index(pair[0]), 0], Y[subset.index(pair[1]), 0]],
                 [Y[subset.index(pair[0]), 1], Y[subset.index(pair[1]), 1]],
                 '-', color=color)
# Transposing the matrix
X = ratings_matrix.T
X.head()
# X = ratings_matrix
# X.head()
X.shape
X1 = X

# Decomposing the Matrix
SVD = TruncatedSVD(n_components=10)
decomposed_matrix = SVD.fit_transform(X)
decomposed_matrix.shape

# Correlation Matrix
correlation_matrix = np.corrcoef(decomposed_matrix)
correlation_matrix.shape

X.index[75]

# Index of product ID purchased by customer
i = "B00000K135"
product_names = list(X.index)
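# Self-contained sketch (toy ratings, invented product ids, 0.5 threshold
# assumed) of the lookup this snippet is building toward: pick a product,
# read its row of the item-item correlation matrix, and keep highly
# correlated items as recommendations.
import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD

rng = np.random.RandomState(0)
ratings_matrix = pd.DataFrame(rng.randint(0, 6, size=(30, 8)),
                              columns=['P{}'.format(j) for j in range(8)])

X = ratings_matrix.T                               # items x users
decomposed = TruncatedSVD(n_components=5).fit_transform(X)
correlation_matrix = np.corrcoef(decomposed)       # item-item correlations

product_names = list(X.index)
i = 'P3'
product_id = product_names.index(i)
recommend = [p for p, c in zip(product_names, correlation_matrix[product_id])
             if c > 0.5 and p != i]
print(recommend)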
'''
Modeling:
    - TF-IDF
    - SVD
    - Visualization
'''

# TF-IDF
vec = TfidfVectorizer(max_features=1000, max_df=0.5, smooth_idf=True)
x = vec.fit_transform(joined)

# SVD
svd = TruncatedSVD(n_components=20, algorithm='randomized', n_iter=100, random_state=100)
svd.fit(x)

# Labels
terms = vec.get_feature_names()
for i, comp in enumerate(svd.components_):
    terms_comp = zip(terms, comp)
    sorted_terms = sorted(terms_comp, key=lambda x: x[1], reverse=True)[:7]
    print("Topic " + str(i) + ": ", [t[0] for t in sorted_terms])

# Visualize
topics = svd.fit_transform(x)
def post(self): # Get the THEME labels abs_filename = ett_h.generate_dynamic_path( [base_folder_location, LabelType.THEME.value, label_file_name]) labels = (ett_h.load_data_common_separated(abs_filename, ',')) # Get the label data from input_data raw_label = TrainThemeUpload.input_data[ColumnName.LABEL.value] data = ett_t.transform_data_to_dataframe_basic( TrainThemeUpload.input_data, colnames) # Get the OneHotEncoded labels label_df = ett_t.one_hot_encoding(raw_label) #17 labels dataframe # Rename the OneHotEncoded labels label_df.columns = labels # Get the number of labels num_of_labels = len(labels) # Data preprocessing nan_cleaned_data = ett_c.clean_dataframe_by_regex( data, RegexFilter.NON_ALPHA_NUMERIC.value ) # Removed all non alphanumeric characters d_cleaned_data = ett_c.clean_dataframe_by_regex( nan_cleaned_data, RegexFilter.DIGITS_ONLY.value) # Removed all digits l_cleaned_data = ett_c.remove_non_iso_words( d_cleaned_data, Language.ENGLISH.value) # Remove non-English text rew_cleaned_data = ett_c.remove_language_stopwords( l_cleaned_data, Language.ENGLISH.name) # Remove English stop words l_transformed_data = ett_t.lowercase( rew_cleaned_data) # Transform text to lowercase le_transformed_data = ett_t.stemming_mp( l_transformed_data ) # Transform text to core words i.e. playing > play data = le_transformed_data # Return the newly transformed data # Split the data into 0.8 training datasets and 0.2 testing datasets X_train, X_test, y_train, y_test = train_test_split(data, label_df, test_size=0.2, random_state=42) endpoint_output = {} for i in range(num_of_labels): model_id = str(i) single_label = y_train.iloc[:, i] label = labels[i] print("label", label) pipeline = imbPipeline([ (ModelType.TFIDF.value, TfidfVectorizer()), # Data vectorization (ModelType.OVERSAMPLE.value, SMOTE(random_state=42)), # Data balancing (ModelType.SVD.value, TruncatedSVD()), # Feature selection (ModelType.NOR.value, preprocessing.MinMaxScaler()), # Data normalization (ModelType.CLF.value, OneVsRestClassifier(SVC())) ]) # CLassification #list_c = [.1, .2, .3, .4, .5, .6, .7, .8, .9, 1] list_c = [1] #list_n = [100, 150, 200, 250, 300, 350, 400, 450, 500, 550]) list_n = [100] # Remember to add[2,\]2] best_score = 0 epsilon = .005 dictionary = {} for para_c in list_c: for para_n in list_n: parameters = { ModelType.TFIDF.value: [ TfidfVectorizer(max_features=800, ngram_range=(1, 4), norm='l2', encoding='latin-1', stop_words='english', analyzer='word') ], ModelType.SVD.value: [ TruncatedSVD(n_components=para_n, n_iter=7, random_state=42) ], ModelType.CLF.value: [ OneVsRestClassifier( SVC(kernel='linear', probability=True, C=para_c)) ] } gs_clf = GridSearchCV(pipeline, parameters, cv=5, error_score='raise', scoring='f1') gs_clf = gs_clf.fit(X_train, single_label) current_score = gs_clf.best_score_ dictionary[current_score] = parameters for current_score in dictionary.keys(): if current_score - epsilon > best_score: best_score = current_score model_dict = dictionary[best_score] label_model_list = {} label_model_list['score'] = best_score folder_time = time.strftime("_%Y%m%d_%H%M") # Create Directory in the AWS S3 Bucket os.mkdir("/Users/yihanbao/Desktop/unisdr-training/theme/" + label + "/" + label + folder_time) # Navigate to AWS model saving folder model_folder = os.path.join( os.path.dirname( os.path.dirname( os.path.dirname( os.path.dirname(os.path.realpath(__file__))))), ett_h.generate_dynamic_path( [LabelType.THEME.value, label, label + folder_time])) """ # Connect to AWS conn = boto.s3.connect_to_region(" 
",aws_access_key_id = 'AWS-Access-Key', aws_secret_access_key = 'AWS-Secrete-Key', calling_format = boto.s3.connection.OrdinaryCallingFormat()) bucket = conn.get_bucket("oict-psdg-unisdr-train-models-v1") # AWS Key aws_path = ett_h.generate_dynamic_path([LabelType.THEME.value, label, timestamp+label]) """ # Here to fit the training datasets to the models with best score # vectorization vector = model_dict[ModelType.TFIDF.value][0].fit( X_train, single_label) ett_h.save_model( vector, ett_h.generate_dynamic_path( [model_folder, label + folder_time + vector_model_name])) vectorized_df = vector.transform(X_train) label_model_list[ URLName.VECURL.value] = ett_h.generate_dynamic_path( [model_folder, label + folder_time + vector_model_name]) """ key_name = timestamp+label+model_name full_key_name = os.path.join(path, key_name) pickle_byte_obj = pickle.dump(vector) s3_resource = resource('s3') s3_resource.Object(bucket,full_key_name).put(Body=pickle_byte_obj) """ # Balcancing sm = SMOTE(random_state=42) X_res, y_res = sm.fit_resample(vectorized_df, single_label) # Feature selction svd = model_dict[ModelType.SVD.value][0].fit(X_res, y_res) ett_h.save_model( svd, ett_h.generate_dynamic_path([ model_folder, label + folder_time + dim_reductor_model_name ])) dim_reductor_df = svd.transform(X_res) label_model_list[ URLName.DIMURL.value] = ett_h.generate_dynamic_path([ model_folder, label + folder_time + dim_reductor_model_name ]) """ key_name = timestamp+label+dim_reductor_model_name full_key_name = os.path.join(path, key_name) pickle_byte_obj = pickle.dump(svd) s3_resource = resource('s3') s3_resource.Object(bucket,full_key_name).put(Body=pickle_byte_obj) """ # Normalizing min_max_scaler = preprocessing.MinMaxScaler() nor_model = min_max_scaler.fit(dim_reductor_df, y_res) ett_h.save_model( nor_model, ett_h.generate_dynamic_path([ model_folder, label + folder_time + normalizar_model_name ])) scaled_df = nor_model.transform(dim_reductor_df) label_model_list[ URLName.NORURL.value] = ett_h.generate_dynamic_path([ model_folder, label + folder_time + normalizar_model_name ]) """ key_name = timestamp+label+normalizar_model_name full_key_name = os.path.join(path, key_name) pickle_byte_obj = pickle.dump(nor_model) s3_resource = resource('s3') s3_resource.Object(bucket,full_key_name).put(Body=pickle_byte_obj) """ # Classifier clf = model_dict[ModelType.CLF.value][0].fit(scaled_df, y_res) clf.fit(scaled_df, y_res) ett_h.save_model( clf, ett_h.generate_dynamic_path( [model_folder, label + folder_time + model_name])) label_model_list[ URLName.MODURL.value] = ett_h.generate_dynamic_path( [model_folder, label + folder_time + model_name]) """ key_name = timestamp+label+model_name full_key_name = os.path.join(path, key_name) pickle_byte_obj = pickle.dump(scaled_df) s3_resource = resource('s3') s3_resource.Object(bucket,full_key_name).put(Body=pickle_byte_obj) """ endpoint_output[model_id] = [label_model_list] output = json.dumps(endpoint_output) return output
            'nntp', '00041032', '000062david42', '000050', '00041555',
            '0004244402', 'mcimail', '00043819', 'prb', '0004246', '0004422',
            '00044513', '00044939', 'access', 'digex', 'host', 'would',
            'writes', 'posting', 'dseg'])

# In[5]:

vectorizer = TfidfVectorizer(stop_words=stopset, use_idf=True, ngram_range=(1, 3))
X = vectorizer.fit_transform(corpus)

# In[6]:

# decompose into X = U S T^T
lsa = TruncatedSVD(n_components=25, n_iter=100)
lsa.fit(X)

# In[7]:

terms = vectorizer.get_feature_names()
for i, comp in enumerate(lsa.components_):
    termsInComp = zip(terms, comp)
    sortedTerms = sorted(termsInComp, key=lambda x: x[1], reverse=True)[:10]
    print("Concept %d:" % i)
    for term in sortedTerms:
        print(term[0])
    print(" ")
def svd_vector(data):
    svd = TruncatedSVD(n_components=1)
    vector = svd.fit_transform(data.ix[:, 6:].transpose())
    return [item for sublist in vector for item in sublist]
if len(fns) > 1:
    print('Multiple merged embeddings in working directory.')
    sys.exit()
else:
    m = fns[0]

print('Reading raw.')
sys.stdout.flush()
df = pd.read_csv(m, index_col=0, header=None)
if df.index.names[0] == 0:
    print('Renaming index column to SampleID.')
    df.index.names = ['SampleID']
    df.to_csv(m, compression='gzip')
mat = df.to_numpy().T
sampids = df.index
del df

print('Performing svd.')
sys.stdout.flush()
svd = TruncatedSVD(n_components=1, n_iter=7, random_state=0)
svd.fit(mat)
pc = svd.components_
# remove the projection onto the first principal component
mat -= mat.dot(pc.T) * pc

print('Saving nonraw.')
sys.stdout.flush()
df = pd.DataFrame(mat.T, index=sampids)
df.index.names = ['SampleID']
df.to_csv(m.replace('_raw', ''), compression='gzip')
def do_lsa(X, target_dim):
    svd = TruncatedSVD(target_dim, random_state=42)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    return lsa.fit_transform(X)
def main(argv):
    choose_mindf = argv[1]
    try:
        path = argv[2]
    except IndexError:
        path = None
    categories1 = [
        'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
        'comp.sys.mac.hardware', 'rec.autos', 'rec.motorcycles',
        'rec.sport.baseball', 'rec.sport.hockey'
    ]
    categories2 = [
        'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'misc.forsale',
        'soc.religion.christian'
    ]
    cat_all = [
        'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'misc.forsale',
        'soc.religion.christian', 'alt.atheism', 'comp.graphics',
        'comp.os.ms-windows.misc', 'comp.windows.x', 'rec.autos',
        'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey',
        'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space',
        'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc',
        'talk.religion.misc'
    ]
    dclass = Data(categories1, cat_all, categories2, path)
    stop_words = text.ENGLISH_STOP_WORDS

    print('-----Part A-----')
    # plot_histogram(dclass)

    print('-----Part B-----')
    vectorizer2 = CountVectorizer(min_df=2, stop_words=stop_words, max_df=0.8)
    tfidf_transformer2 = TfidfTransformer()
    vectorizer5 = CountVectorizer(min_df=5, stop_words=stop_words, max_df=0.8)
    tfidf_transformer5 = TfidfTransformer()
    tfidf2 = preprocess(dclass, dclass.training_data1, vectorizer2,
                        tfidf_transformer2, train=True)
    tfidf5 = preprocess(dclass, dclass.training_data1, vectorizer5,
                        tfidf_transformer5, train=True)  # default min_df=5
    print('# of terms with min_df = 2:', tfidf2[0, :].toarray().shape[1],
          '\n# of terms with min_df = 5:', tfidf5[0, :].toarray().shape[1])
    d_tfidf = {'2': tfidf2, '5': tfidf5}
    d_vectorizer = {'2': vectorizer2, '5': vectorizer5}
    d_transformer = {'2': tfidf_transformer2, '5': tfidf_transformer5}

    print('-----Part C-----')
    vectorizerc_2 = CountVectorizer(min_df=2, stop_words=stop_words, max_df=0.8)
    tfidf_transformerc_2 = TfidfTransformer()
    tfidf_c_2 = preprocess(dclass, dclass.training_data2, vectorizerc_2,
                           tfidf_transformerc_2, train=True, ICF=True)  # use TF-ICF
    find_10most(dclass, tfidf_c_2)
    vectorizerc_5 = CountVectorizer(min_df=5, stop_words=stop_words, max_df=0.8)
    tfidf_transformerc_5 = TfidfTransformer()
    tfidf_c_5 = preprocess(dclass, dclass.training_data2, vectorizerc_5,
                           tfidf_transformerc_5, train=True, ICF=True)  # use TF-ICF
    find_10most(dclass, tfidf_c_5)

    print('-----Part D-----')
    # SVD and NMF based on the TF-IDF matrix selected by choose_mindf
    svd = TruncatedSVD(n_components=50, n_iter=7, random_state=42)
    D_LSI = svd.fit_transform(d_tfidf[choose_mindf])
    model = NMF(n_components=50, init='random', random_state=0)
    D_NMF = model.fit_transform(d_tfidf[choose_mindf])
    print('LSI.shape:', D_LSI.shape, '\nNMF.shape:', D_NMF.shape)

    print('-----Part E-----')
    # SVM
    tfidftest = preprocess(dclass, dclass.testing_data1,
                           d_vectorizer[choose_mindf],
                           d_transformer[choose_mindf], train=False)  # testing data
    D_LSI_test = svd.transform(tfidftest)
    D_NMF_test = model.transform(tfidftest)
    print('for D_LSI:')
    part_e(dclass, D_LSI, D_LSI_test)
    print('for D_NMF:')
    part_e(dclass, D_NMF, D_NMF_test)

    print('-----Part F-----')
    print('for D_LSI:')
    part_f(dclass, D_LSI, D_LSI_test)
    print('for D_NMF:')
    part_f(dclass, D_NMF, D_NMF_test)

    print('-----Part G-----')
    part_g(dclass, D_NMF, D_NMF_test, dclass.training_target1)

    print('-----Part H-----')
    part_h(dclass, D_LSI, D_LSI_test)
    part_h(dclass, D_NMF, D_NMF_test)

    print('-----Part I-----')
    part_i(dclass, D_LSI, D_LSI_test)
    part_i(dclass, D_NMF, D_NMF_test)

    print('-----Part J-----')
    tfidf2_j = preprocess(dclass, dclass.training_dataj, vectorizer2,
                          tfidf_transformer2, train=True)
    D_LSI_j = svd.fit_transform(tfidf2_j)
    D_NMF_j = model.fit_transform(tfidf2_j)
    tfidftest_j = preprocess(dclass, dclass.testing_dataj, vectorizer2,
                             tfidf_transformer2, train=False)  # testing data
    D_LSI_test_j = svd.transform(tfidftest_j)
    D_NMF_test_j = model.transform(tfidftest_j)
    print('----------------Naive Bayes in J-----------------')
    part_g(dclass, D_NMF_j, D_NMF_test_j, dclass.training_targetj, True)
    print('----------------SVM in J with LSI data-----------')
    part_j_SVM(dclass, D_LSI_j, D_LSI_test_j)
    print('----------------SVM in J with NMF data-----------')
    part_j_SVM(dclass, D_NMF_j, D_NMF_test_j)
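# Data, preprocess, find_10most, and the part_* helpers used by main() are
# defined elsewhere in this project. As a rough sketch of the contract main()
# relies on -- an assumption, not the author's code (the TF-ICF branch is
# omitted) -- a compatible preprocess could look like this:
def preprocess(dclass, data, vectorizer, tfidf_transformer, train=True, ICF=False):
    """Turn raw documents into a TF-IDF matrix.

    Fits the vectorizer/transformer only on training data and merely
    transforms test data, so the test split never influences the
    vocabulary or the IDF weights.
    """
    if train:
        counts = vectorizer.fit_transform(data)
        return tfidf_transformer.fit_transform(counts)
    counts = vectorizer.transform(data)
    return tfidf_transformer.transform(counts)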
def train_with_bag_of_words(X_train, y_train, scorer, classifier='SVC', search=True):
    """
    Pass the data through a pipeline and return a trained model.

    Args:
        X_train: Train data
        y_train: Labels for the train data (transformed by LabelEncoder)
        scorer: Scoring string or callable passed to GridSearchCV
        classifier: One of 'SVC', 'LogisticRegression',
            'GradientBoostingClassifier', 'VotingClassifier'
        search: Whether to search for the best hyperparameters
    """
    estimators = {
        'SVC': SVC(
            C=5.1,
            kernel='linear',
            decision_function_shape='ovr',
            # class_weight='balanced'  # better without 'balanced'
        ),
        'LogisticRegression': LogisticRegression(C=5.1),
        'GradientBoostingClassifier': GradientBoostingClassifier(learning_rate=0.3),
    }
    if classifier != 'VotingClassifier':
        clf = estimators.get(classifier)
    else:
        estimators['SVC'].probability = True
        clf = VotingClassifier(
            estimators=[(k, v) for k, v in estimators.items()], voting='soft')
    print(clf)

    pipeline = Pipeline(
        [
            ('col_transf',
             ColumnTransformer(
                 [
                     ('scaler', StandardScaler(), [
                         'budget', 'client.feedback', 'client.reviews_count',
                         'client.jobs_posted', 'client.past_hires'
                     ]),
                     ('title_vec',
                      Pipeline([
                          ('preprocessor', SpacyPreprocessor()),
                          ('tfidf',
                           TfidfVectorizer(tokenizer=identity,
                                           preprocessor=None,
                                           lowercase=False,
                                           use_idf=True,
                                           ngram_range=(2, 2))),
                          ('svd', TruncatedSVD(n_components=150)),
                      ]), 'title'),
                     ('snippet_vec',
                      Pipeline([
                          ('preprocessor', SpacyPreprocessor()),
                          ('tfidf',
                           TfidfVectorizer(
                               tokenizer=identity,
                               preprocessor=None,
                               lowercase=False,
                               use_idf=True,
                               sublinear_tf=False,  # not good results when True
                               ngram_range=(1, 2))),
                          ('svd', TruncatedSVD(n_components=100)),
                      ]), 'snippet'),
                     ('cat', ce.CatBoostEncoder(),
                      ['job_type', 'category2', 'client.country']),
                 ],
                 remainder='drop')),
            # ('oversampling', ADASYN(random_state=42)),
            ('classifier', clf),
        ],
        verbose=True)

    if search:
        log_space = gen_parameters_from_log_space(low_value=5, high_value=8,
                                                  n_samples=10)
        lin_space = np.arange(2, 8, 2, dtype=int)  # np.int is removed in recent NumPy
        if classifier == 'SVC':
            grid = {
                # 'col_transf__title_vec__tfidf__ngram_range': [(1, 2), (2, 2)],
                # 'col_transf__snippet_vec__tfidf__ngram_range': [(1, 2), (2, 2)],
                # 'col_transf__snippet_vec__svd__n_components': np.arange(50, 301, 50),
                # 'col_transf__title_vec__svd__n_components': np.arange(100, 301, 50),
                'classifier__C': log_space,
            }
        elif classifier == 'LogisticRegression':
            grid = {
                'classifier__C': gen_parameters_from_log_space(0.1, 10, 10),
            }
        elif classifier == 'GradientBoostingClassifier':
            grid = {
                'classifier__learning_rate':
                    gen_parameters_from_log_space(0.01, 1, 10),
            }
        elif classifier == 'VotingClassifier':
            # Nested parameter names must match the keys of `estimators` above.
            grid = {
                'classifier__LogisticRegression__C':
                    gen_parameters_from_log_space(0.1, 10, 10),
                'classifier__SVC__C': gen_parameters_from_log_space(5, 8, 10),
                'classifier__GradientBoostingClassifier__learning_rate':
                    gen_parameters_from_log_space(0.01, 1, 10),
            }
        # With scoring="ovo", computes the average AUC of all possible pairwise
        # combinations of classes. Insensitive to class imbalance when
        # average='macro'.
        # Also see: https://stackoverflow.com/a/62471736/1253729
        searcher = GridSearchCV(
            estimator=pipeline,
            param_grid=grid,
            n_jobs=4,
            return_train_score=True,
            refit=True,
            verbose=True,
            cv=StratifiedKFold(n_splits=3),
            scoring=scorer,
        )
        model = searcher.fit(X_train, y_train.values.ravel())
        print(f"Best found parameters: {searcher.best_params_}")
    else:
        model = pipeline.fit(X_train, y_train.values.ravel())
    return model
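# The pipeline above refers to `identity` and `SpacyPreprocessor`, which are
# defined elsewhere in the project. A minimal sketch of what they might look
# like -- an assumption, not the author's code -- given that TfidfVectorizer
# is configured with tokenizer=identity and lowercase=False (i.e. it expects
# pre-tokenized documents):
import spacy
from sklearn.base import BaseEstimator, TransformerMixin


def identity(tokens):
    # Pass already-tokenized documents through TfidfVectorizer unchanged.
    return tokens


class SpacyPreprocessor(BaseEstimator, TransformerMixin):
    """Illustrative stand-in: lemmatize, drop stop words and punctuation."""

    def __init__(self, model='en_core_web_sm'):
        self.model = model

    def fit(self, X, y=None):
        self.nlp_ = spacy.load(self.model, disable=['parser', 'ner'])
        return self

    def transform(self, X):
        return [[tok.lemma_.lower() for tok in self.nlp_(doc)
                 if not (tok.is_stop or tok.is_punct or tok.is_space)]
                for doc in X]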
        rows.append({'text': text, 'class': classification})
        index.append(filename)

    data_frame = DataFrame(rows, index=index)
    return data_frame


data = DataFrame({'text': [], 'class': []})
for path, classification in SOURCES:
    # DataFrame.append was removed in pandas 2.0; use pd.concat on newer versions.
    data = data.append(build_data_frame(path, classification))
data = data.reindex(np.random.permutation(data.index))

# now split files into training data and labels. probably tuple (filename, r/d)
classifier = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('svd', TruncatedSVD(algorithm='randomized', n_components=300)),
    ('clf', XGBClassifier())
])
# classifier.fit(data['text'].values, data['class'].values)

k_fold = KFold(n_splits=8)
scores = []
confusion = np.array([[0, 0], [0, 0]])
for train_indices, test_indices in k_fold.split(data):
    train_text = data.iloc[train_indices]['text'].values
    train_y = data.iloc[train_indices]['class'].values

    test_text = data.iloc[test_indices]['text'].values
    test_y = data.iloc[test_indices]['class'].values
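    # The loop body is truncated above; a plausible continuation, written as an
    # assumed sketch rather than the original code: fit the pipeline on this
    # fold's training split and evaluate on the held-out split. Assumes
    # confusion_matrix and f1_score are imported from sklearn.metrics.
    classifier.fit(train_text, train_y)
    predictions = classifier.predict(test_text)
    confusion += confusion_matrix(test_y, predictions)
    scores.append(f1_score(test_y, predictions, average='macro'))

print('Total documents classified:', len(data))
print('Mean F1 score:', sum(scores) / len(scores))
print('Confusion matrix:')
print(confusion)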
for size in tqdm(size_list):
    model = KeyedVectors.load("./trained_model/fasttext_gensim_" + str(size) + ".model")
    words_np = []
    words_label = []
    for word in list_words:
        words_np.append(model[word])
        words_label.append(word)

    word_vector_reduced = {}
    for index, vec in enumerate(words_np):
        word_vector_reduced[words_label[index]] = vec

    list_cosin_similarity = []
    for x, y in zip(data["Word1"], data["Word2"]):
        list_cosin_similarity.append(round(cosin_similarity(x, y, word_vector_reduced), 2))
    data['Relation_number'] = new_col
    data["FastText_" + str(size)] = list_cosin_similarity

    if size == 200:
        for new_size in size_list[:-1]:
            svd = TruncatedSVD(n_components=new_size, n_iter=30)
            svd.fit(words_np)
            reduced = svd.transform(words_np)
            word_vector_reduced = {}
            for index, vec in enumerate(reduced):
                word_vector_reduced[words_label[index]] = vec
            list_cosin_similarity = []
            for x, y in zip(data["Word1"], data["Word2"]):
                list_cosin_similarity.append(round(cosin_similarity(x, y, word_vector_reduced), 2))
            data["FastText_SVD_" + str(new_size)] = list_cosin_similarity

# Write the results to a CSV file
tmp_name = os.path.basename(path_visim).split('.')[0] + '_result.csv'
data.to_csv(os.path.join("./result", tmp_name), sep="\t")
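# `cosin_similarity` is defined elsewhere in this project; a minimal sketch of
# what it plausibly does (cosine similarity between two words' vectors looked
# up in the given dict) -- an assumption, not the author's implementation:
import numpy as np


def cosin_similarity(word1, word2, word_vectors):
    v1, v2 = word_vectors[word1], word_vectors[word2]
    return float(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))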
print('==============')
print(metrics.confusion_matrix(labels, km.labels_))
print('==============')
print('-----------------------------------------------------')

#==============================================================================
#=========================Reduce Dimensionality (SVD)==========================
print('##############################################################')
for i in range(0, 5):
    print('Performing truncatedSVD...')
    svd = TruncatedSVD(n_components=165, n_iter=13, random_state=42)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    X_reduced = lsa.fit_transform(X)
    k_means(X_reduced, labels, 'truncatedSVD')

#==============================================================================
#=========================Reduce Dimensionality (PCA)==========================
print('##############################################################')
for i in range(0, 5):
    print('Performing PCA...')
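    # The PCA branch is truncated above; a plausible continuation mirroring the
    # SVD branch (an assumed sketch, not the original code). PCA needs dense
    # input, so a sparse tf-idf matrix must be densified first; assumes
    # `from sklearn.decomposition import PCA` at the top of the script.
    pca = PCA(n_components=165, random_state=42)
    X_dense = X.toarray() if hasattr(X, 'toarray') else X
    X_pca = pca.fit_transform(X_dense)
    k_means(X_pca, labels, 'PCA')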
# Tokenize each document into words.
# Removes stop words and keeps the lemmatized form of each word.
# Ignores words appearing in fewer than 5 (or 2 if min_df=2) documents.
vectorizer = CountVectorizer(min_df=5, stop_words=stop_words,
                             tokenizer=LemmaTokenizer())
X_train_counts = vectorizer.fit_transform(eight_train.data)
X_test_counts = vectorizer.transform(eight_test.data)

# TF-IDF
# We set smooth_idf=False so we use the equation idf(d, t) = log [ n / df(d, t) ] + 1
tfidf_transformer = TfidfTransformer(smooth_idf=False)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

# 'arpack' for the ARPACK wrapper in SciPy (scipy.sparse.linalg.svds)
svd = TruncatedSVD(n_components=50, algorithm='arpack')
X_train_lsi = svd.fit_transform(X_train_tfidf)
X_test_lsi = svd.transform(X_test_tfidf)

# Separate into two groups (Computer Tech & Recreation)
train_target_group = [int(x / 4) for x in eight_train.target]
test_actual = [int(x / 4) for x in eight_test.target]

# Logistic Regression classifier
log_reg = LogisticRegression()
log_reg.fit(X_train_lsi, train_target_group)
predicted = log_reg.predict(X_test_lsi)
predicted_probs = log_reg.predict_proba(X_test_lsi)
fpr, tpr, _ = roc_curve(test_actual, predicted_probs[:, 1])
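# fpr/tpr are computed above but not used in this excerpt; a typical follow-up
# is to plot the ROC curve and report the AUC. This is an illustrative sketch,
# assuming matplotlib.pyplot is imported as plt and auc comes from
# sklearn.metrics.
roc_auc = auc(fpr, tpr)
plt.plot(fpr, tpr, label='LSI + Logistic Regression (AUC = %.3f)' % roc_auc)
plt.plot([0, 1], [0, 1], linestyle='--', label='chance')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend()
plt.show()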
# Fit and transform
principalComponents = pca.fit_transform(X_scaled)

# Print ratio of variance explained.
# After transformation, we keep the three (as specified in n_components)
# transformed features with the highest transformed covariance eigenvalues.
print("PCA explained variance ratios: {}".format(pca.explained_variance_ratio_))
print("PCA components: {}".format(pca.components_))
print("The PCA component vector has size {}, because there are {} vectors with length {}".format(
    pca.components_.shape, pca.components_.shape[0], pca.components_.shape[1]))

########## Practicing singular value decomposition

# SVD
svd = TruncatedSVD(n_components=2)

# Fit and transform
principalComponents = svd.fit_transform(X_scaled)

# Print ratio of variance explained
print("SVD explained variance ratios: {}".format(svd.explained_variance_ratio_))

################ Practicing creating a PCA plot for visualization
# First project the data points onto the two SVD axes. Note that the principal
# components coming out of svd are already normalized.
df_pca = pd.DataFrame(
    np.transpose(np.array([np.dot(X_scaled, svd.components_[0]),
                           np.dot(X_scaled, svd.components_[1]),
                           cancer.target])),
    columns=['Principal component 1', 'Principal component 2', 'Target value'])
targets = [0, 1]
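# A typical follow-up for this projection DataFrame is a two-class scatter
# plot. This is an illustrative sketch (the colors and labels are assumptions,
# and matplotlib.pyplot is assumed to be imported as plt).
fig, ax = plt.subplots(figsize=(6, 6))
for target, color in zip(targets, ['red', 'green']):
    mask = df_pca['Target value'] == target
    ax.scatter(df_pca.loc[mask, 'Principal component 1'],
               df_pca.loc[mask, 'Principal component 2'],
               c=color, s=20, label='class {}'.format(target))
ax.set_xlabel('Principal component 1')
ax.set_ylabel('Principal component 2')
ax.legend()
plt.show()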
# UNSUPERVISED MODEL
from model import *
from sklearn.manifold import TSNE
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans

# Sparse SVD on tf-idf to reduce features to 50
print("start dimensionality reduction")
data = get_vectorized_tweets('training_vecs.npy').toarray()
svd_model = TruncatedSVD(n_components=50)
data_svd = svd_model.fit_transform(data)

print("start TSNE")
tsne_model = TSNE(n_components=2)
data_tsne = tsne_model.fit_transform(data_svd)
np.save('tsne_training_data.npy', data_tsne)

data_tsne = sample(np.asarray(get_vectorized_tweets('tsne_training_data.npy')), 500)
print(data_tsne.shape)
cluster_labels = KMeans(n_clusters=5).fit(data_tsne).labels_

import matplotlib.pyplot as plt
print("scatter:")
plt.scatter(data_tsne[:, 0], data_tsne[:, 1], c=cluster_labels)
plt.show()

# UNSUPERVISED MODEL, ONLY TOXIC SPEECH
# Select only toxic speech
df_data = pd.read_csv("twitter-sentiment-analysis-hatred-speech/train.csv",
                      names=('id', 'label', 'tweet'), header=None)
labels = df_data.to_numpy().T[1]
data_tsne = np.asarray(get_vectorized_tweets('tsne_training_data.npy'))
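# `sample` (used above to take 500 t-SNE points) comes from the project's own
# model module; a minimal sketch of what it plausibly does -- a random subset
# of rows -- stated as an assumption, not the original helper:
def sample(arr, n, seed=0):
    rng = np.random.default_rng(seed)
    idx = rng.choice(arr.shape[0], size=min(n, arr.shape[0]), replace=False)
    return arr[idx]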