Code example #1
class TF_Transformer(base.BaseEstimator, base.TransformerMixin):
	def __init__(self):

		self.cv_bi = CountVectorizer(min_df=2,max_df=0.7,ngram_range=(1,2))
		self.tfidf_trans = TfidfTransformer()
		self.SVD_trans = TruncatedSVD(n_components=300)

    # X is a list of Fit_Review named tuples, y is none
	def fit(self, X, y=None):

		texts = [review.text for review in X]

		counts = self.cv_bi.fit_transform(texts)
		counts_tfidf = self.tfidf_trans.fit_transform(counts)
		self.SVD_trans.fit(counts_tfidf)

		return self

    # X is a list of either Fit_Review or Prod_Corpus named tuples
	def transform(self, X):

		texts = [review.text for review in X]

		counts = self.cv_bi.transform(texts)
		counts_tfidf = self.tfidf_trans.transform(counts)
		counts_trunc = self.SVD_trans.transform(counts_tfidf)

		return counts_trunc
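A minimal wiring sketch (not part of the original snippet): TF_Transformer used as a Pipeline step in front of a classifier. Fit_Review, the synthetic corpus, and LogisticRegression are assumptions chosen so the hard-coded min_df=2 / n_components=300 settings are satisfiable; the snippet's own imports (base, CountVectorizer, TfidfTransformer, TruncatedSVD) are assumed to be in scope.

# Hypothetical usage sketch for TF_Transformer (all data below is synthetic).
from collections import namedtuple
import random

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

Fit_Review = namedtuple('Fit_Review', ['text', 'label'])  # assumed record shape

random.seed(0)
vocab = ['size', 'fit', 'small', 'large', 'fabric', 'colour', 'cheap', 'soft'] + \
        ['word%d' % i for i in range(400)]
reviews = [Fit_Review(' '.join(random.choices(vocab, k=12)), random.randint(0, 1))
           for _ in range(600)]
labels = [r.label for r in reviews]

pipe = Pipeline([
    ('features', TF_Transformer()),              # counts -> tf-idf -> 300-dim SVD
    ('clf', LogisticRegression(max_iter=1000)),  # any downstream estimator works
])
pipe.fit(reviews, labels)
print(pipe.predict(reviews[:5]))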
Code example #2
File: topic_model.py Project: markcassar/textacy
 def init_model(self, model, n_topics=10, **kwargs):
     if model == 'nmf':
         self.model = NMF(
             n_components=n_topics,
             alpha=kwargs.get('alpha', 0.1),
             l1_ratio=kwargs.get('l1_ratio', 0.5),
             max_iter=kwargs.get('max_iter', 200),
             random_state=kwargs.get('random_state', 1),
             shuffle=kwargs.get('shuffle', False))
     elif model == 'lda':
         self.model = LatentDirichletAllocation(
             n_topics=n_topics,
             max_iter=kwargs.get('max_iter', 10),
             random_state=kwargs.get('random_state', 1),
             learning_method=kwargs.get('learning_method', 'online'),
             learning_offset=kwargs.get('learning_offset', 10.0),
             batch_size=kwargs.get('batch_size', 128),
             n_jobs=kwargs.get('n_jobs', 1))
     elif model == 'lsa':
         self.model = TruncatedSVD(
             n_components=n_topics,
             algorithm=kwargs.get('algorithm', 'randomized'),
             n_iter=kwargs.get('n_iter', 5),
             random_state=kwargs.get('random_state', 1))
     else:
         msg = 'model "{}" invalid; must be {}'.format(
             model, {'nmf', 'lda', 'lsa'})
         raise ValueError(msg)
Code example #3
File: embedding.py Project: gianlucacorrado/EDeN
def embed_two_dimensions(data, vectorizer, size=10, n_components=5, colormap='YlOrRd'):
    if hasattr(data, '__iter__'):
        iterable = data
    else:
        raise Exception('ERROR: Input must be iterable')
    import itertools
    iterable_1, iterable_2 = itertools.tee(iterable)
    # get labels
    labels = []
    for graph in iterable_2:
        label = graph.graph.get('id', None)
        if label:
            labels.append(label)

    # transform iterable into sparse vectors
    data_matrix = vectorizer.transform(iterable_1)
    # embed high dimensional sparse vectors in 2D
    from sklearn import metrics
    distance_matrix = metrics.pairwise.pairwise_distances(data_matrix)

    from sklearn.manifold import MDS
    feature_map = MDS(n_components=n_components, dissimilarity='precomputed')
    explicit_data_matrix = feature_map.fit_transform(distance_matrix)

    from sklearn.decomposition import TruncatedSVD
    pca = TruncatedSVD(n_components=2)
    low_dimension_data_matrix = pca.fit_transform(explicit_data_matrix)

    plt.figure(figsize=(size, size))
    embed_dat_matrix_two_dimensions(low_dimension_data_matrix, labels=labels, density_colormap=colormap)
    plt.show()
Code example #4
def main():
    svd = TruncatedSVD()
    Z = svd.fit_transform(X)
    plt.scatter(Z[:,0], Z[:,1])
    for i in xrange(D):
        plt.annotate(s=index_word_map[i], xy=(Z[i,0], Z[i,1]))
    plt.show()
Code example #5
def tfIDFeats(ids,data):


    # the infamous tfidf vectorizer (Do you remember this one?)
    tfv = TfidfVectorizer(min_df=3,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 5), use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = 'english')
    # Fit TFIDF
    tfv.fit(data)
    X = tfv.transform(data)

    # Initialize SVD
    svd = TruncatedSVD(n_components=350)

    # Initialize the standard scaler
    scl = StandardScaler(with_mean=False)

    if X.shape[1]>350:
        X = svd.fit_transform(X)
    X = scl.fit_transform(X,ids)
    if plotData:
        X = PCA(n_components=2).fit_transform(X)
    return (X,ids)
Code example #6
def test_feature_union():
    # basic sanity check for feature union
    iris = load_iris()
    X = iris.data
    X -= X.mean(axis=0)
    y = iris.target
    svd = TruncatedSVD(n_components=2, random_state=0)
    select = SelectKBest(k=1)
    fs = FeatureUnion([("svd", svd), ("select", select)])
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    assert_equal(X_transformed.shape, (X.shape[0], 3))

    # check if it does the expected thing
    assert_array_almost_equal(X_transformed[:, :-1], svd.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())

    # test if it also works for sparse input
    # We use a different svd object to control the random_state stream
    fs = FeatureUnion([("svd", svd), ("select", select)])
    X_sp = sparse.csr_matrix(X)
    X_sp_transformed = fs.fit_transform(X_sp, y)
    assert_array_almost_equal(X_transformed, X_sp_transformed.toarray())

    # test setting parameters
    fs.set_params(select__k=2)
    assert_equal(fs.fit_transform(X, y).shape, (X.shape[0], 4))

    # test it works with transformers missing fit_transform
    fs = FeatureUnion([("mock", TransfT()), ("svd", svd), ("select", select)])
    X_transformed = fs.fit_transform(X, y)
    assert_equal(X_transformed.shape, (X.shape[0], 8))
Code example #7
File: tfidf.py Project: rudraksh125/socialmedia
def train():
    with open("../data/f_hashtag_prediction/train_data_tweets_processed_0_to_500K.txt") as ftrain:
        with open("../data/f_hashtag_prediction/test_data_tweets_processed_2K.txt") as ftest:
            test_set = ftest.read().splitlines()
            train_set = ftrain.read().splitlines()
            # vectorizer = CountVectorizer()
            vectorizer = TfidfVectorizer(min_df=5, max_df=500, max_features=None,
                                         strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
                                         ngram_range=(1, 4), use_idf=1, smooth_idf=1, sublinear_tf=1,
                                         stop_words='english')
            # vectorizer = TfidfVectorizer()
            tfidf_matrix = vectorizer.fit_transform(train_set)
            print tfidf_matrix.shape
            # print tfidf_matrix
            # print vectorizer.fixed_vocabulary_
            smatrix = vectorizer.transform(test_set)
            print smatrix.shape

            joblib.dump(smatrix, "test_tfidf_matrix.o")
            joblib.dump(tfidf_matrix, "train_tfidf_matrix.o")

            svd = TruncatedSVD(n_components=500, random_state=42)
            svd.fit(tfidf_matrix)
            truncated_train_svd = svd.transform(tfidf_matrix)
            truncated_test_svd = svd.transform(smatrix)

            print truncated_train_svd.shape
            print truncated_test_svd.shape

            joblib.dump(truncated_train_svd, "truncated_train_svd.o")
            joblib.dump(truncated_test_svd, "truncated_test_svd.o")

        print "TEST SET: "
        test_index = 0
Code example #8
 def find_k(self, rank=None, max_clusters=1, vertline=None):
     
     if rank != None:
         svd = TruncatedSVD(rank)
         self.X = svd.fit_transform(self.X)
         self.X = Normalizer(copy=False).fit_transform(self.X)
     
     k_range = range(1, max_clusters)
     clusters = [KMeans(n_clusters=k).fit(self.X) for k in k_range]
     centroids = [cluster.cluster_centers_ for cluster in clusters]
     k_cosine = [cdist(self.X, cent, metric='cosine') for cent in centroids]
     dist = [np.min(k_cos, axis=1) for k_cos in k_cosine]
     
     wcss = [sum(d[np.isnan(d) == False]**2) for d in dist] # Within cluster sum of squares
     tss = sum(pdist(self.X)**2)/self.X.shape[0] # Total sum of squares
     bss = tss - np.array(wcss) # Explained variance (BSS); np.array avoids subtracting a list from a scalar
             
     fig, (ax1, ax2) = plt.subplots(1, 2)
     fig.set_size_inches(10, 3)
     plt.tight_layout()
     
     ax1.set_title('BSS')
     ax1.plot(np.arange(1, len(bss)+1), bss)
     ax1.scatter(np.arange(1, len(bss)+1), bss)        
     ax2.set_title('WCSS')
     ax2.plot(np.arange(1, len(wcss)+1), wcss)
     ax2.scatter(np.arange(1, len(wcss)+1), wcss)
     plt.axvline(vertline, c='red', alpha=0.75) if vertline != None else None
         
     plt.show()
Code example #9
File: io.py Project: mmathioudakis/geotopics
def reduce_dim(sparse_matrix, raw_data, unigrams, n: int, filename_prefix: str):
    """
    Applies truncated SVD to given sparse matrix and "clusters" each word according to
    the component that "leans" most in its direction.

    i.e. for each user, find out which principal component has the maximum value in its
    direction. Then assign it to the component with the maximum value.

    After doing this for all users and summing up the counts, components become
    "super user"s.

    :param sparse_matrix: feature matrix to be reduced
    :param unigrams: unigrams that correspond to columns in sparse_matrix
    These will be used to create a mapping file from word to super-word
    :param n: number of components
    :param filename_prefix: assignment vector will be saved with this prefix
    :return: reduced feature matrix where each column is a new "super-word"
    """
    svd = TruncatedSVD(n_components=n)
    svd.fit(sparse_matrix)
    maximums = np.argmax(np.abs(svd.components_), axis=0)
    unigram_feat_map = dict([(unigrams[i], maximums[i]) for i in range(len(maximums))])

    reduced = get_sparse_occur_matrix(raw_data, unigram_feat_map)[:, 0:n]
    # num_points, _ = sparse_matrix.shape
    # counts = sparse.csc_matrix((num_points, n), dtype=int)
    #
    # for feat_index, target_component in enumerate(maximums):
    #     counts[:, target_component] += sparse_matrix[:, feat_index]
    #
    with open(filename_prefix + ".svdfeatmap", "wb") as svdfeatmap:
        pickle.dump(unigram_feat_map, svdfeatmap)

    return reduced
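A self-contained sketch of the word-to-component ("super-word") assignment described in the docstring above, applied to a tiny random count matrix; the data, sizes, and unigram names are assumptions for illustration.

import numpy as np
from scipy import sparse
from sklearn.decomposition import TruncatedSVD

rng = np.random.RandomState(0)
counts = sparse.random(100, 20, density=0.3, format='csr', random_state=rng)  # docs x words
unigrams = ['w%d' % i for i in range(20)]

svd = TruncatedSVD(n_components=4, random_state=0).fit(counts)
assignment = np.argmax(np.abs(svd.components_), axis=0)   # component index per word
print(dict(zip(unigrams, assignment)))                    # word -> "super-word" id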
Code example #10
File: tfidf.py Project: rudraksh125/socialmedia
def train_manual():
    with open("../data/f_hashtag_prediction/train_data_tweets_processed_0_to_500K.txt") as ftrain:
        with open("../data/f_hashtag_prediction/test_data_tagged_processed_manual.txt") as ftest:
            test_set = ftest.read().splitlines()
            train_set = ftrain.read().splitlines()
            # vectorizer = CountVectorizer()
            vectorizer = TfidfVectorizer(min_df=5, max_df=500, max_features=None,
                                         strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
                                         ngram_range=(1, 4), use_idf=1, smooth_idf=1, sublinear_tf=1,
                                         stop_words='english')
            # vectorizer = TfidfVectorizer()
            tfidf_matrix = vectorizer.fit_transform(train_set)
            print tfidf_matrix.shape

            smatrix = vectorizer.transform(test_set)
            print smatrix.shape

            svd = TruncatedSVD(n_components=500, random_state=42)
            svd.fit(tfidf_matrix)
            truncated_train_svd = svd.transform(tfidf_matrix)
            truncated_test_svd = svd.transform(smatrix)

            print truncated_train_svd.shape
            print truncated_test_svd.shape

            cosine = cosine_similarity(truncated_test_svd[0], truncated_train_svd)
            print cosine

        print "TEST SET: "
Code example #11
 def solve(self, X, missing_mask):
     observed_mask = ~missing_mask
     X_filled = X
     for i in range(self.max_iters):
         # deviation from original svdImpute algorithm:
         # gradually increase the rank of our approximation
         if self.gradual_rank_increase:
             curr_rank = min(2 ** i, self.rank)
         else:
             curr_rank = self.rank
         tsvd = TruncatedSVD(curr_rank, algorithm=self.svd_algorithm)
         X_reduced = tsvd.fit_transform(X_filled)
         X_reconstructed = tsvd.inverse_transform(X_reduced)
         X_reconstructed = self.clip(X_reconstructed)
         mae = masked_mae(
             X_true=X,
             X_pred=X_reconstructed,
             mask=observed_mask)
         if self.verbose:
             print(
                 "[IterativeSVD] Iter %d: observed MAE=%0.6f" % (
                     i + 1, mae))
         converged = self._converged(
             X_old=X_filled,
             X_new=X_reconstructed,
             missing_mask=missing_mask)
         X_filled[missing_mask] = X_reconstructed[missing_mask]
         if converged:
             break
     return X_filled
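For context, a self-contained sketch of a single SVD-impute pass like the loop above, on synthetic data; the rank, the initial mean fill, and the error metric are assumptions.

import numpy as np
from sklearn.decomposition import TruncatedSVD

rng = np.random.RandomState(0)
X_true = rng.rand(50, 20)
missing_mask = rng.rand(50, 20) < 0.2
X_filled = X_true.copy()
X_filled[missing_mask] = X_true[~missing_mask].mean()     # crude initial fill

tsvd = TruncatedSVD(n_components=5)
X_reconstructed = tsvd.inverse_transform(tsvd.fit_transform(X_filled))
X_filled[missing_mask] = X_reconstructed[missing_mask]    # update only the missing cells
print(np.abs(X_filled - X_true)[missing_mask].mean())     # error on the imputed cells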
Code example #12
def main():
    infile = open(sys.argv[1])
    outfile = sys.argv[2] #needs to be a string
    vocabfile = open(sys.argv[3])
    vocab = json.load(vocabfile)

    F = sparse.lil_matrix((len(vocab), 4*len(vocab)), dtype=np.int32)
    corpus_size = 0
    lc = 0

    for line in infile:
        lc += 1
        if lc % 10000 == 0:
            print('processing line ' + str(lc) + ' at ' + str(datetime.datetime.now()))
        words = line.split()
        num_words = len(words)
        corpus_size += num_words

        if num_words < 5:
            process_short_line(num_words, words, F, vocab)

        else:
            F[vocab[words[0]], 4 * vocab[words[1]] + 2] += 1
            F[vocab[words[0]], 4 * vocab[words[2]] + 3] += 1

            F[vocab[words[1]], 4 * vocab[words[0]] + 1] += 1
            F[vocab[words[1]], 4 * vocab[words[2]] + 2] += 1
            F[vocab[words[1]], 4 * vocab[words[3]] + 3] += 1

            F[vocab[words[-2]], 4 * vocab[words[-4]] + 0] += 1
            F[vocab[words[-2]], 4 * vocab[words[-3]] + 1] += 1
            F[vocab[words[-2]], 4 * vocab[words[-1]] + 2] += 1

            F[vocab[words[-1]], 4 * vocab[words[-3]] + 0] += 1
            F[vocab[words[-1]], 4 * vocab[words[-2]] + 1] += 1
            
            for i, word in enumerate(words[2:-2], start=2):  # start=2 so words[i-2..i+2] are the true neighbours
                F[vocab[word], 4 * vocab[words[i-2]] + 0] += 1
                F[vocab[word], 4 * vocab[words[i-1]] + 1] += 1
                F[vocab[word], 4 * vocab[words[i+1]] + 2] += 1
                F[vocab[word], 4 * vocab[words[i+2]] + 3] += 1
                
    # compute PMI
    Fc = F.tocoo()
    word_freqs = Fc.sum(1)
    context_freqs = Fc.sum(0)
    word_freqs = word_freqs.A1
    context_freqs = context_freqs.A1

    for i,j,v in zip(Fc.row, Fc.col, Fc.data):
        F[i,j] = max( math.log((v * corpus_size) / (word_freqs[i] * context_freqs[j])), 0 )

    # compute TruncatedSVD
    svd = TruncatedSVD(n_components=200)
    Fred = svd.fit_transform(F)

    np.savetxt(outfile, Fred, delimiter=',')

    infile.close()
    vocabfile.close()
Code example #13
def get_lsa(x,t):
    print "LSA"
    lsa = SVD(n_components=600,algorithm="arpack")
    lsa.fit(x)
    x = lsa.transform(x)
    t = lsa.transform(t)
    return x,t
Code example #14
File: cook.py Project: wangchr/eMeriL
def cook():
    x, y, weights = load_data()
    n_components = 200
    svd = TruncatedSVD(n_components, random_state=42)
    x_unweighted = svd.fit_transform(x)
    x_weighted = svd.fit_transform(weighted(x, weights))

    for i in range(9):
        frac = 1 - (i * 0.01 + 0.01)
        print frac

        x_train, x_test, y_train, y_test = train_test_split(x_unweighted, y, test_size=frac)
        classifier = AdaBoostClassifier(n_estimators=100)
        classifier.fit(x_train, y_train)
        print "Unweighted: ", classifier.score(x_test, y_test)

        x_train, x_test, y_train, y_test = train_test_split(x_weighted, y, test_size=frac)
        classifier = AdaBoostClassifier(n_estimators=100)
        classifier.fit(x_train, y_train)
        print "Weighted: ", classifier.score(x_test, y_test)

        print '--------------------------'


Code example #15
def benchmark(k, epochs):
  print("*" * 80)
  print("k: %d, epochs: %d\n" % (k, epochs))

  #select = SelectKBest(score_func=chi2, k=k)
  select = TruncatedSVD(n_components=k)
  X_train_trunc = select.fit_transform(X_train, Y_train)
  X_test_trunc = select.transform(X_test)

  print('done truncating')

  parameters = {'C': [1, 10, 100, 1000, 10000],  'class_weight': ['auto', None], 'tol':[0.001,0.0001]}
  clf = LinearSVC(C=100000)
  #clf = grid_search.GridSearchCV(svc, parameters)
  clf.fit(X_train_trunc, Y_train)
  pred = clf.predict(X_test_trunc)

  if CREATE_SUBMISSION:
    X_submit_trunc = select.transform(X_submit)
    pred_submit = clf.predict(X_submit_trunc)
    dump_csv(pred_submit, k, epochs)

  score = metrics.f1_score(Y_test, pred)
  print("f1-score:   %0.3f" % score)

  print("classification report:")
  print(metrics.classification_report(Y_test, pred))

  print("confusion matrix:")
  print(metrics.confusion_matrix(Y_test, pred))
Code example #16
def truncatedSVD(data, labels, new_dimension):
    print "start truncatedSVD..."
    start = time.time()
    pca = TruncatedSVD(n_components=new_dimension)
    reduced = pca.fit_transform(data)
    end = time.time()
    return (reduced, end-start)
Code example #17
def preprocess(data, n_components, use_tf_idf=True):
    """
    Preprocess the data for clustering by running SVD and
    normalizing the results. This process is also known as
    LSA.

    arguments:
    data -- Dataset; if use_tf_idf is True the object must contain a
            tf_idf table alongside a raw frequencies dataframe.
    n_components -- int, the number of components to use for the SVD;
                    a minimum of 100 is recommended.
    use_tf_idf -- bool, whether to use the tf-idf frequencies for the
                  preprocessing.

    returns:
    e -- float, a measure of variance explained by the SVD.
    X -- np.array, an array with the data reduced to n_components.
    """
    if use_tf_idf:
        d = data.tf_idf.as_matrix()
    else:
        d = data.df.as_matrix()
    svd = TruncatedSVD(n_components=n_components)
    X = svd.fit_transform(d)
    norm = Normalizer()

    # Record a measure of explained variance
    e = svd.explained_variance_ratio_.sum()*100
    return e, norm.fit_transform(X)  # normalize the SVD-reduced data (X), not the raw matrix
Code example #18
def benchmark(k, epochs):
  print("*" * 80)
  print("k: %d, epochs: %d\n" % (k, epochs))

  #select = SelectKBest(score_func=chi2, k=k)
  select = TruncatedSVD(n_components=k)
  X_train_trunc = select.fit_transform(X_train, Y_train)
  X_test_trunc = select.transform(X_test)

  print('done truncating')

  clf = DBN([X_train_trunc.shape[1], k, 4], learn_rates=0.3, learn_rate_decays=0.9, epochs=epochs, verbose=1)
  clf.fit(X_train_trunc, Y_train)
  pred = clf.predict(X_test_trunc)

  if CREATE_SUBMISSION:
    X_submit_trunc = select.transform(X_submit)
    pred_submit = clf.predict(X_submit_trunc)
    dump_csv(pred_submit, k, epochs)

  score = metrics.f1_score(Y_test, pred)
  print("f1-score:   %0.3f" % score)

  print("classification report:")
  print(metrics.classification_report(Y_test, pred))

  print("confusion matrix:")
  print(metrics.confusion_matrix(Y_test, pred))
Code example #19
File: cluster_DBSCAN.py Project: nickgentoo/pyEDeN
def cluster_DBSCAN(args):
	"""
	Clustering with DBSCAN on a low-dimensional projection of the data.
	"""
	#load data
	g_it = node_link_data.node_link_data_to_eden(input = args.input_file, input_type = "file")
	vec = graph.Vectorizer(r = args.radius,d = args.distance, nbits = args.nbits)
	logger.info('Vectorizer: %s' % vec)

	X = vec.transform(g_it, n_jobs = args.n_jobs)
	logger.info('Instances: %d Features: %d with an avg of %d features per instance' % (X.shape[0], X.shape[1], X.getnnz() / X.shape[0]))
	
	#project to lower dimensional space to use clustering algorithms
	transformer = TruncatedSVD(n_components=args.n_components)
	X_dense=transformer.fit_transform(X)

	#log statistics on data
	logger.info('Dimensionality reduction Instances: %d Features: %d with an avg of %d features per instance' % (X_dense.shape[0], X_dense.shape[1], X.getnnz() / X.shape[0]))

	#clustering
	clustering_algo = DBSCAN(eps = args.eps)
	y = clustering_algo.fit_predict(X_dense)
	msg = 'Predictions statistics: '
	msg += util.report_base_statistics(y)
	logger.info(msg)

	#save model for vectorizer
	out_file_name = "vectorizer"
	eden_io.dump(vec, output_dir_path = args.output_dir_path, out_file_name = out_file_name)
	logger.info("Written file: %s/%s",args.output_dir_path, out_file_name)

	#save result
	out_file_name = "labels"
	eden_io.store_matrix(matrix = y, output_dir_path = args.output_dir_path, out_file_name = out_file_name, output_format = "text")
	logger.info("Written file: %s/%s",args.output_dir_path, out_file_name)
Code example #20
File: svm_model.py Project: bhtucker/chatnet
def train_pca_svm(learning_data, pca_dims, probability=True, cache_size=3000, **svm_kwargs):
    (X_train, y_train, train_ids), (X_test, y_test, test_ids) = learning_data

    pca = TruncatedSVD(n_components=pca_dims)
    n_symbols = max(
        np.max(X_train) + 1, np.max(X_test) + 1
    )
    logger.info("Forming CSR Matrices")
    x_train, x_test = create_csr_matrix(X_train, n_symbols), create_csr_matrix(X_test, n_symbols)
    logger.info("Starting PCA")
    # pseudo-supervised PCA: fit on positive class only
    pca = pca.fit(x_train[y_train > 0])

    x_train_pca = pca.transform(x_train)
    x_test_pca = pca.transform(x_test)

    logger.info("Starting SVM")
    svc = SVC(probability=probability, cache_size=cache_size, **svm_kwargs)
    svc.fit(x_train_pca, y_train)
    logger.info("Scoring SVM")
    score = svc.score(x_test_pca, y_test)
    logger.info(score)
    svc.test_score = score
    pca.n_symbols = n_symbols
    return svc, pca, x_train_pca, x_test_pca
Code example #21
def test_inverse_transform(algo):
    # We need a lot of components for the reconstruction to be "almost
    # equal" in all positions. XXX Test means or sums instead?
    tsvd = TruncatedSVD(n_components=52, random_state=42, algorithm=algo)
    Xt = tsvd.fit_transform(X)
    Xinv = tsvd.inverse_transform(Xt)
    assert_array_almost_equal(Xinv, Xdense, decimal=1)
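The same round trip as a standalone sketch, using synthetic sparse data instead of the test module's X / Xdense fixtures (shapes are assumptions):

import numpy as np
from scipy import sparse
from sklearn.decomposition import TruncatedSVD

X = sparse.random(60, 55, density=0.3, random_state=np.random.RandomState(42))
tsvd = TruncatedSVD(n_components=52, random_state=42)
Xt = tsvd.fit_transform(X)             # (60, 52) dense component scores
Xinv = tsvd.inverse_transform(Xt)      # back to the original 55-dimensional space
print(Xinv.shape, np.abs(X.toarray() - Xinv).max())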
Code example #22
File: prepare.py Project: orazaro/stumbleupon_kaggle
def lsa(BP, lentrain, n_components=16, preproc=True, 
    fit_area='test', min_df=3):
    """
    aka Latent semantic analysis
    """
    if preproc:
        print "pre-processing data"
        traindata = []
        for observation in BP:
            traindata.append(preprocess_pipeline(observation, "english", 
                "WordNetLemmatizer", True, True, False))
        BP = traindata

    print "fitting TfidfVectorizer"
    tfv = TfidfVectorizer(min_df=min_df,  max_features=None, strip_accents='unicode',  
        analyzer='word',token_pattern=r'\w{1,}',ngram_range=(1, 2), use_idf=1,
        smooth_idf=1, sublinear_tf=1, norm='l2')
    if fit_area == 'test':
        tfv.fit(BP[lentrain:])
    elif fit_area == 'train':
        tfv.fit(BP[:lentrain])
    else:
        tfv.fit(BP)
    print "transforming data"
    BP = tfv.transform(BP)
    print "BP(post):",BP.shape

    if 1:
        # svd here
        print "use svd"
        svd = TruncatedSVD(n_components=n_components, random_state=1)
        BP = svd.fit_transform(BP)
    
    return BP
Code example #23
File: AgeGroup.py Project: hurelyyu/CS_Master_UW
def kfold(agetext,k,model,k2):
    import collections
    out = []
    for i in range(k):
        print "iteration: "+str(i)
        agetext = shuffle(agetext)
        datatb = agetext.iloc[:,1:]
        label = agetext["agegroup"].tolist()
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(
            datatb, label, test_size=0.15, random_state=i*6)
        data = X_train.values
        counter = collections.Counter(y_train)
        print counter
        testdata = X_test.values
        lsa = TruncatedSVD(k2, algorithm = 'arpack')
        normalizer = Normalizer(copy=False)
        X = lsa.fit_transform(data)
        X = normalizer.fit_transform(X)
        X_test = lsa.transform(testdata)
        X_test = normalizer.transform(X_test)
        model.fit(X,y_train)
        pred = model.predict(X_test)
        counter = collections.Counter(y_test)
        print counter
        counter = collections.Counter(pred)
        print counter
        out.append(round(accuracy_score(y_test, pred),5))
    print str(out)
    print np.mean(out)
Code example #24
def lsa_summarizer(text,num_sen=5):
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    sentenceTokens = sent_detector.tokenize(text.strip())

    tfvectorizer = TfidfVectorizer(tokenizer=tokenizeText)
    sparse = tfvectorizer.fit_transform(sentenceTokens).A
    lsa = TruncatedSVD(n_components=1)
    concept = lsa.fit_transform(sparse)

    pos = np.array(list(range(len(sentenceTokens))))    
    
    listlist = [list(x) for x in zip(sentenceTokens,concept,pos)]

    listlist.sort(key=lambda x: x[1],reverse=True)

    summarysentences = listlist[0:num_sen]

    summarysentences.sort(key=lambda x: x[2],reverse=False)

    summary = ""
    for n in range(num_sen):
        summary += ' ' + summarysentences[n][0]
        summary = " ".join(summary.replace(u"\xa0", u" ").strip().split())

    return summary
Code example #25
def SVD_CV(counts, scores, n_comp=range(10,611,100)):

	n_avg = 16
	avg_err = []
	for n in range(0,n_avg):

		X_train, X_test, y_train, y_test = cross_validation.train_test_split(counts, scores, \
											test_size=0.2, random_state=n)
		test_err = []
		for n_c in n_comp:  # separate name so the outer fold index n is not shadowed
			TruncTrans = TruncatedSVD(n_components=n_c)
			X_trunc_train = TruncTrans.fit_transform(X_train,scores)
			regr = linear_model(X_trunc_train,y_train)
			X_trunc_test = TruncTrans.transform(X_test)
			y_pred = regr.predict(X_trunc_test)*10**(-12)+3
			test_err.append(metrics.mean_squared_error(y_test, y_pred))

		if not avg_err:
			avg_err = test_err
		else:
			avg_err = [avg_err[i]+(test_err[i]*(1.0/n_avg)) for i in range(0,len(test_err))]



	plt.plot(n_comp, avg_err, label='Out-of-Sample Error')
	plt.xlabel('n components')
	plt.ylabel('MSE')
	plt.show()
Code example #26
    def fit_document_matrix(self, X):
        """
        Reduce dimension of sparse matrix X
        using Latent Semantic Analysis and
        build nearest neighbor model

        Parameters
        ----------
        X: sparse csr matrix, sparse term frequency matrix or
            others weighting matrix from documents
        """
        n_components = self.n_components
        n_iter = self.n_iter
        algorithm = self.algorithm
        lsa_model = TruncatedSVD(n_components=n_components,
                                 n_iter=n_iter,
                                 algorithm=algorithm)
        # reduce dimension using Latent Semantic Analysis
        vectors = lsa_model.fit_transform(X)
        self.vectors = vectors

        # build nearest neighbor model
        nbrs_model = build_nearest_neighbors(vectors, n_recommend=self.n_recommend)
        self.nbrs_model = nbrs_model

        return self
Code example #27
def compute_svd(Xs):
    # compute 1st principal component
    svd = TruncatedSVD(n_components=1, n_iter=20, random_state=0)
    svd.fit(Xs)
    pc = svd.components_
    print(pc.shape, svd.explained_variance_ratio_)
    return pc
Code example #28
File: build_datasets.py Project: mpearmain/bnp
def buildKB16(n_comp = 200, seed_value = 123):
    ## data
    # read the training/test data  
    print('Importing Data')
    xtrain = pd.read_csv('../input/xtrain_kb6099.csv')
    xtest = pd.read_csv('../input/xtest_kb6099.csv')
    
    # separate 
    id_train = xtrain.ID; xtrain.drop('ID', axis = 1, inplace = True)
    ytrain = xtrain.target; xtrain.drop('target', axis = 1, inplace = True)
    id_test = xtest.ID; xtest.drop('ID', axis = 1, inplace = True)
    
    # fit SVD
    svd = TruncatedSVD(n_components = n_comp,n_iter=5, random_state= seed_value)
    svd.fit(xtrain)
    xtrain = svd.transform(xtrain)
    xtest = svd.transform(xtest)
    xtrain = pd.DataFrame(xtrain)
    xtest = pd.DataFrame(xtest)
    
    ## store the results
    # add indices etc
    xtrain = pd.DataFrame(xtrain)
    xtrain['ID'] = id_train
    xtrain['target'] = ytrain
    xtest = pd.DataFrame(xtest)
    xtest['ID'] = id_test

    # save the files
    xtrain.to_csv('../input/xtrain_kb16c'+str(n_comp)+'.csv', index = False, header = True)
    xtest.to_csv('../input/xtest_kb16c'+str(n_comp)+'.csv', index = False, header = True)
    
    return
Code example #29
File: eda.py Project: lwoloszy/albumpitch
def basic_lsi(df, n_components=200, max_df=0.5, min_df=5):
    '''
    Basic LSI model for album recommendations

    Args:
        df: dataframe with Pitchfork reviews
        n_components: number of lsi dimensions
        max_df: max_df in TfidfVectorizer
        min_df: min_df in TfidfVectorizer
    Returns:
        tfidf: sklearn fitted TfidfVectorizer
        tfidf_trans: sparse matrix with tfidf transformed data
        svd: sklearn fitted TruncatedSVD
        svd_trans: dense array with lsi transformed data

    '''

    X = df['review']
    stopwords = nltk.corpus.stopwords.words('english')

    tfidf = TfidfVectorizer(stop_words=stopwords,
                            max_df=max_df, min_df=min_df)
    tfidf_trans = tfidf.fit_transform(X)

    svd = TruncatedSVD(n_components=n_components)
    svd_trans = svd.fit_transform(tfidf_trans)

    return tfidf, tfidf_trans, svd, svd_trans
Code example #30
def test_sparse_formats(fmt):
    Xfmt = Xdense if fmt == "dense" else getattr(X, "to" + fmt)()
    tsvd = TruncatedSVD(n_components=11)
    Xtrans = tsvd.fit_transform(Xfmt)
    assert_equal(Xtrans.shape, (n_samples, 11))
    Xtrans = tsvd.transform(Xfmt)
    assert_equal(Xtrans.shape, (n_samples, 11))
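A standalone version of the same check, showing that TruncatedSVD accepts dense and sparse inputs alike (synthetic data; the original test uses module-level X / Xdense fixtures):

import numpy as np
from scipy import sparse
from sklearn.decomposition import TruncatedSVD

X_dense = np.abs(np.random.RandomState(0).randn(40, 30))
for Xfmt in (X_dense, sparse.csr_matrix(X_dense), sparse.csc_matrix(X_dense)):
    Xtrans = TruncatedSVD(n_components=11, random_state=0).fit_transform(Xfmt)
    print(type(Xfmt).__name__, Xtrans.shape)   # (40, 11) for every input format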
Code example #31
def train(n_components, demean, n_samples):
    print("Loading data...")
    movie_titles, ratings, rating_indices, n_users, n_items = get_netflix_data(
        n_samples=n_samples)
    print("number of users with ratings: {}".format(
        len(np.unique(rating_indices[:, 0]))))
    print("number of movies with ratings: {}".format(
        len(np.unique(rating_indices[:, 1]))))
    n_splits = 5
    kf = KFold(n_splits=n_splits, shuffle=True)
    kf.get_n_splits(rating_indices)

    if not n_components:
        components = [5, 10, 15, 20, 30, 50]
        components_loss_path = np.zeros((len(components), n_splits))
        print("Finding optimal number of components...")
        for n, n_components in enumerate(components):
            print("n_components: {}".format(n_components))
            for k, (train_index,
                    test_index) in enumerate(kf.split(rating_indices)):
                mean = None
                print("Fold {}".format(k))
                test_indices = rating_indices[test_index]
                test_indices = (test_indices[:, 0], test_indices[:, 1],
                                test_indices[:, 2])
                if demean:
                    print("De-mean training data...")
                    train_indices = rating_indices[train_index]
                    mean = np.mean(train_indices[:, 2])
                    train_indices = (train_indices[:, 0], train_indices[:, 1],
                                     train_indices[:, 2] - mean)
                    data_train = scipy.sparse.csr_matrix(
                        (train_indices[2],
                         (train_indices[0], train_indices[1])),
                        shape=(n_users, n_items))
                else:
                    user_test_indices, item_test_indices = test_indices[
                        0], test_indices[1]
                    data_train = scipy.sparse.lil_matrix(ratings)
                    data_train[user_test_indices, item_test_indices] = 0
                    data_train = scipy.sparse.csr_matrix(ratings)
                print("Finished de-meaning.")
                start = time.time()
                print("Fitting...")
                svd = TruncatedSVD(n_components=n_components)
                P = svd.fit_transform(data_train)
                Q = svd.components_
                acc, loss = evaluate(P, Q, test_indices, mean=mean)
                print("Elapsed time: {:.1f}s".format(time.time() - start))
                print("loss: {:.4f} - acc: {:.4f}".format(loss, acc))
                components_loss_path[n, k] = loss
        mean_loss = np.mean(components_loss_path, axis=1)
        best_k = components[np.argmin(mean_loss)]
        best_loss = np.amin(mean_loss)
        print("best k: {}, best loss: {:.4f}".format(best_k, best_loss))
    else:
        print("Performing cross validation...")
        mean_acc = 0.0
        mean_loss = 0.0
        for k, (train_index,
                test_index) in enumerate(kf.split(rating_indices)):
            mean = None
            print("Fold {}".format(k))
            test_indices = rating_indices[test_index]
            test_indices = (test_indices[:, 0], test_indices[:, 1],
                            test_indices[:, 2])
            if demean:
                print("De-mean training data...")
                train_indices = rating_indices[train_index]
                mean = np.mean(train_indices[:, 2])
                train_indices = (train_indices[:, 0], train_indices[:, 1],
                                 train_indices[:, 2] - mean)
                data_train = scipy.sparse.csr_matrix(
                    (train_indices[2], (train_indices[0], train_indices[1])),
                    shape=(n_users, n_items))
                print("Finished de-meaning.")
            else:
                user_test_indices, item_test_indices = test_indices[
                    0], test_indices[1]
                data_train = scipy.sparse.lil_matrix(ratings)
                data_train[user_test_indices, item_test_indices] = 0
                data_train = scipy.sparse.csr_matrix(ratings)
            start = time.time()
            print("fitting...")
            svd = TruncatedSVD(n_components=n_components)
            P = svd.fit_transform(data_train)
            Q = svd.components_
            acc, loss = evaluate(P, Q, test_indices, mean=mean)
            print("Elapsed time: {:.4f}".format(time.time() - start))
            print("loss: {:.4f} - acc: {:.4f}".format(loss, acc))
            mean_acc = (mean_acc * k + acc) / (k + 1)
            mean_loss = (mean_loss * k + loss) / (k + 1)
        print("mean loss: {:.4f} - mean acc: {:.4f}".format(
            mean_loss, mean_acc))
Code example #32
File: test2.py Project: winaboy/CSE515P
if __name__ == "__main__":
    print 'loading x_tr...'
    t0 = time.time()
    x_tr = load_csr_matrix_from_npz('../data/processed/tf_idf_transformation/train/matrix.npz')
    print 'loading finished, time = {0}'.format(time.time()-t0)

    print 'loading y_tr...'
    t0 = time.time()
    y_tr = numpy.loadtxt('../data/processed/tf_idf_transformation/train/labels.csv', dtype='int')
    print 'loading finished, time = {0}'.format(time.time()-t0)
 
    print 'running TruncatedSVD...'
    t0 = time.time()
    from sklearn.decomposition import TruncatedSVD
    svd = TruncatedSVD(n_components=100)
    x_tr_new = svd.fit_transform(x_tr, y_tr)
    print 'running TruncatedSVD finished, x_new.shape = {0}, time = {1}'.format(x_tr_new.shape, time.time()-t0)
  
    #delete x_tr
    del x_tr

    print 'fitting model...'
    t0 = time.time()
    from sklearn.multiclass import OneVsRestClassifier
    from sklearn.linear_model import LogisticRegression
    clf = OneVsRestClassifier(LogisticRegression())
    clf.fit(x_tr_new, y_tr)
    print 'fitting finished, time = {0}'.format(time.time()-t0)

    #delete x_tr_new, y_tr
Code example #33
#Load train data
X_origin = pd.read_csv("train_isot.csv", ",")
Y = X_origin['label'].values
X_origin = X_origin['text'].values
print("Train set read.")

stopwords = set(ENGLISH_STOP_WORDS)

svm_vectorizer = TfidfVectorizer(sublinear_tf=True,
                                 max_df=0.73,
                                 stop_words=stopwords)
X = svm_vectorizer.fit_transform(X_origin)

print("Vectorized.")

svd = TruncatedSVD(n_components=200, algorithm='arpack', random_state=42)
print("SVD prepared.")
X = svd.fit_transform(X)

print("SVD finished.")
# tprs = []
# aucs = []
# mean_fpr = np.linspace(0, 1, 100)

# fig, ax = plt.subplots()
score_f = 0
score_a = 0

kf = KFold(n_splits=5, random_state=42, shuffle=True)
for i, (train, test) in enumerate(kf.split(X)):
    X_train = X[train]
Code example #34
                print "generate %s feat" % feat_name
                # tfidf
                tfv = getTFV(ngram_range=ngram_range)
                X_tfidf_train = tfv.fit_transform(
                    dfTrain.iloc[trainInd][column_name])
                X_tfidf_valid = tfv.transform(
                    dfTrain.iloc[validInd][column_name])
                with open("%s/train.%s.feat.pkl" % (path, feat_name),
                          "wb") as f:
                    cPickle.dump(X_tfidf_train, f, -1)
                with open("%s/valid.%s.feat.pkl" % (path, feat_name),
                          "wb") as f:
                    cPickle.dump(X_tfidf_valid, f, -1)

                # svd
                svd = TruncatedSVD(n_components=svd_n_components, n_iter=15)
                X_svd_train = svd.fit_transform(X_tfidf_train)
                X_svd_test = svd.transform(X_tfidf_valid)
                with open(
                        "%s/train.%s_individual_svd%d.feat.pkl" %
                    (path, feat_name, svd_n_components), "wb") as f:
                    cPickle.dump(X_svd_train, f, -1)
                with open(
                        "%s/valid.%s_individual_svd%d.feat.pkl" %
                    (path, feat_name, svd_n_components), "wb") as f:
                    cPickle.dump(X_svd_test, f, -1)

    print("Done.")

    # Re-training
    print("For training and testing...")
Code example #35
def gen_plan_feas(data):
    n = data.shape[0]
    mode_list_feas = np.zeros((n, 12))
    max_dist, min_dist, mean_dist, std_dist = np.zeros((n, )), np.zeros(
        (n, )), np.zeros((n, )), np.zeros((n, ))

    max_price, min_price, mean_price, std_price = np.zeros((n, )), np.zeros(
        (n, )), np.zeros((n, )), np.zeros((n, ))

    max_eta, min_eta, mean_eta, std_eta = np.zeros((n, )), np.zeros(
        (n, )), np.zeros((n, )), np.zeros((n, ))

    min_dist_mode, max_dist_mode, min_price_mode, max_price_mode, min_eta_mode, max_eta_mode, first_mode = np.zeros(
        (n, )), np.zeros((n, )), np.zeros((n, )), np.zeros((n, )), np.zeros(
            (n, )), np.zeros((n, )), np.zeros((n, ))
    mode_texts = []
    for i, plan in tqdm(enumerate(data['plans'].values)):
        try:
            cur_plan_list = json.loads(plan)
        except:
            cur_plan_list = []
        if len(cur_plan_list) == 0:
            mode_list_feas[i, 0] = 1
            first_mode[i] = 0

            max_dist[i] = -1
            min_dist[i] = -1
            mean_dist[i] = -1
            std_dist[i] = -1

            max_price[i] = -1
            min_price[i] = -1
            mean_price[i] = -1
            std_price[i] = -1

            max_eta[i] = -1
            min_eta[i] = -1
            mean_eta[i] = -1
            std_eta[i] = -1

            min_dist_mode[i] = -1
            max_dist_mode[i] = -1
            min_price_mode[i] = -1
            max_price_mode[i] = -1
            min_eta_mode[i] = -1
            max_eta_mode[i] = -1

            mode_texts.append('word_null')
        else:
            distance_list = []
            price_list = []
            eta_list = []
            mode_list = []
            for tmp_dit in cur_plan_list:
                distance_list.append(int(tmp_dit['distance']))
                if tmp_dit['price'] == '':
                    price_list.append(0)
                else:
                    price_list.append(int(tmp_dit['price']))
                eta_list.append(int(tmp_dit['eta']))
                mode_list.append(int(tmp_dit['transport_mode']))
            mode_texts.append(' '.join(
                ['word_{}'.format(mode) for mode in mode_list]))
            distance_list = np.array(distance_list)
            price_list = np.array(price_list)
            eta_list = np.array(eta_list)
            mode_list = np.array(mode_list, dtype='int')
            mode_list_feas[i, mode_list] = 1
            distance_sort_idx = np.argsort(distance_list)
            price_sort_idx = np.argsort(price_list)
            eta_sort_idx = np.argsort(eta_list)

            max_dist[i] = distance_list[distance_sort_idx[-1]]
            min_dist[i] = distance_list[distance_sort_idx[0]]
            mean_dist[i] = np.mean(distance_list)
            std_dist[i] = np.std(distance_list)

            max_price[i] = price_list[price_sort_idx[-1]]
            min_price[i] = price_list[price_sort_idx[0]]
            mean_price[i] = np.mean(price_list)
            std_price[i] = np.std(price_list)

            max_eta[i] = eta_list[eta_sort_idx[-1]]
            min_eta[i] = eta_list[eta_sort_idx[0]]
            mean_eta[i] = np.mean(eta_list)
            std_eta[i] = np.std(eta_list)

            first_mode[i] = mode_list[0]
            max_dist_mode[i] = mode_list[distance_sort_idx[-1]]
            min_dist_mode[i] = mode_list[distance_sort_idx[0]]

            max_price_mode[i] = mode_list[price_sort_idx[-1]]
            min_price_mode[i] = mode_list[price_sort_idx[0]]

            max_eta_mode[i] = mode_list[eta_sort_idx[-1]]
            min_eta_mode[i] = mode_list[eta_sort_idx[0]]

    feature_data = pd.DataFrame(mode_list_feas)
    feature_data.columns = ['mode_feas_{}'.format(i) for i in range(12)]
    feature_data['max_dist'] = max_dist
    feature_data['min_dist'] = min_dist
    feature_data['mean_dist'] = mean_dist
    feature_data['std_dist'] = std_dist

    feature_data['max_price'] = max_price
    feature_data['min_price'] = min_price
    feature_data['mean_price'] = mean_price
    feature_data['std_price'] = std_price

    feature_data['max_eta'] = max_eta
    feature_data['min_eta'] = min_eta
    feature_data['mean_eta'] = mean_eta
    feature_data['std_eta'] = std_eta

    feature_data['max_dist_mode'] = max_dist_mode
    feature_data['min_dist_mode'] = min_dist_mode
    feature_data['max_price_mode'] = max_price_mode
    feature_data['min_price_mode'] = min_price_mode
    feature_data['max_eta_mode'] = max_eta_mode
    feature_data['min_eta_mode'] = min_eta_mode
    feature_data['first_mode'] = first_mode
    print('mode tfidf...')
    tfidf_enc = TfidfVectorizer(ngram_range=(1, 2))
    tfidf_vec = tfidf_enc.fit_transform(mode_texts)
    svd_enc = TruncatedSVD(n_components=10, n_iter=20, random_state=2019)
    mode_svd = svd_enc.fit_transform(tfidf_vec)
    mode_svd = pd.DataFrame(mode_svd)
    mode_svd.columns = ['svd_mode_{}'.format(i) for i in range(10)]

    data = pd.concat([data, feature_data, mode_svd], axis=1)
    data = data.drop(['plans'], axis=1)
    return data
Code example #36
File: clustering.py Project: markdimi/wikisearch
    def hac(self, n_clusters, verbose=False, tfidf=None, n_dimensions=None):
        """ Apply Hierarchical Agglomerative Clustering on a document collection.

        This method generates a hierarchical clustering tree for the collection.
        The leaves of the tree are clusters consisting of single documents.
        The tree is then saved by saving the list of merges in a file.

        Each entry of this list contains the two tree nodes that were merged to
        create a new node and the new node's id. Node ids less than the number
        of leaves represent leaves, while node ids greater than the number of
        leaves indicate internal nodes.

        Args:
            self.corpus (:obj:'Corpus'): The Corpus object of the document
                collection. Defaults to None. Only used when no pre-computed
                Tf/Idf matrix is given.
            tfidf_path (str): The path to the file containing the Tf/Idf matrix
                .pkl file. Defaults to None and in this case the Tf/Idf matrix
                is calculated.
            verbose (bool): When True additional information will be printed.
                Defaults to False.

        Returns:
            hac_model (:obj:'AgglomerativeClustering'): The HAC model fitted on
            the document collection.

        """
        # Compute or load Tf/Idf matrix.
        if tfidf is None:
            tfidf = self.extract_tfidf(self.corpus)
            print(tfidf.shape)

        # Apply latent semantic analysis.
        if n_dimensions is not None:
            print('Performing latent semantic analysis')
            svd = TruncatedSVD(n_dimensions)
            # Normalize SVD results for better clustering results.
            lsa = make_pipeline(svd, Normalizer(copy=False))
            tfidf = lsa.fit_transform(tfidf)

            print(tfidf.shape)

        # Calculate document distance matrix from the Tf/Idf matrix
        print('Constructing distance matrix...')
        dist = 1 - cosine_similarity(tfidf)

        start_time = time.time()
        print('Clustering...')
        # Generate HAC model.
        hac_model = AgglomerativeClustering(linkage='ward',
                                            n_clusters=n_clusters)
        # Fit the model on the distance matrix.
        hac_model.fit(dist)
        end_time = time.time()
        pickle.dump(hac_model, open('hac.pkl', 'wb'))

        if verbose:
            # Visualize cluster model
            children = hac_model.children_
            merges = [{
                'node_id': node_id + len(dist),
                'right': children[node_id, 0],
                'left': children[node_id, 1]
            } for node_id in range(0, len(children))]
            pickle.dump(merges, open('merges.pkl', 'wb'))
            pickle.dump(children, open('children.pkl', 'wb'))

            for merge_entry in enumerate(merges):
                print(merge_entry[1])

        print('Clustering completed after ' +
              str(round((end_time - start_time) / 60)) + "' " +
              str(round((end_time - start_time) % 60)) + "''")
        return hac_model
Code example #37
File: clustering.py Project: markdimi/wikisearch
    def kmeans(self, n_clusters, tfidf=None, n_dimensions=None, verbose=False):
        """ Applies kmeans clustering on a document collection.

        Args:
            self.corpus (:obj:'Corpus'): The Corpus object of the document
                collection. Defaults to None. Only used when no pre-computed
                Tf/Idf matrix is given.
            tfidf_path (str): The path to the file containing the Tf/Idf matrix
                .pkl file. Defaults to None and in this case the Tf/Idf matrix
                is calculated.
            verbose (bool): When True additional information will be printed.
                Defaults to False.

        Returns:
            kmodel (:obj:'Kmeans'): Scikit KMeans clustering model.

        """

        # Compute or load Tf/Idf matrix.
        if tfidf is None:
            tfidf = self.extract_tfidf(self.corpus)
            print(tfidf.shape)

        # Apply latent semantic analysis.
        if n_dimensions is not None:
            print('Performing latent semantic analysis...')
            svd = TruncatedSVD(n_dimensions)
            # Normalize SVD results for better clustering results.
            lsa = make_pipeline(svd, Normalizer(copy=False))
            tfidf = lsa.fit_transform(tfidf)
            print(tfidf.shape)

        # Do the clustering.
        start_time = time.time()
        print('Clustering...')
        kmodel = MiniBatchKMeans(n_clusters=n_clusters,
                                 init='k-means++',
                                 n_init=1,
                                 max_iter=10,
                                 verbose=True)
        kmodel.fit(tfidf)
        end_time = time.time()

        # Create a matching of the clusters and the ids of the documents
        # they contain.
        cluster_doc = pd.Series()
        for i in range(kmodel.n_clusters):
            ids = []
            for docid, cluster in enumerate(kmodel.labels_):
                if cluster == i:
                    ids.append(docid)
                    cluster_doc.loc[i] = ids

        if verbose:
            # Print some info.
            print("Top terms per cluster:")
            if n_dimensions is not None:
                original_space_centroids = svd.inverse_transform(
                    kmodel.cluster_centers_)
                order_centroids = original_space_centroids.argsort()[:, ::-1]
            else:
                order_centroids = kmodel.cluster_centers_.argsort()[:, ::-1]

            features = pickle.load(open('features.pkl', 'rb'))
            cluster_word = pd.Series()
            for i in range(n_clusters):
                cluster_features = []
                print("Cluster %d:" % i)
                for ind in order_centroids[i, :100]:
                    cluster_features.append(features[ind])
                cluster_word.loc[i] = cluster_features

        pickle.dump(kmodel, open('kmodel.pkl', 'wb'))
        pickle.dump(kmodel.cluster_centers_, open('centers.pkl', 'wb'))
        pickle.dump(cluster_doc, open('cluster_doc.pkl', 'wb'))
        pickle.dump(cluster_word, open('cluster_word.pkl', 'wb'))

        print('Clustering completed after ' +
              str(round((end_time - start_time) / 60)) + "' " +
              str(round((end_time - start_time) % 60)) + "''")

        return kmodel
Code example #38
]

dataset = fetch_20newsgroups(subset='all',
                             categories=categories,
                             shuffle=True,
                             random_state=42)
labels_true = dataset.target
true_k = np.unique(labels_true).shape[0]
# t0 = time()
vectorizer = TfidfVectorizer(max_df=0.5,
                             max_features=10000,
                             min_df=2,
                             stop_words='english',
                             use_idf=True)
X = vectorizer.fit_transform(dataset.data)
svd = TruncatedSVD(2)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
X = lsa.fit_transform(X)
# explained_variance = svd.explained_variance_ratio_.sum()

Wardhierarchial = AgglomerativeClustering(affinity='euclidean',
                                          compute_full_tree='auto',
                                          connectivity=None,
                                          linkage='ward',
                                          memory=None,
                                          n_clusters=2,
                                          pooling_func='deprecated').fit(X)
labels = Wardhierarchial.labels_
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
Code example #39
    d2 = pdt_ttl_vec[i, :]
    dst_srch_ttl1[i] = cosine_similarity(d1, d2)

dst_srch_desc1 = np.zeros(srch_vec.shape[0])
for i in range(srch_vec.shape[0]):
    d1 = srch_vec[i, :]
    d2 = pdt_desc_vec[i, :]
    dst_srch_desc1[i] = cosine_similarity(d1, d2)

dst_ttl_desc1 = np.zeros(srch_vec.shape[0])
for i in range(srch_vec.shape[0]):
    d1 = pdt_ttl_vec[i, :]
    d2 = pdt_desc_vec[i, :]
    dst_ttl_desc1[i] = cosine_similarity(d1, d2)  # fill the title/description array (was overwriting dst_srch_desc1)

svd = TruncatedSVD(n_components=30, random_state=2016)

srch_vec = svd.fit_transform(srch_vec)
pdt_ttl_vec = svd.fit_transform(pdt_ttl_vec)
pdt_desc_vec = svd.fit_transform(pdt_desc_vec)

srch_vec = pd.DataFrame(
    srch_vec, columns=['srch_vec_' + str(i) for i in range(srch_vec.shape[1])])
pdt_ttl_vec = pd.DataFrame(
    pdt_ttl_vec,
    columns=['ttl_vec_' + str(i) for i in range(pdt_ttl_vec.shape[1])])
pdt_desc_vec = pd.DataFrame(
    pdt_desc_vec,
    columns=['desc_vec_' + str(i) for i in range(pdt_desc_vec.shape[1])])

id = list(df_all['id'])
Code example #40
                transformer_list=[

                    # Pipeline for pulling features from the post's title line
                    ('title',
                     Pipeline([
                         ('selector', ItemSelector(key='title')),
                         ('tfidf',
                          TfidfVectorizer(min_df=50, stop_words='english')),
                     ])),

                    # Pipeline for standard bag-of-words model for abstract
                    ('abstract_bow',
                     Pipeline([
                         ('selector', ItemSelector(key='abstract')),
                         ('tfidf', TfidfVectorizer(stop_words='english')),
                         ('best', TruncatedSVD(n_components=50)),
                     ])),

                    # Pipeline for pulling ad hoc features from post's abstract
                    (
                        'abstract_stats',
                        Pipeline([
                            ('selector', ItemSelector(key='abstract')),
                            ('stats', TextStats()),  # returns a list of dicts
                            ('vect', DictVectorizer()
                             ),  # list of dicts -> feature matrix
                        ])),
                ],

                # weight components in FeatureUnion
                transformer_weights={
Code example #41
le = preprocessing.LabelEncoder()
le.fit(df["Category"])
Y_train=le.transform(df["Category"])
X_train1=df['Content'] 
X_train2=[]
for i in range(len(X_train1)):
	X_train2.append(10*df['Title'][i]+df['Content'][i])

X_train=np.array(X_train2)

#read test file
df_test=pd.read_csv("test_set.csv",sep="\t")

vectorizer=CountVectorizer(stop_words='english')
transformer=TfidfTransformer()
svd=TruncatedSVD(n_components=200, random_state=42) 
pipeline_test = Pipeline([
    ('vect', vectorizer),
    ('tfidf', transformer),
    ('svd',svd),
])
#My method---Voting Classifier
clf1 = BernoulliNB(fit_prior=False)
clf2 = KNeighborsClassifier(weights='distance',n_jobs=-1)
clf3 = RandomForestClassifier(n_estimators=500,n_jobs=-1)
clf = VotingClassifier(estimators=[('bnb',clf1),('knn',clf2),('rf',clf3)], voting='hard')
pipeline = Pipeline([
    ('vect', vectorizer),
    ('tfidf', transformer),
    ('svd',svd),
    ('clf', clf)
Code example #42
# print titles

for sentence in test_data['Content']:
    temp_title = ''

    for j in range(10):
        temp_title = titles2[i] + ' ' + temp_title

    sentences2.append(temp_title + PorterStemmer().stem_sentence(sentence))
    i = i + 1

#Vectorizing-LSI-Classifier
X_train = np.array(sentences)
X_test = np.array(sentences2)
clf = Pipeline([('tfidf', TfidfVectorizer(stop_words=stopw)),\
                ('svd' , TruncatedSVD(n_components=1000) ),\
                ('clf', svm.SVC(C=10, gamma = 0.0001, kernel= 'linear', class_weight='balanced')),
               ])

clf.fit(X_train, y)
predicted = clf.predict(X_test)

#Print Results
categories = le.inverse_transform(predicted)

i = 0
CsvData2 = [['Id', 'Category']]

for t in test_data['Id']:
    CsvData2.append([t, categories[i]])
    i = i + 1
Code example #43
def svd(*args, **kwargs):
    return TruncatedSVD(*args, **kwargs)
Code example #44
    def test_pipeline_column_transformer(self):

        iris = datasets.load_iris()
        X = iris.data[:, :3]
        y = iris.target
        X_train = pandas.DataFrame(X, columns=["vA", "vB", "vC"])
        X_train["vcat"] = X_train["vA"].apply(lambda x: "cat1"
                                              if x > 0.5 else "cat2")
        X_train["vcat2"] = X_train["vB"].apply(lambda x: "cat3"
                                               if x > 0.5 else "cat4")
        y_train = y % 2
        numeric_features = [0, 1, 2]  # ["vA", "vB", "vC"]
        categorical_features = [3, 4]  # ["vcat", "vcat2"]

        classifier = LogisticRegression(
            C=0.01,
            class_weight=dict(zip([False, True], [0.2, 0.8])),
            n_jobs=1,
            max_iter=10,
            solver="lbfgs",
            tol=1e-3,
        )

        numeric_transformer = Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler()),
        ])

        categorical_transformer = Pipeline(steps=[
            (
                "onehot",
                OneHotEncoder(sparse=True, handle_unknown="ignore"),
            ),
            (
                "tsvd",
                TruncatedSVD(n_components=1, algorithm="arpack", tol=1e-4),
            ),
        ])

        preprocessor = ColumnTransformer(transformers=[
            ("num", numeric_transformer, numeric_features),
            ("cat", categorical_transformer, categorical_features),
        ])

        model = Pipeline(steps=[("preprocessor", preprocessor),
                                ("classifier", classifier)])

        model.fit(X_train, y_train)
        initial_type = [
            ("numfeat", FloatTensorType([None, 3])),
            ("strfeat", StringTensorType([None, 2])),
        ]

        X_train = X_train[:11]
        model_onnx = convert_sklearn(model, initial_types=initial_type)

        dump_data_and_model(
            X_train,
            model,
            model_onnx,
            basename="SklearnPipelineColumnTransformerPipeliner",
            allow_failure="StrictVersion(onnx.__version__)"
            " < StrictVersion('1.3') or "
            "StrictVersion(onnxruntime.__version__)"
            " <= StrictVersion('0.4.0')",
        )

        if __name__ == "__main__":
            from onnx.tools.net_drawer import GetPydotGraph, GetOpNodeProducer

            pydot_graph = GetPydotGraph(
                model_onnx.graph,
                name=model_onnx.graph.name,
                rankdir="TB",
                node_producer=GetOpNodeProducer("docstring"),
            )
            pydot_graph.write_dot("graph.dot")

            import os

            os.system("dot -O -Gdpi=300 -Tpng graph.dot")
コード例 #45
0
    'adenoma', 'neurocitoma', 'cervello', 'glioma', 'glioblastoma', 'glia',
    'lipoma', 'liposarcoma', 'adiposo'
]

pairs = {
    'b': [('fibroma', 'fibrosarcoma'), ('lipoma', 'liposarcoma'),
          ('osteoma', 'osteosarcoma'), ('papilloma', 'carcinoma'),
          ('adenoma', 'adenocarcinoma'), ('glioma', 'glioblastoma'),
          ('neurocitoma', 'neuroblastoma')],
    'r': [('fibrosarcoma', 'connettivo'), ('liposarcoma', 'adiposo'),
          ('linfoma', 'linfonodo'), ('osteosarcoma', 'osso'),
          ('mesotelioma', 'mesotelio'), ('glioblastoma', 'glia'),
          ('neuroblastoma', 'neuroni')]
}

proj = Projector(n_components=2, random_state=0)
np.set_printoptions(suppress=True)
Y = proj.fit_transform(vectors[[index[k] for k in subset], :])
labels = words[[index[k] for k in subset]]

plt.scatter(Y[:, 0], Y[:, 1])
for label, x, y in zip(labels, Y[:, 0], Y[:, 1]):
    plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')

for color in pairs:
    for pair in pairs[color]:
        plt.plot([Y[subset.index(pair[0]), 0], Y[subset.index(pair[1]), 0]],
                 [Y[subset.index(pair[0]), 1], Y[subset.index(pair[1]), 1]],
                 '-',
                 color=color)
コード例 #46
0
# Transposing the matrix

X = ratings_matrix.T
X.head()

# X = ratings_matrix
# X.head()
X.shape


X1 = X

#Decomposing the Matrix

SVD = TruncatedSVD(n_components=10)
decomposed_matrix = SVD.fit_transform(X)
decomposed_matrix.shape

#Correlation Matrix

correlation_matrix = np.corrcoef(decomposed_matrix)
correlation_matrix.shape

X.index[75]

# Index of product ID purchased by customer

i = "B00000K135"

product_names = list(X.index)
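The snippet stops right after collecting the product IDs. A hedged sketch of the usual next step — recommending the items most correlated with the purchased product (the 0.90 threshold is an assumption):

# Position of the purchased product in the ratings matrix
product_ID = product_names.index(i)

# Correlation of every product with the purchased one
correlation_product_ID = correlation_matrix[product_ID]

# Recommend strongly correlated products, excluding the item itself
recommend = list(X.index[correlation_product_ID > 0.90])
if i in recommend:
    recommend.remove(i)
print(recommend[:10])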
コード例 #47
0
    
'''
    Modeling:
        
        - TF-IDF
        - SVD
        - Visualization
'''

# TF-IDF
vec = TfidfVectorizer(max_features = 1000, max_df = 0.5, smooth_idf = True)

x = vec.fit_transform(joined)

# SVD
svd = TruncatedSVD(n_components = 20, algorithm = 'randomized', n_iter = 100, random_state = 100)

svd.fit(x)

# Labels
terms = vec.get_feature_names()

for i, comp in enumerate(svd.components_):
    
    terms_comp = zip(terms, comp)
    sorted_terms = sorted(terms_comp, key = lambda x:x[1], reverse = True)[:7]
    
    print("Topic "+ str(i) + ": ", [t[0] for t in sorted_terms])
    
# Visualize
topics = svd.fit_transform(x)
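The '# Visualize' step produces the document-topic matrix but never plots it. A minimal hedged sketch, assuming matplotlib is available, that scatters the documents over the first two SVD components coloured by their dominant topic:

import numpy as np
import matplotlib.pyplot as plt

dominant_topic = np.argmax(topics, axis=1)      # strongest topic per document
plt.scatter(topics[:, 0], topics[:, 1], c=dominant_topic, cmap='tab20', s=10)
plt.xlabel('SVD component 1')
plt.ylabel('SVD component 2')
plt.title('Documents in LSA space')
plt.show()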
コード例 #48
0
    def post(self):

        # Get the THEME labels
        abs_filename = ett_h.generate_dynamic_path(
            [base_folder_location, LabelType.THEME.value, label_file_name])
        labels = (ett_h.load_data_common_separated(abs_filename, ','))
        # Get the label data from input_data
        raw_label = TrainThemeUpload.input_data[ColumnName.LABEL.value]
        data = ett_t.transform_data_to_dataframe_basic(
            TrainThemeUpload.input_data, colnames)
        # Get the OneHotEncoded labels
        label_df = ett_t.one_hot_encoding(raw_label)  #17 labels dataframe
        # Rename the OneHotEncoded labels
        label_df.columns = labels
        # Get the number of labels
        num_of_labels = len(labels)
        # Data preprocessing
        nan_cleaned_data = ett_c.clean_dataframe_by_regex(
            data, RegexFilter.NON_ALPHA_NUMERIC.value
        )  # Removed all non alphanumeric characters
        d_cleaned_data = ett_c.clean_dataframe_by_regex(
            nan_cleaned_data,
            RegexFilter.DIGITS_ONLY.value)  # Removed all digits
        l_cleaned_data = ett_c.remove_non_iso_words(
            d_cleaned_data, Language.ENGLISH.value)  # Remove non-English text
        rew_cleaned_data = ett_c.remove_language_stopwords(
            l_cleaned_data, Language.ENGLISH.name)  # Remove English stop words
        l_transformed_data = ett_t.lowercase(
            rew_cleaned_data)  # Transform text to lowercase
        le_transformed_data = ett_t.stemming_mp(
            l_transformed_data
        )  # Transform text to core words i.e. playing > play
        data = le_transformed_data  # Return the newly transformed data

        # Split the data into 0.8 training datasets and 0.2 testing datasets
        X_train, X_test, y_train, y_test = train_test_split(data,
                                                            label_df,
                                                            test_size=0.2,
                                                            random_state=42)
        endpoint_output = {}
        for i in range(num_of_labels):
            model_id = str(i)
            single_label = y_train.iloc[:, i]
            label = labels[i]
            print("label", label)
            pipeline = imbPipeline([
                (ModelType.TFIDF.value,
                 TfidfVectorizer()),  # Data vectorization
                (ModelType.OVERSAMPLE.value,
                 SMOTE(random_state=42)),  # Data balancing
                (ModelType.SVD.value, TruncatedSVD()),  # Feature selection
                (ModelType.NOR.value,
                 preprocessing.MinMaxScaler()),  # Data normalization
                (ModelType.CLF.value, OneVsRestClassifier(SVC()))
            ])  # Classification

            #list_c = [.1, .2, .3, .4, .5, .6, .7, .8, .9, 1]
            list_c = [1]

            #list_n = [100, 150, 200, 250, 300, 350, 400, 450, 500, 550]
            list_n = [100]
            # Remember to add [2, 2]
            best_score = 0
            epsilon = .005
            dictionary = {}

            for para_c in list_c:
                for para_n in list_n:
                    parameters = {
                        ModelType.TFIDF.value: [
                            TfidfVectorizer(max_features=800,
                                            ngram_range=(1, 4),
                                            norm='l2',
                                            encoding='latin-1',
                                            stop_words='english',
                                            analyzer='word')
                        ],
                        ModelType.SVD.value: [
                            TruncatedSVD(n_components=para_n,
                                         n_iter=7,
                                         random_state=42)
                        ],
                        ModelType.CLF.value: [
                            OneVsRestClassifier(
                                SVC(kernel='linear',
                                    probability=True,
                                    C=para_c))
                        ]
                    }
                    gs_clf = GridSearchCV(pipeline,
                                          parameters,
                                          cv=5,
                                          error_score='raise',
                                          scoring='f1')
                    gs_clf = gs_clf.fit(X_train, single_label)
                    current_score = gs_clf.best_score_
                    dictionary[current_score] = parameters

            for current_score in dictionary.keys():
                if current_score - epsilon > best_score:
                    best_score = current_score

            model_dict = dictionary[best_score]

            label_model_list = {}
            label_model_list['score'] = best_score

            folder_time = time.strftime("_%Y%m%d_%H%M")
            # Create Directory in the AWS S3 Bucket
            os.mkdir("/Users/yihanbao/Desktop/unisdr-training/theme/" + label +
                     "/" + label + folder_time)
            # Navigate to AWS model saving folder
            model_folder = os.path.join(
                os.path.dirname(
                    os.path.dirname(
                        os.path.dirname(
                            os.path.dirname(os.path.realpath(__file__))))),
                ett_h.generate_dynamic_path(
                    [LabelType.THEME.value, label, label + folder_time]))
            """
            # Connect to AWS
            conn = boto.s3.connect_to_region(" ",aws_access_key_id = 'AWS-Access-Key', aws_secret_access_key = 'AWS-Secrete-Key',
                                 calling_format = boto.s3.connection.OrdinaryCallingFormat())

            bucket = conn.get_bucket("oict-psdg-unisdr-train-models-v1")
        
            # AWS Key 
            aws_path = ett_h.generate_dynamic_path([LabelType.THEME.value, label, timestamp+label])
            """
            # Here to fit the training datasets to the  models with best score
            # vectorization
            vector = model_dict[ModelType.TFIDF.value][0].fit(
                X_train, single_label)
            ett_h.save_model(
                vector,
                ett_h.generate_dynamic_path(
                    [model_folder, label + folder_time + vector_model_name]))
            vectorized_df = vector.transform(X_train)
            label_model_list[
                URLName.VECURL.value] = ett_h.generate_dynamic_path(
                    [model_folder, label + folder_time + vector_model_name])
            """
            key_name = timestamp+label+model_name
            full_key_name = os.path.join(path, key_name)
            
            pickle_byte_obj = pickle.dump(vector) 
            s3_resource = resource('s3')
            s3_resource.Object(bucket,full_key_name).put(Body=pickle_byte_obj)
            """
            # Balancing
            sm = SMOTE(random_state=42)
            X_res, y_res = sm.fit_resample(vectorized_df, single_label)

            # Feature selection
            svd = model_dict[ModelType.SVD.value][0].fit(X_res, y_res)
            ett_h.save_model(
                svd,
                ett_h.generate_dynamic_path([
                    model_folder, label + folder_time + dim_reductor_model_name
                ]))
            dim_reductor_df = svd.transform(X_res)
            label_model_list[
                URLName.DIMURL.value] = ett_h.generate_dynamic_path([
                    model_folder, label + folder_time + dim_reductor_model_name
                ])
            """
            key_name = timestamp+label+dim_reductor_model_name
            full_key_name = os.path.join(path, key_name)
            
            pickle_byte_obj = pickle.dump(svd) 
            s3_resource = resource('s3')
            s3_resource.Object(bucket,full_key_name).put(Body=pickle_byte_obj)
            """

            # Normalizing
            min_max_scaler = preprocessing.MinMaxScaler()
            nor_model = min_max_scaler.fit(dim_reductor_df, y_res)
            ett_h.save_model(
                nor_model,
                ett_h.generate_dynamic_path([
                    model_folder, label + folder_time + normalizar_model_name
                ]))
            scaled_df = nor_model.transform(dim_reductor_df)
            label_model_list[
                URLName.NORURL.value] = ett_h.generate_dynamic_path([
                    model_folder, label + folder_time + normalizar_model_name
                ])
            """
            key_name = timestamp+label+normalizar_model_name
            full_key_name = os.path.join(path, key_name)
            
            pickle_byte_obj = pickle.dump(nor_model) 
            s3_resource = resource('s3')
            s3_resource.Object(bucket,full_key_name).put(Body=pickle_byte_obj)
            """

            # Classifier
            clf = model_dict[ModelType.CLF.value][0].fit(scaled_df, y_res)
            ett_h.save_model(
                clf,
                ett_h.generate_dynamic_path(
                    [model_folder, label + folder_time + model_name]))
            label_model_list[
                URLName.MODURL.value] = ett_h.generate_dynamic_path(
                    [model_folder, label + folder_time + model_name])
            """
            key_name = timestamp+label+model_name
            full_key_name = os.path.join(path, key_name)
            
            pickle_byte_obj = pickle.dump(scaled_df) 
            s3_resource = resource('s3')
            s3_resource.Object(bucket,full_key_name).put(Body=pickle_byte_obj)
            """
            endpoint_output[model_id] = [label_model_list]
        output = json.dumps(endpoint_output)
        return output
コード例 #49
0
ファイル: LSA+Lab.py プロジェクト: djlee1987/LSA-Lab
                'nntp', '00041032', '000062david42', '000050', '00041555', '0004244402', 'mcimail', '00043819',
                'prb', '0004246', '0004422', '00044513', '00044939','access', 'digex', 'host', 'would', 'writes',
                'posting', 'dseg'])


# In[5]:

vectorizer = TfidfVectorizer(stop_words=stopset,
                                use_idf=True, ngram_range = (1, 3))
X = vectorizer.fit_transform(corpus)


# In[6]:

#decompose X into U * Sigma * V^T (truncated SVD / LSA)
lsa = TruncatedSVD(n_components = 25, n_iter = 100)
lsa.fit(X)


# In[7]:

terms = vectorizer.get_feature_names()
for i, comp in enumerate(lsa.components_):
    termsInComp = zip (terms, comp)
    sortedTerms = sorted(termsInComp, key = lambda x: x[1], reverse = True) [:10]
    print("Concept %d:" % i )
    for term in sortedTerms:
        print(term[0])
    print(" ")

コード例 #50
0
ファイル: featurizer.py プロジェクト: markusstocker/pynpf
def svd_vector(data):
    # .ix was removed from pandas; use positional .iloc to take the columns from index 6 onward
    svd = TruncatedSVD(n_components=1)
    vector = svd.fit_transform(data.iloc[:, 6:].transpose())
    return [item for sublist in vector for item in sublist]
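A hedged usage example for svd_vector with a synthetic DataFrame standing in for the real input; the layout (six metadata columns followed by numeric features) is only inferred from the data.iloc[:, 6:] slice:

import numpy as np
import pandas as pd

# 5 rows: 6 metadata columns followed by 8 numeric feature columns (synthetic)
meta = pd.DataFrame({'meta%d' % k: ['x'] * 5 for k in range(6)})
feats = pd.DataFrame(np.random.rand(5, 8), columns=['f%d' % k for k in range(8)])
frame = pd.concat([meta, feats], axis=1)

vec = svd_vector(frame)   # one value per feature column after the 1-component SVD
print(len(vec))           # 8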
コード例 #51
0
if len(fns) > 1:
    print('Multiple merged embeddings in working directory.')
    sys.exit()
else:
    m = fns[0]

print('Reading raw.')
sys.stdout.flush()
df = pd.read_csv(m, index_col=0, header=None)
if df.index.names[0] == 0:
    print('Renaming index column to SampleID.')
    df.index.names = ['SampleID']
    df.to_csv(m, compression='gzip')

mat = df.to_numpy().T
sampids = df.index
del df

print('Performing svd.')
sys.stdout.flush()
svd = TruncatedSVD(n_components=1, n_iter=7, random_state=0)
svd.fit(mat)
pc = svd.components_
mat -= mat.dot(pc.T) * pc  # subtract the projection onto the top principal component

print('Saving nonraw.')
sys.stdout.flush()
df = pd.DataFrame(mat.T, index=sampids)
df.index.names = ['SampleID']
df.to_csv(m.replace('_raw', ''), compression='gzip')
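A quick hedged sanity check for the component-removal step above: after subtracting the projection, every sample should be (numerically) orthogonal to the removed direction:

import numpy as np

# Dot products with the removed component should be ~0
residual = np.abs(mat.dot(pc.T)).max()
print('max |projection on removed component|:', residual)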
コード例 #52
0
def do_lsa(X, target_dim):
    svd = TruncatedSVD(target_dim, random_state=42)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    return lsa.fit_transform(X)
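A hedged usage sketch for do_lsa, pairing it with a TfidfVectorizer; the toy corpus and target dimensionality are assumptions:

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["the cat sat on the mat",
        "the dog sat on the log",
        "cats and dogs are pets"]
X_tfidf = TfidfVectorizer().fit_transform(docs)

X_lsa = do_lsa(X_tfidf, target_dim=2)   # L2-normalised 2-D LSA embedding
print(X_lsa.shape)                      # (3, 2)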
コード例 #53
0
def main(argv):
    choose_mindf = argv[1]
    try:
        path = argv[2]
    except IndexError:  # no path argument supplied
        path = None
    categories1 = [
        'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
        'comp.sys.mac.hardware', 'rec.autos', 'rec.motorcycles',
        'rec.sport.baseball', 'rec.sport.hockey'
    ]
    categories2 = [
        'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'misc.forsale',
        'soc.religion.christian'
    ]
    cat_all = [
        'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'misc.forsale',
        'soc.religion.christian', 'alt.atheism', 'comp.graphics',
        'comp.os.ms-windows.misc', 'comp.windows.x', 'rec.autos',
        'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey',
        'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space',
        'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc',
        'talk.religion.misc'
    ]
    dclass = Data(categories1, cat_all, categories2, path)
    stop_words = text.ENGLISH_STOP_WORDS

    print('-----Part A-----')
    #plot_histogram(dclass)
    print('-----Part B-----')

    vectorizer2 = CountVectorizer(min_df=2, stop_words=stop_words, max_df=0.8)
    tfidf_transformer2 = TfidfTransformer()

    vectorizer5 = CountVectorizer(min_df=5, stop_words=stop_words, max_df=0.8)
    tfidf_transformer5 = TfidfTransformer()
    tfidf2 = preprocess(dclass,
                        dclass.training_data1,
                        vectorizer2,
                        tfidf_transformer2,
                        train=True)
    tfidf5 = preprocess(dclass,
                        dclass.training_data1,
                        vectorizer5,
                        tfidf_transformer5,
                        train=True)  #default min_df=5

    print('# of terms with min_df = 2:', tfidf2[0, :].toarray().shape[1],
          '\n# of terms with min_df = 5:', tfidf5[0, :].toarray().shape[1])

    d_tfidf = {'2': tfidf2, '5': tfidf5}
    d_vectorizer = {'2': vectorizer2, '5': vectorizer5}
    d_transformer = {'2': tfidf_transformer2, '5': tfidf_transformer5}

    print('-----Part C-----')

    vectorizerc_2 = CountVectorizer(min_df=2,
                                    stop_words=stop_words,
                                    max_df=0.8)
    tfidf_transformerc_2 = TfidfTransformer()

    tfidf_c_2 = preprocess(dclass,
                           dclass.training_data2,
                           vectorizerc_2,
                           tfidf_transformerc_2,
                           train=True,
                           ICF=True)  #default min_df=5, use TF-ICF
    find_10most(dclass, tfidf_c_2)

    vectorizerc_5 = CountVectorizer(min_df=5,
                                    stop_words=stop_words,
                                    max_df=0.8)
    tfidf_transformerc_5 = TfidfTransformer()

    tfidf_c_5 = preprocess(dclass,
                           dclass.training_data2,
                           vectorizerc_5,
                           tfidf_transformerc_5,
                           train=True,
                           ICF=True)  #default min_df=5, use TF-ICF
    find_10most(dclass, tfidf_c_5)

    print('-----Part D-----')  #SVD and NMF base on TF-IDF5 result
    svd = TruncatedSVD(n_components=50, n_iter=7, random_state=42)
    D_LSI = svd.fit_transform(d_tfidf[choose_mindf])
    model = NMF(n_components=50, init='random', random_state=0)
    D_NMF = model.fit_transform(d_tfidf[choose_mindf])
    print('LSI.shape:', D_LSI.shape, '\nNMF.shape:', D_NMF.shape)

    print('-----Part E-----')  #SVM
    tfidftest = preprocess(dclass,
                           dclass.testing_data1,
                           d_vectorizer[choose_mindf],
                           d_transformer[choose_mindf],
                           train=False)  #testing data
    D_LSI_test = svd.transform(tfidftest)
    D_NMF_test = model.transform(tfidftest)
    print('for D_LSI:')
    part_e(dclass, D_LSI, D_LSI_test)
    print('for D_NMF:')
    part_e(dclass, D_NMF, D_NMF_test)

    print('-----Part F-----')
    print('for D_LSI:')
    part_f(dclass, D_LSI, D_LSI_test)
    print('for D_NMF:')
    part_f(dclass, D_NMF, D_NMF_test)

    print('-----Part G-----')
    part_g(dclass, D_NMF, D_NMF_test, dclass.training_target1)

    print('-----Part H-----')
    part_h(dclass, D_LSI, D_LSI_test)
    part_h(dclass, D_NMF, D_NMF_test)

    print('-----Part I-----')
    part_i(dclass, D_LSI, D_LSI_test)
    part_i(dclass, D_NMF, D_NMF_test)

    print('-----Part J-----')

    tfidf2_j = preprocess(dclass,
                          dclass.training_dataj,
                          vectorizer2,
                          tfidf_transformer2,
                          train=True)
    D_LSI_j = svd.fit_transform(tfidf2_j)
    D_NMF_j = model.fit_transform(tfidf2_j)

    tfidftest_j = preprocess(dclass,
                             dclass.testing_dataj,
                             vectorizer2,
                             tfidf_transformer2,
                             train=False)  #testing data
    D_LSI_test_j = svd.transform(tfidftest_j)
    D_NMF_test_j = model.transform(tfidftest_j)

    print('----------------Naive Bayes in J-----------------')
    part_g(dclass, D_NMF_j, D_NMF_test_j, dclass.training_targetj, True)

    print('----------------SVM in J with LSI data-----------')
    part_j_SVM(dclass, D_LSI_j, D_LSI_test_j)

    print('----------------SVM in J with NMF data-----------')
    part_j_SVM(dclass, D_NMF_j, D_NMF_test_j)
コード例 #54
0
ファイル: learner.py プロジェクト: ManuelZ/Upwork-Downloader
def train_with_bag_of_words(X_train,
                            y_train,
                            scorer,
                            classifier='SVC',
                            search=True):
    """
    Pass the data through a pipeline and return a trained model.

    Args:
        X_train: Train data
        y_train: Labels for the train data (transformed by LabelEncoder)
        search : Whether to search for the best hyperparameters
    """

    estimators = {
        'SVC':
        SVC(
            C=5.1,
            kernel='linear',
            decision_function_shape='ovr',
            #class_weight            = 'balanced' # better without 'balanced'
        ),
        'LogisticRegression':
        LogisticRegression(C=5.1, ),
        'GradientBoostingClassifier':
        GradientBoostingClassifier(learning_rate=0.3),
    }

    if classifier != 'VotingClassifier':
        clf = estimators.get(classifier)
    else:
        estimators['SVC'].probability = True
        clf = VotingClassifier(estimators=[(k, v)
                                           for k, v in estimators.items()],
                               voting='soft')

    print(clf)

    pipeline = Pipeline(
        [
            (
                'col_transf',
                ColumnTransformer(
                    [
                        ('scaler', StandardScaler(), [
                            'budget', 'client.feedback',
                            'client.reviews_count', 'client.jobs_posted',
                            'client.past_hires'
                        ]),
                        ('title_vec',
                         Pipeline([
                             ('preprocessor', SpacyPreprocessor()),
                             ('tfidf',
                              TfidfVectorizer(tokenizer=identity,
                                              preprocessor=None,
                                              lowercase=False,
                                              use_idf=True,
                                              ngram_range=(2, 2))),
                             ('svd', TruncatedSVD(n_components=150)),
                         ]), 'title'),
                        (
                            'snippet_vec',
                            Pipeline([
                                ('preprocessor', SpacyPreprocessor()),
                                (
                                    'tfidf',
                                    TfidfVectorizer(
                                        tokenizer=identity,
                                        preprocessor=None,
                                        lowercase=False,
                                        use_idf=True,
                                        sublinear_tf=
                                        False,  # not good results when True
                                        ngram_range=(1, 2))),
                                ('svd', TruncatedSVD(n_components=100)),
                            ]),
                            'snippet'),
                        ('cat', ce.CatBoostEncoder(),
                         ["job_type", 'category2', 'client.country']),
                    ],
                    remainder='drop')),

            #('oversampling', ADASYN(random_state=42)),
            ('classifier', clf),
        ],
        verbose=True)

    if search:

        log_space = gen_parameters_from_log_space(low_value=5,
                                                  high_value=8,
                                                  n_samples=10)

        lin_space = np.arange(2, 8, 2, dtype=int)  # np.int was removed from NumPy

        if classifier == 'SVC':
            grid = {
                # 'union__title_vec__tfidf__ngram_range'   : [(1,2), (2,2)],
                # 'union__snippet_vec__tfidf__ngram_range' : [(1,2), (2,2)],
                # 'union__snippet_vec__svd__n_components'  : np.arange(50, 301, 50),
                # 'union__title_vec__svd__n_components'    : np.arange(100, 301, 50),
                'classifier__C': log_space,
            }

        elif classifier == 'LogisticRegression':
            grid = {
                'classifier__C': gen_parameters_from_log_space(0.1, 10, 10),
            }

        elif classifier == 'GradientBoostingClassifier':
            grid = {
                'classifier__learning_rate':
                gen_parameters_from_log_space(0.01, 1, 10),
            }

        elif classifier == 'VotingClassifier':
            # Sub-estimator parameters must be addressed through their names
            # inside the VotingClassifier ('SVC', 'LogisticRegression', ...).
            grid = {
                'classifier__LogisticRegression__C':
                gen_parameters_from_log_space(0.1, 10, 10),
                'classifier__SVC__C':
                gen_parameters_from_log_space(5, 8, 10),
                'classifier__GradientBoostingClassifier__learning_rate':
                gen_parameters_from_log_space(0.01, 1, 10),
            }

        # With scoring="ovo", computes the average AUC of all possible pairwise
        # combinations of classes. Insensitive to class imbalance when
        # average='macro'.
        # Also see: https://stackoverflow.com/a/62471736/1253729

        searcher = GridSearchCV(
            estimator=pipeline,
            param_grid=grid,
            n_jobs=4,
            return_train_score=True,
            refit=True,
            verbose=True,
            cv=StratifiedKFold(n_splits=3),
            scoring=scorer,
        )

        model = searcher.fit(X_train, y_train.values.ravel())
        print(f"Best found parameters: {searcher.best_params_}")

    else:
        model = pipeline.fit(X_train, y_train.values.ravel())

    return model
コード例 #55
0
ファイル: klusterV4.py プロジェクト: mikeparowski/kluster
        rows.append({'text': text, 'class': classification})
        index.append(filename)

    data_frame = DataFrame(rows, index=index)
    return data_frame

data = DataFrame({'text': [], 'class': []})
for path, classification in SOURCES:
    data = data.append(build_data_frame(path, classification))

data = data.reindex(np.random.permutation(data.index))
## now split files into training data and labels. probably tuple (filename, r/d)

classifier = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('svd', TruncatedSVD(algorithm='randomized', n_components=300)),
    ('clf', XGBClassifier())
])

#classifier.fit(data['text'].values, data['class'].values)

k_fold = KFold(n_splits=8)
scores = []
confusion = np.array([[0, 0], [0, 0]])
for train_indices, test_indices in k_fold.split(data):
    train_text = data.iloc[train_indices]['text'].values
    train_y = data.iloc[train_indices]['class'].values

    test_text = data.iloc[test_indices]['text'].values
    test_y = data.iloc[test_indices]['class'].values
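The cross-validation loop is cut off before the model is trained and scored. A hedged sketch of how the fold body would typically finish, reusing the classifier pipeline defined above (assumes confusion_matrix and f1_score are imported from sklearn.metrics; macro averaging is an assumption):

    # Fit on the training fold and evaluate on the held-out fold
    classifier.fit(train_text, train_y)
    predictions = classifier.predict(test_text)

    confusion += confusion_matrix(test_y, predictions)
    scores.append(f1_score(test_y, predictions, average='macro'))

print('Total documents classified:', len(data))
print('F1 score (macro):', sum(scores) / len(scores))
print('Confusion matrix:\n', confusion)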
コード例 #56
0
    for size in tqdm(size_list):
        model = KeyedVectors.load("./trained_model/fasttext_gensim_" + str(size) + ".model")
        words_np = []
        words_label = []
        for word in list_words:
            words_np.append(model[word])
            words_label.append(word)
        word_vector_reduced = {}
        for index, vec in enumerate(words_np):
            word_vector_reduced[words_label[index]] = vec
        list_cosin_similarity = []
        for x, y in zip(data["Word1"], data["Word2"]):
            list_cosin_similarity.append(round(cosin_similarity(x, y, word_vector_reduced), 2))
        data['Relation_number'] = new_col
        data["FastText_" + str(size)] = list_cosin_similarity
        if size == 200:
            for new_size in size_list[:-1]:
                svd = TruncatedSVD(n_components=new_size, n_iter=30)
                svd.fit(words_np)
                reduced = svd.transform(words_np)
                word_vector_reduced = {}
                for index, vec in enumerate(reduced):
                    word_vector_reduced[words_label[index]] = vec
                list_cosin_similarity = []
                for x, y in zip(data["Word1"], data["Word2"]):
                    list_cosin_similarity.append(round(cosin_similarity(x, y, word_vector_reduced), 2))
                data["FastText_SVD_" + str(new_size)] = list_cosin_similarity
    # Write the results out to a csv file
    tmp_name = os.path.basename(path_visim).split('.')[0] + '_result.csv'
    data.to_csv(os.path.join("./result", tmp_name), sep="\t")
コード例 #57
0
ファイル: Q3.py プロジェクト: ttommytang/EE219
    print '=============='
    print metrics.confusion_matrix(labels, km.labels_)
    print '=============='
    print '-----------------------------------------------------'
    #==============================================================================






#=========================Reduce Dimensionality (SVD)==========================
print '##############################################################'
for i in range(0,5):
    print 'Performing truncatedSVD...'
    svd = TruncatedSVD(n_components = 165, n_iter = 13,random_state = 42)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    
    X_reduced = lsa.fit_transform(X)
    
    k_means(X_reduced, labels, 'truncatedSVD')
#==============================================================================



#=========================Reduce Dimensionality (PCA)==========================
print '##############################################################'
for i in range(0,5):
    print 'Performing PCA...'
    
コード例 #58
0
ファイル: part_h_lsi.py プロジェクト: Jas-Jess/ee219
# Tokenize each document into words
# Gets rid of stop words, and stemmed version of word
# Ignores words appearing in less then 5 (or 2 if min_df = 2) documents 
vectorizer = CountVectorizer(min_df=5, stop_words= stop_words, tokenizer=LemmaTokenizer() )
X_train_counts = vectorizer.fit_transform(eight_train.data)
X_test_counts = vectorizer.transform(eight_test.data)

# TFIDF
# We set smooth_idf = false so we use the equation idf(d, t) = log [ n / df(d, t) ] + 1
tfidf_transformer = TfidfTransformer(smooth_idf=False)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

# 'arpack' for the ARPACK wrapper in SciPy (scipy.sparse.linalg.svds)
svd = TruncatedSVD(n_components=50, algorithm='arpack')
X_train_lsi = svd.fit_transform(X_train_tfidf)
X_test_lsi = svd.transform(X_test_tfidf)

# separate into two groups(Computer Tech & Recreation)
train_target_group = [ int(x / 4) for x in eight_train.target]
test_actual= [ int(x / 4) for x in eight_test.target]

# Logistic Regresstion Classifier 
log_reg = LogisticRegression()
log_reg.fit(X_train_lsi, train_target_group)

predicted = log_reg.predict(X_test_lsi)
predicted_probs = log_reg.predict_proba(X_test_lsi)

fpr, tpr, _ = roc_curve(test_actual, predicted_probs[:,1])
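The snippet computes fpr and tpr but stops before plotting. A minimal hedged continuation, assuming matplotlib is available:

import matplotlib.pyplot as plt
from sklearn.metrics import auc

roc_auc = auc(fpr, tpr)
plt.plot(fpr, tpr, label='LSI + Logistic Regression (AUC = %0.3f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--')   # chance line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curve')
plt.legend(loc='lower right')
plt.show()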
コード例 #59
0
# Fit and transform
principalComponents = pca.fit_transform(X_scaled)

# Print ratio of variance explained
#After transformation, we keep the three (as specified in n_components) transformed features with the highest
#transformed covariance eigenvalues
print("PCA explained variance ratios: {}".format(
    pca.explained_variance_ratio_))
print("PCA components: {}".format(pca.components_))
print("The PCA component vector has size {}, because there are {} vectors with length {}".format(pca.components_.shape, \
 pca.components_.shape[0], pca.components_.shape[1]))

########## Practicing singular value decomposition

# SVD
svd = TruncatedSVD(n_components=2)

# Fit and transform
principalComponents = svd.fit_transform(X_scaled)

# Print ratio of variance explained
print("SVD explained variance ratios: {}".format(
    svd.explained_variance_ratio_))

################ Practicing creating a PCA plot for visualization
#first project data points onto the two PCA axes. Note that principle components coming out of svd are already normalized.
df_pca = pd.DataFrame(np.transpose(np.array([np.dot(X_scaled, svd.components_[0]), np.dot(X_scaled, svd.components_[1]),\
 cancer.target])), \
 columns=['Principal component 1', 'Principal component 2', 'Target value'])

targets = [0, 1]
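The snippet defines targets but is cut off before the plot itself. A hedged sketch of the usual scatter over the two components, reusing cancer.target_names for the legend (colours are assumptions):

import matplotlib.pyplot as plt

colors = ['r', 'b']
fig, ax = plt.subplots()
for target, color in zip(targets, colors):
    mask = df_pca['Target value'] == target
    ax.scatter(df_pca.loc[mask, 'Principal component 1'],
               df_pca.loc[mask, 'Principal component 2'],
               color=color, s=20, label=cancer.target_names[target])
ax.set_xlabel('Principal component 1')
ax.set_ylabel('Principal component 2')
ax.legend()
plt.show()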
コード例 #60
0
#UNSUPERVISED MODEL
from model import *
from sklearn.manifold import TSNE
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans


# Sparse SVD on tf-idf to reduce features to 50
print("start dimensionality reduction")
data = get_vectorized_tweets('training_vecs.npy').toarray()
svd_model = TruncatedSVD(n_components=50)
data_svd = svd_model.fit_transform(data)
print("start TSNE")
tsne_model = TSNE(n_components = 2)
data_tsne = tsne_model.fit_transform(data_svd)
np.save('tsne_training_data.npy', data_tsne)
data_tsne = sample(np.asarray(get_vectorized_tweets('tsne_training_data.npy')), 500)
print(data_tsne.shape)
cluster_labels = KMeans(n_clusters = 5).fit(data_tsne).labels_

import matplotlib.pyplot as plt
print("scatter:")
plt.scatter(data_tsne[:,0], data_tsne[:,1], c = cluster_labels)
plt.show()

#UNSUPERVISED MODEL ONLY TOXIC SPEECH
#select only toxic speech
df_data = pd.read_csv("twitter-sentiment-analysis-hatred-speech/train.csv",names=('id','label','tweet'),header=None)
labels = df_data.to_numpy().T[1]
data_tsne = np.asarray(get_vectorized_tweets('tsne_training_data.npy'))