class TF_Transformer(base.BaseEstimator, base.TransformerMixin):
	def __init__(self):

		self.cv_bi = CountVectorizer(min_df=2,max_df=0.7,ngram_range=(1,2))
		self.tfidf_trans = TfidfTransformer()
		self.SVD_trans = TruncatedSVD(n_components=300)

    # X is a list of Fit_Review named tuples, y is none
	def fit(self, X, y=None):

		texts = [review.text for review in X]

		counts = self.cv_bi.fit_transform(texts)
		counts_tfidf = self.tfidf_trans.fit_transform(counts)
		self.SVD_trans.fit(counts_tfidf)

		return self

    # X is a list of either Fit_Review or Prod_Corpus named tuples
	def transform(self, X):

		texts = [review.text for review in X]

		counts = self.cv_bi.transform(texts)
		counts_tfidf = self.tfidf_trans.transform(counts)
		counts_trunc = self.SVD_trans.transform(counts_tfidf)

		return counts_trunc
Example #2
 def init_model(self, model, n_topics=10, **kwargs):
     if model == 'nmf':
         self.model = NMF(
             n_components=n_topics,
             alpha=kwargs.get('alpha', 0.1),
             l1_ratio=kwargs.get('l1_ratio', 0.5),
             max_iter=kwargs.get('max_iter', 200),
             random_state=kwargs.get('random_state', 1),
             shuffle=kwargs.get('shuffle', False))
     elif model == 'lda':
         self.model = LatentDirichletAllocation(
             n_topics=n_topics,
             max_iter=kwargs.get('max_iter', 10),
             random_state=kwargs.get('random_state', 1),
             learning_method=kwargs.get('learning_method', 'online'),
             learning_offset=kwargs.get('learning_offset', 10.0),
             batch_size=kwargs.get('batch_size', 128),
             n_jobs=kwargs.get('n_jobs', 1))
     elif model == 'lsa':
         self.model = TruncatedSVD(
             n_components=n_topics,
             algorithm=kwargs.get('algorithm', 'randomized'),
             n_iter=kwargs.get('n_iter', 5),
             random_state=kwargs.get('random_state', 1))
     else:
         msg = 'model "{}" invalid; must be {}'.format(
             model, {'nmf', 'lda', 'lsa'})
         raise ValueError(msg)
Example #3
def embed_two_dimensions(data, vectorizer, size=10, n_components=5, colormap='YlOrRd'):
    if hasattr(data, '__iter__'):
        iterable = data
    else:
        raise Exception('ERROR: Input must be iterable')
    import itertools
    iterable_1, iterable_2 = itertools.tee(iterable)
    # get labels
    labels = []
    for graph in iterable_2:
        label = graph.graph.get('id', None)
        if label:
            labels.append(label)

    # transform iterable into sparse vectors
    data_matrix = vectorizer.transform(iterable_1)
    # embed high dimensional sparse vectors in 2D
    from sklearn import metrics
    distance_matrix = metrics.pairwise.pairwise_distances(data_matrix)

    from sklearn.manifold import MDS
    feature_map = MDS(n_components=n_components, dissimilarity='precomputed')
    explicit_data_matrix = feature_map.fit_transform(distance_matrix)

    from sklearn.decomposition import TruncatedSVD
    pca = TruncatedSVD(n_components=2)
    low_dimension_data_matrix = pca.fit_transform(explicit_data_matrix)

    plt.figure(figsize=(size, size))
    embed_dat_matrix_two_dimensions(low_dimension_data_matrix, labels=labels, density_colormap=colormap)
    plt.show()
def main():
    svd = TruncatedSVD()
    Z = svd.fit_transform(X)
    plt.scatter(Z[:,0], Z[:,1])
    for i in xrange(D):
        plt.annotate(s=index_word_map[i], xy=(Z[i,0], Z[i,1]))
    plt.show()
def tfIDFeats(ids,data):


    # the infamous tfidf vectorizer (Do you remember this one?)
    tfv = TfidfVectorizer(min_df=3,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 5), use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = 'english')
    # Fit TFIDF
    tfv.fit(data)
    X = tfv.transform(data)

    # Initialize SVD
    svd = TruncatedSVD(n_components=350)
    
    # Initialize the standard scaler 
    scl = StandardScaler( with_mean=False)
    
    
    
    if X.shape[1]>350:
        X = svd.fit_transform(X)
    X = scl.fit_transform(X,ids)
    if plotData:
        X = PCA(n_components=2).fit_transform(X)
    return (X,ids)
Example #6
def test_feature_union():
    # basic sanity check for feature union
    iris = load_iris()
    X = iris.data
    X -= X.mean(axis=0)
    y = iris.target
    svd = TruncatedSVD(n_components=2, random_state=0)
    select = SelectKBest(k=1)
    fs = FeatureUnion([("svd", svd), ("select", select)])
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    assert_equal(X_transformed.shape, (X.shape[0], 3))

    # check if it does the expected thing
    assert_array_almost_equal(X_transformed[:, :-1], svd.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())

    # test if it also works for sparse input
    # We use a different svd object to control the random_state stream
    fs = FeatureUnion([("svd", svd), ("select", select)])
    X_sp = sparse.csr_matrix(X)
    X_sp_transformed = fs.fit_transform(X_sp, y)
    assert_array_almost_equal(X_transformed, X_sp_transformed.toarray())

    # test setting parameters
    fs.set_params(select__k=2)
    assert_equal(fs.fit_transform(X, y).shape, (X.shape[0], 4))

    # test it works with transformers missing fit_transform
    fs = FeatureUnion([("mock", TransfT()), ("svd", svd), ("select", select)])
    X_transformed = fs.fit_transform(X, y)
    assert_equal(X_transformed.shape, (X.shape[0], 8))
Example #7
def train():
    with open("../data/f_hashtag_prediction/train_data_tweets_processed_0_to_500K.txt") as ftrain:
        with open("../data/f_hashtag_prediction/test_data_tweets_processed_2K.txt") as ftest:
            test_set = ftest.read().splitlines()
            train_set = ftrain.read().splitlines()
            # vectorizer = CountVectorizer()
            vectorizer = TfidfVectorizer(min_df=5, max_df=500, max_features=None,
                                         strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
                                         ngram_range=(1, 4), use_idf=1, smooth_idf=1, sublinear_tf=1,
                                         stop_words='english')
            # vectorizer = TfidfVectorizer()
            tfidf_matrix = vectorizer.fit_transform(train_set)
            print tfidf_matrix.shape
            # print tfidf_matrix
            # print vectorizer.fixed_vocabulary_
            smatrix = vectorizer.transform(test_set)
            print smatrix.shape

            joblib.dump(smatrix, "test_tfidf_matrix.o")
            joblib.dump(tfidf_matrix, "train_tfidf_matrix.o")

            svd = TruncatedSVD(n_components=500, random_state=42)
            svd.fit(tfidf_matrix)
            truncated_train_svd = svd.transform(tfidf_matrix)
            truncated_test_svd = svd.transform(smatrix)

            print truncated_train_svd.shape
            print truncated_test_svd.shape

            joblib.dump(truncated_train_svd, "truncated_train_svd.o")
            joblib.dump(truncated_test_svd, "truncated_test_svd.o")

        print "TEST SET: "
        test_index = 0
Example #8
 def find_k(self, rank=None, max_clusters=1, vertline=None):
     
     if rank != None:
         svd = TruncatedSVD(rank)
         self.X = svd.fit_transform(self.X)
         self.X = Normalizer(copy=False).fit_transform(self.X)
     
     k_range = range(1, max_clusters)
     clusters = [KMeans(n_clusters=k).fit(self.X) for k in k_range]
     centroids = [cluster.cluster_centers_ for cluster in clusters]
     k_cosine = [cdist(self.X, cent, metric='cosine') for cent in centroids]
     dist = [np.min(k_cos, axis=1) for k_cos in k_cosine]
     
     wcss = [sum(d[np.isnan(d) == False]**2) for d in dist] # Within cluster sum of squares
     tss = sum(pdist(self.X)**2)/self.X.shape[0] # Total sum of squares
     bss = tss - wcss # Explained variance
             
     fig, (ax1, ax2) = plt.subplots(1, 2)
     fig.set_size_inches(10, 3)
     plt.tight_layout()
     
     ax1.set_title('BSS')
     ax1.plot(np.arange(1, len(bss)+1), bss)
     ax1.scatter(np.arange(1, len(bss)+1), bss)        
     ax2.set_title('WCSS')
     ax2.plot(np.arange(1, len(wcss)+1), wcss)
     ax2.scatter(np.arange(1, len(wcss)+1), wcss)
     plt.axvline(vertline, c='red', alpha=0.75) if vertline != None else None
         
     plt.show()
Example #9
def reduce_dim(sparse_matrix, raw_data, unigrams, n: int, filename_prefix: str):
    """
    Applies truncated SVD to given sparse matrix and "clusters" each word according to
    the component that "leans" most in its direction.

    i.e. for each user, find out which principal component has the maximum value in its
    direction. Then assign it to the component with the maximum value.

    After doing this for all users and summing up the counts, components become
    "super user"s.

    :param sparse_matrix: feature matrix to be reduced
    :param unigrams: unigrams that correspond to columns in sparse_matrix
    These will be used to create a mapping file from word to super-word
    :param n: number of components
    :param filename_prefix: assignment vector will be saved with this prefix
    :return: reduced feature matrix where each column is a new "super-word"
    """
    svd = TruncatedSVD(n_components=n)
    svd.fit(sparse_matrix)
    maximums = np.argmax(np.abs(svd.components_), axis=0)
    unigram_feat_map = dict([(unigrams[i], maximums[i]) for i in range(len(maximums))])

    reduced = get_sparse_occur_matrix(raw_data, unigram_feat_map)[:, 0:n]
    # num_points, _ = sparse_matrix.shape
    # counts = sparse.csc_matrix((num_points, n), dtype=int)
    #
    # for feat_index, target_component in enumerate(maximums):
    #     counts[:, target_component] += sparse_matrix[:, feat_index]
    #
    with open(filename_prefix + ".svdfeatmap", "wb") as svdfeatmap:
        pickle.dump(unigram_feat_map, svdfeatmap)

    return reduced
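
# The assignment step above can be illustrated in isolation. A minimal sketch on
# random data (the matrix, unigram names, and sizes are invented here), showing how
# each column is mapped to the component with the largest absolute loading:
import numpy as np
from scipy import sparse
from sklearn.decomposition import TruncatedSVD

rng = np.random.RandomState(0)
# Toy document-by-unigram count matrix (200 rows, 50 unigrams).
toy_counts = sparse.random(200, 50, density=0.1, format="csr", random_state=rng)
toy_unigrams = ["w{}".format(i) for i in range(toy_counts.shape[1])]

toy_svd = TruncatedSVD(n_components=5, random_state=0)
toy_svd.fit(toy_counts)

# components_ has shape (n_components, n_features); for every unigram pick the
# component with the largest absolute loading (its "super-word").
toy_maximums = np.argmax(np.abs(toy_svd.components_), axis=0)
toy_unigram_feat_map = dict(zip(toy_unigrams, toy_maximums))
print(toy_unigram_feat_map["w0"])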
Example #10
def train_manual():
    with open("../data/f_hashtag_prediction/train_data_tweets_processed_0_to_500K.txt") as ftrain:
        with open("../data/f_hashtag_prediction/test_data_tagged_processed_manual.txt") as ftest:
            test_set = ftest.read().splitlines()
            train_set = ftrain.read().splitlines()
            # vectorizer = CountVectorizer()
            vectorizer = TfidfVectorizer(min_df=5, max_df=500, max_features=None,
                                         strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
                                         ngram_range=(1, 4), use_idf=1, smooth_idf=1, sublinear_tf=1,
                                         stop_words='english')
            # vectorizer = TfidfVectorizer()
            tfidf_matrix = vectorizer.fit_transform(train_set)
            print tfidf_matrix.shape

            smatrix = vectorizer.transform(test_set)
            print smatrix.shape

            svd = TruncatedSVD(n_components=500, random_state=42)
            svd.fit(tfidf_matrix)
            truncated_train_svd = svd.transform(tfidf_matrix)
            truncated_test_svd = svd.transform(smatrix)

            print truncated_train_svd.shape
            print truncated_test_svd.shape

            cosine = cosine_similarity(truncated_test_svd[0], truncated_train_svd)
            print cosine

        print "TEST SET: "
Example #11
 def solve(self, X, missing_mask):
     observed_mask = ~missing_mask
     X_filled = X
     for i in range(self.max_iters):
         # deviation from original svdImpute algorithm:
         # gradually increase the rank of our approximation
         if self.gradual_rank_increase:
             curr_rank = min(2 ** i, self.rank)
         else:
             curr_rank = self.rank
         tsvd = TruncatedSVD(curr_rank, algorithm=self.svd_algorithm)
         X_reduced = tsvd.fit_transform(X_filled)
         X_reconstructed = tsvd.inverse_transform(X_reduced)
         X_reconstructed = self.clip(X_reconstructed)
         mae = masked_mae(
             X_true=X,
             X_pred=X_reconstructed,
             mask=observed_mask)
         if self.verbose:
             print(
                 "[IterativeSVD] Iter %d: observed MAE=%0.6f" % (
                     i + 1, mae))
         converged = self._converged(
             X_old=X_filled,
             X_new=X_reconstructed,
             missing_mask=missing_mask)
         X_filled[missing_mask] = X_reconstructed[missing_mask]
         if converged:
             break
     return X_filled
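
# The gradual-rank-increase idea from the comment above, as a stripped-down
# standalone sketch (random data; the sizes and the rank cap of 4 are made up):
# reconstruct with a low-rank TruncatedSVD and refill only the missing entries.
import numpy as np
from sklearn.decomposition import TruncatedSVD

demo_rng = np.random.RandomState(0)
demo_true = demo_rng.rand(20, 8)
demo_missing = demo_rng.rand(*demo_true.shape) < 0.2

# Start from a crude mean fill, then refine iteratively.
demo_filled = demo_true.copy()
demo_filled[demo_missing] = demo_true[~demo_missing].mean()

for it in range(5):
    curr_rank = min(2 ** it, 4)   # gradually increase the rank, capped at 4
    tsvd = TruncatedSVD(curr_rank, random_state=0)
    reconstructed = tsvd.inverse_transform(tsvd.fit_transform(demo_filled))
    # Overwrite only the missing entries; observed values stay untouched.
    demo_filled[demo_missing] = reconstructed[demo_missing]

print(np.abs(demo_filled - demo_true)[demo_missing].mean())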
def main():
    infile = open(sys.argv[1])
    outfile = sys.argv[2] #needs to be a string
    vocabfile = open(sys.argv[3])
    vocab = json.load(vocabfile)

    F = sparse.lil_matrix((len(vocab), 4*len(vocab)), dtype=np.int32)
    corpus_size = 0
    lc = 0

    for line in infile:
        lc += 1
        if lc % 10000 == 0:
            print('processing line ' + str(lc) + ' at ' + str(datetime.datetime.now()))
        words = line.split()
        num_words = len(words)
        corpus_size += num_words

        if num_words < 5:
            process_short_line(num_words, words, F, vocab)

        else:
            F[vocab[words[0]], 4 * vocab[words[1]] + 2] += 1
            F[vocab[words[0]], 4 * vocab[words[2]] + 3] += 1

            F[vocab[words[1]], 4 * vocab[words[0]] + 1] += 1
            F[vocab[words[1]], 4 * vocab[words[2]] + 2] += 1
            F[vocab[words[1]], 4 * vocab[words[3]] + 3] += 1

            F[vocab[words[-2]], 4 * vocab[words[-4]] + 0] += 1
            F[vocab[words[-2]], 4 * vocab[words[-3]] + 1] += 1
            F[vocab[words[-2]], 4 * vocab[words[-1]] + 2] += 1

            F[vocab[words[-1]], 4 * vocab[words[-3]] + 0] += 1
            F[vocab[words[-1]], 4 * vocab[words[-2]] + 1] += 1
            
            # start=2 so that i is the true index of `word` within `words`
            for i, word in enumerate(words[2:-2], start=2):
                F[vocab[word], 4 * vocab[words[i-2]] + 0] += 1
                F[vocab[word], 4 * vocab[words[i-1]] + 1] += 1
                F[vocab[word], 4 * vocab[words[i+1]] + 2] += 1
                F[vocab[word], 4 * vocab[words[i+2]] + 3] += 1
                
    # compute PMI
    Fc = F.tocoo()
    word_freqs = Fc.sum(1)
    context_freqs = Fc.sum(0)
    word_freqs = word_freqs.A1
    context_freqs = context_freqs.A1

    for i,j,v in zip(Fc.row, Fc.col, Fc.data):
        F[i,j] = max( math.log((v * corpus_size) / (word_freqs[i] * context_freqs[j])), 0 )

    # compute TruncatedSVD
    svd = TruncatedSVD(n_components=200)
    Fred = svd.fit_transform(F)

    np.savetxt(outfile, Fred, delimiter=',')

    infile.close()
    vocabfile.close()
def get_lsa(x,t):
    print "LSA"
    lsa = SVD(n_components=600,algorithm="arpack")
    lsa.fit(x)
    x = lsa.transform(x)
    t = lsa.transform(t)
    return x,t
Example #14
def cook():
    x, y, weights = load_data()
    n_components = 200
    svd = TruncatedSVD(n_components, random_state=42)
    x_unweighted = svd.fit_transform(x)
    x_weighted = svd.fit_transform(weighted(x, weights))

    for i in range(9):
        frac = 1 - (i * 0.01 + 0.01)
        print frac

        x_train, x_test, y_train, y_test = train_test_split(x_unweighted, y, test_size=frac)
        classifier = AdaBoostClassifier(n_estimators=100)
        classifier.fit(x_train, y_train)
        print "Unweighted: ", classifier.score(x_test, y_test)

        x_train, x_test, y_train, y_test = train_test_split(x_weighted, y, test_size=frac)
        classifier = AdaBoostClassifier(n_estimators=100)
        classifier.fit(x_train, y_train)
        print "Weighted: ", classifier.score(x_test, y_test)

        print '--------------------------'


def benchmark(k, epochs):
  print("*" * 80)
  print("k: %d, epochs: %d\n" % (k, epochs))

  #select = SelectKBest(score_func=chi2, k=k)
  select = TruncatedSVD(n_components=k)
  X_train_trunc = select.fit_transform(X_train, Y_train)
  X_test_trunc = select.transform(X_test)

  print('done truncating')

  parameters = {'C': [1, 10, 100, 1000, 10000],  'class_weight': ['auto', None], 'tol':[0.001,0.0001]}
  clf = LinearSVC(C=100000)
  #clf = grid_search.GridSearchCV(svc, parameters)
  clf.fit(X_train_trunc, Y_train)
  pred = clf.predict(X_test_trunc)

  if CREATE_SUBMISSION:
    X_submit_trunc = select.transform(X_submit)
    pred_submit = clf.predict(X_submit_trunc)
    dump_csv(pred_submit, k, epochs)

  score = metrics.f1_score(Y_test, pred)
  print("f1-score:   %0.3f" % score)

  print("classification report:")
  print(metrics.classification_report(Y_test, pred))

  print("confusion matrix:")
  print(metrics.confusion_matrix(Y_test, pred))
def truncatedSVD(data, labels, new_dimension):
    print "start truncatedSVD..."
    start = time.time()
    pca = TruncatedSVD(n_components=new_dimension)
    reduced = pca.fit_transform(data)
    end = time.time()
    return (reduced, end-start)
def preprocess(data, n_components, use_tf_idf=True):
    """
    Preprocess the data for clustering by running SVD and
    normalizing the results. This process is also known as
    LSA.

    arguments:
    data -- Dataset; if use_tf_idf is True the object must contain a
            tf_idf table alongside a raw frequencies dataframe.
    n_components -- int, the number of components to use for the SVD;
                    a minimum of 100 is recommended.
    use_tf_idf -- bool, whether to use the tf-idf frequencies for the
                  preprocessing.

    returns:
    e -- float, a measure of variance explained by the SVD.
    X -- np.array, an array with the data reduced to n_components.
    """
    if use_tf_idf:
        d = data.tf_idf.as_matrix()
    else:
        d = data.df.as_matrix()
    svd = TruncatedSVD(n_components=n_components)
    X = svd.fit_transform(d)
    norm = Normalizer()

    # Record a measure of explained variance
    e = svd.explained_variance_ratio_.sum()*100
    return e, norm.fit_transform(X)
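
# The same LSA recipe (truncated SVD followed by row normalisation) as a
# self-contained sketch on a toy corpus; make_pipeline stands in for the
# Dataset object assumed above:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

toy_docs = [
    "the cat sat on the mat",
    "dogs and cats are pets",
    "the stock market fell today",
    "investors sold stocks and bonds",
]
toy_tfidf = TfidfVectorizer().fit_transform(toy_docs)

toy_svd = TruncatedSVD(n_components=2, random_state=0)
toy_lsa = make_pipeline(toy_svd, Normalizer(copy=False))
toy_X = toy_lsa.fit_transform(toy_tfidf)

# Explained variance of the kept components, as a percentage.
print("explained variance: %.1f%%" % (toy_svd.explained_variance_ratio_.sum() * 100))
print(toy_X.shape)  # (4, 2)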
def benchmark(k, epochs):
  print("*" * 80)
  print("k: %d, epochs: %d\n" % (k, epochs))

  #select = SelectKBest(score_func=chi2, k=k)
  select = TruncatedSVD(n_components=k)
  X_train_trunc = select.fit_transform(X_train, Y_train)
  X_test_trunc = select.transform(X_test)

  print('done truncating')

  clf = DBN([X_train_trunc.shape[1], k, 4], learn_rates=0.3, learn_rate_decays=0.9, epochs=epochs, verbose=1)
  clf.fit(X_train_trunc, Y_train)
  pred = clf.predict(X_test_trunc)

  if CREATE_SUBMISSION:
    X_submit_trunc = select.transform(X_submit)
    pred_submit = clf.predict(X_submit_trunc)
    dump_csv(pred_submit, k, epochs)

  score = metrics.f1_score(Y_test, pred)
  print("f1-score:   %0.3f" % score)

  print("classification report:")
  print(metrics.classification_report(Y_test, pred))

  print("confusion matrix:")
  print(metrics.confusion_matrix(Y_test, pred))
Example #19
def cluster_DBSCAN(args):
	"""
	Clustering with DBSCAN: project the sparse vectors to a lower-dimensional space, then apply density-based clustering.
	"""
	#load data
	g_it = node_link_data.node_link_data_to_eden(input = args.input_file, input_type = "file")
	vec = graph.Vectorizer(r = args.radius,d = args.distance, nbits = args.nbits)
	logger.info('Vectorizer: %s' % vec)

	X = vec.transform(g_it, n_jobs = args.n_jobs)
	logger.info('Instances: %d Features: %d with an avg of %d features per instance' % (X.shape[0], X.shape[1], X.getnnz() / X.shape[0]))
	
	#project to lower dimensional space to use clustering algorithms
	transformer = TruncatedSVD(n_components=args.n_components)
	X_dense=transformer.fit_transform(X)

	#log statistics on data
	logger.info('Dimensionality reduction Instances: %d Features: %d with an avg of %d features per instance' % (X_dense.shape[0], X_dense.shape[1], X.getnnz() / X.shape[0]))

	#clustering
	clustering_algo = DBSCAN(eps = args.eps)
	y = clustering_algo.fit_predict(X_dense)
	msg = 'Predictions statistics: '
	msg += util.report_base_statistics(y)
	logger.info(msg)

	#save model for vectorizer
	out_file_name = "vectorizer"
	eden_io.dump(vec, output_dir_path = args.output_dir_path, out_file_name = out_file_name)
	logger.info("Written file: %s/%s",args.output_dir_path, out_file_name)

	#save result
	out_file_name = "labels"
	eden_io.store_matrix(matrix = y, output_dir_path = args.output_dir_path, out_file_name = out_file_name, output_format = "text")
	logger.info("Written file: %s/%s",args.output_dir_path, out_file_name)
Example #20
def train_pca_svm(learning_data, pca_dims, probability=True, cache_size=3000, **svm_kwargs):
    (X_train, y_train, train_ids), (X_test, y_test, test_ids) = learning_data

    pca = TruncatedSVD(n_components=pca_dims)
    n_symbols = max(
        np.max(X_train) + 1, np.max(X_test) + 1
    )
    logger.info("Forming CSR Matrices")
    x_train, x_test = create_csr_matrix(X_train, n_symbols), create_csr_matrix(X_test, n_symbols)
    logger.info("Starting PCA")
    # pseudo-supervised PCA: fit on positive class only
    pca = pca.fit(x_train[y_train > 0])

    x_train_pca = pca.transform(x_train)
    x_test_pca = pca.transform(x_test)

    logger.info("Starting SVM")
    svc = SVC(probability=probability, cache_size=cache_size, **svm_kwargs)
    svc.fit(x_train_pca, y_train)
    logger.info("Scoring SVM")
    score = svc.score(x_test_pca, y_test)
    logger.info(score)
    svc.test_score = score
    pca.n_symbols = n_symbols
    return svc, pca, x_train_pca, x_test_pca
def test_inverse_transform(algo):
    # We need a lot of components for the reconstruction to be "almost
    # equal" in all positions. XXX Test means or sums instead?
    tsvd = TruncatedSVD(n_components=52, random_state=42, algorithm=algo)
    Xt = tsvd.fit_transform(X)
    Xinv = tsvd.inverse_transform(Xt)
    assert_array_almost_equal(Xinv, Xdense, decimal=1)
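
# A quick illustration of the remark in that test, on random dense data with
# arbitrary sizes: the error of inverse_transform(fit_transform(X)) shrinks as
# the number of retained components approaches the number of features.
import numpy as np
from sklearn.decomposition import TruncatedSVD

demo_rng = np.random.RandomState(42)
demo_X = demo_rng.rand(60, 55)

for k in (5, 25, 54):
    demo_tsvd = TruncatedSVD(n_components=k, random_state=42)
    demo_rec = demo_tsvd.inverse_transform(demo_tsvd.fit_transform(demo_X))
    # Maximum absolute reconstruction error over all entries.
    print(k, np.abs(demo_rec - demo_X).max())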
Example #22
def lsa(BP, lentrain, n_components=16, preproc=True, 
    fit_area='test', min_df=3):
    """
    aka Latent semantic analysis
    """
    if preproc:
        print "pre-processing data"
        traindata = []
        for observation in BP:
            traindata.append(preprocess_pipeline(observation, "english", 
                "WordNetLemmatizer", True, True, False))
        BP = traindata

    print "fitting TfidfVectorizer"
    tfv = TfidfVectorizer(min_df=min_df,  max_features=None, strip_accents='unicode',  
        analyzer='word',token_pattern=r'\w{1,}',ngram_range=(1, 2), use_idf=1,
        smooth_idf=1, sublinear_tf=1, norm='l2')
    if fit_area == 'test':
        tfv.fit(BP[lentrain:])
    elif fit_area == 'train':
        tfv.fit(BP[:lentrain])
    else:
        tfv.fit(BP)
    print "transforming data"
    BP = tfv.transform(BP)
    print "BP(post):",BP.shape

    if 1:
        # svd here
        print "use svd"
        svd = TruncatedSVD(n_components=n_components, random_state=1)
        BP = svd.fit_transform(BP)
    
    return BP
Example #23
def kfold(agetext,k,model,k2):
    import collections
    out = []
    for i in range(k):
        print "iteration: "+str(i)
        agetext = shuffle(agetext)
        datatb = agetext.iloc[:,1:]
        label = agetext["agegroup"].tolist()
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(
            datatb, label, test_size=0.15, random_state=i*6)
        data = X_train.values
        counter = collections.Counter(y_train)
        print counter
        testdata = X_test.values
        lsa = TruncatedSVD(k2, algorithm = 'arpack')
        normalizer = Normalizer(copy=False)
        X = lsa.fit_transform(data)
        X = normalizer.fit_transform(X)
        X_test = lsa.transform(testdata)
        X_test = normalizer.transform(X_test)
        model.fit(X,y_train)
        pred = model.predict(X_test)
        counter = collections.Counter(y_test)
        print counter
        counter = collections.Counter(pred)
        print counter
        out.append(round(accuracy_score(y_test, pred),5))
    print str(out)
    print np.mean(out)
Example #24
def lsa_summarizer(text,num_sen=5):
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    sentenceTokens = sent_detector.tokenize(text.strip())

    tfvectorizer = TfidfVectorizer(tokenizer=tokenizeText)
    sparse = tfvectorizer.fit_transform(sentenceTokens).A
    lsa = TruncatedSVD(n_components=1)
    concept = lsa.fit_transform(sparse)

    pos = np.array(list(range(len(sentenceTokens))))    
    
    listlist = [list(x) for x in zip(sentenceTokens,concept,pos)]

    listlist.sort(key=lambda x: x[1],reverse=True)

    summarysentences = listlist[0:num_sen]

    summarysentences.sort(key=lambda x: x[2],reverse=False)

    summary = ""
    for n in range(num_sen):
        summary += ' ' + summarysentences[n][0]
        summary = " ".join(summary.replace(u"\xa0", u" ").strip().split())

    return summary
def SVD_CV(counts, scores, n_comp=range(10,611,100)):

	n_avg = 16
	avg_err = []
	for n in range(0,n_avg):

		X_train, X_test, y_train, y_test = cross_validation.train_test_split(counts, scores, \
											test_size=0.2, random_state=n)
		test_err = []
		for n in n_comp:
			TruncTrans = TruncatedSVD(n_components=n)
			X_trunc_train = TruncTrans.fit_transform(X_train,scores)
			regr = linear_model(X_trunc_train,y_train)
			X_trunc_test = TruncTrans.transform(X_test)
			y_pred = regr.predict(X_trunc_test)*10**(-12)+3
			test_err.append(metrics.mean_squared_error(y_test, y_pred))

		if not avg_err:
			avg_err = test_err
		else:
			avg_err = [avg_err[i]+(test_err[i]*(1.0/n_avg)) for i in range(0,len(test_err))]



	plt.plot(n_comp, avg_err, label='Out-of-Sample Error')
	plt.xlabel('n components')
	plt.ylabel('MSE')
	plt.show()
    def fit_document_matrix(self, X):
        """
        Reduce dimension of sparse matrix X
        using Latent Semantic Analysis and
        build nearest neighbor model

        Parameters
        ----------
        X: sparse csr matrix, sparse term frequency matrix or
            other weighting matrix from documents
        """
        n_components = self.n_components
        n_iter = self.n_iter
        algorithm = self.algorithm
        lsa_model = TruncatedSVD(n_components=n_components,
                                 n_iter=n_iter,
                                 algorithm=algorithm)
        # reduce dimension using Latent Semantic Analysis
        vectors = lsa_model.fit_transform(X)
        self.vectors = vectors

        # build nearest neighbor model
        nbrs_model = build_nearest_neighbors(vectors, n_recommend=self.n_recommend)
        self.nbrs_model = nbrs_model

        return self
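
# A minimal sketch of the same two steps (LSA embedding, then a neighbour index)
# on a toy corpus; plain sklearn NearestNeighbors is used here as a stand-in for
# the build_nearest_neighbors helper, which is not shown in this example.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import NearestNeighbors

toy_docs = [
    "deep learning for image recognition",
    "convolutional networks classify images",
    "recipes for italian pasta dishes",
    "cooking pasta with tomato sauce",
]
toy_X = TfidfVectorizer().fit_transform(toy_docs)

# Reduce dimension with LSA, then index the dense vectors for lookups.
toy_vectors = TruncatedSVD(n_components=2, random_state=0).fit_transform(toy_X)
toy_nbrs = NearestNeighbors(n_neighbors=2).fit(toy_vectors)

toy_dist, toy_idx = toy_nbrs.kneighbors(toy_vectors[:1])
print(toy_idx)  # the query document itself plus its closest neighbour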
Example #27
def compute_svd(Xs):
    # compute 1st principal component
    svd = TruncatedSVD(n_components=1, n_iter=20, random_state=0)
    svd.fit(Xs)
    pc = svd.components_
    print(pc.shape, svd.explained_variance_ratio_)
    return pc
Example #28
def buildKB16(n_comp = 200, seed_value = 123):
    ## data
    # read the training/test data  
    print('Importing Data')
    xtrain = pd.read_csv('../input/xtrain_kb6099.csv')
    xtest = pd.read_csv('../input/xtest_kb6099.csv')
    
    # separate 
    id_train = xtrain.ID; xtrain.drop('ID', axis = 1, inplace = True)
    ytrain = xtrain.target; xtrain.drop('target', axis = 1, inplace = True)
    id_test = xtest.ID; xtest.drop('ID', axis = 1, inplace = True)
    
    # fit SVD
    svd = TruncatedSVD(n_components = n_comp,n_iter=5, random_state= seed_value)
    svd.fit(xtrain)
    xtrain = svd.transform(xtrain)
    xtest = svd.transform(xtest)
    xtrain = pd.DataFrame(xtrain)
    xtest = pd.DataFrame(xtest)

    ## store the results
    # add indices etc
    xtrain['ID'] = id_train
    xtrain['target'] = ytrain

    xtest['ID'] = id_test

    # save the files
    xtrain.to_csv('../input/xtrain_kb16c'+str(n_comp)+'.csv', index = False, header = True)
    xtest.to_csv('../input/xtest_kb16c'+str(n_comp)+'.csv', index = False, header = True)
    
    return
Example #29
def basic_lsi(df, n_components=200, max_df=0.5, min_df=5):
    '''
    Basic LSI model for album recommendations

    Args:
        df: dataframe with Pitchfork reviews
        n_components: number of lsi dimensions
        max_df: max_df in TfidfVectorizer
        min_df: min_df in TfidfVectorizer
    Returns:
        tfidf: sklearn fitted TfidfVectorizer
        tfidf_trans: sparse matrix with tfidf transformed data
        svd: sklearn fitted TruncatedSVD
        svd_trans: dense array with lsi transformed data

    '''

    X = df['review']
    stopwords = nltk.corpus.stopwords.words('english')

    tfidf = TfidfVectorizer(stop_words=stopwords,
                            max_df=max_df, min_df=min_df)
    tfidf_trans = tfidf.fit_transform(X)

    svd = TruncatedSVD(n_components=n_components)
    svd_trans = svd.fit_transform(tfidf_trans)

    return tfidf, tfidf_trans, svd, svd_trans
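
# A usage sketch for basic_lsi. The DataFrame and its 'album' column are invented,
# and the NLTK stopwords corpus is assumed to be downloaded; the LSI vectors in
# svd_trans can then drive cosine-similarity recommendations.
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

demo_df = pd.DataFrame({
    "album": ["A", "B", "C"],
    "review": ["shimmering synth pop with bright hooks",
               "grim industrial noise and harsh drones",
               "warm acoustic folk songs about home"],
})
tfidf, tfidf_trans, svd, svd_trans = basic_lsi(demo_df, n_components=2, min_df=1)

# Rank albums by cosine similarity to album "A" in LSI space.
sims = cosine_similarity(svd_trans[:1], svd_trans).ravel()
best = np.argsort(sims)[::-1][1]   # rank 0 is album "A" itself
print(demo_df["album"].iloc[best])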
def test_sparse_formats(fmt):
    Xfmt = Xdense if fmt == "dense" else getattr(X, "to" + fmt)()
    tsvd = TruncatedSVD(n_components=11)
    Xtrans = tsvd.fit_transform(Xfmt)
    assert_equal(Xtrans.shape, (n_samples, 11))
    Xtrans = tsvd.transform(Xfmt)
    assert_equal(Xtrans.shape, (n_samples, 11))
Example #31
def train(n_components, demean, n_samples):
    print("Loading data...")
    movie_titles, ratings, rating_indices, n_users, n_items = get_netflix_data(
        n_samples=n_samples)
    print("number of users with ratings: {}".format(
        len(np.unique(rating_indices[:, 0]))))
    print("number of movies with ratings: {}".format(
        len(np.unique(rating_indices[:, 1]))))
    n_splits = 5
    kf = KFold(n_splits=n_splits, shuffle=True)
    kf.get_n_splits(rating_indices)

    if not n_components:
        components = [5, 10, 15, 20, 30, 50]
        components_loss_path = np.zeros((len(components), n_splits))
        print("Finding optimal number of components...")
        for n, n_components in enumerate(components):
            print("n_components: {}".format(n_components))
            for k, (train_index, test_index) in enumerate(kf.split(rating_indices)):
                mean = None
                print("Fold {}".format(k))
                test_indices = rating_indices[test_index]
                test_indices = (test_indices[:, 0], test_indices[:, 1],
                                test_indices[:, 2])
                if demean:
                    print("De-mean training data...")
                    train_indices = rating_indices[train_index]
                    mean = np.mean(train_indices[:, 2])
                    train_indices = (train_indices[:, 0], train_indices[:, 1],
                                     train_indices[:, 2] - mean)
                    data_train = scipy.sparse.csr_matrix(
                        (train_indices[2], (train_indices[0], train_indices[1])),
                        shape=(n_users, n_items))
                    print("Finished de-meaning.")
                else:
                    user_test_indices, item_test_indices = (test_indices[0],
                                                            test_indices[1])
                    data_train = scipy.sparse.lil_matrix(ratings)
                    data_train[user_test_indices, item_test_indices] = 0
                    data_train = scipy.sparse.csr_matrix(data_train)
                start = time.time()
                print("Fitting...")
                svd = TruncatedSVD(n_components=n_components)
                P = svd.fit_transform(data_train)
                Q = svd.components_
                acc, loss = evaluate(P, Q, test_indices, mean=mean)
                print("Elapsed time: {:.1f}s".format(time.time() - start))
                print("loss: {:.4f} - acc: {:.4f}".format(loss, acc))
                components_loss_path[n, k] = loss
        mean_loss = np.mean(components_loss_path, axis=1)
        best_k = components[np.argmin(mean_loss)]
        best_loss = np.amin(mean_loss)
        print("best k: {}, best loss: {:.4f}".format(best_k, best_loss))
    else:
        print("Performing cross validation...")
        mean_acc = 0.0
        mean_loss = 0.0
        for k, (train_index, test_index) in enumerate(kf.split(rating_indices)):
            mean = None
            print("Fold {}".format(k))
            test_indices = rating_indices[test_index]
            test_indices = (test_indices[:, 0], test_indices[:, 1],
                            test_indices[:, 2])
            if demean:
                print("De-mean training data...")
                train_indices = rating_indices[train_index]
                mean = np.mean(train_indices[:, 2])
                train_indices = (train_indices[:, 0], train_indices[:, 1],
                                 train_indices[:, 2] - mean)
                data_train = scipy.sparse.csr_matrix(
                    (train_indices[2], (train_indices[0], train_indices[1])),
                    shape=(n_users, n_items))
                print("Finished de-meaning.")
            else:
                user_test_indices, item_test_indices = (test_indices[0],
                                                        test_indices[1])
                data_train = scipy.sparse.lil_matrix(ratings)
                data_train[user_test_indices, item_test_indices] = 0
                data_train = scipy.sparse.csr_matrix(data_train)
            start = time.time()
            print("fitting...")
            svd = TruncatedSVD(n_components=n_components)
            P = svd.fit_transform(data_train)
            Q = svd.components_
            acc, loss = evaluate(P, Q, test_indices, mean=mean)
            print("Elapsed time: {:.4f}".format(time.time() - start))
            print("loss: {:.4f} - acc: {:.4f}".format(loss, acc))
            mean_acc = (mean_acc * k + acc) / (k + 1)
            mean_loss = (mean_loss * k + loss) / (k + 1)
        print("mean loss: {:.4f} - mean acc: {:.4f}".format(
            mean_loss, mean_acc))
Example #32
if __name__ == "__main__":
    print 'loading x_tr...'
    t0 = time.time()
    x_tr = load_csr_matrix_from_npz('../data/processed/tf_idf_transformation/train/matrix.npz')
    print 'loading finished, time = {0}'.format(time.time()-t0)

    print 'loading y_tr...'
    t0 = time.time()
    y_tr = numpy.loadtxt('../data/processed/tf_idf_transformation/train/labels.csv', dtype='int')
    print 'loading finished, time = {0}'.format(time.time()-t0)
 
    print 'running TruncatedSVD...'
    t0 = time.time()
    from sklearn.decomposition import TruncatedSVD
    svd = TruncatedSVD(n_components=100)
    x_tr_new = svd.fit_transform(x_tr, y_tr)
    print 'running TruncatedSVD finished, x_new.shape = {0}, time = {1}'.format(x_tr_new.shape, time.time()-t0)
  
    #delete x_tr
    del x_tr

    print 'fitting model...'
    t0 = time.time()
    from sklearn.multiclass import OneVsRestClassifier
    from sklearn.linear_model import LogisticRegression
    clf = OneVsRestClassifier(LogisticRegression())
    clf.fit(x_tr_new, y_tr)
    print 'fitting finished, time = {0}'.format(time.time()-t0)

    #delete x_tr_new, y_tr
#Load train data
X_origin = pd.read_csv("train_isot.csv", ",")
Y = X_origin['label'].values
X_origin = X_origin['text'].values
print("Train set read.")

stopwords = set(ENGLISH_STOP_WORDS)

svm_vectorizer = TfidfVectorizer(sublinear_tf=True,
                                 max_df=0.73,
                                 stop_words=stopwords)
X = svm_vectorizer.fit_transform(X_origin)

print("Vectorized.")

svd = TruncatedSVD(n_components=200, algorithm='arpack', random_state=42)
print("SVD prepared.")
X = svd.fit_transform(X)

print("SVD finished.")
# tprs = []
# aucs = []
# mean_fpr = np.linspace(0, 1, 100)

# fig, ax = plt.subplots()
score_f = 0
score_a = 0

kf = KFold(n_splits=5, random_state=42, shuffle=True)
for i, (train, test) in enumerate(kf.split(X)):
    X_train = X[train]
Example #34
                print "generate %s feat" % feat_name
                # tfidf
                tfv = getTFV(ngram_range=ngram_range)
                X_tfidf_train = tfv.fit_transform(
                    dfTrain.iloc[trainInd][column_name])
                X_tfidf_valid = tfv.transform(
                    dfTrain.iloc[validInd][column_name])
                with open("%s/train.%s.feat.pkl" % (path, feat_name),
                          "wb") as f:
                    cPickle.dump(X_tfidf_train, f, -1)
                with open("%s/valid.%s.feat.pkl" % (path, feat_name),
                          "wb") as f:
                    cPickle.dump(X_tfidf_valid, f, -1)

                # svd
                svd = TruncatedSVD(n_components=svd_n_components, n_iter=15)
                X_svd_train = svd.fit_transform(X_tfidf_train)
                X_svd_test = svd.transform(X_tfidf_valid)
                with open(
                        "%s/train.%s_individual_svd%d.feat.pkl" %
                    (path, feat_name, svd_n_components), "wb") as f:
                    cPickle.dump(X_svd_train, f, -1)
                with open(
                        "%s/valid.%s_individual_svd%d.feat.pkl" %
                    (path, feat_name, svd_n_components), "wb") as f:
                    cPickle.dump(X_svd_test, f, -1)

    print("Done.")

    # Re-training
    print("For training and testing...")
def gen_plan_feas(data):
    n = data.shape[0]
    mode_list_feas = np.zeros((n, 12))
    max_dist, min_dist, mean_dist, std_dist = (
        np.zeros((n, )), np.zeros((n, )), np.zeros((n, )), np.zeros((n, )))

    max_price, min_price, mean_price, std_price = (
        np.zeros((n, )), np.zeros((n, )), np.zeros((n, )), np.zeros((n, )))

    max_eta, min_eta, mean_eta, std_eta = (
        np.zeros((n, )), np.zeros((n, )), np.zeros((n, )), np.zeros((n, )))

    (min_dist_mode, max_dist_mode, min_price_mode, max_price_mode,
     min_eta_mode, max_eta_mode, first_mode) = (
         np.zeros((n, )), np.zeros((n, )), np.zeros((n, )), np.zeros((n, )),
         np.zeros((n, )), np.zeros((n, )), np.zeros((n, )))
    mode_texts = []
    for i, plan in tqdm(enumerate(data['plans'].values)):
        try:
            cur_plan_list = json.loads(plan)
        except:
            cur_plan_list = []
        if len(cur_plan_list) == 0:
            mode_list_feas[i, 0] = 1
            first_mode[i] = 0

            max_dist[i] = -1
            min_dist[i] = -1
            mean_dist[i] = -1
            std_dist[i] = -1

            max_price[i] = -1
            min_price[i] = -1
            mean_price[i] = -1
            std_price[i] = -1

            max_eta[i] = -1
            min_eta[i] = -1
            mean_eta[i] = -1
            std_eta[i] = -1

            min_dist_mode[i] = -1
            max_dist_mode[i] = -1
            min_price_mode[i] = -1
            max_price_mode[i] = -1
            min_eta_mode[i] = -1
            max_eta_mode[i] = -1

            mode_texts.append('word_null')
        else:
            distance_list = []
            price_list = []
            eta_list = []
            mode_list = []
            for tmp_dit in cur_plan_list:
                distance_list.append(int(tmp_dit['distance']))
                if tmp_dit['price'] == '':
                    price_list.append(0)
                else:
                    price_list.append(int(tmp_dit['price']))
                eta_list.append(int(tmp_dit['eta']))
                mode_list.append(int(tmp_dit['transport_mode']))
            mode_texts.append(' '.join(
                ['word_{}'.format(mode) for mode in mode_list]))
            distance_list = np.array(distance_list)
            price_list = np.array(price_list)
            eta_list = np.array(eta_list)
            mode_list = np.array(mode_list, dtype='int')
            mode_list_feas[i, mode_list] = 1
            distance_sort_idx = np.argsort(distance_list)
            price_sort_idx = np.argsort(price_list)
            eta_sort_idx = np.argsort(eta_list)

            max_dist[i] = distance_list[distance_sort_idx[-1]]
            min_dist[i] = distance_list[distance_sort_idx[0]]
            mean_dist[i] = np.mean(distance_list)
            std_dist[i] = np.std(distance_list)

            max_price[i] = price_list[price_sort_idx[-1]]
            min_price[i] = price_list[price_sort_idx[0]]
            mean_price[i] = np.mean(price_list)
            std_price[i] = np.std(price_list)

            max_eta[i] = eta_list[eta_sort_idx[-1]]
            min_eta[i] = eta_list[eta_sort_idx[0]]
            mean_eta[i] = np.mean(eta_list)
            std_eta[i] = np.std(eta_list)

            first_mode[i] = mode_list[0]
            max_dist_mode[i] = mode_list[distance_sort_idx[-1]]
            min_dist_mode[i] = mode_list[distance_sort_idx[0]]

            max_price_mode[i] = mode_list[price_sort_idx[-1]]
            min_price_mode[i] = mode_list[price_sort_idx[0]]

            max_eta_mode[i] = mode_list[eta_sort_idx[-1]]
            min_eta_mode[i] = mode_list[eta_sort_idx[0]]

    feature_data = pd.DataFrame(mode_list_feas)
    feature_data.columns = ['mode_feas_{}'.format(i) for i in range(12)]
    feature_data['max_dist'] = max_dist
    feature_data['min_dist'] = min_dist
    feature_data['mean_dist'] = mean_dist
    feature_data['std_dist'] = std_dist

    feature_data['max_price'] = max_price
    feature_data['min_price'] = min_price
    feature_data['mean_price'] = mean_price
    feature_data['std_price'] = std_price

    feature_data['max_eta'] = max_eta
    feature_data['min_eta'] = min_eta
    feature_data['mean_eta'] = mean_eta
    feature_data['std_eta'] = std_eta

    feature_data['max_dist_mode'] = max_dist_mode
    feature_data['min_dist_mode'] = min_dist_mode
    feature_data['max_price_mode'] = max_price_mode
    feature_data['min_price_mode'] = min_price_mode
    feature_data['max_eta_mode'] = max_eta_mode
    feature_data['min_eta_mode'] = min_eta_mode
    feature_data['first_mode'] = first_mode
    print('mode tfidf...')
    tfidf_enc = TfidfVectorizer(ngram_range=(1, 2))
    tfidf_vec = tfidf_enc.fit_transform(mode_texts)
    svd_enc = TruncatedSVD(n_components=10, n_iter=20, random_state=2019)
    mode_svd = svd_enc.fit_transform(tfidf_vec)
    mode_svd = pd.DataFrame(mode_svd)
    mode_svd.columns = ['svd_mode_{}'.format(i) for i in range(10)]

    data = pd.concat([data, feature_data, mode_svd], axis=1)
    data = data.drop(['plans'], axis=1)
    return data
Example #36
    def hac(self, n_clusters, verbose=False, tfidf=None, n_dimensions=None):
        """ Apply Hierarchical Agglomerative Clustering on a document collection.

        This method generates a hierarchical clustering tree for the collection.
        The leaves of the tree are clusters consisting of single documents.
        The tree is then saved by saving the list of merges in a file.

        Each entry of this list contains the two tree nodes that were merged to
        create a new node and the new node's id. Node ids less than the number
        of leaves represent leaves, while node ids greater than the number of
        leaves indicate internal nodes.

        Args:
            n_clusters (int): The number of clusters at which to cut the tree.
            tfidf: A pre-computed Tf/Idf matrix. Defaults to None, in which
                case the Tf/Idf matrix is calculated from self.corpus.
            n_dimensions (int): Number of dimensions to keep after latent
                semantic analysis. Defaults to None, in which case no LSA
                step is applied.
            verbose (bool): When True additional information will be printed.
                Defaults to False.

        Returns:
            hac_model (:obj:'AgglomerativeClustering'): The HAC model fitted on
            the document collection.

        """
        # Compute or load Tf/Idf matrix.
        if tfidf is None:
            tfidf = self.extract_tfidf(self.corpus)
            print(tfidf.shape)

        # Apply latent semantic analysis.
        if n_dimensions is not None:
            print('Performing latent semantic analysis')
            svd = TruncatedSVD(n_dimensions)
            # Normalize SVD results for better clustering results.
            lsa = make_pipeline(svd, Normalizer(copy=False))
            tfidf = lsa.fit_transform(tfidf)

            print(tfidf.shape)

        # Calculate document distance matrix from the Tf/Idf matrix
        print('Constructing distance matrix...')
        dist = 1 - cosine_similarity(tfidf)

        start_time = time.time()
        print('Clustering...')
        # Generate HAC model.
        hac_model = AgglomerativeClustering(linkage='ward',
                                            n_clusters=n_clusters)
        # Fit the model on the distance matrix.
        hac_model.fit(dist)
        end_time = time.time()
        pickle.dump(hac_model, open('hac.pkl', 'wb'))

        if verbose:
            # Visualize cluster model
            children = hac_model.children_
            merges = [{
                'node_id': node_id + len(dist),
                'right': children[node_id, 0],
                'left': children[node_id, 1]
            } for node_id in range(0, len(children))]
            pickle.dump(merges, open('merges.pkl', 'wb'))
            pickle.dump(children, open('children.pkl', 'wb'))

            for merge_entry in enumerate(merges):
                print(merge_entry[1])

        print('Clustering completed after ' +
              str(round((end_time - start_time) / 60)) + "' " +
              str(round((end_time - start_time) % 60)) + "''")
        return hac_model
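
# The merge-list convention in the docstring above follows directly from scikit-learn's
# children_ attribute; a tiny sketch (toy 1-D points, invented here) showing that node
# ids below the number of leaves are single documents and higher ids are earlier merges:
import numpy as np
from sklearn.cluster import AgglomerativeClustering

toy_points = np.array([[0.0], [0.1], [5.0], [5.1]])
toy_model = AgglomerativeClustering(linkage='ward', n_clusters=2).fit(toy_points)

toy_n_leaves = toy_points.shape[0]
# children_[k] holds the two nodes merged to create node toy_n_leaves + k.
for k, (left, right) in enumerate(toy_model.children_):
    print("merge {}: {} + {} -> node {}".format(k, left, right, toy_n_leaves + k))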
Example #37
    def kmeans(self, n_clusters, tfidf=None, n_dimensions=None, verbose=False):
        """ Applies kmeans clustering on a document collection.

        Args:
            n_clusters (int): The number of clusters.
            tfidf: A pre-computed Tf/Idf matrix. Defaults to None, in which
                case the Tf/Idf matrix is calculated from self.corpus.
            n_dimensions (int): Number of dimensions to keep after latent
                semantic analysis. Defaults to None, in which case no LSA
                step is applied.
            verbose (bool): When True additional information will be printed.
                Defaults to False.

        Returns:
            kmodel (:obj:'Kmeans'): Scikit KMeans clustering model.

        """

        # Compute or load Tf/Idf matrix.
        if tfidf is None:
            tfidf = self.extract_tfidf(self.corpus)
            print(tfidf.shape)

        # Apply latent semantic analysis.
        if n_dimensions is not None:
            print('Performing latent semantic analysis...')
            svd = TruncatedSVD(n_dimensions)
            # Normalize SVD results for better clustering results.
            lsa = make_pipeline(svd, Normalizer(copy=False))
            tfidf = lsa.fit_transform(tfidf)
            print(tfidf.shape)

        # Do the clustering.
        start_time = time.time()
        print('Clustering...')
        kmodel = MiniBatchKMeans(n_clusters=n_clusters,
                                 init='k-means++',
                                 n_init=1,
                                 max_iter=10,
                                 verbose=True)
        kmodel.fit(tfidf)
        end_time = time.time()

        # Create a matching of the clusters and the ids of the documents
        # they contain.
        cluster_doc = pd.Series()
        for i in range(kmodel.n_clusters):
            ids = []
            for docid, cluster in enumerate(kmodel.labels_):
                if cluster == i:
                    ids.append(docid)
                    cluster_doc.loc[i] = ids

        # Initialised here so the pickle dump below also works when verbose is False.
        cluster_word = pd.Series()
        if verbose:
            # Print some info.
            print("Top terms per cluster:")
            if n_dimensions is not None:
                original_space_centroids = svd.inverse_transform(
                    kmodel.cluster_centers_)
                order_centroids = original_space_centroids.argsort()[:, ::-1]
            else:
                order_centroids = kmodel.cluster_centers_.argsort()[:, ::-1]

            features = pickle.load(open('features.pkl', 'rb'))
            cluster_word = pd.Series()
            for i in range(n_clusters):
                cluster_features = []
                print("Cluster %d:" % i)
                for ind in order_centroids[i, :100]:
                    cluster_features.append(features[ind])
                cluster_word.loc[i] = cluster_features

        pickle.dump(kmodel, open('kmodel.pkl', 'wb'))
        pickle.dump(kmodel.cluster_centers_, open('centers.pkl', 'wb'))
        pickle.dump(cluster_doc, open('cluster_doc.pkl', 'wb'))
        pickle.dump(cluster_word, open('cluster_word.pkl', 'wb'))

        print('Clustering completed after ' +
              str(round((end_time - start_time) / 60)) + "' " +
              str(round((end_time - start_time) % 60)) + "''")

        return kmodel
Example #38
]

dataset = fetch_20newsgroups(subset='all',
                             categories=categories,
                             shuffle=True,
                             random_state=42)
labels_true = dataset.target
true_k = np.unique(labels_true).shape[0]
# t0 = time()
vectorizer = TfidfVectorizer(max_df=0.5,
                             max_features=10000,
                             min_df=2,
                             stop_words='english',
                             use_idf=True)
X = vectorizer.fit_transform(dataset.data)
svd = TruncatedSVD(2)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
X = lsa.fit_transform(X)
# explained_variance = svd.explained_variance_ratio_.sum()

Wardhierarchial = AgglomerativeClustering(affinity='euclidean',
                                          compute_full_tree='auto',
                                          connectivity=None,
                                          linkage='ward',
                                          memory=None,
                                          n_clusters=2,
                                          pooling_func='deprecated').fit(X)
labels = Wardhierarchial.labels_
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
Example #39
    d2 = pdt_ttl_vec[i, :]
    dst_srch_ttl1[i] = cosine_similarity(d1, d2)

dst_srch_desc1 = np.zeros(srch_vec.shape[0])
for i in range(srch_vec.shape[0]):
    d1 = srch_vec[i, :]
    d2 = pdt_desc_vec[i, :]
    dst_srch_desc1[i] = cosine_similarity(d1, d2)

dst_ttl_desc1 = np.zeros(srch_vec.shape[0])
for i in range(srch_vec.shape[0]):
    d1 = pdt_ttl_vec[i, :]
    d2 = pdt_desc_vec[i, :]
    dst_ttl_desc1[i] = cosine_similarity(d1, d2)

svd = TruncatedSVD(n_components=30, random_state=2016)

srch_vec = svd.fit_transform(srch_vec)
pdt_ttl_vec = svd.fit_transform(pdt_ttl_vec)
pdt_desc_vec = svd.fit_transform(pdt_desc_vec)

srch_vec = pd.DataFrame(
    srch_vec, columns=['srch_vec_' + str(i) for i in range(srch_vec.shape[1])])
pdt_ttl_vec = pd.DataFrame(
    pdt_ttl_vec,
    columns=['ttl_vec_' + str(i) for i in range(pdt_ttl_vec.shape[1])])
pdt_desc_vec = pd.DataFrame(
    pdt_desc_vec,
    columns=['desc_vec_' + str(i) for i in range(pdt_desc_vec.shape[1])])

id = list(df_all['id'])
Example #40
                transformer_list=[

                    # Pipeline for pulling features from the post's title line
                    ('title',
                     Pipeline([
                         ('selector', ItemSelector(key='title')),
                         ('tfidf',
                          TfidfVectorizer(min_df=50, stop_words='english')),
                     ])),

                    # Pipeline for standard bag-of-words model for abstract
                    ('abstract_bow',
                     Pipeline([
                         ('selector', ItemSelector(key='abstract')),
                         ('tfidf', TfidfVectorizer(stop_words='english')),
                         ('best', TruncatedSVD(n_components=50)),
                     ])),

                    # Pipeline for pulling ad hoc features from post's abstract
                    (
                        'abstract_stats',
                        Pipeline([
                            ('selector', ItemSelector(key='abstract')),
                            ('stats', TextStats()),  # returns a list of dicts
                            ('vect', DictVectorizer()
                             ),  # list of dicts -> feature matrix
                        ])),
                ],

                # weight components in FeatureUnion
                transformer_weights={
le = preprocessing.LabelEncoder()
le.fit(df["Category"])
Y_train=le.transform(df["Category"])
X_train1=df['Content'] 
X_train2=[]
for i in range(len(X_train1)):
	X_train2.append(10*df['Title'][i]+df['Content'][i])

X_train=np.array(X_train2)

#read test file
df_test=pd.read_csv("test_set.csv",sep="\t")

vectorizer=CountVectorizer(stop_words='english')
transformer=TfidfTransformer()
svd=TruncatedSVD(n_components=200, random_state=42) 
pipeline_test = Pipeline([
    ('vect', vectorizer),
    ('tfidf', transformer),
    ('svd',svd),
])
#My method---Voting Classifier
clf1 = BernoulliNB(fit_prior=False)
clf2 = KNeighborsClassifier(weights='distance',n_jobs=-1)
clf3 = RandomForestClassifier(n_estimators=500,n_jobs=-1)
clf = VotingClassifier(estimators=[('bnb',clf1),('knn',clf2),('rf',clf3)], voting='hard')
pipeline = Pipeline([
    ('vect', vectorizer),
    ('tfidf', transformer),
    ('svd',svd),
    ('clf', clf)
# print titles

for sentence in test_data['Content']:
    temp_title = ''

    for j in range(10):
        temp_title = titles2[i] + ' ' + temp_title

    sentences2.append(temp_title + PorterStemmer().stem_sentence(sentence))
    i = i + 1

#Vectorizing-LSI-Classifier
X_train = np.array(sentences)
X_test = np.array(sentences2)
clf = Pipeline([('tfidf', TfidfVectorizer(stop_words=stopw)),\
                ('svd' , TruncatedSVD(n_components=1000) ),\
                ('clf', svm.SVC(C=10, gamma = 0.0001, kernel= 'linear', class_weight='balanced')),
               ])

clf.fit(X_train, y)
predicted = clf.predict(X_test)

#Print Results
categories = le.inverse_transform(predicted)

i = 0
CsvData2 = [['Id', 'Category']]

for t in test_data['Id']:
    CsvData2.append([t, categories[i]])
    i = i + 1
Example #43
def svd(*args, **kwargs):
    return TruncatedSVD(*args, **kwargs)
Example #44
    def test_pipeline_column_transformer(self):

        iris = datasets.load_iris()
        X = iris.data[:, :3]
        y = iris.target
        X_train = pandas.DataFrame(X, columns=["vA", "vB", "vC"])
        X_train["vcat"] = X_train["vA"].apply(lambda x: "cat1"
                                              if x > 0.5 else "cat2")
        X_train["vcat2"] = X_train["vB"].apply(lambda x: "cat3"
                                               if x > 0.5 else "cat4")
        y_train = y % 2
        numeric_features = [0, 1, 2]  # ["vA", "vB", "vC"]
        categorical_features = [3, 4]  # ["vcat", "vcat2"]

        classifier = LogisticRegression(
            C=0.01,
            class_weight=dict(zip([False, True], [0.2, 0.8])),
            n_jobs=1,
            max_iter=10,
            solver="lbfgs",
            tol=1e-3,
        )

        numeric_transformer = Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler()),
        ])

        categorical_transformer = Pipeline(steps=[
            (
                "onehot",
                OneHotEncoder(sparse=True, handle_unknown="ignore"),
            ),
            (
                "tsvd",
                TruncatedSVD(n_components=1, algorithm="arpack", tol=1e-4),
            ),
        ])

        preprocessor = ColumnTransformer(transformers=[
            ("num", numeric_transformer, numeric_features),
            ("cat", categorical_transformer, categorical_features),
        ])

        model = Pipeline(steps=[("preprocessor", preprocessor),
                                ("classifier", classifier)])

        model.fit(X_train, y_train)
        initial_type = [
            ("numfeat", FloatTensorType([None, 3])),
            ("strfeat", StringTensorType([None, 2])),
        ]

        X_train = X_train[:11]
        model_onnx = convert_sklearn(model, initial_types=initial_type)

        dump_data_and_model(
            X_train,
            model,
            model_onnx,
            basename="SklearnPipelineColumnTransformerPipeliner",
            allow_failure="StrictVersion(onnx.__version__)"
            " < StrictVersion('1.3') or "
            "StrictVersion(onnxruntime.__version__)"
            " <= StrictVersion('0.4.0')",
        )

        if __name__ == "__main__":
            from onnx.tools.net_drawer import GetPydotGraph, GetOpNodeProducer

            pydot_graph = GetPydotGraph(
                model_onnx.graph,
                name=model_onnx.graph.name,
                rankdir="TP",
                node_producer=GetOpNodeProducer("docstring"),
            )
            pydot_graph.write_dot("graph.dot")

            import os

            os.system("dot -O -G=300 -Tpng graph.dot")
    'adenoma', 'neurocitoma', 'cervello', 'glioma', 'glioblastoma', 'glia',
    'lipoma', 'liposarcoma', 'adiposo'
]

pairs = {
    'b': [('fibroma', 'fibrosarcoma'), ('lipoma', 'liposarcoma'),
          ('osteoma', 'osteosarcoma'), ('papilloma', 'carcinoma'),
          ('adenoma', 'adenocarcinoma'), ('glioma', 'glioblastoma'),
          ('neurocitoma', 'neuroblastoma')],
    'r': [('fibrosarcoma', 'connettivo'), ('liposarcoma', 'adiposo'),
          ('linfoma', 'linfonodo'), ('osteosarcoma', 'osso'),
          ('mesotelioma', 'mesotelio'), ('glioblastoma', 'glia'),
          ('neuroblastoma', 'neuroni')]
}

proj = Projector(n_components=2, random_state=0)
np.set_printoptions(suppress=True)
Y = proj.fit_transform(vectors[[index[k] for k in subset], :])
labels = words[[index[k] for k in subset]]

plt.scatter(Y[:, 0], Y[:, 1])
for label, x, y in zip(labels, Y[:, 0], Y[:, 1]):
    plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')

for color in pairs:
    for pair in pairs[color]:
        plt.plot([Y[subset.index(pair[0]), 0], Y[subset.index(pair[1]), 0]],
                 [Y[subset.index(pair[0]), 1], Y[subset.index(pair[1]), 1]],
                 '-',
                 color=color)
Example #46
0
# Transposing the matrix

X = ratings_matrix.T
X.head()

# X = ratings_matrix
# X.head()
X.shape


X1 = X

#Decomposing the Matrix

SVD = TruncatedSVD(n_components=10)
decomposed_matrix = SVD.fit_transform(X)
decomposed_matrix.shape

#Correlation Matrix

correlation_matrix = np.corrcoef(decomposed_matrix)
correlation_matrix.shape

X.index[75]

# Index of product ID purchased by customer

i = "B00000K135"

product_names = list(X.index)
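# The example is truncated here; a hedged sketch of the usual next step, using the
# correlation matrix to suggest products similar to item i = "B00000K135"
# (the 0.90 threshold is an illustrative assumption).
product_idx = product_names.index(i)
similar_mask = correlation_matrix[product_idx] > 0.90
recommendations = [p for p, keep in zip(product_names, similar_mask) if keep and p != i]
print(recommendations[:10])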
    
'''
    Modeling:
        
        - TF-IDF
        - SVD
        - Visualization
'''

# TF-IDF
vec = TfidfVectorizer(max_features = 1000, max_df = 0.5, smooth_idf = True)

x = vec.fit_transform(joined)

# SVD
svd = TruncatedSVD(n_components = 20, algorithm = 'randomized', n_iter = 100, random_state = 100)

svd.fit(x)

# Labels
terms = vec.get_feature_names()

for i, comp in enumerate(svd.components_):
    
    terms_comp = zip(terms, comp)
    sorted_terms = sorted(terms_comp, key = lambda x:x[1], reverse = True)[:7]
    
    print("Topic "+ str(i) + ": ", [t[0] for t in sorted_terms])
    
# Visualize
topics = svd.fit_transform(x)
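# The "# Visualize" step stops at the document-topic matrix; one common follow-up,
# sketched here, is to scatter the documents on the first two SVD components
# (the matplotlib import is an assumption; it is not shown in this excerpt).
import matplotlib.pyplot as plt

plt.scatter(topics[:, 0], topics[:, 1], s=5)
plt.xlabel('SVD component 1')
plt.ylabel('SVD component 2')
plt.title('Documents projected onto the first two LSA components')
plt.show()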
Example #48
0
    def post(self):

        # Get the THEME labels
        abs_filename = ett_h.generate_dynamic_path(
            [base_folder_location, LabelType.THEME.value, label_file_name])
        labels = (ett_h.load_data_common_separated(abs_filename, ','))
        # Get the label data from input_data
        raw_label = TrainThemeUpload.input_data[ColumnName.LABEL.value]
        data = ett_t.transform_data_to_dataframe_basic(
            TrainThemeUpload.input_data, colnames)
        # Get the OneHotEncoded labels
        label_df = ett_t.one_hot_encoding(raw_label)  #17 labels dataframe
        # Rename the OneHotEncoded labels
        label_df.columns = labels
        # Get the number of labels
        num_of_labels = len(labels)
        # Data preprocessing
        nan_cleaned_data = ett_c.clean_dataframe_by_regex(
            data, RegexFilter.NON_ALPHA_NUMERIC.value
        )  # Removed all non alphanumeric characters
        d_cleaned_data = ett_c.clean_dataframe_by_regex(
            nan_cleaned_data,
            RegexFilter.DIGITS_ONLY.value)  # Removed all digits
        l_cleaned_data = ett_c.remove_non_iso_words(
            d_cleaned_data, Language.ENGLISH.value)  # Remove non-English text
        rew_cleaned_data = ett_c.remove_language_stopwords(
            l_cleaned_data, Language.ENGLISH.name)  # Remove English stop words
        l_transformed_data = ett_t.lowercase(
            rew_cleaned_data)  # Transform text to lowercase
        le_transformed_data = ett_t.stemming_mp(
            l_transformed_data
        )  # Transform text to core words i.e. playing > play
        data = le_transformed_data  # Return the newly transformed data

        # Split the data into 0.8 training datasets and 0.2 testing datasets
        X_train, X_test, y_train, y_test = train_test_split(data,
                                                            label_df,
                                                            test_size=0.2,
                                                            random_state=42)
        endpoint_output = {}
        for i in range(num_of_labels):
            model_id = str(i)
            single_label = y_train.iloc[:, i]
            label = labels[i]
            print("label", label)
            pipeline = imbPipeline([
                (ModelType.TFIDF.value,
                 TfidfVectorizer()),  # Data vectorization
                (ModelType.OVERSAMPLE.value,
                 SMOTE(random_state=42)),  # Data balancing
                (ModelType.SVD.value, TruncatedSVD()),  # Feature selection
                (ModelType.NOR.value,
                 preprocessing.MinMaxScaler()),  # Data normalization
                (ModelType.CLF.value, OneVsRestClassifier(SVC()))
            ])  # Classification

            #list_c = [.1, .2, .3, .4, .5, .6, .7, .8, .9, 1]
            list_c = [1]

            #list_n = [100, 150, 200, 250, 300, 350, 400, 450, 500, 550])
            list_n = [100]
            best_score = 0
            epsilon = .005
            dictionary = {}

            for para_c in list_c:
                for para_n in list_n:
                    parameters = {
                        ModelType.TFIDF.value: [
                            TfidfVectorizer(max_features=800,
                                            ngram_range=(1, 4),
                                            norm='l2',
                                            encoding='latin-1',
                                            stop_words='english',
                                            analyzer='word')
                        ],
                        ModelType.SVD.value: [
                            TruncatedSVD(n_components=para_n,
                                         n_iter=7,
                                         random_state=42)
                        ],
                        ModelType.CLF.value: [
                            OneVsRestClassifier(
                                SVC(kernel='linear',
                                    probability=True,
                                    C=para_c))
                        ]
                    }
                    gs_clf = GridSearchCV(pipeline,
                                          parameters,
                                          cv=5,
                                          error_score='raise',
                                          scoring='f1')
                    gs_clf = gs_clf.fit(X_train, single_label)
                    current_score = gs_clf.best_score_
                    dictionary[current_score] = parameters

            for current_score in dictionary.keys():
                if current_score - epsilon > best_score:
                    best_score = current_score

            model_dict = dictionary[best_score]

            label_model_list = {}
            label_model_list['score'] = best_score

            folder_time = time.strftime("_%Y%m%d_%H%M")
            # Create a local directory for this label's models (the AWS S3 upload below is commented out)
            os.mkdir("/Users/yihanbao/Desktop/unisdr-training/theme/" + label +
                     "/" + label + folder_time)
            # Navigate to AWS model saving folder
            model_folder = os.path.join(
                os.path.dirname(
                    os.path.dirname(
                        os.path.dirname(
                            os.path.dirname(os.path.realpath(__file__))))),
                ett_h.generate_dynamic_path(
                    [LabelType.THEME.value, label, label + folder_time]))
            """
            # Connect to AWS
            conn = boto.s3.connect_to_region(" ",aws_access_key_id = 'AWS-Access-Key', aws_secret_access_key = 'AWS-Secrete-Key',
                                 calling_format = boto.s3.connection.OrdinaryCallingFormat())

            bucket = conn.get_bucket("oict-psdg-unisdr-train-models-v1")
        
            # AWS Key 
            aws_path = ett_h.generate_dynamic_path([LabelType.THEME.value, label, timestamp+label])
            """
            # Here to fit the training datasets to the  models with best score
            # vectorization
            vector = model_dict[ModelType.TFIDF.value][0].fit(
                X_train, single_label)
            ett_h.save_model(
                vector,
                ett_h.generate_dynamic_path(
                    [model_folder, label + folder_time + vector_model_name]))
            vectorized_df = vector.transform(X_train)
            label_model_list[
                URLName.VECURL.value] = ett_h.generate_dynamic_path(
                    [model_folder, label + folder_time + vector_model_name])
            """
            key_name = timestamp+label+model_name
            full_key_name = os.path.join(path, key_name)
            
            pickle_byte_obj = pickle.dump(vector) 
            s3_resource = resource('s3')
            s3_resource.Object(bucket,full_key_name).put(Body=pickle_byte_obj)
            """
            # Balancing
            sm = SMOTE(random_state=42)
            X_res, y_res = sm.fit_resample(vectorized_df, single_label)

            # Feature selection
            svd = model_dict[ModelType.SVD.value][0].fit(X_res, y_res)
            ett_h.save_model(
                svd,
                ett_h.generate_dynamic_path([
                    model_folder, label + folder_time + dim_reductor_model_name
                ]))
            dim_reductor_df = svd.transform(X_res)
            label_model_list[
                URLName.DIMURL.value] = ett_h.generate_dynamic_path([
                    model_folder, label + folder_time + dim_reductor_model_name
                ])
            """
            key_name = timestamp+label+dim_reductor_model_name
            full_key_name = os.path.join(path, key_name)
            
            pickle_byte_obj = pickle.dump(svd) 
            s3_resource = resource('s3')
            s3_resource.Object(bucket,full_key_name).put(Body=pickle_byte_obj)
            """

            # Normalizing
            min_max_scaler = preprocessing.MinMaxScaler()
            nor_model = min_max_scaler.fit(dim_reductor_df, y_res)
            ett_h.save_model(
                nor_model,
                ett_h.generate_dynamic_path([
                    model_folder, label + folder_time + normalizar_model_name
                ]))
            scaled_df = nor_model.transform(dim_reductor_df)
            label_model_list[
                URLName.NORURL.value] = ett_h.generate_dynamic_path([
                    model_folder, label + folder_time + normalizar_model_name
                ])
            """
            key_name = timestamp+label+normalizar_model_name
            full_key_name = os.path.join(path, key_name)
            
            pickle_byte_obj = pickle.dump(nor_model) 
            s3_resource = resource('s3')
            s3_resource.Object(bucket,full_key_name).put(Body=pickle_byte_obj)
            """

            # Classifier
            clf = model_dict[ModelType.CLF.value][0].fit(scaled_df, y_res)
            ett_h.save_model(
                clf,
                ett_h.generate_dynamic_path(
                    [model_folder, label + folder_time + model_name]))
            label_model_list[
                URLName.MODURL.value] = ett_h.generate_dynamic_path(
                    [model_folder, label + folder_time + model_name])
            """
            key_name = timestamp+label+model_name
            full_key_name = os.path.join(path, key_name)
            
            pickle_byte_obj = pickle.dump(scaled_df) 
            s3_resource = resource('s3')
            s3_resource.Object(bucket,full_key_name).put(Body=pickle_byte_obj)
            """
            endpoint_output[model_id] = [label_model_list]
        output = json.dumps(endpoint_output)
        return output
Example #49
0
                'nntp', '00041032', '000062david42', '000050', '00041555', '0004244402', 'mcimail', '00043819',
                'prb', '0004246', '0004422', '00044513', '00044939','access', 'digex', 'host', 'would', 'writes',
                'posting', 'dseg'])


# In[5]:

vectorizer = TfidfVectorizer(stop_words=stopset,
                                use_idf=True, ngram_range = (1, 3))
X = vectorizer.fit_transform(corpus)


# In[6]:

# decompose X into U * S * V^T (truncated SVD / LSA)
lsa = TruncatedSVD(n_components = 25, n_iter = 100)
lsa.fit(X)


# In[7]:

terms = vectorizer.get_feature_names()
for i, comp in enumerate(lsa.components_):
    termsInComp = zip (terms, comp)
    sortedTerms = sorted(termsInComp, key = lambda x: x[1], reverse = True) [:10]
    print("Concept %d:" % i )
    for term in sortedTerms:
        print(term[0])
    print(" ")

Example #50
0
def svd_vector(data):
    # Reduce the numeric columns (from column 6 onward) to a single SVD component
    # and return it as a flat list, one value per selected column.
    svd = TruncatedSVD(n_components=1)
    vector = svd.fit_transform(data.iloc[:, 6:].transpose())
    return [item for sublist in vector for item in sublist]
if len(fns) > 1:
    print('Multiple merged embeddings in working directory.')
    sys.exit()
else:
    m = fns[0]

print('Reading raw.')
sys.stdout.flush()
df = pd.read_csv(m, index_col=0, header=None)
if df.index.names[0] == 0:
    print('Renaming index column to SampleID.')
    df.index.names = ['SampleID']
    df.to_csv(m, compression='gzip')

mat = df.to_numpy().T
sampids = df.index
del df

print('Performing svd.')
sys.stdout.flush()
svd = TruncatedSVD(n_components=1, n_iter=7, random_state=0)
svd.fit(mat)
pc = svd.components_
mat -= mat.dot(pc.T) * pc

print('Saving nonraw.')
sys.stdout.flush()
df = pd.DataFrame(mat.T, index=sampids)
df.index.names = ['SampleID']
df.to_csv(m.replace('_raw', ''), compression='gzip')
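# A hedged sanity check for the common-component removal above: after subtracting
# the projection onto the first right-singular vector, the data should have
# (numerically) zero remaining projection along that direction.
import numpy as np

residual = np.abs(mat.dot(pc.T)).max()
print('max |projection on removed component|:', residual)  # expected to be ~0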
def do_lsa(X, target_dim):
    # LSA: truncated SVD followed by row-wise L2 normalization.
    svd = TruncatedSVD(target_dim, random_state=42)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    return lsa.fit_transform(X)
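# Minimal usage sketch for do_lsa above, on a toy corpus (purely illustrative).
# Rows of the output are L2-normalized, the usual preparation for cosine-based
# clustering such as k-means.
from sklearn.feature_extraction.text import TfidfVectorizer

toy_docs = ["truncated svd reduces tf idf",
            "normalizer rescales rows",
            "svd plus normalizer is lsa"]
toy_tfidf = TfidfVectorizer().fit_transform(toy_docs)
print(do_lsa(toy_tfidf, target_dim=2).shape)  # (3, 2)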
Example #53
0
def main(argv):
    choose_mindf = argv[1]
    try:
        path = argv[2]
    except:
        path = None
    categories1 = [
        'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
        'comp.sys.mac.hardware', 'rec.autos', 'rec.motorcycles',
        'rec.sport.baseball', 'rec.sport.hockey'
    ]
    categories2 = [
        'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'misc.forsale',
        'soc.religion.christian'
    ]
    cat_all = [
        'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'misc.forsale',
        'soc.religion.christian', 'alt.atheism', 'comp.graphics',
        'comp.os.ms-windows.misc', 'comp.windows.x', 'rec.autos',
        'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey',
        'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space',
        'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc',
        'talk.religion.misc'
    ]
    dclass = Data(categories1, cat_all, categories2, path)
    stop_words = text.ENGLISH_STOP_WORDS

    print('-----Part A-----')
    #plot_histogram(dclass)
    print('-----Part B-----')

    vectorizer2 = CountVectorizer(min_df=2, stop_words=stop_words, max_df=0.8)
    tfidf_transformer2 = TfidfTransformer()

    vectorizer5 = CountVectorizer(min_df=5, stop_words=stop_words, max_df=0.8)
    tfidf_transformer5 = TfidfTransformer()
    tfidf2 = preprocess(dclass,
                        dclass.training_data1,
                        vectorizer2,
                        tfidf_transformer2,
                        train=True)
    tfidf5 = preprocess(dclass,
                        dclass.training_data1,
                        vectorizer5,
                        tfidf_transformer5,
                        train=True)  #default min_df=5

    print('# of terms with min_df = 2:', tfidf2[0, :].toarray().shape[1],
          '\n# of terms with min_df = 5:', tfidf5[0, :].toarray().shape[1])

    d_tfidf = {'2': tfidf2, '5': tfidf5}
    d_vectorizer = {'2': vectorizer2, '5': vectorizer5}
    d_transformer = {'2': tfidf_transformer2, '5': tfidf_transformer5}

    print('-----Part C-----')

    vectorizerc_2 = CountVectorizer(min_df=2,
                                    stop_words=stop_words,
                                    max_df=0.8)
    tfidf_transformerc_2 = TfidfTransformer()

    tfidf_c_2 = preprocess(dclass,
                           dclass.training_data2,
                           vectorizerc_2,
                           tfidf_transformerc_2,
                           train=True,
                           ICF=True)  #default min_df=5, use TF-ICF
    find_10most(dclass, tfidf_c_2)

    vectorizerc_5 = CountVectorizer(min_df=5,
                                    stop_words=stop_words,
                                    max_df=0.8)
    tfidf_transformerc_5 = TfidfTransformer()

    tfidf_c_5 = preprocess(dclass,
                           dclass.training_data2,
                           vectorizerc_5,
                           tfidf_transformerc_5,
                           train=True,
                           ICF=True)  #default min_df=5, use TF-ICF
    find_10most(dclass, tfidf_c_5)

    print('-----Part D-----')  #SVD and NMF base on TF-IDF5 result
    svd = TruncatedSVD(n_components=50, n_iter=7, random_state=42)
    D_LSI = svd.fit_transform(d_tfidf[choose_mindf])
    model = NMF(n_components=50, init='random', random_state=0)
    D_NMF = model.fit_transform(d_tfidf[choose_mindf])
    print('LSI.shape:', D_LSI.shape, '\nNMF.shape:', D_NMF.shape)

    print('-----Part E-----')  #SVM
    tfidftest = preprocess(dclass,
                           dclass.testing_data1,
                           d_vectorizer[choose_mindf],
                           d_transformer[choose_mindf],
                           train=False)  #testing data
    D_LSI_test = svd.transform(tfidftest)
    D_NMF_test = model.transform(tfidftest)
    print('for D_LSI:')
    part_e(dclass, D_LSI, D_LSI_test)
    print('for D_NMF:')
    part_e(dclass, D_NMF, D_NMF_test)

    print('-----Part F-----')
    print('for D_LSI:')
    part_f(dclass, D_LSI, D_LSI_test)
    print('for D_NMF:')
    part_f(dclass, D_NMF, D_NMF_test)

    print('-----Part G-----')
    part_g(dclass, D_NMF, D_NMF_test, dclass.training_target1)

    print('-----Part H-----')
    part_h(dclass, D_LSI, D_LSI_test)
    part_h(dclass, D_NMF, D_NMF_test)

    print('-----Part I-----')
    part_i(dclass, D_LSI, D_LSI_test)
    part_i(dclass, D_NMF, D_NMF_test)

    print('-----Part J-----')

    tfidf2_j = preprocess(dclass,
                          dclass.training_dataj,
                          vectorizer2,
                          tfidf_transformer2,
                          train=True)
    D_LSI_j = svd.fit_transform(tfidf2_j)
    D_NMF_j = model.fit_transform(tfidf2_j)

    tfidftest_j = preprocess(dclass,
                             dclass.testing_dataj,
                             vectorizer2,
                             tfidf_transformer2,
                             train=False)  #testing data
    D_LSI_test_j = svd.transform(tfidftest_j)
    D_NMF_test_j = model.transform(tfidftest_j)

    print('----------------Naive Bayes in J-----------------')
    part_g(dclass, D_NMF_j, D_NMF_test_j, dclass.training_targetj, True)

    print('----------------SVM in J with LSI data-----------')
    part_j_SVM(dclass, D_LSI_j, D_LSI_test_j)

    print('----------------SVM in J with NMF data-----------')
    part_j_SVM(dclass, D_NMF_j, D_NMF_test_j)
Example #54
0
def train_with_bag_of_words(X_train,
                            y_train,
                            scorer,
                            classifier='SVC',
                            search=True):
    """
    Pass the data through a pipeline and return a trained model.

    Args:
        X_train: Train data
        y_train: Labels for the train data (transformed by LabelEncoder)
        scorer: Scoring callable or string passed to GridSearchCV
        classifier: One of 'SVC', 'LogisticRegression',
            'GradientBoostingClassifier' or 'VotingClassifier'
        search: Whether to search for the best hyperparameters
    """

    estimators = {
        'SVC':
        SVC(
            C=5.1,
            kernel='linear',
            decision_function_shape='ovr',
            #class_weight            = 'balanced' # better without 'balanced'
        ),
        'LogisticRegression':
        LogisticRegression(C=5.1, ),
        'GradientBoostingClassifier':
        GradientBoostingClassifier(learning_rate=0.3),
    }

    if classifier != 'VotingClassifier':
        clf = estimators.get(classifier)
    else:
        estimators['SVC'].probability = True
        clf = VotingClassifier(estimators=[(k, v)
                                           for k, v in estimators.items()],
                               voting='soft')

    print(clf)

    pipeline = Pipeline(
        [
            (
                'col_transf',
                ColumnTransformer(
                    [
                        ('scaler', StandardScaler(), [
                            'budget', 'client.feedback',
                            'client.reviews_count', 'client.jobs_posted',
                            'client.past_hires'
                        ]),
                        ('title_vec',
                         Pipeline([
                             ('preprocessor', SpacyPreprocessor()),
                             ('tfidf',
                              TfidfVectorizer(tokenizer=identity,
                                              preprocessor=None,
                                              lowercase=False,
                                              use_idf=True,
                                              ngram_range=(2, 2))),
                             ('svd', TruncatedSVD(n_components=150)),
                         ]), 'title'),
                        (
                            'snippet_vec',
                            Pipeline([
                                ('preprocessor', SpacyPreprocessor()),
                                (
                                    'tfidf',
                                    TfidfVectorizer(
                                        tokenizer=identity,
                                        preprocessor=None,
                                        lowercase=False,
                                        use_idf=True,
                                        sublinear_tf=
                                        False,  # not good results when True
                                        ngram_range=(1, 2))),
                                ('svd', TruncatedSVD(n_components=100)),
                            ]),
                            'snippet'),
                        ('cat', ce.CatBoostEncoder(),
                         ["job_type", 'category2', 'client.country']),
                    ],
                    remainder='drop')),

            #('oversampling', ADASYN(random_state=42)),
            ('classifier', clf),
        ],
        verbose=True)

    if search:

        log_space = gen_parameters_from_log_space(low_value=5,
                                                  high_value=8,
                                                  n_samples=10)

        lin_space = np.arange(2, 8, 2, dtype=int)

        if classifier == 'SVC':
            grid = {
                # 'union__title_vec__tfidf__ngram_range'   : [(1,2), (2,2)],
                # 'union__snippet_vec__tfidf__ngram_range' : [(1,2), (2,2)],
                # 'union__snippet_vec__svd__n_components'  : np.arange(50, 301, 50),
                # 'union__title_vec__svd__n_components'    : np.arange(100, 301, 50),
                'classifier__C': log_space,
            }

        elif classifier == 'LogisticRegression':
            grid = {
                'classifier__C': gen_parameters_from_log_space(0.1, 10, 10),
            }

        elif classifier == 'GradientBoostingClassifier':
            grid = {
                'classifier__learning_rate':
                gen_parameters_from_log_space(0.01, 1, 10),
            }

        elif classifier == 'VotingClassifier':
            # Nested parameters must address the inner estimators by the keys
            # used in the `estimators` dict above.
            grid = {
                'classifier__LogisticRegression__C':
                gen_parameters_from_log_space(0.1, 10, 10),
                'classifier__SVC__C':
                gen_parameters_from_log_space(5, 8, 10),
                'classifier__GradientBoostingClassifier__learning_rate':
                gen_parameters_from_log_space(0.01, 1, 10),
            }

        # With scoring="ovo", computes the average AUC of all possible pairwise
        # combinations of classes. Insensitive to class imbalance when
        # average='macro'. A hedged sketch of such a scorer is shown after this
        # function. Also see: https://stackoverflow.com/a/62471736/1253729

        searcher = GridSearchCV(
            estimator=pipeline,
            param_grid=grid,
            n_jobs=4,
            return_train_score=True,
            refit=True,
            verbose=True,
            cv=StratifiedKFold(n_splits=3),
            scoring=scorer,
        )

        model = searcher.fit(X_train, y_train.values.ravel())
        print(f"Best found parameters: {searcher.best_params_}")

    else:
        model = pipeline.fit(X_train, y_train.values.ravel())

    return model
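# The scoring note inside train_with_bag_of_words refers to one-vs-one ROC AUC with
# macro averaging; a hedged sketch of how such a scorer could be built and passed
# in (the scorer actually used by the author is not shown in this excerpt).
from sklearn.metrics import make_scorer, roc_auc_score

ovo_auc_scorer = make_scorer(roc_auc_score, needs_proba=True,
                             multi_class='ovo', average='macro')
# model = train_with_bag_of_words(X_train, y_train, scorer=ovo_auc_scorer,
#                                 classifier='LogisticRegression')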
Example #55
0
        rows.append({'text': text, 'class': classification})
        index.append(filename)

    data_frame = DataFrame(rows, index=index)
    return data_frame

data = DataFrame({'text': [], 'class': []})
for path, classification in SOURCES:
    data = data.append(build_data_frame(path, classification))

data = data.reindex(np.random.permutation(data.index))
## now split files into training data and labels. probably tuple (filename, r/d)

classifier = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('svd', TruncatedSVD(algorithm='randomized', n_components=300)),
    ('clf', XGBClassifier())
])

#classifier.fit(data['text'].values, data['class'].values)

k_fold = KFold(n_splits=8)
scores = []
confusion = np.array([[0, 0], [0, 0]])
for train_indices, test_indices in k_fold.split(data):
    train_text = data.iloc[train_indices]['text'].values
    train_y = data.iloc[train_indices]['class'].values

    test_text = data.iloc[test_indices]['text'].values
    test_y = data.iloc[test_indices]['class'].values
    for size in tqdm(size_list):
        model = KeyedVectors.load("./trained_model/fasttext_gensim_" + str(size) + ".model")
        words_np = []
        words_label = []
        for word in list_words:
            words_np.append(model[word])
            words_label.append(word)
        word_vector_reduced = {}
        for index, vec in enumerate(words_np):
            word_vector_reduced[words_label[index]] = vec
        list_cosin_similarity = []
        for x, y in zip(data["Word1"], data["Word2"]):
            list_cosin_similarity.append(round(cosin_similarity(x, y, word_vector_reduced), 2))
        data['Relation_number'] = new_col
        data["FastText_" + str(size)] = list_cosin_similarity
        if size == 200:
            for new_size in size_list[:-1]:
                svd = TruncatedSVD(n_components=new_size, n_iter=30)
                svd.fit(words_np)
                reduced = svd.transform(words_np)
                word_vector_reduced = {}
                for index, vec in enumerate(reduced):
                    word_vector_reduced[words_label[index]] = vec
                list_cosin_similarity = []
                for x, y in zip(data["Word1"], data["Word2"]):
                    list_cosin_similarity.append(round(cosin_similarity(x, y, word_vector_reduced), 2))
                data["FastText_SVD_" + str(new_size)] = list_cosin_similarity
    # Write the results to a csv file
    tmp_name = os.path.basename(path_visim).split('.')[0] + '_result.csv'
    data.to_csv(os.path.join("./result", tmp_name), sep="\t")
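# The helper cosin_similarity used above is not defined in this excerpt; a hedged
# sketch of what such a word-pair cosine similarity over a vector dictionary
# typically looks like (the real helper may differ, e.g. in OOV handling).
import numpy as np

def cosine_similarity_sketch(w1, w2, vectors):
    v1, v2 = vectors[w1], vectors[w2]
    return float(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))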
Example #57
0
    print('==============')
    print(metrics.confusion_matrix(labels, km.labels_))
    print('==============')
    print('-----------------------------------------------------')
    #==============================================================================






#=========================Reduce Dimensionality (SVD)==========================
print('##############################################################')
for i in range(0,5):
    print('Performing truncatedSVD...')
    svd = TruncatedSVD(n_components = 165, n_iter = 13,random_state = 42)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    
    X_reduced = lsa.fit_transform(X)
    
    k_means(X_reduced, labels, 'truncatedSVD')
#==============================================================================



#=========================Reduce Dimensionality (PCA)==========================
print('##############################################################')
for i in range(0,5):
    print('Performing PCA...')
    
Example #58
0
# Tokenize each document into words
# Gets rid of stop words, and stemmed version of word
# Ignores words appearing in fewer than 5 (or 2 if min_df = 2) documents 
vectorizer = CountVectorizer(min_df=5, stop_words= stop_words, tokenizer=LemmaTokenizer() )
X_train_counts = vectorizer.fit_transform(eight_train.data)
X_test_counts = vectorizer.transform(eight_test.data)

# TFIDF
# We set smooth_idf = false so we use the equation idf(d, t) = log [ n / df(d, t) ] + 1
tfidf_transformer = TfidfTransformer(smooth_idf=False)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

# 'arpack' for the ARPACK wrapper in SciPy (scipy.sparse.linalg.svds)
svd = TruncatedSVD(n_components=50, algorithm='arpack')
X_train_lsi = svd.fit_transform(X_train_tfidf)
X_test_lsi = svd.transform(X_test_tfidf)

# separate into two groups(Computer Tech & Recreation)
train_target_group = [ int(x / 4) for x in eight_train.target]
test_actual= [ int(x / 4) for x in eight_test.target]

# Logistic Regression Classifier 
log_reg = LogisticRegression()
log_reg.fit(X_train_lsi, train_target_group)

predicted = log_reg.predict(X_test_lsi)
predicted_probs = log_reg.predict_proba(X_test_lsi)

fpr, tpr, _ = roc_curve(test_actual, predicted_probs[:,1])
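# The excerpt stops at the ROC inputs; a minimal sketch of plotting the curve and
# reporting the area under it (the auc and matplotlib imports are assumptions).
from sklearn.metrics import auc
import matplotlib.pyplot as plt

roc_auc = auc(fpr, tpr)
plt.plot(fpr, tpr, label='LSI + LogisticRegression (AUC = %.3f)' % roc_auc)
plt.plot([0, 1], [0, 1], linestyle='--', color='grey')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend()
plt.show()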
# Fit and transform
principalComponents = pca.fit_transform(X_scaled)

# Print ratio of variance explained
#After transformation, we keep the three (as specified in n_components) transformed features with the highest
#transformed covariance eigenvalues
print("PCA explained variance ratios: {}".format(
    pca.explained_variance_ratio_))
print("PCA components: {}".format(pca.components_))
print("The PCA component vector has size {}, because there are {} vectors with length {}".format(pca.components_.shape, \
 pca.components_.shape[0], pca.components_.shape[1]))

########## Practicing singular value decomposition

# SVD
svd = TruncatedSVD(n_components=2)

# Fit and transform
principalComponents = svd.fit_transform(X_scaled)

# Print ratio of variance explained
print("SVD explained variance ratios: {}".format(
    svd.explained_variance_ratio_))

################ Practicing creating a PCA plot for visualization
#first project data points onto the two PCA axes. Note that principle components coming out of svd are already normalized.
df_pca = pd.DataFrame(np.transpose(np.array([np.dot(X_scaled, svd.components_[0]), np.dot(X_scaled, svd.components_[1]),\
 cancer.target])), \
 columns=['Principal component 1', 'Principal component 2', 'Target value'])

targets = [0, 1]
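# The plotting code is truncated after `targets = [0, 1]`; a hedged sketch of the
# scatter it appears to be building, colored by target class (the color choices
# are illustrative assumptions).
import matplotlib.pyplot as plt

for target, color in zip(targets, ['tab:red', 'tab:blue']):
    mask = df_pca['Target value'] == target
    plt.scatter(df_pca.loc[mask, 'Principal component 1'],
                df_pca.loc[mask, 'Principal component 2'],
                color=color, s=10, label='target %d' % target)
plt.xlabel('Principal component 1')
plt.ylabel('Principal component 2')
plt.legend()
plt.show()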
#UNSUPERVISED MODEL
from model import *
from sklearn.manifold import TSNE
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans


# Sparse SVD on tf-idf to reduce features to 50
print("start dimensionality reduction")
data = get_vectorized_tweets('training_vecs.npy').toarray()
svd_model = TruncatedSVD(n_components=50)
data_svd = svd_model.fit_transform(data)
print("start TSNE")
tsne_model = TSNE(n_components = 2)
data_tsne = tsne_model.fit_transform(data_svd)
np.save('tsne_training_data.npy', data_tsne)
data_tsne = sample(np.asarray(get_vectorized_tweets('tsne_training_data.npy')), 500)
print(data_tsne.shape)
cluster_labels = KMeans(n_clusters = 5).fit(data_tsne).labels_

import matplotlib.pyplot as plt
print("scatter:")
plt.scatter(data_tsne[:,0], data_tsne[:,1], c = cluster_labels)
plt.show()

#UNSUPERVISED MODEL ONLY TOXIC SPEECH
#select only toxic speech
df_data = pd.read_csv("twitter-sentiment-analysis-hatred-speech/train.csv",names=('id','label','tweet'),header=None)
labels = df_data.to_numpy().T[1]
data_tsne = np.asarray(get_vectorized_tweets('tsne_training_data.npy'))