def comparison_test(text):
    # Compare a vocabulary-based encoding (CountVectorizer) with the hashing trick.
    import sklearn.feature_extraction.text as txt
    h_trick = txt.HashingVectorizer(n_features=20, binary=True, norm=None)
    oh_encoder = txt.CountVectorizer()
    oh_encoded = oh_encoder.fit_transform(text)  # learns a vocabulary from the text
    hashing = h_trick.transform(text)  # stateless: nothing to fit
    return oh_encoded, hashing
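
# Hypothetical usage (the sample sentences are illustrative, not from the original):
# the CountVectorizer matrix is as wide as the learned vocabulary, while the
# hashed matrix always has exactly 20 columns.
sample_counts, sample_hashed = comparison_test(
    ['Python for data science', 'Python for machine learning'])
print(sample_counts.shape, sample_hashed.shape)  # e.g. (2, 6) and (2, 20)
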
def QuickClusterParameterFinder(data):
    # Elbow-method helper: plot MiniBatchKMeans inertia against the number of clusters.
    import sklearn.cluster
    import sklearn.feature_extraction.text as txtvectorizer
    import matplotlib.pyplot as plt

    cost = list()
    vectorizer = txtvectorizer.HashingVectorizer(analyzer='word',
                                                 ngram_range=(1, 10))
    vectors = vectorizer.transform(data['content'].dropna())

    for c in range(1, 100, 5):
        kmeans = sklearn.cluster.MiniBatchKMeans(n_clusters=c)
        kmeans.fit_transform(vectors)
        cost.append(kmeans.inertia_)
        print(str(c) + " / 100")
    plt.plot(range(1, 100, 5), cost)
    plt.show()
    return cost
def build_vectors(sentences, vocabulary_size):
    import sklearn.feature_extraction.text as skyfe

    vectorizer = skyfe.CountVectorizer()
    trans = vectorizer.fit_transform(sentences)
    fname = vectorizer.get_feature_names()
    print(trans)
    print(trans.toarray())
    print(fname)

    # enable tf-idf
    transformer = skyfe.TfidfTransformer()
    tfidf = transformer.fit_transform(trans)
    print(tfidf.toarray())
    print(fname)  # the tf-idf matrix reuses the CountVectorizer's feature names

    # hashed
    vectorizer2 = skyfe.HashingVectorizer(n_features=6, norm=None)
    trans = vectorizer2.fit_transform(sentences)
    # HashingVectorizer has no get_feature_names(): the hash mapping is one-way
    print(trans.toarray())
Example #4
def cluster_topics():
    #model = cluster.Birch(
    #branching_factor=2,
    #threshold=0.002 # Lower = more clusters, higher = fewer clusters
    #)

    #model = cluster.KMeans(
    #n_clusters=...  # KMeans takes a fixed number of clusters, not Birch's branching_factor/threshold
    #)

    model = cluster.DBSCAN(min_samples=2, eps=0.2)

    #model = cluster.AffinityPropagation(
    #)

    vectorizer = text.HashingVectorizer(
        analyzer='char_wb',  # character n-grams taken only from text inside word boundaries
        norm='l2',  # L2-normalize each document vector
        lowercase=True,  # Converts everything to lowercase
        stop_words=stopwords)

    num_samples = 10000
    offset = 0

    while True:
        log.debug(u"Loading topics...")
        topic_rows = db.session.query(
            models.TopicModel.id,
            models.TopicModel.topic).filter_by(clustered=False).order_by(
                models.TopicModel.id.asc()).limit(num_samples).offset(
                    offset).all()

        if not topic_rows:
            break

        log.debug(u"Loaded {} topics".format(len(topic_rows)))

        offset += len(topic_rows)

        go_cluster(vectorizer, model, topic_rows)
#!/usr/bin/python
# -*- coding: utf-8 -*-
#[email protected]
"""
==============================
文本向量化方法3
==============================
Tfidf方法进行文本向量化过程中如果单词量很大会遇到内存问题
因此利用hash编码可以将特征数减少
这个方法本质就是将单词进行hash编码,因此一个编码可以对应多个词语
从而压缩内存
"""

from sklearn.datasets import fetch_20newsgroups
import sklearn.feature_extraction.text as t2v
import numpy as np
# load the data
newsgroups_train = fetch_20newsgroups(data_home="data", subset='train')
newsgroups_test = fetch_20newsgroups(data_home="data", subset='test')
# vectorize the words
vectorizer = t2v.HashingVectorizer(n_features=6)
vectors = vectorizer.fit_transform(newsgroups_train.data)
vectors_test = vectorizer.transform(newsgroups_test.data)

print("class:", newsgroups_train.target_names[newsgroups_train.target[1]])
print("data:", newsgroups_train.data[1])
print(vectors[1])
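
# Quick check of the memory claim from the docstring (a sketch, not in the
# original): whatever the vocabulary size, the hashed matrices always have
# exactly n_features=6 columns, so several distinct words must share a column.
print("train matrix shape:", vectors.shape)
print("test matrix shape:", vectors_test.shape)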
# pull a batch of the given size from the document stream produced by stream_docs
def get_minibatch(doc_stream, size):
    docs, labels = [], []
    try:
        for _ in range(size):
            doc, label = next(doc_stream)
            docs.append(doc)
            labels.append(label)
    except StopIteration:
        return None, None
    return docs, labels
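
# stream_docs (used below) is not shown in this snippet; a minimal sketch,
# assuming movie_data.csv stores one review per row with the text in the first
# column and a 0/1 sentiment label in the second (preprocessor_two, passed as
# the tokenizer further down, is likewise assumed to be defined elsewhere):
def stream_docs(path):
    import csv
    with open(path, 'r', encoding='utf-8') as csv_file:
        reader = csv.reader(csv_file)
        next(reader)  # skip the header row
        for row in reader:
            yield row[0], int(row[1])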


# use a data-independent hasher (no vocabulary to fit, so it works out of core)
vect = text.HashingVectorizer(decode_error='ignore',
                              n_features=2**21,
                              preprocessor=None,
                              tokenizer=preprocessor_two)
clf = lm.SGDClassifier(loss='log', random_state=1, max_iter=1)
doc_stream = stream_docs(path='./movie_data.csv')
# perform out of core learning
classes = np.array([0, 1])
for _ in range(45):
    data_train, target_train = get_minibatch(doc_stream, size=1000)
    if not data_train:
        break
    data_train = vect.transform(data_train)
    clf.partial_fit(data_train, target_train, classes=classes)
# use last 5000 documents for evaluation
data_test, target_test = get_minibatch(doc_stream, size=5000)
data_test = vect.transform(data_test)
print('Accuracy: %.3f' % clf.score(data_test, target_test))
Example #7
# @author: Administrator

from sklearn.datasets import fetch_20newsgroups
newsgroups_train = fetch_20newsgroups(subset='train', 
                                      remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test', 
                                     remove=('headers', 'footers', 'quotes'))

#from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.naive_bayes import MultinomialNB
#Bernoulli = BernoulliNB(alpha=0.01)
Multinomial = MultinomialNB(alpha=0.01)

import sklearn.feature_extraction.text as txt
# alternate_sign=False keeps all hashed values non-negative, which MultinomialNB requires
multinomial_hashing_trick = txt.HashingVectorizer(stop_words='english',
                            binary=False, norm=None, alternate_sign=False)
#binary_hashing_trick = txt.HashingVectorizer(stop_words='english',
#                            binary=True, norm=None, alternate_sign=False)

Multinomial.fit(multinomial_hashing_trick.transform(newsgroups_train.data),
                newsgroups_train.target)
#Bernoulli.fit(binary_hashing_trick.transform(newsgroups_train.data),
#                newsgroups_train.target)
from sklearn.metrics import accuracy_score
for m, h in [(Multinomial, multinomial_hashing_trick)]:
    print('Accuracy for %s: %.3f' % (
        m, accuracy_score(y_true=newsgroups_test.target,
                          y_pred=m.predict(h.transform(newsgroups_test.data)))))
def vectorizeStrings(documents, ngramRange):
    # Hash character n-grams so the matrix width stays fixed regardless of vocabulary.
    import sklearn.feature_extraction.text as txtvectorizer

    vectorizer = txtvectorizer.HashingVectorizer(analyzer='char',
                                                 ngram_range=ngramRange)
    vectors = vectorizer.transform(documents.fillna(""))
    return vectors
Example #9
test_data['Position_Extra'] = test_data['Position_Extra'].apply(clean)
test_data['Program_Description'] = test_data['Program_Description'].apply(clean)
test_data['SubFund_Description'] = test_data['SubFund_Description'].apply(clean)
test_data['Sub_Object_Description'] = test_data['Sub_Object_Description'].apply(clean)
test_data['Text_1'] = test_data['Text_1'].apply(clean)
test_data['Text_2'] = test_data['Text_2'].apply(clean)
test_data['Text_3'] = test_data['Text_3'].apply(clean)
test_data['Text_4'] = test_data['Text_4'].apply(clean)

# create a single new column for cleaned text data
training_data["combined"] = [' '.join(row) for row in training_data[training_data.columns].values]
test_data["combined"] = [' '.join(row) for row in test_data[test_data.columns].values]

# initialize TFIDF vectorizer and Hashing Vectorizer
tfidf = txt.TfidfVectorizer(ngram_range=(1, 2), max_df=1.0, min_df=10)
hsv = txt.HashingVectorizer()

# fit tfidf and hashing vectorizer to train data (fitting the hashing
# vectorizer is a no-op because it is stateless)
tfidf.fit(training_data['combined'])
hsv.fit(training_data['combined'])

# transform the training and test datasets to obtain a sparse matrix
X_tfidf = tfidf.transform(training_data['combined'])
X_test_tfidf = tfidf.transform(test_data['combined'])

X_hsv = hsv.transform(training_data['combined'])
X_test_hsv = hsv.transform(test_data['combined'])

X = sparse.hstack((X_hsv, X_tfidf))
X_test = sparse.hstack((X_test_hsv, X_test_tfidf))
Example #10
def hash_vector(text, features):
    # Encode the documents with the hashing trick, using a caller-chosen feature count.
    hash_vectorizer = txt.HashingVectorizer(n_features=features, binary=True, norm=None)
    text_vector = hash_vectorizer.transform(text)
    return text_vector
Example #11
"""

__author__ = "Adrian Langseth"

import pickle
import sklearn.feature_extraction.text as skt
import time
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

t0 = time.time()

k = pickle.load(open("sklearn-data.pickle", "rb"))

vectorizer = skt.HashingVectorizer(stop_words='english', analyzer='word')

xtrain_transformed = vectorizer.fit_transform(k['x_train'])
xtest_transformed = vectorizer.transform(k['x_test'])  # hashing is stateless, so no refit is needed

# NB classifier part
NBclassifier = BernoulliNB()  # Build Model
NBclassifier.fit(xtrain_transformed, k['y_train'])  # Fitting model
NBpredicted_y = NBclassifier.predict(xtest_transformed)  # Make prediction
NBaccuracy_score = accuracy_score(k['y_test'],
                                  NBpredicted_y)  # Evaluate prediction

# Decision Tree part
DTclassifier = DecisionTreeClassifier(max_depth=64,
                                      criterion="entropy")  # Create Model
DTclassifier.fit(xtrain_transformed, k['y_train'])  # Fit model
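
# Possible continuation (a sketch, not part of the original snippet): evaluate
# the decision tree the same way as the Naive Bayes model and report timing.
DTpredicted_y = DTclassifier.predict(xtest_transformed)  # Make prediction
DTaccuracy_score = accuracy_score(k['y_test'], DTpredicted_y)  # Evaluate prediction

print("BernoulliNB accuracy: %.3f" % NBaccuracy_score)
print("DecisionTree accuracy: %.3f" % DTaccuracy_score)
print("Elapsed time: %.1f s" % (time.time() - t0))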
Example #12
if __name__ == "__main__":
    input_files = list(glob(str(base_dir / "data" / "*" / "*")))
    data = []
    for file_path in input_files:
        with open(file_path, "rt") as fh:
            data.append(fh.read())

    dataset_size = 91  # size in MB of the 20 newsgroups dataset

    print("# vectorizing {} documents:".format(len(data)))

    for label, vect in [
        ("HashingVectorizer (vtext)", vtext.HashingVectorizer(norm=None)),
        (
            "HashingVectorizer (scikit-learn)",
            skt.HashingVectorizer(lowercase=False, norm=None),
        ),
        ("CountVectorizer (vtext)", vtext.CountVectorizer(lowercase=False)),
        ("CountVectorizer (scikit-learn)",
         skt.CountVectorizer(lowercase=False)),
    ]:

        t0 = time()

        X = vect.fit_transform(data)

        dt = time() - t0

        print("{:>40}: {:.2f}s [{:.1f} MB/s], shape={}, nnz={}".format(
            label, dt, dataset_size / dt, X.shape, X.nnz))
Example #13
def hashing_trick(input_string, vector_size):
    # header reconstructed from the body; the function name is assumed
    feature_vector = [0] * vector_size
    for word in input_string.split(' '):
        # set position hash(word) % vector_size of the vector to 1
        index = abs(hash(word)) % vector_size
        feature_vector[index] = 1
    return feature_vector


# if vector_size is too small, different words will easily collide (map to the same position)
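
# Illustration of the collision remark above (a sketch, not in the original):
# with a deliberately small vector_size the words of these two sentences are
# squeezed into only 5 positions, so different words can set the same index.
print(hashing_trick('Python for data science', 5))
print(hashing_trick('Python for machine learning', 5))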

from scipy.sparse import csc_matrix

print(csc_matrix([1, 0, 0, 0, 0, 1, 1, 0, 1, 0]))

# A sparse CSC matrix records only the positions of the nonzero entries, skipping the zeros and saving a lot of memory
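
# Rough check of the claim above (a sketch, not in the original): compare the
# bytes used by the dense values with the bytes of the stored nonzero values
# (the sparse index arrays add some overhead on top of this).
import numpy as np
dense_vector = np.array([1, 0, 0, 0, 0, 1, 1, 0, 1, 0])
sparse_vector = csc_matrix(dense_vector)
print(dense_vector.nbytes, sparse_vector.data.nbytes)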

#------------ Using the built-in HashingVectorizer -------------------
import sklearn.feature_extraction.text as txt

sklearn_hashing_trick = txt.HashingVectorizer(n_features=20,
                                              binary=True,
                                              norm=None)
text_vector = sklearn_hashing_trick.transform(
    [' Python for data science', 'Python for machine learning'])
print(text_vector)

# transform the text into vectors; the result is kept as a sparse matrix

# CountVectorizer: encodes the text into an exact, vocabulary-based data matrix,
# but cannot represent words it did not see during fitting.
# HashingVectorizer: needs no fixed vocabulary, so it copes with new incoming text,
# but is less exact than CountVectorizer because hash collisions can merge
# distinct words into the same feature.
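
# Small illustration of the comparison above (a sketch, not in the original):
# a fitted CountVectorizer silently ignores words it never saw, while the
# HashingVectorizer encodes any incoming text into its fixed 20 columns.
count_encoder = txt.CountVectorizer().fit([' Python for data science'])
print(count_encoder.transform(['Python for deep learning']).toarray())  # unseen words are dropped
print(sklearn_hashing_trick.transform(['Python for deep learning']).shape)  # (1, 20)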