Example #1
def read_messages(filepath):
    # use the filepath argument instead of a hard-coded path;
    # the raw CSV has three extra unnamed columns, and column names must be unique
    data = pd.read_csv(filepath,
                       encoding="latin1",
                       names=["labels", "text", "extra1", "extra2", "extra3"])
    data = data.filter(["labels", "text"])  # keep only the label and message columns
    mapping = {"spam": 0, "ham": 1}
    data = data.replace({"labels": mapping})

    ps = PorterStemmer()

    for index, value in data.iterrows():
        text = value["text"]
        text = porter_stemmer(text, ps)  # stemming helper defined elsewhere in the source file
        data.at[index, "text"] = text    # DataFrame.set_value was removed; use .at instead

    #counts the number of uses per word
    count_vectorizer = CountVectorizer()
    counts = count_vectorizer.fit_transform(data["text"])
    labels = data["labels"]
    # assert(len(labels) == len(counts))
    # print("Number of examples", len(labels))
    print("params", count_vectorizer.get_params())
    print("type", type(count_vectorizer))
    print("shape", np.shape(counts))
    return labels, counts, count_vectorizer
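The porter_stemmer helper is not part of this example; a minimal sketch of what it might look like (an assumption, using NLTK's PorterStemmer and a plain whitespace split):

from nltk.stem import PorterStemmer

def porter_stemmer(text, ps):
    # hypothetical helper: lowercase the message, split on whitespace, stem each token
    return " ".join(ps.stem(token) for token in text.lower().split())

# e.g. porter_stemmer("Running dogs", PorterStemmer()) stems each word ("running" -> "run")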
Example #2
def featTransform(sents_train, sents_test):
    cv = CountVectorizer()
    cv.fit(sents_train)
    print(cv.get_params())
    features_train = cv.transform(sents_train)
    features_test = cv.transform(sents_test)
    return features_train, features_test, cv
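A minimal usage sketch for featTransform with made-up sentence lists:

# hypothetical train/test sentences; the vocabulary is learned on the train split only
sents_train = ["spam spam lovely spam", "ham and eggs", "free prize inside"]
sents_test = ["free spam prize", "eggs and ham"]
features_train, features_test, cv = featTransform(sents_train, sents_test)
print(features_train.shape, features_test.shape)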
Example #3
    def get_params(self, deep=True):
        params = super().get_params(deep)
        # Hack to make get_params return base class params...
        cp = copy.copy(self)
        cp.__class__ = CountVectorizer
        params.update(CountVectorizer.get_params(cp, deep))
        return params
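This override only makes sense inside a CountVectorizer subclass whose __init__ adds parameters of its own; a minimal sketch of such a context (the subclass and its extra parameter are hypothetical):

import copy
from sklearn.feature_extraction.text import CountVectorizer

class MyVectorizer(CountVectorizer):
    # hypothetical subclass that introduces one parameter of its own
    def __init__(self, extra_option=False):
        super().__init__()
        self.extra_option = extra_option

    def get_params(self, deep=True):
        params = super().get_params(deep)   # only yields {'extra_option': ...}
        # Hack to make get_params return base class params as well
        cp = copy.copy(self)
        cp.__class__ = CountVectorizer
        params.update(CountVectorizer.get_params(cp, deep))
        return params

# MyVectorizer().get_params() now lists ngram_range, stop_words, etc. alongside extra_option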
Example #4
def featTransform(sents_train, sents_test):
    cv = CountVectorizer()
    cv.fit(sents_train)
    print(cv.get_params())
    features_train = cv.transform(sents_train)
    features_test = cv.transform(sents_test)
    return features_train, features_test, cv
Example #5
    def Common_Vectorizer_usage():
        from sklearn.feature_extraction.text import CountVectorizer
        vectorizer = CountVectorizer(min_df=1)
        corpus = [
            'This is the first document.',
            'This is the second second document.',
            'And the third one.',
            'Is this the first document?',
        ]

        analyze = vectorizer.build_analyzer()
        print analyze("This is a text document to analyze.")
        print analyze("This is a text document to analyze.") == ['this', 'is', 'text', 'document', 'to', 'analyze']
        
        X = vectorizer.fit_transform(corpus)
        print(vectorizer.get_feature_names())
        print(vectorizer.vocabulary_)    # .get('document')
        print(vectorizer.transform(['Something completely new.']).toarray())
        print(list(X))
        
        #bigram========================================================
        bigram_vectorizer = CountVectorizer(ngram_range=(1, 2), token_pattern=r'\b\w+\b', min_df=1)
        analyze = bigram_vectorizer.build_analyzer()
        print(analyze('Bi-grams are cool!'))
        X_2 = bigram_vectorizer.fit_transform(corpus).toarray()
        print(X_2)

        feature_index = bigram_vectorizer.vocabulary_.get('is this')
        print(X_2[:, feature_index])
        
        # marui test
        print('\n\nmarui test=====================')
        def t_preprocessor(s):
            return ','.join([x.lower() for x in s.split(' ')])

        stop_words1 = ['is', 'a', 'this']           # ok: becomes frozenset(['a', 'this', 'is'])
        stop_words2 = {'is': 0, 'a': 1, 'this': 2}  # ok: converted to frozenset(['a', 'this', 'is'])

        cv = CountVectorizer(preprocessor=t_preprocessor, stop_words=stop_words2)
        params = cv.get_params()
        print('get_params()', type(params), '---------------')
        for k in params:
            print(k, '\t', params[k])
        print('get_params end--------------')
        print('\nget_stop_words=', cv.get_stop_words())

        cv.fit(corpus)
        print(cv.get_feature_names())
        print(cv.transform(corpus).toarray())
        print('\ntest preprocessor, result:\t', cv.build_preprocessor()('this is a document'))
        print('\ntest tokenizer, result', cv.build_tokenizer()('this is a document'))
        print('\ntest tokenizer2, result', cv.build_tokenizer()('th-is is a document'))
        print('\ntest tokenizer2, result', cv.build_tokenizer()('th_is is a document'))
        print('\ntest tokenizer2, result', cv.build_tokenizer()('th&is is a document'))

        """
def LDA(docs_raw, n_topics):
    tf_vectorizer = CountVectorizer(strip_accents='unicode',
                                    stop_words=en_stop,  # custom stop-word list defined elsewhere
                                    # token_pattern keeps any word token; tighten it to drop short or special words
                                    token_pattern=r'\b\w+\b')
    dtm_tf = tf_vectorizer.fit_transform(docs_raw)
    tfidf_vectorizer = TfidfVectorizer(**tf_vectorizer.get_params())
    dtm_tfidf = tfidf_vectorizer.fit_transform(docs_raw)
    # for TF DTM (%time is an IPython magic and only works in a notebook; plain calls are used here)
    lda_tf = LatentDirichletAllocation(n_components=n_topics, random_state=0)
    lda_tf.fit(dtm_tf)
    # for TFIDF DTM
    lda_tfidf = LatentDirichletAllocation(n_components=n_topics, random_state=0)
    lda_tfidf.fit(dtm_tfidf)
    return lda_tf, dtm_tf, tf_vectorizer
Example #7
def main2(params):
    df = pd.read_csv(join(params.input_folder,
                          'tokenized1/084_update_quality_minmax_sizeFixture.cs.tree-viewer.txt'), header=None)
    df = df[df[0].notnull()]
    df = df.applymap(filter_type)  # applymap returns a new DataFrame; keep the result
    matrix = CountVectorizer(max_features=10)
    X = matrix.fit_transform(df[0]).toarray()
    print(matrix.vocabulary_)
    print(matrix.get_params())
    df[0].iloc[0:10].str.cat(sep=' ')
    starters = df.loc[df[0] == "BEGIN_METHOD"]
    enders = df.loc[df[0] == "END_METHOD"]
    zipped = list(zip(starters.index, enders.index))
    functions_list = []
    for begin, end in zipped:
        functions_list.append(df[0].iloc[begin:end+1].str.cat(sep=' '))
Example #8
def model(df):
    #hasher = HashingVectorizer(n_features=100, analyzer='word', stop_words='english', alternate_sign=False, norm=None)
    hasher = CountVectorizer(analyzer='word', stop_words='english')
    vectorizer = make_pipeline(hasher, TfidfTransformer(use_idf=False))
    X = vectorizer.fit_transform(df['links'])
    print(hasher.get_params())
    normalizer = Normalizer(copy=False)
    svd = TruncatedSVD(n_components=12)
    lsa = make_pipeline(svd, normalizer)
    Y = lsa.fit_transform(X)
    km = KMeans(n_clusters=4, init='k-means++', max_iter=1000, n_init=1)
    Z = km.fit_predict(Y)
    df['labels'] = Z
    df['first'] = Y[:, 0]
    df['second'] = Y[:, 1]
    df['third'] = Y[:, 2]

    result = df[['outlet', 'total', 'labels', 'first', 'second', 'third']]
    return result
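A minimal usage sketch (assuming df is a DataFrame with at least the 'links' text column plus the 'outlet' and 'total' columns, and enough rows and vocabulary for the 12-component SVD and 4 clusters):

# cluster the documents and inspect how many landed in each cluster
result = model(df)
print(result.groupby('labels').size())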
Example #9
def gen_document_term_matrices(args, data):
    """Generates document-term matrices"""

    # http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
    tf_vectorizer = CountVectorizer(stop_words="english",
                                    max_features=args.num_features,
                                    max_df=0.95,
                                    min_df=2)
    dtm_tf = tf_vectorizer.fit_transform(data)
    with open("%s/dtm_tf.pkl" % args.output_dir, "wb") as dtm_file:
        pickle.dump(tf_vectorizer, dtm_file)
        pickle.dump(dtm_tf, dtm_file)

    # http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
    tfidf_vectorizer = TfidfVectorizer(**tf_vectorizer.get_params())
    dtm_tfidf = tfidf_vectorizer.fit_transform(data)
    with open("%s/dtm_tfidf.pkl" % args.output_dir, "wb") as dtm_file:
        pickle.dump(tfidf_vectorizer, dtm_file)
        pickle.dump(dtm_tfidf, dtm_file)
    return tf_vectorizer, dtm_tf, dtm_tfidf
Example #10
def topic_modelling(data):
    abstracts = []
    for abstract in data:
        # Remove punctuation
        abstract = re.sub(r'[,.!?]', '', abstract)
        # Remove numbers
        abstract = re.sub('[0-9]', '', abstract)
        # Convert the abstracts to lowercase
        abstract = abstract.lower()
        abstracts.append(abstract)
    # Splitting abstracts
    snnipets = []
    for abstract in abstracts:
        if abstract != "abstract not available":
            length = len(abstract)
            index = 0
            last_i = 0
            n = 256
            while index < length:
                i = abstract.rfind(". ", index, index + n)
                if i == -1 or i == index:
                    i = index + n
                text = abstract[index:i + 2]
                index = i + 2
                snnipets.append(text)
    # Creating LDA
    #number_topics = 5
    tf_vectorizer = CountVectorizer(stop_words='english')
    tfidf_vectorizer = TfidfVectorizer(**tf_vectorizer.get_params())
    dtm_tfidf = tfidf_vectorizer.fit_transform(snnipets)
    lda_tfidf = LDA(random_state=0)
    lda_tfidf.fit(dtm_tfidf)
    # Visualizing LDA
    data = pyLDAvis.sklearn.prepare(lda_tfidf,
                                    dtm_tfidf,
                                    tfidf_vectorizer,
                                    mds='mmds')
    html = pyLDAvis.prepared_data_to_html(data, template_type="simple")
    return html
Example #11
from sklearn.feature_extraction.text import CountVectorizer
import itertools
import numpy as np

data = ['She did not cheat on the test, for it was not the right thing to do.','I think I will buy the red car, or I will lease the blue one.','I really want to go to work, but I am too sick to drive.','I am counting my calories, yet I really want dessert.']
count_vect = CountVectorizer()


cx = count_vect.fit_transform(data)

vocab = {}
for key, value in count_vect.vocabulary_.items():
    vocab[value] = key

# print(vocab)

start = 0
for i, end in enumerate(cx.indptr[1:]):
    for j, val in zip(cx.indices[start:end], cx.data[start:end]):
        print("(" + str(i) + "," + str(j) + "): " + str(val) + " => '" + vocab[j] + "'")
        # print(vocab[j] + " ", end="")
    print("")
    start = end

print(count_vect.get_params())
#for i,j,v in zip(cx.row, cx.col, cx.data):
#     print (i,j,v)
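The commented-out loop above assumes cx has .row and .col attributes, which only a COO matrix provides; a short sketch of that route, reusing the cx and vocab objects built above:

# convert the CSR output of fit_transform to COO to iterate (row, col, value) triples
coo = cx.tocoo()
for i, j, v in zip(coo.row, coo.col, coo.data):
    print((i, j, v), "=>", vocab[j])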
Example #12
for file in files:  #Topic modeling that reads in utterances
    df = pd.read_csv(file)
    utterance_temp.append(df['stringList'].tolist())

utterance_raw = [item for sublist in utterance_temp for item in sublist]

tf_vectorizer = CountVectorizer(strip_accents='unicode',
                                stop_words='english',
                                lowercase=True,
                                token_pattern=r'\b[a-zA-Z]{3,}\b',
                                max_df=0.2,
                                min_df=0)
dtm_tf = tf_vectorizer.fit_transform(utterance_raw)

tfidf_vectorizer = TfidfVectorizer(**tf_vectorizer.get_params())
dtm_tfidf = tfidf_vectorizer.fit_transform(utterance_raw)

# for TF DTM
lda_tf = LatentDirichletAllocation(n_components=30, random_state=0)  # n_topics was renamed to n_components
lda_tf.fit(dtm_tf)
# for TFIDF DTM
lda_tfidf = LatentDirichletAllocation(n_components=30, random_state=0)
lda_tfidf.fit(dtm_tfidf)

nmf_tf = NMF(n_components=80, random_state=1, alpha=.1,
             l1_ratio=.5).fit(dtm_tf)

# nmf_tfidf = NMF(n_components=10, random_state=1,
#           beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
#           l1_ratio=.5).fit(dtm_tfidf)
Example #13
matrix.vocabulary_

# In[93]:

sizes = sorted(np.asarray(cv_fit.sum(axis=0))[0], reverse=True)
print(sizes)

# In[94]:

values = list(matrix.vocabulary_.keys())
values

# In[96]:

matrix.get_params()

# #### How to combine multiple rows into a single row with pandas

# In[97]:

df[0].iloc[0:10].str.cat(sep=' ')

# In[ ]:

get_ipython().run_line_magic('pinfo', 'matrix')

# #### separate functions from each other

# In[98]:
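The cell under the "separate functions from each other" heading is cut off here; a sketch of that step, based on the BEGIN_METHOD/END_METHOD logic shown in Example #7 (df is assumed to be the token DataFrame built earlier in the notebook):

# collect each method's tokens between its BEGIN_METHOD and END_METHOD markers
starters = df.loc[df[0] == "BEGIN_METHOD"]
enders = df.loc[df[0] == "END_METHOD"]
functions_list = []
for begin, end in zip(starters.index, enders.index):
    functions_list.append(df[0].iloc[begin:end + 1].str.cat(sep=' '))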
Example #14
class NewsBias:
    def __init__(self):
        self.tf_vectorizer = []
        self.tf = []
        self.lda_model = []
        self.feature_names = []
        self.topics_mat = []
        self.sentiment_by_topic = []

    def fix_sites(self, mongo_db):
        fix_cnn(mongo_db)
        fix_huffpo(mongo_db)

    def from_mongo(self, db_name):
        df = get_df(db_name)
        df = clean_df(df)
        df = df[pd.notnull(df['processed_text'])]
        df = df[df['processed_text'] != '']

        return df

    def from_csv(self, csv_name):
        try:
            df = pd.read_csv('data/' + csv_name, parse_dates=False)
            return df
        except:
            print('CSV file does not exist!')
            print('Make sure CSV file is in data folder.')
            return False

    def to_csv(self, df, filename):
        filename = 'data/' + filename
        df.to_csv(filename, index=False)
        print('CSV file saved to: ' + filename)

    def update_from_bucket(self, filename):
        path = os.getcwd()
        # Example filename: 'dsiprojectdata/rss_feeds_new.tar'
        result = from_bucket(filename, path)
        if not result:
            print('Error updating data from bucket!')
            print(
                'Make sure you include folder and file in filename from bucket.'
            )

    def update_to_bucket(self, filename, bucketname, mongo_db=False):
        # If mongo database then just give database name as filename
        if mongo_db:
            cwd = os.getcwd()
            # Give permission to bash file then run
            p1 = subprocess.Popen(['chmod', '+x', 'backup.sh'],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
            out1, err1 = p1.communicate()
            p2 = subprocess.Popen([cwd + '/backup.sh', filename],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
            out2, err2 = p2.communicate()
        else:
            p = subprocess.Popen(['/usr/bin/aws', 's3', 'cp',
                                  filename, 's3://' + bucketname + '/'],
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE)
            out, err = p.communicate()

    def run_lda(self, df, max_features=1000, n_topics=20):
        df = df[pd.notnull(df['processed_text'])]
        processed_text = df['processed_text'].values.tolist()
        # Include quotes and tweets in the LDA input
        processed_quote = df['processed_quote'].values.tolist()
        processed_tweet = df['processed_tweet'].values.tolist()
        processed_all = []
        for text, quote, tweet in zip(processed_text, processed_quote, processed_tweet):
            # Check if quote is nan
            if type(quote) == float:
                quote = ''
            if type(tweet) == float:
                tweet = ''
            processed_all.append(text + quote + tweet)
        try:
            self.tf_vectorizer = CountVectorizer(max_df=0.95,
                                                 min_df=0.05,
                                                 max_features=max_features,
                                                 stop_words='english')
            self.tf = self.tf_vectorizer.fit_transform(processed_all)
        except:
            import pdb
            pdb.set_trace()
        self.lda_model = LatentDirichletAllocation(n_components=n_topics,
                                                   max_iter=5,
                                                   learning_method='online',
                                                   learning_offset=50.,
                                                   random_state=0,
                                                   n_jobs=-1)

        self.lda_model.fit(self.tf)

        self.feature_names = np.array(self.tf_vectorizer.get_feature_names())
        self.topics_mat = self.lda_model.components_

        return self.lda_model

    def run_gensim_lda(self, df, n_topics=20):
        self.lda_model = gensim_lda(df, n_topics)

    def get_top_word_by_topic(self, topic, n_words):
        return self.feature_names[np.argsort(
            self.topics_mat[topic, :])[::-1]][:n_words]

    def visualize_lda(self, df, display=False):
        if self.lda_model == []:
            self.run_lda(df)
        max_features = self.tf_vectorizer.get_params()['max_features']
        n_topics = self.lda_model.get_params()['n_components']
        vis_data = pyLDAvis.sklearn.prepare(self.lda_model,
                                            self.tf,
                                            self.tf_vectorizer,
                                            R=n_topics,
                                            n_jobs=-1)
        pyLDAvis.save_html(
            vis_data, 'plots/pyLDAvis_' + str(max_features) + 'feats_' +
            str(n_topics) + 'topics.html')
        if display:
            pyLDAvis.show(vis_data)

    def get_sentiment_of_words(self, df):
        sentiment_of_words = sentiment_of_words_wordnet(df)

        return sentiment_of_words

    def get_sentiment_by_topic(self, df, display=False):
        n_topics = self.lda_model.get_params()['n_components']

        self.sentiment_by_topic = sentiment_by_topic_wordnet(
            df, self.topics_mat, self.feature_names)

        if display:
            for i, site in enumerate(self.sentiment_by_topic.keys()):
                plt.subplot(3, 4, i + 1)
                score = []
                for topic in range(n_topics):
                    score.append(self.sentiment_by_topic[site][topic][3])
                score = np.array(score)
                score /= sum(np.abs(score))
                plt.bar(np.arange(len(score)), score, align='center')
                plt.ylabel('Score')
                plt.title('Score by Topic for ' + site)
            plt.subplots_adjust(hspace=0.4, wspace=0.4)
            plt.show()

        return self.sentiment_by_topic

    def length_of_articles_hist(self, df):
        for i, site in enumerate(df['source'].unique()):
            plt.subplot(3, 4, i + 1)
            new_df = df[df['source'] == site]
            article_len = [
                len(article.split(' ')) for article in new_df['article_text']
            ]
            plt.hist(article_len, density=True)  # 'normed' was removed from matplotlib
            plt.xlabel('Length of Article')
            plt.ylabel('# of Articles')
            plt.title('Length of articles for ' + site)
        plt.subplots_adjust(hspace=0.4, wspace=0.4)
        plt.show()

    def pickle_everything(self):
        filename = '../pickles/lda_model.pkl'
        pickle.dump(self.lda_model, open(filename, 'wb'), protocol=2)

        filename = '../pickles/tf_vectorizer.pkl'
        pickle.dump(self.tf_vectorizer, open(filename, 'wb'), protocol=2)
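A minimal driver sketch for this class (the CSV filename is hypothetical; from_csv reads it from the data/ folder as shown above):

# load articles, fit the LDA topic model, save the pyLDAvis output and the fitted objects
nb = NewsBias()
df = nb.from_csv('articles.csv')
if df is not False:
    nb.run_lda(df, max_features=1000, n_topics=20)
    nb.visualize_lda(df)          # writes plots/pyLDAvis_1000feats_20topics.html
    nb.pickle_everything()        # writes ../pickles/lda_model.pkl and tf_vectorizer.pkl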
Example #15
fid.close()
f2.close()
#    i = i + 1
#    print i

print(len(corpus))

vectorizer = CountVectorizer()
transformer = TfidfTransformer()

tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
word = vectorizer.get_feature_names()
weight = tfidf.toarray()

vectorP = vectorizer.get_params()
tfidfP = transformer.get_params()

print('vectorP:', vectorP)
print('tfidfP:', tfidfP)

joblib.dump(vectorizer, "vectorizer" + str(sys.argv[1]) + ".m")
joblib.dump(transformer, "tfidf" + str(sys.argv[1]) + ".m")

resName = "BaiduTfidf_Result.txt"
result = codecs.open(resName, 'w', 'utf-8')
for j in range(len(word)):
    result.write(word[j] + ' ')
result.write('\r\n\r\n')

for i in range(len(weight)):
Example #16
X = cv.fit_transform(corpus).toarray()

messages.columns

y = messages['Label_enc']

## Divide the dataset into Train and Test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=0)

cv.get_feature_names()[:20]
cv.get_params()

from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()

from sklearn import metrics
import numpy as np
import itertools

classifier.fit(X_train, y_train)
pred = classifier.predict(X_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)
cm = metrics.confusion_matrix(y_test, pred)

import matplotlib.pyplot as plt
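The snippet stops right after importing matplotlib; a minimal sketch of plotting the confusion matrix computed above (the 'ham'/'spam' label order is an assumption about Label_enc; itertools, numpy and pyplot are already imported in this example):

# simple heat map of the 2x2 confusion matrix
classes = ['ham', 'spam']
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Confusion matrix')
plt.colorbar()
plt.xticks(np.arange(len(classes)), classes)
plt.yticks(np.arange(len(classes)), classes)
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
    plt.text(j, i, format(cm[i, j], 'd'), ha='center')
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()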
Example #17
xtrain, xtest, ytrain, ytest = train_test_split(df['cleaned'],
                                                df.opinion,
                                                test_size=0.2)

count = CountVectorizer(ngram_range=(1, 2),
                        analyzer="word",
                        min_df=10,
                        max_df=0.9)
#count = CountVectorizer(ngram_range=(1,2),analyzer="word")
#count = CountVectorizer(lowercase=False)

temp = count.fit_transform(xtrain)
print(count.vocabulary_.__len__())
#print(count.get_feature_names())
print(count.get_params())  # one sentence's vector per row

tdif = TfidfTransformer()

temp2 = tdif.fit_transform(temp)
print(temp2)
text_regression = LogisticRegression()

model = text_regression.fit(temp2, ytrain)

prediction_data = tdif.transform((count.transform(xtest)))
#prediction_data = count.transform(xtest)
predicted = model.predict(prediction_data)

print(model.get_params())
Example #18
    message = [
        ps.stem(word) for word in message
        if word not in stopwords.words('english')
    ]
    message = ' '.join(message)
    corpus.append(message)

print(corpus[0])

#now applying countvector and apply bag of words

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000, ngram_range=(1, 3))
x = cv.fit_transform(corpus).toarray()
print(cv.get_feature_names()[:20])  #check the top 20 feature names
print(cv.get_params())  #input parameter formed and their types
"""vector formed in bag of words"""
# count_df = pd.DataFrame(x_train, columns = cv.get_feature_names())
# count_df.head()

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=0)

from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()

import numpy as np
from sklearn import metrics
Example #19
def generate_vectors(train_url,
                     test_url=None,
                     column='article',
                     trans_type=None,
                     max_n=1,
                     min_df=1,
                     max_df=1.0,
                     max_features=1,
                     sublinear_tf=True,
                     balanced=False,
                     re_weight=0,
                     verbose=False,
                     drop_words=0):
    """ generate X, y, X_test vectors with csv(with header) url use pandas and CountVectorizer

    Args:
        train_url: url to train csv
        test_url: url to test csv, set to None if not need X_test
        column: column to use as feature
        trans_type: specific transformer, {'dc','idf'}
        max_n: max_n for ngram_range
        min_df: min_df for CountVectorizer
        max_df: max_df for CountVectorizer
        max_features: max_features for CountVectorizer
        sublinear_tf: sublinear_tf for default TfdcTransformer
        balanced: balanced for default TfdcTransformer, for idf transformer, it is use_idf
        re_weight: re_weight for TfdcTransformer
        verbose: True to show more information
        drop_words: randomly delete some words from sentences

    Returns:
        X, y, X_test

    """
    verbose and print("loading '%s' level data from %s with pandas" %
                      (column, train_url))

    train_df = load_to_df(train_url)

    # vectorizer
    vec = CountVectorizer(ngram_range=(1, max_n),
                          min_df=min_df,
                          max_df=max_df,
                          max_features=max_features,
                          token_pattern=r'\w+')
    s_time = time()
    verbose and print("finish loading, vectorizing")
    verbose and print("vectorizer params:", vec.get_params())

    sequences = train_df[column]
    # delete some words randomly
    for i, row in enumerate(sequences):
        if drop_words <= 0:
            break
        if np.random.ranf() < drop_words:
            row = np.array(row.split())
            sequences.at[i] = ' '.join(row[np.random.ranf(row.shape) > 0.35])

    X = vec.fit_transform(sequences)
    e_time = time()
    verbose and print("finish vectorizing in %.3f seconds, transforming" %
                      (e_time - s_time))
    # transformer
    if trans_type is None or trans_type == 'idf':
        trans = TfidfTransformer(sublinear_tf=sublinear_tf, use_idf=balanced)
    else:
        trans = TfdcTransformer(sublinear_tf=sublinear_tf,
                                balanced=balanced,
                                re_weight=re_weight)

    verbose and print("transformer params:", trans.get_params())
    y = np.array((train_df["class"]).astype(int))
    X = trans.fit_transform(X, y)

    X_test = None
    if test_url:
        verbose and print("transforming test set")
        test_df = load_to_df(test_url)
        X_test = vec.transform(test_df[column])
        X_test = trans.transform(X_test)

    s_time = time()
    verbose and print("finish transforming in %.3f seconds\n" %
                      (s_time - e_time))
    return X, y, X_test
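A minimal usage sketch (the CSV paths are hypothetical; the train CSV is expected to contain the feature column plus a 'class' column, as the code above requires):

# unigram counts on the 'article' column, tf-idf weighting, vectors for train and test
X, y, X_test = generate_vectors('data/train_set.csv',
                                test_url='data/test_set.csv',
                                column='article',
                                trans_type='idf',
                                min_df=3,
                                max_df=0.9,
                                max_features=200000,
                                verbose=True)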
Example #20
def train_topic_model(wordcloud_path, number_topics, model_path,
                      preprocessed_path, clusterable_words, query_words,
                      filename, out, use_tfidf, expert_terms):

    outfile, outfile_pos, wordcloud_file, wordcloud_json, statistic_topics_json = prepare_infrastructure.prepare_file_names_train(
        filename, out)
    stop_words_german, stop_words_english = prepare_stopwords(query_words)
    # Json file to store topics and their word distribution
    if os.path.isfile(wordcloud_path):
        os.remove(wordcloud_path)
    file = open(wordcloud_path, 'a', encoding='utf8')
    var1 = {'name': 'topics', 'children': []}
    place_holder_list = var1.get('children')

    infile = clusterable_words

    # Removing stopwords from the clusterable words
    fin = open(infile, 'r', encoding='utf8')
    fout = open(outfile, "w+", encoding='utf8')
    for line in fin.readlines():
        for word in line.split():
            if word not in stop_words_german:
                fout.write(word + ' ')
        fout.write('\n')
    fin.close()
    fout.close()

    # Learn the vocabulary dictionary and return term-document matrix (BOWs)
    new_data, stem_lemma_dict = stem_clusterable_words(outfile)

    empty_lines = []
    new_data_sklearn = []  # replace by sklearn
    for i, n in enumerate(new_data):
        if new_data[i] == '':
            empty_lines.append(i)
        else:
            new_data_sklearn.append(n.split(' '))

    with open('output/new_data_gensim.sav', 'wb') as f:
        pickle.dump(new_data_sklearn, f)

    topic_input = open('output/topic_input.txt', "w+", encoding='utf8')
    for k in new_data:
        topic_input.write(k + '\n')
    topic_input.close()

    if use_tfidf is False:
        print("INFO: no tfidf in use")
        tf_vectorizer = CountVectorizer(strip_accents='unicode',
                                        stop_words='english',
                                        lowercase=True,
                                        token_pattern=r'\b[a-zA-Z]{3,}\b',
                                        max_df=1.0,
                                        min_df=3)
        dtm_tf = tf_vectorizer.fit_transform(new_data)
        vocab = tf_vectorizer.get_feature_names()
        #print(dtm_tf.shape)
        lda_model = LatentDirichletAllocation(n_components=number_topics,
                                              random_state=0,
                                              max_iter=50,
                                              max_doc_update_iter=500)
        lda_model.fit(dtm_tf)
    else:
        print("INFO: tfidf in use")
        tf_vectorizer_init = CountVectorizer(strip_accents='unicode',
                                             stop_words='english',
                                             lowercase=True,
                                             token_pattern=r'\b[a-zA-Z]{3,}\b',
                                             max_df=1.0,
                                             min_df=3)
        tf_vectorizer = TfidfVectorizer(**tf_vectorizer_init.get_params())
        dtm_tf = tf_vectorizer.fit_transform(new_data)
        vocab = tf_vectorizer.get_feature_names()
        #print(dtm_tf.shape)
        lda_model = LatentDirichletAllocation(n_components=number_topics,
                                              random_state=0,
                                              max_iter=50,
                                              max_doc_update_iter=500)
        lda_model.fit(dtm_tf)

    pickle.dump(lda_model, open(model_path, 'wb'))
    model = pickle.load(open(model_path, 'rb'))
    prepare_topic_distribution(model, place_holder_list, stem_lemma_dict,
                               vocab)
    save_word_cloud_json(var1, file)
    display_word_cloud(number_topics, wordcloud_file, wordcloud_json)
    save_train_topic_to_json(model, dtm_tf, preprocessed_path,
                             statistic_topics_json, expert_terms,
                             wordcloud_path)
    vis = pyLDAvis.sklearn.prepare(lda_model,
                                   dtm_tf,
                                   tf_vectorizer,
                                   sort_topics=False)
    pyLDAvis.save_html(vis, 'output/LDA_Visualization_sklearn.html')
Example #21
                                                    test_size=0.33,
                                                    random_state=25)

# In[25]:

y_test.shape

# In[26]:

cv.get_feature_names()[:20]  # Top 20 feature names for this data set, which shows 2-word and 3-word n-grams together #

# In[27]:

cv.get_params()  # will give details for the count vectorizer applied #

# In[28]:

# Data set after applying the count vectorizer #

df_count = pd.DataFrame(X, columns=cv.get_feature_names())
df_count.head()

# In[29]:

# Applying the Multionomial NB algorithm #

from sklearn.naive_bayes import MultinomialNB

mn = MultinomialNB()
Example #22
# define steps in the pipeline

# (1) define parameters of count vectorizer
vec = CountVectorizer(
    analyzer="word",
    stop_words='english',
    ngram_range=(1, 2),
    #preprocessor=None,
    tokenizer=word_tokenize,
    max_features=10000)

# inspect:
vec.get_stop_words()
vec.get_feature_names()[:10]  #first 10 features (unigrams and bigrams)
vec.get_params()

# (2) classifier
lr = LogisticRegression()

# inspect:
lr.get_params()

# define a scikit learn pipeline

pipe_bigram_lr_clf = Pipeline([('vectorizer', vec), ('classifier', lr)])

# inspect: steps
pipe_bigram_lr_clf.named_steps

### Fit transformer/classifier Pipeline to train data (X_train and y_train) ----
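The listing ends at the fit step; a minimal sketch of fitting and applying the pipeline (X_train, y_train and X_test are assumed to be the usual raw-text/label splits):

# fit the vectorizer + classifier pipeline on raw training text, then predict held-out text
pipe_bigram_lr_clf.fit(X_train, y_train)
y_pred = pipe_bigram_lr_clf.predict(X_test)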
Example #23
def generate_vectors(train_url,
                     test_url=None,
                     column='article',
                     trans_type=None,
                     max_n=1,
                     min_df=1,
                     max_df=1.0,
                     max_features=1,
                     sublinear_tf=True,
                     balanced=False,
                     re_weight=0,
                     verbose=False,
                     drop_words=0,
                     multilabel_out=False,
                     label_col='subjects',
                     only_single=True,
                     shuffle=True,
                     apply_fun=None):
    """ generate X, y, X_test vectors with csv(with header) url use pandas and CountVectorizer

    Args:
        train_url: url to train csv
        test_url: url to test csv, set to None if not need X_test
        column: column to use as feature
        trans_type: specific transformer, {'dc','idf', 'hashing'}
        max_n: max_n for ngram_range
        min_df: min_df for CountVectorizer
        max_df: max_df for CountVectorizer
        max_features: max_features for CountVectorizer
        sublinear_tf: sublinear_tf for default TfdcTransformer
        balanced: balanced for default TfdcTransformer, for idf transformer, it is use_idf
        re_weight: re_weight for TfdcTransformer
        verbose: True to show more information
        drop_words: randomly delete some words from sentences
        multilabel_out: return y as multilabel format
        label_col: col name of label
        only_single: only keep records of single label
        shuffle: re sample train data
        apply_fun: callable to be applied on label column

    Returns:
        X, y, X_test

    """
    verbose and print("loading '%s' level data from %s with pandas" %
                      (column, train_url))

    train_df = pd.read_csv(train_url)
    if shuffle:
        train_df = train_df.sample(frac=1)
    if only_single:
        train_df = train_df[train_df['subjects'].apply(lambda x: len(x) < 2)]

    # vectorizer
    s_time = time()
    analyzer = 'word' if column == 'word_seg' else 'char'
    vec = CountVectorizer(analyzer=analyzer,
                          ngram_range=(1, max_n),
                          min_df=min_df,
                          max_df=max_df,
                          max_features=max_features,
                          token_pattern=r'\w+')
    verbose and print("finish loading, vectorizing")
    verbose and print("vectorizer params:", vec.get_params())
    sequences = train_df[column]
    # delete some words randomly
    for i, row in enumerate(sequences):
        if drop_words <= 0:
            break
        if np.random.ranf() < drop_words:
            row = np.array(row.split())
            sequences.at[i] = ' '.join(row[np.random.ranf(row.shape) > 0.35])
    X = sequences if trans_type == 'hashing' else vec.fit_transform(sequences)
    e_time = time()
    verbose and print("finish vectorizing in %.3f seconds, transforming" %
                      (e_time - s_time))

    # transformer
    if trans_type is None or trans_type == 'idf':
        trans = TfidfTransformer(sublinear_tf=sublinear_tf, use_idf=balanced)
    elif trans_type == 'dc':
        trans = TfdcTransformer(sublinear_tf=sublinear_tf,
                                balanced=balanced,
                                re_weight=re_weight)
    else:
        trans = HashingVectorizer(analyzer=analyzer,
                                  ngram_range=(1, max_n),
                                  n_features=max_features,
                                  token_pattern=r'\w+',
                                  binary=not balanced)
    verbose and print(trans_type, "transformer params:", trans.get_params())

    if multilabel_out:
        mlb = MultiLabelBinarizer()
        y = mlb.fit_transform(train_df[label_col].apply(str.split))
        verbose and print("multilabel columns:\n", mlb.classes_)
    else:
        y = train_df[label_col].apply(apply_fun).values if apply_fun is not None \
            else train_df[label_col].values
    X = trans.fit_transform(X, y)

    X_test = None
    if test_url:
        verbose and print("transforming test set")
        test_df = pd.read_csv(test_url)
        X_test = test_df[column] if trans_type == 'hashing' else vec.transform(
            test_df[column])
        X_test = trans.transform(X_test)
    s_time = time()
    verbose and print("finish transforming in %.3f seconds\n" %
                      (s_time - e_time))
    return X, y, X_test
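A minimal usage sketch of the multilabel path (the CSV paths are hypothetical; 'word_seg' and 'subjects' match the defaults used above):

# word-level counts with dc weighting and a MultiLabelBinarizer target matrix
X, y, X_test = generate_vectors('data/train.csv',
                                test_url='data/test.csv',
                                column='word_seg',
                                trans_type='dc',
                                max_n=2,
                                min_df=3,
                                max_df=0.8,
                                max_features=300000,
                                multilabel_out=True,
                                label_col='subjects',
                                verbose=True)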