Example #1
def read_messages(filepath):
    # use the filepath argument instead of a hard-coded path;
    # the raw CSV has three extra unnamed columns, and column names must be unique
    data = pd.read_csv(filepath,
                       encoding="latin1",
                       names=["labels", "text", "extra1", "extra2", "extra3"])
    data = data.filter(["labels", "text"])  # keep only the label and message columns
    mapping = {"spam": 0, "ham": 1}
    data = data.replace({"labels": mapping})

    ps = PorterStemmer()

    for index, value in data.iterrows():
        text = value["text"]
        text = porter_stemmer(text, ps)  # stemming helper defined elsewhere in the source file
        data.at[index, "text"] = text    # DataFrame.set_value was removed; use .at instead

    #counts the number of uses per word
    count_vectorizer = CountVectorizer()
    counts = count_vectorizer.fit_transform(data["text"])
    labels = data["labels"]
    # assert(len(labels) == len(counts))
    # print("Number of examples", len(labels))
    print("params", count_vectorizer.get_params())
    print("type", type(count_vectorizer))
    print("shape", np.shape(counts))
    return labels, counts, count_vectorizer
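The porter_stemmer helper is not part of this example; a minimal sketch of what it might look like (an assumption, using NLTK's PorterStemmer and a plain whitespace split):

from nltk.stem import PorterStemmer

def porter_stemmer(text, ps):
    # hypothetical helper: lowercase the message, split on whitespace, stem each token
    return " ".join(ps.stem(token) for token in text.lower().split())

# e.g. porter_stemmer("Running dogs", PorterStemmer()) stems each word ("running" -> "run")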
Example #2
def featTransform(sents_train, sents_test):
    cv = CountVectorizer()
    cv.fit(sents_train)
    print(cv.get_params())
    features_train = cv.transform(sents_train)
    features_test = cv.transform(sents_test)
    return features_train, features_test, cv
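A minimal usage sketch for featTransform with made-up sentence lists:

# hypothetical train/test sentences; the vocabulary is learned on the train split only
sents_train = ["spam spam lovely spam", "ham and eggs", "free prize inside"]
sents_test = ["free spam prize", "eggs and ham"]
features_train, features_test, cv = featTransform(sents_train, sents_test)
print(features_train.shape, features_test.shape)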
Example #3
    def get_params(self, deep=True):
        params = super().get_params(deep)
        # Hack to make get_params return base class params...
        cp = copy.copy(self)
        cp.__class__ = CountVectorizer
        params.update(CountVectorizer.get_params(cp, deep))
        return params
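This override only makes sense inside a CountVectorizer subclass whose __init__ adds parameters of its own; a minimal sketch of such a context (the subclass and its extra parameter are hypothetical):

import copy
from sklearn.feature_extraction.text import CountVectorizer

class MyVectorizer(CountVectorizer):
    # hypothetical subclass that introduces one parameter of its own
    def __init__(self, extra_option=False):
        super().__init__()
        self.extra_option = extra_option

    def get_params(self, deep=True):
        params = super().get_params(deep)   # only yields {'extra_option': ...}
        # Hack to make get_params return base class params as well
        cp = copy.copy(self)
        cp.__class__ = CountVectorizer
        params.update(CountVectorizer.get_params(cp, deep))
        return params

# MyVectorizer().get_params() now lists ngram_range, stop_words, etc. alongside extra_option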
Example #4
def featTransform(sents_train, sents_test):
    cv = CountVectorizer()
    cv.fit(sents_train)
    print(cv.get_params())
    features_train = cv.transform(sents_train)
    features_test = cv.transform(sents_test)
    return features_train, features_test, cv
Example #5
    def Common_Vectorizer_usage():
        from sklearn.feature_extraction.text import CountVectorizer
        vectorizer = CountVectorizer(min_df=1)
        corpus = [
            'This is the first document.',
            'This is the second second document.',
            'And the third one.',
            'Is this the first document?',
        ]

        analyze = vectorizer.build_analyzer()
        print analyze("This is a text document to analyze.")
        print analyze("This is a text document to analyze.") == ['this', 'is', 'text', 'document', 'to', 'analyze']
        
        X = vectorizer.fit_transform(corpus)
        print(vectorizer.get_feature_names())
        print(vectorizer.vocabulary_)    # .get('document')
        print(vectorizer.transform(['Something completely new.']).toarray())
        print(list(X))
        
        #bigram========================================================
        bigram_vectorizer = CountVectorizer(ngram_range=(1, 2), token_pattern=r'\b\w+\b', min_df=1)
        analyze = bigram_vectorizer.build_analyzer()
        print(analyze('Bi-grams are cool!'))
        X_2 = bigram_vectorizer.fit_transform(corpus).toarray()
        print(X_2)

        feature_index = bigram_vectorizer.vocabulary_.get('is this')
        print(X_2[:, feature_index])
        
        # marui test
        print('\n\nmarui test=====================')
        def t_preprocessor(s):
            return ','.join([x.lower() for x in s.split(' ')])

        stop_words1 = ['is', 'a', 'this']           # ok: becomes frozenset(['a', 'this', 'is'])
        stop_words2 = {'is': 0, 'a': 1, 'this': 2}  # ok: converted to frozenset(['a', 'this', 'is'])

        cv = CountVectorizer(preprocessor=t_preprocessor, stop_words=stop_words2)
        params = cv.get_params()
        print('get_params()', type(params), '---------------')
        for k in params:
            print(k, '\t', params[k])
        print('get_params end--------------')
        print('\nget_stop_words=', cv.get_stop_words())

        cv.fit(corpus)
        print(cv.get_feature_names())
        print(cv.transform(corpus).toarray())
        print('\ntest preprocessor, result:\t', cv.build_preprocessor()('this is a document'))
        print('\ntest tokenizer, result', cv.build_tokenizer()('this is a document'))
        print('\ntest tokenizer2, result', cv.build_tokenizer()('th-is is a document'))
        print('\ntest tokenizer2, result', cv.build_tokenizer()('th_is is a document'))
        print('\ntest tokenizer2, result', cv.build_tokenizer()('th&is is a document'))

        """
def LDA(docs_raw, n_topics):
    tf_vectorizer = CountVectorizer(strip_accents='unicode',
                                    stop_words=en_stop,  # custom stop-word list defined elsewhere
                                    # token_pattern keeps any word token; tighten it to drop short or special words
                                    token_pattern=r'\b\w+\b')
    dtm_tf = tf_vectorizer.fit_transform(docs_raw)
    tfidf_vectorizer = TfidfVectorizer(**tf_vectorizer.get_params())
    dtm_tfidf = tfidf_vectorizer.fit_transform(docs_raw)
    # for TF DTM (%time is an IPython magic and only works in a notebook; plain calls are used here)
    lda_tf = LatentDirichletAllocation(n_components=n_topics, random_state=0)
    lda_tf.fit(dtm_tf)
    # for TFIDF DTM
    lda_tfidf = LatentDirichletAllocation(n_components=n_topics, random_state=0)
    lda_tfidf.fit(dtm_tfidf)
    return lda_tf, dtm_tf, tf_vectorizer
Example #7
def main2(params):
    df = pd.read_csv(join(params.input_folder,
                          'tokenized1/084_update_quality_minmax_sizeFixture.cs.tree-viewer.txt'), header=None)
    df = df[df[0].notnull()]
    df = df.applymap(filter_type)  # applymap returns a new DataFrame; keep the result
    matrix = CountVectorizer(max_features=10)
    X = matrix.fit_transform(df[0]).toarray()
    print(matrix.vocabulary_)
    print(matrix.get_params())
    df[0].iloc[0:10].str.cat(sep=' ')
    starters = df.loc[df[0] == "BEGIN_METHOD"]
    enders = df.loc[df[0] == "END_METHOD"]
    zipped = list(zip(starters.index, enders.index))
    functions_list = []
    for begin, end in zipped:
        functions_list.append(df[0].iloc[begin:end+1].str.cat(sep=' '))
Example #8
def model(df):
    #hasher = HashingVectorizer(n_features=100, analyzer='word', stop_words='english', alternate_sign=False, norm=None)
    hasher = CountVectorizer(analyzer='word', stop_words='english')
    vectorizer = make_pipeline(hasher, TfidfTransformer(use_idf=False))
    X = vectorizer.fit_transform(df['links'])
    print(hasher.get_params())
    normalizer = Normalizer(copy=False)
    svd = TruncatedSVD(n_components=12)
    lsa = make_pipeline(svd, normalizer)
    Y = lsa.fit_transform(X)
    km = KMeans(n_clusters=4, init='k-means++', max_iter=1000, n_init=1)
    Z = km.fit_predict(Y)
    df['labels'] = Z
    df['first'] = Y[:, 0]
    df['second'] = Y[:, 1]
    df['third'] = Y[:, 2]

    result = df[['outlet', 'total', 'labels', 'first', 'second', 'third']]
    return result
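A minimal usage sketch (assuming df is a DataFrame with at least the 'links' text column plus the 'outlet' and 'total' columns, and enough rows and vocabulary for the 12-component SVD and 4 clusters):

# cluster the documents and inspect how many landed in each cluster
result = model(df)
print(result.groupby('labels').size())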
Example #9
def gen_document_term_matrices(args, data):
    """Generates document-term matrices"""

    # http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
    tf_vectorizer = CountVectorizer(stop_words="english",
                                    max_features=args.num_features,
                                    max_df=0.95,
                                    min_df=2)
    dtm_tf = tf_vectorizer.fit_transform(data)
    with open("%s/dtm_tf.pkl" % args.output_dir, "wb") as dtm_file:
        pickle.dump(tf_vectorizer, dtm_file)
        pickle.dump(dtm_tf, dtm_file)

    # http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
    tfidf_vectorizer = TfidfVectorizer(**tf_vectorizer.get_params())
    dtm_tfidf = tfidf_vectorizer.fit_transform(data)
    with open("%s/dtm_tfidf.pkl" % args.output_dir, "wb") as dtm_file:
        pickle.dump(tfidf_vectorizer, dtm_file)
        pickle.dump(dtm_tfidf, dtm_file)
    return tf_vectorizer, dtm_tf, dtm_tfidf
Example #10
def topic_modelling(data):
    abstracts = []
    for abstract in data:
        # Remove punctuation
        abstract = re.sub(r'[,.!?]', '', abstract)
        # Remove numbers
        abstract = re.sub('[0-9]', '', abstract)
        # Convert the abstracts to lowercase
        abstract = abstract.lower()
        abstracts.append(abstract)
    # Splitting abstracts
    snnipets = []
    for abstract in abstracts:
        if abstract != "abstract not available":
            length = len(abstract)
            index = 0
            last_i = 0
            n = 256
            while index < length:
                i = abstract.rfind(". ", index, index + n)
                if i == -1 or i == index:
                    i = index + n
                text = abstract[index:i + 2]
                index = i + 2
                snnipets.append(text)
    # Creating LDA
    #number_topics = 5
    tf_vectorizer = CountVectorizer(stop_words='english')
    tfidf_vectorizer = TfidfVectorizer(**tf_vectorizer.get_params())
    dtm_tfidf = tfidf_vectorizer.fit_transform(snnipets)
    lda_tfidf = LDA(random_state=0)
    lda_tfidf.fit(dtm_tfidf)
    # Visualizing LDA
    data = pyLDAvis.sklearn.prepare(lda_tfidf,
                                    dtm_tfidf,
                                    tfidf_vectorizer,
                                    mds='mmds')
    html = pyLDAvis.prepared_data_to_html(data, template_type="simple")
    return html
Example #11
from sklearn.feature_extraction.text import CountVectorizer
import itertools
import numpy as np

data = ['She did not cheat on the test, for it was not the right thing to do.','I think I will buy the red car, or I will lease the blue one.','I really want to go to work, but I am too sick to drive.','I am counting my calories, yet I really want dessert.']
count_vect = CountVectorizer()


cx = count_vect.fit_transform(data)

vocab = {}
for key, value in count_vect.vocabulary_.items():
    vocab[value] = key

# print(vocab)

start = 0
for i, end in enumerate(cx.indptr[1:]):
    for j, val in zip(cx.indices[start:end], cx.data[start:end]):
        print("(" + str(i) + "," + str(j) + "): " + str(val) + " => '" + vocab[j] + "'")
        # print(vocab[j] + " ", end="")
    print("")
    start = end

print(count_vect.get_params())
#for i,j,v in zip(cx.row, cx.col, cx.data):
#     print (i,j,v)
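The commented-out loop above assumes cx has .row and .col attributes, which only a COO matrix provides; a short sketch of that route, reusing the cx and vocab objects built above:

# convert the CSR output of fit_transform to COO to iterate (row, col, value) triples
coo = cx.tocoo()
for i, j, v in zip(coo.row, coo.col, coo.data):
    print((i, j, v), "=>", vocab[j])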
Example #12
for file in files:  #Topic modeling that reads in utterances
    df = pd.read_csv(file)
    utterance_temp.append(df['stringList'].tolist())

utterance_raw = [item for sublist in utterance_temp for item in sublist]

tf_vectorizer = CountVectorizer(strip_accents='unicode',
                                stop_words='english',
                                lowercase=True,
                                token_pattern=r'\b[a-zA-Z]{3,}\b',
                                max_df=0.2,
                                min_df=0)
dtm_tf = tf_vectorizer.fit_transform(utterance_raw)

tfidf_vectorizer = TfidfVectorizer(**tf_vectorizer.get_params())
dtm_tfidf = tfidf_vectorizer.fit_transform(utterance_raw)

# for TF DTM
lda_tf = LatentDirichletAllocation(n_components=30, random_state=0)  # n_topics was renamed to n_components
lda_tf.fit(dtm_tf)
# for TFIDF DTM
lda_tfidf = LatentDirichletAllocation(n_components=30, random_state=0)
lda_tfidf.fit(dtm_tfidf)

nmf_tf = NMF(n_components=80, random_state=1, alpha=.1,
             l1_ratio=.5).fit(dtm_tf)

# nmf_tfidf = NMF(n_components=10, random_state=1,
#           beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
#           l1_ratio=.5).fit(dtm_tfidf)
Example #13
matrix.vocabulary_

# In[93]:

sizes = sorted(np.asarray(cv_fit.sum(axis=0))[0], reverse=True)
print(sizes)

# In[94]:

values = list(matrix.vocabulary_.keys())
values

# In[96]:

matrix.get_params()

# #### How to combine multiple rows into a single row with pandas

# In[97]:

df[0].iloc[0:10].str.cat(sep=' ')

# In[ ]:

get_ipython().run_line_magic('pinfo', 'matrix')

# #### separate functions from each other

# In[98]:
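The cell under the "separate functions from each other" heading is cut off here; a sketch of that step, based on the BEGIN_METHOD/END_METHOD logic shown in Example #7 (df is assumed to be the token DataFrame built earlier in the notebook):

# collect each method's tokens between its BEGIN_METHOD and END_METHOD markers
starters = df.loc[df[0] == "BEGIN_METHOD"]
enders = df.loc[df[0] == "END_METHOD"]
functions_list = []
for begin, end in zip(starters.index, enders.index):
    functions_list.append(df[0].iloc[begin:end + 1].str.cat(sep=' '))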
Example #14
class NewsBias:
    def __init__(self):
        self.tf_vectorizer = []
        self.tf = []
        self.lda_model = []
        self.feature_names = []
        self.topics_mat = []
        self.sentiment_by_topic = []

    def fix_sites(self, mongo_db):
        fix_cnn(mongo_db)
        fix_huffpo(mongo_db)

    def from_mongo(self, db_name):
        df = get_df(db_name)
        df = clean_df(df)
        df = df[pd.notnull(df['processed_text'])]
        df = df[df['processed_text'] != '']

        return df

    def from_csv(self, csv_name):
        try:
            df = pd.read_csv('data/' + csv_name, parse_dates=False)
            return df
        except:
            print('CSV file does not exist!')
            print('Make sure CSV file is in data folder.')
            return False

    def to_csv(self, df, filename):
        filename = 'data/' + filename
        df.to_csv(filename, index=False)
        print('CSV file saved to: ' + filename)

    def update_from_bucket(self, filename):
        path = os.getcwd()
        # Example filename: 'dsiprojectdata/rss_feeds_new.tar'
        result = from_bucket(filename, path)
        if not result:
            print('Error updating data from bucket!')
            print(
                'Make sure you include folder and file in filename from bucket.'
            )

    def update_to_bucket(self, filename, bucketname, mongo_db=False):
        # If mongo database then just give database name as filename
        if mongo_db:
            cwd = os.getcwd()
            # Give permission to bash file then run
            p1 = subprocess.Popen(['chmod', '+x', 'backup.sh'],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
            out1, err1 = p1.communicate()
            p2 = subprocess.Popen([cwd + '/backup.sh', filename],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
            out2, err2 = p2.communicate()
        else:
            p = subprocess.Popen(['/usr/bin/aws', 's3', 'cp',
                                  filename, 's3://' + bucketname + '/'],
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE)
            out, err = p.communicate()

    def run_lda(self, df, max_features=1000, n_topics=20):
        df = df[pd.notnull(df['processed_text'])]
        processed_text = df['processed_text'].values.tolist()
        # Include quotes and tweets in the LDA input
        processed_quote = df['processed_quote'].values.tolist()
        processed_tweet = df['processed_tweet'].values.tolist()
        processed_all = []
        for text, quote, tweet in zip(processed_text, processed_quote, processed_tweet):
            # Check if quote is nan
            if type(quote) == float:
                quote = ''
            if type(tweet) == float:
                tweet = ''
            processed_all.append(text + quote + tweet)
        try:
            self.tf_vectorizer = CountVectorizer(max_df=0.95,
                                                 min_df=0.05,
                                                 max_features=max_features,
                                                 stop_words='english')
            self.tf = self.tf_vectorizer.fit_transform(processed_all)
        except:
            import pdb
            pdb.set_trace()
        self.lda_model = LatentDirichletAllocation(n_components=n_topics,
                                                   max_iter=5,
                                                   learning_method='online',
                                                   learning_offset=50.,
                                                   random_state=0,
                                                   n_jobs=-1)

        self.lda_model.fit(self.tf)

        self.feature_names = np.array(self.tf_vectorizer.get_feature_names())
        self.topics_mat = self.lda_model.components_

        return self.lda_model

    def run_gensim_lda(self, df, n_topics=20):
        self.lda_model = gensim_lda(df, n_topics)

    def get_top_word_by_topic(self, topic, n_words):
        return self.feature_names[np.argsort(
            self.topics_mat[topic, :])[::-1]][:n_words]

    def visualize_lda(self, df, display=False):
        if self.lda_model == []:
            self.run_lda(df)
        max_features = self.tf_vectorizer.get_params()['max_features']
        n_topics = self.lda_model.get_params()['n_components']
        vis_data = pyLDAvis.sklearn.prepare(self.lda_model,
                                            self.tf,
                                            self.tf_vectorizer,
                                            R=n_topics,
                                            n_jobs=-1)
        pyLDAvis.save_html(
            vis_data, 'plots/pyLDAvis_' + str(max_features) + 'feats_' +
            str(n_topics) + 'topics.html')
        if display:
            pyLDAvis.show(vis_data)

    def get_sentiment_of_words(self, df):
        sentiment_of_words = sentiment_of_words_wordnet(df)

        return sentiment_of_words

    def get_sentiment_by_topic(self, df, display=False):
        n_topics = self.lda_model.get_params()['n_components']

        self.sentiment_by_topic = sentiment_by_topic_wordnet(
            df, self.topics_mat, self.feature_names)

        if display:
            for i, site in enumerate(self.sentiment_by_topic.keys()):
                plt.subplot(3, 4, i + 1)
                score = []
                for topic in range(n_topics):
                    score.append(self.sentiment_by_topic[site][topic][3])
                score = np.array(score)
                score /= sum(np.abs(score))
                plt.bar(np.arange(len(score)), score, align='center')
                plt.ylabel('Score')
                plt.title('Score by Topic for ' + site)
            plt.subplots_adjust(hspace=0.4, wspace=0.4)
            plt.show()

        return self.sentiment_by_topic

    def length_of_articles_hist(self, df):
        for i, site in enumerate(df['source'].unique()):
            plt.subplot(3, 4, i + 1)
            new_df = df[df['source'] == site]
            article_len = [
                len(article.split(' ')) for article in new_df['article_text']
            ]
            plt.hist(article_len, density=True)  # 'normed' was removed from matplotlib
            plt.xlabel('Length of Article')
            plt.ylabel('# of Articles')
            plt.title('Length of articles for ' + site)
        plt.subplots_adjust(hspace=0.4, wspace=0.4)
        plt.show()

    def pickle_everything(self):
        filename = '../pickles/lda_model.pkl'
        pickle.dump(self.lda_model, open(filename, 'wb'), protocol=2)

        filename = '../pickles/tf_vectorizer.pkl'
        pickle.dump(self.tf_vectorizer, open(filename, 'wb'), protocol=2)
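A minimal driver sketch for this class (the CSV filename is hypothetical; from_csv reads it from the data/ folder as shown above):

# load articles, fit the LDA topic model, save the pyLDAvis output and the fitted objects
nb = NewsBias()
df = nb.from_csv('articles.csv')
if df is not False:
    nb.run_lda(df, max_features=1000, n_topics=20)
    nb.visualize_lda(df)          # writes plots/pyLDAvis_1000feats_20topics.html
    nb.pickle_everything()        # writes ../pickles/lda_model.pkl and tf_vectorizer.pkl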
Example #15
fid.close()
f2.close()
#    i = i + 1
#    print i

print(len(corpus))

vectorizer = CountVectorizer()
transformer = TfidfTransformer()

tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
word = vectorizer.get_feature_names()
weight = tfidf.toarray()

vectorP = vectorizer.get_params()
tfidfP = transformer.get_params()

print('vectorP:', vectorP)
print('tfidfP:', tfidfP)

joblib.dump(vectorizer, "vectorizer" + str(sys.argv[1]) + ".m")
joblib.dump(transformer, "tfidf" + str(sys.argv[1]) + ".m")

resName = "BaiduTfidf_Result.txt"
result = codecs.open(resName, 'w', 'utf-8')
for j in range(len(word)):
    result.write(word[j] + ' ')
result.write('\r\n\r\n')

for i in range(len(weight)):
Example #16
X = cv.fit_transform(corpus).toarray()

messages.columns

y = messages['Label_enc']

## Divide the dataset into Train and Test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=0)

cv.get_feature_names()[:20]
cv.get_params()

from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()

from sklearn import metrics
import numpy as np
import itertools

classifier.fit(X_train, y_train)
pred = classifier.predict(X_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)
cm = metrics.confusion_matrix(y_test, pred)

import matplotlib.pyplot as plt
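The snippet stops right after importing matplotlib; a minimal sketch of plotting the confusion matrix computed above (the 'ham'/'spam' label order is an assumption about Label_enc; itertools, numpy and pyplot are already imported in this example):

# simple heat map of the 2x2 confusion matrix
classes = ['ham', 'spam']
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Confusion matrix')
plt.colorbar()
plt.xticks(np.arange(len(classes)), classes)
plt.yticks(np.arange(len(classes)), classes)
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
    plt.text(j, i, format(cm[i, j], 'd'), ha='center')
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()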
Example #17
xtrain, xtest, ytrain, ytest = train_test_split(df['cleaned'],
                                                df.opinion,
                                                test_size=0.2)

count = CountVectorizer(ngram_range=(1, 2),
                        analyzer="word",
                        min_df=10,
                        max_df=0.9)
#count = CountVectorizer(ngram_range=(1,2),analyzer="word")
#count = CountVectorizer(lowercase=False)

temp = count.fit_transform(xtrain)
print(count.vocabulary_.__len__())
#print(count.get_feature_names())
print(count.get_params())  # one sentence's vector per row

tdif = TfidfTransformer()

temp2 = tdif.fit_transform(temp)
print(temp2)
text_regression = LogisticRegression()

model = text_regression.fit(temp2, ytrain)

prediction_data = tdif.transform((count.transform(xtest)))
#prediction_data = count.transform(xtest)
predicted = model.predict(prediction_data)

print(model.get_params())
Example #18
    message = [
        ps.stem(word) for word in message
        if word not in stopwords.words('english')
    ]
    message = ' '.join(message)
    corpus.append(message)

print(corpus[0])

#now applying countvector and apply bag of words

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000, ngram_range=(1, 3))
x = cv.fit_transform(corpus).toarray()
print(cv.get_feature_names()[:20])  #check the top 20 feature names
print(cv.get_params())  #input parameter formed and their types
"""vector formed in bag of words"""
# count_df = pd.DataFrame(x_train, columns = cv.get_feature_names())
# count_df.head()

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=0)

from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()

import numpy as np
from sklearn import metrics
Example #19
def generate_vectors(train_url,
                     test_url=None,
                     column='article',
                     trans_type=None,
                     max_n=1,
                     min_df=1,
                     max_df=1.0,
                     max_features=1,
                     sublinear_tf=True,
                     balanced=False,
                     re_weight=0,
                     verbose=False,
                     drop_words=0):
    """ generate X, y, X_test vectors with csv(with header) url use pandas and CountVectorizer

    Args:
        train_url: url to train csv
        test_url: url to test csv, set to None if not need X_test
        column: column to use as feature
        trans_type: specific transformer, {'dc','idf'}
        max_n: max_n for ngram_range
        min_df: min_df for CountVectorizer
        max_df: max_df for CountVectorizer
        max_features: max_features for CountVectorizer
        sublinear_tf: sublinear_tf for default TfdcTransformer
        balanced: balanced for default TfdcTransformer, for idf transformer, it is use_idf
        re_weight: re_weight for TfdcTransformer
        verbose: True to show more information
        drop_words: randomly delete some words from sentences

    Returns:
        X, y, X_test

    """
    verbose and print("loading '%s' level data from %s with pandas" %
                      (column, train_url))

    train_df = load_to_df(train_url)

    # vectorizer
    vec = CountVectorizer(ngram_range=(1, max_n),
                          min_df=min_df,
                          max_df=max_df,
                          max_features=max_features,
                          token_pattern=r'\w+')
    s_time = time()
    verbose and print("finish loading, vectorizing")
    verbose and print("vectorizer params:", vec.get_params())

    sequences = train_df[column]
    # delete some words randomly
    for i, row in enumerate(sequences):
        if drop_words <= 0:
            break
        if np.random.ranf() < drop_words:
            row = np.array(row.split())
            sequences.at[i] = ' '.join(row[np.random.ranf(row.shape) > 0.35])

    X = vec.fit_transform(sequences)
    e_time = time()
    verbose and print("finish vectorizing in %.3f seconds, transforming" %
                      (e_time - s_time))
    # transformer
    if trans_type is None or trans_type == 'idf':
        trans = TfidfTransformer(sublinear_tf=sublinear_tf, use_idf=balanced)
    else:
        trans = TfdcTransformer(sublinear_tf=sublinear_tf,
                                balanced=balanced,
                                re_weight=re_weight)

    verbose and print("transformer params:", trans.get_params())
    y = np.array((train_df["class"]).astype(int))
    X = trans.fit_transform(X, y)

    X_test = None
    if test_url:
        verbose and print("transforming test set")
        test_df = load_to_df(test_url)
        X_test = vec.transform(test_df[column])
        X_test = trans.transform(X_test)

    s_time = time()
    verbose and print("finish transforming in %.3f seconds\n" %
                      (s_time - e_time))
    return X, y, X_test
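A minimal usage sketch (the CSV paths are hypothetical; the train CSV is expected to contain the feature column plus a 'class' column, as the code above requires):

# unigram counts on the 'article' column, tf-idf weighting, vectors for train and test
X, y, X_test = generate_vectors('data/train_set.csv',
                                test_url='data/test_set.csv',
                                column='article',
                                trans_type='idf',
                                min_df=3,
                                max_df=0.9,
                                max_features=200000,
                                verbose=True)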
Example #20
def train_topic_model(wordcloud_path, number_topics, model_path,
                      preprocessed_path, clusterable_words, query_words,
                      filename, out, use_tfidf, expert_terms):

    outfile, outfile_pos, wordcloud_file, wordcloud_json, statistic_topics_json = prepare_infrastructure.prepare_file_names_train(
        filename, out)
    stop_words_german, stop_words_english = prepare_stopwords(query_words)
    # Json file to store topics and their word distribution
    if os.path.isfile(wordcloud_path):
        os.remove(wordcloud_path)
    file = open(wordcloud_path, 'a', encoding='utf8')
    var1 = {'name': 'topics', 'children': []}
    place_holder_list = var1.get('children')

    infile = clusterable_words

    # Removing stopwords from the clusterable words
    fin = open(infile, 'r', encoding='utf8')
    fout = open(outfile, "w+", encoding='utf8')
    for line in fin.readlines():
        for word in line.split():
            if word not in stop_words_german:
                fout.write(word + ' ')
        fout.write('\n')
    fin.close()
    fout.close()

    # Learn the vocabulary dictionary and return term-document matrix (BOWs)
    new_data, stem_lemma_dict = stem_clusterable_words(outfile)

    empty_lines = []
    new_data_sklearn = []  # replace by sklearn
    for i, n in enumerate(new_data):
        if new_data[i] == '':
            empty_lines.append(i)
        else:
            new_data_sklearn.append(n.split(' '))

    with open('output/new_data_gensim.sav', 'wb') as f:
        pickle.dump(new_data_sklearn, f)

    topic_input = open('output/topic_input.txt', "w+", encoding='utf8')
    for k in new_data:
        topic_input.write(k + '\n')
    topic_input.close()

    if use_tfidf is False:
        print("INFO: no tfidf in use")
        tf_vectorizer = CountVectorizer(strip_accents='unicode',
                                        stop_words='english',
                                        lowercase=True,
                                        token_pattern=r'\b[a-zA-Z]{3,}\b',
                                        max_df=1.0,
                                        min_df=3)
        dtm_tf = tf_vectorizer.fit_transform(new_data)
        vocab = tf_vectorizer.get_feature_names()
        #print(dtm_tf.shape)
        lda_model = LatentDirichletAllocation(n_components=number_topics,
                                              random_state=0,
                                              max_iter=50,
                                              max_doc_update_iter=500)
        lda_model.fit(dtm_tf)
    else:
        print("INFO: tfidf in use")
        tf_vectorizer_init = CountVectorizer(strip_accents='unicode',
                                             stop_words='english',
                                             lowercase=True,
                                             token_pattern=r'\b[a-zA-Z]{3,}\b',
                                             max_df=1.0,
                                             min_df=3)
        tf_vectorizer = TfidfVectorizer(**tf_vectorizer_init.get_params())
        dtm_tf = tf_vectorizer.fit_transform(new_data)
        vocab = tf_vectorizer.get_feature_names()
        #print(dtm_tf.shape)
        lda_model = LatentDirichletAllocation(n_components=number_topics,
                                              random_state=0,
                                              max_iter=50,
                                              max_doc_update_iter=500)
        lda_model.fit(dtm_tf)

    pickle.dump(lda_model, open(model_path, 'wb'))
    model = pickle.load(open(model_path, 'rb'))
    prepare_topic_distribution(model, place_holder_list, stem_lemma_dict,
                               vocab)
    save_word_cloud_json(var1, file)
    display_word_cloud(number_topics, wordcloud_file, wordcloud_json)
    save_train_topic_to_json(model, dtm_tf, preprocessed_path,
                             statistic_topics_json, expert_terms,
                             wordcloud_path)
    vis = pyLDAvis.sklearn.prepare(lda_model,
                                   dtm_tf,
                                   tf_vectorizer,
                                   sort_topics=False)
    pyLDAvis.save_html(vis, 'output/LDA_Visualization_sklearn.html')
Example #21
                                                    test_size=0.33,
                                                    random_state=25)

# In[25]:

y_test.shape

# In[26]:

cv.get_feature_names()[:20]  # Top 20 feature names for this data set, which shows 2-word and 3-word n-grams together #

# In[27]:

cv.get_params()  # will give details for the count vectorizer applied #

# In[28]:

# Data set after applying the count vectorizer #

df_count = pd.DataFrame(X, columns=cv.get_feature_names())
df_count.head()

# In[29]:

# Applying the Multionomial NB algorithm #

from sklearn.naive_bayes import MultinomialNB

mn = MultinomialNB()
Example #22
# define steps in the pipeline

# (1) define parameters of count vectorizer
vec = CountVectorizer(
    analyzer="word",
    stop_words='english',
    ngram_range=(1, 2),
    #preprocessor=None,
    tokenizer=word_tokenize,
    max_features=10000)

# inspect:
vec.get_stop_words()
vec.get_feature_names()[:10]  #first 10 features (unigrams and bigrams)
vec.get_params()

# (2) classifier
lr = LogisticRegression()

# inspect:
lr.get_params()

# define a scikit learn pipeline

pipe_bigram_lr_clf = Pipeline([('vectorizer', vec), ('classifier', lr)])

# inspect: steps
pipe_bigram_lr_clf.named_steps

### Fit transformer/classifier Pipeline to train data (X_train and y_train) ----
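The listing ends at the fit step; a minimal sketch of fitting and applying the pipeline (X_train, y_train and X_test are assumed to be the usual raw-text/label splits):

# fit the vectorizer + classifier pipeline on raw training text, then predict held-out text
pipe_bigram_lr_clf.fit(X_train, y_train)
y_pred = pipe_bigram_lr_clf.predict(X_test)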
Example #23
def generate_vectors(train_url,
                     test_url=None,
                     column='article',
                     trans_type=None,
                     max_n=1,
                     min_df=1,
                     max_df=1.0,
                     max_features=1,
                     sublinear_tf=True,
                     balanced=False,
                     re_weight=0,
                     verbose=False,
                     drop_words=0,
                     multilabel_out=False,
                     label_col='subjects',
                     only_single=True,
                     shuffle=True,
                     apply_fun=None):
    """ generate X, y, X_test vectors with csv(with header) url use pandas and CountVectorizer

    Args:
        train_url: url to train csv
        test_url: url to test csv, set to None if not need X_test
        column: column to use as feature
        trans_type: specific transformer, {'dc','idf', 'hashing'}
        max_n: max_n for ngram_range
        min_df: min_df for CountVectorizer
        max_df: max_df for CountVectorizer
        max_features: max_features for CountVectorizer
        sublinear_tf: sublinear_tf for default TfdcTransformer
        balanced: balanced for default TfdcTransformer, for idf transformer, it is use_idf
        re_weight: re_weight for TfdcTransformer
        verbose: True to show more information
        drop_words: randomly delete some words from sentences
        multilabel_out: return y as multilabel format
        label_col: col name of label
        only_single: only keep records of single label
        shuffle: re sample train data
        apply_fun: callable to be applied on label column

    Returns:
        X, y, X_test

    """
    verbose and print("loading '%s' level data from %s with pandas" %
                      (column, train_url))

    train_df = pd.read_csv(train_url)
    if shuffle:
        train_df = train_df.sample(frac=1)
    if only_single:
        train_df = train_df[train_df['subjects'].apply(lambda x: len(x) < 2)]

    # vectorizer
    s_time = time()
    analyzer = 'word' if column == 'word_seg' else 'char'
    vec = CountVectorizer(analyzer=analyzer,
                          ngram_range=(1, max_n),
                          min_df=min_df,
                          max_df=max_df,
                          max_features=max_features,
                          token_pattern=r'\w+')
    verbose and print("finish loading, vectorizing")
    verbose and print("vectorizer params:", vec.get_params())
    sequences = train_df[column]
    # delete some words randomly
    for i, row in enumerate(sequences):
        if drop_words <= 0:
            break
        if np.random.ranf() < drop_words:
            row = np.array(row.split())
            sequences.at[i] = ' '.join(row[np.random.ranf(row.shape) > 0.35])
    X = sequences if trans_type == 'hashing' else vec.fit_transform(sequences)
    e_time = time()
    verbose and print("finish vectorizing in %.3f seconds, transforming" %
                      (e_time - s_time))

    # transformer
    if trans_type is None or trans_type == 'idf':
        trans = TfidfTransformer(sublinear_tf=sublinear_tf, use_idf=balanced)
    elif trans_type == 'dc':
        trans = TfdcTransformer(sublinear_tf=sublinear_tf,
                                balanced=balanced,
                                re_weight=re_weight)
    else:
        trans = HashingVectorizer(analyzer=analyzer,
                                  ngram_range=(1, max_n),
                                  n_features=max_features,
                                  token_pattern=r'\w+',
                                  binary=not balanced)
    verbose and print(trans_type, "transformer params:", trans.get_params())

    if multilabel_out:
        mlb = MultiLabelBinarizer()
        y = mlb.fit_transform(train_df[label_col].apply(str.split))
        verbose and print("multilabel columns:\n", mlb.classes_)
    else:
        y = train_df[label_col].apply(apply_fun).values if apply_fun is not None \
            else train_df[label_col].values
    X = trans.fit_transform(X, y)

    X_test = None
    if test_url:
        verbose and print("transforming test set")
        test_df = pd.read_csv(test_url)
        X_test = test_df[column] if trans_type == 'hashing' else vec.transform(
            test_df[column])
        X_test = trans.transform(X_test)
    s_time = time()
    verbose and print("finish transforming in %.3f seconds\n" %
                      (s_time - e_time))
    return X, y, X_test
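A minimal usage sketch of the multilabel path (the CSV paths are hypothetical; 'word_seg' and 'subjects' match the defaults used above):

# word-level counts with dc weighting and a MultiLabelBinarizer target matrix
X, y, X_test = generate_vectors('data/train.csv',
                                test_url='data/test.csv',
                                column='word_seg',
                                trans_type='dc',
                                max_n=2,
                                min_df=3,
                                max_df=0.8,
                                max_features=300000,
                                multilabel_out=True,
                                label_col='subjects',
                                verbose=True)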