def read_messages(filepath):
    data = pd.read_csv(filepath, encoding="latin1",
                       names=["labels", "text", "", "", ""])
    data = data.filter(["labels", "text"])  # remove extra columns
    mapping = {"spam": 0, "ham": 1}
    data = data.replace({"labels": mapping})

    ps = PorterStemmer()
    for index, value in data.iterrows():
        text = value["text"]
        text = porter_stemmer(text, ps)
        data.at[index, "text"] = text

    # count the number of uses per word
    count_vectorizer = CountVectorizer()
    counts = count_vectorizer.fit_transform(data["text"])
    labels = data["labels"]
    # assert(len(labels) == len(counts))
    # print("Number of examples", len(labels))
    print("params", count_vectorizer.get_params())
    print("type", type(count_vectorizer))
    print("shape", np.shape(counts))
    return labels, counts, count_vectorizer
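# Hedged usage sketch (my addition, not from the original snippet): feeding the
# returned term counts into a simple classifier.  The file path is a placeholder.
from sklearn.naive_bayes import MultinomialNB

labels, counts, count_vectorizer = read_messages("./spam.csv")
clf = MultinomialNB().fit(counts, labels)
print(clf.score(counts, labels))  # training accuracy only; use a held-out split in practice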
def featTransform(sents_train, sents_test):
    cv = CountVectorizer()
    cv.fit(sents_train)
    print(cv.get_params())
    features_train = cv.transform(sents_train)
    features_test = cv.transform(sents_test)
    return features_train, features_test, cv
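# Hedged usage sketch (my addition): calling featTransform on a toy split.
# The example sentences are illustrative only.
sents_train = ["free prize waiting for you", "meeting at noon tomorrow"]
sents_test = ["claim your free prize now"]
features_train, features_test, cv = featTransform(sents_train, sents_test)
print(features_train.shape, features_test.shape)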
def get_params(self, deep=True):
    params = super().get_params(deep)
    # Hack to make get_params return base class params...
    cp = copy.copy(self)
    cp.__class__ = CountVectorizer
    params.update(CountVectorizer.get_params(cp, deep))
    return params
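# Hedged sketch (my own illustration, not from the original source): one context
# where the get_params override above is needed.  "StemmedCountVectorizer" and its
# "stemmer" argument are hypothetical names.  Because scikit-learn's
# BaseEstimator.get_params() introspects only the subclass __init__ signature, a
# subclass that forwards **kwargs to CountVectorizer would otherwise report just
# its own parameters, which breaks clone() and grid search.
import copy
from sklearn.feature_extraction.text import CountVectorizer

class StemmedCountVectorizer(CountVectorizer):
    def __init__(self, stemmer=None, **kwargs):
        super().__init__(**kwargs)
        self.stemmer = stemmer

    def build_analyzer(self):
        analyze = super().build_analyzer()
        if self.stemmer is None:
            return analyze
        return lambda doc: [self.stemmer.stem(tok) for tok in analyze(doc)]

    def get_params(self, deep=True):
        params = super().get_params(deep)
        # Same hack as above: re-add the CountVectorizer parameters.
        cp = copy.copy(self)
        cp.__class__ = CountVectorizer
        params.update(CountVectorizer.get_params(cp, deep))
        return params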
def Common_Vectorizer_usage():
    from sklearn.feature_extraction.text import CountVectorizer
    vectorizer = CountVectorizer(min_df=1)

    corpus = [
        'This is the first document.',
        'This is the second second document.',
        'And the third one.',
        'Is this the first document?',
    ]
    analyze = vectorizer.build_analyzer()
    print(analyze("This is a text document to analyze."))
    print(analyze("This is a text document to analyze.") == ['this', 'is', 'text', 'document', 'to', 'analyze'])

    X = vectorizer.fit_transform(corpus)
    print(vectorizer.get_feature_names())
    print(vectorizer.vocabulary_)  # .get('document')
    print(vectorizer.transform(['Something completely new.']).toarray())
    print(list(X))

    # bigram ========================================================
    bigram_vectorizer = CountVectorizer(ngram_range=(1, 2), token_pattern=r'\b\w+\b', min_df=1)
    analyze = bigram_vectorizer.build_analyzer()
    print(analyze('Bi-grams are cool!'))
    X_2 = bigram_vectorizer.fit_transform(corpus).toarray()
    print(X_2)
    feature_index = bigram_vectorizer.vocabulary_.get('is this')
    print(X_2[:, feature_index])

    # marui test
    print('\n\nmarui test=====================')

    def t_preprocessor(s):
        return ','.join([x.lower() for x in s.split(' ')])

    stop_words1 = ['is', 'a', 'this']           # is ok: frozenset(['a', 'this', 'is'])
    stop_words2 = {'is': 0, 'a': 1, 'this': 2}  # is ok: converted to frozenset(['a', 'this', 'is'])

    cv = CountVectorizer(preprocessor=t_preprocessor, stop_words=stop_words2)
    params = cv.get_params()
    print('get_params()', type(params), '---------------')
    for k in params:
        print(k, '\t', params[k])
    print('get_params end--------------')

    print('\nget_stop_words=', cv.get_stop_words())
    cv.fit(corpus)
    print(cv.get_feature_names())
    print(cv.transform(corpus).toarray())

    # test the preprocessor and tokenizer
    print('\ntest preprocessor, result:\t', cv.build_preprocessor()('this is a document'))
    print('\ntest tokenizer, result', cv.build_tokenizer()('this is a document'))
    print('\ntest tokenizer2, result', cv.build_tokenizer()('th-is is a document'))
    print('\ntest tokenizer2, result', cv.build_tokenizer()('th_is is a document'))
    print('\ntest tokenizer2, result', cv.build_tokenizer()('th&is is a document'))
def LDA(docs_raw, n_topics):
    tf_vectorizer = CountVectorizer(strip_accents='unicode',
                                    stop_words=en_stop,
                                    # keep whole-word tokens; special words are removed via en_stop
                                    token_pattern=r'\b\w+\b')
    dtm_tf = tf_vectorizer.fit_transform(docs_raw)

    tfidf_vectorizer = TfidfVectorizer(**tf_vectorizer.get_params())
    dtm_tfidf = tfidf_vectorizer.fit_transform(docs_raw)

    # for TF DTM
    lda_tf = LatentDirichletAllocation(n_components=n_topics, random_state=0)
    %time lda_tf.fit(dtm_tf)  # IPython %time magic: run inside a notebook

    # for TF-IDF DTM
    lda_tfidf = LatentDirichletAllocation(n_components=n_topics, random_state=0)
    %time lda_tfidf.fit(dtm_tfidf)

    return lda_tf, dtm_tf, tf_vectorizer
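# Hedged usage sketch (my addition): the tuple returned by LDA() lines up with
# what pyLDAvis.sklearn.prepare expects, as the other snippets in this collection
# use it.  `docs` and `en_stop` are assumed to be defined elsewhere.
import pyLDAvis
import pyLDAvis.sklearn

lda_tf, dtm_tf, tf_vectorizer = LDA(docs, n_topics=10)
vis = pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer)
pyLDAvis.save_html(vis, 'lda_tf.html')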
def main2(params):
    df = pd.read_csv(join(params.input_folder,
                          'tokenized1/084_update_quality_minmax_sizeFixture.cs.tree-viewer.txt'),
                     header=None)
    df = df[df[0].notnull()]
    df = df.applymap(filter_type)

    matrix = CountVectorizer(max_features=10)
    X = matrix.fit_transform(df[0]).toarray()
    print(matrix.vocabulary_)
    print(matrix.get_params())

    df[0].iloc[0:10].str.cat(sep=' ')

    starters = df.loc[df[0] == "BEGIN_METHOD"]
    enders = df.loc[df[0] == "END_METHOD"]
    zipped = list(zip(starters.index, enders.index))
    functions_list = []
    for begin, end in zipped:
        functions_list.append(df[0].iloc[begin:end + 1].str.cat(sep=' '))
def model(df):
    # hasher = HashingVectorizer(n_features=100, analyzer='word', stop_words='english',
    #                            alternate_sign=False, norm=None)
    hasher = CountVectorizer(analyzer='word', stop_words='english')
    vectorizer = make_pipeline(hasher, TfidfTransformer(use_idf=False))
    X = vectorizer.fit_transform(df['links'])
    print(hasher.get_params())

    normalizer = Normalizer(copy=False)
    svd = TruncatedSVD(n_components=12)
    lsa = make_pipeline(svd, normalizer)
    Y = lsa.fit_transform(X)

    km = KMeans(n_clusters=4, init='k-means++', max_iter=1000, n_init=1)
    Z = km.fit_predict(Y)

    df['labels'] = Z
    df['first'] = Y[:, 0]
    df['second'] = Y[:, 1]
    df['third'] = Y[:, 2]
    result = df[['outlet', 'total', 'labels', 'first', 'second', 'third']]
    return result
def gen_document_term_matrices(args, data):
    """Generates document-term matrices."""
    # http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
    tf_vectorizer = CountVectorizer(stop_words="english",
                                    max_features=args.num_features,
                                    max_df=0.95,
                                    min_df=2)
    dtm_tf = tf_vectorizer.fit_transform(data)
    with open("%s/dtm_tf.pkl" % args.output_dir, "wb") as dtm_file:
        pickle.dump(tf_vectorizer, dtm_file)
        pickle.dump(dtm_tf, dtm_file)

    # http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
    tfidf_vectorizer = TfidfVectorizer(**tf_vectorizer.get_params())
    dtm_tfidf = tfidf_vectorizer.fit_transform(data)
    with open("%s/dtm_tfidf.pkl" % args.output_dir, "wb") as dtm_file:
        pickle.dump(tfidf_vectorizer, dtm_file)
        pickle.dump(dtm_tfidf, dtm_file)

    return tf_vectorizer, dtm_tf, dtm_tfidf
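# Hedged sketch (my addition, not part of the original snippet): the two pickles
# above are written sequentially to one file, so they must be read back in the
# same order.  The path is illustrative.
import pickle

with open("output/dtm_tf.pkl", "rb") as dtm_file:
    tf_vectorizer = pickle.load(dtm_file)  # first object dumped
    dtm_tf = pickle.load(dtm_file)         # second object dumped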
def topic_modelling(data):
    abstracts = []
    for abstract in data:
        # Remove punctuation
        abstract = re.sub(r'[,\.!?]', '', abstract)
        # Remove numbers
        abstract = re.sub(r'[0-9]', '', abstract)
        # Convert the abstracts to lowercase
        abstract = abstract.lower()
        abstracts.append(abstract)

    # Split abstracts into snippets of roughly 256 characters, breaking at sentence ends
    snippets = []
    for abstract in abstracts:
        if abstract != "abstract not available":
            length = len(abstract)
            index = 0
            n = 256
            while index < length:
                i = abstract.rfind(". ", index, index + n)
                if i == -1 or i == index:
                    i = index + n
                text = abstract[index:i + 2]
                index = i + 2
                snippets.append(text)

    # Creating LDA
    # number_topics = 5
    tf_vectorizer = CountVectorizer(stop_words='english')
    tfidf_vectorizer = TfidfVectorizer(**tf_vectorizer.get_params())
    dtm_tfidf = tfidf_vectorizer.fit_transform(snippets)
    lda_tfidf = LDA(random_state=0)
    lda_tfidf.fit(dtm_tfidf)

    # Visualizing LDA
    data = pyLDAvis.sklearn.prepare(lda_tfidf, dtm_tfidf, tfidf_vectorizer, mds='mmds')
    html = pyLDAvis.prepared_data_to_html(data, template_type="simple")
    return html
from sklearn.feature_extraction.text import CountVectorizer
import itertools
import numpy as np

data = ['She did not cheat on the test, for it was not the right thing to do.',
        'I think I will buy the red car, or I will lease the blue one.',
        'I really want to go to work, but I am too sick to drive.',
        'I am counting my calories, yet I really want dessert.']

count_vect = CountVectorizer()
cx = count_vect.fit_transform(data)

# invert the vocabulary: column index -> word
vocab = {}
for key, value in count_vect.vocabulary_.items():
    vocab[value] = key
# print(vocab)

# walk the CSR matrix row by row and print every non-zero count
start = 0
for i, end in enumerate(cx.indptr[1:]):
    for j, val in zip(cx.indices[start:end], cx.data[start:end]):
        print("(" + str(i) + "," + str(j) + "): " + str(val) + " => '" + vocab[j] + "'")
        # print(vocab[j] + " ", end="")
    print("")
    start = end

print(count_vect.get_params())
# for i, j, v in zip(cx.row, cx.col, cx.data):
#     print(i, j, v)
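# Hedged note (my addition): the commented-out loop above only works on a COO
# matrix, while fit_transform returns CSR; converting first makes it runnable.
cx_coo = cx.tocoo()
for i, j, v in zip(cx_coo.row, cx_coo.col, cx_coo.data):
    print(i, j, v)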
for file in files:  # Topic modeling that reads in utterances
    df = pd.read_csv(file)
    utterance_temp.append(df['stringList'].tolist())

utterance_raw = [item for sublist in utterance_temp for item in sublist]

tf_vectorizer = CountVectorizer(strip_accents='unicode',
                                stop_words='english',
                                lowercase=True,
                                token_pattern=r'\b[a-zA-Z]{3,}\b',
                                max_df=0.2,
                                min_df=0)
dtm_tf = tf_vectorizer.fit_transform(utterance_raw)

tfidf_vectorizer = TfidfVectorizer(**tf_vectorizer.get_params())
dtm_tfidf = tfidf_vectorizer.fit_transform(utterance_raw)

# for TF DTM (n_components was called n_topics in scikit-learn < 0.19)
lda_tf = LatentDirichletAllocation(n_components=30, random_state=0)
lda_tf.fit(dtm_tf)

# for TFIDF DTM
lda_tfidf = LatentDirichletAllocation(n_components=30, random_state=0)
lda_tfidf.fit(dtm_tfidf)

nmf_tf = NMF(n_components=80, random_state=1, alpha=.1, l1_ratio=.5).fit(dtm_tf)
# nmf_tfidf = NMF(n_components=10, random_state=1,
#                 beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
#                 l1_ratio=.5).fit(dtm_tfidf)
matrix.vocabulary_

# In[93]:

sizes = sorted(np.asarray(cv_fit.sum(axis=0))[0], reverse=True)
print(sizes)

# In[94]:

values = list(matrix.vocabulary_.keys())
values

# In[96]:

matrix.get_params()

# #### How to combine multiple rows into a single row with pandas

# In[97]:

df[0].iloc[0:10].str.cat(sep=' ')

# In[ ]:

get_ipython().run_line_magic('pinfo', 'matrix')  # i.e. `matrix?` in the notebook

# #### separate functions from each other

# In[98]:
class NewsBias:
    def __init__(self):
        self.tf_vectorizer = []
        self.tf = []
        self.lda_model = []
        self.feature_names = []
        self.topics_mat = []
        self.sentiment_by_topic = []

    def fix_sites(self, mongo_db):
        fix_cnn(mongo_db)
        fix_huffpo(mongo_db)

    def from_mongo(self, db_name):
        df = get_df(db_name)
        df = clean_df(df)
        df = df[pd.notnull(df['processed_text'])]
        df = df[df['processed_text'] != '']
        return df

    def from_csv(self, csv_name):
        try:
            df = pd.read_csv('data/' + csv_name, parse_dates=False)
            return df
        except:
            print('CSV file does not exist!')
            print('Make sure CSV file is in data folder.')
            return False

    def to_csv(self, df, filename):
        filename = 'data/' + filename
        df.to_csv(filename, index=False)
        print('CSV file saved to: ' + filename)

    def update_from_bucket(self, filename):
        path = os.getcwd()
        # Example filename: 'dsiprojectdata/rss_feeds_new.tar'
        result = from_bucket(filename, path)
        if not result:
            print('Error updating data from bucket!')
            print('Make sure you include folder and file in filename from bucket.')

    def update_to_bucket(self, filename, bucketname, mongo_db=False):
        # If mongo database then just give database name as filename
        if mongo_db:
            cwd = os.getcwd()
            # Give permission to bash file then run
            p1 = subprocess.Popen(['chmod', '+x', 'backup.sh'],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
            out1, err1 = p1.communicate()
            p2 = subprocess.Popen([cwd + '/backup.sh', filename],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
            out2, err2 = p2.communicate()
        else:
            p = subprocess.Popen(['/usr/bin/aws', 's3', 'cp', filename,
                                  's3://' + bucketname + '/'],
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE)
            out, err = p.communicate()

    def run_lda(self, df, max_features=1000, n_topics=20):
        df = df[pd.notnull(df['processed_text'])]
        processed_text = df['processed_text'].values.tolist()
        # Include quotes and tweets in the LDA input
        processed_quote = df['processed_quote'].values.tolist()
        processed_tweet = df['processed_tweet'].values.tolist()
        processed_all = []
        for text, quote, tweet in zip(processed_text, processed_quote, processed_tweet):
            # Check if quote or tweet is nan
            if type(quote) == float:
                quote = ''
            if type(tweet) == float:
                tweet = ''
            processed_all.append(text + quote + tweet)
        try:
            self.tf_vectorizer = CountVectorizer(max_df=0.95,
                                                 min_df=0.05,
                                                 max_features=max_features,
                                                 stop_words='english')
            self.tf = self.tf_vectorizer.fit_transform(processed_all)
        except:
            import pdb
            pdb.set_trace()
        # n_components was called n_topics in scikit-learn < 0.19
        self.lda_model = LatentDirichletAllocation(n_components=n_topics,
                                                   max_iter=5,
                                                   learning_method='online',
                                                   learning_offset=50.,
                                                   random_state=0,
                                                   n_jobs=-1)
        self.lda_model.fit(self.tf)
        self.feature_names = np.array(self.tf_vectorizer.get_feature_names())
        self.topics_mat = self.lda_model.components_
        return self.lda_model

    def run_gensim_lda(self, df, n_topics=20):
        self.lda_model = gensim_lda(df, n_topics)

    def get_top_word_by_topic(self, topic, n_words):
        return self.feature_names[np.argsort(self.topics_mat[topic, :])[::-1]][:n_words]

    def visualize_lda(self, df, display=False):
        if self.lda_model == []:
            self.run_lda(df)
        max_features = self.tf_vectorizer.get_params()['max_features']
        n_topics = self.lda_model.get_params()['n_components']
        vis_data = pyLDAvis.sklearn.prepare(self.lda_model,
                                            self.tf,
                                            self.tf_vectorizer,
                                            R=n_topics,
                                            n_jobs=-1)
        pyLDAvis.save_html(vis_data,
                           'plots/pyLDAvis_' + str(max_features) + 'feats_' +
                           str(n_topics) + 'topics.html')
        if display:
            pyLDAvis.show(vis_data)

    def get_sentiment_of_words(self, df):
        sentiment_of_words = sentiment_of_words_wordnet(df)
        return sentiment_of_words

    def get_sentiment_by_topic(self, df, display=False):
        n_topics = self.lda_model.get_params()['n_components']
        self.sentiment_by_topic = sentiment_by_topic_wordnet(df, self.topics_mat,
                                                             self.feature_names)
        if display:
            for i, site in enumerate(self.sentiment_by_topic.keys()):
                plt.subplot(3, 4, i + 1)
                score = []
                for topic in range(n_topics):
                    score.append(self.sentiment_by_topic[site][topic][3])
                score = np.array(score)
                score /= sum(np.abs(score))
                plt.bar(np.arange(len(score)), score, align='center')
                plt.ylabel('Score')
                plt.title('Score by Topic for ' + site)
            plt.subplots_adjust(hspace=0.4, wspace=0.4)
            plt.show()
        return self.sentiment_by_topic

    def length_of_articles_hist(self, df):
        for i, site in enumerate(df['source'].unique()):
            plt.subplot(3, 4, i + 1)
            new_df = df[df['source'] == site]
            article_len = [len(article.split(' ')) for article in new_df['article_text']]
            plt.hist(article_len, density=True)
            plt.xlabel('Length of Article')
            plt.ylabel('# of Articles')
            plt.title('Length of articles for ' + site)
        plt.subplots_adjust(hspace=0.4, wspace=0.4)
        plt.show()

    def pickle_everything(self):
        filename = '../pickles/lda_model.pkl'
        pickle.dump(self.lda_model, open(filename, 'wb'), protocol=2)
        filename = '../pickles/tf_vectorizer.pkl'
        pickle.dump(self.tf_vectorizer, open(filename, 'wb'), protocol=2)
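# Hedged usage sketch (my addition): a minimal driver for the class above.
# 'rss_feeds_new.csv' is a hypothetical file name inside the data folder.
nb = NewsBias()
df = nb.from_csv('rss_feeds_new.csv')
nb.run_lda(df, max_features=1000, n_topics=20)
nb.visualize_lda(df, display=False)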
fid.close()
f2.close()
# i = i + 1
# print(i)

print(len(corpus))

vectorizer = CountVectorizer()
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
word = vectorizer.get_feature_names()
weight = tfidf.toarray()

vectorP = vectorizer.get_params()
tfidfP = transformer.get_params()
print('vectorP:', vectorP)
print('tfidfP:', tfidfP)

joblib.dump(vectorizer, "vectorizer" + str(sys.argv[1]) + ".m")
joblib.dump(transformer, "tfidf" + str(sys.argv[1]) + ".m")

resName = "BaiduTfidf_Result.txt"
result = codecs.open(resName, 'w', 'utf-8')
for j in range(len(word)):
    result.write(word[j] + ' ')
result.write('\r\n\r\n')
for i in range(len(weight)):
X = cv.fit_transform(corpus).toarray()
messages.columns
y = messages['Label_enc']

## Divide the dataset into Train and Test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

cv.get_feature_names()[:20]
cv.get_params()

from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()

from sklearn import metrics
import numpy as np
import itertools

classifier.fit(X_train, y_train)
pred = classifier.predict(X_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy: %0.3f" % score)
cm = metrics.confusion_matrix(y_test, pred)

import matplotlib.pyplot as plt
xtrain, xtest, ytrain, ytest = train_test_split(df['cleaned'], df.opinion, test_size=0.2)

count = CountVectorizer(ngram_range=(1, 2), analyzer="word", min_df=10, max_df=0.9)
# count = CountVectorizer(ngram_range=(1, 2), analyzer="word")
# count = CountVectorizer(lowercase=False)
temp = count.fit_transform(xtrain)
print(count.vocabulary_.__len__())
# print(count.get_feature_names())
print(count.get_params())

# each row holds the vector of one sentence
tdif = TfidfTransformer()
temp2 = tdif.fit_transform(temp)
print(temp2)

text_regression = LogisticRegression()
model = text_regression.fit(temp2, ytrain)

prediction_data = tdif.transform(count.transform(xtest))
# prediction_data = count.transform(xtest)
predicted = model.predict(prediction_data)
print(model.get_params())
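# Hedged follow-up (my addition): scoring the predictions from the snippet above.
from sklearn import metrics
print(metrics.accuracy_score(ytest, predicted))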
    message = [ps.stem(word) for word in message
               if word not in stopwords.words('english')]
    message = ' '.join(message)
    corpus.append(message)

print(corpus[0])

# now apply CountVectorizer to build the bag of words
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000, ngram_range=(1, 3))
x = cv.fit_transform(corpus).toarray()
print(cv.get_feature_names()[:20])  # check the top 20 feature names
print(cv.get_params())              # input parameters used and their values
"""vector formed in bag of words"""
# count_df = pd.DataFrame(x_train, columns = cv.get_feature_names())
# count_df.head()

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()

import numpy as np
from sklearn import metrics
def generate_vectors(train_url, test_url=None, column='article', trans_type=None,
                     max_n=1, min_df=1, max_df=1.0, max_features=1, sublinear_tf=True,
                     balanced=False, re_weight=0, verbose=False, drop_words=0):
    """ generate X, y, X_test vectors with csv(with header) url use pandas and CountVectorizer

    Args:
        train_url: url to train csv
        test_url: url to test csv, set to None if not need X_test
        column: column to use as feature
        trans_type: specific transformer, {'dc','idf'}
        max_n: max_n for ngram_range
        min_df: min_df for CountVectorizer
        max_df: max_df for CountVectorizer
        max_features: max_features for CountVectorizer
        sublinear_tf: sublinear_tf for default TfdcTransformer
        balanced: balanced for default TfdcTransformer, for idf transformer, it is use_idf
        re_weight: re_weight for TfdcTransformer
        verbose: True to show more information
        drop_words: randomly delete some words from sentences

    Returns:
        X, y, X_test
    """
    verbose and print("loading '%s' level data from %s with pandas" % (column, train_url))
    train_df = load_to_df(train_url)

    # vectorizer
    vec = CountVectorizer(ngram_range=(1, max_n), min_df=min_df, max_df=max_df,
                          max_features=max_features, token_pattern=r'\w+')
    s_time = time()
    verbose and print("finish loading, vectorizing")
    verbose and print("vectorizer params:", vec.get_params())
    sequences = train_df[column]

    # delete some words randomly
    for i, row in enumerate(sequences):
        if drop_words <= 0:
            break
        if np.random.ranf() < drop_words:
            row = np.array(row.split())
            sequences.at[i] = ' '.join(row[np.random.ranf(row.shape) > 0.35])

    X = vec.fit_transform(sequences)
    e_time = time()
    verbose and print("finish vectorizing in %.3f seconds, transforming" % (e_time - s_time))

    # transformer
    if trans_type is None or trans_type == 'idf':
        trans = TfidfTransformer(sublinear_tf=sublinear_tf, use_idf=balanced)
    else:
        trans = TfdcTransformer(sublinear_tf=sublinear_tf, balanced=balanced,
                                re_weight=re_weight)
    verbose and print("transformer params:", trans.get_params())

    y = np.array((train_df["class"]).astype(int))
    X = trans.fit_transform(X, y)

    X_test = None
    if test_url:
        verbose and print("transforming test set")
        test_df = load_to_df(test_url)
        X_test = vec.transform(test_df[column])
        X_test = trans.transform(X_test)

    s_time = time()
    verbose and print("finish transforming in %.3f seconds\n" % (s_time - e_time))
    return X, y, X_test
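# Hedged usage sketch (my addition): how the helper above might be called.
# "train_set.csv" and "test_set.csv" are placeholder paths.
X, y, X_test = generate_vectors("train_set.csv", "test_set.csv",
                                column='article', trans_type='idf',
                                max_n=2, min_df=3, max_df=0.9,
                                max_features=200000, verbose=True)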
def train_topic_model(wordcloud_path, number_topics, model_path, preprocessed_path,
                      clusterable_words, query_words, filename, out, use_tfidf,
                      expert_terms):
    outfile, outfile_pos, wordcloud_file, wordcloud_json, statistic_topics_json = \
        prepare_infrastructure.prepare_file_names_train(filename, out)
    stop_words_german, stop_words_english = prepare_stopwords(query_words)

    # Json file to store topics and their word distribution
    if os.path.isfile(wordcloud_path):
        os.remove(wordcloud_path)
    file = open(wordcloud_path, 'a', encoding='utf8')
    var1 = {'name': 'topics', 'children': []}
    place_holder_list = var1.get('children')
    infile = clusterable_words

    # Remove stopwords from the clusterable words
    fin = open(infile, 'r', encoding='utf8')
    fout = open(outfile, "w+", encoding='utf8')
    for line in fin.readlines():
        for word in line.split():
            if word not in stop_words_german:
                fout.write(word + ' ')
        fout.write('\n')
    fin.close()
    fout.close()

    # Learn the vocabulary dictionary and return term-document matrix (BOWs)
    new_data, stem_lemma_dict = stem_clusterable_words(outfile)
    empty_lines = []
    new_data_sklearn = []  # replace by sklearn
    for i, n in enumerate(new_data):
        if new_data[i] == '':
            empty_lines.append(i)
        else:
            new_data_sklearn.append(n.split(' '))
    with open('output/new_data_gensim.sav', 'wb') as f:
        pickle.dump(new_data_sklearn, f)

    topic_input = open('output/topic_input.txt', "w+", encoding='utf8')
    for k in new_data:
        topic_input.write(k + '\n')
    topic_input.close()

    if use_tfidf is False:
        print("INFO: no tfidf in use")
        tf_vectorizer = CountVectorizer(strip_accents='unicode',
                                        stop_words='english',
                                        lowercase=True,
                                        token_pattern=r'\b[a-zA-Z]{3,}\b',
                                        max_df=1.0,
                                        min_df=3)
        dtm_tf = tf_vectorizer.fit_transform(new_data)
        vocab = tf_vectorizer.get_feature_names()
        # print(dtm_tf.shape)
        # n_components was called n_topics in scikit-learn < 0.19
        lda_model = LatentDirichletAllocation(n_components=number_topics,
                                              random_state=0,
                                              max_iter=50,
                                              max_doc_update_iter=500)
        lda_model.fit(dtm_tf)
    else:
        print("INFO: tfidf in use")
        tf_vectorizer_init = CountVectorizer(strip_accents='unicode',
                                             stop_words='english',
                                             lowercase=True,
                                             token_pattern=r'\b[a-zA-Z]{3,}\b',
                                             max_df=1.0,
                                             min_df=3)
        tf_vectorizer = TfidfVectorizer(**tf_vectorizer_init.get_params())
        dtm_tf = tf_vectorizer.fit_transform(new_data)
        vocab = tf_vectorizer.get_feature_names()
        # print(dtm_tf.shape)
        lda_model = LatentDirichletAllocation(n_components=number_topics,
                                              random_state=0,
                                              max_iter=50,
                                              max_doc_update_iter=500)
        lda_model.fit(dtm_tf)

    pickle.dump(lda_model, open(model_path, 'wb'))
    model = pickle.load(open(model_path, 'rb'))

    prepare_topic_distribution(model, place_holder_list, stem_lemma_dict, vocab)
    save_word_cloud_json(var1, file)
    display_word_cloud(number_topics, wordcloud_file, wordcloud_json)
    save_train_topic_to_json(model, dtm_tf, preprocessed_path, statistic_topics_json,
                             expert_terms, wordcloud_path)

    vis = pyLDAvis.sklearn.prepare(lda_model, dtm_tf, tf_vectorizer, sort_topics=False)
    pyLDAvis.save_html(vis, 'output/LDA_Visualization_sklearn.html')
                                                    test_size=0.33,
                                                    random_state=25)

# In[25]:

y_test.shape

# In[26]:

cv.get_feature_names()[:20]  # Top 20 feature names for this data set
                             # (shows 2-word and 3-word n-grams together)

# In[27]:

cv.get_params()  # gives details of the count vectorizer applied

# In[28]:

# Data set after applying the count vectorizer
df_count = pd.DataFrame(X, columns=cv.get_feature_names())
df_count.head()

# In[29]:

# Applying the Multinomial NB algorithm
from sklearn.naive_bayes import MultinomialNB
mn = MultinomialNB()
# define steps in the pipeline

# (1) define parameters of count vectorizer
vec = CountVectorizer(analyzer="word",
                      stop_words='english',
                      ngram_range=(1, 2),
                      # preprocessor=None,
                      tokenizer=word_tokenize,
                      max_features=10000)
# inspect:
vec.get_stop_words()
vec.get_feature_names()[:10]  # first 10 features (unigrams and bigrams); requires a fitted vectorizer
vec.get_params()

# (2) classifier
lr = LogisticRegression()
# inspect:
lr.get_params()

# define a scikit-learn pipeline
pipe_bigram_lr_clf = Pipeline([('vectorizer', vec), ('classifier', lr)])
# inspect: steps
pipe_bigram_lr_clf.named_steps

### Fit transformer/classifier Pipeline to train data (X_train and y_train) ----
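# Hedged sketch (my addition, continuing the section heading above): fitting the
# pipeline and scoring it.  X_train/X_test/y_train/y_test are assumed to come
# from an earlier train_test_split on the raw text.
pipe_bigram_lr_clf.fit(X_train, y_train)
y_pred = pipe_bigram_lr_clf.predict(X_test)
print(pipe_bigram_lr_clf.score(X_test, y_test))  # mean accuracy on the held-out split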
def generate_vectors(train_url, test_url=None, column='article', trans_type=None,
                     max_n=1, min_df=1, max_df=1.0, max_features=1, sublinear_tf=True,
                     balanced=False, re_weight=0, verbose=False, drop_words=0,
                     multilabel_out=False, label_col='subjects', only_single=True,
                     shuffle=True, apply_fun=None):
    """ generate X, y, X_test vectors with csv(with header) url use pandas and CountVectorizer

    Args:
        train_url: url to train csv
        test_url: url to test csv, set to None if not need X_test
        column: column to use as feature
        trans_type: specific transformer, {'dc','idf','hashing'}
        max_n: max_n for ngram_range
        min_df: min_df for CountVectorizer
        max_df: max_df for CountVectorizer
        max_features: max_features for CountVectorizer
        sublinear_tf: sublinear_tf for default TfdcTransformer
        balanced: balanced for default TfdcTransformer, for idf transformer, it is use_idf
        re_weight: re_weight for TfdcTransformer
        verbose: True to show more information
        drop_words: randomly delete some words from sentences
        multilabel_out: return y as multilabel format
        label_col: col name of label
        only_single: only keep records of single label
        shuffle: re sample train data
        apply_fun: callable to be applied on label column

    Returns:
        X, y, X_test
    """
    verbose and print("loading '%s' level data from %s with pandas" % (column, train_url))
    train_df = pd.read_csv(train_url)
    if shuffle:
        train_df = train_df.sample(frac=1)
    if only_single:
        train_df = train_df[train_df['subjects'].apply(lambda x: len(x) < 2)]

    # vectorizer
    s_time = time()
    analyzer = 'word' if column == 'word_seg' else 'char'
    vec = CountVectorizer(analyzer=analyzer, ngram_range=(1, max_n), min_df=min_df,
                          max_df=max_df, max_features=max_features, token_pattern=r'\w+')
    verbose and print("finish loading, vectorizing")
    verbose and print("vectorizer params:", vec.get_params())
    sequences = train_df[column]

    # delete some words randomly
    for i, row in enumerate(sequences):
        if drop_words <= 0:
            break
        if np.random.ranf() < drop_words:
            row = np.array(row.split())
            sequences.at[i] = ' '.join(row[np.random.ranf(row.shape) > 0.35])

    X = sequences if trans_type == 'hashing' else vec.fit_transform(sequences)
    e_time = time()
    verbose and print("finish vectorizing in %.3f seconds, transforming" % (e_time - s_time))

    # transformer
    if trans_type is None or trans_type == 'idf':
        trans = TfidfTransformer(sublinear_tf=sublinear_tf, use_idf=balanced)
    elif trans_type == 'dc':
        trans = TfdcTransformer(sublinear_tf=sublinear_tf, balanced=balanced,
                                re_weight=re_weight)
    else:
        trans = HashingVectorizer(analyzer=analyzer, ngram_range=(1, max_n),
                                  n_features=max_features, token_pattern=r'\w+',
                                  binary=not balanced)
    verbose and print(trans_type, "transformer params:", trans.get_params())

    if multilabel_out:
        mlb = MultiLabelBinarizer()
        y = mlb.fit_transform(train_df[label_col].apply(str.split))
        verbose and print("multilabel columns:\n", mlb.classes_)
    else:
        y = train_df[label_col].apply(apply_fun).values if apply_fun is not None \
            else train_df[label_col].values

    X = trans.fit_transform(X, y)

    X_test = None
    if test_url:
        verbose and print("transforming test set")
        test_df = pd.read_csv(test_url)
        X_test = test_df[column] if trans_type == 'hashing' else vec.transform(test_df[column])
        X_test = trans.transform(X_test)

    s_time = time()
    verbose and print("finish transforming in %.3f seconds\n" % (s_time - e_time))
    return X, y, X_test