rate_model_detection = MultinomialNB()
rate_model_detection.fit(X_train, y_train)
predictions = rate_model_detection.predict(X_test)

from sklearn.metrics import classification_report, confusion_matrix

print(confusion_matrix(y_test, predictions))
print('\n')
print(classification_report(y_test, predictions))

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

pipeline = Pipeline([('cv', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('naive', MultinomialNB())])

X = yelp_class['text']
y = yelp_class['stars']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

pipeline.fit(X_train, y_train)
predictions = pipeline.predict(X_test)
        feature_vals.append(feature_names[idx])

    # create a dict mapping each feature to its score
    # results = zip(feature_vals, score_vals)
    results = {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]] = score_vals[idx]

    return results


docs = des.tolist()

cv = CountVectorizer(max_df=0.85, max_features=10000)
word_count_vector = cv.fit_transform(docs)

tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(word_count_vector)

feature_names = cv.get_feature_names()

docs = " ".join(docs)
docs = remove_html(docs)
docs = docs.replace('br', '')

tf_idf_vector = tfidf_transformer.transform(cv.transform([docs]))
sorted_items = sort_coo(tf_idf_vector.tocoo())
keywords = extract_topn_from_vector(feature_names, sorted_items, 10)

plt.bar(*zip(*keywords.items()))
plt.xticks(rotation=60)
plt.savefig("des_keywords.jpg")
plt.show()
traind.head(10)
traind.shape

dbpedia_df = traind
X = dbpedia_df['sentence']
Y = dbpedia_df['condition']

count_vectorizer = CountVectorizer(min_df=0, max_df=80, ngram_range=(2, 2))
feature_vector = count_vectorizer.fit_transform(X)
feature_vector.shape

tfidf_transformer = TfidfTransformer()
feature_vector = tfidf_transformer.fit_transform(feature_vector)
feature_vector.shape

X_dense = feature_vector.todense()
X_dense.shape

x_train, x_test, y_train, y_test = train_test_split(X_dense, Y, test_size=0.2)

import torch
import numpy as np

Xtrain_ = torch.from_numpy(x_train).float()
Xtest_ = torch.from_numpy(x_test).float()
from scipy.sparse.linalg import svds

movie_dataframe['parse'] = movie_dataframe['content'].apply(st.whitespace_nlp_with_sentences)

# Build the corpus matrix
corpus = (st.CorpusFromParsedDocuments(movie_dataframe,
                                       category_col='review',
                                       parsed_col='parse')
          .build()
          .get_stoplisted_unigram_corpus())
corpus = corpus.add_doc_names_as_metadata(corpus.get_df()['author'])

# Compute a truncated SVD of the tf-idf weighted term-document matrix
embeddings = TfidfTransformer().fit_transform(corpus.get_term_doc_mat())
u, s, vt = svds(embeddings, k=1000, maxiter=20000, which='LM')
projection = pd.DataFrame({'term': corpus.get_metadata(),
                           'x': u.T[0],
                           'y': u.T[1]}).set_index('term')

# Plot 2
category = 'positive'
scores = (corpus.get_category_ids() == corpus.get_categories().index(category)).astype(int)
html = st.produce_pca_explorer(corpus,
                               category=category,
                               category_name='positive',
                               not_category_name='negative',
                               metadata=movie_dataframe['author'],
                               width_in_pixels=1000,
                               show_axes=False,
                               use_non_text_features=True,
if __name__ == '__main__':
    args = parseArguments()

    if args["training_datafile"] is not None:
        print("-------Training-------")
        original_training_df = getDataFrame(args["training_datafile"])
        clean_training_df = cleanDataFrame(original_training_df.copy())

        print("splitting data into training and validation set")
        training_set, validation_set = train_test_split(clean_training_df, test_size=0.3)
        print(training_set.shape)
        print(validation_set.shape)

        pipeline = Pipeline([
            ('count_vectorizer', CountVectorizer(ngram_range=(1, 2))),
            ('tfidf', TfidfTransformer()),
            ('classifier', BernoulliNB(binarize=0.0))
        ])

        model = trainModel(training_set, pipeline)

        print("----Validation and Classification Report----")
        predicted = predictModel(validation_set, model)
        target_classifications = list(set(validation_set["classification"]))
        print(classification_report(validation_set["classification"], predicted,
                                    target_names=target_classifications))

        print("----Saving Model----")
        if args["modelpath"] is not None:
            saveModel(model, args["modelpath"])
        else:
            saveModel(model)

    if args["modelpath"] is not None and args["test_datafile"] is not None:
def test_tfidf_transformer_type(X_dtype):
    X = sparse.rand(10, 20000, dtype=X_dtype, random_state=42)
    X_trans = TfidfTransformer().fit_transform(X)
    assert X_trans.dtype == X.dtype
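This kind of dtype check is normally driven by a pytest parametrization; the decorator below is a sketch of how it might be wired up, and the exact dtype list is an assumption rather than something stated in the snippet.

import numpy as np
import pytest
from scipy import sparse
from sklearn.feature_extraction.text import TfidfTransformer

@pytest.mark.parametrize("X_dtype", [np.float64, np.float32])  # assumed dtype list
def test_tfidf_transformer_dtype_sketch(X_dtype):
    # TfidfTransformer is expected to preserve the floating-point dtype of its sparse input
    X = sparse.rand(10, 20000, dtype=X_dtype, random_state=42)
    assert TfidfTransformer().fit_transform(X).dtype == X_dtype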
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Jul 17 17:17:16 2018

@author: Junbin Gao
All Copyright
"""

from sklearn.feature_extraction.text import TfidfTransformer

# Instantiate the transformer
transformer = TfidfTransformer(smooth_idf=False)

# Check what it is
transformer

# Corpus with three different terms and their counts in a corpus of 6 documents
counts = [[3, 0, 1],
          [2, 0, 0],
          [3, 0, 0],
          [4, 0, 0],
          [3, 2, 0],
          [3, 0, 2]]

# Transform the corpus
tfidf = transformer.fit_transform(counts)

# This is the transformed feature matrix for the 6 documents.
# It can be piped into a machine learning algorithm.
# Each row is normalized to have unit Euclidean norm:
X = tfidf.toarray()
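A quick follow-up check on the snippet above (a minimal sketch; the numpy import is an addition): the fitted transformer exposes the learned idf weights as `idf_`, and because the default norm is 'l2' every row of `X` should come out with unit Euclidean length.

import numpy as np

print(transformer.idf_)           # one idf value per term (three terms in this toy corpus)
print(np.linalg.norm(X, axis=1))  # each row norm should be 1.0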
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

numeric_transformer = Pipeline(
    steps=[('imputer', SimpleImputer(strategy='median')),
           ('scaler', StandardScaler())])

categorical_transformer = Pipeline(
    steps=[('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
           ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# https://www.oreilly.com/library/view/applied-text-analysis/9781491963036/ch04.html
text_transformer = Pipeline(
    steps=[('bow', CountVectorizer(analyzer=process_text)),
           ('tfidf', TfidfTransformer())])


def get_column_transformer_preprocessor(numeric_features, categorical_features, text_features):
    transformers = []
    if len(numeric_features) > 0:
        transformers.append(('num', numeric_transformer, numeric_features))
    if len(categorical_features) > 0:
        transformers.append(('cat', categorical_transformer, categorical_features))
    for x in text_features:
        transformers.append(('txt_' + str(x), text_transformer, x))
from sklearn.semi_supervised import LabelSpreading
from sklearn.metrics import f1_score

data = fetch_20newsgroups(subset="train", categories=None)
print("%d documents" % len(data.filenames))
print("%d categories" % len(data.target_names))
print()

# Parameters
sdg_params = dict(alpha=1e-5, penalty="l2", loss="log")
vectorizer_params = dict(ngram_range=(1, 2), min_df=5, max_df=0.8)

# Supervised Pipeline
pipeline = Pipeline([
    ("vect", CountVectorizer(**vectorizer_params)),
    ("tfidf", TfidfTransformer()),
    ("clf", SGDClassifier(**sdg_params)),
])
# SelfTraining Pipeline
st_pipeline = Pipeline([
    ("vect", CountVectorizer(**vectorizer_params)),
    ("tfidf", TfidfTransformer()),
    ("clf", SelfTrainingClassifier(SGDClassifier(**sdg_params), verbose=True)),
])
# LabelSpreading Pipeline
ls_pipeline = Pipeline([
    ("vect", CountVectorizer(**vectorizer_params)),
    ("tfidf", TfidfTransformer()),
    # LabelSpreading does not support sparse matrices
    ("todense", FunctionTransformer(lambda x: x.todense())),
    ("clf", LabelSpreading()),
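A minimal sketch of how the self-training pipeline above is typically fit. Two assumptions: roughly 80% of the labels are hidden, and unlabeled samples are marked with -1, which is the convention SelfTrainingClassifier expects.

import numpy as np

rng = np.random.RandomState(42)
y_masked = np.copy(data.target)
y_masked[rng.rand(len(y_masked)) > 0.2] = -1      # hide ~80% of the labels
st_pipeline.fit(data.data, y_masked)
predicted = st_pipeline.predict(data.data)
print("micro-F1 on all documents: %.3f" % f1_score(data.target, predicted, average="micro"))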
def main():
    analogies_to_try = (
        ('king', 'man', 'woman'),
        ('france', 'paris', 'london'),
        ('france', 'paris', 'rome'),
        ('paris', 'france', 'italy'),
        # ('city', 'state', 'german'),
    )

    ### choose a data source ###
    # sentences, word2idx = get_sentences_with_word2idx_limit_vocab(n_vocab=1500)
    sentences, word2idx = get_wikipedia_data(n_files=3, n_vocab=2000, by_paragraph=True)
    # sentences, word2idx = get_wikipedia_data(n_files=5, n_vocab=2000, by_paragraph=True)

    # with open('tfidf_word2idx.json', 'w') as f:
    #     json.dump(word2idx, f)

    notfound = False
    for word_list in analogies_to_try:
        for w in word_list:
            if w not in word2idx:
                print("%s not found in vocab, remove it from analogies to try or increase vocab size" % w)
                notfound = True
    if notfound:
        exit()

    # build term-document matrix
    V = len(word2idx)
    N = len(sentences)

    # create raw counts first
    A = np.zeros((V, N))
    j = 0
    for sentence in sentences:
        for i in sentence:
            A[i, j] += 1
        j += 1
    print("finished getting raw counts")

    transformer = TfidfTransformer()
    A = transformer.fit_transform(A.T).T

    # tsne requires a dense array
    A = A.toarray()

    # map back to word in plot
    idx2word = {v: k for k, v in iteritems(word2idx)}

    # plot the data in 2-D
    tsne = TSNE()
    Z = tsne.fit_transform(A)
    plt.scatter(Z[:, 0], Z[:, 1])
    for i in range(V):
        try:
            plt.annotate(idx2word[i].encode("utf8").decode("utf8"), xy=(Z[i, 0], Z[i, 1]))
        except Exception:
            print("bad string:", idx2word[i])
    plt.draw()

    ### multiple ways to create vectors for each word ###
    # 1) simply set it to the TF-IDF matrix
    # We = A

    # 2) create a higher-D word embedding
    tsne = TSNE(n_components=3)
    We = tsne.fit_transform(A)

    # 3) use a classic dimensionality reduction technique
    # svd = KernelPCA(n_components=20, kernel='rbf')
    # We = svd.fit_transform(A)

    for word_list in analogies_to_try:
        w1, w2, w3 = word_list
        find_analogies(w1, w2, w3, We, word2idx, idx2word)

    plt.show()  # pause script until plot is closed
processed_textdata1, class1 = loadtrainset(
    "C:/Users/Administrator/Desktop/dataset/train/hotel", "宾馆")    # "宾馆" = hotel
processed_textdata2, class2 = loadtrainset(
    "C:/Users/Administrator/Desktop/dataset/train/travel", "旅游")   # "旅游" = travel
processed_textdata3, class3 = loadtrainset(
    "C:/Users/Administrator/Desktop/dataset/train/travel", "旅游")

train_data = processed_textdata1 + processed_textdata2
classtags_list = class1 + class2

count_vector = CountVectorizer()
vector_matrix = count_vector.fit_transform(train_data)

# TF-IDF (term frequencies only, since use_idf=False)
train_tfidf = TfidfTransformer(use_idf=False).fit_transform(vector_matrix)

clf = MultinomialNB().fit(train_tfidf, classtags_list)

testset = []
path = "C:/Users/Administrator/Desktop/dataset/tt"
allfiles = os.listdir(path)
hotel = 0
travel = 0
for thisfile in allfiles:
    path_name = path + "/" + thisfile
    new_count_vector = count_vector.transform([preprocess(path_name)])
    new_tfidf = TfidfTransformer(use_idf=False).fit_transform(new_count_vector)
    predict_result = clf.predict(new_tfidf)
train_id_list = total_id_list[:train_size]
test_id_list = total_id_list[train_size:]
train_title_list = total_title_list[:train_size]
test_title_list = total_title_list[train_size:]
train_date_list = total_date_list[:train_size]
test_date_list = total_date_list[train_size:]
train_code_list = total_code_list[:train_size]
test_code_list = total_code_list[train_size:]

count_vec = CountVectorizer(min_df=1)
tf_train = count_vec.fit_transform(train_text_list)
tfidf_transformer = TfidfTransformer().fit(tf_train)
tfidf_train = tfidf_transformer.transform(tf_train)

tf_test = count_vec.transform(test_text_list)
tfidf_test = tfidf_transformer.transform(tf_test)

del AllData
del total_text_list
del total_label_list
del total_id_list
del total_title_list
del total_date_list
del total_code_list
del train_text_list
del test_text_list
del tf_train
def dat_prep(nbd_train, nbd_test, k, vect_type, Type_train, Type_test, Chr_train, Chr_test,
             Label_train, Label_test, scaled_feats_train, scaled_feats_test, dummy_train, dummy_test):
    # Derives the Count Vectorizer or TFIDF scores for a given neighborhood sequence
    """
    Arguments:
    nbd_train = Column containing the neighborhood sequence from the training data
    nbd_test = Column containing the neighborhood sequence from the test data
    k = size of kmer
    vect_type = 'CV' for Count Vectorizer, otherwise TFIDF Vectorizer
    Type_train = Numerically encoded substitution type ("A>T" encoded as 1, "G>C" encoded as 2, and so on) from the training data
    Type_test = Numerically encoded substitution type ("A>T" encoded as 1, "G>C" encoded as 2, and so on) from the test data
    Chr_train = Chromosome number from the training data
    Chr_test = Chromosome number from the test data
    Label_train = Binary label (training data), where 1 = Passenger and 2 = Driver
    Label_test = Binary label (test data), where 1 = Passenger and 2 = Driver
    scaled_feats_train = Scaled genomic features (conservation, amino acid etc.) for the training data
    scaled_feats_test = Scaled genomic features (conservation, amino acid etc.) for the test data
    dummy_train = One-hot encoding based feature matrix for the training data
    dummy_test = One-hot encoding based feature matrix for the test data

    Returns:
    df_comb_train = The complete dataframe (using training data) of TFIDF or CountVect scores plus other features such as chromosome number and substitution type
    df_comb_test = The complete dataframe (using test data) of TFIDF or CountVect scores plus other features such as chromosome number and substitution type
    count_vector_train = Just the TFIDF or CountVect features (training data), also known as the document-term matrix
    count_vector_test = Just the TFIDF or CountVect features (test data), also known as the document-term matrix
    cols = feature names
    vect = The vocabulary derived from the training data
    sc = The scaling variable derived from the training data
    """
    if vect_type == "CV":
        vect = Pipeline([('cv1', CountVectorizer(lowercase=False))])
    else:
        vect = Pipeline([('cv1', CountVectorizer(lowercase=False)),
                         ('tfidf_transformer', TfidfTransformer(smooth_idf=True, use_idf=True))])

    count_vector_train = vect.fit_transform(preprocess(nbd_train, k))
    count_vector_test = vect.transform(preprocess(nbd_test, k))

    df_train = pd.DataFrame(count_vector_train.todense(), columns=vect['cv1'].get_feature_names())
    df_test = pd.DataFrame(count_vector_test.todense(), columns=vect['cv1'].get_feature_names())

    sc = None  # only set when Min-Max scaling is applied (vect_type == "tf")
    if vect_type == "tf":
        sc = MinMaxScaler()
        # We have used fit_transform() here because we wanted to learn the vocabulary dictionary
        # and return the document-term matrix using the training data
        df_train = pd.DataFrame(sc.fit_transform(df_train), columns=df_train.columns)
        # We have used transform() here since we already have a pretrained vocabulary using which
        # we just wanted to derive the term-document matrix for the test data
        df_test = pd.DataFrame(sc.transform(df_test), columns=df_test.columns)

    df_train['Type'] = Type_train; df_test['Type'] = Type_test
    df_train['Label'] = Label_train; df_test['Label'] = Label_test
    df_train['Chr'] = Chr_train; df_test['Chr'] = Chr_test

    df_comb_train = pd.concat([df_train, scaled_feats_train, dummy_train], axis=1)
    df_comb_test = pd.concat([df_test, scaled_feats_test, dummy_test], axis=1)
    df_comb_train = df_comb_train.loc[:, ~df_comb_train.columns.duplicated()]
    df_comb_test = df_comb_test.loc[:, ~df_comb_test.columns.duplicated()]

    cols = vect['cv1'].get_feature_names()
    return df_comb_train, df_comb_test, count_vector_train, count_vector_test, cols, vect, sc
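A minimal, self-contained sketch of the two vectorizer options selected by `vect_type` above; the toy k-mer strings are placeholders for whatever `preprocess(nbd, k)` actually returns.

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

toy_kmers = ["ACG CGT GTA", "ACG ACG CGA"]   # assumed shape of the preprocessed k-mer "documents"

cv_pipe = Pipeline([('cv1', CountVectorizer(lowercase=False))])                      # vect_type == "CV"
tf_pipe = Pipeline([('cv1', CountVectorizer(lowercase=False)),
                    ('tfidf_transformer', TfidfTransformer(smooth_idf=True, use_idf=True))])  # TFIDF option

print(cv_pipe.fit_transform(toy_kmers).toarray())   # raw k-mer counts
print(tf_pipe.fit_transform(toy_kmers).toarray())   # tf-idf weighted, l2-normalized scores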
# we have a matrix of size (10240, 12196) by calling the function below
def get_countVectorizer_stats():
    # vocab size
    train_count.shape

    # check vocabulary using the command below
    print(countV.vocabulary_)

    # get feature names
    print(countV.get_feature_names()[:25])


# create tf-idf frequency features
tfidfV = TfidfTransformer()
train_tfidf = tfidfV.fit_transform(train_count)


def get_tfidf_stats():
    train_tfidf.shape
    # inspect the first 10 tf-idf rows as a dense array
    print(train_tfidf.A[:10])


# bag of words - with n-grams
# countV_ngram = CountVectorizer(ngram_range=(1,3), stop_words='english')
# tfidf_ngram = TfidfTransformer(use_idf=True, smooth_idf=True)
tfidf_ngram = TfidfVectorizer(stop_words='english', ngram_range=(1, 4),
    8: 1,
    9: 1,
    10: 1,
    11: 1,
    12: 1,
    13: 1,
    14: 1,
    15: 1,
    16: 1,
    17: 1,
    18: 1,
    19: 1
}

text_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1, 2))),
                     ('tfidf', TfidfTransformer(use_idf=True)),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                                           random_state=42, max_iter=5, tol=None,
                                           class_weight=class_weight))])
text_clf.fit(X_train, y_train)

predicted_SVM = text_clf.predict(X_test)
print("SVM part, metrics on test set:")
print(metrics.classification_report(y_test, predicted_SVM))

from sklearn.model_selection import GridSearchCV

parameters = {
# In[76]:
sparsity = (100.0 * messages_bow.nnz /
            (messages_bow.shape[0] * messages_bow.shape[1]))
print('sparsity: {}'.format(sparsity))

# In[77]:
from sklearn.feature_extraction.text import TfidfTransformer

# In[78]:
tfidf_transformer = TfidfTransformer().fit(messages_bow)

# In[79]:
tfidf4 = tfidf_transformer.transform(bow4)

# In[80]:
print(tfidf4)

# In[81]:
tfidf_transformer.idf_[bow_transformer.vocabulary_['university']]
    return [self.wnl.lemmatize(token, TAG_MAP[tag[0]]) for token, tag in pos_tag(tokenized)]


categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']

print('defining dataset')
trainingData = fetch_20newsgroups(subset='train', categories=categories, shuffle=True)

countVectorizer = CountVectorizer(tokenizer=LemmaTokenizer(), lowercase=True, strip_accents='unicode')

print('transforming data to tfidf')
xTrainCounts = countVectorizer.fit_transform(trainingData.data)
print('done')
# print(countVectorizer.vocabulary_.get(u'software'))

tfidTransformer = TfidfTransformer()
xTrainTfidf = tfidTransformer.fit_transform(xTrainCounts)

model = MultinomialNB().fit(xTrainTfidf, trainingData.target)

preds = model.predict(xTrainTfidf)
print(confusion_matrix(trainingData.target, preds))
print(accuracy_score(trainingData.target, preds))
print(classification_report(trainingData.target, preds))

new = ['This has nothing to do with church or religion',
       'Software engineering is getting hotter and hotter nowadays']
xNewCounts = countVectorizer.transform(new)
xNewTfidf = tfidTransformer.transform(xNewCounts)
predicted = model.predict(xNewTfidf)
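To make the two predictions above human-readable, one could map the predicted class indices back to the newsgroup names; a small follow-up sketch (the loop is an addition, everything else reuses names from the snippet).

for doc, category_id in zip(new, predicted):
    # target_names lists the newsgroup names in the same order as the integer labels
    print('%r => %s' % (doc, trainingData.target_names[category_id]))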
"./data/mid_cut_jieba.txt") x_train, x_test, y_train, y_test = train_test_split(x_text, y, test_size=0.2, random_state=2017) y = y.ravel() y_train = y_train.ravel() y_test = y_test.ravel() print("Train/Test split: {:d}/{:d}".format(len(y_train), len(y_test))) """ Naive Bayes classifier """ bayes_clf = Pipeline([ ('vect', CountVectorizer() ), # Convert a collection of text documents to a matrix of token counts ('tfidf', TfidfTransformer() ), # Transform a count matrix to a normalized tf or tf-idf representation ('clf', MultinomialNB()) # Naive Bayes classifier for multinomial models ]) bayes_clf.fit(x_train, y_train) """ Predict the test dataset using Naive Bayes""" predicted = bayes_clf.predict(x_test) print('Naive Bayes correct prediction: {:4.4f}'.format( np.mean(predicted == y_test))) print(metrics.classification_report(y_test, predicted, target_names=categories)) """ Support Vector Machine (SVM) classifier""" svm_clf = Pipeline([ ('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf',
        if fn[19] == 'D':
            test_demo_label = np.append(test_demo_label, fn[19])
            testDemo.append(f.read())
        if fn[19] == 'R':
            test_repub_label = np.append(test_repub_label, fn[19])
            testRepub.append(f.read())
        f.close()

# In this part of the code we make a pipeline, though we only use the CountVectorizer
# in this case. This portion of the code focuses on the Democratic party.
# First the code fits and transforms the Democratic data set we appended before,
# then converts it to a format we can analyze and finds bigrams within the dataset that
# contain keywords we're looking for, such as energy, education, and healthcare.
# We also keep a count per bigram to print out if necessary.
text_pipeline = Pipeline([('vect', CountVectorizer()),
                          ('tfidf', TfidfTransformer(use_idf=False)),
                          ('clf', MultinomialNB(alpha=0))])

vecti = CountVectorizer(ngram_range=(2, 2))
freqTrain = vecti.fit_transform(trainDemo).toarray()
nameDict = vecti.get_feature_names()
freqArray = np.sum(freqTrain, axis=0)
countDict = dict(zip(nameDict, freqArray))
energyDict = {}
educationDict = {}
healthDict = {}

print("Intriguing Bigrams Democrats: ")
for x in countDict:
    if "energy" in x:
        print(x)
        if x not in energyDict:
items = pd.read_csv('ml-100k/u.item', sep='|', names=i_cols, encoding='latin-1')
n_items = items.shape[0]
print('Number of items:', n_items)

X0 = items.values
X_train_counts = X0[:, -19:]

# tfidf
from sklearn.feature_extraction.text import TfidfTransformer

transformer = TfidfTransformer(smooth_idf=True, norm='l2')
tfidf = transformer.fit_transform(X_train_counts.tolist()).toarray()


def get_items_rated_by_user(rate_matrix, user_id):
    """
    Each line of rate_matrix contains: user_id, item_id, rating (score), time_stamp.
    We only care about the first three values.
    Return (item_ids, scores) rated by user user_id.
    """
    y = rate_matrix[:, 0]  # all users
    # item indices rated by user_id
    # we need to add 1 to user_id since ids in rate_matrix start from 1
    # while indices in Python start from 0
    ids = np.where(y == user_id + 1)[0]
    item_ids = rate_matrix[ids, 1] - 1  # index starts from 0
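A tiny illustration of the one-based id convention described in the docstring above (the three-row matrix is hypothetical data, not the MovieLens file): rows are [user_id, item_id, rating], so selecting user 0 means matching the value 1 in the first column.

import numpy as np

toy_rate_matrix = np.array([[1, 1, 5],
                            [1, 3, 4],
                            [2, 2, 3]])
ids = np.where(toy_rate_matrix[:, 0] == 0 + 1)[0]   # rows belonging to user 0
print(toy_rate_matrix[ids, 1] - 1)                  # zero-based item indices -> [0, 2]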
def tfidf_transformer(bow_matrix):
    transformer = TfidfTransformer(norm='l2', smooth_idf=True, use_idf=True)
    tfidf_matrix = transformer.fit_transform(bow_matrix)
    # print(type(tfidf_matrix))
    return transformer, tfidf_matrix
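A possible usage sketch for the helper above (the toy corpus and the CountVectorizer import are additions): build a bag-of-words matrix, then weight it.

from sklearn.feature_extraction.text import CountVectorizer

corpus = ["the sky is blue", "the sun is bright"]
cv = CountVectorizer()
bow_matrix = cv.fit_transform(corpus)

transformer, tfidf_matrix = tfidf_transformer(bow_matrix)
print(tfidf_matrix.shape)   # (2, number of terms)
print(transformer.idf_)     # learned idf weights, reusable on new counts via transformer.transform(...)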
    lemma_instance = WordNetLemmatizer()
    lemmas = [lemma_instance.lemmatize(word, "v") for word in no_stopwords]

    stem_instance = PorterStemmer()
    stems = [stem_instance.stem(word) for word in lemmas]
    return stems


# Creating the pipeline
print('\nCreating the pipeline ...')
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=text_processing)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),                        # integer counts to weighted TF-IDF scores
    ('classifier', MultinomialNB()),                      # train on TF-IDF vectors with a Naive Bayes classifier
])

# Train Test Split
X = messages['message']  # features
y = messages['label']    # target
msg_train, msg_test, label_train, label_test = train_test_split(X, y, test_size=0.3)

# Training Pipeline
print('\nTraining the Pipeline ...')
pipeline.fit(msg_train, label_train)
def run(train_data, valid_data, test_data, truth_data):
    train_data_df = pd.DataFrame.from_dict(train_data)
    truth_data_df = pd.DataFrame.from_dict(truth_data)
    train = pd.merge(train_data_df, truth_data_df, on="id")
    data = train.values
    textFeatures = ["postText", "targetCaptions", "targetParagraphs", "targetTitle",
                    "targetKeywords", "targetDescription", "truthClass"]
    vals = data.tolist()
    final_vals = []
    # print(vals[0])
    for i in range(len(vals)):
        if vals[i][1] != []:
            print(vals[i][2])
            final_vals.append([vals[i][2], vals[i][4], vals[i][5], vals[i][6],
                               vals[i][7], vals[i][8], vals[i][9]])

    vals_df = pd.DataFrame(final_vals, columns=["postText", "targetCaptions", "targetParagraphs",
                                                "targetTitle", "targetKeywords",
                                                "targetDescription", "truthClass"])
    textColumns = vals_df.values.tolist()
    df = []
    y = []
    print('---------')
    print(len(final_vals))

    VALIDATION_SPLIT = 0.1
    nb_validation_samples = int(VALIDATION_SPLIT * len(final_vals))
    valid_data = final_vals[:nb_validation_samples]
    test_data = final_vals[int(0.8 * len(final_vals)):int(0.9 * len(final_vals))]
    final_vals = final_vals[0:int(len(final_vals) * 0.8)]

    for i in final_vals:
        if i[6] == "clickbait":
            y.append(1)
        else:
            y.append(0)

    # print(textColumns[0])
    for i in range(len(final_vals)):
        text = []
        for j in range(0, 6):
            k = final_vals[i][j]
            # print(k, j)
            if j == 2 or j == 3:
                text.append(k)
            else:
                text += k
        words = ""
        for string in text:
            string = clean_str(string)
            words += " ".join(string.split())
        df += [words]

    vectorizer = CountVectorizer(input='content', lowercase=False, analyzer='word', stop_words='english')
    X = vectorizer.fit_transform(df)
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X)
    print(X_train_tfidf.shape)

    clf = linear_model.LinearRegression()
    clf.fit(X_train_tfidf, y)

    ### VALIDATION DATA ###
    print("Validation")
    # valid_data_df = pd.DataFrame(valid_data)
    # valid_data_df = pd.DataFrame.from_dict(valid_data)
    # valid = pd.merge(valid_data_df, truth_data_df, on="id")
    # vdata = valid.append(train).values
    # vdata = final_vals.append(valid_data_df.values).tolist()
    vdata = final_vals + valid_data
    y_valid = []
    for i in vdata:
        if i[6] == "clickbait":
            y_valid.append(1)
        if i[6] == "no-clickbait":
            y_valid.append(0)
    y_valid = pd.DataFrame(y_valid)
    print("Y_valid length", len(y_valid))

    # vdata = valid[textFeatures].append(train[textFeatures]).values.tolist()
    df_valid = []
    for i in range(len(vdata)):
        text = []
        for j in range(0, 5):
            k = vdata[i][j]
            if j == 2 or j == 3:
                text.append(k)
            else:
                text += k
        words = ""
        for string in text:
            string = clean_str(string)
            words += " ".join(string.split())
        df_valid += [words]

    # a_train, a_val, b_train, b_val = train_test_split(df_valid, y_valid, test_size=0.11, random_state=42)
    predicted = []
    for v in df_valid:
        valid_X = vectorizer.transform([v])
        X_valid_tfidf = tfidf_transformer.transform(valid_X)
        predicted.append(clf.predict(X_valid_tfidf).round())
    scores = accuracy_score(y_valid, predicted)
    print("Validation Data Accuracy ", scores)

    ### TEST DATA ###
    # predicted = []
    # for t in df_test:
    #     test_X = vectorizer.transform([t])
    #     X_test_tfidf = tfidf_transformer.transform(test_X)
    #     predicted.append(model.predict(X_test_tfidf).round())
    # scores = accuracy_score(y_test, predicted)
    tdata = test_data
    y_test = []
    df_test = []
    for i in tdata:
        if i[6] == "clickbait":
            y_test.append(1)
        if i[6] == "no-clickbait":
            y_test.append(0)

    # textColumns_test = test[textFeatures]
    # textColumns_test = textColumns_test.values.tolist()
    for i in range(len(tdata)):
        text = []
        for j in range(0, 5):
            k = tdata[i][j]
            if j == 2 or j == 3:
                text.append(k)
            else:
                text += k
        words = ""
        for string in text:
            string = clean_str(string)
            words += " ".join(string.split())
        df_test += [words]

    # test_X = vectorizer.fit_transform(df_test)
    # X_test_tfidf = tfidf_transformer.fit_transform(test_X)
    # predicted = model.predict(X_test_tfidf)
    # print(clf.score(X_test_tfidf, y_test))
    predicted = []
    for t in df_test:
        test_X = vectorizer.transform([t])
        X_test_tfidf = tfidf_transformer.transform(test_X)
        predicted.append(clf.predict(X_test_tfidf).round())
    scores = accuracy_score(y_test, predicted)
    print("Test Data Accuracy ", scores)
def build_model():
    pipeline = Pipeline([('vect', CountVectorizer(tokenizer=tokenize)),
                         ('tfidf', TfidfTransformer()),
                         ('clf', MultiOutputClassifier(RandomForestClassifier()))])
    return pipeline
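A minimal usage sketch for the pipeline above, which targets multi-label outputs (one column per category). The toy messages, the two-column label array, and the whitespace `tokenize` defined below are stand-ins for the project's real data and tokenizer.

import numpy as np

def tokenize(text):
    # stand-in for the project's real tokenize function, assumed to exist elsewhere
    return text.lower().split()

X_toy = ["water needed in the village", "storm destroyed the bridge", "medical supplies required"]
Y_toy = np.array([[1, 0], [0, 1], [1, 1]])   # two hypothetical output categories

model = build_model()
model.fit(X_toy, Y_toy)                      # MultiOutputClassifier fits one forest per category
print(model.predict(["we need water and medicine"]))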
        eight_test.target[k] = 0
    else:
        eight_test.target[k] = 1

eight_train.target_names = ['c', 'r']
eight_test.target_names = ['c', 'r']

#%%
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction import text

stop_words = text.ENGLISH_STOP_WORDS
vectorizer = CountVectorizer(stop_words=stop_words)
X = vectorizer.fit_transform(eight_train.data)

tfidf = TfidfTransformer()
X_tfidf = tfidf.fit_transform(X)

#%%
from sklearn.decomposition import TruncatedSVD

svd_model = TruncatedSVD(n_components=50, random_state=42)
X_svd = svd_model.fit_transform(X_tfidf)

#%%
from sklearn.svm import LinearSVC

LSVM = LinearSVC(loss='hinge')
X_LSVM = LSVM.fit(X_svd, eight_train.target)
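Two quick follow-up checks one might add here (a sketch; both calls reuse objects defined above): how much variance the 50 LSA components retain, and the linear SVM's accuracy on its own training projection.

#%%
print(svd_model.explained_variance_ratio_.sum())   # fraction of variance kept by the 50 components
print(LSVM.score(X_svd, eight_train.target))       # training-set accuracy of the LinearSVC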
    # Instantiating a PorterStemmer object
    porter = PorterStemmer()
    token_words = word_tokenize(nopunc)
    stem_message = []
    for word in token_words:
        stem_message.append(porter.stem(word))
        stem_message.append(" ")
    return ''.join(stem_message)


sms['clean_message'] = sms['message'].apply(text_process)

X = sms.clean_message
y = sms.label_num

pipe = Pipeline([('bow', CountVectorizer()),
                 ('tfid', TfidfTransformer()),
                 ('model', LogisticRegression(solver='liblinear'))])
pipe.fit(X, y)


@app.route('/')
def home():
    return render_template('home.html')


@app.route('/Predict', methods=['POST'])
def Predict():
    if request.method == 'POST':
        message = request.form['message']
def calTFidf(text):
    vectorizer = CountVectorizer(lowercase=True)
    wordcount = vectorizer.fit_transform(text)
    tf_idf_transformer = TfidfTransformer()
    tfidf_matrix = tf_idf_transformer.fit_transform(wordcount)
    return vectorizer, tfidf_matrix
def transformer(self, matrixList):
    trans = TfidfTransformer()
    counts = array(matrixList)
    tfidf = trans.fit_transform(counts)
    # print(type(tfidf))
    return tfidf
def test_vectorizer():
    # raw documents as an iterator
    train_data = iter(ALL_FOOD_DOCS[:-1])
    test_data = [ALL_FOOD_DOCS[-1]]
    n_train = len(ALL_FOOD_DOCS) - 1

    # test without vocabulary
    v1 = CountVectorizer(max_df=0.5)
    counts_train = v1.fit_transform(train_data)
    if hasattr(counts_train, 'tocsr'):
        counts_train = counts_train.tocsr()
    assert_equal(counts_train[0, v1.vocabulary_["pizza"]], 2)

    # build a vectorizer v2 with the same vocabulary as the one fitted by v1
    v2 = CountVectorizer(vocabulary=v1.vocabulary_)

    # check that the two vectorizers give the same output on the test sample
    for v in (v1, v2):
        counts_test = v.transform(test_data)
        if hasattr(counts_test, 'tocsr'):
            counts_test = counts_test.tocsr()

        vocabulary = v.vocabulary_
        assert_equal(counts_test[0, vocabulary["salad"]], 1)
        assert_equal(counts_test[0, vocabulary["tomato"]], 1)
        assert_equal(counts_test[0, vocabulary["water"]], 1)

        # stop word from the fixed list
        assert_false("the" in vocabulary)

        # stop word found automatically by the vectorizer DF thresholding;
        # words that are highly frequent across the complete corpus are likely
        # to be uninformative (either real stop words or extraction artifacts)
        assert_false("copyright" in vocabulary)

        # not present in the sample
        assert_equal(counts_test[0, vocabulary["coke"]], 0)
        assert_equal(counts_test[0, vocabulary["burger"]], 0)
        assert_equal(counts_test[0, vocabulary["beer"]], 0)
        assert_equal(counts_test[0, vocabulary["pizza"]], 0)

    # test tf-idf
    t1 = TfidfTransformer(norm='l1')
    tfidf = t1.fit(counts_train).transform(counts_train).toarray()
    assert_equal(len(t1.idf_), len(v1.vocabulary_))
    assert_equal(tfidf.shape, (n_train, len(v1.vocabulary_)))

    # test tf-idf with new data
    tfidf_test = t1.transform(counts_test).toarray()
    assert_equal(tfidf_test.shape, (len(test_data), len(v1.vocabulary_)))

    # test tf alone
    t2 = TfidfTransformer(norm='l1', use_idf=False)
    tf = t2.fit(counts_train).transform(counts_train).toarray()
    assert_false(hasattr(t2, "idf_"))

    # test idf transform with unlearned idf vector
    t3 = TfidfTransformer(use_idf=True)
    assert_raises(ValueError, t3.transform, counts_train)

    # test idf transform with incompatible n_features
    X = [[1, 1, 5],
         [1, 1, 0]]
    t3.fit(X)
    X_incompt = [[1, 3],
                 [1, 3]]
    assert_raises(ValueError, t3.transform, X_incompt)

    # L1-normalized term frequencies sum to one
    assert_array_almost_equal(np.sum(tf, axis=1), [1.0] * n_train)

    # test the direct tfidf vectorizer
    # (equivalent to term count vectorizer + tfidf transformer)
    train_data = iter(ALL_FOOD_DOCS[:-1])
    tv = TfidfVectorizer(norm='l1')
    tv.max_df = v1.max_df
    tfidf2 = tv.fit_transform(train_data).toarray()
    assert_false(tv.fixed_vocabulary_)
    assert_array_almost_equal(tfidf, tfidf2)

    # test the direct tfidf vectorizer with new data
    tfidf_test2 = tv.transform(test_data).toarray()
    assert_array_almost_equal(tfidf_test, tfidf_test2)

    # test transform on unfitted vectorizer with empty vocabulary
    v3 = CountVectorizer(vocabulary=None)
    assert_raises(ValueError, v3.transform, train_data)

    # ascii preprocessor?
    v3.set_params(strip_accents='ascii', lowercase=False)
    assert_equal(v3.build_preprocessor(), strip_accents_ascii)

    # error on bad strip_accents param
    v3.set_params(strip_accents='_gabbledegook_', preprocessor=None)
    assert_raises(ValueError, v3.build_preprocessor)

    # error with bad analyzer type
    v3.set_params(analyzer='_invalid_analyzer_type_')
    assert_raises(ValueError, v3.build_analyzer)
                                                    df.target,
                                                    test_size=0.2,
                                                    random_state=42)

# ---------------------------------------- Model training ---------------------------------------------
# Instead of performing each task separately, a pipeline lets us chain everything
# (training, prediction, transformation, ...).
from sklearn.neighbors import KNeighborsClassifier


# Helper class used to adapt KNN to the pipeline (converts sparse output to dense)
class DenseTransformer(TransformerMixin):

    def transform(self, X, y=None, **fit_params):
        return X.todense()

    def fit_transform(self, X, y=None, **fit_params):
        self.fit(X, y, **fit_params)
        return self.transform(X)

    def fit(self, X, y=None, **fit_params):
        return self


# The pipeline lets us pass the three components (the word counter, the transformer
# and the model to use).
pipe = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('model', OneVsRestClassifier(KNeighborsClassifier(n_neighbors=3)))])

# Train the model
model = pipe.fit(X_train, y_train)
prediction = model.predict(X_test)

filename = 'finalized_model.sav'
# Save the model
pickle.dump(model, open(filename, 'wb'))