def TFIDFEmbedding(self, text_list):
    print('embedding words using the TF-IDF model ...')
    train_data = []
    for item in text_list:
        # join tokens with spaces so the word analyzer can re-split them
        # (the original joined with '', which concatenates the tokens)
        train_data.append(' '.join(item))
    tfidf_word = TFIDF(min_df=0, max_features=None, analyzer='word',
                       ngram_range=(1, 3), use_idf=1, smooth_idf=1,
                       sublinear_tf=False, stop_words='english')
    # NOTE: tfidf_char is built but never used below, and despite its name it
    # is configured with a word analyzer
    tfidf_char = TFIDF(min_df=0, max_features=None, strip_accents='unicode',
                       analyzer='word', ngram_range=(1, 3), use_idf=1,
                       smooth_idf=1, sublinear_tf=False, stop_words='english')
    tfidf_word.fit(train_data)
    vector_list = tfidf_word.transform(train_data)
    return vector_list
def train_tfidf():
    skills, data = read()
    corpus = [' '.join(turnToWordList(d['description'])) for d in data]
    features = 5000
    tfidf = TFIDF(min_df=2, max_features=features, strip_accents="unicode",
                  analyzer="word", token_pattern=r"\w{1,}", ngram_range=(1, 3),
                  use_idf=1, smooth_idf=1, sublinear_tf=1, stop_words="english")
    tfidf.fit(corpus)
    joblib.dump(tfidf, '../../model/tfidf.pkl')
def train_tfidf(train_data, dim=5000):
    # `dim` was a free variable in the original; exposed here as a parameter
    tfidf = TFIDF(min_df=5, max_features=dim, ngram_range=(1, 3),
                  use_idf=1, smooth_idf=1, sublinear_tf=True)
    X = tfidf.fit_transform(train_data)  # fit and transform in one pass
    word_dict = {}
    name = tfidf.get_feature_names()
    with open('name.txt', 'w') as fw:
        for i, s in enumerate(name):
            s = s.replace(" ", "_")  # join n-gram parts with underscores
            word_dict[i] = s
            fw.write(s)
            fw.write('\n')
    raw_text = []
    for line in X.A:
        # rebuild a pseudo-document from the features with non-zero weight
        # (the original indexed word_dict with the weights themselves, which
        # raises KeyError; indexing by feature position was surely intended)
        s = ""
        for i, v in enumerate(line):
            if v > 0:
                s += " " + word_dict[i]
        raw_text.append(s)
    return raw_text
def vectorize2(self, f, feature):
    data = self.preprocess(f)
    tfidf = TFIDF(vocabulary=feature)
    fit_t = tfidf.fit_transform(data["content"])
    weight = pd.DataFrame(fit_t.toarray())
    return weight.values, data["label"].values
def tf_idf(train_data, test_data):
    """TF-IDF vectors."""
    tfidf = TFIDF(
        min_df=2,
        max_features=None,
        strip_accents='unicode',
        analyzer='word',
        token_pattern=r'\w{1,}',
        ngram_range=(1, 3),  # unigrams through trigrams
        use_idf=1,
        smooth_idf=1,
        sublinear_tf=1,
        stop_words='english')  # drop English stop words
    # merge train and test sets so the TF-IDF vectorization shares one vocabulary
    data_all = train_data + test_data
    len_train = len(train_data)
    tfidf.fit(data_all)
    data_all = tfidf.transform(data_all)
    # split back into the train and test parts
    train_x = data_all[:len_train]
    test_x = data_all[len_train:]
    print("train: \n", np.shape(train_x[0]))
    print("test: \n", np.shape(test_x[0]))
    return train_x, test_x
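# A minimal usage sketch for the tf_idf variant above (the toy reviews are
# invented for illustration; assumes the module-level numpy/TFIDF imports).
# Note that fitting on train+test together leaks test-set vocabulary and
# document frequencies into the features; fitting on the training split alone
# avoids that.
train_docs = ["a great movie", "a terrible movie", "great acting overall"]
test_docs = ["terrible plot, great cast"]
train_x, test_x = tf_idf(train_docs, test_docs)
print(train_x.shape, test_x.shape)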
def train_tfidf(train_data):
    tfidf = TFIDF(min_df=5, max_features=5000, ngram_range=(1, 3),
                  use_idf=1, smooth_idf=1)
    tfidf.fit(train_data)
    return tfidf
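# Usage sketch for the variant above (toy corpus invented for illustration):
# with min_df=5 a term must appear in at least five documents, so tiny corpora
# need a repeated term or fit() raises "empty vocabulary".
corpus = ["good movie", "bad movie", "great movie",
          "awful movie", "fine movie", "long movie"]
vec = train_tfidf(corpus)
print(vec.vocabulary_)             # only 'movie' clears the min_df threshold
print(vec.transform(corpus).shape)  # (6, 1)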
def train_tfidf(self, facts):
    """Train the TF-IDF vectorizer model."""
    tfidf = TFIDF(min_df=5, max_features=DIM, ngram_range=(1, 3))
    tfidf.fit(facts)
    if util.DEBUG:
        print("DEBUG: TF-IDF model learnt.")
    return tfidf
def vectorize(self, data):
    tfidf = TFIDF()
    fit_t = tfidf.fit_transform(data["content"])
    weight = pd.DataFrame(fit_t.toarray())
    word = tfidf.get_feature_names()
    # print(weight.shape)
    return word, weight, data["label"].values
def train_tfidf(train_data, dim=5000):
    # `dim` was a free variable in the original; exposed here as a parameter
    tfidf = TFIDF(min_df=5, max_features=dim, ngram_range=(1, 2),
                  use_idf=1, smooth_idf=1, sublinear_tf=True)
    tfidf.fit(train_data)
    return tfidf
def tf_idf(train_data, test_data):
    len_train = len(train_data)
    tokenizer = TweetTokenizer()
    vectorizer = TFIDF(ngram_range=(1, 2), tokenizer=tokenizer.tokenize)
    full_text = list(train_data['Phrase'].values) + list(test_data['Phrase'].values)
    vectorizer.fit(full_text)
    data_all = vectorizer.transform(full_text)
    # split back into the train and test parts
    train_x = data_all[:len_train]
    test_x = data_all[len_train:]
    return train_x, test_x
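# Why swap in TweetTokenizer (a hedged illustration; the tweets are invented):
# sklearn's default token_pattern discards punctuation-only tokens, while
# TweetTokenizer keeps hashtags, handles, and emoticons intact.
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer as TFIDF

tweets = ["@user loved it! #great :)", "so bad #fail :("]
vec = TFIDF(tokenizer=TweetTokenizer().tokenize)
vec.fit(tweets)
print(sorted(vec.vocabulary_))  # includes '#great', '#fail', '@user', ':)', ':('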
def to_matrix(all):
    tfidf = TFIDF(
        min_df=3,  # a term must appear in at least 3 documents
        max_features=None,
        strip_accents='unicode',
        analyzer='word',
        token_pattern=r'\w{1,}',
        ngram_range=(1, 4),  # unigrams through 4-grams
        use_idf=1,
        smooth_idf=1,
        sublinear_tf=1,
        stop_words='english')  # drop English stop words
    tfidf.fit(all)
    data_all = tfidf.transform(all)
    return data_all
def train_tfidf(train_data, dim=5000, ngram=3, min_df=5):
    ngram_range = (1, 3)
    if ngram == 1:
        ngram_range = (1, 1)
    elif ngram == 2:
        ngram_range = (1, 2)
    tfidf = TFIDF(min_df=min_df, max_features=dim, ngram_range=ngram_range,
                  use_idf=1, smooth_idf=1)
    tfidf.fit(train_data)
    return tfidf
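# Usage sketch for the ngram switch (toy corpus invented for illustration):
# ngram=1 keeps unigrams only, ngram=2 adds bigrams, any other value falls
# through to the (1, 3) default.
corpus = ["good movie"] * 5 + ["bad movie"] * 5
uni = train_tfidf(corpus, ngram=1)
tri = train_tfidf(corpus, ngram=3)
print(len(uni.vocabulary_), len(tri.vocabulary_))  # 3 vs. 5 features here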
def tfidf_extraction(texts, vectorizer=None, max_features=2000, ngram_range=(1, 1)):
    if vectorizer is None:
        vectorizer = TFIDF(tokenizer=LemmaTokenizer(),
                           binary=True,  # was the string "True", which is only truthy by accident
                           strip_accents="unicode",
                           ngram_range=ngram_range,
                           analyzer='word',
                           stop_words='english',
                           max_features=max_features)
        tfidf = vectorizer.fit_transform(texts)
    else:
        tfidf = vectorizer.transform(texts)
    return tfidf, vectorizer
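# Hedged usage sketch: fit on the training texts once (vectorizer=None), then
# pass the fitted vectorizer back in so the test split reuses the same
# vocabulary. LemmaTokenizer is assumed to be defined elsewhere in this
# module; the texts are invented for illustration.
train_texts = ["dogs are running fast", "cats are sleeping"]
test_texts = ["a dog runs"]
train_m, vec = tfidf_extraction(train_texts)
test_m, _ = tfidf_extraction(test_texts, vectorizer=vec)
print(train_m.shape, test_m.shape)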
def preprocessing():
    """Data preprocessing."""
    df = load_dataset()
    train_X, valid_X, train_y, valid_y = \
        train_test_split(df['content'], df['content_type'],
                         test_size=0.2, random_state=42)
    model_tfidf = TFIDF(min_df=5, max_features=5000, ngram_range=(1, 3),
                        use_idf=1, smooth_idf=1)
    # learn the idf vector
    model_tfidf.fit(train_X)
    # turn the documents into a matrix of tf-idf features:
    # one row per document, one column per feature term
    train_vec = model_tfidf.transform(train_X)
    valid_vec = model_tfidf.transform(valid_X)
    # the original returned valid_y twice; valid_X was surely intended
    return train_X, valid_X, train_y, valid_y, train_vec, valid_vec
def tfidf_count(train_data, test_data):
    begintime = datetime.datetime.now()
    # Reference: http://blog.csdn.net/longxinchen_ml/article/details/50629613
    tfidf = TFIDF(
        min_df=2,  # a term must appear in at least 2 documents
        max_features=None,
        # strip_accents removes accents in preprocessing: 'ascii' is fast but
        # only covers characters with a direct ASCII mapping; 'unicode' is
        # slightly slower but handles any character; None (default) does nothing
        strip_accents='unicode',
        analyzer='word',  # features are words (or word n-grams), not characters
        # token_pattern is the regex defining what counts as a "token"; used
        # only when analyzer == 'word'. The default regex keeps tokens of two
        # or more alphanumeric characters (punctuation is ignored and always
        # treated as a token separator).
        token_pattern=r'\w{1,}',
        ngram_range=(1, 3),  # unigrams through trigrams
        use_idf=1,  # weight by inverse document frequency
        smooth_idf=1,  # add one to document frequencies so no idf is zero
        sublinear_tf=1,  # replace tf with 1 + log(tf)
        # stop_words: a string is passed to _check_stop_list and the matching
        # stop list is used ('english' is currently the only supported string
        # value); None keeps every token
        stop_words=None)
    # merge train and test sets so the TF-IDF vectorization shares one vocabulary
    data_all = train_data + test_data
    len_train = len(train_data)
    tfidf.fit(data_all)  # learn vocabulary and idf from the merged corpus
    data_all = tfidf.transform(data_all)  # documents -> document-term matrix
    # print(data_all)
    # split back into the train and test parts
    train_tfidf = data_all[:len_train]
    test_tfidf = data_all[len_train:]
    print('TF-IDF over.')
    endtime = datetime.datetime.now()
    tfidftime = (endtime - begintime).seconds * 1000 + \
        (endtime - begintime).microseconds / 1000  # microseconds -> milliseconds
    return train_tfidf, test_tfidf, tfidftime
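# Usage sketch for tfidf_count (toy documents invented for illustration;
# assumes the module-level `import datetime` this fragment relies on):
train_m, test_m, elapsed_ms = tfidf_count(
    ["good film", "good plot", "good cast"], ["bad film"])
print(train_m.shape, test_m.shape, elapsed_ms, 'ms')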
from sklearn.feature_extraction.text import TfidfTransformer

def tfidf_tsne():
    indx_sent, word2idx, idx2word = Sentences().limit_vocab()
    # rows are words, columns are sentences
    word_sent_counts = np.zeros((len(word2idx) + 1, len(indx_sent) + 1))
    for j, sentence in enumerate(indx_sent):
        for idx in sentence:
            word_sent_counts[idx, j] += 1
    # the counts already form a matrix, so TfidfTransformer is the right tool
    # here; TfidfVectorizer (TFIDF elsewhere in this file) expects raw text
    word_sent_tfidf = TfidfTransformer().fit_transform(word_sent_counts).toarray()
    word_sent_tsne = TSNE().fit_transform(word_sent_tfidf)
    plt.scatter(word_sent_tsne[:, 0], word_sent_tsne[:, 1])
    for label in range(len(word2idx)):
        try:
            plt.annotate(idx2word[label],
                         xy=(word_sent_tsne[label, 0], word_sent_tsne[label, 1]))
        except KeyError:
            pass
    plt.show()
def train_tfidf(self, facts):
    tfidf = TFIDF(min_df=5, max_features=DIM, ngram_range=(1, 3))
    tfidf.fit(facts)
    # the original fit the model and then discarded it; returning it (as the
    # other variant of this method in this file does) was surely intended
    return tfidf
print(all_data[0])
print(all_data[1999])
print(type(all_data[0]))
print(data_all[0])
print(data_all[1999])
print(type(data_all[0]))

# In[4]:

from sklearn.feature_extraction.text import TfidfVectorizer as TFIDF

tfidf = TFIDF(min_df=5,  # a term must appear in at least 5 documents
              max_features=None,
              strip_accents='unicode',
              analyzer='word',
              token_pattern=r'\w{1,}',
              ngram_range=(1, 1),  # unigram model
              use_idf=1,
              smooth_idf=1,
              sublinear_tf=1)

# In[5]:

tfidf.fit(all_data)
all_data = tfidf.transform(all_data)
print(type(all_data))

# In[6]:
def train_tfidf(train_data):
    # the custom token_pattern also keeps single-character tokens
    tfidf = TFIDF(token_pattern=r"(?u)\b\w+\b")  # 0.85030136
    tfidf.fit(train_data)
    return tfidf
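# Why override token_pattern (a hedged illustration with invented documents):
# sklearn's default pattern r"(?u)\b\w\w+\b" silently drops one-character
# tokens, which matters for e.g. character-segmented text. The override keeps them.
from sklearn.feature_extraction.text import TfidfVectorizer as TFIDF

docs = ["a b ab", "a c ac"]
print(sorted(TFIDF().fit(docs).vocabulary_))  # ['ab', 'ac']
print(sorted(TFIDF(token_pattern=r"(?u)\b\w+\b").fit(docs).vocabulary_))
# -> ['a', 'ab', 'ac', 'b', 'c']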
    train_data.append(' '.join(review_to_wordlist(train['review'][i])))
test_data = []
for i in range(len(test['review'])):
    test_data.append(' '.join(review_to_wordlist(test['review'][i])))

# preview the data
# print(train_data[0], '\n')
# print(test_data[0])

from sklearn.feature_extraction.text import TfidfVectorizer as TFIDF

tfidf = TFIDF(min_df=2,  # a term must appear in at least 2 documents
              max_features=None,
              strip_accents='unicode',
              analyzer='word',
              token_pattern=r'\w{1,}',
              ngram_range=(1, 3),  # unigrams through trigrams
              use_idf=1,
              smooth_idf=1,
              sublinear_tf=1,
              stop_words='english')  # drop English stop words

# merge train and test sets so the TF-IDF vectorization shares one vocabulary
data_all = train_data + test_data
len_train = len(train_data)
tfidf.fit(data_all)
data_all = tfidf.transform(data_all)
# split back into the train and test parts
train_x = data_all[:len_train]
test_x = data_all[len_train:]
print('TF-IDF over.')
# print("*************") # 是否存在样本不平衡问题? # for i in [0, 1, 2, 3]: # print(i, (train.target == i).sum()/len(train.target)) ''' 0 0.26052974381241856 1 0.25749023013460703 2 0.23708206686930092 3 0.24489795918367346 ''' from sklearn.feature_extraction.text import TfidfVectorizer as TFIDF Xtrain = train.data Xtest = test.data Ytrain = train.target Ytest = test.target tfidf = TFIDF().fit(Xtrain) Xtrain_ = tfidf.transform(Xtrain) Xtest_ = tfidf.transform(Xtest) # print(Xtrain_) tosee = pd.DataFrame(Xtrain_.toarray(), columns=tfidf.get_feature_names()) # print(tosee.shape) # (2303, 40725) # print(tosee.head()) # 建模 from sklearn.naive_bayes import MultinomialNB, ComplementNB, BernoulliNB from sklearn.metrics import brier_score_loss as BS name = ["Multinomial", "Complement", "Bournulli"] # # 注意高斯朴素贝叶斯不接受稀疏矩阵 models = [MultinomialNB(), ComplementNB(), BernoulliNB()] # for name, clf in zip(name, models): clf.fit(Xtrain_, Ytrain)
# ,"talk.politics.guns" #政治 - 枪支问题 , "talk.politics.mideast" ] #政治 - 中东问题 train = fetch_20newsgroups(subset="train", categories=categories) test = fetch_20newsgroups(subset="test", categories=categories) xtrain = train.data xtest = test.data print("train", len(data)) # print("text",data[0]) ytrain = train.target ytest = test.target tfidf = TFIDF().fit(xtrain) xtrain_ = tfidf.transform(xtrain) xtest_ = tfidf.transform(xtest) print("xtrain_", xtrain_.shape) from sklearn.naive_bayes import MultinomialNB, ComplementNB, BernoulliNB from sklearn.metrics import brier_score_loss as BS name = ["Multinomial", "Complement", "Bournulli"] models = [MultinomialNB(), ComplementNB(), BernoulliNB()] # for name,clf in zip(name,models): # clf.fit(xtrain_,ytrain) # y_pred=clf.predict(xtest_) # proba=clf.predict_proba(xtest_) # score=clf.score(xtest_,ytest)
api_train = pd.DataFrame(api_train)
api_train.rename(columns={0: 'return'}, inplace=True)

test['api_return'] = test.return_value.map(str)
api_test = test.groupby(by='file_id').apply(lambda x: ' '.join(x.api_return))
api_test = pd.DataFrame(api_test)
api_test.rename(columns={0: 'return'}, inplace=True)

import networkx as nx

apiSet = list(set(train.api) | set(test.api))

# TF-IDF features
print('tfidf starts')
tfidf = False
if tfidf:
    vec = TFIDF(ngram_range=(1, 4), max_features=300000)
    tfidf_train = vec.fit_transform(df_train['text'])
    tfidf_test = vec.transform(df_test['text'])
    print(tfidf_train.shape, tfidf_test.shape, time.time() - start0)
    sparse.save_npz('./virus_set/tfidf_train.npz', tfidf_train)  # save
    sparse.save_npz('./virus_set/tfidf_test.npz', tfidf_test)  # save

# TF-IDF features over the api/return-value sequences (kept disabled)
# vec = TFIDF(ngram_range=(1, 2), min_df=3, max_df=0.9, use_idf=1, smooth_idf=1, sublinear_tf=1)
# tfidf_train_api_return = vec.fit_transform(df_train['parallel_api'])
# tfidf_test_api_return = vec.transform(df_test['parallel_api'])
# print(tfidf_train_api_return.shape, tfidf_test_api_return.shape, time.time() - start0)
# sparse.save_npz('./virus_set/tfidf_train_api_return.npz', tfidf_train_api_return)  # save
# sparse.save_npz('./virus_set/tfidf_test_api_return.npz', tfidf_test_api_return)  # save
# process the context: filter out punctuation
temp_context = client.annotate(raw_context)
parsed_context = []
for s in temp_context.sentence:
    this_sentence = []
    for token in s.token:
        ts = token.lemma.lower()
        if ts not in PUNCTUATIONS:
            this_sentence.append(ts)
    parsed_sentence = " ".join(this_sentence)
    parsed_context.append(parsed_sentence)

# train the TF-IDF model on the processed context, without stop words.
# NOTE: TfidfVectorizer's `input` parameter names the input *type*
# ('content', 'filename', or 'file'); it does not take the corpus itself,
# so the model has to be fit explicitly.
unigram_model = TFIDF(analyzer='word', dtype=np.float32, stop_words=STOP_WORDS)
unigram_model.fit(parsed_context)

# process each question in the question & answer set
for q in set_qass:
    raw_question = q['question']
    qid = q['id']
    # process a single question, generating a corresponding questionSpan;
    # additionally, analyze the type of the question. If the question type is
    # WDT, WHAT, or WHICH, we should get the part for substitution.
    temp_question = client.annotate(raw_question)
    this_question = []
    size_tokens = len(temp_question.sentence[0].token)
    list_tokens = temp_question.sentence[0].token
    IDENTFY = False
    qtype = 8
    QSpan = questionSpan()
testData = []
for i in range(len(test['review'])):
    testData.append(' '.join(turnToWordList(test['review'][i])))
print(len(testData))
# print(train.head())
# print(test.head())

from sklearn.feature_extraction.text import TfidfVectorizer as TFIDF

print('training TFIDF')
tfidf = TFIDF(min_df=2, max_features=1000, strip_accents="unicode",
              analyzer="word", token_pattern=r"\w{1,}", ngram_range=(1, 3),
              use_idf=1, smooth_idf=1, sublinear_tf=1, stop_words="english")

allData = trainData + testData
lentrain = len(trainData)
tfidf.fit(allData)
allData = tfidf.transform(allData)
train_x = allData[:lentrain]
test_x = allData[lentrain:]
print("TF-IDF processing finished")
from sklearn.model_selection import train_test_split
# shuffle the data
from sklearn.utils import shuffle

X_shuf, Y_shuf = shuffle(content, data['rumorType'])

# split the labelled data into training and test sets
train_X, test_X, train_y, test_y = train_test_split(X_shuf, Y_shuf,
                                                    test_size=0.2,
                                                    random_state=42)

# build the model
model_tfidf = TFIDF(min_df=5, max_features=5000, ngram_range=(1, 3),
                    use_idf=1, smooth_idf=1)
# learn the idf vector
model_tfidf.fit(train_X)
# turn the documents into a matrix of tf-idf features:
# one row per document, one column per feature term
train_vec = model_tfidf.transform(train_X)

# train the classifier
model_SVC = LinearSVC()
clf = CalibratedClassifierCV(model_SVC)
clf.fit(train_vec, train_y)

# turn the test documents into a matrix
test_vec = model_tfidf.transform(test_X)

# validation
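# A minimal sketch of the validation step announced above (the accuracy
# metric is an assumption; any sklearn classification metric would slot in):
from sklearn.metrics import accuracy_score

pred_y = clf.predict(test_vec)
print('accuracy:', accuracy_score(test_y, pred_y))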
def main():
    sw = list(stopwords.words("english"))
    data_src = "D:\\Reference\\aclImdb"
    tags = ['neg', 'pos']
    gbm_param_grid = {
        'n_estimators': range(5, 20),
        'max_depth': range(6, 20),
        'learning_rate': [.4, .45, .5, .55, .6],
        'colsample_bytree': [.6, .7, .8, .9, 1],
        'min_child_weight': range(1, 6, 2)
    }

    # Training
    x_train = []
    y_train = []
    for tag in tags:
        for aFile in os.listdir(f"{data_src}\\train\\{tag}"):
            with open(f"{data_src}\\train\\{tag}\\{aFile}", "r",
                      encoding="utf-8") as f:
                x_train.append(f.read().strip())
                y_train.append(tags.index(tag))
    tfidf = TFIDF(stop_words=sw)
    x_train_tfidf = tfidf.fit_transform(x_train)
    xgb = XGBClassifier()
    xgb_random = CV(param_distributions=gbm_param_grid, estimator=xgb,
                    scoring="accuracy", verbose=1, n_iter=50, cv=5, n_jobs=-1)
    xgb_random.fit(x_train_tfidf, y_train)
    print("Search log: ", xgb_random.cv_results_)
    print("Best parameters found: ", xgb_random.best_params_)
    print("Best accuracy found: ", xgb_random.best_score_)

    # Testing
    x_test = []
    y_test = []
    for tag in tags:
        for aFile in os.listdir(f"{data_src}\\test\\{tag}"):
            with open(f"{data_src}\\test\\{tag}\\{aFile}", "r",
                      encoding="utf-8") as f:
                x_test.append(f.read().strip())
                y_test.append(tags.index(tag))
    x_test_tfidf = tfidf.transform(x_test)
    y_pred = xgb_random.predict(x_test_tfidf)
    print("Acc:", accuracy_score(y_test, y_pred))
    print("Rec:", recall_score(y_test, y_pred))
    print("Pre:", precision_score(y_test, y_pred))
    print("F1:", f1_score(y_test, y_pred))

    # Save the models
    dump(tfidf, "model/tfidf.pkl")
    dump(xgb_random, "model/xgb.pkl")
    data_all[i] = remove_number(data_all[i])
    data_all[i] = remove_link(data_all[i])
    print(i)

# In[4]:

print(data_all[100])

# In[5]:

from sklearn.feature_extraction.text import TfidfVectorizer as TFIDF

tfidf = TFIDF(
    min_df=5,  # a term must appear in at least 5 documents
    max_features=None,
    strip_accents='unicode',
    analyzer='char',
    ngram_range=(1, 1),  # unigram (single-character) model
    use_idf=1,
    smooth_idf=1,
    sublinear_tf=1)

# In[7]:

all_data = data_all
tfidf.fit(all_data)
all_data = tfidf.transform(all_data)
print(type(all_data))

# In[8]:

print(tfidf.vocabulary_)
print(train_data[0], '\n')
print(test_data[0])

# ## 1.3 Feature Extraction and Vectorization of text

# In[44]:

from sklearn.feature_extraction.text import TfidfVectorizer as TFIDF

tfidf = TFIDF(
    min_df=2,
    max_features=None,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 2),  # up to bigrams
    # ngram_range=(1, 3),  # up to trigrams
    use_idf=1,
    smooth_idf=1,
    sublinear_tf=1,
    stop_words='english')  # remove English stop words

# Combine training and test sets for TF-IDF vectorization
data_all = train_data + test_data
len_train = len(train_data)
tfidf.fit(data_all)
data_all = tfidf.transform(data_all)

# Split back into the training set and testing set sections
train_x = data_all[:len_train]
test_x = data_all[len_train:]
sample = [
    "Machine learning is fascinating, it is wonderful",
    "Machine learning is a sensational technology",
    "Elsa is a popular character"
]

from sklearn.feature_extraction.text import CountVectorizer

vec = CountVectorizer()
x = vec.fit_transform(sample)
print("features", vec.get_feature_names())
# print("x", x.shape, type(x), x)
# print("x", x.toarray())
# import pandas as pd
# vcresult = pd.DataFrame(x.toarray(), columns=vec.get_feature_names())

from sklearn.feature_extraction.text import TfidfVectorizer as TFIDF

vec = TFIDF()
x = vec.fit_transform(sample)
print("x", x)
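# To inspect the tf-idf weights the same way as the commented-out
# CountVectorizer DataFrame above (a sketch; note that newer sklearn renames
# get_feature_names to get_feature_names_out):
import pandas as pd

tfidf_df = pd.DataFrame(x.toarray(), columns=vec.get_feature_names())
print(tfidf_df.round(2))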