def test_tfidf_vectorizer():
    '''
    Stop words are words that are useless for classification: they usually have a high
    term frequency (TF) but a very low IDF, so they do not help separate the classes.
    To save space and computation we register them as stop words and tell the machine
    not to compute them.

    TfidfVectorizer
        stop_words     list of stop words
        token_pattern  filtering rule (a regular expression)
    After fit_transform:
        vocabulary_    the vocabulary (a dict)
        idf_           the learned IDF values
        stop_words_    terms that were effectively dropped
    :return:
    '''
    tfidf_vec = TfidfVectorizer()
    print(tfidf_vec)
    documents = [
        'this is the bayes document',
        'this is the second document',
        'and the third one',
        'is this the document'
    ]
    tfidf_matrix = tfidf_vec.fit_transform(documents)
    print(tfidf_vec.get_feature_names())
    print(tfidf_vec.get_stop_words())
    print(tfidf_vec.get_params())
    print(tfidf_vec.vocabulary_)
    print(tfidf_matrix.toarray())
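# get_stop_words() returns the *configured* stop-word list, while the fitted
# stop_words_ attribute holds terms pruned during fit by max_df / min_df /
# max_features -- the two are easy to confuse. A minimal sketch on a throwaway
# corpus showing the difference:
from sklearn.feature_extraction.text import TfidfVectorizer

docs = [
    'this is the bayes document',
    'this is the second document',
    'and the third one',
    'is this the document',
]

# 'english' switches on the built-in English stop-word list; max_df=0.5
# additionally prunes terms that occur in more than half of the documents.
vec = TfidfVectorizer(stop_words='english', max_df=0.5)
vec.fit(docs)

print(vec.get_stop_words())  # the configured list: the built-in English frozenset
print(vec.stop_words_)       # terms pruned while fitting (here, 'document' via max_df)
print(vec.vocabulary_)       # surviving terms mapped to column indices
print(vec.idf_)              # one IDF value per surviving term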
def calculate_wight(entities, text):
    weight = dict()
    # print(entities)
    vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 10))
    tfidf_mat = vectorizer.fit_transform([text])
    stop_word = vectorizer.get_stop_words()
    # entity_type = extract_ne_type(entities, text, stop_word)
    # print(entity_type)
    for word, w8 in zip(vectorizer.get_feature_names(),
                        tfidf_mat.toarray().tolist()[0]):
        weight[word] = w8
    spcl_c = [
        '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.',
        '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`',
        '{', '|', '}', '~'
    ]
    tpl_list = set()
    for word in entities:
        if word.lower() not in stop_word:
            ss = word.lower()
            for ch in spcl_c:
                ss = ss.replace(ch, ' ')
            ss = ' '.join([
                w.strip() for w in ss.split(' ')
                if len(w.strip()) > 1 and w.strip() not in stop_word
            ])
            try:
                tpl_list.add((word.lower(), weight[ss]))  # , entity_type[word]
            except KeyError:
                continue
    return tpl_list
def describeTfid(): tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000, min_df=0.2, stop_words='english', use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1, 2)) in_text = [ "Tommy is a boy", "Mary is Tommy sister", "Maggie is both their friend" ] text_transformed = tfidf_vectorizer.fit_transform( in_text) # This is actually # tfidf_vectorizer.fit(in_text) # text_transformed = tfidf_vectorizer.transform(in_text) feature_names = tfidf_vectorizer.get_feature_names() print("In Text = ", in_text) print("Text Transformed(", type(text_transformed), " of shape ", text_transformed.shape, ") =\n", text_transformed) print("Vocab = ", tfidf_vectorizer.vocabulary_) print("Feature names = ", feature_names) print("Stop words \n", tfidf_vectorizer.get_stop_words())
def document_tfid_parser(documents):
    # So we want to parse one single document
    # It does not work with all of the documents
    # vectorizer = TfidfVectorizer()
    # X = vectorizer.fit_transform(documents)
    # print(vectorizer.get_feature_names())
    # print(vectorizer.get_params())
    # print(vectorizer.get_stop_words())
    # print(X)
    # print(X.shape)
    sumOfDocuments = []
    for document in documents:
        sumOfDocuments.append(str(document[0]).replace('_', ' '))
        # if len(document[0]) > 4:
        #     vectoriser = TfidfVectorizer()
        #     X = vectoriser.fit_transform(document)
        #     print(vectoriser.get_feature_names())
        #     print(vectoriser.get_params())
        #     print(vectoriser.get_stop_words())
        #     print(X.shape)
        #     print(X)
        # else:
        #     pass
    if len(sumOfDocuments) > 4:
        vectoriser = TfidfVectorizer(max_df=0.7)
        X = vectoriser.fit_transform(sumOfDocuments)
        print(vectoriser.get_feature_names())
        print(vectoriser.get_params())
        print(vectoriser.get_stop_words())
        print(X.shape)
        print(X)
    else:
        pass
def vectorizaCorpus(corpus, minDf):
    ''' Vectorize the given corpus, filtering out words that appear in fewer than minDf documents '''
    try:
        vectorizer = TfidfVectorizer(min_df=minDf, lowercase=True, stop_words='english')
        # Define our own extra list of stop words
        myStopwords = ['did', 'didn', 'does', 'doesn', 'don', 'just', 'isn',
                       'reddit', 'wasn', 'www', 'yeah', 'yes', 'like', 'able', 'thanks',
                       'know', 'think', 've', 'want', 'com', 'https', 'http',
                       'good', 'really', 'make', 'say', 'going', 'said', 'people', 'way',
                       'use']
        # Add the stop words we want to the already existing set
        vectorizer.stop_words = vectorizer.get_stop_words().union(myStopwords)
        # Compute the document-term matrix
        docTerms = vectorizer.fit_transform(corpus)
        # Invert the vocabulary, creating an index-to-term dictionary
        invVoc = {v: k for k, v in vectorizer.vocabulary_.items()}
        # The central terms are those whose accumulated tf-idf sum over all documents is largest
        sumaTfidf = docTerms.sum(axis=0).tolist()[0]  # column-wise sum of the document-term matrix
        return vectorizer, invVoc, sumaTfidf
    except Exception as e:
        print('\nA problem occurred: {0}'.format(e))
        sys.exit()
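# vectorizaCorpus() returns the inverted vocabulary and the per-column tf-idf
# sums but leaves the ranking of the "central" terms to the caller. A minimal
# sketch of that last step, assuming `corpus` is a list of strings:
import numpy as np

vectorizer, invVoc, sumaTfidf = vectorizaCorpus(corpus, minDf=2)

# Rank columns by accumulated tf-idf mass and map indices back to terms.
top_idx = np.argsort(sumaTfidf)[::-1][:20]
for i in top_idx:
    print('%-20s %.3f' % (invVoc[i], sumaTfidf[i]))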
def tf_idf_vectorizer_big(list_of_strings, choose_to_log_data=True, log_vectorised_words=False, logger=None): """ function should return tf-idf logistic regression score :param : list :type : string :return: sparse matrix :rtype: value """ search_and_replace_numerals_with_space = lambda x: re.sub( r'(\d[\d\.])+', '', x.lower()) vect_char = TfidfVectorizer( preprocessor=search_and_replace_numerals_with_space, stop_words='english', analyzer='char', ngram_range=(2, 6), min_df=20) vect_word = TfidfVectorizer( preprocessor=search_and_replace_numerals_with_space, stop_words='english', min_df=20) sparse_matrix_word = vect_word.fit_transform(list_of_strings) sparse_matrix_char = vect_char.fit_transform(list_of_strings) sparse_matrix_combined = sparse.hstack( [sparse_matrix_word, sparse_matrix_char]) if choose_to_log_data: logger.info("\nbig vector shape\n %s", sparse_matrix_combined.shape) if log_vectorised_words: logger.info("\nFeatures of vectorizer_character\n %s", vect_char.get_feature_names()) logger.info("\nRemoved Features of vectorizer_character \n %s", vect_char.get_stop_words()) logger.info("\nHyperparameters of vectorizer_character\n %s", vect_char.fit(list_of_strings)) logger.info("\nFeatures of vectorizer_word\n %s", vect_word.get_feature_names()) logger.info("\nRemoved Features of vectorizer_word \n %s", vect_word.get_stop_words()) logger.info("\nHyperparameters of vectorizer_word\n %s", vect_word.fit(list_of_strings)) return sparse_matrix_combined
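# The docstring above promises a logistic-regression score, but the function
# stops at the combined sparse matrix. A minimal sketch of the missing step;
# `texts` and the matching labels `y` are placeholders assumed to exist:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

X = tf_idf_vectorizer_big(texts, choose_to_log_data=False).tocsr()  # hstack returns COO; CSR allows row indexing
clf = LogisticRegression(max_iter=1000)
print(cross_val_score(clf, X, y, cv=5, scoring='accuracy').mean())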
def make_tfidf_matrix(doc):
    tfidf = TfidfVectorizer(max_features=1000, stop_words='english')
    stops = list(tfidf.get_stop_words()) + [
        'education', 'students', 'school', 'learning', 'learn',
        'experience', 'teach', 'working'
    ]
    tfidf.set_params(stop_words=stops)
    tfidf_matrix = tfidf.fit_transform(doc)
    return tfidf_matrix
def create_stopword_list(extra_words):
    """ Creates stopword list (adds extra words to original English set) """
    from sklearn.feature_extraction.text import TfidfVectorizer
    original = list(TfidfVectorizer(stop_words='english').get_stop_words())
    if extra_words:
        return frozenset(original + extra_words)
    else:
        return frozenset(original)
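# Spinning up a throwaway vectorizer just to read the built-in list works, but
# scikit-learn also exposes the same frozenset directly. A sketch of an
# equivalent helper, assuming only the built-in English list is needed:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

def create_stopword_list(extra_words=None):
    """ Return the built-in English stop words, optionally extended """
    if extra_words:
        return frozenset(ENGLISH_STOP_WORDS.union(extra_words))
    return ENGLISH_STOP_WORDS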
def get_stopword_set():
    vectorizer_for_stop = TfidfVectorizer(stop_words='english')
    stop_words = set()
    with open('stopwords_custom.in', 'r', encoding='UTF-8') as f:
        stop_words = stop_words | set(f.read().split())
    with open('SmartStoplist.txt', 'r', encoding='UTF-8') as f:
        stop_words = stop_words | set(f.read().split())
    stop_words = stop_words | set(
        stopwords.words('english')) | vectorizer_for_stop.get_stop_words()
    return stop_words
def pre_processing(self): # pre-processing function AST = None src = open(self.name, 'r') # loop to parse each source code for x in range(1): src = src.read() attributes = [] variables = [] # Source parsing try: AST = javalang.parse.parse(src) # This will return AST for path, node in AST: # Index, Element if 'ReferenceType' != node: AST.remove(node) print(node, "\n") # print(path,"\n") except: pass vectorizer = TfidfVectorizer( stop_words='english') # Create the vectorize/transform vectorizer.fit( [str(AST)] ) # Learns vocab " CompilationUnit, Imports, path, static, true, util, io " print( '---------------------------check 2----------------------------------' ) print(vectorizer.vocabulary_) print("STOPPPPING WORDS", vectorizer.get_stop_words()) vector = vectorizer.transform([str(AST) ]) # transform document to matrix print(vector) print( '---------------------check 3-------------------------------------------------------------' ) a = np.array(vector.toarray()) print(a) print( '---------------------check 4-------------------------------------------------------------' ) df = DataFrame(a) print(df) # print("Features") # print(vectorizer.get_feature_names()) df.to_csv('featuresExtraction.csv', mode='a', header=False, index=False)
class tdidf(object):
    """
    Step 1: use tf-idf to extract the features and labels
    Step 2: classify with naive Bayes
    """
    def __init__(self, datas):
        self.movie_reviews = load_files(datas)  # folder containing the data to analyse
        # doc_terms_train, doc_terms_test, y_train, y_test = train_test_split(movie_reviews.data, movie_reviews.target, test_size=0.3)
        # Vector space model with BOOL-type features; note that test samples must use the transform interface
        self.count_vec = TfidfVectorizer(
            binary=False, decode_error='ignore', stop_words='english'
        )  # CountVectorizer supports term-frequency or BOOL weights (toggled via the binary parameter)
        self.count_vec.get_stop_words()  # use count_vec.get_stop_words() to inspect the stop words built into TfidfVectorizer
        self.a = self.count_vec.fit_transform(self.movie_reviews.data)
        self.count_vec.get_feature_names()  # x follows the tokenization produced by count_vec
        self.x = self.a.toarray()
        self.y = self.movie_reviews.target
        print(self.x)  # tf-idf matrix

    def train(self):
        # Split the data set: 80% for training, 20% for testing
        z = 10
        the_all = 0
        lists = []
        for i in range(0, z):
            x_train, x_test, y_train, y_test = train_test_split(self.x,
                                                                self.y,
                                                                test_size=0.2,
                                                                random_state=i)
            # Call the MultinomialNB classifier
            clf = MultinomialNB().fit(x_train, y_train)  # clf: import the module, then fit it to the training samples
            doc_class_predicted = clf.predict(x_test)
            yy = round(float(np.mean(doc_class_predicted == y_test)), 2)
            lists.append(yy)
        return lists
def main():
    """
    Data pre-processing
    """
    le = LabelEncoder()
    x = [1, 2, 2, 6, 8, 12, 45, 23]
    le.fit(x)
    tsf = le.transform(x)
    print(tsf)
    # Convert non-numeric values to numeric ones
    y = ["名称", "产地", "编号", "属性", "功能", "规格"]
    le.fit(y)
    str_tsf = le.transform(y)
    print(np.unique(y))
    print(str_tsf)
    # Zero-mean standardization: subtract the mean from each feature value and divide by the standard deviation
    print(scale(x))
    # Feature standardization
    sts = StandardScaler()
    # scaler = sts.fit(x)
    # print(scaler.transform(x))
    # Normalization maps values with different ranges onto the same fixed range, usually [0, 1]
    x_t = [[1, -1, 2], [2, 0, 0], [0, 1, -1]]
    print(preprocessing.normalize(x_t, norm='l2'))
    # Text feature extraction and vectorization
    # sklearn.datasets can read pre-classified text straight from a directory.
    # The directory must follow the rule of one folder per label name; the data set
    # used here has two labels, "net" and "pos".
    movie_reviews = load_files('./data/endata')
    print(movie_reviews.data)
    print(movie_reviews.target)
    doc_train, doc_test, y_train, y_test = train_test_split(
        movie_reviews.data, movie_reviews.target, test_size=0.3)
    print(doc_train)
    # Term-frequency statistics
    # Note: stop_words must be the string 'english' to activate the built-in list;
    # passing ["english"] would treat the literal word "english" as the only stop word.
    count_vec = TfidfVectorizer(binary=False, stop_words='english')
    x_train = count_vec.fit_transform(doc_train)
    print(count_vec.get_feature_names())
    print(x_train.toarray())
    print(movie_reviews.target)
    # Inspect the stop words
    print(count_vec.get_stop_words())
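# stop_words='english' and stop_words=['english'] are both accepted silently
# but behave very differently, which makes the mix-up easy to miss. A quick
# sketch on a throwaway two-document corpus showing what get_stop_words()
# reports in each case:
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["english text about movies", "more english text"]
builtin = TfidfVectorizer(stop_words='english').fit(docs)
custom = TfidfVectorizer(stop_words=["english"]).fit(docs)

print(len(builtin.get_stop_words()))  # size of the built-in English frozenset
print(custom.get_stop_words())        # frozenset({'english'}): only that literal token is removed
print(sorted(custom.vocabulary_))     # 'english' is gone, but 'about' and 'more' survive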
def main(fi):
    questions = [" ".join(json.loads(line)['question']) for line in fi]
    model = TfidfVectorizer(stop_words="english")
    model.fit(questions)
    word2idf = {word: model.idf_[idx] for word, idx in model.vocabulary_.items()}
    sys.stderr.write("Saving IDF...")
    with open("../../work/idf.json", 'w') as fo:
        fo.write(json.dumps(word2idf))
    sys.stderr.write("Done.\n")
    sys.stderr.write("Saving StopWords...")
    with open("../../work/stopwords.txt", 'w') as fo:
        fo.write(json.dumps(list(model.get_stop_words())))
    sys.stderr.write("Done.\n")
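# The snippet above only writes idf.json and stopwords.txt; how they are
# consumed is not shown. A hypothetical sketch of the reading side (the
# keyword_weights helper is made up for illustration), using the same paths:
import json

with open("../../work/idf.json") as f:
    word2idf = json.load(f)
with open("../../work/stopwords.txt") as f:
    stop = set(json.load(f))

def keyword_weights(tokens, default_idf=0.0):
    """ Return (token, idf) pairs for non-stop-word tokens, highest IDF first """
    pairs = [(t, word2idf.get(t, default_idf)) for t in tokens if t not in stop]
    return sorted(pairs, key=lambda p: p[1], reverse=True)

print(keyword_weights(["what", "is", "the", "capital", "of", "france"]))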
def make_stop_words():
    '''
    Take in list of user-created stop words and join with Tfidf 'english' stop words.

    INPUT:
        - None
    OUTPUT:
        - New master list of stop words including user and model inputs
    '''
    new_stop_words = ['ha', "\'s", 'tt', 'ireach', "n\'t", 'wo', 'pv', 'tm',
                      'anite', 'rabichev', 'russell', '603', 'hana', 'atmel',
                      'radwin', 'se', 'doxee', 'lantto', 'publ', 'fpc1025',
                      '855', 'il', '0344']
    # create temporary TfidfVectorizer object
    tfidf_temp = TfidfVectorizer(stop_words='english')
    # get Tfidf 'english' stop words from model
    stop_words = tfidf_temp.get_stop_words()
    # combine two lists of stop words
    result = list(stop_words) + new_stop_words
    return result
def test11(self):
    corpus = [
        'This is the first document.',
        'This document is the second document.',
        'And this is the third one.',
        'Is this the first document?'
    ]
    vectorizer = TfidfVectorizer()
    # X = vectorizer.fit_transform(corpus)
    model = vectorizer.fit(corpus)
    X = model.transform(corpus)
    print(vectorizer.get_feature_names())
    print(vectorizer.get_stop_words())
    print(vectorizer.inverse_transform(X))
    print(X.shape)
    print(X)
    print(model.vocabulary_)
    print(model.idf_)
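# The commented-out line above hints that fit() followed by transform() on the
# same corpus is interchangeable with fit_transform(). A small sketch checking
# that numerically:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?'
]
a = TfidfVectorizer().fit_transform(corpus)
b = TfidfVectorizer().fit(corpus).transform(corpus)
print(np.allclose(a.toarray(), b.toarray()))  # True: same vocabulary, IDFs and weights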
def get_stop_words(self):
    tfidf = TfidfVectorizer(stop_words='english')
    stop_words = list(tfidf.get_stop_words())
    stop_words += ['qb_y', 'qb_n', 'lngnod']
    stop_words_stemmed = [stemmer.stem(x) for x in stop_words]
    good_words = [
        'no', 'none', 'not', 'nothing', 'back', 'between', 'found', 'front',
        'find', 'without', 'above', 'almost', 'under', 'among', 'together',
        'serious', 'less', 'each', 'bottom', 'full', 'empty'
    ]
    good_words_stemmed = [stemmer.stem(x) for x in good_words]
    for word in good_words_stemmed:
        stop_words_stemmed.remove(word)
    return stop_words_stemmed
def read_data(dataset): # features if dataset in ["nyt", "dblp", "cora2"]: content = pd.read_csv("data/" + dataset + "/features.txt", sep="\t", header=None, quoting=3) vectorizer = TfidfVectorizer(lowercase=True, analyzer="word", stop_words="english", max_df=0.25, min_df=4, norm='l2', use_idf=True) features = vectorizer.fit_transform(content[1].values) vectorizerTF = TfidfVectorizer(lowercase=True, analyzer="word", stop_words="english", max_df=0.25, min_df=4, norm=None, use_idf=False) tf = vectorizerTF.fit_transform(content[1].values) tokenizer = RegexpTokenizer(r'\w+') raw = [tokenizer.tokenize(i.lower()) for i in content[1].values] en_stop = vectorizer.get_stop_words() for i in range(len(raw)): raw[i] = [word for word in raw[i] if not word in en_stop] voc = vectorizer.get_feature_names() else: print("Unknown dataset: %s" % dataset) return None # graph graph = nx.read_adjlist("data/" + dataset + "/graph.txt", nodetype=int) A = nx.to_scipy_sparse_matrix(graph, nodelist=range(features.shape[0]), format="csr") # labels groups = np.loadtxt("data/" + dataset + "/group.txt", delimiter="\t", dtype=int) groups = groups[:, 1] return features, groups, A, graph, voc, raw, tf
def math_stop():
    '''Add math specific words to the standard stop list'''
    tfidf = TfidfVectorizer(stop_words='english')
    Stop = set()
    Stop.update([word for word in tfidf.get_stop_words()])
    Stop.update(['theorem', 'denote', 'like', 'thank', 'lemma', 'proof', 'sum',
                 'difference', 'corollary', 'hand', 'product', 'multiple', 'let',
                 'group', 'prime', 'log', 'limit', 'cid', 'result', 'main',
                 'conjecture', 'case', 'suppose', 'function', 'assume', 'follows',
                 'given', 'define', 'note', 'defined', 'class', 'proposition',
                 'set', 'primes', 'numbers', 'form', 'integers', 'curves', 'real',
                 'using', 'following', 'obtain', 'prove', 'definition', 'large',
                 'small', 'action', 'bound', 'sufficiently', 'subject', 'non',
                 'mathematics'])
    return list(Stop)
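# A list like the one returned by math_stop() is passed straight to the
# stop_words parameter. A minimal usage sketch; the two abstracts are made up:
from sklearn.feature_extraction.text import TfidfVectorizer

abstracts = [
    "We prove a theorem on the distribution of primes in short intervals.",
    "A corollary gives a bound on the number of integers represented by the form.",
]
vec = TfidfVectorizer(stop_words=math_stop())
X = vec.fit_transform(abstracts)
print(sorted(vec.get_feature_names()))  # boilerplate such as 'prove', 'theorem', 'corollary', 'bound' is filtered out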
def get_data():
    print('Start loading data.')
    t_start = time()
    remove = ()
    categories = 'alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space'
    data_train = fetch_20newsgroups(subset='train', categories=categories,
                                    random_state=0, shuffle=True, remove=remove)
    data_test = fetch_20newsgroups(subset='test', categories=categories,
                                   random_state=0, shuffle=True, remove=remove)
    t_end = time()
    print('Loading time:', t_end - t_start)
    print('Data type:', type(data_train))
    print('Number of documents in the training set:', len(data_train.data))
    print('Number of documents in the test set:', len(data_test.data))
    print('Category names:\n', categories)
    y_train = data_train.target
    y_test = data_test.target
    categories = data_train.target_names
    print('-------- first ten samples --------')
    for i in range(10):
        print('Document {} belongs to category {}'.format(i + 1, categories[y_train[i]]))
        print(data_train.data[i])
    print('\n\n\n')
    vectorizer = TfidfVectorizer(input='content', stop_words='english',
                                 max_df=0.5, sublinear_tf=True)
    x_train = vectorizer.fit_transform(data_train.data)
    x_test = vectorizer.transform(data_test.data)
    print(u'Training samples: %d, features: %d' % x_train.shape)
    print(u'Stop words:')
    pprint(vectorizer.get_stop_words())
    feature_names = np.asarray(vectorizer.get_feature_names())
    return x_train, y_train, x_test, y_test
def tf_idf_vectorizer_small(list_of_strings, choose_to_log_data=True, log_vectorised_words=False, logger=None): """ function should return tf-idf logistic regression score :param : list :type : string :return: sparse matrix :rtype: value """ vect_word = TfidfVectorizer(stop_words='english', min_df=20) sparse_matrix_word = vect_word.fit_transform(list_of_strings) if choose_to_log_data: logger.info("\nsmall vector shape %s", sparse_matrix_word.shape) if log_vectorised_words: logger.info("\nFeatures of vectorizer_word %s", vect_word.get_feature_names()) logger.info("\nRemoved Features of vectorizer_word %s", vect_word.get_stop_words()) logger.info("\nHyperparameters of vectorizer_word %s", vect_word.fit(list_of_strings)) return sparse_matrix_word
### append a 0 to from_data if email is from Sara, and 1 if email is from Chris if name=='sara': from_data.append(0) else: from_data.append(1) email.close() print "emails processed" from_sara.close() from_chris.close() pickle.dump( word_data, open("your_word_data.pkl", "w") ) pickle.dump( from_data, open("your_email_authors.pkl", "w") ) # print word_data[152] ### in Part 4, do TfIdf vectorization here from sklearn.feature_extraction.text import TfidfVectorizer tfidfv = TfidfVectorizer(stop_words="english", lowercase=True) v_word_data = tfidfv.fit_transform(word_data) print len(tfidfv.get_feature_names()) print tfidfv.get_feature_names() print tfidfv.get_feature_names()[34597] print tfidfv.get_stop_words()
def make_stop_words(new_words_list):
    tfidf_temp = TfidfVectorizer(stop_words='english')
    stop_words = tfidf_temp.get_stop_words()
    result = list(stop_words) + new_words_list
    return result
all_classes = np.array(['positive', 'negative']) for l in range(len(pos_trainer)): pos_train = pos_train + ['positive'] for l in range(len(neg_trainer)): neg_train = neg_train + ['negative'] y_train = pos_train + neg_train X_train = tfidf.fit_transform(pos_trainer + neg_trainer) #n_tf = tfidf.fit_transform(neg_trainer) clf = MultinomialNB().fit(X_train, y_train) #clf = MultinomialNB().partial_fit(n_tf, neg_train) #clf.partial_fit(n_tf, neg_train) fn = tfidf.get_feature_names() stop = tfidf.get_stop_words() if statement == 'red': comment_list = redditor.redditor() for comment in range(len(comment_list)): statement = comment_list[comment] X_test = tfidf.transform(np.array([statement])) predicted = clf.predict(X_test) prob = clf.predict_proba(X_test) print "Tweet: " + str(l) print statement print predicted, prob X_test = tfidf.transform(np.array([statement])) predicted = clf.predict(X_test)
logging.info("#" * 10) logging.info("Training TfidfVectorizer") logging.info("#" * 10) start = time.time() tfidf = TfidfVectorizer(strip_accents="unicode", stop_words=STOP_WORDS, decode_error='replace', tokenizer=LemmaTokenizer()) queries_index_transformed = tfidf.fit_transform(queries_index) logging.info("%s fit in %s s" % (len(queries_index), time.time() - start)) start = time.time() # logging.info("Selects feature pairs..") # queries_items_fields_vecs = [queries_index_transformed[feats_ids] for feats_ids in queries_items_fields] # print(tfidf.__dict__) tfidf_stopwords = tfidf.get_stop_words() logging.info("Done in %s s" % (time.time() - start)) start = time.time() logging.info("Transforming %s paragraphs.." % len(paragraphs)) para_features = tfidf.transform(paragraphs) logging.info("%s fit_transform in %s s" % (len(paragraphs), time.time() - start)) start = time.time() res_dists = [] proc_list = [] add_label = [] max_memory = 10 * 1024 * 1024 * 1024 # 10GB
print(newsgroups_subset.data[text_number])

# A vectorizer function is implemented in the Scikit-Learn package. You can directly filter out stopwords.
from sklearn.feature_extraction.text import TfidfVectorizer

vect = TfidfVectorizer(stop_words='english')
X = vect.fit_transform(newsgroups_subset.data).toarray()
y = newsgroups_subset.target
print(type(X))
print(X.shape)

# The list of stopwords can be displayed by the following attribute:
print(vect.get_stop_words())

# In a next step the dataset is split into 90% training data and 10% test data.
n_samples = X.shape[0]
X_train = X[:int(.9 * n_samples)]
y_train = y[:int(.9 * n_samples)]
X_test = X[int(.9 * n_samples):]
y_test = y[int(.9 * n_samples):]
print(X_train.shape)

# Feature Selection
# Features are now selected according to their Chi2 value.
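# The excerpt stops right before the chi-square step it announces. A minimal
# sketch of that selection, reusing the X_train / y_train split above; k=1000
# is an arbitrary choice:
from sklearn.feature_selection import SelectKBest, chi2

selector = SelectKBest(chi2, k=1000)
X_train_sel = selector.fit_transform(X_train, y_train)
X_test_sel = selector.transform(X_test)

# Map the selected column indices back to the corresponding terms
selected_terms = [vect.get_feature_names()[i] for i in selector.get_support(indices=True)]
print(X_train_sel.shape, selected_terms[:10])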
from_sara.close() from_chris.close() pickle.dump(word_data, open("your_word_data.pkl", "w")) pickle.dump(from_data, open("your_email_authors.pkl", "w")) ### in Part 4, do TfIdf vectorization here from sklearn.feature_extraction.text import TfidfVectorizer transformer = TfidfVectorizer(stop_words="english") theFit = transformer.fit_transform(word_data) print('theFit', theFit) afterTransform = theFit.toarray() print(afterTransform) stopWords = transformer.get_stop_words() # print('stopWords', len(stopWords)) featureNames = transformer.get_feature_names() # print(featureNames[34597]) print('featureNames', len(featureNames)) # shove matrix gained from TfidfVectorizer into # k-means clustering # just wanted to experiment from sklearn.cluster import MiniBatchKMeans clf = MiniBatchKMeans(n_clusters=4, n_init=10, max_iter=300) clf.fit(afterTransform) pred = clf.predict(afterTransform) print('PRED', pred)
all_docs.append(read_merge_data(ff)) print all_docs print len(all_docs) tfidf_vec = TfidfVectorizer(analyzer='word', ngram_range=(1,3), min_df=1, stop_words='english') try: results = tfidf_vec.fit_transform(all_docs) print results.get_shape() result_as_array = results.toarray() feature_names = tfidf_vec.get_feature_names() total_highest_scores = get_highest_scoring_feature(result_as_array, feature_names, -10) print total_highest_scores print tfidf_vec.get_stop_words() except ValueError: pass # documents = read_data(directory) # first_five = documents[0:100] # tfidf_vec = TfidfVectorizer(analyzer='word', ngram_range=(1,3), min_df=1, stop_words='english') # try: # results = tfidf_vec.fit_transform(first_five) # print results.get_shape() # result_as_array = results.toarray() # feature_names = tfidf_vec.get_feature_names() # total_highest_scores = get_highest_scoring_feature(result_as_array, feature_names, -10) # print total_highest_scores
print(u'Number of documents in the test set:', len(data_test.data))
print(u'Names of the %d categories used by the training and test sets:' % len(categories))
categories = data_train.target_names
pprint(categories)

y_train = data_train.target
y_test = data_test.target
print(u' -- first ten documents -- ')
for i in np.arange(10):
    print(u'Document %d (category - %s):' % (i + 1, categories[y_train[i]]))
    print(data_train.data[i])
    print('\n\n')
vectorizer = TfidfVectorizer(input='content', stop_words='english', max_df=0.5, sublinear_tf=True)
x_train = vectorizer.fit_transform(data_train.data)  # x_train is sparse: scipy.sparse.csr.csr_matrix
x_test = vectorizer.transform(data_test.data)
print(u'Training samples: %d, features: %d' % x_train.shape)
print(u'Stop words:')
pprint(vectorizer.get_stop_words())
feature_names = np.asarray(vectorizer.get_feature_names())

print(u'\n\n===================\nComparison of classifiers:\n')
clfs = (MultinomialNB(),   # 0.87(0.017), 0.002, 90.39%
        BernoulliNB(),     # 1.592(0.032), 0.010, 88.54%
        )
result = []
for clf in clfs:
    a = test_clf(clf)
    result.append(a)
    print('\n')
result = np.array(result)
time_train, time_test, err, names = result.T
time_train = time_train.astype(float)
time_test = time_test.astype(float)
# str.replace returns a new string, so keep the result
new_str = new_str.replace('germani', '')

### append the text to word_data
word_data.append(new_str)

### append a 0 to from_data if email is from Sara, and 1 if email is from Chris
if from_person == 'Sara':
    from_data.append(0)
elif from_person == 'Chris':
    from_data.append(1)

email.close()

print "emails processed"
from_sara.close()
from_chris.close()

pickle.dump(word_data, open("your_word_data.pkl", "w"))
pickle.dump(from_data, open("your_email_authors.pkl", "w"))

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english')
data = vectorizer.fit_transform(word_data)
vectorizer = vectorizer.fit(word_data)
stop_words = vectorizer.get_stop_words()
print len(vectorizer.get_feature_names())
#print word_data[152]
# 385 with 200 samples
# 38825 with all samples

### in Part 4, do TfIdf vectorization here
class TfIdf: def __init__(self, corpus=None): self.tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, max_df=0.9, stop_words='english') self.tfidf_matrix = None self.corpus = corpus if corpus != None: self.calcTFidf(corpus) #Todo: get highest ranked words of each document, do tfidf on srt #Todo: tfidf on the query itself(to limit search on words) #idf gives the inverse of number of times word appears in other documents, get words with high idf #words with just high tf #phrase queries with tfidf to match highest score.... def testQuery(self, query): response = self.calcTFidfQuery(query) topChoices = self.calcCosineSim(response) self.printTopChoices(topChoices) def calcCosineSim(self, queryTfidfVec, top_n=10): if self.tfidf_matrix is None: print 'Error! tfidfMatrix has not yet been initialized.' else: cosine_similarities = linear_kernel(queryTfidfVec, self.tfidf_matrix).flatten() related_docs_indices = [ i for i in cosine_similarities.argsort()[::-1] ] return [(index, cosine_similarities[index]) for index in related_docs_indices][0:top_n] def calcTFidfQuery(self, query): response = self.tfidf.transform([query]) feature_names = self.tfidf.get_feature_names() print "Query" #print response #response- sk learn matrix: (doc_num, feature_name): tfidf score #print feature_names for col in response.nonzero()[1]: print feature_names[col], ', ', response[0, col] return response def calcTFidf(self, corpus): self.tfidf_matrix = self.tfidf.fit_transform(corpus) feature_names = self.tfidf.get_feature_names() dense = self.tfidf_matrix.todense() stopWordsSet = self.tfidf.get_stop_words() #the stopWordsSet #TODO: figure out this part, what is this doiong? currDoc = dense[1].tolist()[0] #filter out to get doc (i+1) phrase_scores = [ pair for pair in zip(range(0, len(currDoc)), currDoc) if pair[1] > 0 ] #pair of featurename to feature score sorted_phrase_scores = sorted(phrase_scores, key=lambda t: t[1] * -1) print phrase_scores print "Sorted scores" print sorted_phrase_scores for scoreTuple in sorted_phrase_scores: phraseToScore = str(feature_names[scoreTuple[0]]) + ', ' + str( scoreTuple[1]) print phraseToScore #for phrase, score in [(feature_names[word_id], score) for (word_id, score) in sorted_phrase_scores][:20]: # print('{0: <20} {1}'.format(phrase, score)) def printTopChoices(self, topChoices): print "TFIDF:" print topChoices for choice in topChoices: print self.corpus[choice[0]]
count -= 1
if count == 0:
    pass

print "Finished reading."

def tokenize(text):
    return text.split(' ')

vectorizer = TfidfVectorizer(tokenizer=tokenize, min_df=5, max_df=0.03, stop_words='english')
m = vectorizer.fit_transform(corpus)
print "Finished tf-idf.", m.shape[0], m.shape[1]
print vectorizer.get_stop_words()
phrases = vectorizer.get_feature_names()

with open(output_file, 'w') as output:
    for i in xrange(m.shape[0]):
        d = m.getrow(i)
        s = zip(d.indices, d.data)
        sorted_s = sorted(s, key=lambda v: v[1], reverse=True)
        indices = [element[0] for element in sorted_s]
        for i in range(min(top_K, len(indices))):
            output.write(phrases[indices[i]])
            output.write(' ')
        output.write('\n')
y_test = data_test.target  # target classes encoded as [0, 1, 2, 3]

print("-- first ten documents --")
for i in np.arange(10):
    print("Document %d (category - %s):" % (i + 1, categories[y_train[i]]))
    print(data_train.data[i])
print("\n\n\n")

vectorizer = TfidfVectorizer(input='content', stop_words='english',
                             max_df=0.5, sublinear_tf=True)  # set the model parameters
x_train = vectorizer.fit_transform(data_train.data)
x_test = vectorizer.transform(data_test.data)
print("Training samples: %d, features: %d" % x_train.shape)
print("Stop words:\n")
pprint(vectorizer.get_stop_words())
feature_names = np.asarray(vectorizer.get_feature_names())

print("\n\n================\nComparison of classifiers:\n")
clfs = (
    MultinomialNB(),
    BernoulliNB()
    # KNeighborsClassifier()
    # RidgeClassifier()
    # RandomForestClassifier(n_estimators=200)
    # SVC()
)
result = []
for clf in clfs:
    a = test_clf(clf)
    result.append(a)
    print('\n')
class TfidfRecommender: """Term Frequency - Inverse Document Frequency (TF-IDF) Recommender This class provides content-based recommendations using TF-IDF vectorization in combination with cosine similarity. """ def __init__(self, id_col, tokenization_method="scibert"): """Initialize model parameters Args: id_col (str): Name of column containing item IDs. tokenization_method (str): ['none','nltk','bert','scibert'] option for tokenization method. """ self.id_col = id_col if tokenization_method.lower() not in ["none", "nltk", "bert", "scibert"]: raise ValueError( 'Tokenization method must be one of ["none" | "nltk" | "bert" | "scibert"]' ) self.tokenization_method = tokenization_method.lower() # Initialize other variables used in this class self.tf = TfidfVectorizer() self.tfidf_matrix = dict() self.tokens = dict() self.stop_words = frozenset() self.recommendations = dict() self.top_k_recommendations = pd.DataFrame() def __clean_text(self, text, for_BERT=False, verbose=False): """ Clean text by removing HTML tags, symbols, and punctuation. Args: text (str): Text to clean. for_BERT (boolean): True or False for if this text is being cleaned for a BERT word tokenization method. verbose (boolean): True or False for whether to print. Returns: clean (str): Cleaned version of text. """ try: # Normalize unicode text_norm = unicodedata.normalize("NFC", text) # Remove HTML tags clean = re.sub("<.*?>", "", text_norm) # Remove new line and tabs clean = clean.replace("\n", " ") clean = clean.replace("\t", " ") clean = clean.replace("\r", " ") clean = clean.replace("Â\xa0", "") # non-breaking space # Remove all punctuation and special characters clean = re.sub("([^\s\w]|_)+", "", clean) # If you want to keep some punctuation, see below commented out example # clean = re.sub('([^\s\w\-\_\(\)]|_)+','', clean) # Skip further processing if the text will be used in BERT tokenization if for_BERT is False: # Lower case clean = clean.lower() except: if verbose is True: print("Cannot clean non-existent text") clean = "" return clean def clean_dataframe(self, df, cols_to_clean, new_col_name="cleaned_text"): """ Clean the text within the columns of interest and return a dataframe with cleaned and combined text. Args: df (pd.DataFrame): Dataframe containing the text content to clean. cols_to_clean (list of str): List of columns to clean by name (e.g., ['abstract','full_text']). new_col_name (str): Name of the new column that will contain the cleaned text. Returns: df (pd.DataFrame): Dataframe with cleaned text in the new column. """ # Collapse the table such that all descriptive text is just in a single column df = df.replace(np.nan, "", regex=True) df[new_col_name] = df[cols_to_clean].apply(lambda cols: " ".join(cols), axis=1) # Check if for BERT tokenization if self.tokenization_method in ["bert", "scibert"]: for_BERT = True else: for_BERT = False # Clean the text in the dataframe df[new_col_name] = df[new_col_name].map( lambda x: self.__clean_text(x, for_BERT) ) return df def tokenize_text( self, df_clean, text_col="cleaned_text", ngram_range=(1, 3), min_df=0 ): """ Tokenize the input text. For more details on the TfidfVectorizer, see https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html Args: df_clean (pd.DataFrame): Dataframe with cleaned text in the new column. text_col (str): Name of column containing the cleaned text. ngram_range (tuple of int): The lower and upper boundary of the range of n-values for different n-grams to be extracted. 
min_df (int): When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. Returns: tf (TfidfVectorizer): Scikit-learn TfidfVectorizer object defined in .tokenize_text(). vectors_tokenized (pd.Series): Each row contains tokens for respective documents separated by spaces. """ vectors = df_clean[text_col] # If a HuggingFace BERT word tokenization method if self.tokenization_method in ["bert", "scibert"]: # Set vectorizer tf = TfidfVectorizer( analyzer="word", ngram_range=ngram_range, min_df=min_df, stop_words="english", ) # Get appropriate transformer name if self.tokenization_method == "bert": bert_method = "bert-base-cased" elif self.tokenization_method == "scibert": bert_method = "allenai/scibert_scivocab_cased" # Load pre-trained model tokenizer (vocabulary) tokenizer = BertTokenizer.from_pretrained(bert_method) # Loop through each item vectors_tokenized = vectors.copy() for i in range(0, len(vectors)): vectors_tokenized[i] = " ".join(tokenizer.tokenize(vectors[i])) elif self.tokenization_method == "nltk": # NLTK Stemming token_dict = {} stemmer = PorterStemmer() def stem_tokens(tokens, stemmer): stemmed = [] for item in tokens: stemmed.append(stemmer.stem(item)) return stemmed def tokenize(text): tokens = nltk.word_tokenize(text) stems = stem_tokens(tokens, stemmer) return stems # When defining a custome tokenizer with TfidfVectorizer, the tokenization is applied in the fit function tf = TfidfVectorizer( tokenizer=tokenize, analyzer="word", ngram_range=ngram_range, min_df=min_df, stop_words="english", ) vectors_tokenized = vectors elif self.tokenization_method == "none": # No tokenization applied tf = TfidfVectorizer( analyzer="word", ngram_range=ngram_range, min_df=min_df, stop_words="english", ) vectors_tokenized = vectors # Save to class variable self.tf = tf return tf, vectors_tokenized def fit(self, tf, vectors_tokenized): """ Fit TF-IDF vectorizer to the cleaned and tokenized text. Args: tf (TfidfVectorizer): Scikit-learn TfidfVectorizer object defined in .tokenize_text(). vectors_tokenized (pd.Series): Each row contains tokens for respective documents separated by spaces. """ self.tfidf_matrix = tf.fit_transform(vectors_tokenized) def get_tokens(self): """ Return the tokens generated by the TF-IDF vectorizer. Returns: self.tokens (dict): Dictionary of tokens generated by the TF-IDF vectorizer. """ try: self.tokens = self.tf.vocabulary_ except: self.tokens = "Run .tokenize_text() and .fit_tfidf() first" return self.tokens def get_stop_words(self): """ Return the stop words excluded in the TF-IDF vectorizer. Returns: self.stop_words (frozenset): Frozenset of stop words used by the TF-IDF vectorizer (can be converted to list). """ try: self.stop_words = self.tf.get_stop_words() except: self.stop_words = "Run .tokenize_text() and .fit_tfidf() first" return self.stop_words def __create_full_recommendation_dictionary(self, df_clean): """ Create the full recommendation dictionary containing all recommendations for all items. Args: df_clean (pd.DataFrame): Dataframe with cleaned text. 
""" # Similarity measure cosine_sim = linear_kernel(self.tfidf_matrix, self.tfidf_matrix) results = {} for idx, row in df_clean.iterrows(): similar_indices = cosine_sim[idx].argsort()[: -(len(df_clean) + 1) : -1] similar_items = [ (cosine_sim[idx][i], df_clean[self.id_col][i]) for i in similar_indices ] results[row[self.id_col]] = similar_items[1:] # Save to class self.recommendations = results def __organize_results_as_tabular(self, df_clean, k): """ Restructures results dictionary into a table containing only the top k recommendations per item. Args: df_clean (pd.DataFrame): Dataframe with cleaned text. k (int): Number of recommendations to return. """ # Initialize new dataframe to hold recommendation output item_id = list() rec_rank = list() rec_score = list() rec_item_id = list() # For each item for idx in range(0, len(self.recommendations)): # Information about the item we are basing recommendations off of rec_based_on = list(self.recommendations.keys())[idx] tmp_item_id = str( df_clean.loc[df_clean[self.id_col] == rec_based_on][self.id_col].values[ 0 ] ) # Get all scores and IDs for items recommended for this current item rec_array = self.recommendations[rec_based_on] tmp_rec_score = list(map(lambda x: x[0], rec_array)) tmp_rec_id = list(map(lambda x: x[1], rec_array)) # Append multiple values at a time to list item_id.extend([tmp_item_id] * k) rec_rank.extend(list(range(1, k + 1))) rec_score.extend(tmp_rec_score[:k]) rec_item_id.extend(tmp_rec_id[:k]) # Save the output output_dict = { self.id_col: item_id, "rec_rank": rec_rank, "rec_score": rec_score, "rec_" + self.id_col: rec_item_id, } # Convert to dataframe self.top_k_recommendations = pd.DataFrame(output_dict) def recommend_top_k_items(self, df_clean, k=5): """ Recommend k number of items similar to the item of interest. Args: df_clean (pd.DataFrame): Dataframe with cleaned text. k (int): Number of recommendations to return. Returns: self.top_k_recommendations (pd.DataFrame): Dataframe containing id of top k recommendations for all items. """ if k > len(df_clean) - 1: raise ValueError( "Cannot get more recommendations than there are items. Set k lower." ) self.__create_full_recommendation_dictionary(df_clean) self.__organize_results_as_tabular(df_clean, k) return self.top_k_recommendations def __get_single_item_info(self, metadata, rec_id): """ Get full information for a single recommended item. Args: metadata (pd.DataFrame): Dataframe containing item info. rec_id (str): Identifier for recommended item. Results: rec_info (pd.Series): Single row from dataframe containing recommended item info. """ # Return row rec_info = metadata.iloc[int(np.where(metadata[self.id_col] == rec_id)[0])] return rec_info def __make_clickable(self, address): """ Make URL clickable. Args: address (str): URL address to make clickable. """ return '<a href="{0}">{0}</a>'.format(address) def get_top_k_recommendations( self, metadata, query_id, cols_to_keep=[], verbose=True ): """ Return the top k recommendations with useful metadata for each recommendation. Args: metadata (pd.DataFrame): Dataframe holding metadata for all public domain papers. query_id (str): ID of item of interest. cols_to_keep (list of str): List of columns from the metadata dataframe to include (e.g., ['title','authors','journal','publish_time','url']). By default, all columns are kept. verbose (boolean): Set to True if you want to print the table. 
Results: df (pd.Styler): Stylized dataframe holding recommendations and associated metadata just for the item of interest (can access as normal dataframe by using df.data). """ # Create subset of dataframe with just recommendations for the item of interest df = self.top_k_recommendations.loc[ self.top_k_recommendations[self.id_col] == query_id ].reset_index() # Remove id_col of query item df.drop([self.id_col], axis=1, inplace=True) # Add metadata for each recommended item (rec_<id_col>) metadata_cols = metadata.columns.values df[metadata_cols] = df.apply( lambda row: self.__get_single_item_info( metadata, row["rec_" + self.id_col] ), axis=1, ) # Remove id col added from metadata (already present from self.top_k_recommendations) df.drop([self.id_col], axis=1, inplace=True) # Rename columns such that rec_ is no longer appended, for simplicity df = df.rename(columns={"rec_rank": "rank", "rec_score": "similarity_score"}) # Only keep columns of interest if len(cols_to_keep) > 0: # Insert our recommendation scoring/ranking columns cols_to_keep.insert(0, "similarity_score") cols_to_keep.insert(0, "rank") df = df[cols_to_keep] # Make URLs clickable if they exist if "url" in list(map(lambda x: x.lower(), metadata_cols)): format_ = {"url": self.__make_clickable} df = df.head().style.format(format_) if verbose == True: df return df
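# For orientation, a sketch of how the pieces of TfidfRecommender above fit
# together end to end; the dataframe and its columns ('doc_id', 'title',
# 'abstract') are made up for illustration:
import pandas as pd

df = pd.DataFrame({
    "doc_id": ["d1", "d2", "d3"],
    "title": ["Vaccines and immunity", "Viral genome sequencing", "Immune response to vaccines"],
    "abstract": ["Study of antibody response.", "Sequencing pipelines for viruses.", "T-cell behaviour after vaccination."],
})

rec = TfidfRecommender(id_col="doc_id", tokenization_method="none")
df_clean = rec.clean_dataframe(df, ["title", "abstract"])               # cleaned text lands in 'cleaned_text'
tf, tokens = rec.tokenize_text(df_clean, ngram_range=(1, 2), min_df=1)
rec.fit(tf, tokens)

print(rec.get_stop_words())                      # English stop words used by the underlying TfidfVectorizer
print(rec.recommend_top_k_items(df_clean, k=2))  # top-2 similar items per document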