import numpy as np
from nltk import FreqDist
from nltk.text import TextCollection


def get_keywords_for_vocabularies(vocabularies):
    """Pick keywords for each vocabulary by TF-IDF against the whole collection."""
    collection = TextCollection(list(vocabularies.values()))
    keywords = {}
    # Report progress roughly every 5%; max(1, ...) avoids division by zero for small inputs.
    batch_size = max(1, len(vocabularies) // 20)
    for index, (name, vocabulary) in enumerate(vocabularies.items()):
        if index % batch_size == 0:
            print("Processing vocabulary #" + str(index) + "...")
        vocabulary_words = list(set(vocabulary))
        num_tokens = len(vocabulary_words)
        if num_tokens > 1000:
            # For large vocabularies, skip the 50 most frequent tokens and keep the next 1000.
            tokens_freq = FreqDist(vocabulary)
            first_keyword = 50
            vocabulary_words = tokens_freq.most_common(first_keyword + 1000)
            vocabulary_words = vocabulary_words[first_keyword:]
            vocabulary_words = [word for word, _ in vocabulary_words]
        tf_idfs = {
            word: collection.tf_idf(word, vocabulary)
            for word in vocabulary_words
        }
        tf_idf_values = list(tf_idfs.values())
        tf_idfs = sorted(tf_idfs.items(), key=lambda x: x[1], reverse=True)
        # Keep words whose TF-IDF is at least the larger of the mean and the median.
        mean_tf_idf = np.mean(tf_idf_values)
        median_tf_idf = np.median(tf_idf_values)
        threshold = max((mean_tf_idf, median_tf_idf))
        keyword_list = [
            word for word, tf_idf_value in tf_idfs if tf_idf_value >= threshold
        ]
        if len(keyword_list) > 30:
            keyword_list = keyword_list[:30]
        keywords[name] = keyword_list
    return keywords
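# A minimal usage sketch (my example, not from the original code): `vocabularies`
# maps a document name to its list of tokens.
sample_vocabularies = {
    "doc_a": ["model", "training", "loss", "model", "data"],
    "doc_b": ["server", "docker", "deploy", "server", "data"],
}
print(get_keywords_for_vocabularies(sample_vocabularies))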
from nltk.text import TextCollection


def calc_tf_idf(docs):
    # docs is a dict whose keys are screen names and whose values are profile texts.
    total_words = []
    each_friend_words = {}
    #for name in docs.keys():
    #    # Strings fetched from the API are Unicode, so convert them to UTF-8.
    #    docs[name] = docs[name].encode('utf-8')
    for name in docs.keys():
        each_friend_words[name] = []
        words_ja = get_nouns_ja(docs[name])
        words_en = get_nouns_en(docs[name])
        for word in words_ja:
            total_words.append(word)
            each_friend_words[name].append(word)  # screen names are normalized to UTF-8
        for word in words_en:
            total_words.append(word)
            each_friend_words[name].append(word)  # screen names are normalized to UTF-8
    total_words = del_stopwords(total_words)
    docs = [doc for doc in docs.values()]  # convert into the form TextCollection expects
    collection = TextCollection(docs)
    word_types = set(total_words)
    results = []
    tf_idf = {}
    for word_type in word_types:
        #tf = collection.tf(word_type, total_words)
        #idf = collection.idf(word_type, total_words)
        #tf_idf = collection.tf_idf(word_type, total_words)
        #results.append([word_type, tf, idf, tf_idf])
        tf_idf[word_type] = collection.tf_idf(word_type, total_words)
    #return sorted(results, key=lambda result:result[3], reverse=True), each_friend_words
    return tf_idf, each_friend_words
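# A hypothetical usage sketch. get_nouns_ja, get_nouns_en and del_stopwords come from
# elsewhere in the original project; the simplified stand-ins below are mine and exist
# only to make the example self-contained.
def get_nouns_ja(text):
    return []  # the real helper would extract Japanese nouns (e.g. with a morphological analyzer)

def get_nouns_en(text):
    return [w for w in text.split() if w.isalpha()]

def del_stopwords(words):
    return [w for w in words if w.lower() not in {"a", "an", "the", "is", "in"}]

profiles = {
    "alice": "Machine learning engineer living in Tokyo",
    "bob": "Backend developer who loves coffee and cycling",
}
tf_idf, per_friend_words = calc_tf_idf(profiles)
print(sorted(tf_idf.items(), key=lambda kv: kv[1], reverse=True)[:5])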
from nltk.text import TextCollection


def transform(self, sents):
    """
    Convert ``sents`` into TF-IDF vectors.
    :param sents: list(list(str))
    :return: generator of {word: tf_idf} dicts, one per sentence
    """
    # Build the collection used for the IDF statistics from all words.
    words = sum(sents, [])
    word_collection = TextCollection(words)
    for sent in sents:
        yield {word: word_collection.tf_idf(word, sent) for word in sent}
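# Quick check of the method above. `self` is not used inside `transform`, so this
# sketch calls it directly; in the original it presumably belongs to a vectorizer class.
sents = [["the", "cat", "sat", "on", "the", "mat"],
         ["the", "dog", "barked", "at", "the", "cat"]]
for vector in transform(None, sents):
    print(vector)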
from nltk.text import TextCollection


def extractKeywords(selection, corpus, nr):
    csv = open('db\\cyttron-keywords.csv', 'w')
    cyttronCorpus = TextCollection(corpus)
    for i in range(len(selection)):
        currentEntry = selection[i].lower()
        freqWords(currentEntry, cyttronCorpus, nr)
        freqNouns(currentEntry, cyttronCorpus, nr)
        nGrams(currentEntry, cyttronCorpus, nr, clean=True)
    csv.close()
import nltk
from nltk import FreqDist
import jieba
from nltk import TextCollection

text1 = '我很喜欢这部电影 '
text2 = '这部电影很棒 '
text3 = '这个很不错 '
text4 = '这个真的很烂 '
text5 = '这部电影不行'

tc = TextCollection([text1, text2, text3, text4, text5])

new_text = '这部电影实在太好看了!'
word = '电影'
tf_idf_val = tc.tf_idf(word, new_text)
print('TF-IDF of {}: {}'.format(word, tf_idf_val))
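# The snippet above imports jieba but feeds raw strings to TextCollection, so TF and IDF
# are computed over character substrings. A small alternative sketch (my assumption, not
# the original author's code): segment with jieba first so TF-IDF is computed over words.
texts = [text1, text2, text3, text4, text5]
segmented = [list(jieba.cut(t)) for t in texts]
tc_words = TextCollection(segmented)
print(tc_words.tf_idf(word, list(jieba.cut(new_text))))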
import nltk
from nltk.text import TextCollection
from sklearn.naive_bayes import GaussianNB

#read_to_save()
#run_main()
train_text_df, test_text_df = load_train_set()

n_common_words = 200
all_words_in_train = get_word_list_from_data(train_text_df)
fdisk = nltk.FreqDist(all_words_in_train)
common_words_freqs = fdisk.most_common(n_common_words)
for word, count in common_words_freqs:
    print('{}:{}'.format(word, count))

text_collection = TextCollection(train_text_df['text'].values.tolist())

print('Extracting features from the training samples')
train_X, train_y = extract_feat_from_data(train_text_df, text_collection, common_words_freqs)
print('Done')

print('Extracting features from the test samples')
test_X, test_y = extract_feat_from_data(test_text_df, text_collection, common_words_freqs)
print('Done')

print('Training the model...')
gnb = GaussianNB()
gnb.fit(train_X, train_y)
print('Training finished...')
'''
百度手机浏览器是百度自主研发,为手机上网用户量身定制的一款浏览类产品,于2011年6月15日正式上线公测,极速内核强劲动力,提供超强智能搜索,整合百度优质服务。

Hao123: 上网从这里开始
Hao123创立于1999年,2004年被百度收购。作为百度旗下核心产品,hao123及时收录包括音乐、视频、小说、游戏等热门分类的网站,与搜索完美结合,为中国互联网用户提供最简单便捷的网上导航服务,重新定义了上网导航的概念。

百度杀毒: 更快更安全
百度杀毒是由百度公司研发的专业杀毒软件,也是世界上第一款将“深度学习”技术应用到病毒查杀客户端的产品。产品依托于百度强大的云计算、大数据能力。自2013年上线以来,百度杀毒累积为千万用户提供网络安全服务。

百度卫士: 轻、快、智、净
百度卫士是百度公司出品的系统工具软件,集电脑加速、系统清理、安全维护三大功能于一身,为用户提供优质的电脑及网络安全服务。

百度医生: 更权威,更便捷,更丰富,连接人与医疗服务
百度医生打造了面向普通用户、医生以及医院的产品体系,包括百度医生、百度医生工作台、百度医学、医疗直达号等,实现医患双选的业务模式,从而优化医疗资源的配置效率,提升各方的工作效率,改善患者的就医体验。

百度商业服务,新生产力引擎
百度商业服务是原有的百度推广(以搜索推广为主)的基础上,将数据产品、交易产品、媒体产品、信用产品和咨询服务进行了深度的整合, 并已将咨询服务、百度内容联盟加入到整体的商业服务框架中来。 目前百度商业服务包括三大类产品服务: 以凤巢搜索排名为基础的推广类产品服务,品牌宣传类的产品服务以及基于大数据的数据产品增值服务
'''

words = jieba.cut(content)
words_freq = nltk.FreqDist(words)
print(words_freq.most_common(50))  # the 50 most frequent words

# compute TF-IDF
from nltk import TextCollection
corpus = TextCollection(
    ['I love my mother', 'I love my country', 'I love my daddy'])
print(corpus.tf_idf('country', 'I love my country'))

# syntax tree
from nltk.corpus import treebank
from nltk.tree import Tree
sentTree = '(IP (NP (NR 张三)) (VP (VV 参加) (AS 了) (NP (NN 会议))))'
tree = Tree.fromstring(sentTree)
tree.draw()
# Note: this snippet uses the legacy pymongo (< 3.0) `Connection` API.
from pymongo import Connection
from nltk.text import TextCollection


def build_TC(db, collection):
    col = Connection('127.0.0.1')[db][collection]
    tc = TextCollection([t for t in col.find(fields=['text'])],
                        name=collection)
    return tc
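# A rough modern equivalent (my sketch, not part of the original): pymongo 3+ replaces
# `Connection` with `MongoClient` and `fields=` with a projection document, and here the
# TextCollection is built from the text strings themselves rather than the raw documents.
from pymongo import MongoClient

def build_TC_modern(db, collection):
    col = MongoClient('127.0.0.1')[db][collection]
    texts = [doc['text'] for doc in col.find({}, {'text': 1, '_id': 0})]
    return TextCollection(texts)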
a["lyrics"] = Text( w.lower() for w in nltk.word_tokenize(" ".join(a["lyrics"]).encode("utf-8"))) a["terms"] = set(w for w in a["lyrics"] if not is_stopword(w) and valid(w)) a["tf_idf"] = dict( sorted(((t, text_col.tf_idf(t, a["lyrics"])) for t in a["terms"]), key=lambda x: x[1])) i += 1 print i return a ts = TextCollection([ Text((w.lower() for w in nltk.word_tokenize(" ".join(l["lyrics"]).encode('utf-8')) if not is_stopword(w) and valid(w)), name=l["id"]) for l in lyrics if l.get("id") ]) #lyrics = map(lambda x : add_info(x[1], ts, x[0]), ((i, l) for i, l in enumerate(lyrics) if l.get("id"))) #with open("withtfidf.pickle", "w") as f: # pickle.dump(lyrics, f) with open("withtfidf.pickle", "r") as f: lyrics = pickle.load(f) lyrics = dict((l["id"], l) for l in lyrics) def similar_lyrics(lyric):