Code example #1
import numpy as np
from nltk import FreqDist, TextCollection


def get_keywords_for_vocabularies(vocabularies):
    # vocabularies maps each document name to a list of tokens.
    collection = TextCollection(list(vocabularies.values()))
    keywords = {}
    # Report progress roughly every 5%; max() guards against a zero batch size.
    batch_size = max(1, len(vocabularies) // 20)
    for index, (name, vocabulary) in enumerate(vocabularies.items()):
        if index % batch_size == 0:
            print("Processing vocabulary #" + str(index) + "...")
        vocabulary_words = list(set(vocabulary))
        num_tokens = len(vocabulary_words)
        if num_tokens > 1000:
            # For a very large vocabulary, skip the 50 most frequent words
            # (likely generic terms) and keep the next 1000 by frequency.
            tokens_freq = FreqDist(vocabulary)
            first_keyword = 50
            vocabulary_words = tokens_freq.most_common(first_keyword + 1000)
            vocabulary_words = vocabulary_words[first_keyword:]
            vocabulary_words = [word for word, _ in vocabulary_words]
        tf_idfs = {
            word: collection.tf_idf(word, vocabulary)
            for word in vocabulary_words
        }
        tf_idf_values = list(tf_idfs.values())
        tf_idfs = sorted(tf_idfs.items(), key=lambda x: x[1], reverse=True)
        mean_tf_idf = np.mean(tf_idf_values)
        median_tf_idf = np.median(tf_idf_values)
        # Keep only words scoring at least the larger of mean and median tf-idf.
        threshold = max(mean_tf_idf, median_tf_idf)
        keyword_list = [
            word for word, tf_idf_value in tf_idfs if tf_idf_value >= threshold
        ]
        if len(keyword_list) > 30:
            keyword_list = keyword_list[:30]
        keywords[name] = keyword_list
    return keywords
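
A minimal usage sketch for the function above. The toy input is invented for illustration and assumes each vocabulary is a plain list of tokens, as the function's iteration implies:

# Hypothetical toy input: token lists keyed by document name.
vocabularies = {
    "doc_a": ["apple", "banana", "apple", "cherry"],
    "doc_b": ["banana", "durian", "durian", "fig"],
}
print(get_keywords_for_vocabularies(vocabularies))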
Code example #2
from nltk import TextCollection


def calc_tf_idf(docs):
    # docs is a dict: keys are screen names, values are profile texts.
    # get_nouns_ja / get_nouns_en / del_stopwords are project-local helpers.
    total_words = []
    each_friend_words = {}

    #for name in docs.keys():  # The API returns Unicode, so re-encode as utf-8.
    #    docs[name] = docs[name].encode('utf-8')
    for name in docs.keys():
        each_friend_words[name] = []
        words_ja = get_nouns_ja(docs[name])
        words_en = get_nouns_en(docs[name])
        for word in words_ja:
            total_words.append(word)
            each_friend_words[name].append(word)  # screen names are normalized to utf-8
        for word in words_en:
            total_words.append(word)
            each_friend_words[name].append(word)  # screen names are normalized to utf-8
    total_words = del_stopwords(total_words)
    docs = list(docs.values())  # convert to the shape TextCollection expects
    collection = TextCollection(docs)
    word_types = set(total_words)

    results = []
    tf_idf = {}
    for word_type in word_types:
        #tf = collection.tf(word_type, total_words)
        #idf = collection.idf(word_type, total_words)
        #tf_idf = collection.tf_idf(word_type, total_words)
        #results.append([word_type, tf, idf, tf_idf])
        tf_idf[word_type] = collection.tf_idf(word_type, total_words)
    #return sorted(results, key=lambda result:result[3], reverse=True), each_friend_words
    return tf_idf, each_friend_words
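
A hedged usage sketch: get_nouns_ja, get_nouns_en, and del_stopwords are project-local helpers, so this only illustrates the expected call shape under the assumption that they are importable:

# Hypothetical input: screen names mapped to profile texts.
profiles = {
    "alice": "Natural language processing and machine learning.",
    "bob": "Coffee, cycling, and open source.",
}
tf_idf, words_per_friend = calc_tf_idf(profiles)
print(sorted(tf_idf.items(), key=lambda kv: kv[1], reverse=True)[:10])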
Code example #3
    def transform(self, sents):
        """
        Convert sents into tf-idf vectors.

        :param sents: list(list(str))
        :return: generator of dict(word -> tf-idf score)
        """
        # Build the collection over the tokenized sentences themselves, so
        # idf is computed per sentence rather than per individual word.
        word_collection = TextCollection(sents)

        for sent in sents:
            yield {word: word_collection.tf_idf(word, sent) for word in sent}
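
Assuming transform is a method of some vectorizer class (the class name below is hypothetical), driving it might look like this:

sents = [["i", "love", "my", "mother"], ["i", "love", "my", "country"]]
vectorizer = NLTKTfidfVectorizer()  # hypothetical host class of transform()
for vector in vectorizer.transform(sents):
    print(vector)  # {'i': ..., 'love': ..., 'mother': ...}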
Code example #4
File: keywords.py Project: RedSunCMX/thesis
from nltk import TextCollection


def extractKeywords(selection, corpus, nr):
    # freqWords / freqNouns / nGrams are project-local helpers; the handle
    # below shadows the stdlib csv module, and the raw string keeps the
    # Windows path backslash literal.
    csv = open(r'db\cyttron-keywords.csv', 'w')
    cyttronCorpus = TextCollection(corpus)
    for entry in selection:
        currentEntry = entry.lower()
        freqWords(currentEntry, cyttronCorpus, nr)
        freqNouns(currentEntry, cyttronCorpus, nr)
        nGrams(currentEntry, cyttronCorpus, nr, clean=True)
    csv.close()
Code example #5
File: TF-IDF.py Project: Jason0827/master-degree
import jieba
from nltk import TextCollection

# Segment each Chinese sentence with jieba; TextCollection's tf/idf then
# operate on token lists instead of raw strings.
text1 = list(jieba.cut('我很喜欢这部电影'))
text2 = list(jieba.cut('这部电影很棒'))
text3 = list(jieba.cut('这个很不错'))
text4 = list(jieba.cut('这个真的很烂'))
text5 = list(jieba.cut('这部电影不行'))

tc = TextCollection([text1, text2, text3, text4, text5])
new_text = list(jieba.cut('这部电影实在太好看了!'))
word = '电影'
tf_idf_val = tc.tf_idf(word, new_text)
print('TF-IDF of {}: {}'.format(word, tf_idf_val))
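
For reference, TextCollection also exposes the two factors separately, which helps when sanity-checking the value printed above:

# tf is the term's relative frequency within the text;
# idf is log(number of texts / number of texts containing the term).
print(tc.tf(word, new_text))
print(tc.idf(word))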
Code example #6
    #read_to_save()
    #run_main()

    train_text_df, test_text_df = load_train_set()

    n_common_words = 200

    all_words_in_train = get_word_list_from_data(train_text_df)

    fdist = nltk.FreqDist(all_words_in_train)
    common_words_freqs = fdist.most_common(n_common_words)

    for word, count in common_words_freqs:
        print('{}:{}'.format(word, count))

    text_collection = TextCollection(train_text_df['text'].values.tolist())
    print('Extracting features from the training samples')
    train_X, train_y = extract_feat_from_data(train_text_df, text_collection,
                                              common_words_freqs)
    print('Done')

    print('Extracting features from the test samples')
    test_X, test_y = extract_feat_from_data(test_text_df, text_collection,
                                            common_words_freqs)
    print('Done')

    print('Training the model...')
    gnb = GaussianNB()
    gnb.fit(train_X, train_y)
    print('Training finished...')
Code example #7
File: NLTK文本分析.py Project: wsgan001/python-4
import jieba
import nltk

content = '''百度手机浏览器是百度自主研发,为手机上网用户量身定制的一款浏览类产品,于2011 年6月15日正式上线公测,极速内核强劲动力,提供超强智能搜索,整合百度优质服务。
Hao123: 上网从这里开始
Hao123创立于1999年,2004年被百度收购。作为百度旗下核心产品,hao123及时收录包括音乐、视频、小说、游戏等热门分类的网站,与搜索完美结合,为中国互联网用户提供最简单便捷的网上导航服务,重新定义了上网导航的概念。
百度杀毒: 更快更安全
百度杀毒是由百度公司研发的专业杀毒软件,也是世界上第一款将“深度学习”技术应用到病毒查杀客户端的产品。产品依托于百度强大的云计算、大数据能力。自2013年上线以来,百度杀毒累积为千万用户提供网络安全服务。
百度卫士: 轻、快、智、净
百度卫士是百度公司出品的系统工具软件,集电脑加速、系统清理、安全维护三大功能于一身,为用户提供优质的电脑及网络安全服务。
百度医生: 更权威,更便捷,更丰富,连接人与医疗服务
百度医生打造了面向普通用户、医生以及医院的产品体系,包括百度医生、百度医生工作台、百度医学、医疗直达号等,实现医患双选的业务模式,从而优化医疗资源的配置效率,提升各方的工作效率,改善患者的就医体验。
百度商业服务,新生产力引擎
百度商业服务是原有的百度推广(以搜索推广为主)的基础上,将数据产品、交易产品、媒体产品、信用产品和咨询服务进行了深度的整合, 并已将咨询服务、百度内容联盟加入到整体的商业服务框架中来。
目前百度商业服务包括三大类产品服务: 以凤巢搜索排名为基础的推广类产品服务,品牌宣传类的产品服务以及基于大数据的数据产品增值服务
'''
words = jieba.cut(content)
words_freq = nltk.FreqDist(words)
print(words_freq.most_common(50))  # the 50 most frequent words

# Compute TF-IDF
from nltk import TextCollection

corpus = TextCollection(
    ['I love my mother', 'I love my country', 'I love my daddy'])
print(corpus.tf_idf('country', 'I love my country'))

# Syntax tree
from nltk.corpus import treebank
from nltk.tree import Tree

sentTree = '(IP (NP (NR 张三)) (VP (VV 参加) (AS 了) (NP (NN 会议))))'
tree = Tree.fromstring(sentTree)
tree.draw()
Code example #8
File: clusterer.py Project: israelst/pypln
from nltk import TextCollection
from pymongo import Connection


def build_TC(db, collection):
    # Connect to a local MongoDB and gather the 'text' field of each document.
    col = Connection('127.0.0.1')[db][collection]
    tc = TextCollection([t['text'] for t in col.find(fields=['text'])],
                        name=collection)
    return tc
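
A hypothetical call, assuming a local MongoDB instance with a database and collection by these invented names:

tc = build_TC('pypln', 'documents')  # hypothetical db/collection names
print(len(tc))  # token count across the whole collection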
Code example #9
    a["lyrics"] = Text(
        w.lower()
        for w in nltk.word_tokenize(" ".join(a["lyrics"]).encode("utf-8")))
    a["terms"] = set(w for w in a["lyrics"] if not is_stopword(w) and valid(w))

    a["tf_idf"] = dict(
        sorted(((t, text_col.tf_idf(t, a["lyrics"])) for t in a["terms"]),
               key=lambda x: x[1]))
    i += 1
    print i
    return a


ts = TextCollection([
    Text((w.lower()
          for w in nltk.word_tokenize(" ".join(l["lyrics"]))
          if not is_stopword(w) and valid(w)),
         name=l["id"]) for l in lyrics if l.get("id")
])

#lyrics = map(lambda x : add_info(x[1], ts, x[0]), ((i, l) for i, l in enumerate(lyrics) if l.get("id")))

#with open("withtfidf.pickle", "w") as f:
#    pickle.dump(lyrics, f)

with open("withtfidf.pickle", "r") as f:
    lyrics = pickle.load(f)

lyrics = dict((l["id"], l) for l in lyrics)


def similar_lyrics(lyric):