def get_freq(self, word):
    """Return the frequency of `word` in the Sinica Treebank corpus."""
    from collections import Counter
    from nltk.corpus import sinica_treebank

    # Build a token-frequency table over the whole corpus, then look up
    # the requested word (a Counter returns 0 for unseen tokens).
    freq_list = Counter(sinica_treebank.words())
    return freq_list[word]
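# Hedged usage sketch for get_freq above: `FreqHelper` is a hypothetical
# host class (the snippet shows only the method), added so the call is
# runnable as-is.
class FreqHelper:
    get_freq = get_freq  # attach the module-level function as a method

helper = FreqHelper()
print(helper.get_freq('我'))  # frequency of one token in the Sinica corpus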
# -*- coding:utf-8 -*-
# Filename: sinica_treebank.py
# Author: hankcs
# Date: 2014-04-08 11:44 AM
from __future__ import print_function

import nltk
from nltk.corpus import sinica_treebank

sinica_text = nltk.Text(sinica_treebank.words())
print(sinica_text)

for (key, var) in sinica_treebank.tagged_words()[:8]:
    print('%s/%s' % (key, var))

sinica_text.concordance('我')             # concordance() prints its results itself
sinica_text.concordance(u'\u5609\u73cd')  # u'嘉珍'

print("************* FreqDist over the Sinica corpus: NLTK counts high-frequency Chinese words **********")
sinica_fd = nltk.FreqDist(sinica_treebank.words())
# FreqDist.items() is not sliceable in Python 3; most_common() returns
# (word, count) pairs sorted by descending frequency.
top100 = sinica_fd.most_common(100)
for (x, y) in top100:
    print(x, y)
# -*-coding:utf-8-*-
import jieba
import pandas as pd
import time
import uniout  # Python 2 helper that makes CJK lists print readably
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import sinica_treebank

print(sinica_treebank.words()[100:120])
# sinica_treebank.parsed_sents()[33].draw()
# nltk.download()
# wn.synsets('love')
# print wn.synsets(u'摩托车')   # 摩托车 = "motorcycle"
# print wn.synset('car.n.01').lemma_names
def __init__(self, min_nchar, fn, lang="ENG"):
    """
    fn : path to the file containing text data.
    """
    self.min_nchar = min_nchar
    self.fdict = {
        'WORD': self.sample_word,
        'LINE': self.sample_line,
        'PARA': self.sample_para
    }
    self.lang = lang

    # parse English text
    if self.lang == "ENG":
        print('Generate English Data with NLTK:PlaintextCorpusReader')
        corpus = PlaintextCorpusReader("./", fn)
        self.words = corpus.words()
        self.sents = corpus.sents()
        self.paras = corpus.paras()
    # parse Japanese text
    elif self.lang == "JPN":
        print('Generate Japanese Data with NLTK:ChasenCorpusReader')
        # convert fn into a ChaSen-annotated file
        _, ext = os.path.splitext(os.path.basename(fn))
        fn_chasen = fn.replace(ext, ".chasen")
        print("Convert {} into {}".format(fn, fn_chasen))
        cmd = "mecab -Ochasen {} > {}".format(fn, fn_chasen)
        print("The following cmd was executed to convert into chasen (for Japanese)")
        print("\t{}".format(cmd))
        p = subprocess.call(cmd, shell=True)
        data = ChasenCorpusReader('./', fn_chasen, encoding='utf-8')
        self.words = data.words()
        self.sents = data.sents()
        self.paras = data.paras()
        # jp_sent_tokenizer = nltk.RegexpTokenizer(u'[^ 「」!?。]*[!?。]')
        # jp_chartype_tokenizer = nltk.RegexpTokenizer(u'([ぁ-んー]+|[ァ-ンー]+|[\u4e00-\u9FFF]+|[^ぁ-んァ-ンー\u4e00-\u9FFF]+)')
        #
        # corpus = PlaintextCorpusReader("./",
        #                                fn,
        #                                encoding='utf-8',
        #                                para_block_reader=read_line_block,
        #                                sent_tokenizer=jp_sent_tokenizer,
        #                                word_tokenizer=jp_chartype_tokenizer)
    elif self.lang == "ZHTW":
        print('Generate Traditional Chinese Data with NLTK:sinica_treebank')
        # data = SinicaTreebankCorpusReader('./', fn, encoding='utf-8')
        # self.words = data.words()
        # self.sents = data.sents()
        # self.paras = data.parsed_sents()
        self.words = sinica_treebank.words()
        self.sents = sinica_treebank.sents()
        self.paras = sinica_treebank.parsed_sents()
    else:
        # fall back to the Sinica treebank for any other language code
        # data = SinicaTreebankCorpusReader('./', fn, encoding='utf-8')
        self.words = sinica_treebank.words()
        self.sents = sinica_treebank.sents()
        self.paras = sinica_treebank.parsed_sents()

    # distribution over lines/words for LINE/PARA:
    self.p_line_nline = np.array([0.85, 0.10, 0.05])
    self.p_line_nword = [4, 3, 12]      # normal: (mu, std)
    self.p_para_nline = [1.0, 1.0]      # [1.7,3.0]  # beta: (a, b), max_nline
    self.p_para_nword = [1.7, 3.0, 10]  # beta: (a, b), max_nword
    # probability to center-align a paragraph:
    self.center_para = 0.5
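# A minimal, self-contained sketch of the fdict dispatch pattern used in
# __init__ above. Everything here is hypothetical (sample_word/sample_line
# are stand-ins for the real sampling methods, which the snippet does not
# show); it only illustrates looking up a bound method by string key and
# calling it.
class _DispatchSketch:
    def __init__(self):
        self.fdict = {
            'WORD': self.sample_word,
            'LINE': self.sample_line,
        }

    def sample_word(self):
        return 'word'

    def sample_line(self):
        return 'a line of text'

    def sample(self, kind):
        # pick the sampling strategy by name, then invoke it
        return self.fdict[kind]()

print(_DispatchSketch().sample('WORD'))  # -> 'word'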
import nltk
import jieba
import re
from nltk import word_tokenize
from nltk.corpus import sinica_treebank

sinica_text = nltk.Text(sinica_treebank.words())
sinica_text.concordance('我')  # concordance() prints its results itself

## Definitions for Twitter-specific language patterns
emoticons_str = r"""
    (?:
        [:=;]                # eyes
        [oO\-]?              # nose
        [D\)\]\(\]/\\OpP]    # mouth
    )"""

regex_str = [
    emoticons_str,
    r'<[^>]+>',  # HTML tags
    r'(?:@[\w_]+)',  # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)",  # hashtags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',  # URLs
    r'(?:(?:\d+,?)+(?:\.?\d+)?)',  # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])",  # words containing - and '
    r'(?:[\w_]+)',  # other words
    r'(?:\S)'  # anything else
]

tokens_re = re.compile(r'(' + '|'.join(regex_str) + ')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^' + emoticons_str + '$', re.VERBOSE | re.IGNORECASE)
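# Hedged usage sketch for the two compiled patterns above. `tokenize` and
# `preprocess` are hypothetical helper names (they do not appear in the
# snippet); this is the usual way tokens_re and emoticon_re are combined.
def tokenize(s):
    # findall with a single capture group returns the matched tokens
    return tokens_re.findall(s)

def preprocess(s, lowercase=False):
    tokens = tokenize(s)
    if lowercase:
        # keep emoticons as-is; lowercase everything else
        tokens = [t if emoticon_re.search(t) else t.lower() for t in tokens]
    return tokens

print(preprocess('RT @user: NLTK is fun! :D http://example.com #nlp'))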
line = line.strip('\n')
titles.append(line)
print(line)
print(titles[:10])  # first 10 titles

# ----------
# nltk.download()
# load NLTK's English stopword list as the `stopwords` variable
import nltk
stopwords = nltk.corpus.stopwords.words('english')
print(stopwords[:10])

from nltk.corpus import sinica_treebank
print(sinica_treebank.words())

# ------- snownlp ---------
import snownlp
from snownlp import SnowNLP
s = SnowNLP('这东西真心很赞')  # "This thing is really great"
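# A short hedged follow-up: SnowNLP exposes sentiment through its
# `sentiments` property (a probability in [0, 1]; higher means more
# positive) and segmentation through `words`. The score in the comment
# is indicative, not guaranteed.
print(s.sentiments)  # e.g. close to 1.0 for this clearly positive sentence
print(s.words)       # the segmented tokens of the sentence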
# -*- coding: utf-8 -*-
import nltk
from nltk.corpus import sinica_treebank

sinica_text = nltk.Text(sinica_treebank.words())
print(sinica_text)
for (key, var) in sinica_treebank.tagged_words()[:8]:
    print('%s/%s' % (key, var))
print(sinica_treebank.parsed_sents()[15])
#!/usr/bin/python3
# coding: utf-8
##################################################################
## Part 1: treebank, draw a parse tree
from nltk.corpus import treebank
t = treebank.parsed_sents('wsj_0001.mrg')[0]
t.draw()
##################################################################
## Part 2: sinica_treebank
from nltk.corpus import sinica_treebank
##################################################################
## quick sanity checks
print(type(sinica_treebank))  # <class 'nltk.corpus.reader.sinica_treebank.SinicaTreebankCorpusReader'>
print(len(sinica_treebank.words()))  # 91627
print(sinica_treebank.words())  # ['一', '友情', '嘉珍', '和', '我', '住在', '同一條', '巷子', '我們', ...]
# Looking directly inside ~/nltk_data/corpora/sinica_treebank/ shows many extra markup characters
##################################################################
## Build 38k-cn-words-pinyin-sorted-by-frequency.txt (pinyin for Chinese tokens)
import re
from nltk import FreqDist
from pypinyin import pinyin, lazy_pinyin, Style

fd = FreqDist(sinica_treebank.words())
print(len(list(fd.keys())))   # 17273; count after deduplication
print(len(fd.most_common()))  # 17273
chars = ''.join([x[0] for x in fd.most_common()])  # renamed from `str` to avoid shadowing the builtin
print(len(chars))  # 38844
chars = re.sub('[^\u4e00-\u9fa5]', '', chars)
print(len(chars))  # 38225; punctuation removed
with open('38k-cn-words-pinyin-sorted-by-frequency.txt', 'w') as f:
    f.write('\n'.join(lazy_pinyin(chars)))
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import pickle
import nltk
from nltk.corpus import sinica_treebank

sinica_fd = nltk.FreqDist(sinica_treebank.words())
# sinica_fd.keys()
# sinica_fd.values()
cnt = sum([float(x) for x in sinica_fd.values()])  # total token count
print(cnt)

c = {}
for key, val in sinica_fd.items():
    c[key] = float(val) / cnt  # relative frequency of each token
    print(key, c[key])

d = dict(c)
# print(d)
with open("data/models/char_freq.cp", 'wb') as f:
    pickle.dump(d, f)
# with open("data/models/char_freq.cp", 'rb') as f:
#     print(pickle.load(f))
ERROR_LOADING_CORPUS_EVENT = "<<ELC_EVENT>>"
POLL_INTERVAL = 100

_DEFAULT = "English: Brown Corpus (Humor)"
_CORPORA = {
    "Catalan: CESS-CAT Corpus": lambda: cess_cat.words(),
    "English: Brown Corpus": lambda: brown.words(),
    "English: Brown Corpus (Press)": lambda: brown.words(
        categories=["news", "editorial", "reviews"]
    ),
    "English: Brown Corpus (Religion)": lambda: brown.words(categories="religion"),
    "English: Brown Corpus (Learned)": lambda: brown.words(categories="learned"),
    "English: Brown Corpus (Science Fiction)": lambda: brown.words(
        categories="science_fiction"
    ),
    "English: Brown Corpus (Romance)": lambda: brown.words(categories="romance"),
    "English: Brown Corpus (Humor)": lambda: brown.words(categories="humor"),
    "English: NPS Chat Corpus": lambda: nps_chat.words(),
    "English: Wall Street Journal Corpus": lambda: treebank.words(),
    "Chinese: Sinica Corpus": lambda: sinica_treebank.words(),
    "Dutch: Alpino Corpus": lambda: alpino.words(),
    "Hindi: Indian Languages Corpus": lambda: indian.words(files="hindi.pos"),
    "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.words(),
    "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.words(),
    "Portuguese: Machado Corpus (Brazil)": lambda: machado.words(),
    "Spanish: CESS-ESP Corpus": lambda: cess_esp.words(),
}


class CollocationsView:
    _BACKGROUND_COLOUR = "#FFF"  # white

    def __init__(self):
        self.queue = q.Queue()
        self.model = CollocationsModel(self.queue)
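# Hedged usage sketch for the _CORPORA table above: each value is a
# zero-argument lambda, so nothing is read from disk until an entry is
# actually selected. The single import below is an assumption for
# illustration; the full module would import every corpus the table names.
from nltk.corpus import brown

loader = _CORPORA[_DEFAULT]  # "English: Brown Corpus (Humor)"
words = loader()             # only now does brown.words(categories="humor") run
print(len(words), words[:10])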
## -*- coding: utf-8 -*-
import nltk
from nltk.corpus import sinica_treebank as sinica
import string
import time

non_hanzi = list(string.printable)
non_hanzi.append("。")
non_hanzi.append("!")
non_hanzi.append("?")
non_hanzi.append(",")

corpus_bank = set(sinica.words())


def sent_segment(sentence):
    palabras = []
    if isinstance(sentence, bytes):  # decode only raw bytes; Python 3 str has no .decode()
        sentence = sentence.decode('utf-8')
    # print(sentence)
    num_characters = len(sentence)
    if num_characters == 0:
        return []

    ## Check whether what we were given is not a hanzi, so we can skip it
    ini_non_hanzi = 0
    fin_non_hanzi = 0
    not_a_hanzi = False
    # Scan through the non-hanzi text until something different is found
    while ((fin_non_hanzi < num_characters) and (sentence[fin_non_hanzi] in non_hanzi)):
        not_a_hanzi = True
        fin_non_hanzi += 1
    if not_a_hanzi:
        # Append the run of Latin characters and punctuation as one whole word
        palabras.append(sentence[ini_non_hanzi:fin_non_hanzi])