def test_parsed_sents(self):
    """Sentence 25 of the Sinica treebank parses to the expected tree."""
    expected = Tree('S', [
        Tree('NP', [Tree('Nba', ['嘉珍'])]),
        Tree('V‧地', [Tree('VA11', ['不停']), Tree('DE', ['的'])]),
        Tree('VA4', ['哭泣']),
    ])
    actual = sinica_treebank.parsed_sents()[25]
    self.assertEqual(actual, expected)
def test_parsed_sents(self):
    """parsed_sents()[25] must equal the hand-built reference tree."""
    np_node = Tree('NP', [Tree('Nba', ['嘉珍'])])
    vde_node = Tree('V‧地', [Tree('VA11', ['不停']), Tree('DE', ['的'])])
    verb_node = Tree('VA4', ['哭泣'])
    self.assertEqual(
        sinica_treebank.parsed_sents()[25],
        Tree('S', [np_node, vde_node, verb_node]),
    )
def test_parsed_sents(self):
    """Check the parse tree of treebank sentence 25 against a literal."""
    subtrees = [
        Tree("NP", [Tree("Nba", ["嘉珍"])]),
        Tree("V‧地", [Tree("VA11", ["不停"]), Tree("DE", ["的"])]),
        Tree("VA4", ["哭泣"]),
    ]
    self.assertEqual(sinica_treebank.parsed_sents()[25], Tree("S", subtrees))
import nltk
from nltk.corpus import sinica_treebank

# Show the segmented sentences, then the parse tree of sentence 27.
all_sents = sinica_treebank.sents()
print(all_sents)
tree_27 = sinica_treebank.parsed_sents()[27]
print(tree_27)
# NOTE(review): this fragment mixes several NLTK tutorial exercises.
# `text1`, `text2`, `text3` (presumably from nltk.book) and `indian`
# (presumably nltk.corpus.indian) are used but never imported here —
# confirm against the original source.
text1.concordance('god')
text2.concordance('god')
text3.concordance('god', lines = 10, width = 30)
text1.similar('monstrous')
text2.similar('monstrous')
text3.similar('monstrous')
help(text1.similar)
from nltk.corpus import sinica_treebank
import random
# NOTE(review): `indian.sent()` — the corpus-reader API is `sents()`;
# this looks like a typo and would raise AttributeError. Verify.
num = random.choice([n for n in range(len(indian.sent()))])
print(indian.sents([num]))
sinica_treebank.parsed_sents()[888].draw()
from nltk.corpus import gutenberg as G
print(G.fileids())
emma = G.words('austen-emma.txt')
#number of words in one txt
for fileid in G.fileids():
    words = G.words(fileid)
    print(fileid, len(words))
#number of letters in one txt
num_chars = len(G.raw('austen-emma.txt'))
print(num_chars)
# NOTE(review): the snippet is truncated here — the body of the loop
# below is missing from the source.
for fileid in G.fileids():
import nltk
from nltk.tree import Tree
from nltk.corpus import sinica_treebank

# print(sinica_treebank.words())

# Tree.draw() opens a Tk window and returns None, so wrapping it in
# print() only emitted a spurious "None" line; call it directly instead.
sinica_treebank.parsed_sents()[36].draw()
# print(Tree.fromstring(sinica_treebank.parsed_sents()[33]).draw())
def __init__(self, min_nchar, fn, lang="ENG"):
    """
    Build the word/sentence/paragraph pools for the requested language.

    min_nchar : minimum character count; stored on the instance and
                presumably consumed by the sampling methods — confirm.
    fn        : path to file containing text data (ignored by the
                "ZHTW" and fallback branches, which use the bundled
                NLTK Sinica treebank instead).
    lang      : "ENG", "JPN", or "ZHTW"; any other value falls back to
                the Sinica treebank as well.
    """
    self.min_nchar = min_nchar
    # Dispatch table: sampling-granularity name -> bound sampler method.
    self.fdict = {
        'WORD': self.sample_word,
        'LINE': self.sample_line,
        'PARA': self.sample_para
    }
    self.lang = lang
    # parse English text
    if self.lang == "ENG":
        print('Generate English Data with NLTK:PlaintextCorpusReader')
        corpus = PlaintextCorpusReader("./", fn)
        self.words = corpus.words()
        self.sents = corpus.sents()
        self.paras = corpus.paras()
    # parse Japanese text
    elif self.lang == "JPN":
        print('Generate Japanese Data with NLTK:ChasenCorpusReader')
        # convert fs into chasen file
        # Derive the ChaSen-format file name by swapping the extension.
        _, ext = os.path.splitext(os.path.basename(fn))
        fn_chasen = fn.replace(ext, ".chasen")
        print("Convert {} into {}".format(fn, fn_chasen))
        # Run MeCab externally to morphologically analyse the Japanese
        # text; shell=True is required here for the output redirection.
        cmd = "mecab -Ochasen {} > {}".format(fn, fn_chasen)
        print(
            "The following cmd below was executed to convert into chasen (for Japanese)"
        )
        print("\t{}".format(cmd))
        p = subprocess.call(cmd, shell=True)
        data = ChasenCorpusReader('./', fn_chasen, encoding='utf-8')
        self.words = data.words()
        self.sents = data.sents()
        self.paras = data.paras()
        # jp_sent_tokenizer = nltk.RegexpTokenizer(u'[^ 「」!?。]*[!?。]')
        # jp_chartype_tokenizer = nltk.RegexpTokenizer(u'([ぁ-んー]+|[ァ-ンー]+|[\u4e00-\u9FFF]+|[^ぁ-んァ-ンー\u4e00-\u9FFF]+)')
        # #
        # corpus = PlaintextCorpusReader("./",
        #                                fn,
        #                                encoding='utf-8',
        #                                para_block_reader=read_line_block,
        #                                sent_tokenizer=jp_sent_tokenizer,
        #                                word_tokenizer=jp_chartype_tokenizer)
    elif self.lang == "ZHTW":
        print(
            'Generate Traditional Chinese Data with NLTK:sinica_treebank')
        # NOTE(review): these empty lists are immediately overwritten below.
        self.words = []
        self.sents = []
        self.paras = []
        #data = SinicaTreebankCorpusReader('./', fn, encoding='utf-8')
        #self.words = data.words()
        #self.sents = data.sents()
        #self.paras = data.parsed_sents()
        self.words = sinica_treebank.words()
        self.sents = sinica_treebank.sents()
        self.paras = sinica_treebank.parsed_sents()
    else:
        # Fallback: unknown language codes get the Sinica treebank too.
        self.words = []
        self.sents = []
        self.paras = []
        #data = SinicaTreebankCorpusReader('./', fn, encoding='utf-8')
        #self.words = data.words()
        #self.sents = data.sents()
        #self.paras = data.parsed_sents()
        self.words = sinica_treebank.words()
        self.sents = sinica_treebank.sents()
        self.paras = sinica_treebank.parsed_sents()
    # distribution over line/words for LINE/PARA:
    self.p_line_nline = np.array([0.85, 0.10, 0.05])
    self.p_line_nword = [4, 3, 12]  # normal: (mu, std)
    self.p_para_nline = [1.0, 1.0]  #[1.7,3.0] # beta: (a, b), max_nline
    self.p_para_nword = [1.7, 3.0, 10]  # beta: (a,b), max_nword
    # probability to center-align a paragraph:
    self.center_para = 0.5
# -*- coding: utf-8 -*-
"""
Created on Tue Aug 2 08:14:38 2016

@author: alex
"""
from nltk.corpus import sinica_treebank

# Parse tree of sentence 15 from the Sinica treebank.
sents = sinica_treebank.parsed_sents()[15]
# Ported the Python 2 `print type(sents)` statement to the print()
# function so this snippet runs on Python 3 like the rest of the file.
print(type(sents))
sents.draw()
# -*- coding: utf-8 -*-
import nltk
from nltk.corpus import sinica_treebank

# Ported from Python 2 `print` statements to the print() function so the
# snippet runs on Python 3, consistent with the other examples here.
sinica_text = nltk.Text(sinica_treebank.words())
print(sinica_text)
# Show the first eight (word, tag) pairs as "word/tag".
for (key, var) in sinica_treebank.tagged_words()[:8]:
    print('%s/%s' % (key, var))
print(sinica_treebank.parsed_sents()[15])
import nltk
from nltk.corpus import sinica_treebank

# Dump the sentence list, print the parse of sentence 27 as text, then
# render the same tree in a Tk window.
print(sinica_treebank.sents())
print(sinica_treebank.parsed_sents()[27])
sinica_treebank.parsed_sents()[27].draw()
# Demo of NLTK's parsed-corpus readers: the Penn Treebank sample, the full
# PTB, the Sinica treebank, CoNLL-2007 dependency data, and word lists.
# NOTE(review): relies on `treebank`, `ptb`, `sinica_treebank`, `conll2007`,
# `words`, `stopwords`, and `names` being imported elsewhere — confirm.
print(treebank.words('wsj_0003.mrg'))
print(treebank.tagged_words('wsj_0003.mrg'))
print(treebank.parsed_sents('wsj_0003.mrg')[0])  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
# nltk.download('ptb')
print(ptb.fileids())  # doctest: +SKIP
# download the corpus from here: https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/treebank.zip
# then extract and place to the following location: .../nltk_data/corpora/ptb/
print(ptb.words('treebank/combined/wsj_0003.mrg'))  # doctest: +SKIP
print(ptb.tagged_words('treebank/combined/wsj_0003.mrg'))  # doctest: +SKIP
# print(ptb.categories())  # doctest: +SKIP
# print(ptb.fileids('news'))  # doctest: +SKIP
# print(ptb.words(categories=['humor', 'fiction']))  # doctest: +SKIP
# nltk.download('sinica_treebank')
print(sinica_treebank.sents())  # doctest: +SKIP
print(sinica_treebank.parsed_sents()[25])  # doctest: +SKIP
# nltk.download('conll2007')
print(conll2007.sents('esp.train')[0])  # doctest: +SKIP
print(conll2007.parsed_sents('esp.train')[0])  # doctest: +SKIP
print(conll2007.parsed_sents('esp.train')[0].tree())  # doctest: +SKIP
# for tree in ycoe.parsed_sents('cocuraC')[:4]:
#     print(tree)  # doctest: +SKIP
# word lists and lexicons
print(words.fileids())
print(words.words('en'))  # doctest: +ELLIPSIS
print(stopwords.fileids())  # doctest: +ELLIPSIS
print(stopwords.words('portuguese'))  # doctest: +ELLIPSIS
# nltk.download('names')
print(names.fileids())
print(names.words('male.txt'))  # doctest: +ELLIPSIS
print(names.words('female.txt'))  # doctest: +ELLIPSIS
# NOTE(review): this line was flattened by extraction. Everything after the
# first "##" is now dead comment text: it contains the tail of a scoring
# function (its `def` line is not in this chunk) followed by a PCFG
# train/test script that is itself truncated mid-loop. Left byte-identical;
# recover the multi-line layout from the original source before editing.
b= "".join(s) ## print "B",b f.write('Corpus: '+str(b)+'\n') r1= tree2set(str(a)) ## print "Tree A: "+str(r1) r2= tree2set(str(b)) ## print "Tree B: "+str(r2) return lp_lr(r2,r1)#parseval(r2,r1), labeled_recall(r2,r1),lp_lr(r2,r1) ## ## TRAIN + TEST 1000 ## size= 1000 frases= sinica.sents() arboles= sinica.parsed_sents() train= pcfg(size) train.carga_pesos() with open('gramatica1000total.txt','r') as g: gramatica=g.readlines() train.carga_gramatica(gramatica) F1= 0 f= open('t1000.txt', 'w') ##f= open('Knownwords.txt', 'w') for i in range(size): ## print i f.write(str(i)+'\n') ## print '\n\n'