def demo():
    """Print a small sample of the KNB corpus to standard output.

    Locates ``corpora/knbc/corpus1`` with ``nltk.data.find``, builds a
    lazily loaded ``KNBCorpusReader`` (EUC-JP encoded) over the matching
    files and prints, in order: the first ten file ids, the first hundred
    words joined together, the first two parsed sentences, the same two
    parsed sentences after installing a custom ``morphs2str``, and the
    first two tagged sentences as ``word/field`` pairs.
    """
    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    root = nltk.data.find('corpora/knbc/corpus1')

    # Keep only the file ids that contain a "<d>-<d>-<digits>-<digits>" run.
    fileids = [
        f
        for f in find_corpus_fileids(FileSystemPathPointer(root), ".*")
        if re.search(r"\d\-\d\-[\d]+\-[\d]+", f)
    ]

    def _knbc_fileids_sort(x):
        # Sort on the text before the first '-' and then numerically on the
        # next three '-'-separated fields, so that e.g. 10 sorts after 9.
        cells = x.split('-')
        return (cells[0], int(cells[1]), int(cells[2]), int(cells[3]))

    knbc = LazyCorpusLoader(
        'knbc/corpus1',
        KNBCorpusReader,
        sorted(fileids, key=_knbc_fileids_sort),
        encoding='euc-jp',
    )

    print(knbc.fileids()[:10])
    print(''.join(knbc.words()[:100]))

    print('\n\n'.join('%s' % tree for tree in knbc.parsed_sents()[:2]))

    def _morphs2str(morphs):
        # Render every morph except the 'EOS' marker as "m[0](field)", where
        # field is the third space-separated item of m[1], joined by '/'.
        # The result is deliberately left as text: calling .encode('utf-8')
        # here returns bytes on Python 3.
        # NOTE(review): presumably the reader uses this value for the tree
        # nodes printed below, where bytes would show up as b'...' escapes.
        return '/'.join(
            "%s(%s)" % (m[0], m[1].split(' ')[2])
            for m in morphs
            if m[0] != 'EOS'
        )

    knbc.morphs2str = _morphs2str

    print('\n\n'.join('%s' % tree for tree in knbc.parsed_sents()[:2]))

    print(
        '\n'.join(
            ' '.join("%s/%s" % (w[0], w[1].split(' ')[2]) for w in sent)
            for sent in knbc.tagged_sents()[0:2]
        )
    )
def demo():
    """Print a small sample of the KNB corpus to standard output.

    The corpus is looked up at ``corpora/knbc/corpus1`` through
    ``nltk.data.find`` and read lazily with ``KNBCorpusReader`` using the
    EUC-JP encoding. The function then prints the first ten file ids, the
    first hundred words, the first two parsed sentences (once with the
    reader's own ``morphs2str`` and once with a custom one), and finally
    the first two tagged sentences.
    """
    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    root = nltk.data.find("corpora/knbc/corpus1")

    # Only file ids containing a "<d>-<d>-<digits>-<digits>" run are used.
    candidates = find_corpus_fileids(FileSystemPathPointer(root), ".*")
    fileids = [f for f in candidates if re.search(r"\d\-\d\-[\d]+\-[\d]+", f)]

    def _knbc_fileids_sort(x):
        # Key: text before the first "-", then the next three fields
        # compared as integers rather than as strings.
        cells = x.split("-")
        return (cells[0], int(cells[1]), int(cells[2]), int(cells[3]))

    knbc = LazyCorpusLoader(
        "knbc/corpus1",
        KNBCorpusReader,
        sorted(fileids, key=_knbc_fileids_sort),
        encoding="euc-jp",
    )

    # print() is called with a single argument throughout, which replaces
    # the Python 2 print statements that are a SyntaxError on Python 3.
    print(knbc.fileids()[:10])
    print("".join(knbc.words()[:100]))

    print("\n\n".join("%s" % tree for tree in knbc.parsed_sents()[:2]))

    def _morphs2str(morphs):
        # "m[0](third space-separated field of m[1])" for each morph other
        # than the "EOS" marker, joined with "/".
        # No .encode("utf-8") here: on Python 3 it would turn the text into
        # bytes.
        # NOTE(review): presumably the reader stores this value in the
        # trees printed below - confirm against KNBCorpusReader.
        parts = []
        for m in morphs:
            if m[0] != "EOS":
                parts.append("%s(%s)" % (m[0], m[1].split(" ")[2]))
        return "/".join(parts)

    knbc.morphs2str = _morphs2str

    print("\n\n".join("%s" % tree for tree in knbc.parsed_sents()[:2]))

    print(
        "\n".join(
            " ".join("%s/%s" % (w[0], w[1].split(" ")[2]) for w in sent)
            for sent in knbc.tagged_sents()[0:2]
        )
    )
def demo():
    """Print a small sample of the KNB corpus to standard output.

    Finds ``corpora/knbc/corpus1`` via ``nltk.data.find``, wraps the
    matching files in a lazily loaded ``KNBCorpusReader`` (EUC-JP) and
    prints the first ten file ids, the first hundred words, the first two
    parsed sentences before and after overriding ``morphs2str``, and the
    first two tagged sentences.
    """
    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    root = nltk.data.find('corpora/knbc/corpus1')

    # Discard anything whose id lacks a "<d>-<d>-<digits>-<digits>" run.
    fileids = [
        f
        for f in find_corpus_fileids(FileSystemPathPointer(root), ".*")
        if re.search(r"\d\-\d\-[\d]+\-[\d]+", f)
    ]

    def _knbc_fileids_sort(x):
        # The three fields after the first '-' are compared as integers;
        # the leading field is compared as text.
        cells = x.split('-')
        return (cells[0], int(cells[1]), int(cells[2]), int(cells[3]))

    ordered_fileids = sorted(fileids, key=_knbc_fileids_sort)
    knbc = LazyCorpusLoader(
        'knbc/corpus1', KNBCorpusReader, ordered_fileids, encoding='euc-jp'
    )

    print(knbc.fileids()[:10])
    print(''.join(knbc.words()[:100]))

    print('\n\n'.join(str(tree) for tree in knbc.parsed_sents()[:2]))

    def _morphs2str(morphs):
        # Join "m[0](field)" items with '/', where field is the third
        # space-separated item of m[1]; the 'EOS' marker is left out.
        # The string is returned as text instead of being passed through
        # .encode('utf-8'), which yields bytes on Python 3.
        # NOTE(review): presumably the reader puts this value into the
        # trees printed below, where bytes would print as b'...' escapes.
        return '/'.join(
            "%s(%s)" % (m[0], m[1].split(' ')[2])
            for m in morphs
            if m[0] != 'EOS'
        )

    knbc.morphs2str = _morphs2str

    # Same formatting as the first tree printout so the two can be compared.
    print('\n\n'.join(str(tree) for tree in knbc.parsed_sents()[:2]))

    tagged_lines = (
        ' '.join("%s/%s" % (w[0], w[1].split(' ')[2]) for w in sent)
        for sent in knbc.tagged_sents()[0:2]
    )
    print('\n'.join(tagged_lines))
def demo():
    """Print a small sample of the KNB corpus to standard output.

    Steps, in order:

    1. Locate ``corpora/knbc/corpus1`` with ``nltk.data.find`` and collect
       the file ids that match the expected naming pattern.
    2. Build a lazily loaded ``KNBCorpusReader`` (EUC-JP) over those files.
    3. Print the first ten file ids and the first hundred words.
    4. Print the first two parsed sentences, install a custom
       ``morphs2str`` on the reader, and print them again.
    5. Print the first two tagged sentences as ``word/field`` pairs.
    """
    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    root = nltk.data.find("corpora/knbc/corpus1")

    # File ids of interest contain a "<d>-<d>-<digits>-<digits>" run.
    fileids = [
        f
        for f in find_corpus_fileids(FileSystemPathPointer(root), ".*")
        if re.search(r"\d\-\d\-[\d]+\-[\d]+", f)
    ]

    def _knbc_fileids_sort(x):
        # Text prefix first, then the following three fields as integers
        # so that numeric order is used instead of lexicographic order.
        cells = x.split("-")
        return (cells[0], int(cells[1]), int(cells[2]), int(cells[3]))

    knbc = LazyCorpusLoader(
        "knbc/corpus1",
        KNBCorpusReader,
        sorted(fileids, key=_knbc_fileids_sort),
        encoding="euc-jp",
    )

    print(knbc.fileids()[:10])
    print("".join(knbc.words()[:100]))

    print("\n\n".join(str(tree) for tree in knbc.parsed_sents()[:2]))

    def _morphs2str(morphs):
        # Format each morph except the "EOS" marker as "m[0](field)", with
        # field being the third space-separated item of m[1].
        # The joined text is returned as str; the former .encode("utf-8")
        # produced bytes on Python 3.
        # NOTE(review): presumably the reader uses this value for the tree
        # nodes printed below - confirm against KNBCorpusReader.
        return "/".join(
            "{}({})".format(m[0], m[1].split(" ")[2])
            for m in morphs
            if m[0] != "EOS"
        )

    knbc.morphs2str = _morphs2str

    print("\n\n".join("%s" % tree for tree in knbc.parsed_sents()[:2]))

    print(
        "\n".join(
            " ".join("{}/{}".format(w[0], w[1].split(" ")[2]) for w in sent)
            for sent in knbc.tagged_sents()[0:2]
        )
    )