def test_parsed_sents(self):
    """Check that the first CoNLL-2007 Spanish training sentence parses to the expected dependency tree."""
    # Grab the first parsed sentence of the Spanish training section.
    first_sent = conll2007.parsed_sents("esp.train")[0]

    # The expected dependency tree, rooted at the main verb "fortaleció".
    expected = Tree(
        "fortaleció",
        [
            # Subject: "El aumento del índice de desempleo estadounidense"
            Tree(
                "aumento",
                [
                    "El",
                    Tree(
                        "del",
                        [Tree("índice", [Tree("de", [Tree("desempleo", ["estadounidense"])])])],
                    ),
                ],
            ),
            "hoy",
            "considerablemente",
            # Object phrase headed by "al euro", with a relative clause under "cotizaba".
            Tree(
                "al",
                [
                    Tree(
                        "euro",
                        [
                            Tree(
                                "cotizaba",
                                [
                                    ",",
                                    "que",
                                    Tree("a", [Tree("15.35", ["las", "GMT"])]),
                                    "se",
                                    Tree(
                                        "en",
                                        [
                                            Tree(
                                                "mercado",
                                                [
                                                    "el",
                                                    Tree("de", ["divisas"]),
                                                    Tree("de", ["Fráncfort"]),
                                                ],
                                            )
                                        ],
                                    ),
                                    Tree("a", ["0,9452_dólares"]),
                                    Tree(
                                        "frente_a",
                                        [
                                            ",",
                                            Tree(
                                                "0,9349_dólares",
                                                [
                                                    "los",
                                                    Tree("de", [Tree("mañana", ["esta"])]),
                                                ],
                                            ),
                                        ],
                                    ),
                                ],
                            )
                        ],
                    )
                ],
            ),
            ".",
        ],
    )

    self.assertEqual(first_sent.tree(), expected)
def test_parsed_sents(self):
    """The first parsed sentence of the CoNLL-2007 Spanish training data must match the known tree."""
    parsed = conll2007.parsed_sents('esp.train')[0]

    # Build the reference tree bottom-up for readability; the sentence is
    # rooted at the finite verb 'fortaleció'.
    subject = Tree('aumento', [
        'El',
        Tree('del', [Tree('índice', [Tree('de', [Tree('desempleo', ['estadounidense'])])])]),
    ])

    # Relative clause attached under 'euro': "que se cotizaba ... frente a ..."
    relative_clause = Tree('cotizaba', [
        ',',
        'que',
        Tree('a', [Tree('15.35', ['las', 'GMT'])]),
        'se',
        Tree('en', [
            Tree('mercado', [
                'el',
                Tree('de', ['divisas']),
                Tree('de', ['Fráncfort']),
            ]),
        ]),
        Tree('a', ['0,9452_dólares']),
        Tree('frente_a', [
            ',',
            Tree('0,9349_dólares', [
                'los',
                Tree('de', [Tree('mañana', ['esta'])]),
            ]),
        ]),
    ])

    expected = Tree('fortaleció', [
        subject,
        'hoy',
        'considerablemente',
        Tree('al', [Tree('euro', [relative_clause])]),
        '.',
    ])

    self.assertEqual(parsed.tree(), expected)
def test_parsed_sents(self):
    """Verify the dependency tree of CoNLL-2007 Spanish training sentence 0."""
    # First sentence of the Spanish training split.
    sent = conll2007.parsed_sents('esp.train')[0]

    # Nested-tree literal for the whole expected parse, rooted at 'fortaleció'.
    gold = Tree('fortaleció', [
        Tree('aumento', [
            'El',
            Tree('del', [
                Tree('índice', [
                    Tree('de', [Tree('desempleo', ['estadounidense'])]),
                ]),
            ]),
        ]),
        'hoy',
        'considerablemente',
        Tree('al', [
            Tree('euro', [
                Tree('cotizaba', [
                    ',',
                    'que',
                    Tree('a', [Tree('15.35', ['las', 'GMT'])]),
                    'se',
                    Tree('en', [
                        Tree('mercado', [
                            'el',
                            Tree('de', ['divisas']),
                            Tree('de', ['Fráncfort']),
                        ]),
                    ]),
                    Tree('a', ['0,9452_dólares']),
                    Tree('frente_a', [
                        ',',
                        Tree('0,9349_dólares', [
                            'los',
                            Tree('de', [Tree('mañana', ['esta'])]),
                        ]),
                    ]),
                ]),
            ]),
        ]),
        '.',
    ])

    self.assertEqual(sent.tree(), gold)
def test_parsed_sents(self):
    """Parsed sentence 0 of "esp.train" should convert to the expected NLTK Tree."""
    tree = conll2007.parsed_sents("esp.train")[0].tree()

    # Expected parse, assembled from named sub-phrases for clarity.
    # NP subject: "El aumento del índice de desempleo estadounidense"
    np_subject = Tree(
        "aumento",
        ["El", Tree("del", [Tree("índice", [Tree("de", [Tree("desempleo", ["estadounidense"])])])])],
    )

    # PP modifiers inside the relative clause.
    pp_time = Tree("a", [Tree("15.35", ["las", "GMT"])])
    pp_market = Tree(
        "en",
        [Tree("mercado", ["el", Tree("de", ["divisas"]), Tree("de", ["Fráncfort"])])],
    )
    pp_rate = Tree("a", ["0,9452_dólares"])
    pp_versus = Tree(
        "frente_a",
        [",", Tree("0,9349_dólares", ["los", Tree("de", [Tree("mañana", ["esta"])])])],
    )

    rel_clause = Tree(
        "cotizaba",
        [",", "que", pp_time, "se", pp_market, pp_rate, pp_versus],
    )

    expected = Tree(
        "fortaleció",
        [
            np_subject,
            "hoy",
            "considerablemente",
            Tree("al", [Tree("euro", [rel_clause])]),
            ".",
        ],
    )

    self.assertEqual(tree, expected)
[0]) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE # nltk.download('ptb') print(ptb.fileids()) # doctest: +SKIP # download the corpus from here: https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/treebank.zip # then extract and place to the following location: .../nltk_data/corpora/ptb/ print(ptb.words('treebank/combined/wsj_0003.mrg')) # doctest: +SKIP print(ptb.tagged_words('treebank/combined/wsj_0003.mrg')) # doctest: +SKIP # print(ptb.categories()) # doctest: +SKIP # print(ptb.fileids('news')) # doctest: +SKIP # print(ptb.words(categories=['humor', 'fiction'])) # doctest: +SKIP # nltk.download('sinica_treebank') print(sinica_treebank.sents()) # doctest: +SKIP print(sinica_treebank.parsed_sents()[25]) # doctest: +SKIP # nltk.download('conll2007') print(conll2007.sents('esp.train')[0]) # doctest: +SKIP print(conll2007.parsed_sents('esp.train')[0]) # doctest: +SKIP print(conll2007.parsed_sents('esp.train')[0].tree()) # doctest: +SKIP # for tree in ycoe.parsed_sents('cocuraC')[:4]: # print(tree) # doctest: +SKIP # word lists and lexicons print(words.fileids()) print(words.words('en')) # doctest: +ELLIPSIS print(stopwords.fileids()) # doctest: +ELLIPSIS print(stopwords.words('portuguese')) # doctest: +ELLIPSIS # nltk.download('names') print(names.fileids()) print(names.words('male.txt')) # doctest: +ELLIPSIS print(names.words('female.txt')) # doctest: +ELLIPSIS # nltk.download('cmudict') print(cmudict.entries()[653:659]) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE # Load the entire cmudict corpus into a Python dictionary: