def header(self, fileids=None, **kwargs):
    """
    Returns header(s) of specified fileids.
    """
    return concat(
        [
            self._view(
                self.add_root(fileid), mode=NKJPCorpusReader.HEADER_MODE, **kwargs
            ).handle_query()
            for fileid in fileids
        ]
    )

def sents(self, fileids=None, **kwargs):
    """
    Returns sentences in specified fileids.
    """
    return concat(
        [
            self._view(
                self.add_root(fileid), mode=NKJPCorpusReader.SENTS_MODE, **kwargs
            ).handle_query()
            for fileid in fileids
        ]
    )

def words(self, fileids=None, **kwargs):
    return concat(
        [
            self._view(fileid, tags=False, **kwargs)
            for fileid in self._list_morph_files(fileids)
        ]
    )

def tagged_paras(self, fileids=None, **kwargs):
    return concat(
        [
            self._view(fileid, mode=IPIPANCorpusView.PARAS_MODE, **kwargs)
            for fileid in self._list_morph_files(fileids)
        ]
    )

def raw(self, fileids=None, **kwargs):
    """
    Returns raw text of specified fileids.
    """
    return concat(
        [
            self._view(
                self.add_root(fileid), mode=NKJPCorpusReader.RAW_MODE, **kwargs
            ).handle_query()
            for fileid in fileids
        ]
    )

def parsed_sents2(self, fileids=None):
    return concat(
        [
            JapaneseCorpusView(
                fileid,
                enc,
                False,
                False,
                False,
                True,
                self._syntax_parser,
                self._word_tokenizer,
                self._sent_tokenizer,
                self._case_parser,
            )
            for (fileid, enc) in self.abspaths(fileids, True)
        ]
    )

def raw(self, fileids=None):
    """
    :return: the given file(s) as a single string.
    :rtype: str
    """
    if fileids is None:
        fileids = self._fileids
    elif isinstance(fileids, str):
        fileids = [fileids]
    return concat([self.open(f).read() for f in fileids])

def fixed_parsed_sents(self, fileids=None, top_label="root"):
    from nltk.corpus.reader.util import concat
    from nltk.corpus.reader.dependency import DependencyCorpusView
    from nltk.parse import DependencyGraph

    sents = concat(
        [
            DependencyCorpusView(fileid, False, True, True, encoding=enc)
            for fileid, enc in self.abspaths(fileids, include_encoding=True)
        ]
    )
    return [
        DependencyGraph(sent, top_relation_label=top_label, cell_separator="\t")
        for sent in sents
    ]

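# Hedged usage sketch (not from the original source): fixed_parsed_sents can be
# called as a plain function on an NLTK DependencyCorpusReader instance; the
# corpus root and file pattern below are placeholders.
from nltk.corpus.reader.dependency import DependencyCorpusReader

reader = DependencyCorpusReader("corpora/deps", r".*\.dp")  # placeholder path
graphs = fixed_parsed_sents(reader, top_label="ROOT")
print(graphs[0].tree())  # a DependencyGraph renders as an nltk Tree
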
def tagged_words(self, fileids=None, **kwargs):
    """
    Call with specified tags as a list, e.g. tags=['subst', 'comp'].
    Returns tagged words in specified fileids.
    """
    tags = kwargs.pop('tags', [])
    return concat(
        [
            self._view(
                self.add_root(fileid),
                mode=NKJPCorpusReader.WORDS_MODE,
                tags=tags,
                **kwargs
            ).handle_query()
            for fileid in fileids
        ]
    )

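# Hedged usage sketch for the NKJP accessors above, via NLTK's NKJPCorpusReader;
# the corpus must be installed manually, and the root below is a placeholder.
from nltk.corpus.reader import NKJPCorpusReader

x = NKJPCorpusReader(root="/home/USER/nltk_data/corpora/nkjp/", fileids="")
ids = x.fileids()
x.header(fileids=ids)
x.sents(fileids=ids)
x.tagged_words(fileids=ids, tags=['subst', 'comp'])
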
def raw(self, fileids=None):
    """
    Return the corpora in their raw form.
    """
    if fileids is None:
        fileids = self._fileids
    elif isinstance(fileids, string_types):
        fileids = [fileids]
    return concat([self.open(f).read() for f in fileids])

def _views(self, fileids=None, sent=False, tag=False, strip_space=True, stem=False):
    """A helper function that instantiates BNCWordViews or the list of
    words/sentences."""
    f = BNCWordView if self._lazy else self._words
    return concat(
        [
            f(fileid, sent, tag, strip_space, stem)
            for fileid in self.abspaths(fileids)
        ]
    )

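# Hedged sketch (not the verbatim source) of how a public BNC accessor
# plausibly delegates to the _views helper above:
def words(self, fileids=None, strip_space=True, stem=False):
    # Words only: no sentence grouping, no POS tags.
    return self._views(
        fileids, sent=False, tag=False, strip_space=strip_space, stem=stem
    )
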
def sents(self, fileids=None, **kwargs):
    return concat(
        [
            self._view(fileid, mode=IPIPANCorpusView.SENTS_MODE, tags=False, **kwargs)
            for fileid in self._list_morph_files(fileids)
        ]
    )

def aligned_sents(self, fileids=None):
    """
    :return: the given file(s) as a list of AlignedSent objects.
    :rtype: list of C{AlignedSent}
    """
    return concat(
        [
            AlignedSentCorpusView(
                fileid,
                enc,
                True,
                True,
                self._word_tokenizer,
                self._sent_tokenizer,
                self._alignedsent_block_reader,
            )
            for (fileid, enc) in self.abspaths(fileids, True)
        ]
    )

def parsed_docs(self, fileids=None):
    """
    @return: A list of parsed corpus documents.
    @rtype: C{list} of C{StreamBackedCorpusView}
    @param fileids: A list of corpus files.
    @type fileids: C{list} of C{str} or regular expression
    """
    return concat(
        [
            StreamBackedCorpusView(fileid, self._read_parsed_block, encoding=enc)
            for (fileid, enc) in self.abspaths(fileids, True)
        ]
    )

def words(self, fileids=None):
    """
    @return: the given file(s) as a list of words and punctuation symbols.
    @rtype: C{list} of C{str}
    """
    return concat(
        [
            self._alignedsent_corpus_view(
                fileid,
                enc,
                False,
                False,
                self._word_tokenizer,
                self._sent_tokenizer,
                self._alignedsent_block_reader,
            )
            for (fileid, enc) in self.abspaths(fileids, True)
        ]
    )

def words(self, fileids=None):
    """
    :return: the given file(s) as a list of words and punctuation symbols.
    :rtype: list of str
    """
    return concat(
        [
            AlignedSentCorpusView(
                fileid,
                enc,
                False,
                False,
                self._word_tokenizer,
                self._sent_tokenizer,
                self._alignedsent_block_reader,
            )
            for (fileid, enc) in self.abspaths(fileids, True)
        ]
    )

def docs(self, fileids=None):
    """
    Returns the full Tweet objects, as specified by `Twitter documentation on
    Tweets <https://dev.twitter.com/docs/platform-objects/tweets>`_

    :return: the given file(s) as a list of dictionaries deserialised from JSON.
    :rtype: list(dict)
    """
    return concat(
        [
            self.CorpusView(path, self._read_tweets, encoding=enc)
            for (path, enc, fileid) in self.abspaths(fileids, True, True)
        ]
    )

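# Hedged usage sketch with NLTK's bundled twitter_samples corpus, whose reader
# provides this docs() method (requires nltk.download("twitter_samples")):
from nltk.corpus import twitter_samples

tweets = twitter_samples.docs("positive_tweets.json")
print(tweets[0]["text"])  # each item is a deserialised Tweet dict
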
def sents(self, fileids=None):
    """
    :return: the given file(s) as a list of sentences or utterances,
        each encoded as a list of word strings.
    :rtype: list of (list of str)
    """
    return concat(
        [
            AlignedSentCorpusView(
                fileid,
                enc,
                False,
                True,
                self._word_tokenizer,
                self._sent_tokenizer,
                self._alignedsent_block_reader,
            )
            for (fileid, enc) in self.abspaths(fileids, True)
        ]
    )

def raw(self, fileids=None):
    """
    @return: A list of corpus file contents.
    @rtype: C{list} of C{str}
    @param fileids: A list of corpus files.
    @type fileids: C{list} of C{str} or regular expression
    """
    if fileids is None:
        fileids = self._fileids
    elif isinstance(fileids, str):
        fileids = [fileids]
    return concat([self.open(f).read() for f in fileids])

def sents(self, fileids=None):
    """
    @return: the given file(s) as a list of sentences or utterances,
        each encoded as a list of word strings.
    @rtype: C{list} of (C{list} of C{str})
    """
    return concat(
        [
            self._alignedsent_corpus_view(
                fileid,
                enc,
                False,
                True,
                self._word_tokenizer,
                self._sent_tokenizer,
                self._alignedsent_block_reader,
            )
            for (fileid, enc) in self.abspaths(fileids, True)
        ]
    )

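# Hedged usage sketch: NLTK's comtrans corpus reader exposes aligned-sentence
# accessors like those above (requires nltk.download("comtrans")):
from nltk.corpus import comtrans

pair = comtrans.aligned_sents("alignment-en-fr.txt")[0]
print(pair.words, pair.mots)  # source-side and target-side token lists
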
def lemmatized_sents(corpus, fileids=None):
    """
    Returns trees whose leaves are (word, lemma) pairs.
    """
    from nltk import tree
    from nltk.util import LazyMap
    from nltk.corpus.reader.util import concat

    def lemmatized(element):
        if element:
            subtrees = map(lemmatized, element)
            subtrees = [t for t in subtrees if t]
            return tree.Tree(element.tag, subtrees)
        elif element.get('elliptic') == 'yes':
            return None
        else:
            return tree.Tree(
                element.get('pos') or element.get('ne') or 'unk',
                [(element.get('wd'), element.get('lem'))],
            )

    if not fileids:
        fileids = corpus.xmlreader.fileids()
    return LazyMap(
        lemmatized,
        concat([list(corpus.xmlreader.xml(fileid)) for fileid in fileids]),
    )

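# Hedged usage sketch (names hypothetical): assumes an AnCora-style corpus
# object that exposes an XML reader as .xmlreader, as lemmatized_sents expects.
trees = lemmatized_sents(ancora)  # 'ancora' is a hypothetical corpus object
print(trees[0].leaves())  # e.g. [('gatos', 'gato'), ...]
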
def sents(self, fileids=None, speaker='ALL', sent=True, stem=False,
          relation=None, pos=False, strip_space=True, replace=False):
    """
    @return: the given file(s) as a list of sentences
    @rtype: C{list} of (C{list} of C{str})
    @param speaker: If specified, select specific speakers defined in the
        corpus. Default is 'ALL'. Common choices are 'CHI' (all children)
        and 'MOT' (mothers)
    @param stem: If true, then use word stems instead of word strings.
    @param relation: If true, then return tuples of C{(str,relation_list)}
    @param pos: If true, then return tuples of C{(stem, part_of_speech)}
    @param strip_space: If true, then strip trailing spaces from word
        tokens. Otherwise, leave the spaces on the tokens.
    @param replace: If true, then use the replaced word instead of the
        original word (e.g., 'wat' will be replaced with 'watch')
    """
    return concat(
        [
            self._get_words(fileid, speaker, sent, stem, relation, pos,
                            strip_space, replace)
            for fileid in self.abspaths(fileids)
        ]
    )

def words(self, fileids=None, speaker='ALL', sent=None, stem=False,
          relation=False, pos=False, strip_space=True, replace=False):
    """
    :return: the given file(s) as a list of words
    :rtype: list(str)
    :param speaker: If list is specified, select specific speakers defined
        in the corpus. Default is 'ALL' (all participants). Common choices
        are ['CHI'] (all children), ['MOT'] (mothers), ['CHI','MOT']
        (exclude researchers)
    :param stem: If true, then use word stems instead of word strings.
    :param relation: If true, then return tuples of
        (stem, index, dependent_index)
    :param strip_space: If true, then strip trailing spaces from word
        tokens. Otherwise, leave the spaces on the tokens.
    :param replace: If true, then use the replaced word instead of the
        original word (e.g., 'wat' will be replaced with 'watch')
    """
    return concat(
        [
            self._get_words(fileid, speaker, sent, stem, relation, pos,
                            strip_space, replace)
            for fileid in self.abspaths(fileids)
        ]
    )

def tagged_sents(self, fileids=None, speaker='ALL', sent=True, stem=False,
                 relation=None, pos=True, strip_space=True, replace=False):
    """
    :return: the given file(s) as a list of sentences, each encoded as a
        list of ``(word,tag)`` tuples.
    :rtype: list(list(tuple(str,str)))
    :param speaker: If list is specified, select specific speakers defined
        in the corpus. Default is 'ALL' (all participants). Common choices
        are ['CHI'] (all children), ['MOT'] (mothers), ['CHI','MOT']
        (exclude researchers)
    :param stem: If true, then use word stems instead of word strings.
    :param relation: If true, then return tuples of
        ``(str,pos,relation_list)``. If there is manually-annotated
        relation info, it will return tuples of tuples of
        ``(str,pos,test_relation_list,str,pos,gold_relation_list)``
    :param strip_space: If true, then strip trailing spaces from word
        tokens. Otherwise, leave the spaces on the tokens.
    :param replace: If true, then use the replaced word instead of the
        original word (e.g., 'wat' will be replaced with 'watch')
    """
    return concat(
        [
            self._get_words(fileid, speaker, sent, stem, relation, pos,
                            strip_space, replace)
            for fileid in self.abspaths(fileids)
        ]
    )

def tagged_words(self, fileids=None, speaker='ALL', stem=False,
                 relation=False, strip_space=True, replace=False):
    """
    :return: the given file(s) as a list of tagged words and punctuation
        symbols, encoded as tuples ``(word,tag)``.
    :rtype: list(tuple(str,str))
    :param speaker: If specified, select specific speaker(s) defined in
        the corpus. Default is 'ALL' (all participants). Common choices
        are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
        researchers)
    :param stem: If true, then use word stems instead of word strings.
    :param relation: If true, then return tuples of
        (stem, index, dependent_index)
    :param strip_space: If true, then strip trailing spaces from word
        tokens. Otherwise, leave the spaces on the tokens.
    :param replace: If true, then use the replaced (intended) word instead
        of the original word (e.g., 'wat' will be replaced with 'watch')
    """
    sent = None
    pos = True
    return concat(
        [
            self._get_words(fileid, speaker, sent, stem, relation, pos,
                            strip_space, replace)
            for fileid in self.abspaths(fileids)
        ]
    )

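# Hedged usage sketch with NLTK's CHILDESCorpusReader, which defines accessors
# like those above; the XML corpus must be downloaded manually, and the root
# and fileid pattern below are placeholders.
from nltk.corpus.reader import CHILDESCorpusReader

corpus_root = "corpora/childes/data-xml/Eng-USA/"  # placeholder path
valian = CHILDESCorpusReader(corpus_root, "Valian/.*.xml")
print(valian.words(speaker=['CHI'])[:10])
print(valian.tagged_sents()[0])
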
def tagged_sents(self, fileids=None):
    if not fileids:
        fileids = self.xmlreader.fileids()
    return LazyMap(
        tagged, concat([list(self.xmlreader.xml(fileid)) for fileid in fileids])
    )

def stemmed_words(self, fileids=None):
    return {
        t[0].lower(): t[1].lower() for t in concat(self.stemmed_sents(fileids))
    }

def freqs(self, fileids=None):
    '''
    Return trigram frequencies for a language from the corpus
    '''
    return concat(
        [
            self.CorpusView(path, self._read_trigram_block)
            for path in self.abspaths(fileids=fileids)
        ]
    )

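# Hedged usage sketch: this freqs() signature is file-based, but NLTK's bundled
# Crubadan reader is more commonly queried per language (requires
# nltk.download("crubadan")):
from nltk.corpus import crubadan

fd = crubadan.lang_freq("en")  # FreqDist of character trigrams for English
print(fd.most_common(5))
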
def tagged_words(self, fileids=None):
    # XXX: use LazyConcatenation?
    return concat(self.tagged_sents(fileids))

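# A hedged sketch of the lazy variant the XXX note above contemplates, using
# nltk's LazyConcatenation; the name tagged_words_lazy is ours, not the source's.
from nltk.util import LazyConcatenation

def tagged_words_lazy(self, fileids=None):
    # Flattens on demand instead of materialising every tagged sentence.
    return LazyConcatenation(self.tagged_sents(fileids))
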
def sents(self, fileids=None):
    # FIXME: not lazy!
    if not fileids:
        fileids = self.xmlreader.fileids()
    return LazyMap(
        untagged, concat([list(self.xmlreader.xml(fileid)) for fileid in fileids])
    )

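# The tagged/untagged callables used above are module-level helpers not shown
# here; a hedged sketch of what they plausibly look like for AnCora-style XML,
# mirroring the wd/pos/ne attributes used by lemmatized_sents earlier:
def untagged(element):
    # Bare word strings from the leaves of a sentence element.
    return [leaf.get('wd') for leaf in element.iter() if leaf.get('wd') is not None]

def tagged(element):
    # (word, pos) pairs, falling back to a named-entity label or 'unk'.
    return [
        (leaf.get('wd'), leaf.get('pos') or leaf.get('ne') or 'unk')
        for leaf in element.iter()
        if leaf.get('wd') is not None
    ]
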
def tagged_words(self, fileids=None, **kwargs):
    return concat(
        [self._view(fileid, **kwargs) for fileid in self._list_morph_files(fileids)]
    )

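# Hedged usage sketch for the IPI PAN accessors above, via NLTK's
# IPIPANCorpusReader; the corpus must be installed locally, and the root and
# fileid pattern below are placeholders.
from nltk.corpus.reader import IPIPANCorpusReader

ipipan = IPIPANCorpusReader("corpora/ipipan", r".*\.xml")  # placeholder root
print(ipipan.words()[:10])
print(ipipan.tagged_words()[:5])
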