def __init__( self, root, fileids, sep="/", word_tokenizer=WhitespaceTokenizer(), sent_tokenizer=RegexpTokenizer("\n", gaps=True), alignedsent_block_reader=read_alignedsent_block, encoding="latin1", ): """ Construct a new Aligned Corpus reader for a set of documents located at the given root directory. Example usage: >>> root = '/...path to corpus.../' >>> reader = AlignedCorpusReader(root, '.*', '.txt') # doctest: +SKIP :param root: The root directory for this corpus. :param fileids: A list or regexp specifying the fileids in this corpus. """ CorpusReader.__init__(self, root, fileids, encoding) self._sep = sep self._word_tokenizer = word_tokenizer self._sent_tokenizer = sent_tokenizer self._alignedsent_block_reader = alignedsent_block_reader
def __init__(self, root, fileids=None, encoding='utf8', skip_keywords=None,
             target_language=None, paragraph_separator='\n\n', **kwargs):
    """
    :param root: The file root of the corpus directory
    :param fileids: the list of file ids to consider, or wildcard expression
    :param skip_keywords: a list of words which indicate whole paragraphs that
     should be skipped by the paras() and words() methods
    :param target_language: which files to select; sometimes a corpus contains
     English translations, which we expect to be named ...english.json -- if
     not, pass in fileids
    :param paragraph_separator: character sequence demarcating paragraph
     separation
    :param encoding: character encoding of the corpus files; utf8 by default
    :param kwargs: Any values to be passed to NLTK super classes, such as
     sent_tokenizer, word_tokenizer.
    """
    if not target_language:
        target_language = ''
    if not fileids:
        fileids = r'.*{}\.json'.format(target_language)

    # Initialize the NLTK corpus reader objects
    CorpusReader.__init__(self, root, fileids, encoding)

    if 'sent_tokenizer' in kwargs:
        self._sent_tokenizer = kwargs['sent_tokenizer']
    if 'word_tokenizer' in kwargs:
        self._word_tokenizer = kwargs['word_tokenizer']

    self.skip_keywords = skip_keywords
    self.paragraph_separator = paragraph_separator
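# Hedged usage sketch; the class name JsonCorpusReader is an assumption for
# whichever reader wraps this __init__:
#     >>> reader = JsonCorpusReader('/path/to/corpus', target_language='english')  # doctest: +SKIP
#     >>> reader.fileids()  # only files matching r'.*english\.json'  # doctest: +SKIP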
def __init__(self, root, fileids, encoding='utf8', morphs2str=_morphs2str_default):
    """
    Initialize KNBCorpusReader.

    morphs2str is a function that converts a list of morphs into a
    string, used when building tree representations in _parse().
    """
    CorpusReader.__init__(self, root, fileids, encoding)
    self.morphs2str = morphs2str
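# Sketch of how NLTK wires this reader up for the KNB corpus (requires
# nltk.download('knbc'); 'corpus1' and the fileid regex follow NLTK's demo):
#     >>> from nltk.corpus.util import LazyCorpusLoader  # doctest: +SKIP
#     >>> from nltk.corpus.reader.knbc import KNBCorpusReader  # doctest: +SKIP
#     >>> knbc = LazyCorpusLoader('knbc/corpus1', KNBCorpusReader, r'.*/KN.*', encoding='euc-jp')  # doctest: +SKIP
#     >>> knbc.words()[:5]  # doctest: +SKIP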
def __init__(self, root, fileids=DOC_PATTERN, **kwargs):
    """
    Initialize the corpus reader. Categorization arguments
    (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
    the ``CategorizedCorpusReader`` constructor. The remaining
    arguments are passed to the ``CorpusReader`` constructor.
    """
    CategorizedCorpusReader.__init__(self, kwargs)
    CorpusReader.__init__(self, root, fileids)
def __init__(self, root, fileids, tone, tag, wrap_etree=False):
    self._wrap_etree = wrap_etree
    # Do not assign to self.fileids here: that would shadow the inherited
    # CorpusReader.fileids() method. CorpusReader.__init__ stores the list
    # as self._fileids.
    CorpusReader.__init__(self, root, fileids)
    self.tagged_sents = []
    self.sents = []
    self.words = []
    self.tagged_words = []
    self.option_tone = tone
    self.option_tag = tag
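# Hypothetical construction sketch; the class name and option values are
# assumptions (tone/tag typically toggle tone marking and POS-tag output):
#     >>> reader = ToneTagCorpusReader('/path/to/corpus', r'.*\.xml', tone='on', tag='pos')  # doctest: +SKIP
#     >>> reader.tagged_sents  # populated by the reader's parsing methods  # doctest: +SKIP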
def __init__(self, root, fileids, encoding='utf8', morphs2str=_morphs2str_default):
    """
    Initialize KNBCorpusReader.

    morphs2str is a function that converts a list of morphs into a
    string, used when building tree representations in _parse().
    """
    # FIXME: Why does this inherit from SyntaxCorpusReader but initialize
    # from CorpusReader?
    CorpusReader.__init__(self, root, fileids, encoding)
    self.morphs2str = morphs2str
def __init__(self, root, fileids, syntax_parser=CaboChaParser(),
             word_tokenizer=MeCabTokenizer(),
             sent_tokenizer=jp_sent_tokenizer,
             case_parser=KNPParser(), encoding='utf-8'):
    CorpusReader.__init__(self, root, fileids, encoding)
    self._syntax_parser = syntax_parser
    self._word_tokenizer = word_tokenizer
    self._sent_tokenizer = sent_tokenizer
    self._case_parser = case_parser
def __init__(self, root, zipfile, fileids):
    if isinstance(root, str):
        root = FileSystemPathPointer(root)
    elif not isinstance(root, PathPointer):
        raise TypeError('CorpusReader: expected a string or a PathPointer')

    # Convert to a ZipFilePathPointer rooted at the zip archive.
    root = ZipFilePathPointer(root.join(zipfile))

    CorpusReader.__init__(self, root, fileids)
    self._parse_char_replacements()
def __init__(self, root, fileids=PKL_PATTERN, **kwargs):
    """
    Initialize the corpus reader. Categorization arguments
    (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
    the ``CategorizedCorpusReader`` constructor. The remaining
    arguments are passed to the ``CorpusReader`` constructor.
    """
    # Add the default category pattern if not passed into the class.
    if not any(key.startswith('cat_') for key in kwargs):
        kwargs['cat_pattern'] = CAT_PATTERN

    CategorizedCorpusReader.__init__(self, kwargs)
    CorpusReader.__init__(self, root, fileids)
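# Illustrative sketch; PickledCorpusReader is an assumed class name, and
# PKL_PATTERN / CAT_PATTERN are module-level constants defined elsewhere:
#     >>> corpus = PickledCorpusReader('/path/to/pickled/corpus')  # doctest: +SKIP
#     >>> corpus.categories()  # derived from CAT_PATTERN  # doctest: +SKIP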
def __init__(self, root, fileids, sep='/',
             word_tokenizer=WhitespaceTokenizer(),
             sent_tokenizer=RegexpTokenizer('\n', gaps=True),
             encoding=None):
    """
    @param root: The root directory for this corpus.
    @param fileids: A list or regexp specifying the fileids in this corpus.
    """
    CorpusReader.__init__(self, root, fileids, encoding)
    self._sep = sep
    self._word_tokenizer = word_tokenizer
    self._sent_tokenizer = sent_tokenizer
    self._alignedsent_block_reader = None
    self._alignedsent_corpus_view = None
def __init__(self, root, fileids=PKL_PATTERN, **kwargs):
    """
    Initialize the corpus reader. Categorization arguments
    (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
    the ``CategorizedCorpusReader`` constructor. The remaining
    arguments are passed to the ``CorpusReader`` constructor.
    """
    # Add the default category pattern if not passed into the class.
    if not any(key.startswith('cat_') for key in kwargs):
        kwargs['cat_pattern'] = CAT_PATTERN

    CategorizedCorpusReader.__init__(self, kwargs)
    CorpusReader.__init__(self, root, fileids)

    self._word_tokenizer = WordPunctTokenizer()
    self._sent_tokenizer = nltk.data.LazyLoader(
        'tokenizers/punkt/english.pickle')
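# Design note: nltk.data.LazyLoader defers reading the punkt model until the
# tokenizer is first used, so constructing the reader stays cheap. A quick
# check (assuming nltk.download('punkt') has been run):
#     >>> reader._sent_tokenizer.tokenize('One sentence. Another one.')  # doctest: +SKIP
#     ['One sentence.', 'Another one.']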
def __init__(self, root, fileids=DOC_PATTERN, encoding='utf8',
             tags=TAGS, **kwargs):
    """
    Initialize the corpus reader. Categorization arguments
    (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
    the ``CategorizedCorpusReader`` constructor. The remaining
    arguments are passed to the ``CorpusReader`` constructor.
    """
    # Add the default category pattern if not passed into the class.
    if not any(key.startswith('cat_') for key in kwargs):
        kwargs['cat_pattern'] = CAT_PATTERN

    # Initialize the NLTK corpus reader objects
    CategorizedCorpusReader.__init__(self, kwargs)
    CorpusReader.__init__(self, root, fileids, encoding)

    # Save the tags that we specifically want to extract.
    self.tags = tags
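# Illustrative sketch; the class name and TAGS default are assumptions (TAGS
# is typically the list of HTML tags worth extracting, e.g. headings and
# paragraphs):
#     >>> corpus = HTMLCorpusReader('/path/to/corpus', tags=['p', 'li'])  # doctest: +SKIP
#     >>> corpus.tags  # doctest: +SKIP
#     ['p', 'li']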
def assemble_corpus(corpus_reader: CorpusReader,
                    types_requested: List[str],
                    type_dirs: Dict[str, List[str]] = None,
                    type_files: Dict[str, List[str]] = None) -> CorpusReader:
    """
    Create a filtered corpus.
    :param corpus_reader: the reader to filter; it gets mutated
    :param types_requested: a list of string types, which are to be found in
    the type_dirs and type_files mappings
    :param type_dirs: a dict of corpus types to directories
    :param type_files: a dict of corpus types to files
    :return: a CorpusReader object containing only the mappings desired
    """
    try:
        all_file_ids = list(corpus_reader.fileids())
        clean_ids_types = []  # type: List[Tuple[str, str]]

        if type_files:
            for key, valuelist in type_files.items():
                if key in types_requested:
                    for value in valuelist:
                        if value in all_file_ids and key:
                            clean_ids_types.append((value, key))

        if type_dirs:
            for key, valuelist in type_dirs.items():
                if key in types_requested:
                    for value in valuelist:
                        corrected_dir = value.replace('./', '')
                        corrected_dir = '{}/'.format(corrected_dir)
                        for name in all_file_ids:
                            if name and name.startswith(corrected_dir):
                                clean_ids_types.append((name, key))

        clean_ids_types.sort(key=lambda x: x[0])
        fileid_names, categories = zip(*clean_ids_types)  # type: ignore
        corpus_reader._fileids = fileid_names
        return corpus_reader
    except Exception:
        LOG.exception('failure in corpus building')
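# Hedged usage sketch (the corpus name and directory names are assumptions):
#     >>> reader = get_corpus_reader(corpus_name='latin_text_latin_library', language='latin')  # doctest: +SKIP
#     >>> poetry = assemble_corpus(reader, types_requested=['poetry'],
#     ...                          type_dirs={'poetry': ['./vergil', './ovid']})  # doctest: +SKIP
#     >>> len(list(poetry.fileids()))  # doctest: +SKIP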
def __init__(self, root, fileids=None, encoding='utf8', skip_keywords=None, **kwargs):
    """
    :param root: The file root of the corpus directory
    :param fileids: the list of file ids to consider, or wildcard expression
    :param skip_keywords: a list of words which indicate whole paragraphs that
     should be skipped by the paras() and words() methods
    :param encoding: character encoding of the corpus files; utf8 by default
    :param kwargs: Any values to be passed to NLTK super classes, such as
     sent_tokenizer, word_tokenizer.
    """
    if not fileids:
        fileids = r'.*\.txt'

    # Initialize the NLTK corpus reader objects. PlaintextCorpusReader
    # calls CorpusReader.__init__ itself, so a single call suffices;
    # encoding must be passed by keyword, since the third positional
    # parameter is word_tokenizer.
    PlaintextCorpusReader.__init__(self, root, fileids, encoding=encoding)

    if 'sent_tokenizer' in kwargs:
        self._sent_tokenizer = kwargs['sent_tokenizer']
    if 'word_tokenizer' in kwargs:
        self._word_tokenizer = kwargs['word_tokenizer']

    self.skip_keywords = skip_keywords
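# Hedged sketch; the class name is an assumption for a plaintext reader built
# on this __init__. skip_keywords marks paragraphs to drop during iteration:
#     >>> reader = FilteredPlaintextCorpusReader('/path/to/corpus', skip_keywords=['COPYRIGHT'])  # doctest: +SKIP
#     >>> paras = reader.paras()  # paragraphs containing 'COPYRIGHT' are skipped  # doctest: +SKIP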
def __init__(self, root, fileids=None,
             word_tokenizer=TweetTokenizer(),
             encoding='utf8'):
    """
    :param root: The root directory for this corpus.
    :param fileids: A list or regexp specifying the fileids in this corpus.
    :param word_tokenizer: Tokenizer for breaking the text of Tweets into
        smaller units, including but not limited to words.
    """
    CorpusReader.__init__(self, root, fileids, encoding)

    # Check that all user-created corpus files are non-empty.
    for path in self.abspaths(self._fileids):
        if isinstance(path, ZipFilePathPointer):
            pass
        elif os.path.getsize(path) == 0:
            raise ValueError("File {} is empty".format(path))

    self._word_tokenizer = word_tokenizer
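# A concrete use via NLTK's bundled sample corpus (requires
# nltk.download('twitter_samples'); twitter_samples is built on this reader):
#     >>> from nltk.corpus import twitter_samples  # doctest: +SKIP
#     >>> twitter_samples.strings('tweets.20150430-223406.json')[0]  # doctest: +SKIP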
def __init__(self, root, fileids, column_types=None, top_node='S',
             beginning_of_sentence=r'#BOS.+$',
             end_of_sentence=r'#EOS.+$', encoding=None):
    """
    Construct a new corpus reader for reading NEGRA corpus files.

    @param root: The root directory of the corpus files.
    @param fileids: A list of or regex specifying the files to read from.
    @param column_types: An optional C{list} of columns in the corpus.
    @param top_node: The top node of parsed sentence trees.
    @param beginning_of_sentence: A regex specifying the start of a sentence
    @param end_of_sentence: A regex specifying the end of a sentence
    @param encoding: The default corpus file encoding.
    """
    # Make sure there are no invalid column types.
    if isinstance(column_types, list):
        for column_type in column_types:
            if column_type not in self.COLUMN_TYPES:
                raise ValueError("Column %r is not supported." % column_type)
    else:
        column_types = self.COLUMN_TYPES

    # Store the configuration and build the column-to-index map.
    self._top_node = top_node
    self._column_types = column_types
    self._fileids = fileids
    self._bos = beginning_of_sentence
    self._eos = end_of_sentence
    self._colmap = dict((c, i) for (i, c) in enumerate(column_types))

    # Finish constructing by calling the extended class' constructor.
    CorpusReader.__init__(self, root, fileids, encoding)
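# Hedged construction sketch; the fileid pattern and column names are
# assumptions about a NEGRA-style export (valid names must appear in
# self.COLUMN_TYPES):
#     >>> reader = NegraCorpusReader('/path/to/negra', r'.*\.export', column_types=['words', 'pos'])  # doctest: +SKIP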
def fileids(self, channels=None, domains=None, categories=None):
    # Enforce mutual exclusivity: reject any call that sets more than one
    # of the three filters (the original all-three check let pairs through).
    if sum(arg is not None for arg in (channels, domains, categories)) > 1:
        raise ValueError('You can specify only one of channels, domains '
                         'and categories parameter at once')
    if channels is None and domains is None and \
            categories is None:
        return CorpusReader.fileids(self)
    if isinstance(channels, str):
        channels = [channels]
    if isinstance(domains, str):
        domains = [domains]
    if isinstance(categories, str):
        categories = [categories]
    if channels:
        return self._list_morph_files_by('channel', channels)
    elif domains:
        return self._list_morph_files_by('domain', domains)
    else:
        return self._list_morph_files_by('keyTerm', categories,
                                         map=self._map_category)
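# Usage sketch: exactly one filter may be given per call (the values are
# illustrative; they are resolved against the corpus' morph-file metadata):
#     >>> reader.fileids(channels='internet')  # doctest: +SKIP
#     >>> reader.fileids(categories=['sport', 'culture'])  # doctest: +SKIP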
def __init__(self, root, fileids):
    CorpusReader.__init__(self, root, fileids, None, None)
def __init__(self, root, fileids, wrap_etree=False):
    self._wrap_etree = wrap_etree
    CorpusReader.__init__(self, root, fileids)
def __init__(self, root, fileids=PKL_PATTERN, **kwargs):
    """
    Initialize the corpus reader.
    """
    CorpusReader.__init__(self, root, fileids, **kwargs)
def __init__(self, fileids=r'.*\.gz', data_folder=''):
    _root = os.path.join(susx._sussex_root, data_folder)
    CorpusReader.__init__(self, _root, fileids)
    self._n = None
    self._n_sents = None