def __init__(
    self,
    sep="/",
    # NOTE: a literal "." inside the pattern has to be escaped.
    pattern=chinese_pattern,
    root=None,
    fileids=None,
):
    """Set up the reader by delegating to ``TaggedCorpusReader.__init__``.

    A ``RegexpTokenizer`` built from ``pattern`` with ``gaps=True`` is
    installed as the sentence tokenizer, and the encoding is always
    ``"utf-8"``.

    :param sep: Forwarded unchanged as the ``sep`` argument of the
        base initializer.
    :param pattern: Regular expression handed to ``RegexpTokenizer``;
        defaults to ``chinese_pattern``.
    :param root: Forwarded unchanged as the ``root`` argument.
    :param fileids: Forwarded unchanged as the ``fileids`` argument.
    """
    sentence_splitter = RegexpTokenizer(pattern, gaps=True)
    TaggedCorpusReader.__init__(
        self,
        root=root,
        fileids=fileids,
        sep=sep,
        sent_tokenizer=sentence_splitter,
        encoding="utf-8",
    )
def __init__(self, root=None, fileids=None, encoding='utf8'):
    """
    Construct a new MTECorpusReader for a set of documents
    located at the given root directory.  Example usage:

        >>> root = '/...path to corpus.../'
        >>> reader = MTECorpusReader(root, 'oana-*.xml', 'utf8')  # doctest: +SKIP

    :param root: The root directory for this corpus.  Defaults to
        ``None``, which is forwarded as-is to ``TaggedCorpusReader``
        (the earlier docstring said the default points to the location
        in the multext config file -- not visible here, confirm).
    :param fileids: A list or regexp specifying the fileids in this
        corpus.  Defaults to ``None``, forwarded as-is (the earlier
        docstring said the default is oana-en.xml -- confirm).
    :param encoding: The encoding of the given files (default is utf8).
    """
    # Pass the encoding by keyword: the third positional parameter of
    # TaggedCorpusReader.__init__ is ``sep``, so a positional ``encoding``
    # would be consumed as the separator and the requested encoding would
    # be silently ignored.
    TaggedCorpusReader.__init__(self, root, fileids, encoding=encoding)
def __init__(self, root=None, fileids=None, encoding='utf8'):
    """
    Construct a new MTECorpusReader for a set of documents
    located at the given root directory.  Example usage:

        >>> root = '/...path to corpus.../'
        >>> reader = MTECorpusReader(root, 'oana-*.xml', 'utf8') # doctest: +SKIP

    :param root: The root directory for this corpus.  Defaults to
        ``None``, which is handed to ``TaggedCorpusReader`` unchanged
        (the earlier docstring said the default points to the location
        in the multext config file -- not visible here, confirm).
    :param fileids: A list or regexp specifying the fileids in this
        corpus.  Defaults to ``None``, handed on unchanged (the earlier
        docstring said the default is oana-en.xml -- confirm).
    :param encoding: The encoding of the given files (default is utf8).
    """
    # ``encoding`` must be a keyword argument here.  Positionally, the
    # third slot of TaggedCorpusReader.__init__ is ``sep``, which would
    # swallow the value and leave the base class on its default encoding.
    TaggedCorpusReader.__init__(self, root, fileids, encoding=encoding)
def __init__(self, file_name, language='', separator='_', ws_delim=True,
             number_of_groups=10, encoding='utf-8'):
    """Initialize the corpus reader for a single file.

    Delegates to ``TaggedCorpusReader.__init__`` with the current
    directory (``'.'``) as root and ``file_name`` as the only file id.

    :param file_name: Name of the file to read; becomes the sole entry
        of ``fileids``.
    :param language: Accepted but not referenced in the visible body.
    :param separator: Forwarded as the ``sep`` argument.
    :param ws_delim: Accepted but not referenced in the visible body.
    :param number_of_groups: Accepted but not referenced in the visible
        body.
    :param encoding: Forwarded as the ``encoding`` argument.
    """
    corpus_files = [file_name]
    TaggedCorpusReader.__init__(
        self,
        root='.',
        fileids=corpus_files,
        sep=separator,
        encoding=encoding,
    )
def __init__(self, sep="/",
             # NOTE: any "." in the pattern needs to be escaped.
             pattern=chinese_pattern,
             root=None, fileids=None):
    """Initialise the reader through ``TaggedCorpusReader.__init__``.

    The sentence tokenizer is a ``RegexpTokenizer`` created from
    ``pattern`` with ``gaps=True``; the encoding is fixed to ``"utf-8"``.

    :param sep: Passed through as ``sep``.
    :param pattern: Regular expression given to ``RegexpTokenizer``
        (default: ``chinese_pattern``).
    :param root: Passed through as ``root``.
    :param fileids: Passed through as ``fileids``.
    """
    base_options = {
        "sep": sep,
        "root": root,
        "fileids": fileids,
        "sent_tokenizer": RegexpTokenizer(pattern, gaps=True),
        "encoding": "utf-8",
    }
    TaggedCorpusReader.__init__(self, **base_options)
def __init__(
    self,
    file_name,
    language="",
    separator="_",
    ws_delim=True,
    number_of_groups=10,
    encoding="utf-8",
):
    """Initialize the corpus reader for one file in the current directory.

    Calls ``TaggedCorpusReader.__init__`` with ``root="."`` and a
    one-element ``fileids`` list holding ``file_name``.

    :param file_name: File to read; wrapped in a list and used as
        ``fileids``.
    :param language: Accepted but not referenced in the visible body.
    :param separator: Passed on as ``sep``.
    :param ws_delim: Accepted but not referenced in the visible body.
    :param number_of_groups: Accepted but not referenced in the visible
        body.
    :param encoding: Passed on as ``encoding``.
    """
    base_options = {
        "root": ".",
        "fileids": [file_name],
        "sep": separator,
        "encoding": encoding,
    }
    TaggedCorpusReader.__init__(self, **base_options)