Beispiel #1
0
 def __init__(self, sep="/",
              pattern=chinese_pattern,
              root=None, fileids=None):
     """Initialise the reader by delegating to ``TaggedCorpusReader``.

     :param sep: forwarded unchanged as the ``sep`` argument of the base
         initialiser.
     :param pattern: regular expression handed to ``RegexpTokenizer``
         (with ``gaps=True``) to build the sentence tokenizer.  A literal
         ``.`` inside the pattern needs to be escaped.
     :param root: forwarded unchanged as ``root``.
     :param fileids: forwarded unchanged as ``fileids``.

     The encoding given to the base initialiser is always ``"utf-8"``.
     """
     # Build the sentence tokenizer first so the delegating call stays flat.
     sentence_splitter = RegexpTokenizer(pattern, gaps=True)
     TaggedCorpusReader.__init__(
         self,
         root=root,
         fileids=fileids,
         sep=sep,
         sent_tokenizer=sentence_splitter,
         encoding="utf-8",
     )
Beispiel #2
0
    def __init__(self, root=None, fileids=None, encoding='utf8'):
        """
        Construct a new MTECorpusReader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = MTECorpusReader(root, 'oana-*.xml', 'utf8') # doctest: +SKIP

        :param root: The root directory for this corpus; forwarded unchanged
            to ``TaggedCorpusReader.__init__``.
        :param fileids: A list or regexp specifying the fileids in this
            corpus; forwarded unchanged to ``TaggedCorpusReader.__init__``.
        :param encoding: The encoding of the given files (default is utf8)

        NOTE(review): earlier documentation stated that ``root`` defaults to
        the location in the multext config file and ``fileids`` to
        ``oana-en.xml``.  No such fallback is visible in this method -- the
        ``None`` defaults are passed straight through -- so confirm before
        relying on it.
        """
        # Pass ``encoding`` by keyword.  The third positional parameter of
        # TaggedCorpusReader.__init__ is ``sep`` (the word/tag separator), so
        # passing it positionally would set the separator and silently ignore
        # the caller's encoding.
        TaggedCorpusReader.__init__(self, root, fileids, encoding=encoding)
Beispiel #3
0
    def __init__(self, root=None, fileids=None, encoding='utf8'):
        """
        Construct a new MTECorpusReader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = MTECorpusReader(root, 'oana-*.xml', 'utf8') # doctest: +SKIP

        :param root: The root directory for this corpus; forwarded unchanged
            to ``TaggedCorpusReader.__init__``.
        :param fileids: A list or regexp specifying the fileids in this
            corpus; forwarded unchanged to ``TaggedCorpusReader.__init__``.
        :param encoding: The encoding of the given files (default is utf8)

        NOTE(review): earlier documentation stated that ``root`` defaults to
        the location in the multext config file and ``fileids`` to
        ``oana-en.xml``.  No such fallback is visible in this method -- the
        ``None`` defaults are passed straight through -- so confirm before
        relying on it.
        """
        # Pass ``encoding`` by keyword.  The third positional parameter of
        # TaggedCorpusReader.__init__ is ``sep`` (the word/tag separator), so
        # passing it positionally would set the separator and silently ignore
        # the caller's encoding.
        TaggedCorpusReader.__init__(self, root, fileids, encoding=encoding)
Beispiel #4
0
 def __init__(self, file_name, language='', separator='_',
              ws_delim=True, number_of_groups=10, encoding='utf-8'):
     """Set up a tagged-corpus reader over a single file.

     The base ``TaggedCorpusReader`` initialiser is called with ``root``
     hard-coded to ``'.'`` and ``fileids`` set to a one-element list
     holding ``file_name``.

     :param file_name: the only file id handed to the base reader.
     :param separator: forwarded as the ``sep`` argument.
     :param encoding: forwarded as the ``encoding`` argument.
     :param language: not used by the lines shown here.
     :param ws_delim: not used by the lines shown here.
     :param number_of_groups: not used by the lines shown here.
     """
     # Collect the base-class arguments in one place before delegating.
     reader_options = {
         'root': '.',
         'fileids': [file_name],
         'sep': separator,
         'encoding': encoding,
     }
     TaggedCorpusReader.__init__(self, **reader_options)
Beispiel #5
0
 def __init__(self, sep="/", pattern=chinese_pattern, root=None, fileids=None):
     """Delegate construction to ``TaggedCorpusReader``.

     :param sep: passed through as ``sep``.
     :param pattern: regular expression used to build the sentence
         tokenizer via ``RegexpTokenizer(pattern, gaps=True)``.  Note that
         a literal ``.`` in the pattern needs to be escaped.
     :param root: passed through as ``root``.
     :param fileids: passed through as ``fileids``.

     ``encoding`` is fixed to ``"utf-8"`` for the base initialiser.
     """
     base_kwargs = dict(
         root=root,
         fileids=fileids,
         sep=sep,
         sent_tokenizer=RegexpTokenizer(pattern, gaps=True),
         encoding="utf-8",
     )
     TaggedCorpusReader.__init__(self, **base_kwargs)
Beispiel #6
0
 def __init__(
     self,
     file_name,
     language="",
     separator="_",
     ws_delim=True,
     number_of_groups=10,
     encoding="utf-8",
 ):
     """Create the reader for one file via ``TaggedCorpusReader``.

     ``file_name`` is wrapped in a single-element list and used as
     ``fileids``; ``root`` is always ``"."``.  ``separator`` and
     ``encoding`` are forwarded as ``sep`` and ``encoding``.  The
     ``language``, ``ws_delim`` and ``number_of_groups`` parameters are
     not used by the lines shown here.
     """
     single_file = [file_name]
     TaggedCorpusReader.__init__(
         self,
         root=".",
         fileids=single_file,
         sep=separator,
         encoding=encoding,
     )