Exemple #1
0
    def __init__(self,
                 root,
                 fileids=DOC_PATTERN,
                 word_tokenizer=TweetTokenizer(),
                 encoding='utf8',
                 **kwargs):
        """
        Initialize a categorized Twitter corpus reader.

        :param root: Root of the corpus, given either as a path string or
            as a ``PathPointer``.
        :param fileids: File ids (or a pattern) forwarded to
            ``TwitterCorpusReader``; defaults to ``DOC_PATTERN``.
        :param word_tokenizer: Tokenizer forwarded to
            ``TwitterCorpusReader``.
        :param encoding: Encoding forwarded to ``TwitterCorpusReader``.
        :param kwargs: Category options handed to
            ``CategorizedCorpusReader``.  If no key starts with ``cat_``,
            ``cat_pattern`` is set to ``CAT_PATTERN``.
        :raises TypeError: If ``root`` is neither a string nor a
            ``PathPointer``.
        """
        # Fall back to the default category pattern when the caller gave
        # no cat_* option at all.
        if not any(key.startswith('cat_') for key in kwargs):
            kwargs['cat_pattern'] = CAT_PATTERN

        # The kwargs dict itself (not **kwargs) is passed on here.
        CategorizedCorpusReader.__init__(self, kwargs)

        # Pass tokenizer and encoding by keyword.  Passing ``encoding`` as
        # the third positional argument would land in the tokenizer slot
        # and silently drop ``word_tokenizer``.
        TwitterCorpusReader.__init__(self, root, fileids,
                                     word_tokenizer=word_tokenizer,
                                     encoding=encoding)

        if isinstance(root,
                      string_types) and not isinstance(root, PathPointer):
            # Raw string: '\.' is not a valid escape in a normal literal.
            # The trailing '|' makes the match always succeed, yielding
            # all-None groups when the path has no archive components.
            m = re.match(r'(.*\.gz)/?(.*\.zip)/?(.*)$|', root)
            gzipfile, zipfile, zipentry = m.groups()
            if zipfile:
                root = ZipFilePathPointer(zipfile, zipentry)
            elif gzipfile:
                # NOTE(review): the first alternative needs both a '.gz'
                # and a '.zip' component, so gzipfile is only set when
                # zipfile is set too and this branch looks unreachable --
                # confirm the intended pattern.
                root = ZipFilePathPointer(gzipfile, zipentry)
            else:
                root = FileSystemPathPointer(root)
        elif not isinstance(root, PathPointer):
            raise TypeError('CorpusReader: expected a string or a PathPointer')

        # Override the root stored by the base initializer with the
        # pointer computed above.
        self._root = root
        self.current_doc = []
Exemple #2
0
    def __load(self):
        """Load the WordNet corpus from its zip archive and become it."""
        # Locate the archive under ROOT_PATH and point at its 'wordnet/'
        # entry, which serves as the corpus root.
        archive_path = os.path.join(
            ROOT_PATH, 'plugins', 'crawl', 'wordnet', 'wordnet.zip')
        corpus_root = ZipFilePathPointer(archive_path, 'wordnet/')

        # Instantiate the reader class with the arguments stored on this
        # object.
        loaded = self.__reader_cls(corpus_root, *self.__args, **self.__kwargs)

        # Turn this object into the loaded corpus: adopt its instance
        # state first, then its class.
        self.__dict__ = loaded.__dict__
        self.__class__ = loaded.__class__
Exemple #3
0
def create_text_corpus_from_zipfile(
        zf: ZipFile,
        pattern=r'.*\.txt',
        ensure_loaded=True) -> PlaintextCorpusReader:
    """
    Load a text corpus contained in a zipfile.

    :param zf: Zip archive wrapped in a ``ZipFilePathPointer`` and used
        as the corpus root.
    :param pattern: File-id pattern passed to ``PlaintextCorpusReader``.
        The default is a raw string so that ``\\.`` is not treated as an
        (invalid) string escape.
    :param ensure_loaded: When true, ``ensure_loaded()`` is called on the
        reader before it is returned.
    :return: The ``PlaintextCorpusReader`` built over the archive.
    """
    pointer = ZipFilePathPointer(zf)
    corpus = PlaintextCorpusReader(pointer, pattern)

    if ensure_loaded:
        corpus.ensure_loaded()

    return corpus
Exemple #4
0
    def __init__(self, root, fileids, encoding='utf8', tagset=None):
        """
        :type root: PathPointer or str
        :param root: A path pointer identifying the root directory for
            this corpus.  If a string is specified, then it will be
            converted to a ``PathPointer`` automatically.
        :param fileids: A list of the files that make up this corpus.
            This list can either be specified explicitly, as a list of
            strings; or implicitly, as a regular expression over file
            paths.  The absolute path for each file will be constructed
            by joining the reader's root to each file name.
        :param encoding: The default unicode encoding for the files
            that make up the corpus.  The value of ``encoding`` can be any
            of the following:
            - A string: ``encoding`` is the encoding name for all files.
            - A dictionary: ``encoding[file_id]`` is the encoding
              name for the file whose identifier is ``file_id``.  If
              ``file_id`` is not in ``encoding``, then the file
              contents will be processed using non-unicode byte strings.
            - A list: ``encoding`` should be a list of ``(regexp, encoding)``
              tuples.  The encoding for a file whose identifier is ``file_id``
              will be the ``encoding`` value for the first tuple whose
              ``regexp`` matches the ``file_id``.  If no tuple's ``regexp``
              matches the ``file_id``, the file contents will be processed
              using non-unicode byte strings.
            - None: the file contents of all files will be
              processed using non-unicode byte strings.
        :param tagset: The name of the tagset used by this corpus, to be used
              for normalizing or converting the POS tags returned by the
              tagged_...() methods.
        :raises TypeError: If ``root`` is neither a string nor a
            ``PathPointer``.
        """
        # Convert the root to a path pointer, if necessary.
        if isinstance(root, compat.string_types) and not isinstance(root, PathPointer):
            # Raw string: '\.' is not a valid escape in a normal literal.
            # The trailing '|' makes the match always succeed; both groups
            # are None when the path contains no '.zip' component.
            m = re.match(r'(.*\.zip)/?(.*)$|', root)
            zipfile, zipentry = m.groups()
            if zipfile:
                root = ZipFilePathPointer(zipfile, zipentry)
            else:
                root = FileSystemPathPointer(root)
        elif not isinstance(root, PathPointer):
            raise TypeError('CorpusReader: expected a string or a PathPointer')

        # If `fileids` is a regexp, then expand it.
        if isinstance(fileids, compat.string_types):
            fileids = find_corpus_fileids(root, fileids)

        #: A list of the relative paths for the fileids that make up
        #: this corpus.
        self._fileids = fileids

        #: The root directory for this corpus.
        self._root = root

        # If encoding was specified as a list of regexps, then convert
        # it to a dictionary.  Only the first matching regexp counts for
        # each fileid; fileids with no match get no entry.
        if isinstance(encoding, list):
            encoding_dict = {}
            for fileid in self._fileids:
                for regexp, enc in encoding:
                    if re.match(regexp, fileid):
                        encoding_dict[fileid] = enc
                        break
            encoding = encoding_dict

        #: The default unicode encoding for the fileids that make up
        #: this corpus.  If ``encoding`` is None, then the file
        #: contents are processed using byte strings.
        self._encoding = encoding
        self._tagset = tagset
Exemple #5
0
# T(he original version of t)his code was written by Ulrich Germann (11/2010)


######################################################################

import nltk
# Insert the course's NLTK data directory at the front of nltk.data.path
# (slice assignment at [0:0] prepends without removing existing entries).
nltk.data.path[0:0] = ['/u/csc485h/include/a3/nltk']

# The following code provides access to the tagged NY Times corpus:
#   nyt_big  is the full corpus;
#   nyt_mini is a small subset for development.
from nltk.data         import ZipFilePathPointer
from nltk.corpus       import TaggedCorpusReader

# Both readers share a single pointer to the 'nyt/' entry inside nyt.zip
# and differ only in the file they read.
nyt_zipped = ZipFilePathPointer('/u/csc485h/include/a3/nltk/corpora/nyt.zip','nyt/')
nyt_big    = TaggedCorpusReader(nyt_zipped,['2004-tagged.txt'],sep='/', encoding='latin2')
nyt_mini   = TaggedCorpusReader(nyt_zipped,['nytimes-mini.txt'],sep='/', encoding='latin2')

# Finally, let's set up a default pattern for NP chunking
# Setting up the NP chunker itself is left to the main script, to encourage
# trying different variants of the pattern

##  Operator 	Behavior
##  . 	        Wildcard, matches any character
##  ^abc 	Matches some pattern abc at the start of a string
##  abc$ 	Matches some pattern abc at the end of a string
##  [abc] 	Matches one of a set of characters
##  [A-Z0-9] 	Matches one of a range of characters
##  ed|ing|s 	Matches one of the specified strings (disjunction)
##  * 	        Zero or more of previous item, e.g. a*, [a-z]* (also known as Kleene Closure)
Exemple #6
0
    resource_name = normalize_resource_name(resource_name, True)

    # Resolve default paths at runtime in-case the user overrides
    # vikinlp.data.pathevanlp paths is None:
        paths = path

    # Check if the resource name includes a zipfile name
    m = re.match(r'(.*\.zip)/?(.*)$|', resource_name)
    zipfile, zipentry = m.groups()

    # Check each item in our path
    for path_ in paths:
        # Is the path item a zipfile?
        if path_ and (os.path.isfile(path_) and path_.endswith('.zip')):
            try:
                return ZipFilePathPointer(path_, resource_name)
            except IOError:
                # resource not in zipfile
                continue

        # Is the path item a directory or is resource_name an absolute path?
        elif not path_ or os.path.isdir(path_):
            if zipfile is None:
                p = os.path.join(path_, url2pathname(resource_name))
                if os.path.exists(p):
                    if p.endswith('.gz'):
                        return GzipFileSystemPathPointer(p)
                    else:
                        return FileSystemPathPointer(p)
            else:
                p = os.path.join(path_, url2pathname(zipfile))
Exemple #7
0
    def __init__(self,
                 root,
                 fileids,
                 encoding=None,
                 tag_mapping_function=None):
        """
        @type root: L{PathPointer} or C{str}
        @param root: A path pointer identifying the root directory for
            this corpus.  If a string is specified, then it will be
            converted to a L{PathPointer} automatically.
        @param fileids: A list of the files that make up this corpus.
            This list can either be specified explicitly, as a list of
            strings; or implicitly, as a regular expression over file
            paths.  The absolute path for each file will be constructed
            by joining the reader's root to each file name.
        @param encoding: The default unicode encoding for the files
            that make up the corpus.  C{encoding}'s value can be any
            of the following:

              - B{A string}: C{encoding} is the encoding name for all
                files.
              - B{A dictionary}: C{encoding[file_id]} is the encoding
                name for the file whose identifier is C{file_id}.  If
                C{file_id} is not in C{encoding}, then the file
                contents will be processed using non-unicode byte
                strings.
              - B{A list}: C{encoding} should be a list of C{(regexp,
                encoding)} tuples.  The encoding for a file whose
                identifier is C{file_id} will be the C{encoding} value
                for the first tuple whose C{regexp} matches the
                C{file_id}.  If no tuple's C{regexp} matches the
                C{file_id}, the file contents will be processed using
                non-unicode byte strings.
              - C{None}: the file contents of all files will be
                processed using non-unicode byte strings.
        @param tag_mapping_function: A function for normalizing or
                simplifying the POS tags returned by the tagged_words()
                or tagged_sents() methods.
        @raise TypeError: If C{root} is neither a string nor a
            L{PathPointer}.
        """
        # Convert the root to a path pointer, if necessary.
        if isinstance(root, basestring):
            # Raw string: '\.' is not a valid escape in a normal literal.
            # The trailing '|' makes the match always succeed; both groups
            # are None when the path contains no '.zip' component.
            m = re.match(r'(.*\.zip)/?(.*)$|', root)
            zipfile, zipentry = m.groups()
            if zipfile:
                root = ZipFilePathPointer(zipfile, zipentry)
            else:
                root = FileSystemPathPointer(root)
        elif not isinstance(root, PathPointer):
            raise TypeError('CorpusReader: expected a string or a PathPointer')

        # If `fileids` is a regexp, then expand it.
        if isinstance(fileids, basestring):
            fileids = find_corpus_fileids(root, fileids)

        self._fileids = fileids
        """A list of the relative paths for the fileids that make up
        this corpus."""

        self._root = root
        """The root directory for this corpus."""

        # If encoding was specified as a list of regexps, then convert
        # it to a dictionary.  Only the first matching regexp counts for
        # each fileid; fileids with no match get no entry.
        if isinstance(encoding, list):
            encoding_dict = {}
            for fileid in self._fileids:
                for regexp, enc in encoding:
                    if re.match(regexp, fileid):
                        encoding_dict[fileid] = enc
                        break
            encoding = encoding_dict

        self._encoding = encoding
        """The default unicode encoding for the fileids that make up
           this corpus.  If C{encoding} is C{None}, then the file
           contents are processed using byte strings (C{str})."""
        self._tag_mapping_function = tag_mapping_function