Beispiel #1
0
    def __init__(
        self,
        root: Path = (Path(__file__).parent.resolve() /
                      Path("../data/corpus/")),
        target: Path = (Path(__file__).parent.resolve() /
                        Path("../data/corpus_processed/")),
        fileids: str = r".+\.html",
        encoding: str = "utf8",
    ) -> None:
        """Set up the HTML corpus reader.

        Args:
            root: Corpus root; converted to ``str`` before being handed to
                ``CorpusReader.__init__``.
            target: Root of the transformed corpus; stored as ``self.target``.
            fileids: Regex pattern selecting the documents.
            encoding: String encoding of the corpus.
        """
        CorpusReader.__init__(self, str(root), fileids, encoding)
        self.target = target

        # Configure the HTML-to-text converter: every option below is
        # switched on.
        converter = HTML2Text()
        for option in (
            "ignore_links",
            "ignore_images",
            "ignore_tables",
            "ignore_emphasis",
            "unicode_snob",
        ):
            setattr(converter, option, True)
        self.html2text = converter

        # Raise the threshold of the "readability.readability" logger so
        # only warnings and above get through.
        readability_log = logging.getLogger("readability.readability")
        readability_log.setLevel("WARNING")
        self.log = readability_log
Beispiel #2
0
    def __init__(self, root, fileids=None, encoding='utf8', skip_keywords=None,
                 target_language=None, paragraph_separator='\n\n', **kwargs):
        """
        Set up the JSON corpus reader.

        :param root: The file root of the corpus directory
        :param fileids: the list of file ids to consider, or a pattern; when
            falsy, a pattern matching files ending in
            ``<target_language>.json`` is used
        :param encoding: encoding forwarded to ``CorpusReader.__init__``
        :param skip_keywords: stored as ``self.skip_keywords``; a list of words
            which indicate whole paragraphs that should be skipped
        :param target_language: language suffix used to build the default
            ``fileids`` pattern; a falsy value means no suffix
        :param paragraph_separator: character sequence demarcating paragraph
            separation; stored as ``self.paragraph_separator``
        :param kwargs: only ``sent_tokenizer`` and ``word_tokenizer`` are read
        """
        # A missing language contributes nothing to the default file pattern.
        target_language = target_language or ''
        fileids = fileids or r'.*{}\.json'.format(target_language)

        # Initialize the NLTK corpus reader object
        CorpusReader.__init__(self, root, fileids, encoding)

        # Tokenizers are stored only when the caller supplied them.
        for option in ('sent_tokenizer', 'word_tokenizer'):
            if option in kwargs:
                setattr(self, '_' + option, kwargs[option])

        self.skip_keywords = skip_keywords
        self.paragraph_separator = paragraph_separator
Beispiel #3
0
    def __init__(self,
                 root,
                 fileids=DOC_PATTERN,
                 tags=None,
                 word_tokenizer=WordPunctTokenizer(),
                 sent_tokenizer=nltk.data.LazyLoader(
                     'tokenizers/punkt/english.pickle'),
                 encoding='utf8',
                 **kwargs):
        """
        Set up the categorized corpus reader.

        Keyword arguments are handed, as a dict, to the
        ``CategorizedCorpusReader`` constructor; ``root``, ``fileids`` and
        ``encoding`` go to the ``CorpusReader`` constructor.  When ``tags`` is
        falsy, ``self.TAGS`` is used instead.
        """
        # Fall back to the default category pattern when the caller supplied
        # no ``cat_*`` keyword at all.
        has_category_arg = any(name.startswith('cat_') for name in kwargs)
        if not has_category_arg:
            kwargs['cat_pattern'] = CAT_PATTERN

        CategorizedCorpusReader.__init__(self, kwargs)
        CorpusReader.__init__(self, root, fileids, encoding)

        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._good_tags = tags if tags else self.TAGS
Beispiel #4
0
    def __init__(
        self,
        root,
        fileids,
        sep="/",
        word_tokenizer=WhitespaceTokenizer(),
        sent_tokenizer=RegexpTokenizer("\n", gaps=True),
        alignedsent_block_reader=read_alignedsent_block,
        encoding="latin1",
    ):
        """
        Construct a new Aligned Corpus reader for a set of documents
        located at the given root directory.

        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param sep: Stored as ``self._sep``.
        :param word_tokenizer: Stored as ``self._word_tokenizer``.
        :param sent_tokenizer: Stored as ``self._sent_tokenizer``.
        :param alignedsent_block_reader: Stored as
            ``self._alignedsent_block_reader``.
        :param encoding: Encoding forwarded to ``CorpusReader.__init__``.
        """
        CorpusReader.__init__(self, root, fileids, encoding)

        # Remember the parsing helpers for later use by the reader.
        self._alignedsent_block_reader = alignedsent_block_reader
        self._sent_tokenizer = sent_tokenizer
        self._word_tokenizer = word_tokenizer
        self._sep = sep
Beispiel #5
0
    def __init__(self,
                 root,
                 fileids=None,
                 encoding='utf8',
                 skip_keywords=None,
                 **kwargs):
        """
        Set up the plain-text corpus reader.

        :param root: The file root of the corpus directory
        :param fileids: the list of file ids to consider, or a pattern; when
            falsy, a pattern matching files with a ``.txt`` suffix is used
        :param encoding: encoding forwarded to both base-class constructors
        :param skip_keywords: stored as ``self.skip_keywords``; a list of words
            which indicate whole paragraphs that should be skipped
        :param kwargs: only ``sent_tokenizer`` and ``word_tokenizer`` are read
        """
        fileids = fileids or r'.*\.txt'

        # Initialize the NLTK corpus reader objects.
        # NOTE(review): PlaintextCorpusReader.__init__ presumably already runs
        # CorpusReader.__init__; the second call is kept to preserve behaviour.
        PlaintextCorpusReader.__init__(self, root, fileids, encoding)
        CorpusReader.__init__(self, root, fileids, encoding)

        # Tokenizers are overridden only when the caller supplied them.
        for option in ('sent_tokenizer', 'word_tokenizer'):
            if option in kwargs:
                setattr(self, '_' + option, kwargs[option])

        self.skip_keywords = skip_keywords
Beispiel #6
0
    def __init__(self, root, fileids=PKL_PATTERN, **kwargs):
        """
        Set up the pickled, categorized corpus reader.

        Keyword arguments are handed, as a dict, to the
        ``CategorizedCorpusReader`` constructor; ``root`` and ``fileids`` go
        to the ``CorpusReader`` constructor.
        """
        # Use the default category pattern when no ``cat_*`` keyword is given.
        has_category_arg = any(name.startswith('cat_') for name in kwargs)
        if not has_category_arg:
            kwargs['cat_pattern'] = CAT_PATTERN

        CategorizedCorpusReader.__init__(self, kwargs)
        CorpusReader.__init__(self, root, fileids)
    def __init__(
        self,
        root,
        fileids=DOC_PATTERN,
        encoding='utf8',
        **kwargs
    ):
        """
        Set up the categorized corpus reader.

        Keyword arguments are handed, as a dict, to the
        ``CategorizedCorpusReader`` constructor; ``root``, ``fileids`` and
        ``encoding`` go to the ``CorpusReader`` constructor.  When no
        ``cat_*`` keyword is supplied, a ``cat_map`` is built with
        ``make_cat_map(root, 'txt')``; if that raises, the error is printed
        and ``cat_pattern`` is set to ``CAT_PATTERN`` instead.
        """
        category_args_given = any(name.startswith('cat_') for name in kwargs)
        if not category_args_given:
            try:
                # Preferred: derive the category map from the file names.
                kwargs['cat_map'] = make_cat_map(root, 'txt')
            except Exception as err:
                # Best effort: report the problem and categorize by pattern.
                print(type(err), err, "\nUnable to build category map from file names.\nFalling back to categories by directory name.")
                kwargs['cat_pattern'] = CAT_PATTERN

        # Initialize the NLTK corpus reader objects
        CategorizedCorpusReader.__init__(self, kwargs)
        CorpusReader.__init__(self, root, fileids, encoding)
Beispiel #8
0
    def __init__(self,
                 root,
                 fileids=None,
                 word_tokenizer=TweetTokenizer(),
                 encoding="utf8"):
        """
        Initialize the reader and verify that its corpus files are non-empty.

        :param root: The root directory for this corpus.

        :param fileids: A list or regexp specifying the fileids in this corpus.

        :param word_tokenizer: Tokenizer for breaking the text of Tweets into
        smaller units, including but not limited to words.

        :param encoding: Encoding forwarded to ``CorpusReader.__init__``.

        :raises ValueError: if a corpus file that is not a
        ``ZipFilePathPointer`` has a size of zero.
        """
        CorpusReader.__init__(self, root, fileids, encoding)

        # Check that all user-created corpus files are non-empty.  Paths
        # that are ZipFilePathPointer instances are skipped.
        for path in self.abspaths(self._fileids):
            if isinstance(path, ZipFilePathPointer):
                continue
            if os.path.getsize(path) == 0:
                raise ValueError("File {} is empty".format(path))

        self._word_tokenizer = word_tokenizer
Beispiel #9
0
    def __init__(
        self,
        root,
        fileids,
        sep='/',
        word_tokenizer=WhitespaceTokenizer(),
        sent_tokenizer=RegexpTokenizer('\n', gaps=True),
        alignedsent_block_reader=read_alignedsent_block,
        encoding='latin1',
    ):
        """
        Construct a new Aligned Corpus reader for a set of documents
        located at the given root directory.

        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param sep: Stored as ``self._sep``.
        :param word_tokenizer: Stored as ``self._word_tokenizer``.
        :param sent_tokenizer: Stored as ``self._sent_tokenizer``.
        :param alignedsent_block_reader: Stored as
            ``self._alignedsent_block_reader``.
        :param encoding: Encoding forwarded to ``CorpusReader.__init__``.
        """
        CorpusReader.__init__(self, root, fileids, encoding)

        # Remember the parsing helpers for later use by the reader.
        self._alignedsent_block_reader = alignedsent_block_reader
        self._sent_tokenizer = sent_tokenizer
        self._word_tokenizer = word_tokenizer
        self._sep = sep
Beispiel #10
0
    def __init__(self,
                 root,
                 fileids=None,
                 encoding='utf8',
                 skip_keywords=None,
                 target_language=None,
                 paragraph_separator='\n\n',
                 **kwargs):
        """
        Set up the JSON corpus reader.

        :param root: The file root of the corpus directory
        :param fileids: the list of file ids to consider, or a pattern; when
            falsy, a pattern matching files ending in
            ``<target_language>.json`` is used
        :param encoding: encoding forwarded to ``CorpusReader.__init__``
        :param skip_keywords: stored as ``self.skip_keywords``; a list of words
            which indicate whole paragraphs that should be skipped
        :param target_language: language suffix used to build the default
            ``fileids`` pattern; a falsy value means no suffix
        :param paragraph_separator: character sequence demarcating paragraph
            separation; stored as ``self.paragraph_separator``
        :param kwargs: only ``sent_tokenizer`` and ``word_tokenizer`` are read
        """
        # A missing language contributes nothing to the default file pattern.
        target_language = target_language or ''
        fileids = fileids or r'.*{}\.json'.format(target_language)

        # Initialize the NLTK corpus reader object
        CorpusReader.__init__(self, root, fileids, encoding)

        # Tokenizers are stored only when the caller supplied them.
        for option in ('sent_tokenizer', 'word_tokenizer'):
            if option in kwargs:
                setattr(self, '_' + option, kwargs[option])

        self.skip_keywords = skip_keywords
        self.paragraph_separator = paragraph_separator
Beispiel #11
0
 def __init__(self, root, fileids=DOC_PATTERN, **kwargs):
     """
     Initialize the corpus reader.

     Only ``root`` and ``fileids`` are forwarded to the ``CorpusReader``
     constructor.  Extra keyword arguments are accepted but are not used
     by this method.
     """
     CorpusReader.__init__(self, root, fileids)
Beispiel #12
0
 def __init__(self, root, fileids=DOC_PATTERN, **kwargs):
     """
     Initialize the corpus reader.

     Only ``root`` and ``fileids`` are forwarded to the ``CorpusReader``
     constructor.  Extra keyword arguments are accepted but are not used
     by this method.
     """
     CorpusReader.__init__(self, root, fileids)
Beispiel #13
0
 def __init__(self, root, fileids, encoding='utf8', morphs2str=_morphs2str_default):
     """
     Initialize KNBCorpusReader.

     :param root: corpus root, forwarded to ``CorpusReader.__init__``.
     :param fileids: file ids, forwarded to ``CorpusReader.__init__``.
     :param encoding: encoding, forwarded to ``CorpusReader.__init__``.
     :param morphs2str: function to convert a morph list to a str for the
         tree representation used by ``_parse()``; stored as
         ``self.morphs2str``.
     """
     CorpusReader.__init__(self, root, fileids, encoding)
     self.morphs2str = morphs2str
Beispiel #14
0
 def __init__(self, root, fileids, encoding='utf8', morphs2str=_morphs2str_default):
     """
     Initialize KNBCorpusReader.

     :param root: corpus root, forwarded to ``CorpusReader.__init__``.
     :param fileids: file ids, forwarded to ``CorpusReader.__init__``.
     :param encoding: encoding, forwarded to ``CorpusReader.__init__``.
     :param morphs2str: function to convert a morph list to a str for the
         tree representation used by ``_parse()``; stored as
         ``self.morphs2str``.
     """
     CorpusReader.__init__(self, root, fileids, encoding)
     self.morphs2str = morphs2str
Beispiel #15
0
 def __init__(self, root, fileids, tone, tag, wrap_etree=False):
     """
     Initialize the reader and reset its result containers.

     :param root: corpus root, forwarded to ``CorpusReader.__init__``.
     :param fileids: file ids; stored as ``self.fileids`` and forwarded to
         ``CorpusReader.__init__``.
     :param tone: stored as ``self.option_tone``.
     :param tag: stored as ``self.option_tag``.
     :param wrap_etree: stored as ``self._wrap_etree``.
     """
     self.fileids = fileids
     self._wrap_etree = wrap_etree
     CorpusReader.__init__(self, root, fileids)

     # Each container starts out as its own empty list.
     # NOTE(review): if the base class defines methods called fileids, sents,
     # words, ..., these instance attributes shadow them -- confirm intended.
     for container in ('tagged_sents', 'sents', 'words', 'tagged_words'):
         setattr(self, container, [])

     self.option_tone = tone
     self.option_tag = tag
Beispiel #16
0
 def __init__(self, root, fileids, encoding='utf8', morphs2str=_morphs2str_default):
     """
     Initialize KNBCorpusReader.

     :param root: corpus root, forwarded to ``CorpusReader.__init__``.
     :param fileids: file ids, forwarded to ``CorpusReader.__init__``.
     :param encoding: encoding, forwarded to ``CorpusReader.__init__``.
     :param morphs2str: function to convert a morph list to a str for the
         tree representation used by ``_parse()``; stored as
         ``self.morphs2str``.
     """
     # FIXME: Why is it inheriting from SyntaxCorpusReader but initializing
     #       from CorpusReader?
     CorpusReader.__init__(self, root, fileids, encoding)
     self.morphs2str = morphs2str
 def __init__(self, root, fileids,
              syntax_parser=CaboChaParser(),
              word_tokenizer=MeCabTokenizer(),
              sent_tokenizer=jp_sent_tokenizer,
              case_parser=KNPParser(),
              encoding='utf-8'):
     """
     Initialize the reader and remember its parsing components.

     :param root: corpus root, forwarded to ``CorpusReader.__init__``.
     :param fileids: file ids, forwarded to ``CorpusReader.__init__``.
     :param syntax_parser: stored as ``self._syntax_parser``.
     :param word_tokenizer: stored as ``self._word_tokenizer``.
     :param sent_tokenizer: stored as ``self._sent_tokenizer``.
     :param case_parser: stored as ``self._case_parser``.
     :param encoding: encoding, forwarded to ``CorpusReader.__init__``.
     """
     CorpusReader.__init__(self, root, fileids, encoding)

     # Tokenizers first, then the parsers.
     self._sent_tokenizer = sent_tokenizer
     self._word_tokenizer = word_tokenizer
     self._syntax_parser = syntax_parser
     self._case_parser = case_parser
Beispiel #18
0
    def __init__(self, root, fileids=DOC_PATTERN, encoding='utf8', **kwargs):
        """
        Set up the corpus reader.

        When no ``cat_*`` keyword is supplied, ``cat_pattern`` defaults to
        ``CAT_PATTERN``.  The keyword dict is then handed to
        ``TwitterCorpusReader.__init__`` and ``root``, ``fileids`` and
        ``encoding`` to ``CorpusReader.__init__``.
        """
        # Use the default category pattern when no ``cat_*`` keyword is given.
        has_category_arg = any(name.startswith('cat_') for name in kwargs)
        if not has_category_arg:
            kwargs['cat_pattern'] = CAT_PATTERN

        # Initialize the NLTK corpus reader objects.
        # NOTE(review): the kwargs dict is passed as the single positional
        # argument of TwitterCorpusReader.__init__; categorization arguments
        # would normally go to CategorizedCorpusReader -- confirm which
        # constructor is intended here.
        TwitterCorpusReader.__init__(self, kwargs)
        CorpusReader.__init__(self, root, fileids, encoding)
 def __init__(self, root, zipfile, fileids):
     """
     Initialize a reader whose corpus lives inside a zip file under ``root``.

     :param root: a string or a ``PathPointer``; a string is wrapped in a
         ``FileSystemPathPointer``.
     :param zipfile: joined onto ``root`` to locate the zip file.
     :param fileids: file ids, forwarded to ``CorpusReader.__init__``.
     :raises TypeError: if ``root`` is neither a string nor a
         ``PathPointer``.
     """
     # ``basestring`` only exists on Python 2; fall back to ``str`` so the
     # check does not raise NameError on Python 3.
     try:
         string_types = basestring  # noqa: F821
     except NameError:
         string_types = str

     if isinstance(root, string_types):
         root = FileSystemPathPointer(root)
     elif not isinstance(root, PathPointer):
         raise TypeError('CorpusReader: expected a string or a PathPointer')

     # convert to a ZipFilePathPointer
     root = ZipFilePathPointer(root.join(zipfile))

     CorpusReader.__init__(self, root, fileids)

     self._parse_char_replacements()
Beispiel #20
0
    def __init__(self, root, fileids=PKL_PATTERN, **kwargs):
        """
        Set up the pickled, categorized corpus reader.

        Keyword arguments are handed, as a dict, to the
        ``CategorizedCorpusReader`` constructor; ``root`` and ``fileids`` go
        to the ``CorpusReader`` constructor.
        """
        # Use the default category pattern when no ``cat_*`` keyword is given.
        has_category_arg = any(name.startswith('cat_') for name in kwargs)
        if not has_category_arg:
            kwargs['cat_pattern'] = CAT_PATTERN

        CategorizedCorpusReader.__init__(self, kwargs)
        CorpusReader.__init__(self, root, fileids)
 def __init__(self, root, fileids=DOC_PATTERN, encoding='utf8', **kwargs):
     """
     Set up the categorized corpus reader.

     Keyword arguments are handed, as a dict, to the
     ``CategorizedCorpusReader`` constructor; ``root``, ``fileids`` and
     ``encoding`` go to the ``CorpusReader`` constructor.
     """
     # Use the default category pattern when no ``cat_*`` keyword is given.
     has_category_arg = any(name.startswith('cat_') for name in kwargs)
     if not has_category_arg:
         kwargs['cat_pattern'] = CAT_PATTERN

     # Initialize the NLTK corpus reader objects
     CategorizedCorpusReader.__init__(self, kwargs)
     CorpusReader.__init__(self, root, fileids, encoding)
Beispiel #22
0
    def __init__(
        self,
        root: Path = (Path(__file__).parent.resolve() /
                      Path("../data/corpus_processed/")),
        fileids=r".+\.pickle",
    ):
        """
        Set up the reader for the processed corpus.

        Keyword Arguments:
            root -- Path of corpus root.
            fileids -- Regex pattern for documents.
        """
        # CorpusReader is handed a plain string rather than a Path object.
        corpus_root = str(root)
        CorpusReader.__init__(self, corpus_root, fileids)
Beispiel #23
0
    def __init__(self, root, fileids=DOC_PATTERN, encoding="utf8", **kwargs):
        """
        Set up the categorized corpus reader.

        ``root``, ``fileids`` and ``encoding`` go to the ``CorpusReader``
        constructor; the keyword arguments are then handed, as a dict, to the
        ``CategorizedCorpusReader`` constructor.  When no ``cat_*`` keyword is
        supplied, ``cat_pattern`` defaults to ``self.CAT_PATTERN``.
        """
        has_category_arg = any(name.startswith("cat_") for name in kwargs)
        if not has_category_arg:
            kwargs["cat_pattern"] = self.CAT_PATTERN

        # initialize the NLTK corpus reader objects
        CorpusReader.__init__(self, root, fileids, encoding)
        CategorizedCorpusReader.__init__(self, kwargs)
Beispiel #24
0
    def __init__(self, root, **kwargs):
        """
        Set up the categorized corpus reader.

        ``fileids`` and ``encoding`` must be supplied as keyword arguments;
        they are removed from ``kwargs`` and passed, with ``root``, to the
        ``CorpusReader`` constructor.  The remaining keyword arguments are
        handed, as a dict, to the ``CategorizedCorpusReader`` constructor.

        :raises KeyError: if ``fileids`` or ``encoding`` is missing.
        """
        # Pull out the CorpusReader-specific arguments (fileids first).
        fileids, encoding = kwargs.pop('fileids'), kwargs.pop('encoding')

        # Initialize the NLTK corpus reader objects
        CategorizedCorpusReader.__init__(self, kwargs)
        CorpusReader.__init__(self, root, fileids, encoding)
 def __init__(self, root, fileids,
              sep='/', word_tokenizer=WhitespaceTokenizer(),
              sent_tokenizer=RegexpTokenizer('\n', gaps=True),
              encoding=None):
     """
     Initialize the reader and store its tokenizing components.

     :param root: The root directory for this corpus.
     :param fileids: A list or regexp specifying the fileids in this corpus.
     :param sep: Stored as ``self._sep``.
     :param word_tokenizer: Stored as ``self._word_tokenizer``.
     :param sent_tokenizer: Stored as ``self._sent_tokenizer``.
     :param encoding: Encoding forwarded to ``CorpusReader.__init__``.
     """
     CorpusReader.__init__(self, root, fileids, encoding)
     self._sep = sep
     self._word_tokenizer = word_tokenizer
     self._sent_tokenizer = sent_tokenizer
     # No block reader or corpus view is configured at construction time.
     # (A trailing comma used to turn the block reader into the tuple
     # ``(None,)``, followed by a no-op self-assignment.)
     self._alignedsent_block_reader = None
     self._alignedsent_corpus_view = None
Beispiel #26
0
    def __init__(self,
                 root,
                 fileids=None,
                 word_tokenizer=TweetTokenizer(),
                 encoding='utf-8-sig'):
        """
        Initialize the reader and verify that its corpus files are non-empty.

        :param root: corpus root, forwarded to ``CorpusReader.__init__``.
        :param fileids: file ids, forwarded to ``CorpusReader.__init__``.
        :param word_tokenizer: stored as ``self._word_tokenizer``.
        :param encoding: encoding, forwarded to ``CorpusReader.__init__``.
        :raises ValueError: if a corpus file that is not a
            ``ZipFilePathPointer`` has a size of zero.
        """
        CorpusReader.__init__(self, root, fileids, encoding)

        # Check that all user-created corpus files are non-empty.  Paths
        # that are ZipFilePathPointer instances are skipped.
        for path in self.abspaths(self._fileids):
            if isinstance(path, ZipFilePathPointer):
                continue
            if os.path.getsize(path) == 0:
                raise ValueError("File {} is empty".format(path))

        self._word_tokenizer = word_tokenizer
Beispiel #27
0
    def __init__(self, root, fileids=PathPattern.doc_pattern.value, encoding='utf8', **kwargs):
        """
        Initialize the reader for the intermediate, processed corpus files.

        Keyword arguments are handed, as a dict, to the
        ``CategorizedCorpusReader`` constructor; ``root``, ``fileids`` and
        ``encoding`` go to the ``CorpusReader`` constructor.  When no
        ``cat_*`` keyword is supplied, ``cat_pattern`` defaults to
        ``PathPattern.cat_pattern.value``.
        """

        # Add the category pattern if it was not passed in explicitly.
        # (``startswith`` was misspelled, which raised AttributeError as soon
        # as any keyword argument was supplied.)
        if not any(key.startswith('cat_') for key in kwargs.keys()):
            kwargs['cat_pattern'] = PathPattern.cat_pattern.value

        # Initialize the NLTK corpus reader objects
        CategorizedCorpusReader.__init__(self, kwargs)
        CorpusReader.__init__(self, root, fileids, encoding)

        # Start with an empty counter for collecting tokens
        self.__tokens = Counter()
Beispiel #28
0
    def __init__(self, root, fileids=PKL_PATTERN, **kwargs):
        """
        Set up the pickled, categorized corpus reader and its tokenizers.

        Keyword arguments are handed, as a dict, to the
        ``CategorizedCorpusReader`` constructor; ``root`` and ``fileids`` go
        to the ``CorpusReader`` constructor.
        """
        # Use the default category pattern when no ``cat_*`` keyword is given.
        has_category_arg = any(name.startswith('cat_') for name in kwargs)
        if not has_category_arg:
            kwargs['cat_pattern'] = CAT_PATTERN

        CategorizedCorpusReader.__init__(self, kwargs)
        CorpusReader.__init__(self, root, fileids)

        # Fixed tokenizers: word/punctuation splitting plus a lazily loaded
        # sentence tokenizer resource.
        self._word_tokenizer = WordPunctTokenizer()
        punkt_resource = 'tokenizers/punkt/english.pickle'
        self._sent_tokenizer = nltk.data.LazyLoader(punkt_resource)
Beispiel #29
0
    def __init__(self, root, fileids=DOC_PATTERN, encoding='utf8',
                 tags=TAGS, **kwargs):
        """
        Set up the categorized corpus reader.

        Keyword arguments are handed, as a dict, to the
        ``CategorizedCorpusReader`` constructor; ``root``, ``fileids`` and
        ``encoding`` go to the ``CorpusReader`` constructor.  ``tags`` is
        stored as ``self.tags``.
        """
        # Use the default category pattern when no ``cat_*`` keyword is given.
        has_category_arg = any(name.startswith('cat_') for name in kwargs)
        if not has_category_arg:
            kwargs['cat_pattern'] = CAT_PATTERN

        # Initialize the NLTK corpus reader objects
        CategorizedCorpusReader.__init__(self, kwargs)
        CorpusReader.__init__(self, root, fileids, encoding)

        # Remember which tags should be extracted.
        self.tags = tags
Beispiel #30
0
    def __init__(self, root, fileids=PKL_PATTERN, **kwargs):
        """
        Initialise the pickled corpus reader using two corpus readers from
        the nltk library.

        Parameters
        ----------
        root : str like
            the root directory for the corpus
        fileids : str like
            a regex pattern for the corpus document files
        kwargs :
            Additional arguments, handed as a dict to the
            ``CategorizedCorpusReader`` constructor
        """
        # Use the default category pattern when no ``cat_*`` keyword is given.
        has_category_arg = any(name.startswith('cat_') for name in kwargs)
        if not has_category_arg:
            kwargs['cat_pattern'] = CAT_PATTERN

        CategorizedCorpusReader.__init__(self, kwargs)
        CorpusReader.__init__(self, root, fileids)
Beispiel #31
0
    def __init__(self, rootpath, fileids, encoding="utf8",
                 default_text_selector=lambda row: None, **kwargs):
        """
        Set up the CSV corpus reader.

        Arguments:
        rootpath (str) - path to folder with corpus files (see NLTK CorpusReader for more info)
        fileids (list str) - names of files in root (see NLTK CorpusReader for more info)
        encoding (str) - encoding forwarded to the base CorpusReader
        default_text_selector (callable) - default selector that will be used to extract text from corpus
        **kwargs (named arguments) - arguments passed to csv.DictReader (see csv.DictReader for more info)
        """
        # Base NLTK corpus reader first.
        CorpusReader.__init__(self, rootpath, fileids, encoding=encoding)

        # Keep the csv parser parameters and the default selector for later.
        self.csv_kwargs = kwargs
        self.__default_text_selectors = default_text_selector
Beispiel #32
0
    def __init__(self, root, fileids=None, encoding='utf8', skip_keywords=None,
                 **kwargs):
        """
        Set up the plain-text corpus reader.

        :param root: The file root of the corpus directory
        :param fileids: the list of file ids to consider, or a pattern; when
            falsy, a pattern matching files with a ``.txt`` suffix is used
        :param encoding: encoding forwarded to both base-class constructors
        :param skip_keywords: stored as ``self.skip_keywords``; a list of words
            which indicate whole paragraphs that should be skipped
        :param kwargs: only ``sent_tokenizer`` and ``word_tokenizer`` are read
        """
        fileids = fileids or r'.*\.txt'

        # Initialize the NLTK corpus reader objects.
        # NOTE(review): PlaintextCorpusReader.__init__ presumably already runs
        # CorpusReader.__init__; the second call is kept to preserve behaviour.
        PlaintextCorpusReader.__init__(self, root, fileids, encoding)
        CorpusReader.__init__(self, root, fileids, encoding)

        # Tokenizers are overridden only when the caller supplied them.
        for option in ('sent_tokenizer', 'word_tokenizer'):
            if option in kwargs:
                setattr(self, '_' + option, kwargs[option])

        self.skip_keywords = skip_keywords
Beispiel #33
0
    def __init__(self,
                 root,
                 fileids=DOC_PATTERN,
                 encoding='utf8',
                 tags=TAGS,
                 **kwargs):
        """
        Initialize the corpus reader.

        Categorization arguments (``cat_pattern``, ``cat_map`` and
        ``cat_file``) are passed to the ``CategorizedCorpusReader``
        constructor.  ``root``, ``fileids`` and ``encoding`` are passed to
        the ``CorpusReader`` constructor.  ``tags`` is stored as
        ``self.tags``.
        """
        # Add the category pattern if it was not passed in explicitly.
        if not any(key.startswith('cat_') for key in kwargs.keys()):
            kwargs['cat_pattern'] = CAT_PATTERN
        # Initialize the NLTK corpus reader objects
        CategorizedCorpusReader.__init__(
            self, kwargs)  # the keyword arguments are passed as a dict
        # Forward the caller's encoding; it used to be accepted by the
        # signature but never handed to the base class.
        CorpusReader.__init__(self, root, fileids, encoding)

        # Save the tags that are to be extracted.
        self.tags = tags
    def __init__(self, root, fileids=None,
                 word_tokenizer=TweetTokenizer(),
                 encoding='utf8'):
        """
        Initialize the reader and verify that its corpus files are non-empty.

        :param root: The root directory for this corpus.

        :param fileids: A list or regexp specifying the fileids in this corpus.

        :param word_tokenizer: Tokenizer for breaking the text of Tweets into
        smaller units, including but not limited to words.

        :param encoding: Encoding forwarded to ``CorpusReader.__init__``.

        :raises ValueError: if a corpus file that is not a
        ``ZipFilePathPointer`` has a size of zero.
        """
        CorpusReader.__init__(self, root, fileids, encoding)

        # Check that all user-created corpus files are non-empty.  Paths
        # that are ZipFilePathPointer instances are skipped.
        for path in self.abspaths(self._fileids):
            if isinstance(path, ZipFilePathPointer):
                continue
            if os.path.getsize(path) == 0:
                raise ValueError("File {} is empty".format(path))

        self._word_tokenizer = word_tokenizer
    def __init__(self,
                 root,
                 fileids,
                 column_types=None,
                 top_node='S',
                 beginning_of_sentence=r'#BOS.+$',
                 end_of_sentence=r'#EOS.+$',
                 encoding=None):
        """ Construct a new corpus reader for reading NEGRA corpus files.
        @param root: The root directory of the corpus files.
        @param fileids: A list of or regex specifying the files to read from.
        @param column_types: An optional C{list} of columns in the corpus.
            Any value that is not a list is replaced by C{self.COLUMN_TYPES}.
        @param top_node: The top node of chunked sentence trees.
        @param beginning_of_sentence: A regex specifying the start of a sentence
        @param end_of_sentence: A regex specifying the end of a sentence
        @param encoding: The default corpus file encoding.
        @raise ValueError: If C{column_types} is a list containing an entry
            that is not in C{self.COLUMN_TYPES}.
        """

        # Make sure there are no invalid column types
        if isinstance(column_types, list):
            for column_type in column_types:
                if column_type not in self.COLUMN_TYPES:
                    # The message used to reference an undefined name, which
                    # raised NameError instead of this ValueError.
                    raise ValueError("Column %r is not supported." %
                                     column_type)
        else:
            column_types = self.COLUMN_TYPES

        # Store the reader configuration
        self._top_node = top_node
        self._column_types = column_types
        self._fileids = fileids
        self._bos = beginning_of_sentence
        self._eos = end_of_sentence
        # Map each column name to its position
        self._colmap = {c: i for (i, c) in enumerate(column_types)}

        # Finish constructing by calling the extended class' constructor
        CorpusReader.__init__(self, root, fileids, encoding)
Beispiel #36
0
    def __init__(self,
                 events,
                 fileids=None,
                 encoding='utf8',
                 tags=TAGS,
                 **kwargs):
        """
        Set up the events corpus reader.

        Keyword arguments are handed, as a dict, to the
        ``CategorizedCorpusReader`` constructor; when no ``cat_*`` keyword is
        supplied, ``cat_pattern`` is set to ``None``.  The ``CorpusReader``
        constructor receives ``'.'`` as root together with ``fileids`` and
        ``encoding``.
        """
        has_category_arg = any(name.startswith('cat_') for name in kwargs)
        if not has_category_arg:
            kwargs['cat_pattern'] = None

        # Initialize the NLTK corpus reader objects
        CategorizedCorpusReader.__init__(self, kwargs)
        CorpusReader.__init__(self, '.', fileids, encoding)

        # Keep the events list, a tagger and the tags for later use.
        self.events = events
        self.tagger = pos_tagger('spacy')
        self.htmltags = tags
    def __init__(self,
                 root,
                 fileids,
                 column_types=None,
                 top_node='S',
                 beginning_of_sentence=r'#BOS.+$',
                 end_of_sentence=r'#EOS.+$',
                 encoding=None):
        """ Construct a new corpus reader for reading NEGRA corpus files.
        @param root: The root directory of the corpus files.
        @param fileids: A list of or regex specifying the files to read from.
        @param column_types: An optional C{list} of columns in the corpus.
            Any value that is not a list is replaced by C{self.COLUMN_TYPES}.
        @param top_node: The top node of parsed sentence trees.
        @param beginning_of_sentence: A regex specifying the start of a sentence
        @param end_of_sentence: A regex specifying the end of a sentence
        @param encoding: The default corpus file encoding.
        @raise ValueError: If C{column_types} is a list containing an entry
            that is not in C{self.COLUMN_TYPES}.
        """

        # Make sure there are no invalid column types
        if isinstance(column_types, list):
            for column_type in column_types:
                if column_type not in self.COLUMN_TYPES:
                    # The message used to reference an undefined name, which
                    # raised NameError instead of this ValueError.
                    raise ValueError("Column %r is not supported." % column_type)
        else:
            column_types = self.COLUMN_TYPES

        # Store the reader configuration
        self._top_node = top_node
        self._column_types = column_types
        self._fileids = fileids
        self._bos = beginning_of_sentence
        self._eos = end_of_sentence
        # Map each column name to its position
        self._colmap = {c: i for (i, c) in enumerate(column_types)}

        # Finish constructing by calling the extended class' constructor
        CorpusReader.__init__(self, root, fileids, encoding)
Beispiel #38
0
 def __init__(self, root, fileids):
     """
     Initialize the reader by delegating to ``CorpusReader.__init__``.

     ``None`` is passed explicitly for the two positional arguments that
     follow ``fileids`` (presumably encoding and tagset -- confirm against
     the ``CorpusReader`` version in use).
     """
     CorpusReader.__init__(self, root, fileids, None, None)
Beispiel #39
0
 def __init__(self, root, fileids=PKL_PATTERN, **kwargs):
     """
     Initialize the corpus reader.

     :param root: corpus root, forwarded to ``CorpusReader.__init__``.
     :param fileids: file ids; defaults to ``PKL_PATTERN``.
     :param kwargs: forwarded unchanged as keyword arguments to
         ``CorpusReader.__init__``.
     """
     CorpusReader.__init__(self, root, fileids, **kwargs)
Beispiel #40
0
 def __init__(self, root, fileids):
     """
     Initialize the reader by delegating to ``CorpusReader.__init__``.

     ``None`` is passed explicitly for the two positional arguments that
     follow ``fileids`` (presumably encoding and tagset -- confirm against
     the ``CorpusReader`` version in use).
     """
     CorpusReader.__init__(self, root, fileids, None, None)
Beispiel #41
0
 def __init__(self, fileids=r'.*\.review'):
     """
     Set up the reader rooted at ``data/amazon_customer_reviews`` below
     ``susx._sussex_root``.

     :param fileids: pattern selecting the corpus files.
     """
     corpus_root = os.path.join(susx._sussex_root, 'data/amazon_customer_reviews')
     # Hard-coded value (presumably the number of sentences in the corpus).
     self._n_sents = 140443
     CorpusReader.__init__(self, corpus_root, fileids)
     self._n = None
Beispiel #42
0
 def __init__(self, root, fileids=DOC_PATTERN):
     """
     Initialize the reader class.

     ``root`` and ``fileids`` are forwarded to ``CorpusReader.__init__``;
     ``fileids`` defaults to ``DOC_PATTERN``.
     """
     CorpusReader.__init__(self, root, fileids)
Beispiel #43
0
 def __init__(self, fileids=r'.*\.mrg'):
     """
     Set up the reader rooted at ``data/penn_treebank_npbrac_stanforddeps``
     below ``susx._sussex_root``.

     :param fileids: pattern selecting the corpus files.
     """
     corpus_root = os.path.join(
         susx._sussex_root,
         'data/penn_treebank_npbrac_stanforddeps',
     )
     CorpusReader.__init__(self, corpus_root, fileids)
     self._n = None
     # Hard-coded value (presumably the number of sentences in the corpus).
     self._n_sents = 51520
Beispiel #44
0
 def __init__(self, root, fileids, wrap_etree=False):
     """
     Initialize the reader.

     :param root: corpus root, forwarded to ``CorpusReader.__init__``.
     :param fileids: file ids, forwarded to ``CorpusReader.__init__``.
     :param wrap_etree: stored as ``self._wrap_etree``.
     """
     self._wrap_etree = wrap_etree
     CorpusReader.__init__(self, root, fileids)
Beispiel #45
0
 def __init__(self, root, fileids, wrap_etree=False):
     """
     Initialize the reader.

     :param root: corpus root, forwarded to ``CorpusReader.__init__``.
     :param fileids: file ids, forwarded to ``CorpusReader.__init__``.
     :param wrap_etree: stored as ``self._wrap_etree``.
     """
     self._wrap_etree = wrap_etree
     CorpusReader.__init__(self, root, fileids)
Beispiel #46
0
 def __init__(self, root, fileids=PKL_PATTERN, **kwargs):
     """
     Initialize the corpus reader.

     :param root: corpus root, forwarded to ``CorpusReader.__init__``.
     :param fileids: file ids; defaults to ``PKL_PATTERN``.
     :param kwargs: forwarded unchanged as keyword arguments to
         ``CorpusReader.__init__``.
     """
     CorpusReader.__init__(self, root, fileids, **kwargs)
Beispiel #47
0
 def __init__(self, fileids=r'.*\.gz', data_folder=''):
     """
     Set up the reader rooted at ``data_folder`` below
     ``susx._sussex_root``.

     :param fileids: pattern selecting the corpus files.
     :param data_folder: path joined onto ``susx._sussex_root`` to form the
         corpus root.
     """
     corpus_root = os.path.join(susx._sussex_root, data_folder)
     CorpusReader.__init__(self, corpus_root, fileids)
     # Both values start out unset.
     self._n = None
     self._n_sents = None