Exemple #1
0
 def __init__(self, files='-', filehandle=None, zone='keep', bundles_per_doc=0,
              encoding='utf-8-sig', sent_id_filter=None, split_docs=False,
              ignore_sent_id=False, **kwargs):
     """Store the reading options and wrap the input in a `Files` object.

     `files`, `filehandle` and `encoding` are handed over to `Files`; when a
     `filehandle` is given, `files` is ignored (None is passed instead).
     A non-None `sent_id_filter` is converted with `str()` and compiled as
     a regular expression. `zone`, `bundles_per_doc`, `split_docs` and
     `ignore_sent_id` are stored unchanged as attributes of the same name.
     Any other keyword arguments go to the parent constructor.
     """
     super().__init__(**kwargs)

     # An explicit filehandle wins over filenames.
     filenames = files if filehandle is None else None
     self.files = Files(filenames=filenames, filehandle=filehandle, encoding=encoding)

     # Plain options.
     self.zone = zone
     self.bundles_per_doc = bundles_per_doc
     self.split_docs = split_docs
     self.ignore_sent_id = ignore_sent_id

     # Initial state.
     self._buffer = None
     self.finished = False

     if sent_id_filter is None:
         self.sent_id_filter = None
     else:
         self.sent_id_filter = re.compile(str(sent_id_filter))
         logging.debug('Using sent_id_filter=%s', sent_id_filter)
Exemple #2
0
 def __init__(self,
              files='-',
              docname_as_file=False,
              encoding='utf-8',
              newline='\n',
              **kwargs):
     """Store the output options and wrap `files` in a `Files` object.

     Args:
         files: output filename(s), passed to `Files` as `filenames` and
             also kept unchanged in `self.orig_files`.
         docname_as_file: stored as `self.docname_as_file`; only allowed
             together with the default `files='-'`.
         encoding: stored as `self.encoding`.
         newline: stored as `self.newline`.
         **kwargs: forwarded to the parent constructor.

     Raises:
         ValueError: if `docname_as_file` is true and `files` is not '-'.
     """
     super().__init__(**kwargs)
     self.orig_files = files
     self.files = Files(filenames=files)
     self.encoding = encoding
     self.newline = newline
     self.docname_as_file = docname_as_file
     if docname_as_file and files != '-':
         # Format instead of concatenating: `files` need not be a str
         # (presumably `Files` also accepts e.g. a list of filenames), and
         # `str + non-str` would raise TypeError instead of this ValueError.
         raise ValueError(
             "docname_as_file=1 is not compatible with files=%s" % (files,))
 def __init__(self, files='-', zone='keep', bundles_per_doc=0,
              encoding='utf-8', sent_id_filter=None, split_docs=False,
              **kwargs):
     """Store the reading options and wrap `files` in a `Files` object.

     The `Files` object is kept in `self.files` and its `filenames`
     attribute is additionally exposed as `self.filenames`.
     A non-None `sent_id_filter` is converted with `str()` and compiled as
     a regular expression. `zone`, `bundles_per_doc`, `encoding` and
     `split_docs` are stored unchanged as attributes of the same name.
     Any other keyword arguments go to the parent constructor.
     """
     super().__init__(**kwargs)

     file_source = Files(filenames=files)
     self.files = file_source
     self.filenames = file_source.filenames

     # Plain options.
     self.zone = zone
     self.bundles_per_doc = bundles_per_doc
     self.encoding = encoding
     self.split_docs = split_docs

     # Initial state.
     self._buffer = None
     self.finished = False

     if sent_id_filter is None:
         self.sent_id_filter = None
     else:
         self.sent_id_filter = re.compile(str(sent_id_filter))
         logging.debug('Using sent_id_filter=%s', sent_id_filter)
Exemple #4
0
 def __init__(self, files='-', filehandle=None, docname_as_file=False, encoding='utf-8',
              newline='\n', overwrite=False, **kwargs):
     """Store the output options and validate their combination.

     Args:
         files: output filename(s), passed to `Files` as `filenames`.
             Ignored (replaced by None) when `filehandle` is given.
         filehandle: optional handle passed to `Files`; when given,
             `self.orig_files` is set to the marker '<filehandle>'.
         docname_as_file: stored as `self.docname_as_file`.
         encoding: stored as `self.encoding`.
         newline: stored as `self.newline`.
         overwrite: stored as `self.overwrite`.
         **kwargs: forwarded to the parent constructor.

     Raises:
         ValueError: if `docname_as_file` or `overwrite` is combined with
             anything but the default `files='-'` (this includes passing a
             `filehandle`), or if both are set at once.
     """
     super().__init__(**kwargs)
     self.orig_files = files
     # Remember the stdout that was active when the writer was created.
     self.orig_stdout = sys.stdout
     if filehandle is not None:
         files = None
         self.orig_files = '<filehandle>'
     self.files = Files(filenames=files, filehandle=filehandle)
     self.encoding = encoding
     self.newline = newline
     self.docname_as_file = docname_as_file
     # The messages are formatted from `self.orig_files` rather than built
     # with `"..." + files`: `files` is None in filehandle mode (and need
     # not be a str otherwise), so concatenation raised TypeError instead of
     # the intended ValueError.
     if docname_as_file and files != '-':
         raise ValueError("docname_as_file=1 is not compatible with files=%s"
                          % (self.orig_files,))
     self.overwrite = overwrite
     if overwrite and files != '-':
         raise ValueError("overwrite=1 is not compatible with files=%s"
                          % (self.orig_files,))
     if overwrite and docname_as_file:
         raise ValueError("overwrite=1 is not compatible with docname_as_file=1")
Exemple #5
0
 def __init__(self, files='-', filehandle=None, zone='keep',
              bundles_per_doc=0, encoding='utf-8-sig', sent_id_filter=None,
              split_docs=False, ignore_sent_id=False, **kwargs):
     """Set up the input `Files` object and remember the reader options.

     If `filehandle` is given, it is used as the input and `files` is not
     passed on (None is passed instead). `sent_id_filter`, when not None,
     is stringified and compiled into a regular expression. All remaining
     options are kept as attributes with the same name; extra keyword
     arguments are forwarded to the parent constructor.
     """
     super().__init__(**kwargs)
     self.files = Files(filenames=None if filehandle is not None else files,
                        filehandle=filehandle,
                        encoding=encoding)

     compiled_filter = None
     if sent_id_filter is not None:
         compiled_filter = re.compile(str(sent_id_filter))
         logging.debug('Using sent_id_filter=%s', sent_id_filter)
     self.sent_id_filter = compiled_filter

     self._buffer = None
     self.finished = False
     self.zone = zone
     self.bundles_per_doc = bundles_per_doc
     self.split_docs = split_docs
     self.ignore_sent_id = ignore_sent_id
Exemple #6
0
class BaseWriter(Block):
    """Base class for all writer blocks.

    The writer works by rebinding `sys.stdout` before each document is
    processed (`before_process_document`) and restoring it afterwards
    (`after_process_document`).
    """

    def __init__(self, files='-', filehandle=None, docname_as_file=False, encoding='utf-8',
                 newline='\n', overwrite=False, **kwargs):
        """Store the output options and validate their combination.

        Args:
            files: output filename(s), passed to `Files` as `filenames`.
                Ignored (replaced by None) when `filehandle` is given.
            filehandle: optional handle passed to `Files`; when given,
                `self.orig_files` is set to the marker '<filehandle>'.
            docname_as_file: write each document to the file named by its
                `docname` meta entry (see `before_process_document`).
            encoding: encoding used when opening output files.
            newline: newline mode used when opening output files.
            overwrite: write each document to the file named by its
                `loaded_from` meta entry (see `before_process_document`).
            **kwargs: forwarded to the parent constructor.

        Raises:
            ValueError: if `docname_as_file` or `overwrite` is combined with
                anything but the default `files='-'` (this includes passing
                a `filehandle`), or if both are set at once.
        """
        super().__init__(**kwargs)
        self.orig_files = files
        # Remember the stdout that was active when the writer was created.
        self.orig_stdout = sys.stdout
        if filehandle is not None:
            files = None
            self.orig_files = '<filehandle>'
        self.files = Files(filenames=files, filehandle=filehandle)
        self.encoding = encoding
        self.newline = newline
        self.docname_as_file = docname_as_file
        # The messages are formatted from `self.orig_files` rather than built
        # with `"..." + files`: `files` is None in filehandle mode (and need
        # not be a str otherwise), so concatenation raised TypeError instead
        # of the intended ValueError.
        if docname_as_file and files != '-':
            raise ValueError("docname_as_file=1 is not compatible with files=%s"
                             % (self.orig_files,))
        self.overwrite = overwrite
        if overwrite and files != '-':
            raise ValueError("overwrite=1 is not compatible with files=%s"
                             % (self.orig_files,))
        if overwrite and docname_as_file:
            raise ValueError("overwrite=1 is not compatible with docname_as_file=1")

    @property
    def filename(self):
        """Property with the current filename."""
        return self.files.filename

    @property
    def file_number(self):
        """Property with the current file number (1-based)."""
        return self.files.file_number

    def next_filename(self):
        """Go to the next file and return its filename."""
        return self.files.next_filename()

    def before_process_document(self, document):
        """Redirect `sys.stdout` to the destination of `document`.

        `udapi.core.coref.store_coref_to_misc(document)` is called first.
        The destination is then chosen as follows:

        * filehandle mode: `self.files.filehandle`;
        * `files='-'` with `docname_as_file`: a new file named by
          `document.meta['docname']`;
        * `files='-'` with `overwrite`: a new file named by
          `document.meta['loaded_from']`;
        * `files='-'` otherwise: the original stdout;
        * explicit filenames: the next filename, where '-' stands for the
          original stdout.

        If the required meta entry is missing, a warning is logged and
        `sys.stdout` is left as it was.

        Raises:
            RuntimeError: if explicit filenames were given and there is no
                filename left for this document.
        """
        udapi.core.coref.store_coref_to_misc(document)
        if self.orig_files == '<filehandle>':
            logging.info('Writing to filehandle.')
            sys.stdout = self.files.filehandle
            return
        old_filehandle = sys.stdout
        if self.orig_files == '-':
            if self.docname_as_file:
                docname = document.meta.get('docname', None)
                if docname is not None:
                    logging.info('Writing to file %s.', docname)
                    sys.stdout = open(docname, 'wt', encoding=self.encoding, newline=self.newline)
                else:
                    logging.warning('docname_as_file=1 but the document contains no docname')
            elif self.overwrite:
                docname = document.meta.get('loaded_from', None)
                if docname is not None:
                    logging.info('Writing to file %s.', docname)
                    sys.stdout = open(docname, 'wt', encoding=self.encoding, newline=self.newline)
                else:
                    logging.warning('overwrite=1 but documet.meta["loaded_from"] is None')
            else:
                sys.stdout = self.orig_stdout
        else:
            filename = self.next_filename()
            if filename is None:
                raise RuntimeError('There are more documents to save than filenames given (%s)'
                                   % self.orig_files)
            elif filename == '-':
                logging.info('Writing to stdout.')
                sys.stdout = self.orig_stdout
            else:
                logging.info('Writing to file %s.', filename)
                sys.stdout = open(filename, 'wt', encoding=self.encoding, newline=self.newline)
        # Close the handle that was active before, unless it is still in use
        # or it is the original stdout.
        if old_filehandle not in (sys.stdout, self.orig_stdout):
            old_filehandle.close()

    def after_process_document(self, document):
        """Flush `sys.stdout`; if it was redirected, close it and restore the original."""
        sys.stdout.flush()
        # NOTE(review): in filehandle mode `sys.stdout` is `self.files.filehandle`,
        # so that handle is closed here after the first document -- confirm
        # this is intended for caller-supplied handles.
        if sys.stdout != self.orig_stdout:
            sys.stdout.close()
            sys.stdout = self.orig_stdout
Exemple #7
0
class BaseReader(Block):
    """Base class for all reader blocks."""

    # pylint: disable=too-many-arguments
    def __init__(self, files='-', filehandle=None, zone='keep', bundles_per_doc=0, encoding='utf-8-sig',
                 sent_id_filter=None, split_docs=False, ignore_sent_id=False, **kwargs):
        """Store the reading options and wrap the input in a `Files` object.

        Args:
            files: input filename(s), passed to `Files` as `filenames`.
                Ignored (replaced by None) when `filehandle` is given.
            filehandle: optional input handle passed to `Files`.
            zone: zone assigned to every loaded tree; the default 'keep'
                leaves the zone of the trees untouched.
            bundles_per_doc: if non-zero, `process_document` stops once the
                document has this many bundles.
            encoding: passed to `Files`.
            sent_id_filter: if not None, converted with `str()` and compiled
                as a regular expression; only trees whose `sent_id` matches
                it are returned by `filtered_read_tree`.
            split_docs: if true, `process_document` stops at a tree marked
                with `newdoc` (unless it is the first tree of the document).
            ignore_sent_id: if true, the `_sent_id` of loaded trees is reset
                to None.
            **kwargs: forwarded to the parent constructor.
        """
        super().__init__(**kwargs)
        if filehandle is not None:
            files = None
        self.files = Files(filenames=files, filehandle=filehandle, encoding=encoding)
        self.zone = zone
        self.bundles_per_doc = bundles_per_doc
        # A tree that was read but belongs to the next document.
        self._buffer = None
        # Set by `process_document` once there is nothing more to read.
        self.finished = False
        self.sent_id_filter = None
        if sent_id_filter is not None:
            self.sent_id_filter = re.compile(str(sent_id_filter))
            logging.debug('Using sent_id_filter=%s', sent_id_filter)
        self.split_docs = split_docs
        self.ignore_sent_id = ignore_sent_id

    @staticmethod
    def is_multizone_reader():
        """Can this reader read bundles which contain more zones?

        This implementation always returns True.
        If a subclass supports just one zone per file (e.g. `read.Sentences`),
        this method should be overridden to return False, so `process_document`
        can take advantage of this knowledge and optimize the reading
        (no buffer needed even if `bundles_per_doc` is specified).
        """
        return True

    @property
    def filehandle(self):
        """Property with the current file handle."""
        return self.files.filehandle

    @property
    def filename(self):
        """Property with the current filename."""
        return self.files.filename

    @property
    def file_number(self):
        """Property with the current file number (1-based)."""
        return self.files.file_number

    def next_filehandle(self):
        """Go to the next file and return its filehandle."""
        return self.files.next_filehandle()

    def read_tree(self):
        """Load one (more) tree from self.files and return its root.

        This method must be overridden in all readers.
        Usually it is the only method that needs to be implemented.
        The implementation in this base class raises `NotImplementedError`.
        """
        raise NotImplementedError("Class %s doesn't implement read_tree" % self.__class__.__name__)

    def filtered_read_tree(self):
        """Load and return one more tree matching the `sent_id_filter`.

        This method uses `read_tree()` internally.
        This is the method called by `process_document`.
        Returns None when `read_tree()` has no more trees to offer.
        """
        tree = self.read_tree()
        if self.sent_id_filter is None:
            return tree
        while True:
            if tree is None:
                return None
            if self.sent_id_filter.match(tree.sent_id) is not None:
                return tree
            logging.debug('Skipping sentence %s as it does not match the sent_id_filter %s.',
                          tree.sent_id, self.sent_id_filter)
            tree = self.read_tree()

    # pylint: disable=too-many-branches,too-many-statements
    # Maybe the code could be refactored, but it is speed-critical,
    # so benchmarking is needed because calling extra methods may result in slowdown.
    def process_document(self, document):
        """Read trees from the input and add them to `document`.

        Bundles already present in `document` are reused in order; further
        bundles are created as needed. Trees whose `_sent_id` shares the part
        before the first '/' with the previous tree go to the same bundle,
        and the part after the '/' becomes the zone of the tree.

        Reading stops when the input yields no more trees, when `split_docs`
        is set and a `newdoc` tree starts another document, or when
        `bundles_per_doc` bundles have been filled. A tree that was read but
        belongs to the next document is kept in `self._buffer` and added
        first on the next call. `self.finished` is set to True when no file
        could be opened or when the input is exhausted and no file is left.
        """
        # The copy is reversed so that the next pre-existing bundle can be
        # taken with an O(1) `pop()` from the end instead of an O(n) `pop(0)`
        # from the front; the bundles are still consumed in document order.
        orig_bundles = document.bundles[:]
        orig_bundles.reverse()
        last_bundle_id = ''
        bundle = None

        # There may be a tree left in the buffer when reading the last doc.
        if self._buffer:
            root = self._buffer
            self._buffer = None
            if orig_bundles:
                bundle = orig_bundles.pop()
            else:
                bundle = document.create_bundle()
                if root._sent_id is not None:
                    bundle.bundle_id = root._sent_id.split('/', 1)[0]
            bundle.add_tree(root)
            if root.newdoc and root.newdoc is not True:
                document.meta["docname"] = root.newdoc

        # Open the first file if none is open yet; if there is none, we are done.
        if self.filehandle is None and self.next_filehandle() is None:
            self.finished = True
            return

        trees_loaded = 0
        while True:
            root = self.filtered_read_tree()
            if root is None:
                # Nothing was loaded for this document yet: try the next file.
                if trees_loaded == 0 and self.files.has_next_file():
                    self.next_filehandle()
                    continue
                self.finished = not self.files.has_next_file()
                break
            add_to_the_last_bundle = False
            trees_loaded += 1

            if self.ignore_sent_id:
                root._sent_id = None
            if root._sent_id is not None:
                parts = root._sent_id.split('/', 1)
                bundle_id = parts[0]
                if len(parts) == 2:
                    root.zone = parts[1]
                add_to_the_last_bundle = bundle_id == last_bundle_id
                last_bundle_id = bundle_id

            if self.zone != 'keep':
                root.zone = self.zone

            # The `# newdoc` comment in CoNLL-U marks a start of a new document.
            if root.newdoc:
                if not bundle and root.newdoc is not True:
                    document.meta["docname"] = root.newdoc
                if bundle and self.split_docs:
                    self._buffer = root
                    if orig_bundles:
                        logging.warning("split_docs=1 but the doc had contained %d bundles",
                                        len(orig_bundles))
                    self.finished = False
                    return

            # assign new/next bundle to `bundle` if needed
            if not bundle or not add_to_the_last_bundle:
                if self.bundles_per_doc and bundle and self.bundles_per_doc == bundle.number:
                    self._buffer = root
                    if orig_bundles:
                        logging.warning("bundles_per_doc=%d but the doc had contained %d bundles",
                                        self.bundles_per_doc, len(orig_bundles))
                    return

                if orig_bundles:
                    bundle = orig_bundles.pop()
                    if last_bundle_id and last_bundle_id != bundle.bundle_id:
                        logging.warning('Mismatch in bundle IDs: %s vs %s. Keeping the former one.',
                                        bundle.bundle_id, last_bundle_id)
                else:
                    bundle = document.create_bundle()
                    if last_bundle_id != '':
                        bundle.bundle_id = last_bundle_id

            bundle.add_tree(root)

            # If bundles_per_doc is set and we have read the specified number of bundles,
            # we should end the current document and return.
            # However, if the reader supports reading multiple zones, we can never know
            # if the current bundle has ended or there will be another tree for this bundle.
            # So in case of multizone readers we need to read one extra tree
            # and store it in the buffer (and include it into the next document).
            if self.bundles_per_doc and self.bundles_per_doc == bundle.number \
               and not self.is_multizone_reader():
                return
Exemple #8
0
class BaseWriter(Block):
    """Base class for all writer blocks.

    The writer works by rebinding `sys.stdout` before each document is
    processed (`before_process_document`).
    """

    def __init__(self,
                 files='-',
                 docname_as_file=False,
                 encoding='utf-8',
                 newline='\n',
                 **kwargs):
        """Store the output options and wrap `files` in a `Files` object.

        Args:
            files: output filename(s), passed to `Files` as `filenames` and
                also kept unchanged in `self.orig_files`.
            docname_as_file: write each document to the file named by its
                `docname` meta entry; only allowed with `files='-'`.
            encoding: encoding used when opening output files.
            newline: newline mode used when opening output files.
            **kwargs: forwarded to the parent constructor.

        Raises:
            ValueError: if `docname_as_file` is true and `files` is not '-'.
        """
        super().__init__(**kwargs)
        self.orig_files = files
        self.files = Files(filenames=files)
        self.encoding = encoding
        self.newline = newline
        self.docname_as_file = docname_as_file
        # The output file most recently opened by this writer and not yet
        # closed (None if there is none).
        self._opened_file = None
        if docname_as_file and files != '-':
            # Format instead of concatenating: `files` need not be a str
            # (presumably `Files` also accepts e.g. a list of filenames), and
            # `str + non-str` would raise TypeError instead of this ValueError.
            raise ValueError(
                "docname_as_file=1 is not compatible with files=%s" % (files,))

    @property
    def filename(self):
        """Property with the current filename."""
        return self.files.filename

    @property
    def file_number(self):
        """Property with the current file number (1-based)."""
        return self.files.file_number

    def next_filename(self):
        """Go to the next file and return its filename."""
        return self.files.next_filename()

    def _set_stdout(self, filename=None):
        """Point `sys.stdout` at `filename`, or at `sys.__stdout__` if None.

        The file previously opened by this writer (if any) is closed first,
        so that output files do not accumulate as open handles. Only files
        opened by this writer are ever closed here.
        """
        previous = self._opened_file
        if previous is not None:
            previous.close()
            self._opened_file = None
            # Never leave sys.stdout bound to a closed file, even if the
            # `open` below fails.
            if sys.stdout is previous:
                sys.stdout = sys.__stdout__
        if filename is None:
            sys.stdout = sys.__stdout__
        else:
            self._opened_file = open(filename,
                                     'wt',
                                     encoding=self.encoding,
                                     newline=self.newline)
            sys.stdout = self._opened_file

    def before_process_document(self, document):
        """Redirect `sys.stdout` to the destination of `document`.

        With `files='-'` the destination is the real stdout, or, if
        `docname_as_file` is set, a new file named by
        `document.meta['docname']` (when that entry is missing, a warning is
        logged and `sys.stdout` is left as it was). With explicit filenames
        the next filename is used, where '-' stands for the real stdout.

        Raises:
            RuntimeError: if explicit filenames were given and there is no
                filename left for this document.
        """
        if self.orig_files == '-':
            if self.docname_as_file:
                docname = document.meta.get('docname', None)
                if docname is not None:
                    logging.info('Writing to file %s.', docname)
                    self._set_stdout(docname)
                else:
                    logging.warning(
                        'docname_as_file=1 but the document contains no docname'
                    )
            else:
                self._set_stdout()
            return

        filename = self.next_filename()
        if filename is None:
            raise RuntimeError(
                'There are more documents to save than filenames given (%s)' %
                self.orig_files)
        elif filename == '-':
            logging.info('Writing to stdout.')
            self._set_stdout()
        else:
            logging.info('Writing to file %s.', filename)
            self._set_stdout(filename)
Exemple #9
0
class BaseReader(Block):
    """Base class for all reader blocks."""

    # pylint: disable=too-many-arguments
    def __init__(self,
                 files='-',
                 filehandle=None,
                 zone='keep',
                 bundles_per_doc=0,
                 encoding='utf-8-sig',
                 sent_id_filter=None,
                 split_docs=False,
                 ignore_sent_id=False,
                 **kwargs):
        """Store the reading options and wrap the input in a `Files` object.

        Args:
            files: input filename(s), passed to `Files` as `filenames`.
                Ignored (replaced by None) when `filehandle` is given.
            filehandle: optional input handle passed to `Files`.
            zone: zone assigned to every loaded tree; the default 'keep'
                leaves the zone of the trees untouched.
            bundles_per_doc: if non-zero, `process_document` stops once the
                document has this many bundles.
            encoding: passed to `Files`.
            sent_id_filter: if not None, converted with `str()` and compiled
                as a regular expression; only trees whose `sent_id` matches
                it are returned by `filtered_read_tree`.
            split_docs: if true, `process_document` stops at a tree marked
                with `newdoc` (unless it is the first tree of the document).
            ignore_sent_id: if true, the `_sent_id` of loaded trees is reset
                to None.
            **kwargs: forwarded to the parent constructor.
        """
        super().__init__(**kwargs)
        if filehandle is not None:
            files = None
        self.files = Files(filenames=files,
                           filehandle=filehandle,
                           encoding=encoding)
        self.zone = zone
        self.bundles_per_doc = bundles_per_doc
        # A tree that was read but belongs to the next document.
        self._buffer = None
        # Set by `process_document` once there is nothing more to read.
        self.finished = False
        self.sent_id_filter = None
        if sent_id_filter is not None:
            self.sent_id_filter = re.compile(str(sent_id_filter))
            logging.debug('Using sent_id_filter=%s', sent_id_filter)
        self.split_docs = split_docs
        self.ignore_sent_id = ignore_sent_id

    @staticmethod
    def is_multizone_reader():
        """Can this reader read bundles which contain more zones?

        This implementation always returns True.
        If a subclass supports just one zone per file (e.g. `read.Sentences`),
        this method should be overridden to return False, so `process_document`
        can take advantage of this knowledge and optimize the reading
        (no buffer needed even if `bundles_per_doc` is specified).
        """
        return True

    @property
    def filehandle(self):
        """Property with the current file handle."""
        return self.files.filehandle

    @property
    def filename(self):
        """Property with the current filename."""
        return self.files.filename

    @property
    def file_number(self):
        """Property with the current file number (1-based)."""
        return self.files.file_number

    def next_filehandle(self):
        """Go to the next file and return its filehandle."""
        return self.files.next_filehandle()

    def read_tree(self):
        """Load one (more) tree from self.files and return its root.

        This method must be overridden in all readers.
        Usually it is the only method that needs to be implemented.
        The implementation in this base class raises `NotImplementedError`.
        """
        raise NotImplementedError("Class %s doesn't implement read_tree" %
                                  self.__class__.__name__)

    def filtered_read_tree(self):
        """Load and return one more tree matching the `sent_id_filter`.

        This method uses `read_tree()` internally.
        This is the method called by `process_document`.
        Returns None when `read_tree()` has no more trees to offer.
        """
        tree = self.read_tree()
        if self.sent_id_filter is None:
            return tree
        while True:
            if tree is None:
                return None
            if self.sent_id_filter.match(tree.sent_id) is not None:
                return tree
            logging.debug(
                'Skipping sentence %s as it does not match the sent_id_filter %s.',
                tree.sent_id, self.sent_id_filter)
            tree = self.read_tree()

    # pylint: disable=too-many-branches,too-many-statements
    # Maybe the code could be refactored, but it is speed-critical,
    # so benchmarking is needed because calling extra methods may result in slowdown.
    def process_document(self, document):
        """Read trees from the input and add them to `document`.

        Bundles already present in `document` are reused in order; further
        bundles are created as needed. Trees whose `_sent_id` shares the part
        before the first '/' with the previous tree go to the same bundle,
        and the part after the '/' becomes the zone of the tree.

        Reading stops when the input yields no more trees, when `split_docs`
        is set and a `newdoc` tree starts another document, or when
        `bundles_per_doc` bundles have been filled. A tree that was read but
        belongs to the next document is kept in `self._buffer` and added
        first on the next call. `self.finished` is set to True when no file
        could be opened or when the input is exhausted and no file is left.
        """
        # The copy is reversed so that the next pre-existing bundle can be
        # taken with an O(1) `pop()` from the end instead of an O(n) `pop(0)`
        # from the front; the bundles are still consumed in document order.
        orig_bundles = document.bundles[:]
        orig_bundles.reverse()
        last_bundle_id = ''
        bundle = None

        # There may be a tree left in the buffer when reading the last doc.
        if self._buffer:
            root = self._buffer
            self._buffer = None
            if orig_bundles:
                bundle = orig_bundles.pop()
            else:
                bundle = document.create_bundle()
                if root._sent_id is not None:
                    bundle.bundle_id = root._sent_id.split('/', 1)[0]
            bundle.add_tree(root)
            if root.newdoc and root.newdoc is not True:
                document.meta["docname"] = root.newdoc

        # Open the first file if none is open yet; if there is none, we are done.
        if self.filehandle is None and self.next_filehandle() is None:
            self.finished = True
            return

        trees_loaded = 0
        while True:
            root = self.filtered_read_tree()
            if root is None:
                # Nothing was loaded for this document yet: try the next file.
                if trees_loaded == 0 and self.files.has_next_file():
                    self.next_filehandle()
                    continue
                self.finished = not self.files.has_next_file()
                break
            add_to_the_last_bundle = False
            trees_loaded += 1

            if self.ignore_sent_id:
                root._sent_id = None
            if root._sent_id is not None:
                parts = root._sent_id.split('/', 1)
                bundle_id = parts[0]
                if len(parts) == 2:
                    root.zone = parts[1]
                add_to_the_last_bundle = bundle_id == last_bundle_id
                last_bundle_id = bundle_id

            if self.zone != 'keep':
                root.zone = self.zone

            # The `# newdoc` comment in CoNLL-U marks a start of a new document.
            if root.newdoc:
                if not bundle and root.newdoc is not True:
                    document.meta["docname"] = root.newdoc
                if bundle and self.split_docs:
                    self._buffer = root
                    if orig_bundles:
                        logging.warning(
                            "split_docs=1 but the doc had contained %d bundles",
                            len(orig_bundles))
                    self.finished = False
                    return

            # assign new/next bundle to `bundle` if needed
            if not bundle or not add_to_the_last_bundle:
                if self.bundles_per_doc and bundle and self.bundles_per_doc == bundle.number:
                    self._buffer = root
                    if orig_bundles:
                        logging.warning(
                            "bundles_per_doc=%d but the doc had contained %d bundles",
                            self.bundles_per_doc, len(orig_bundles))
                    return

                if orig_bundles:
                    bundle = orig_bundles.pop()
                    if last_bundle_id and last_bundle_id != bundle.bundle_id:
                        logging.warning(
                            'Mismatch in bundle IDs: %s vs %s. Keeping the former one.',
                            bundle.bundle_id, last_bundle_id)
                else:
                    bundle = document.create_bundle()
                    if last_bundle_id != '':
                        bundle.bundle_id = last_bundle_id

            bundle.add_tree(root)

            # If bundles_per_doc is set and we have read the specified number of bundles,
            # we should end the current document and return.
            # However, if the reader supports reading multiple zones, we can never know
            # if the current bundle has ended or there will be another tree for this bundle.
            # So in case of multizone readers we need to read one extra tree
            # and store it in the buffer (and include it into the next document).
            if self.bundles_per_doc and self.bundles_per_doc == bundle.number \
               and not self.is_multizone_reader():
                return