def __init__(self, files='-', filehandle=None, zone='keep', bundles_per_doc=0,
             encoding='utf-8-sig', sent_id_filter=None, split_docs=False,
             ignore_sent_id=False, **kwargs):
    """Create the reader and remember its configuration.

    An explicitly supplied `filehandle` takes precedence over `files`.
    `encoding` defaults to 'utf-8-sig', which transparently skips a BOM.
    """
    super().__init__(**kwargs)
    # A given filehandle wins over any filenames.
    if filehandle is not None:
        files = None
    self.files = Files(filenames=files, filehandle=filehandle, encoding=encoding)
    self.zone = zone
    self.bundles_per_doc = bundles_per_doc
    self._buffer = None     # one pre-read tree kept between documents
    self.finished = False   # becomes True once all input is consumed
    if sent_id_filter is None:
        self.sent_id_filter = None
    else:
        self.sent_id_filter = re.compile(str(sent_id_filter))
        logging.debug('Using sent_id_filter=%s', sent_id_filter)
    self.split_docs = split_docs
    self.ignore_sent_id = ignore_sent_id
def __init__(self, files='-', docname_as_file=False, encoding='utf-8',
             newline='\n', **kwargs):
    """Create the writer.

    Raises:
        ValueError: if `docname_as_file` is combined with an explicit `files`
            argument (per-document filenames make sense only with stdout).
    """
    super().__init__(**kwargs)
    self.orig_files = files
    self.files = Files(filenames=files)
    self.encoding = encoding
    self.newline = newline
    self.docname_as_file = docname_as_file
    # Writing one file per document is only compatible with the default stdout.
    if docname_as_file and files != '-':
        raise ValueError("docname_as_file=1 is not compatible with files=" + files)
def __init__(self, files='-', zone='keep', bundles_per_doc=0, encoding='utf-8',
             sent_id_filter=None, split_docs=False, **kwargs):
    """Create the reader and remember its configuration."""
    super().__init__(**kwargs)
    self.files = Files(filenames=files)
    # Expose the expanded filename list (an addition on top of the upstream base class).
    self.filenames = self.files.filenames
    self.zone = zone
    self.bundles_per_doc = bundles_per_doc
    self.encoding = encoding
    self._buffer = None     # one pre-read tree kept between documents
    self.finished = False   # becomes True once all input is consumed
    if sent_id_filter is None:
        self.sent_id_filter = None
    else:
        self.sent_id_filter = re.compile(str(sent_id_filter))
        logging.debug('Using sent_id_filter=%s', sent_id_filter)
    self.split_docs = split_docs
def __init__(self, files='-', filehandle=None, docname_as_file=False,
             encoding='utf-8', newline='\n', overwrite=False, **kwargs):
    """Create the writer.

    Raises:
        ValueError: when mutually exclusive redirection modes are combined
            (`docname_as_file`/`overwrite` with explicit `files`, or both at once).
    """
    super().__init__(**kwargs)
    self.orig_files = files
    self.orig_stdout = sys.stdout
    # A given filehandle wins over any filenames; remember this fact in orig_files.
    if filehandle is not None:
        files = None
        self.orig_files = '<filehandle>'
    self.files = Files(filenames=files, filehandle=filehandle)
    self.encoding = encoding
    self.newline = newline
    self.docname_as_file = docname_as_file
    if docname_as_file and files != '-':
        raise ValueError("docname_as_file=1 is not compatible with files=" + files)
    self.overwrite = overwrite
    if overwrite and files != '-':
        raise ValueError("overwrite=1 is not compatible with files=" + files)
    if overwrite and docname_as_file:
        raise ValueError("overwrite=1 is not compatible with docname_as_file=1")
class BaseWriter(Block):
    """Base class for all writer blocks.

    Writers redirect `sys.stdout` to the proper output destination before each
    document is processed and restore it afterwards.
    """

    def __init__(self, files='-', filehandle=None, docname_as_file=False, encoding='utf-8',
                 newline='\n', overwrite=False, **kwargs):
        """Create the writer.

        Args:
            files: output filename(s); '-' means the standard output.
            filehandle: write to this already-opened filehandle instead of `files`.
            docname_as_file: derive the output filename from each document's docname meta.
            encoding: encoding of the output files.
            newline: newline convention for the output files.
            overwrite: write each document back to the file it was loaded from.

        Raises:
            ValueError: when mutually exclusive redirection modes are combined.
        """
        super().__init__(**kwargs)
        self.orig_files = files
        self.orig_stdout = sys.stdout
        # A given filehandle wins over any filenames.
        if filehandle is not None:
            files = None
            self.orig_files = '<filehandle>'
        self.files = Files(filenames=files, filehandle=filehandle)
        self.encoding = encoding
        self.newline = newline
        self.docname_as_file = docname_as_file
        if docname_as_file and files != '-':
            raise ValueError("docname_as_file=1 is not compatible with files=" + files)
        self.overwrite = overwrite
        if overwrite and files != '-':
            raise ValueError("overwrite=1 is not compatible with files=" + files)
        if overwrite and docname_as_file:
            raise ValueError("overwrite=1 is not compatible with docname_as_file=1")

    @property
    def filename(self):
        """Property with the current filename."""
        return self.files.filename

    @property
    def file_number(self):
        """Property with the current file number (1-based)."""
        return self.files.file_number

    def next_filename(self):
        """Go to the next file and return its filename."""
        return self.files.next_filename()

    def before_process_document(self, document):
        """Redirect `sys.stdout` to the output destination for `document`."""
        udapi.core.coref.store_coref_to_misc(document)
        if self.orig_files == '<filehandle>':
            logging.info('Writing to filehandle.')
            sys.stdout = self.files.filehandle
            return
        old_filehandle = sys.stdout
        if self.orig_files == '-':
            if self.docname_as_file:
                docname = document.meta.get('docname', None)
                if docname is not None:
                    logging.info('Writing to file %s.', docname)
                    sys.stdout = open(docname, 'wt', encoding=self.encoding,
                                      newline=self.newline)
                else:
                    logging.warning('docname_as_file=1 but the document contains no docname')
            elif self.overwrite:
                docname = document.meta.get('loaded_from', None)
                if docname is not None:
                    logging.info('Writing to file %s.', docname)
                    sys.stdout = open(docname, 'wt', encoding=self.encoding,
                                      newline=self.newline)
                else:
                    # NOTE: fixed typo in the original message ("documet").
                    logging.warning('overwrite=1 but document.meta["loaded_from"] is None')
            else:
                sys.stdout = self.orig_stdout
        else:
            filename = self.next_filename()
            if filename is None:
                raise RuntimeError('There are more documents to save than filenames given (%s)'
                                   % self.orig_files)
            elif filename == '-':
                logging.info('Writing to stdout.')
                sys.stdout = self.orig_stdout
            else:
                logging.info('Writing to file %s.', filename)
                sys.stdout = open(filename, 'wt', encoding=self.encoding, newline=self.newline)
        # Close the previous output file, but never the original stdout.
        if old_filehandle not in (sys.stdout, self.orig_stdout):
            old_filehandle.close()

    def after_process_document(self, document):
        """Flush the output and restore the original stdout."""
        sys.stdout.flush()
        if sys.stdout != self.orig_stdout:
            sys.stdout.close()
            sys.stdout = self.orig_stdout
class BaseReader(Block):
    """Base class for all reader blocks."""

    # pylint: disable=too-many-arguments
    def __init__(self, files='-', filehandle=None, zone='keep', bundles_per_doc=0,
                 encoding='utf-8-sig', sent_id_filter=None, split_docs=False,
                 ignore_sent_id=False, **kwargs):
        """Create the reader.

        Args:
            files: name(s) of the input file(s); '-' means the standard input.
            filehandle: read from this already-opened filehandle instead of `files`.
            zone: zone to assign to the loaded trees; 'keep' preserves the original zone.
            bundles_per_doc: split the input into documents with this number of bundles
                (0 means no splitting).
            encoding: input encoding; the default 'utf-8-sig' skips a potential BOM.
            sent_id_filter: a regular expression; only trees whose sent_id matches it
                are loaded.
            split_docs: start a new document on each `# newdoc` comment.
            ignore_sent_id: ignore the sent_id attributes of the loaded trees.
        """
        super().__init__(**kwargs)
        # An explicitly given filehandle takes precedence over filenames.
        if filehandle is not None:
            files = None
        self.files = Files(filenames=files, filehandle=filehandle, encoding=encoding)
        self.zone = zone
        self.bundles_per_doc = bundles_per_doc
        self._buffer = None    # one pre-read tree stored for the next document
        self.finished = False  # becomes True once all input has been consumed
        self.sent_id_filter = None
        if sent_id_filter is not None:
            self.sent_id_filter = re.compile(str(sent_id_filter))
            logging.debug('Using sent_id_filter=%s', sent_id_filter)
        self.split_docs = split_docs
        self.ignore_sent_id = ignore_sent_id

    @staticmethod
    def is_multizone_reader():
        """Can this reader read bundles which contain more zones?

        This implementation returns always True.
        If a subclass supports just one zone in file (e.g. `read.Sentences`),
        this method should be overridden to return False,
        so `process_document` can take advantage of this knowledge and optimize
        the reading (no buffer needed even if `bundles_per_doc` specified).
        """
        return True

    @property
    def filehandle(self):
        """Property with the current file handle."""
        return self.files.filehandle

    @property
    def filename(self):
        """Property with the current filename."""
        return self.files.filename

    @property
    def file_number(self):
        """Property with the current file number (1-based)."""
        return self.files.file_number

    def next_filehandle(self):
        """Go to the next file and return its filehandle."""
        return self.files.next_filehandle()

    def read_tree(self):
        """Load one (more) tree from self.files and return its root.

        This method must be overridden in all readers.
        Usually it is the only method that needs to be implemented.
        The implementation in this base class raises `NotImplementedError`.
        """
        raise NotImplementedError("Class %s doesn't implement read_tree" % self.__class__.__name__)

    def filtered_read_tree(self):
        """Load and return one more tree matching the `sent_id_filter`.

        This method uses `read_tree()` internally.
        This is the method called by `process_document`.
        """
        tree = self.read_tree()
        if self.sent_id_filter is None:
            return tree
        # Keep reading until a tree matches the filter (or the input ends).
        while True:
            if tree is None:
                return None
            if self.sent_id_filter.match(tree.sent_id) is not None:
                return tree
            logging.debug('Skipping sentence %s as it does not match the sent_id_filter %s.',
                          tree.sent_id, self.sent_id_filter)
            tree = self.read_tree()

    # pylint: disable=too-many-branches,too-many-statements
    # Maybe the code could be refactored, but it is speed-critical,
    # so benchmarking is needed because calling extra methods may result in slowdown.
    def process_document(self, document):
        """Fill `document` with bundles/trees loaded from the input files."""
        orig_bundles = document.bundles[:]
        last_bundle_id = ''
        bundle = None
        # There may be a tree left in the buffer when reading the last doc.
        if self._buffer:
            root = self._buffer
            self._buffer = None
            if orig_bundles:
                # TODO list.pop(0) is inefficient, use collections.deque.popleft()
                bundle = orig_bundles.pop(0)
            else:
                bundle = document.create_bundle()
            if root._sent_id is not None:
                # The part of sent_id before a slash is the bundle id.
                bundle.bundle_id = root._sent_id.split('/', 1)[0]
            bundle.add_tree(root)
            if root.newdoc and root.newdoc is not True:
                document.meta["docname"] = root.newdoc
        filehandle = self.filehandle
        if filehandle is None:
            filehandle = self.next_filehandle()
            if filehandle is None:
                # No more input files: nothing to read.
                self.finished = True
                return
        trees_loaded = 0
        while True:
            root = self.filtered_read_tree()
            if root is None:
                # End of the current file: move on to the next one if any.
                if trees_loaded == 0 and self.files.has_next_file():
                    filehandle = self.next_filehandle()
                    continue
                self.finished = not self.files.has_next_file()
                break
            add_to_the_last_bundle = 0
            trees_loaded += 1
            if self.ignore_sent_id:
                root._sent_id = None
            if root._sent_id is not None:
                # sent_id has the form "bundle_id" or "bundle_id/zone".
                parts = root._sent_id.split('/', 1)
                bundle_id = parts[0]
                if len(parts) == 2:
                    root.zone = parts[1]
                add_to_the_last_bundle = bundle_id == last_bundle_id
                last_bundle_id = bundle_id
            if self.zone != 'keep':
                root.zone = self.zone
            # The `# newdoc` comment in CoNLL-U marks a start of a new document.
            if root.newdoc:
                if not bundle and root.newdoc is not True:
                    document.meta["docname"] = root.newdoc
                if bundle and self.split_docs:
                    # Keep this tree for the next document and finish this one.
                    self._buffer = root
                    if orig_bundles:
                        logging.warning("split_docs=1 but the doc had contained %d bundles",
                                        len(orig_bundles))
                    self.finished = False
                    return
            # assign new/next bundle to `bundle` if needed
            if not bundle or not add_to_the_last_bundle:
                if self.bundles_per_doc and bundle and self.bundles_per_doc == bundle.number:
                    # Document quota reached: keep this tree for the next document.
                    self._buffer = root
                    if orig_bundles:
                        logging.warning("bundles_per_doc=%d but the doc had contained %d bundles",
                                        self.bundles_per_doc, len(orig_bundles))
                    return
                if orig_bundles:
                    # TODO list.pop(0) is inefficient, use collections.deque.popleft()
                    bundle = orig_bundles.pop(0)
                    if last_bundle_id and last_bundle_id != bundle.bundle_id:
                        logging.warning('Mismatch in bundle IDs: %s vs %s. Keeping the former one.',
                                        bundle.bundle_id, last_bundle_id)
                else:
                    bundle = document.create_bundle()
                    if last_bundle_id != '':
                        bundle.bundle_id = last_bundle_id
            bundle.add_tree(root)
            # If bundles_per_doc is set and we have read the specified number of bundles,
            # we should end the current document and return.
            # However, if the reader supports reading multiple zones, we can never know
            # if the current bundle has ended or there will be another tree for this bundle.
            # So in case of multizone readers we need to read one extra tree
            # and store it in the buffer (and include it into the next document).
            if self.bundles_per_doc and self.bundles_per_doc == bundle.number \
                    and not self.is_multizone_reader():
                return
class BaseWriter(Block):
    """Base class for all writer blocks.

    Writers redirect `sys.stdout` to the proper output file before each
    document is processed.
    """

    def __init__(self, files='-', docname_as_file=False, encoding='utf-8',
                 newline='\n', **kwargs):
        """Create the writer.

        Args:
            files: output filename(s); '-' means the standard output.
            docname_as_file: derive the output filename from each document's docname meta.
            encoding: encoding of the output files.
            newline: newline convention for the output files.

        Raises:
            ValueError: if `docname_as_file` is combined with an explicit `files`.
        """
        super().__init__(**kwargs)
        self.orig_files = files
        self.files = Files(filenames=files)
        self.encoding = encoding
        self.newline = newline
        self.docname_as_file = docname_as_file
        if docname_as_file and files != '-':
            raise ValueError(
                "docname_as_file=1 is not compatible with files=" + files)

    @property
    def filename(self):
        """Property with the current filename."""
        return self.files.filename

    @property
    def file_number(self):
        """Property with the current file number (1-based)."""
        return self.files.file_number

    def next_filename(self):
        """Go to the next file and return its filename."""
        return self.files.next_filename()

    def before_process_document(self, document):
        """Redirect `sys.stdout` to the output file for `document`.

        Raises:
            RuntimeError: when there are more documents than given filenames.
        """
        old_filehandle = sys.stdout
        if self.orig_files == '-':
            if self.docname_as_file:
                docname = document.meta.get('docname', None)
                if docname is not None:
                    logging.info('Writing to file %s.', docname)
                    sys.stdout = open(docname, 'wt', encoding=self.encoding,
                                      newline=self.newline)
                else:
                    logging.warning(
                        'docname_as_file=1 but the document contains no docname'
                    )
            else:
                sys.stdout = sys.__stdout__
        else:
            filename = self.next_filename()
            if filename is None:
                raise RuntimeError(
                    'There are more documents to save than filenames given (%s)'
                    % self.orig_files)
            elif filename == '-':
                logging.info('Writing to stdout.')
                sys.stdout = sys.__stdout__
            else:
                logging.info('Writing to file %s.', filename)
                sys.stdout = open(filename, 'wt', encoding=self.encoding,
                                  newline=self.newline)
        # BUGFIX: the original compared bound methods (`old_filehandle.fileno !=
        # sys.stdout.fileno`, no call parentheses) *before* reassigning sys.stdout,
        # which was always false, so previously opened output files leaked.
        # Close the previous handle now, but never the real stdout.
        if old_filehandle not in (sys.stdout, sys.__stdout__):
            old_filehandle.close()
class BaseReader(Block):
    """Base class for all reader blocks."""

    # pylint: disable=too-many-arguments
    def __init__(self, files='-', filehandle=None, zone='keep', bundles_per_doc=0,
                 encoding='utf-8-sig', sent_id_filter=None, split_docs=False,
                 ignore_sent_id=False, **kwargs):
        """Create the reader.

        Args:
            files: name(s) of the input file(s); '-' means the standard input.
            filehandle: read from this already-opened filehandle instead of `files`.
            zone: zone to assign to the loaded trees; 'keep' preserves the original zone.
            bundles_per_doc: split the input into documents with this number of bundles
                (0 means no splitting).
            encoding: input encoding; the default 'utf-8-sig' skips a potential BOM.
            sent_id_filter: a regular expression; only trees whose sent_id matches it
                are loaded.
            split_docs: start a new document on each `# newdoc` comment.
            ignore_sent_id: ignore the sent_id attributes of the loaded trees.
        """
        super().__init__(**kwargs)
        # An explicitly given filehandle takes precedence over filenames.
        if filehandle is not None:
            files = None
        self.files = Files(filenames=files, filehandle=filehandle, encoding=encoding)
        self.zone = zone
        self.bundles_per_doc = bundles_per_doc
        self._buffer = None    # one pre-read tree stored for the next document
        self.finished = False  # becomes True once all input has been consumed
        self.sent_id_filter = None
        if sent_id_filter is not None:
            self.sent_id_filter = re.compile(str(sent_id_filter))
            logging.debug('Using sent_id_filter=%s', sent_id_filter)
        self.split_docs = split_docs
        self.ignore_sent_id = ignore_sent_id

    @staticmethod
    def is_multizone_reader():
        """Can this reader read bundles which contain more zones?

        This implementation returns always True.
        If a subclass supports just one zone in file (e.g. `read.Sentences`),
        this method should be overridden to return False,
        so `process_document` can take advantage of this knowledge and optimize
        the reading (no buffer needed even if `bundles_per_doc` specified).
        """
        return True

    @property
    def filehandle(self):
        """Property with the current file handle."""
        return self.files.filehandle

    @property
    def filename(self):
        """Property with the current filename."""
        return self.files.filename

    @property
    def file_number(self):
        """Property with the current file number (1-based)."""
        return self.files.file_number

    def next_filehandle(self):
        """Go to the next file and return its filehandle."""
        return self.files.next_filehandle()

    def read_tree(self):
        """Load one (more) tree from self.files and return its root.

        This method must be overridden in all readers.
        Usually it is the only method that needs to be implemented.
        The implementation in this base class raises `NotImplementedError`.
        """
        raise NotImplementedError("Class %s doesn't implement read_tree" % self.__class__.__name__)

    def filtered_read_tree(self):
        """Load and return one more tree matching the `sent_id_filter`.

        This method uses `read_tree()` internally.
        This is the method called by `process_document`.
        """
        tree = self.read_tree()
        if self.sent_id_filter is None:
            return tree
        # Keep reading until a tree matches the filter (or the input ends).
        while True:
            if tree is None:
                return None
            if self.sent_id_filter.match(tree.sent_id) is not None:
                return tree
            logging.debug(
                'Skipping sentence %s as it does not match the sent_id_filter %s.',
                tree.sent_id, self.sent_id_filter)
            tree = self.read_tree()

    # pylint: disable=too-many-branches,too-many-statements
    # Maybe the code could be refactored, but it is speed-critical,
    # so benchmarking is needed because calling extra methods may result in slowdown.
    def process_document(self, document):
        """Fill `document` with bundles/trees loaded from the input files."""
        orig_bundles = document.bundles[:]
        last_bundle_id = ''
        bundle = None
        # There may be a tree left in the buffer when reading the last doc.
        if self._buffer:
            root = self._buffer
            self._buffer = None
            if orig_bundles:
                # TODO list.pop(0) is inefficient, use collections.deque.popleft()
                bundle = orig_bundles.pop(0)
            else:
                bundle = document.create_bundle()
            if root._sent_id is not None:
                # The part of sent_id before a slash is the bundle id.
                bundle.bundle_id = root._sent_id.split('/', 1)[0]
            bundle.add_tree(root)
            if root.newdoc and root.newdoc is not True:
                document.meta["docname"] = root.newdoc
        filehandle = self.filehandle
        if filehandle is None:
            filehandle = self.next_filehandle()
            if filehandle is None:
                # No more input files: nothing to read.
                self.finished = True
                return
        trees_loaded = 0
        while True:
            root = self.filtered_read_tree()
            if root is None:
                # End of the current file: move on to the next one if any.
                if trees_loaded == 0 and self.files.has_next_file():
                    filehandle = self.next_filehandle()
                    continue
                self.finished = not self.files.has_next_file()
                break
            add_to_the_last_bundle = 0
            trees_loaded += 1
            if self.ignore_sent_id:
                root._sent_id = None
            if root._sent_id is not None:
                # sent_id has the form "bundle_id" or "bundle_id/zone".
                parts = root._sent_id.split('/', 1)
                bundle_id = parts[0]
                if len(parts) == 2:
                    root.zone = parts[1]
                add_to_the_last_bundle = bundle_id == last_bundle_id
                last_bundle_id = bundle_id
            if self.zone != 'keep':
                root.zone = self.zone
            # The `# newdoc` comment in CoNLL-U marks a start of a new document.
            if root.newdoc:
                if not bundle and root.newdoc is not True:
                    document.meta["docname"] = root.newdoc
                if bundle and self.split_docs:
                    # Keep this tree for the next document and finish this one.
                    self._buffer = root
                    if orig_bundles:
                        logging.warning(
                            "split_docs=1 but the doc had contained %d bundles",
                            len(orig_bundles))
                    self.finished = False
                    return
            # assign new/next bundle to `bundle` if needed
            if not bundle or not add_to_the_last_bundle:
                if self.bundles_per_doc and bundle and self.bundles_per_doc == bundle.number:
                    # Document quota reached: keep this tree for the next document.
                    self._buffer = root
                    if orig_bundles:
                        logging.warning(
                            "bundles_per_doc=%d but the doc had contained %d bundles",
                            self.bundles_per_doc, len(orig_bundles))
                    return
                if orig_bundles:
                    # TODO list.pop(0) is inefficient, use collections.deque.popleft()
                    bundle = orig_bundles.pop(0)
                    if last_bundle_id and last_bundle_id != bundle.bundle_id:
                        logging.warning(
                            'Mismatch in bundle IDs: %s vs %s. Keeping the former one.',
                            bundle.bundle_id, last_bundle_id)
                else:
                    bundle = document.create_bundle()
                    if last_bundle_id != '':
                        bundle.bundle_id = last_bundle_id
            bundle.add_tree(root)
            # If bundles_per_doc is set and we have read the specified number of bundles,
            # we should end the current document and return.
            # However, if the reader supports reading multiple zones, we can never know
            # if the current bundle has ended or there will be another tree for this bundle.
            # So in case of multizone readers we need to read one extra tree
            # and store it in the buffer (and include it into the next document).
            if self.bundles_per_doc and self.bundles_per_doc == bundle.number \
                    and not self.is_multizone_reader():
                return