def process_all(self):
    with open_or_default(self.output, 'wt', sys.stdout) as out:
        for filename in self.inputs:
            try:
                stdin = sys.stdin.buffer
            except AttributeError:  # py2
                stdin = sys.stdin

            with open_or_default(filename, 'rb', stdin) as fh:
                self.process_one(fh, out, filename)

def test_open_or_default(self):
    default_fh = BytesIO(b'NOTWARC/1.0\r\n')

    # A real filename is opened and read.
    with utils.open_or_default(get_test_file('example.warc'), 'rb', default_fh) as fh:
        assert fh.readline().decode('utf-8') == 'WARC/1.0\r\n'

    # None falls back to the supplied default handle.
    with utils.open_or_default(None, 'rb', default_fh) as fh:
        assert fh.readline().decode('utf-8') == 'NOTWARC/1.0\r\n'

    default_fh.seek(0)

    # An already-open file object is passed through unchanged.
    with utils.open_or_default(default_fh, 'rb', None) as fh:
        assert fh.readline().decode('utf-8') == 'NOTWARC/1.0\r\n'

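# The test above pins down open_or_default's contract: a string path is
# opened in the given mode, an already-open file object is passed through
# as-is, and None falls back to the supplied default handle. A minimal
# sketch of a context manager meeting that contract (the real
# warcio.utils.open_or_default may differ in its details):

from contextlib import contextmanager

@contextmanager
def open_or_default(filename, mode, default_fh):
    if filename and isinstance(filename, str):
        # A path: open it, and close it again on exit.
        with open(filename, mode) as fh:
            yield fh
    elif filename:
        # Already a file-like object: hand it through untouched.
        yield filename
    else:
        # Nothing given: fall back to the default (e.g. stdin/stdout).
        yield default_fh
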
def process_all(self):
    data_out = None
    with open_or_default(self.output, "wt", sys.stdout) as fh:
        if self.compress:
            if isinstance(self.compress, str):
                # Default the extension before opening, so the file created
                # on disk matches the name recorded in the index.
                if os.path.splitext(self.compress)[1] == "":
                    self.compress += ".cdxj.gz"
                data_out = open(self.compress, "wb")
                fh = CompressedWriter(
                    fh,
                    data_out=data_out,
                    data_out_name=self.compress,
                    num_lines=self.num_lines,
                )
            else:
                fh = CompressedWriter(
                    fh,
                    data_out=self.compress,
                    data_out_name=self.data_out_name,
                    num_lines=self.num_lines,
                )

        if self.sort:
            fh = SortingWriter(fh)

        self.output = fh
        super().process_all()

        if self.sort or self.compress:
            fh.flush()

        if data_out:
            data_out.close()

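# Both wrapper writers above expose the plain write()/flush() interface of
# the stream they wrap, so they can be stacked: here SortingWriter wraps
# the (possibly compressed) output, and its sorted lines feed the
# compressor on flush. As a hypothetical illustration of that pattern --
# not the actual SortingWriter implementation -- a writer that buffers
# lines and emits them sorted might look like this:

class SortingWriter:
    def __init__(self, out):
        self.out = out
        self.sortlist = []

    def write(self, line):
        # Buffer instead of writing through; order is only known at the end.
        self.sortlist.append(line)

    def flush(self):
        # Emit everything in sorted order, then flush the wrapped stream.
        self.out.write(''.join(sorted(self.sortlist)))
        self.sortlist = []
        if hasattr(self.out, 'flush'):
            self.out.flush()
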
import json
import sys
from collections import OrderedDict

from warcio.archiveiterator import ArchiveIterator
from warcio.utils import open_or_default


def indexer(cmd):
    fields = cmd.fields.split(',')

    with open_or_default(cmd.output, 'wt', sys.stdout) as out:
        for filename in cmd.inputs:
            with open(filename, 'rb') as fh:
                for record in ArchiveIterator(fh, no_record_parse=True, arc2warc=True):
                    # Keep only the requested WARC header fields, in order.
                    index = OrderedDict()
                    for field in fields:
                        value = record.rec_headers.get_header(field)
                        if value:
                            index[field] = value

                    out.write(json.dumps(index) + '\n')

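# Since indexer() only reads cmd.fields, cmd.output and cmd.inputs, it can
# be driven without a CLI parser. A usage sketch (the input file name and
# field choices are assumptions, not taken from the original):

from argparse import Namespace

indexer(Namespace(
    fields='WARC-Target-URI,WARC-Date,WARC-Type',
    output=None,                # None falls through to sys.stdout
    inputs=['example.warc'],
))
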
def process_all(self):
    with open_or_default(self.output, 'wt', sys.stdout) as out:
        for filename in self.inputs:
            # 'rb' implies bytes, so fall back to the binary stdin buffer;
            # text-mode sys.stdin would break this read on Python 3.
            with open_or_default(filename, 'rb', sys.stdin.buffer) as fh:
                self.process_one(fh, out, filename)