Esempio n. 1
0
 def process_all(self):
     """Run ``process_one`` over every entry in ``self.inputs``.

     The output handle comes from ``open_or_default(self.output, 'wt',
     sys.stdout)`` and each input handle from ``open_or_default(filename,
     'rb', <binary stdin>)``; ``process_one`` receives the input handle,
     the output handle and the original ``filename``.
     """
     # The binary stdin stream does not change between inputs, so resolve
     # it once instead of on every iteration.  Python 3 exposes the byte
     # stream as ``sys.stdin.buffer``; where that attribute is missing
     # (Python 2) ``sys.stdin`` itself is used.
     stdin = getattr(sys.stdin, 'buffer', sys.stdin)

     with open_or_default(self.output, 'wt', sys.stdout) as out:
         for filename in self.inputs:
             with open_or_default(filename, 'rb', stdin) as fh:
                 self.process_one(fh, out, filename)
Esempio n. 2
0
    def test_open_or_default(self):
        """Exercise ``utils.open_or_default`` with a path, ``None`` and a stream."""
        warc_path = get_test_file('example.warc')
        default_fh = BytesIO(b'NOTWARC/1.0\r\n')

        # A filename is supplied: the first line read must come from that
        # file, not from the fallback stream.
        with utils.open_or_default(warc_path, 'rb', default_fh) as stream:
            first_line = stream.readline().decode('utf-8')
            assert first_line == 'WARC/1.0\r\n'

        # No filename: the content of the fallback stream is expected.
        with utils.open_or_default(None, 'rb', default_fh) as stream:
            first_line = stream.readline().decode('utf-8')
            assert first_line == 'NOTWARC/1.0\r\n'

        # Rewind, then hand the stream in as the first argument with no
        # fallback at all; its content is expected to be readable as-is.
        default_fh.seek(0)
        with utils.open_or_default(default_fh, 'rb', None) as stream:
            first_line = stream.readline().decode('utf-8')
            assert first_line == 'NOTWARC/1.0\r\n'
Esempio n. 3
0
    def process_all(self):
        """Wrap the output in optional compress/sort writers, then index.

        The output handle comes from ``open_or_default(self.output, "wt",
        sys.stdout)``.  When ``self.compress`` is truthy the handle is
        wrapped in a ``CompressedWriter``:

        * if ``self.compress`` is a string, a file of that name is opened
          for binary writing and passed as ``data_out``; a name without an
          extension gets ``".cdxj.gz"`` appended before being passed as
          ``data_out_name``;
        * otherwise ``self.compress`` itself is passed as ``data_out``
          together with ``self.data_out_name``.

        When ``self.sort`` is truthy the (possibly wrapped) handle is further
        wrapped in a ``SortingWriter``.  The final writer is stored in
        ``self.output`` before delegating to the parent ``process_all``.
        After a successful run the writer is flushed if any wrapping took
        place.

        A data file opened here is always closed, including when writer
        construction or the parent ``process_all`` raises.
        """
        data_out = None

        with open_or_default(self.output, "wt", sys.stdout) as fh:
            try:
                if self.compress:
                    if isinstance(self.compress, str):
                        # NOTE(review): the file is opened under the name as
                        # given, *before* the default extension is appended,
                        # so the path written to can differ from the
                        # ``data_out_name`` passed below -- confirm this is
                        # intended.
                        data_out = open(self.compress, "wb")
                        if os.path.splitext(self.compress)[1] == "":
                            self.compress += ".cdxj.gz"

                        fh = CompressedWriter(
                            fh,
                            data_out=data_out,
                            data_out_name=self.compress,
                            num_lines=self.num_lines,
                        )
                    else:
                        fh = CompressedWriter(
                            fh,
                            data_out=self.compress,
                            data_out_name=self.data_out_name,
                            num_lines=self.num_lines,
                        )

                if self.sort:
                    fh = SortingWriter(fh)

                self.output = fh

                super().process_all()

                # Flush only on success; the wrappers are flushed before the
                # data file is closed in the ``finally`` below.
                if self.sort or self.compress:
                    fh.flush()
            finally:
                # Only a file opened by this method is closed here; a
                # caller-supplied ``self.compress`` object is left alone.
                if data_out is not None:
                    data_out.close()
Esempio n. 4
0
def indexer(cmd):
    """Write one JSON line per archive record with the requested headers.

    ``cmd.fields`` is a comma-separated list of header names, ``cmd.inputs``
    an iterable of file names opened in binary mode, and ``cmd.output`` is
    handed to ``open_or_default`` with ``sys.stdout`` as the default.  For
    every record yielded by ``ArchiveIterator`` the requested headers are
    looked up in order; headers with a falsy value are left out of the
    emitted JSON object.
    """
    wanted = cmd.fields.split(',')

    with open_or_default(cmd.output, 'wt', sys.stdout) as out:
        for path in cmd.inputs:
            with open(path, 'rb') as stream:
                records = ArchiveIterator(stream,
                                          no_record_parse=True,
                                          arc2warc=True)
                for rec in records:
                    # Look each header up exactly once, in the requested
                    # order, and keep only those that have a truthy value.
                    looked_up = ((name, rec.rec_headers.get_header(name))
                                 for name in wanted)
                    entry = OrderedDict((name, val)
                                        for name, val in looked_up if val)

                    out.write(json.dumps(entry) + '\n')
Esempio n. 5
0
 def process_all(self):
     """Run ``process_one`` over every entry in ``self.inputs``.

     The output handle comes from ``open_or_default(self.output, 'wt',
     sys.stdout)`` and each input handle from ``open_or_default(filename,
     'rb', <binary stdin>)``; ``process_one`` receives the input handle,
     the output handle and the original ``filename``.
     """
     # Inputs are requested in binary mode ('rb'), so the default stream
     # must yield bytes as well.  On Python 3 ``sys.stdin`` is a text
     # stream and the byte stream lives at ``sys.stdin.buffer``; where that
     # attribute is missing (Python 2) ``sys.stdin`` itself is used.
     stdin = getattr(sys.stdin, 'buffer', sys.stdin)

     with open_or_default(self.output, 'wt', sys.stdout) as out:
         for filename in self.inputs:
             with open_or_default(filename, 'rb', stdin) as fh:
                 self.process_one(fh, out, filename)