Ejemplo n.º 1
0
    def build_warc_record(self, url, warc_date=None, recorder=None, data=None,
        concurrent_to=None, warc_type=None, content_type=None, remote_ip=None,
        profile=None, refers_to=None, refers_to_target_uri=None,
        refers_to_date=None, payload_digest=None):
        """Assemble a ``warctools.WarcRecord`` for ``url``.

        The body is taken from ``recorder`` when one is supplied (its
        ``tempfile`` is rewound and handed over as the content file);
        otherwise ``data`` is passed as the record content and hashed with
        ``self.digest_algorithm``.

        Headers are emitted in this order: type (if given), id, date, url,
        every optional header whose argument is not None, content length,
        block digest and, where applicable, payload digest.

        :param url: value of the record's URL header
        :param warc_date: value of the date header; defaults to the current
            UTC time formatted by ``warctools.warc.warc_datetime_str``
        :param recorder: object exposing ``__len__``, ``block_digest``,
            ``payload_digest`` and ``tempfile``
        :param data: body used when ``recorder`` is None
        :param payload_digest: explicit payload digest header value; when it
            is falsy and there is no recorder, the digest of ``data`` is used
        :return: the constructed ``warctools.WarcRecord``
        """
        if warc_date is None:
            now = datetime.datetime.utcnow()
            warc_date = warctools.warc.warc_datetime_str(now)

        rec_cls = warctools.WarcRecord
        record_id = rec_cls.random_warc_uuid()

        headers = []
        if warc_type is not None:
            headers.append((rec_cls.TYPE, warc_type))
        headers.append((rec_cls.ID, record_id))
        headers.append((rec_cls.DATE, warc_date))
        headers.append((rec_cls.URL, url))

        # Header constants are looked up by name only for the values that
        # are actually present.
        optional_headers = (
            ('IP_ADDRESS', remote_ip),
            ('PROFILE', profile),
            ('REFERS_TO', refers_to),
            ('REFERS_TO_TARGET_URI', refers_to_target_uri),
            ('REFERS_TO_DATE', refers_to_date),
            ('CONCURRENT_TO', concurrent_to),
            ('CONTENT_TYPE', content_type),
            ('PAYLOAD_DIGEST', payload_digest),
        )
        for attr_name, value in optional_headers:
            if value is not None:
                headers.append((getattr(rec_cls, attr_name), value))

        if recorder is None:
            # In-memory body: length and digests are derived from ``data``.
            headers.append(
                (rec_cls.CONTENT_LENGTH, str(len(data)).encode('latin1')))
            body_digest = hashlib.new(self.digest_algorithm, data)
            headers.append(
                (rec_cls.BLOCK_DIGEST,
                 warcprox.digest_str(body_digest, self.base32)))
            if not payload_digest:
                headers.append(
                    (rec_cls.PAYLOAD_DIGEST,
                     warcprox.digest_str(body_digest, self.base32)))
            return rec_cls(headers=headers, content=(content_type, data))

        # Recorded body: length and digests come from the recorder itself.
        headers.append(
            (rec_cls.CONTENT_LENGTH, str(len(recorder)).encode('latin1')))
        headers.append(
            (rec_cls.BLOCK_DIGEST,
             warcprox.digest_str(recorder.block_digest, self.base32)))
        if recorder.payload_digest is not None:
            headers.append(
                (rec_cls.PAYLOAD_DIGEST,
                 warcprox.digest_str(recorder.payload_digest, self.base32)))

        # Rewind so the record is written from the start of the capture.
        recorder.tempfile.seek(0)
        return rec_cls(headers=headers, content_file=recorder.tempfile)
Ejemplo n.º 2
0
    def build_warc_record(self, url, warc_date=None, content_buffer=None,
            content_file=None, content_length=None, concurrent_to=None,
            warc_type=None, content_type=None, remote_ip=None, profile=None,
            refers_to=None, refers_to_target_uri=None, refers_to_date=None,
            record_id=None, block_digest=None, payload_digest=None):
        """Build a ``warctools.WarcRecord`` from caller-supplied header values.

        Exactly one body source is expected: ``content_file`` (together with
        ``content_length``) or ``content_buffer``.

        :param url: value of the record's URL header
        :param warc_date: value of the date header; defaults to the current
            UTC time formatted by ``warctools.warc.warc_datetime_str``
        :param content_buffer: in-memory body, used when ``content_file`` is
            None
        :param content_file: file object holding the body
        :param content_length: value of the content-length header; required
            when ``content_file`` is given
        :param record_id: record id header value; a random WARC UUID is
            generated when None
        :param block_digest: value of the block-digest header, if any
        :param payload_digest: value of the payload-digest header, if any
        :return: the constructed ``warctools.WarcRecord``
        :raises AssertionError: if ``content_file`` is combined with
            ``content_buffer`` or given without ``content_length``, or if
            neither body source is provided
        """
        if warc_date is None:
            # WARC-Date is a UTC timestamp, so the default must come from the
            # UTC clock rather than local wall-clock time.
            warc_date = warctools.warc.warc_datetime_str(datetime.utcnow())

        if record_id is None:
            record_id = warctools.WarcRecord.random_warc_uuid()

        headers = []
        if warc_type is not None:
            headers.append((warctools.WarcRecord.TYPE, warc_type))
        headers.append((warctools.WarcRecord.ID, record_id))
        headers.append((warctools.WarcRecord.DATE, warc_date))
        headers.append((warctools.WarcRecord.URL, url))
        if remote_ip is not None:
            headers.append((warctools.WarcRecord.IP_ADDRESS, remote_ip))
        if profile is not None:
            headers.append((warctools.WarcRecord.PROFILE, profile))
        if refers_to is not None:
            headers.append((warctools.WarcRecord.REFERS_TO, refers_to))
        if refers_to_target_uri is not None:
            headers.append((warctools.WarcRecord.REFERS_TO_TARGET_URI, refers_to_target_uri))
        if refers_to_date is not None:
            headers.append((warctools.WarcRecord.REFERS_TO_DATE, refers_to_date))
        if concurrent_to is not None:
            headers.append((warctools.WarcRecord.CONCURRENT_TO, concurrent_to))
        if content_type is not None:
            headers.append((warctools.WarcRecord.CONTENT_TYPE, content_type))
        if content_length is not None:
            headers.append((warctools.WarcRecord.CONTENT_LENGTH, content_length))
        if block_digest is not None:
            headers.append((warctools.WarcRecord.BLOCK_DIGEST, block_digest))
        if payload_digest is not None:
            # The payload digest goes under its own header name; emitting it
            # as a block digest would mislabel it and could duplicate the
            # block-digest header.
            headers.append((warctools.WarcRecord.PAYLOAD_DIGEST, payload_digest))

        if content_file is not None:
            assert content_buffer is None
            assert content_length is not None
            record = warctools.WarcRecord(headers=headers, content_file=content_file)
        else:
            assert content_buffer is not None
            content_tuple = (content_type, content_buffer)
            record = warctools.WarcRecord(headers=headers, content=content_tuple)

        return record
Ejemplo n.º 3
0
    def write_warc_record(self,
                          record_type,
                          url,
                          data,
                          content_type,
                          warc_date=None,
                          out_file=None,
                          extra_headers=None):
        """Build one WARC record and write it, gzipped, to a WARC file.

        :param record_type: value of the record's type header
        :param url: value of the record's URL header
        :param data: record body as bytes; it is also hashed with SHA-1 for
            the block-digest header
        :param content_type: content type paired with ``data`` in the record
        :param warc_date: date header value, or an object with an
            ``isoformat`` attribute that is converted with
            ``warctools.warc.warc_datetime_str``; defaults to
            ``timezone.now()``
        :param out_file: file to write to; when falsy, one is obtained from
            ``self.open_warc_for_writing()`` and passed to
            ``self.close_warc_after_writing()`` once the record is written
        :param extra_headers: optional iterable of additional header tuples
            appended after the standard ones
        :return: the list of header tuples used for the record
        """
        # set default date and convert to string if necessary
        warc_date = warc_date or timezone.now()
        if hasattr(warc_date, 'isoformat'):
            warc_date = warctools.warc.warc_datetime_str(warc_date)

        close_file = not out_file
        out_file = out_file or self.open_warc_for_writing()

        # hexdigest() returns text; encode it explicitly instead of
        # %-formatting it into a bytes template, which raises TypeError on
        # Python 3.
        block_digest = b'sha1:' + hashlib.sha1(data).hexdigest().encode('ascii')

        headers = [(warctools.WarcRecord.TYPE, record_type),
                   (warctools.WarcRecord.ID,
                    warctools.WarcRecord.random_warc_uuid()),
                   (warctools.WarcRecord.DATE, warc_date),
                   (warctools.WarcRecord.URL, url),
                   (warctools.WarcRecord.BLOCK_DIGEST, block_digest)]
        if extra_headers:
            headers.extend(extra_headers)
        record = warctools.WarcRecord(headers=headers,
                                      content=(content_type, data))
        record.write_to(out_file, gzip=True)

        # Only finalize files this method opened itself.
        if close_file:
            self.close_warc_after_writing(out_file)

        return headers
Ejemplo n.º 4
0
    def build_warcinfo_record(self, filename):
        """Build the warcinfo record for the WARC file named ``filename``.

        The record carries id, type, filename and date headers, and an
        ``application/warc-fields`` body listing the warcprox version, the
        local hostname, the address returned by ``self._local_address()``
        and the WARC format version, one CRLF-terminated field per line.

        :param filename: WARC file name (text); encoded as latin-1 for the
            filename header
        :return: the constructed ``warctools.WarcRecord``
        """
        warc_record_date = self.format_warc_date(datetime.datetime.utcnow())
        rec_cls = warctools.WarcRecord

        headers = [
            (rec_cls.ID, rec_cls.random_warc_uuid()),
            (rec_cls.TYPE, rec_cls.WARCINFO),
            (rec_cls.FILENAME, filename.encode('latin1')),
            (rec_cls.DATE, warc_record_date),
        ]

        software_field = (b'software: warcprox ' +
                          warcprox.__version__.encode('latin1'))
        hostname_field = 'hostname: {}'.format(
            socket.gethostname()).encode('latin1')
        ip_field = ('ip: %s' % self._local_address()).encode('latin1')
        # The robots, description and isPartOf fields are not emitted.
        field_lines = (
            software_field,
            hostname_field,
            ip_field,
            b'format: WARC File Format 1.0',
        )
        data = b''.join(line + b'\r\n' for line in field_lines)

        return rec_cls(headers=headers,
                       content=(b'application/warc-fields', data))
Ejemplo n.º 5
0
def write_perma_warc_header(out_file, guid, timestamp):
    """Write a gzipped warcinfo record tagged with a Perma GUID to out_file.

    The record has id, type and date headers and an
    ``application/warc-fields`` body naming the operator, the WARC format
    version and the given ``guid``.

    :param out_file: writable file object the record is written to
    :param guid: identifier placed in the ``Perma-GUID`` field
    :param timestamp: value formatted with ``warctools.warc.warc_datetime_str``
        for the date header
    """
    rec_cls = warctools.WarcRecord

    headers = []
    headers.append((rec_cls.ID, rec_cls.random_warc_uuid()))
    headers.append((rec_cls.TYPE, rec_cls.WARCINFO))
    headers.append((rec_cls.DATE, warctools.warc.warc_datetime_str(timestamp)))

    guid_field = bytes('Perma-GUID: {}'.format(guid), 'utf-8')
    field_lines = (
        b'operator: Perma.cc',
        b'format: WARC File Format 1.0',
        guid_field,
    )
    # Every field line, including the last, is CRLF-terminated.
    body = b''.join(line + b'\r\n' for line in field_lines)

    rec_cls(
        headers=headers,
        content=(b'application/warc-fields', body),
    ).write_to(out_file, gzip=True)
Ejemplo n.º 6
0
def write_resource_record_from_asset(data, url, content_type, out_file, extra_headers=None):
    """
    Build one WARC resource record from an asset (screenshot, uploaded file,
    etc.) and write it, gzipped, to out_file.

    :param data: asset body as bytes; also hashed with SHA-1 for the
        block-digest header
    :param url: asset URL (text), UTF-8 encoded for the URL header
    :param content_type: content type (text), UTF-8 encoded
    :param out_file: writable file object the record is written to
    :param extra_headers: optional iterable of additional header tuples
        appended after the standard ones
    """
    rec_cls = warctools.WarcRecord

    timestamp = warctools.warc.warc_datetime_str(timezone.now())
    # Collapse a '+00:00' offset that precedes the trailing 'Z'.
    warc_date = timestamp.replace(b'+00:00Z', b'Z')

    record_id = rec_cls.random_warc_uuid()
    target_uri = bytes(url, 'utf-8')
    block_digest = bytes('sha1:{}'.format(hashlib.sha1(data).hexdigest()), 'utf-8')

    headers = []
    headers.append((rec_cls.TYPE, rec_cls.RESOURCE))
    headers.append((rec_cls.ID, record_id))
    headers.append((rec_cls.DATE, warc_date))
    headers.append((rec_cls.URL, target_uri))
    headers.append((rec_cls.BLOCK_DIGEST, block_digest))
    if extra_headers:
        headers.extend(extra_headers)

    payload = (bytes(content_type, 'utf-8'), data)
    rec_cls(headers=headers, content=payload).write_to(out_file, gzip=True)
Ejemplo n.º 7
0
    def write_record(self, headers, content_type, content):
        '''
        append one WARC record (of any type) to the WARC GZ file

        Header names and values, the content type and the content are all
        converted with ``_bytes`` before the record is built. The file at
        ``self.warc_fname`` is opened in binary append mode for the write.

        :param  headers       list of header tuples [('foo', 'bar')]
        :param  content_type  WARC Content-Type header string
        :param  content       WARC payload
        '''
        # NOTE: sys.getsizeof reports the size of the Python object, which
        # is what bump_serial receives here.
        self.bump_serial(sys.getsizeof(content))

        encoded_headers = [(_bytes(name), _bytes(value))
                           for name, value in headers]

        with open(self.warc_fname, 'ab') as warc_file:
            payload = (_bytes(content_type), _bytes(content))
            warctools.WarcRecord(headers=encoded_headers,
                                 content=payload).write_to(warc_file,
                                                           gzip=True)

            # tell() is read after the write, while the file is still open.
            self.log.info('Wrote %s bytes (%s) to file: %s', warc_file.tell(),
                          content_type, self.warc_fname)
0
    def build_warc_record(self,
                          url,
                          warc_date=None,
                          recorder=None,
                          data=None,
                          concurrent_to=None,
                          warc_type=None,
                          content_type=None,
                          remote_ip=None,
                          profile=None,
                          refers_to=None,
                          refers_to_target_uri=None,
                          refers_to_date=None,
                          payload_digest=None,
                          truncated=None,
                          content_length=None):
        """Assemble a ``warctools.WarcRecord`` for ``url``.

        The body comes from ``recorder`` when one is supplied (its
        ``tempfile`` is rewound and used as the content file). Otherwise
        ``data`` is used: as a content file if it has a ``read`` attribute,
        else as in-memory content paired with ``content_type``.

        Headers are emitted in this order: type (if given), id, date, url,
        every optional header whose argument is not None, WARC-Truncated
        (if given), content length, then the digest headers.

        :param url: value of the record's URL header
        :param warc_date: value of the date header; defaults to the current
            UTC time formatted by ``self.format_warc_date``
        :param recorder: object exposing ``__len__``, ``block_digest`` and
            ``tempfile``
        :param data: body used when ``recorder`` is None
        :param payload_digest: explicit payload-digest header value; when it
            is falsy and there is no recorder, the digest of ``data`` is
            computed with ``self.digest_algorithm``
        :param truncated: value of the WARC-Truncated header
        :param content_length: explicit content length; when None the length
            of ``recorder`` (or of ``data``) is used
        :return: the constructed ``warctools.WarcRecord``
        """
        if warc_date is None:
            warc_date = self.format_warc_date(datetime.datetime.utcnow())

        rec_cls = warctools.WarcRecord
        record_id = rec_cls.random_warc_uuid()

        headers = []
        if warc_type is not None:
            headers.append((rec_cls.TYPE, warc_type))
        headers.append((rec_cls.ID, record_id))
        headers.append((rec_cls.DATE, warc_date))
        headers.append((rec_cls.URL, url))

        # Header constants are looked up by name only for the values that
        # are actually present.
        optional_headers = (
            ('IP_ADDRESS', remote_ip),
            ('PROFILE', profile),
            ('REFERS_TO', refers_to),
            ('REFERS_TO_TARGET_URI', refers_to_target_uri),
            ('REFERS_TO_DATE', refers_to_date),
            ('CONCURRENT_TO', concurrent_to),
            ('CONTENT_TYPE', content_type),
            ('PAYLOAD_DIGEST', payload_digest),
        )
        for attr_name, value in optional_headers:
            if value is not None:
                headers.append((getattr(rec_cls, attr_name), value))

        # truncated value may be 'length' or 'time'
        if truncated is not None:
            headers.append((b'WARC-Truncated', truncated))

        # An explicit content_length wins; otherwise measure the body source.
        if content_length is not None:
            length = content_length
        elif recorder is not None:
            length = len(recorder)
        else:
            length = len(data)
        headers.append(
            (rec_cls.CONTENT_LENGTH, str(length).encode('latin1')))

        if recorder is not None:
            headers.append(
                (rec_cls.BLOCK_DIGEST,
                 warcprox.digest_str(recorder.block_digest, self.base32)))
            # Rewind so the record is written from the start of the capture.
            recorder.tempfile.seek(0)
            return rec_cls(headers=headers, content_file=recorder.tempfile)

        # no http headers so block digest == payload digest
        if not payload_digest:
            data_digest = hashlib.new(self.digest_algorithm, data)
            payload_digest = warcprox.digest_str(data_digest, self.base32)
            headers.append((rec_cls.PAYLOAD_DIGEST, payload_digest))
        headers.append((rec_cls.BLOCK_DIGEST, payload_digest))

        if hasattr(data, 'read'):
            return rec_cls(headers=headers, content_file=data)
        return rec_cls(headers=headers, content=(content_type, data))