コード例 #1
0
    def _build_response_principal_record(self, recorded_url, warc_date):
        """
        Build the principal WARC record for a captured response.

        Returns a revisit record when `recorded_url` has a truthy
        `dedup_info` attribute, and a full response record otherwise.
        """
        recorder = recorded_url.response_recorder
        dedup_info = getattr(recorded_url, "dedup_info", None)

        if not dedup_info:
            # Nothing to refer back to: store the whole recorded response.
            return self.build_warc_record(
                    url=recorded_url.url, warc_date=warc_date,
                    recorder=recorder,
                    warc_type=warctools.WarcRecord.RESPONSE,
                    content_type=hanzo.httptools.ResponseMessage.CONTENT_TYPE,
                    remote_ip=recorded_url.remote_ip,
                    payload_digest=warcprox.digest_str(
                        recorded_url.payload_digest, self.base32),
                    truncated=recorded_url.truncated)

        # Revisit record: the block is only what precedes `payload_offset`
        # in the recording, or the entire recording when no offset is set.
        recorder.tempfile.seek(0)
        header_length = recorder.payload_offset
        if header_length is None:
            header_block = recorder.tempfile.read()
        else:
            header_block = recorder.tempfile.read(header_length)

        return self.build_warc_record(
                url=recorded_url.url, warc_date=warc_date,
                data=header_block,
                warc_type=warctools.WarcRecord.REVISIT,
                refers_to=dedup_info.get('id'),
                refers_to_target_uri=dedup_info['url'],
                refers_to_date=dedup_info['date'],
                payload_digest=warcprox.digest_str(
                    recorded_url.payload_digest, self.base32),
                profile=warctools.WarcRecord.PROFILE_IDENTICAL_PAYLOAD_DIGEST,
                content_type=hanzo.httptools.ResponseMessage.CONTENT_TYPE,
                remote_ip=recorded_url.remote_ip)
コード例 #2
0
ファイル: warc.py プロジェクト: nlevitt/warcprox
    def _build_response_principal_record(self, recorded_url, warc_date):
        """
        Build either a revisit or a response record for `recorded_url`.

        A revisit record is produced when `recorded_url.dedup_info` exists
        and is truthy; in every other case a response record is produced.
        """
        info = getattr(recorded_url, "dedup_info", None)
        if info:
            # Revisit: read the recording from the start up to
            # `payload_offset`, or all of it when the offset is None.
            tempfile_ = recorded_url.response_recorder.tempfile
            tempfile_.seek(0)
            offset = recorded_url.response_recorder.payload_offset
            block = tempfile_.read() if offset is None else tempfile_.read(offset)

            record = self.build_warc_record(
                    url=recorded_url.url, warc_date=warc_date,
                    data=block,
                    warc_type=warctools.WarcRecord.REVISIT,
                    refers_to=info.get('id'),
                    refers_to_target_uri=info['url'],
                    refers_to_date=info['date'],
                    payload_digest=warcprox.digest_str(
                        recorded_url.payload_digest, self.base32),
                    profile=warctools.WarcRecord.PROFILE_IDENTICAL_PAYLOAD_DIGEST,
                    content_type=hanzo.httptools.ResponseMessage.CONTENT_TYPE,
                    remote_ip=recorded_url.remote_ip)
        else:
            # Response: hand the recorder itself to build_warc_record.
            record = self.build_warc_record(
                    url=recorded_url.url, warc_date=warc_date,
                    recorder=recorded_url.response_recorder,
                    warc_type=warctools.WarcRecord.RESPONSE,
                    content_type=hanzo.httptools.ResponseMessage.CONTENT_TYPE,
                    remote_ip=recorded_url.remote_ip,
                    payload_digest=warcprox.digest_str(
                        recorded_url.payload_digest, self.base32))
        return record
コード例 #3
0
    def build_warc_record(self, url, warc_date=None, recorder=None, data=None,
        concurrent_to=None, warc_type=None, content_type=None, remote_ip=None,
        profile=None, refers_to=None, refers_to_target_uri=None,
        refers_to_date=None, payload_digest=None):
        """
        Assemble a `warctools.WarcRecord` from the given header values and
        content.

        The content comes from `recorder` when it is not None (its tempfile
        is rewound and used as the record's content file); otherwise it
        comes from `data`, which must then be a bytes-like object.

        Optional header arguments left as None are omitted from the record.
        `warc_date` defaults to the current UTC time. A fresh record id is
        generated on every call.

        WARC-Payload-Digest is written at most once:
          * the explicit `payload_digest` argument wins when given;
          * otherwise `recorder.payload_digest` is used if the recorder has
            one;
          * otherwise, for `data` content, the block digest is reused.

        Returns the new record.
        """
        if warc_date is None:
            warc_date = warctools.warc.warc_datetime_str(datetime.datetime.utcnow())

        record_id = warctools.WarcRecord.random_warc_uuid()

        headers = []
        if warc_type is not None:
            headers.append((warctools.WarcRecord.TYPE, warc_type))
        headers.append((warctools.WarcRecord.ID, record_id))
        headers.append((warctools.WarcRecord.DATE, warc_date))
        headers.append((warctools.WarcRecord.URL, url))
        if remote_ip is not None:
            headers.append((warctools.WarcRecord.IP_ADDRESS, remote_ip))
        if profile is not None:
            headers.append((warctools.WarcRecord.PROFILE, profile))
        if refers_to is not None:
            headers.append((warctools.WarcRecord.REFERS_TO, refers_to))
        if refers_to_target_uri is not None:
            headers.append((warctools.WarcRecord.REFERS_TO_TARGET_URI, refers_to_target_uri))
        if refers_to_date is not None:
            headers.append((warctools.WarcRecord.REFERS_TO_DATE, refers_to_date))
        if concurrent_to is not None:
            headers.append((warctools.WarcRecord.CONCURRENT_TO, concurrent_to))
        if content_type is not None:
            headers.append((warctools.WarcRecord.CONTENT_TYPE, content_type))
        if payload_digest is not None:
            headers.append((warctools.WarcRecord.PAYLOAD_DIGEST, payload_digest))

        if recorder is not None:
            headers.append((warctools.WarcRecord.CONTENT_LENGTH, str(len(recorder)).encode('latin1')))
            headers.append((warctools.WarcRecord.BLOCK_DIGEST,
                warcprox.digest_str(recorder.block_digest, self.base32)))
            # Fall back to the recorder's digest only when the caller did not
            # supply one; adding both would repeat the WARC-Payload-Digest
            # header in a single record.
            if payload_digest is None and recorder.payload_digest is not None:
                headers.append((warctools.WarcRecord.PAYLOAD_DIGEST,
                    warcprox.digest_str(recorder.payload_digest, self.base32)))

            recorder.tempfile.seek(0)
            record = warctools.WarcRecord(headers=headers, content_file=recorder.tempfile)

        else:
            headers.append((warctools.WarcRecord.CONTENT_LENGTH, str(len(data)).encode('latin1')))
            digest = hashlib.new(self.digest_algorithm, data)
            headers.append((warctools.WarcRecord.BLOCK_DIGEST,
                warcprox.digest_str(digest, self.base32)))
            if not payload_digest:
                # no separate payload digest known: reuse the block digest
                headers.append((warctools.WarcRecord.PAYLOAD_DIGEST,
                                warcprox.digest_str(digest, self.base32)))

            content_tuple = content_type, data
            record = warctools.WarcRecord(headers=headers, content=content_tuple)

        return record
コード例 #4
0
ファイル: warc.py プロジェクト: nlevitt/warcprox
    def build_warc_record(self, url, warc_date=None, recorder=None, data=None,
        concurrent_to=None, warc_type=None, content_type=None, remote_ip=None,
        profile=None, refers_to=None, refers_to_target_uri=None,
        refers_to_date=None, payload_digest=None):
        """
        Assemble a `warctools.WarcRecord` from header values and content.

        Content is taken from `recorder` when it is not None, else from
        `data`. Optional header arguments left as None are omitted, and
        `warc_date` defaults to the current UTC time. For `data` content
        with no `payload_digest`, the block digest doubles as the payload
        digest.
        """
        WarcRecord = warctools.WarcRecord

        if warc_date is None:
            warc_date = warctools.warc.warc_datetime_str(datetime.datetime.utcnow())

        record_id = WarcRecord.random_warc_uuid()

        headers = []
        add = headers.append

        # Mandatory and optional named fields, in output order.
        if warc_type is not None:
            add((WarcRecord.TYPE, warc_type))
        add((WarcRecord.ID, record_id))
        add((WarcRecord.DATE, warc_date))
        add((WarcRecord.URL, url))
        if remote_ip is not None:
            add((WarcRecord.IP_ADDRESS, remote_ip))
        if profile is not None:
            add((WarcRecord.PROFILE, profile))
        if refers_to is not None:
            add((WarcRecord.REFERS_TO, refers_to))
        if refers_to_target_uri is not None:
            add((WarcRecord.REFERS_TO_TARGET_URI, refers_to_target_uri))
        if refers_to_date is not None:
            add((WarcRecord.REFERS_TO_DATE, refers_to_date))
        if concurrent_to is not None:
            add((WarcRecord.CONCURRENT_TO, concurrent_to))
        if content_type is not None:
            add((WarcRecord.CONTENT_TYPE, content_type))
        if payload_digest is not None:
            add((WarcRecord.PAYLOAD_DIGEST, payload_digest))

        if recorder is None:
            # In-memory content: length and block digest come from `data`.
            add((WarcRecord.CONTENT_LENGTH, str(len(data)).encode('latin1')))
            block_digest = hashlib.new(self.digest_algorithm, data)
            block_digest_str = warcprox.digest_str(block_digest, self.base32)
            add((WarcRecord.BLOCK_DIGEST, block_digest_str))
            if not payload_digest:
                add((WarcRecord.PAYLOAD_DIGEST,
                     warcprox.digest_str(block_digest, self.base32)))
            return WarcRecord(headers=headers, content=(content_type, data))

        # Recorded content: stream it from the rewound tempfile.
        add((WarcRecord.CONTENT_LENGTH, str(len(recorder)).encode('latin1')))
        add((WarcRecord.BLOCK_DIGEST,
             warcprox.digest_str(recorder.block_digest, self.base32)))
        recorder.tempfile.seek(0)
        return WarcRecord(headers=headers, content_file=recorder.tempfile)
コード例 #5
0
ファイル: dedup.py プロジェクト: kliu128/warcprox
 def _filter_and_bucketize(self, batch):
     '''
     Returns `{bucket: [recorded_url, ...]}`, excluding urls that should not
     be looked up.
     '''
     buckets = collections.defaultdict(list)
     discards = []
     for recorded_url in batch:
         if (recorded_url.response_recorder
                 and recorded_url.payload_digest
                 and self.trough_dedup_db.should_dedup(recorded_url)):
             if (recorded_url.warcprox_meta
                     and 'dedup-bucket' in recorded_url.warcprox_meta):
                 bucket = recorded_url.warcprox_meta['dedup-bucket']
             else:
                 bucket = '__unspecified__'
             buckets[bucket].append(recorded_url)
         else:
             discards.append(
                     warcprox.digest_str(
                         recorded_url.payload_digest, self.options.base32)
                     if recorded_url.payload_digest else 'n/a')
     self.logger.debug(
             'len(batch)=%s len(discards)=%s buckets=%s',
             len(batch), len(discards),
             {bucket: len(buckets[bucket]) for bucket in buckets})
     return buckets
コード例 #6
0
ファイル: dedup.py プロジェクト: internetarchive/warcprox
 def _filter_and_bucketize(self, batch):
     '''
     Returns `{bucket: [recorded_url, ...]}`, excluding urls that should not
     be looked up.
     '''
     buckets = collections.defaultdict(list)
     discards = []
     for recorded_url in batch:
         if (recorded_url.response_recorder
                 and recorded_url.payload_digest
                 and self.trough_dedup_db.should_dedup(recorded_url)):
             if (recorded_url.warcprox_meta
                     and 'dedup-bucket' in recorded_url.warcprox_meta):
                 bucket = recorded_url.warcprox_meta['dedup-bucket']
             else:
                 bucket = '__unspecified__'
             buckets[bucket].append(recorded_url)
         else:
             discards.append(
                     warcprox.digest_str(
                         recorded_url.payload_digest, self.options.base32)
                     if recorded_url.payload_digest else 'n/a')
     self.logger.debug(
             'len(batch)=%s len(discards)=%s buckets=%s',
             len(batch), len(discards),
             {bucket: len(buckets[bucket]) for bucket in buckets})
     return buckets
コード例 #7
0
ファイル: warc.py プロジェクト: ukwa/warcprox
    def build_warc_records(self, recorded_url):
        """
        Build the WARC records for `recorded_url`.

        Returns a tuple of hanzo.warctools.warc.WarcRecord. When the url has
        a response recorder the tuple is (principal_record, request_record),
        the request record being marked concurrent to the principal one.
        Otherwise it is a one-element tuple holding a record of the url's
        custom type, built from its request data.
        """
        warc_date = self.format_warc_date(recorded_url.timestamp)

        if not recorded_url.response_recorder:
            # No recorded response: write a single record of the custom type.
            custom_record = self.build_warc_record(
                url=recorded_url.url,
                warc_date=warc_date,
                data=recorded_url.request_data,
                warc_type=recorded_url.custom_type,
                content_type=recorded_url.content_type.encode("latin1"),
                payload_digest=warcprox.digest_str(recorded_url.payload_digest,
                                                   self.base32),
                content_length=recorded_url.size)
            return (custom_record, )

        response_or_revisit = self._build_response_principal_record(
            recorded_url, warc_date)
        request = self.build_warc_record(
            url=recorded_url.url,
            warc_date=warc_date,
            data=recorded_url.request_data,
            warc_type=warctools.WarcRecord.REQUEST,
            content_type=hanzo.httptools.RequestMessage.CONTENT_TYPE,
            concurrent_to=response_or_revisit.id)
        return response_or_revisit, request
コード例 #8
0
ファイル: dedup.py プロジェクト: kliu128/warcprox
 def notify(self, recorded_url, records):
     '''
     Save the first of `records` for later dedup lookups.

     Does nothing unless that record is a response record and
     `self.should_dedup(recorded_url)` is truthy. The entry is keyed by the
     url's payload digest and saved under warcprox_meta["dedup-bucket"]
     when the url names one.
     '''
     if not records or records[0].type != b'response':
         return
     if not self.should_dedup(recorded_url):
         return

     digest_key = warcprox.digest_str(
             recorded_url.payload_digest, self.options.base32)
     meta = recorded_url.warcprox_meta
     save_kwargs = {}
     if meta and "dedup-bucket" in meta:
         save_kwargs['bucket'] = meta["dedup-bucket"]
     self.save(digest_key, records[0], **save_kwargs)
コード例 #9
0
ファイル: dedup.py プロジェクト: nlevitt/warcprox
 def notify(self, recorded_url, records):
     '''
     Save the first of `records` for later dedup lookups.

     Does nothing unless that record is a response record and the response
     recorder reports a payload size above zero. The entry is keyed by the
     url's payload digest and saved under warcprox_meta["captures-bucket"]
     when the url names one.
     '''
     if not records or records[0].type != b'response':
         return
     if not recorded_url.response_recorder.payload_size() > 0:
         return

     digest_key = warcprox.digest_str(
             recorded_url.payload_digest, self.options.base32)
     meta = recorded_url.warcprox_meta
     if meta and "captures-bucket" in meta:
         self.save(digest_key, records[0], bucket=meta["captures-bucket"])
     else:
         self.save(digest_key, records[0])
コード例 #10
0
ファイル: dedup.py プロジェクト: internetarchive/warcprox
 def notify(self, recorded_url, records):
     '''
     Record the principal record of a capture in the dedup db.

     Only acts when `records` is non-empty, its first record has type
     b'response', and `self.should_dedup(recorded_url)` is truthy. The
     payload digest string is the key; warcprox_meta["dedup-bucket"], if
     set, selects the bucket.
     '''
     principal = records[0] if records else None
     if principal is None or principal.type != b'response':
         return
     if not self.should_dedup(recorded_url):
         return

     digest_key = warcprox.digest_str(
             recorded_url.payload_digest, self.options.base32)
     meta = recorded_url.warcprox_meta
     if not (meta and "dedup-bucket" in meta):
         self.save(digest_key, records[0])
         return
     self.save(digest_key, records[0], bucket=meta["dedup-bucket"])
コード例 #11
0
ファイル: dedup.py プロジェクト: mikalv/warcprox
 def notify(self, recorded_url, records):
     '''
     Record the principal record of a capture in the dedup db.

     Only acts when `records` is non-empty, its first record has type
     b'response', and the response recorder's payload size is positive.
     The payload digest string is the key; warcprox_meta["captures-bucket"],
     if set, selects the bucket.
     '''
     is_response = bool(records) and records[0].type == b'response'
     if not is_response:
         return
     if not recorded_url.response_recorder.payload_size() > 0:
         return

     digest_key = warcprox.digest_str(
             recorded_url.payload_digest, self.options.base32)
     meta = recorded_url.warcprox_meta
     extra = {}
     if meta and "captures-bucket" in meta:
         extra['bucket'] = meta["captures-bucket"]
     self.save(digest_key, records[0], **extra)
コード例 #12
0
 def notify(self, recorded_url, records):
     '''
     Save the first of `records` in every writable dedup bucket.

     Does nothing unless that record is a response record and
     `self.should_dedup(recorded_url)` is truthy. When warcprox_meta has
     "dedup-buckets" (a mapping of bucket name to mode), the record is
     saved once per bucket whose mode is not 'ro'; without that key it is
     saved once with no explicit bucket.
     '''
     if not (records and records[0].type == b'response'):
         return
     if not self.should_dedup(recorded_url):
         return

     digest_key = warcprox.digest_str(
             recorded_url.payload_digest, self.options.base32)
     meta = recorded_url.warcprox_meta
     if not (meta and "dedup-buckets" in meta):
         self.save(digest_key, records[0])
         return

     for bucket, bucket_mode in meta["dedup-buckets"].items():
         if bucket_mode != 'ro':
             self.save(digest_key, records[0], bucket=bucket)
コード例 #13
0
def decorate_with_dedup_info(dedup_db, recorded_url, base32=False):
    """
    Set `recorded_url.dedup_info` to the result of a `dedup_db` lookup.

    The lookup only happens for urls that have a response recorder, a
    payload digest, and a payload size above zero; other urls are left
    untouched. warcprox_meta["captures-bucket"], when present, is passed
    to the lookup as the bucket.
    """
    recorder = recorded_url.response_recorder
    if not (recorder and recorded_url.payload_digest):
        return
    if not recorder.payload_size() > 0:
        return

    digest_key = warcprox.digest_str(recorded_url.payload_digest, base32)
    meta = recorded_url.warcprox_meta
    if meta and "captures-bucket" in meta:
        found = dedup_db.lookup(
            digest_key, meta["captures-bucket"], recorded_url.url)
    else:
        found = dedup_db.lookup(digest_key, url=recorded_url.url)
    recorded_url.dedup_info = found
コード例 #14
0
ファイル: dedup.py プロジェクト: nlevitt/warcprox
def decorate_with_dedup_info(dedup_db, recorded_url, base32=False):
    """
    Look `recorded_url` up in `dedup_db` and store the answer on it.

    Urls with no response recorder, no payload digest, or a payload size
    that is not above zero are skipped. Otherwise the lookup result is
    assigned to `recorded_url.dedup_info`; the lookup is scoped to
    warcprox_meta["captures-bucket"] when that key exists.
    """
    should_look_up = (
            recorded_url.response_recorder
            and recorded_url.payload_digest
            and recorded_url.response_recorder.payload_size() > 0)
    if not should_look_up:
        return

    digest_key = warcprox.digest_str(recorded_url.payload_digest, base32)
    meta = recorded_url.warcprox_meta
    if not (meta and "captures-bucket" in meta):
        recorded_url.dedup_info = dedup_db.lookup(
            digest_key, url=recorded_url.url)
        return
    recorded_url.dedup_info = dedup_db.lookup(
        digest_key, meta["captures-bucket"], recorded_url.url)
コード例 #15
0
ファイル: dedup.py プロジェクト: kliu128/warcprox
 def _process_url(self, recorded_url):
     if (recorded_url.response_recorder
             and recorded_url.payload_digest
             and self.should_dedup(recorded_url)):
         digest_key = warcprox.digest_str(recorded_url.payload_digest, self.options.base32)
         if recorded_url.warcprox_meta and "dedup-bucket" in recorded_url.warcprox_meta:
             recorded_url.dedup_info = self.dedup_db.lookup(
                 digest_key, recorded_url.warcprox_meta["dedup-bucket"],
                 recorded_url.url)
         else:
             recorded_url.dedup_info = self.dedup_db.lookup(
                 digest_key, url=recorded_url.url)
コード例 #16
0
ファイル: dedup.py プロジェクト: internetarchive/warcprox
 def _process_url(self, recorded_url):
     if (recorded_url.response_recorder
             and recorded_url.payload_digest
             and self.should_dedup(recorded_url)):
         digest_key = warcprox.digest_str(recorded_url.payload_digest, self.options.base32)
         if recorded_url.warcprox_meta and "dedup-bucket" in recorded_url.warcprox_meta:
             recorded_url.dedup_info = self.dedup_db.lookup(
                 digest_key, recorded_url.warcprox_meta["dedup-bucket"],
                 recorded_url.url)
         else:
             recorded_url.dedup_info = self.dedup_db.lookup(
                 digest_key, url=recorded_url.url)
コード例 #17
0
ファイル: crawl_log.py プロジェクト: mikalv/warcprox
    def notify(self, recorded_url, records):
        """
        Append one crawl log line for `recorded_url` to a log file in
        `self.dir`.

        The file is '<warc-prefix>.log' when warcprox_meta has a
        'warc-prefix', else 'crawl.log'. `records` may be empty; in that
        case the warc filename/offset are left out of the extra info and,
        for urls without a response recorder, the payload digest is
        logged as '-'.

        The duration field is the full fetch duration in whole
        milliseconds.
        """
        # 2017-08-03T21:45:24.496Z   200       2189 https://autismcouncil.wisconsin.gov/robots.txt P https://autismcouncil.wisconsin.gov/ text/plain #001 20170803214523617+365 sha1:PBS2CEF7B4OSEXZZF3QE2XN2VHYCPNPX https://autismcouncil.wisconsin.gov/ duplicate:digest {"warcFileOffset":942,"contentSize":2495,"warcFilename":"ARCHIVEIT-2159-TEST-JOB319150-20170803214522386-00000.warc.gz"}
        now = datetime.datetime.utcnow()
        # warcprox_meta may be unset; treat that like an empty dict
        warcprox_meta = recorded_url.warcprox_meta or {}

        extra_info = {
            'contentSize': recorded_url.size,
        }
        if records:
            extra_info['warcFilename'] = records[0].warc_filename
            extra_info['warcFileOffset'] = records[0].offset

        recorder = recorded_url.response_recorder
        if recorder:
            if recorder.payload_offset is not None:
                content_length = recorder.len - recorder.payload_offset
            else:
                # NOTE(review): payload_offset of None presumably means no
                # payload was seen; log a zero length instead of failing on
                # `int - None`. Confirm against the recorder.
                content_length = 0
            payload_digest = warcprox.digest_str(recorded_url.payload_digest,
                                                 self.options.base32)
        else:
            # WARCPROX_WRITE_RECORD request
            content_length = len(recorded_url.request_data)
            payload_digest = None
            if records:
                payload_digest = records[0].get_header(b'WARC-Payload-Digest')
            payload_digest = payload_digest or '-'

        # timedelta.microseconds holds only the sub-second part, so whole
        # seconds and days have to be added in to get the real duration.
        duration = recorded_url.duration
        duration_ms = ((duration.days * 86400 + duration.seconds) * 1000
                       + duration.microseconds // 1000)

        fields = [
            '{:%Y-%m-%dT%H:%M:%S}.{:03d}Z'.format(now,
                                                  now.microsecond // 1000),
            '% 5s' % recorded_url.status,
            '% 10s' % content_length,
            recorded_url.url,
            '-',  # hop path
            recorded_url.referer or '-',
            recorded_url.mimetype or '-',
            '-',
            '{:%Y%m%d%H%M%S}{:03d}+{:03d}'.format(
                recorded_url.timestamp,
                recorded_url.timestamp.microsecond // 1000,
                duration_ms),
            payload_digest,
            warcprox_meta.get('metadata', {}).get('seed', '-'),
            'duplicate:digest'
            if records and records[0].type == b'revisit' else '-',
            json.dumps(extra_info, separators=(',', ':')),
        ]
        # `fields` is a mix of `bytes` and `unicode`, make them all `bytes`
        line = b' '.join(
            field if isinstance(field, bytes) else field.encode('utf-8')
            for field in fields) + b'\n'

        if 'warc-prefix' in warcprox_meta:
            filename = '%s.log' % warcprox_meta['warc-prefix']
        else:
            filename = 'crawl.log'

        crawl_log_path = os.path.join(self.dir, filename)
        with open(crawl_log_path, 'ab') as f:
            f.write(line)
コード例 #18
0
ファイル: dedup.py プロジェクト: rlugojr/warcprox
 def notify(self, recorded_url, records):
     '''
     Save the first of `records` for later dedup lookups.

     Does nothing when `records` is empty, when its first record is not a
     response record, or when the response recorder's payload size is not
     above zero. The entry is keyed by the recorder's payload digest and
     saved under warcprox_meta["captures-bucket"] when the url names one.
     '''
     if not records:
         # nothing was written for this url, so there is nothing to index
         return
     if (records[0].get_header(
             warctools.WarcRecord.TYPE) == warctools.WarcRecord.RESPONSE
             and recorded_url.response_recorder.payload_size() > 0):
         digest_key = warcprox.digest_str(
             recorded_url.response_recorder.payload_digest,
             self.options.base32)
         if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta:
             self.save(digest_key,
                       records[0],
                       bucket=recorded_url.warcprox_meta["captures-bucket"])
         else:
             self.save(digest_key, records[0])
コード例 #19
0
ファイル: crawl_log.py プロジェクト: internetarchive/warcprox
    def notify(self, recorded_url, records):
        """
        Append one crawl log line for `recorded_url` to a log file in
        `self.dir`.

        The file is named '<prefix>-<hostname>-<server_port>.log', where
        the prefix is warcprox_meta['warc-prefix'] or 'crawl'. `records`
        may be empty; in that case the warc filename/offset are left out
        of the extra info and, for urls without a response recorder, the
        content length is logged as 0 and the payload digest as '-'.

        The duration field is the full fetch duration in whole
        milliseconds.
        """
        # 2017-08-03T21:45:24.496Z   200       2189 https://autismcouncil.wisconsin.gov/robots.txt P https://autismcouncil.wisconsin.gov/ text/plain #001 20170803214523617+365 sha1:PBS2CEF7B4OSEXZZF3QE2XN2VHYCPNPX https://autismcouncil.wisconsin.gov/ duplicate:digest {"warcFileOffset":942,"contentSize":2495,"warcFilename":"ARCHIVEIT-2159-TEST-JOB319150-20170803214522386-00000.warc.gz"}
        now = datetime.datetime.utcnow()
        # warcprox_meta may be unset; treat that like an empty dict
        warcprox_meta = recorded_url.warcprox_meta or {}

        extra_info = {'contentSize': recorded_url.size,}
        if records:
            extra_info['warcFilename'] = records[0].warc_filename
            extra_info['warcFileOffset'] = records[0].offset
        if recorded_url.method != 'GET':
            extra_info['method'] = recorded_url.method

        recorder = recorded_url.response_recorder
        if recorder:
            if recorder.payload_offset is not None:
                content_length = recorder.len - recorder.payload_offset
            else:
                # NOTE(review): payload_offset of None presumably means no
                # payload was seen; log a zero length instead of failing on
                # `int - None`. Confirm against the recorder.
                content_length = 0
            payload_digest = warcprox.digest_str(
                recorded_url.payload_digest,
                self.options.base32)
        elif records:
            # WARCPROX_WRITE_RECORD request
            content_length = int(records[0].get_header(b'Content-Length'))
            payload_digest = records[0].get_header(b'WARC-Payload-Digest') or '-'
        else:
            # no recorder and no record written: nothing to measure
            content_length = 0
            payload_digest = '-'

        # timedelta.microseconds holds only the sub-second part, so whole
        # seconds and days have to be added in to get the real duration.
        duration = recorded_url.duration
        duration_ms = ((duration.days * 86400 + duration.seconds) * 1000
                       + duration.microseconds // 1000)

        fields = [
            '{:%Y-%m-%dT%H:%M:%S}.{:03d}Z'.format(now, now.microsecond//1000),
            '% 5s' % recorded_url.status,
            '% 10s' % content_length,
            recorded_url.url,
            '-', # hop path
            recorded_url.referer or '-',
            recorded_url.mimetype or '-',
            '-',
            '{:%Y%m%d%H%M%S}{:03d}+{:03d}'.format(
                recorded_url.timestamp,
                recorded_url.timestamp.microsecond//1000,
                duration_ms),
            payload_digest,
            warcprox_meta.get('metadata', {}).get('seed', '-'),
            'duplicate:digest' if records and records[0].type == b'revisit' else '-',
            json.dumps(extra_info, separators=(',',':')),
        ]
        # `fields` is a mix of `bytes` and `unicode`, make them all `bytes`
        line = b' '.join(
            field if isinstance(field, bytes) else field.encode('utf-8')
            for field in fields) + b'\n'

        prefix = warcprox_meta.get('warc-prefix', 'crawl')
        filename = '%s-%s-%s.log' % (
                prefix, self.hostname, self.options.server_port)
        crawl_log_path = os.path.join(self.dir, filename)

        with open(crawl_log_path, 'ab') as f:
            f.write(line)
コード例 #20
0
ファイル: dedup.py プロジェクト: kliu128/warcprox
 def _process_url(self, recorded_url):
     '''
     Look `recorded_url` up via `self.cdx_dedup` and, if something truthy
     comes back, attach it as `recorded_url.dedup_info`.

     A ValueError raised along the way is logged and not propagated.
     Whatever happens, the url is then removed from `self.batch` and, if
     `self.outq` is truthy, put on it.
     '''
     try:
         key = warcprox.digest_str(
                 recorded_url.payload_digest, self.options.base32)
         match = self.cdx_dedup.lookup(key, recorded_url.url)
         if match:
             recorded_url.dedup_info = match
     except ValueError as err:
         self.logger.error(
                 'CdxServerDedupLoader _process_url failed for url=%s %s',
                 recorded_url.url, err)
     finally:
         # runs on success and on failure alike
         self.batch.remove(recorded_url)
         if self.outq:
             self.outq.put(recorded_url)
コード例 #21
0
ファイル: dedup.py プロジェクト: kliu128/warcprox
 def batch_save(self, batch, bucket='__unspecified__'):
     '''
     Write one dedup row per url in `batch` with a single
     `self._trough_cli.write()` call.

     Each row is (digest_key, url, date, id), the date and id being taken
     from the url's first warc record. Rows are written with
     'insert or ignore'. An empty `batch` is a no-op.
     '''
     if not batch:
         # Zero row placeholders would yield "values ;", which is not
         # valid sql, and there is nothing to store anyway.
         return
     placeholders = ','.join(['(%s,%s,%s,%s)'] * len(batch))
     sql_tmpl = ('insert or ignore into dedup\n'
                 '(digest_key, url, date, id)\n'
                 'values %s;' % placeholders)
     values = []
     for recorded_url in batch:
         values.extend([
             warcprox.digest_str(
                 recorded_url.payload_digest, self.options.base32),
             recorded_url.url,
             recorded_url.warc_records[0].date,
             recorded_url.warc_records[0].id,])
     self._trough_cli.write(bucket, sql_tmpl, values, self.SCHEMA_ID)
コード例 #22
0
ファイル: dedup.py プロジェクト: internetarchive/warcprox
 def batch_save(self, batch, bucket='__unspecified__'):
     '''
     Store dedup rows for every url in `batch` under `bucket`, using one
     `self._trough_cli.write()` call.

     A row holds the url's payload digest string, its url, and the date
     and id of its first warc record. The statement is an
     'insert or ignore'. Nothing is written for an empty `batch`.
     '''
     if not batch:
         # "values ;" with no row tuples is not valid sql; skip the write
         return
     row_placeholders = ','.join(['(%s,%s,%s,%s)'] * len(batch))
     sql_tmpl = ('insert or ignore into dedup\n'
                 '(digest_key, url, date, id)\n'
                 'values %s;' % row_placeholders)
     values = []
     for recorded_url in batch:
         first_record = recorded_url.warc_records[0]
         values.extend([
             warcprox.digest_str(
                 recorded_url.payload_digest, self.options.base32),
             recorded_url.url,
             first_record.date,
             first_record.id,])
     self._trough_cli.write(bucket, sql_tmpl, values, self.SCHEMA_ID)
コード例 #23
0
def to_json(recorded_url: warcprox.warcproxy.RecordedUrl, records: List[warctools.WarcRecord]):
    """Build the capture-feed dict describing one recorded url.

    Args:
        recorded_url: the capture being reported.
        records: WARC records written for the capture; only ``records[0]``
            is read.

    Returns:
        A dict of capture metadata. Entries of
        ``recorded_url.warcprox_meta['capture-feed-extra-fields']``, when
        present, are merged in last and may override the built-in keys.
    """
    # {"status_code":200,"content_digest":"sha1:3VU56HI3BTMDZBL2TP7SQYXITT7VEAJQ","host":"www.kaosgl.com","via":"http://www.kaosgl.com/sayfa.php?id=4427","account_id":"877","seed":"http://www.kaosgl.com/","warc_filename":"ARCHIVEIT-6003-WEEKLY-JOB171310-20150903100014694-00002.warc.gz","url":"http://www.kaosgl.com/resim/HomofobiKarsitiBulusma/trabzon05.jpg","size":29700,"start_time_plus_duration":"20150903175709637+1049","timestamp":"2015-09-03T17:57:10.707Z","mimetype":"image/jpeg","collection_id":"6003","is_test_crawl":"false","job_name":"6003-20150902172136074","warc_offset":856320200,"thread":6,"hop_path":"RLLLLLE","extra_info":{},"annotations":"duplicate:digest","content_length":29432}

    principal = records[0]

    # Normal recorded response:
    if recorded_url.response_recorder:
        recorder = recorded_url.response_recorder
        if recorder.payload_offset is None:
            # NOTE(review): payload_offset can be None; treated here as
            # "no payload" so the length is 0 instead of a TypeError from
            # the subtraction -- confirm against the recorder.
            content_length = 0
        else:
            content_length = recorder.len - recorder.payload_offset
        payload_digest = warcprox.digest_str(
            recorded_url.payload_digest, True)
    else:
        # WARCPROX_WRITE_RECORD request:
        content_length = recorded_url.size
        payload_digest = principal.get_header(b'WARC-Payload-Digest')

    # Deal with variation in content type:
    content_type = recorded_url.mimetype
    if content_type and " " in content_type:
        content_type = "application/malformed-header"

    # The record type is bytes (it is decoded for "warc_type" below), so it
    # has to be compared with the bytes constant; comparing it with the str
    # 'revisit' is always False on Python 3.
    is_revisit = principal.type == warctools.WarcRecord.REVISIT

    # Build the record:
    now = datetime.datetime.utcnow()
    d = {
        'url': recorded_url.url.decode('utf-8'),
        'host': recorded_url.host,
        'http_method': recorded_url.method,
        'status_code': recorded_url.status,
        'wire_bytes': recorded_url.size,
        'content_type': content_type,
        'content_digest': payload_digest.decode("utf-8"),
        'content_length': content_length,
        # capture start (millisecond precision) "+" duration in milliseconds
        'start_time_plus_duration': '{:%Y%m%d%H%M%S}{:03d}+{}'.format(
            recorded_url.timestamp, recorded_url.timestamp.microsecond // 1000,
            int(recorded_url.duration.total_seconds() * 1000)),
        'annotations': 'duplicate:digest' if is_revisit else '',
        'warc_filename': os.path.basename(principal.warc_filename),
        'warc_offset': principal.offset,
        'warc_length': principal.length,
        "warc_content_type": principal.content_type.decode("utf-8"),
        "warc_type": principal.type.decode("utf-8"),
        "warc_id": principal.id.decode("utf-8"),
        'timestamp': '{:%Y-%m-%dT%H:%M:%S}.{:03d}Z'.format(now, now.microsecond // 1000)
    }

    # fields expected to be populated here are (for archive-it):
    # account_id, collection_id, is_test_crawl, seed, job_name
    if recorded_url.warcprox_meta and 'capture-feed-extra-fields' in recorded_url.warcprox_meta:
        d.update(recorded_url.warcprox_meta['capture-feed-extra-fields'].items())

    return d
コード例 #24
0
 def _process_url(self, recorded_url):
     """Populate ``recorded_url.dedup_info`` from ``self.dedup_db``.

     Nothing happens unless the url has a response recorder, a payload
     digest, and ``self.should_dedup`` accepts it. When
     ``warcprox_meta`` names ``"dedup-buckets"``, the buckets are tried in
     iteration order until a lookup returns something truthy; otherwise a
     single lookup without a bucket argument is done.
     """
     eligible = (recorded_url.response_recorder
                 and recorded_url.payload_digest
                 and self.should_dedup(recorded_url))
     if not eligible:
         return

     key = warcprox.digest_str(
         recorded_url.payload_digest, self.options.base32)

     has_buckets = (recorded_url.warcprox_meta
                    and "dedup-buckets" in recorded_url.warcprox_meta)
     if not has_buckets:
         recorded_url.dedup_info = self.dedup_db.lookup(
             key, url=recorded_url.url)
         return

     buckets = recorded_url.warcprox_meta["dedup-buckets"]
     for bucket, _bucket_mode in buckets.items():
         recorded_url.dedup_info = self.dedup_db.lookup(
             key, bucket, recorded_url.url)
         if recorded_url.dedup_info:
             # an existing capture was found; later buckets are not tried
             break
コード例 #25
0
ファイル: dedup.py プロジェクト: kliu128/warcprox
    def _build_key_index(self, batch):
        '''
        Groups the RecordedUrl objects of a batch by their digest key.

        Args:
            batch(list): list of RecordedUrl

        Returns:
            defaultdict(list) shaped like `{digest_key: [recorded_url, ...]}`,
            each list keeping the order in which the urls appear in `batch`
        '''
        by_digest = collections.defaultdict(list)
        for entry in batch:
            by_digest[warcprox.digest_str(
                entry.payload_digest, self.options.base32)].append(entry)
        return by_digest
コード例 #26
0
ファイル: dedup.py プロジェクト: internetarchive/warcprox
    def _build_key_index(self, batch):
        '''
        Indexes a batch of RecordedUrl objects by digest key.

        Args:
            batch(list): list of RecordedUrl

        Returns:
            defaultdict(list) of the form `{digest_key: [recorded_url, ...]}`;
            urls sharing a digest key end up in the same list, in batch order
        '''
        index = collections.defaultdict(list)
        for url_entry in batch:
            key = warcprox.digest_str(
                url_entry.payload_digest, self.options.base32)
            same_digest = index[key]
            same_digest.append(url_entry)
        return index
コード例 #27
0
ファイル: dedup.py プロジェクト: internetarchive/warcprox
 def _process_url(self, recorded_url):
     """Look up ``recorded_url`` through the cached CDX lookup and forward it.

     A truthy lookup result is stored on ``recorded_url.dedup_info``.
     Whenever the cache's combined hit and miss count is a multiple of
     1000, the cache statistics are logged at INFO level. A ``ValueError``
     is logged and swallowed. In every case the url is removed from
     ``self.batch`` and, when ``self.outq`` is truthy, put on it.
     """
     try:
         digest_key = warcprox.digest_str(recorded_url.payload_digest,
                                          self.options.base32)
         dedup_info = self.cdx_dedup.cached_lookup(digest_key, recorded_url.url)
         cache_info = self.cdx_dedup.cached_lookup.cache_info()
         if (cache_info.hits + cache_info.misses) % 1000 == 0:
             # Log the snapshot that was just tested rather than fetching
             # the statistics a second time; a second call could already
             # show different numbers.
             self.logger.info(cache_info)
         if dedup_info:
             recorded_url.dedup_info = dedup_info
     except ValueError as exc:
         self.logger.error('CdxServerDedupLoader _process_url failed for url=%s %s',
                           recorded_url.url, exc)
     finally:
         # Runs on success and on failure so the url always leaves the batch.
         self.batch.remove(recorded_url)
         if self.outq:
             self.outq.put(recorded_url)
コード例 #28
0
ファイル: warc.py プロジェクト: ukwa/warcprox
    def build_warc_records(self, recorded_url):
        """Returns a tuple of hanzo.warctools.warc.WarcRecord (principal_record, ...)

        With a response recorder the tuple is (principal record, request
        record), the request record being marked concurrent to the principal
        one. Without a recorder it is a one-element tuple holding a record of
        type ``recorded_url.custom_type`` built from ``request_data``.
        """
        warc_date = self.format_warc_date(recorded_url.timestamp)

        if not recorded_url.response_recorder:
            # No recorded response: the request data itself is the record body.
            custom_record = self.build_warc_record(
                    url=recorded_url.url,
                    warc_date=warc_date,
                    data=recorded_url.request_data,
                    warc_type=recorded_url.custom_type,
                    content_type=recorded_url.content_type.encode("latin1"),
                    payload_digest=warcprox.digest_str(
                        recorded_url.payload_digest, self.base32),
                    content_length=recorded_url.size)
            return (custom_record,)

        principal_record = self._build_response_principal_record(
                recorded_url, warc_date)
        request_record = self.build_warc_record(
                url=recorded_url.url,
                warc_date=warc_date,
                data=recorded_url.request_data,
                warc_type=warctools.WarcRecord.REQUEST,
                content_type=hanzo.httptools.RequestMessage.CONTENT_TYPE,
                concurrent_to=principal_record.id)
        return principal_record, request_record
コード例 #29
0
ファイル: warc.py プロジェクト: ukwa/warcprox
    def build_warc_record(self, url, warc_date=None, recorder=None, data=None,
        concurrent_to=None, warc_type=None, content_type=None, remote_ip=None,
        profile=None, refers_to=None, refers_to_target_uri=None,
        refers_to_date=None, payload_digest=None, truncated=None,
        content_length=None):
        """Assemble a single ``warctools.WarcRecord``.

        The body comes from ``recorder.tempfile`` when ``recorder`` is given,
        otherwise from ``data`` (used as a content file if it has a ``read``
        attribute, else as the second item of a ``(content_type, data)``
        content tuple). Optional headers are only emitted for arguments that
        are not ``None``; ``warc_date`` defaults to the current UTC time and
        ``content_length`` overrides the measured body length.

        Without a ``recorder`` the block digest is set to the payload digest,
        which is computed from ``data`` when the caller passes none.
        """
        if warc_date is None:
            warc_date = self.format_warc_date(datetime.datetime.utcnow())

        wr = warctools.WarcRecord
        record_id = wr.random_warc_uuid()

        headers = [] if warc_type is None else [(wr.TYPE, warc_type)]
        headers += [
            (wr.ID, record_id),
            (wr.DATE, warc_date),
            (wr.URL, url),
        ]
        if remote_ip is not None:
            headers.append((wr.IP_ADDRESS, remote_ip))
        if profile is not None:
            headers.append((wr.PROFILE, profile))
        if refers_to is not None:
            headers.append((wr.REFERS_TO, refers_to))
        if refers_to_target_uri is not None:
            headers.append((wr.REFERS_TO_TARGET_URI, refers_to_target_uri))
        if refers_to_date is not None:
            headers.append((wr.REFERS_TO_DATE, refers_to_date))
        if concurrent_to is not None:
            headers.append((wr.CONCURRENT_TO, concurrent_to))
        if content_type is not None:
            headers.append((wr.CONTENT_TYPE, content_type))
        if payload_digest is not None:
            headers.append((wr.PAYLOAD_DIGEST, payload_digest))
        # truncated value may be 'length' or 'time'
        if truncated is not None:
            headers.append((b'WARC-Truncated', truncated))

        # An explicit content_length wins; otherwise measure the body source.
        if content_length is None:
            content_length = len(data) if recorder is None else len(recorder)
        headers.append(
            (wr.CONTENT_LENGTH, str(content_length).encode('latin1')))

        if recorder is not None:
            headers.append((
                wr.BLOCK_DIGEST,
                warcprox.digest_str(recorder.block_digest, self.base32)))
            # rewind so the record is written from the start of the capture
            recorder.tempfile.seek(0)
            return wr(headers=headers, content_file=recorder.tempfile)

        # no http headers so block digest == payload digest
        if not payload_digest:
            payload_digest = warcprox.digest_str(
                    hashlib.new(self.digest_algorithm, data), self.base32)
            headers.append((wr.PAYLOAD_DIGEST, payload_digest))
        headers.append((wr.BLOCK_DIGEST, payload_digest))

        if hasattr(data, 'read'):
            return wr(headers=headers, content_file=data)
        return wr(headers=headers, content=(content_type, data))
コード例 #30
0
 def put(self, recorded_url, block=True, timeout=None):
     """Log a one-line summary of ``recorded_url`` at INFO level.

     ``block`` and ``timeout`` are accepted but not used here.
     """
     summary = (
         recorded_url.client_ip,
         recorded_url.status,
         recorded_url.method,
         recorded_url.url.decode("utf-8"),
         recorded_url.mimetype,
         recorded_url.size,
         warcprox.digest_str(
             recorded_url.payload_digest, False).decode('utf-8'),
     )
     logging.info("{} {} {} {} {} size={} {}".format(*summary))
コード例 #31
0
 def put(self, recorded_url, block=True, timeout=None):
     """Log a one-line summary of ``recorded_url`` at INFO level.

     The digest shown is read from ``recorded_url.response_recorder``.
     ``block`` and ``timeout`` are accepted but not used here.
     """
     fields = (
         recorded_url.client_ip,
         recorded_url.status,
         recorded_url.method,
         recorded_url.url.decode("utf-8"),
         recorded_url.mimetype,
         recorded_url.size,
         warcprox.digest_str(
             recorded_url.response_recorder.payload_digest,
             False).decode('utf-8'),
     )
     line = "{} {} {} {} {} size={} {}".format(*fields)
     logging.info(line)
コード例 #32
0
ファイル: warc.py プロジェクト: ukwa/warcprox
    def build_warc_record(self,
                          url,
                          warc_date=None,
                          recorder=None,
                          data=None,
                          concurrent_to=None,
                          warc_type=None,
                          content_type=None,
                          remote_ip=None,
                          profile=None,
                          refers_to=None,
                          refers_to_target_uri=None,
                          refers_to_date=None,
                          payload_digest=None,
                          truncated=None,
                          content_length=None):
        """Create one ``warctools.WarcRecord`` from a recorder or raw data.

        Headers are appended in a fixed order, and each optional header is
        included only when its argument is not ``None``. ``warc_date``
        defaults to the current UTC time. When ``recorder`` is given its
        ``tempfile`` is rewound and used as the content file; otherwise
        ``data`` is used, as a content file if it has a ``read`` attribute
        and as part of a ``(content_type, data)`` tuple if not. An explicit
        ``content_length`` replaces the measured length.
        """
        if warc_date is None:
            warc_date = self.format_warc_date(datetime.datetime.utcnow())

        WarcRecord = warctools.WarcRecord
        record_id = WarcRecord.random_warc_uuid()

        headers = []
        if warc_type is not None:
            headers.append((WarcRecord.TYPE, warc_type))
        headers.extend([
            (WarcRecord.ID, record_id),
            (WarcRecord.DATE, warc_date),
            (WarcRecord.URL, url),
        ])
        if remote_ip is not None:
            headers.append((WarcRecord.IP_ADDRESS, remote_ip))
        if profile is not None:
            headers.append((WarcRecord.PROFILE, profile))
        if refers_to is not None:
            headers.append((WarcRecord.REFERS_TO, refers_to))
        if refers_to_target_uri is not None:
            headers.append(
                (WarcRecord.REFERS_TO_TARGET_URI, refers_to_target_uri))
        if refers_to_date is not None:
            headers.append((WarcRecord.REFERS_TO_DATE, refers_to_date))
        if concurrent_to is not None:
            headers.append((WarcRecord.CONCURRENT_TO, concurrent_to))
        if content_type is not None:
            headers.append((WarcRecord.CONTENT_TYPE, content_type))
        if payload_digest is not None:
            headers.append((WarcRecord.PAYLOAD_DIGEST, payload_digest))
        # truncated value may be 'length' or 'time'
        if truncated is not None:
            headers.append((b'WARC-Truncated', truncated))

        if recorder is None:
            body_length = len(data) if content_length is None else content_length
            headers.append((WarcRecord.CONTENT_LENGTH,
                            str(body_length).encode('latin1')))
            # no http headers so block digest == payload digest
            if not payload_digest:
                payload_digest = warcprox.digest_str(
                    hashlib.new(self.digest_algorithm, data), self.base32)
                headers.append((WarcRecord.PAYLOAD_DIGEST, payload_digest))
            headers.append((WarcRecord.BLOCK_DIGEST, payload_digest))
            if hasattr(data, 'read'):
                record = WarcRecord(headers=headers, content_file=data)
            else:
                record = WarcRecord(headers=headers,
                                    content=(content_type, data))
        else:
            body_length = (len(recorder) if content_length is None
                           else content_length)
            headers.append((WarcRecord.CONTENT_LENGTH,
                            str(body_length).encode('latin1')))
            headers.append((WarcRecord.BLOCK_DIGEST,
                            warcprox.digest_str(recorder.block_digest,
                                                self.base32)))
            # rewind so the record is written from the start of the capture
            recorder.tempfile.seek(0)
            record = WarcRecord(headers=headers,
                                content_file=recorder.tempfile)

        return record