Example no. 1
    def parse_warc_record(self, record):
        """ Parse warc record
        """

        entry = self._create_index_entry(record.rec_type)

        if record.rec_type == 'warcinfo':
            entry['url'] = record.rec_headers.get_header('WARC-Filename')
            entry['urlkey'] = entry['url']
            entry['_warcinfo'] = record.stream.read(record.length)
            return entry

        entry['url'] = record.rec_headers.get_header('WARC-Target-Uri')

        # timestamp
        entry['timestamp'] = iso_date_to_timestamp(record.rec_headers.
                                                   get_header('WARC-Date'))

        # mime
        if record.rec_type == 'revisit':
            entry['mime'] = 'warc/revisit'
        elif self.options.get('minimal'):
            entry['mime'] = '-'
        else:
            def_mime = '-' if record.rec_type == 'request' else 'unk'
            entry.extract_mime(record.status_headers.
                               get_header('Content-Type'),
                               def_mime)

        # status -- only for response records (by convention):
        if record.rec_type == 'response' and not self.options.get('minimal'):
            entry.extract_status(record.status_headers)
        else:
            entry['status'] = '-'

        # digest
        digest = record.rec_headers.get_header('WARC-Payload-Digest')
        entry['digest'] = digest
        if digest and digest.startswith('sha1:'):
            entry['digest'] = digest[len('sha1:'):]

        elif not entry.get('digest'):
            entry['digest'] = '-'

        # optional json metadata, if present
        metadata = record.rec_headers.get_header('WARC-Json-Metadata')
        if metadata:
            entry['metadata'] = metadata

        return entry
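
For orientation, the fields this method fills in for an ordinary response record look roughly like the sketch below; all values are invented for illustration, and other fields (such as offset and record length) are added elsewhere in the indexer.

# Illustrative shape of the entry built above for a response record
# (every value here is invented, not taken from a real capture):
entry = {
    'url': 'http://example.com/',
    'timestamp': '20170305123045',   # iso_date_to_timestamp(WARC-Date)
    'mime': 'text/html',             # from the Content-Type header
    'status': '200',                 # from the HTTP status line
    'digest': '3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ',  # 'sha1:' prefix stripped
}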
Example no. 2
    def _load_different_url_payload(self, cdx, headers_record,
                                    failed_files, cdx_loader):
        """
        Handle the case where a duplicate of a capture with same digest
        exists at a different url.

        If a cdx_server is provided, a query is made for matching
        url, timestamp and digest.

        Raise exception if no matches found.
        """

        ref_target_uri = (headers_record.rec_headers.
                          get_header('WARC-Refers-To-Target-URI'))

        target_uri = headers_record.rec_headers.get_header('WARC-Target-URI')

        # if there is no refers-to target uri, there is no way to find the original
        if not ref_target_uri:
            raise ArchiveLoadFailed(self.MISSING_REVISIT_MSG)

        ref_target_date = (headers_record.rec_headers.
                           get_header('WARC-Refers-To-Date'))

        if not ref_target_date:
            ref_target_date = cdx['timestamp']
        else:
            ref_target_date = iso_date_to_timestamp(ref_target_date)

        digest = cdx.get('digest', '-')

        try:
            orig_cdx_lines = self.load_cdx_for_dupe(ref_target_uri,
                                                    ref_target_date,
                                                    digest,
                                                    cdx_loader)
        except NotFoundException:
            raise ArchiveLoadFailed(self.MISSING_REVISIT_MSG)

        for orig_cdx in orig_cdx_lines:
            try:
                payload_record = self._resolve_path_load(orig_cdx, False,
                                                         failed_files)
                return payload_record

            except ArchiveLoadFailed:
                # this candidate could not be loaded; try the next one
                pass

        raise ArchiveLoadFailed(self.MISSING_REVISIT_MSG)
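
The only timestamp handling in this method is the WARC-Refers-To-Date fallback above. Pulled out on its own, that logic amounts to the sketch below; pick_revisit_timestamp is a hypothetical helper, and the import assumes warcio's timeutils module.

from warcio.timeutils import iso_date_to_timestamp

def pick_revisit_timestamp(refers_to_date, cdx_timestamp):
    # Prefer the WARC-Refers-To-Date header when present; otherwise fall back
    # to the revisit record's own cdx timestamp (hypothetical helper).
    if refers_to_date:
        return iso_date_to_timestamp(refers_to_date)
    return cdx_timestamp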
Example no. 3
    def parse_warc_record(self, record):
        """ Parse warc record
        """

        entry = self._create_index_entry(record.rec_type)

        if record.rec_type == 'warcinfo':
            entry['url'] = record.rec_headers.get_header('WARC-Filename')
            entry['urlkey'] = entry['url']
            entry['_warcinfo'] = record.stream.read(record.length)
            return entry

        entry['url'] = record.rec_headers.get_header('WARC-Target-Uri')

        # timestamp
        entry['timestamp'] = iso_date_to_timestamp(
            record.rec_headers.get_header('WARC-Date'))

        # mime
        if record.rec_type == 'revisit':
            entry['mime'] = 'warc/revisit'
        elif self.options.get('minimal'):
            entry['mime'] = '-'
        else:
            def_mime = '-' if record.rec_type == 'request' else 'unk'
            entry.extract_mime(
                record.status_headers.get_header('Content-Type'), def_mime)

        # status -- only for response records (by convention):
        if record.rec_type == 'response' and not self.options.get('minimal'):
            entry.extract_status(record.status_headers)
        else:
            entry['status'] = '-'

        # digest
        digest = record.rec_headers.get_header('WARC-Payload-Digest')
        entry['digest'] = digest
        if digest and digest.startswith('sha1:'):
            entry['digest'] = digest[len('sha1:'):]

        elif not entry.get('digest'):
            entry['digest'] = '-'

        # optional json metadata, if present
        metadata = record.rec_headers.get_header('WARC-Json-Metadata')
        if metadata:
            entry['metadata'] = metadata

        return entry
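
Every example in this listing relies on iso_date_to_timestamp to turn an ISO-8601 date into the 14-digit form used in CDX indexes. Assuming the warcio.timeutils implementation, the conversion behaves as in this small check (the input date is arbitrary):

from warcio.timeutils import iso_date_to_timestamp

# ISO-8601 date string -> 14-digit CDX-style timestamp (arbitrary example input)
print(iso_date_to_timestamp('2017-03-05T12:30:45Z'))  # expected: '20170305123045'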
Example no. 4
    def _load_different_url_payload(self, cdx, headers_record,
                                    failed_files, cdx_loader):
        """
        Handle the case where a duplicate of a capture with same digest
        exists at a different url.

        If a cdx_server is provided, a query is made for matching
        url, timestamp and digest.

        Raise exception if no matches found.
        """

        ref_target_uri = (headers_record.rec_headers.
                          get_header('WARC-Refers-To-Target-URI'))

        target_uri = headers_record.rec_headers.get_header('WARC-Target-URI')

        # the revisit cannot be resolved if the refers-to target uri is
        # missing or the same as the current record's url
        if not ref_target_uri or (ref_target_uri == target_uri):
            raise ArchiveLoadFailed('Missing Revisit Original')

        ref_target_date = (headers_record.rec_headers.
                           get_header('WARC-Refers-To-Date'))

        if not ref_target_date:
            ref_target_date = cdx['timestamp']
        else:
            ref_target_date = iso_date_to_timestamp(ref_target_date)

        digest = cdx['digest']

        orig_cdx_lines = self.load_cdx_for_dupe(ref_target_uri,
                                                ref_target_date,
                                                digest,
                                                cdx_loader)

        for orig_cdx in orig_cdx_lines:
            try:
                payload_record = self._resolve_path_load(orig_cdx, False,
                                                         failed_files)
                return payload_record

            except ArchiveLoadFailed:
                # this candidate could not be loaded; try the next one
                pass

        raise ArchiveLoadFailed('Original for revisit could not be loaded')
Example no. 5
def parse_warc_record(record):
    """ Parse warc record
    """

    entry = ArchiveIndexEntry()

    if record.rec_type == 'warcinfo':
        entry.url = record.rec_headers.get_header('WARC-Filename')
        entry.key = entry.url
        entry.warcinfo = record.stream.read(record.length)
        return entry

    entry.url = record.rec_headers.get_header('WARC-Target-Uri')

    # timestamp
    entry.timestamp = iso_date_to_timestamp(record.rec_headers.
                                            get_header('WARC-Date'))

    # mime
    if record.rec_type == 'revisit':
        entry.mime = 'warc/revisit'
    else:
        def_mime = '-' if record.rec_type == 'request' else 'unk'
        entry.extract_mime(record.status_headers.
                           get_header('Content-Type'),
                           def_mime)

    # status -- only for response records (by convention):
    if record.rec_type == 'response':
        entry.extract_status(record.status_headers)
    else:
        entry.status = '-'

    # digest
    entry.digest = record.rec_headers.get_header('WARC-Payload-Digest')
    if entry.digest and entry.digest.startswith('sha1:'):
        entry.digest = entry.digest[len('sha1:'):]

    if not entry.digest:
        entry.digest = '-'

    return entry
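
A rough sketch of driving this standalone variant over a WARC file, assuming warcio's ArchiveIterator and that the yielded records expose the attributes used above (newer warcio versions expose http_headers rather than status_headers, so treat this as illustrative only; the file name is a placeholder):

from warcio.archiveiterator import ArchiveIterator

with open('example.warc.gz', 'rb') as fh:        # placeholder file name
    for record in ArchiveIterator(fh):
        if record.rec_type == 'warcinfo':        # warcinfo entries carry no timestamp
            continue
        entry = parse_warc_record(record)        # function defined above
        print(entry.url, entry.timestamp, entry.digest)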
Example no. 6
def parse_warc_record(record):
    """ Parse warc record
    """

    entry = ArchiveIndexEntry()

    if record.rec_type == 'warcinfo':
        entry.url = record.rec_headers.get_header('WARC-Filename')
        entry.key = entry.url
        entry.warcinfo = record.stream.read(record.length)
        return entry

    entry.url = record.rec_headers.get_header('WARC-Target-Uri')

    # timestamp
    entry.timestamp = iso_date_to_timestamp(
        record.rec_headers.get_header('WARC-Date'))

    # mime
    if record.rec_type == 'revisit':
        entry.mime = 'warc/revisit'
    else:
        def_mime = '-' if record.rec_type == 'request' else 'unk'
        entry.extract_mime(record.status_headers.get_header('Content-Type'),
                           def_mime)

    # status -- only for response records (by convention):
    if record.rec_type == 'response':
        entry.extract_status(record.status_headers)
    else:
        entry.status = '-'

    # digest
    entry.digest = record.rec_headers.get_header('WARC-Payload-Digest')
    if entry.digest and entry.digest.startswith('sha1:'):
        entry.digest = entry.digest[len('sha1:'):]

    if not entry.digest:
        entry.digest = '-'

    return entry
Example no. 7
    def lookup_revisit(self, params, digest, url, iso_dt):
        params['url'] = url
        params['closest'] = iso_date_to_timestamp(iso_dt)

        # never match other revisit records when resolving the original
        filters = ['!mime:warc/revisit']

        if digest and digest != '-':
            filters.append('digest:' + digest.split(':')[-1])

        params['filter'] = filters

        cdx_iter, errs = self.cdx_lookup(params)

        for cdx in cdx_iter:
            res = self.dupe_policy(cdx, params)
            if res:
                return res

        return None
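
For reference, once this method has filled in the query, the params dict handed to cdx_lookup might look roughly like the following; the url, timestamp and digest values are invented.

# Illustrative contents of `params` just before cdx_lookup is called
# (all values below are invented):
params = {
    'url': 'http://example.com/page',
    'closest': '20170305123045',        # iso_date_to_timestamp(iso_dt)
    'filter': ['!mime:warc/revisit',    # exclude other revisit records
               'digest:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ'],
}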
Example no. 8
    def write_snapshot(self,
                       user,
                       coll,
                       url,
                       title,
                       html_text,
                       referrer,
                       user_agent,
                       browser=None):

        snap_title = 'Static Snapshots'

        snap_rec = self.sanitize_title(snap_title)

        if not self.manager.has_recording(user, coll, snap_rec):
            self.manager.create_recording(user, coll, snap_rec, snap_title)

        kwargs = dict(user=user,
                      coll=quote(coll),
                      rec=quote(snap_rec, safe='/*'),
                      type='snapshot')

        params = {'url': url}

        upstream_url = self.manager.content_app.get_upstream_url(
            '', kwargs, params)

        headers = {
            'Content-Type': 'text/html; charset=utf-8',
            'WARC-User-Agent': user_agent,
            'WARC-Referer': referrer,
        }

        r = requests.put(
            upstream_url,
            data=BytesIO(html_text.encode('utf-8')),
            headers=headers,
        )

        try:
            res = r.json()
            if res['success'] != 'true':
                print(res)
                return {'error_message': 'Snapshot Failed'}

            warc_date = res.get('WARC-Date')

        except Exception as e:
            print(e)
            return {'error_message': 'Snapshot Failed'}

        if not title:
            return {'snapshot': ''}

        if warc_date:
            timestamp = iso_date_to_timestamp(warc_date)
        else:
            timestamp = timestamp_now()

        page_data = {
            'url': url,
            'title': title,
            'timestamp': timestamp,
            'tags': ['snapshot'],
        }
        if browser:
            page_data['browser'] = browser

        self.manager.add_page(user, coll, snap_rec, page_data)

        return {'snapshot': page_data}
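
A hypothetical invocation, shown only to illustrate the return shape; the instance name snapshot_writer, the user and collection names, and the HTML body are all placeholders.

# Hypothetical call; every value here is a placeholder:
result = snapshot_writer.write_snapshot(
    user='alice',
    coll='my-collection',
    url='http://example.com/',
    title='Example Page',
    html_text='<html><body>hello</body></html>',
    referrer='',
    user_agent='Mozilla/5.0',
)
# On success: {'snapshot': {'url': ..., 'title': ..., 'timestamp': ..., 'tags': ['snapshot']}}
# On failure: {'error_message': 'Snapshot Failed'}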