Exemple #1
0
def load_test_archive(test_file, offset, length):
    """Load one record from a test archive and pretty-print a summary.

    The archive path is ``test_warc_dir`` with ``test_file`` appended;
    ``offset`` and ``length`` are passed unchanged to
    ``ArcWarcRecordLoader.load``.  Nothing is returned: the summary is
    written with ``pprint.pprint``.
    """
    archive_path = test_warc_dir + test_file
    record = ArcWarcRecordLoader().load(archive_path, offset, length)

    # Printed shape: ((format, rec_type), rec_headers, status_headers)
    kind = (record.format, record.rec_type)
    summary = (kind, record.rec_headers, record.status_headers)
    pprint.pprint(summary)
Exemple #2
0
    def _init_replay_view(self, config):
        """Build and return the ``ReplayView`` described by ``config``.

        Reads the ``cookie_maker`` and ``archive_paths`` entries through
        ``config.get`` (so either may be absent, yielding ``None``).
        """
        loader = ArcWarcRecordLoader(cookie_maker=config.get('cookie_maker'))

        # The configured paths are converted by calling a
        # PathResolverMapper instance on them.
        archive_paths = config.get('archive_paths')
        resolvers = PathResolverMapper()(archive_paths)

        return ReplayView(
            ResolvingLoader(resolvers, record_loader=loader),
            config)
Exemple #3
0
    def __init__(self, fileobj, no_record_parse=False, verify_http=False):
        """Set up the initial state for reading from ``fileobj``.

        :param fileobj: object stored on the instance as ``self.fh``
        :param no_record_parse: flag stored unchanged on the instance
        :param verify_http: forwarded to ``ArcWarcRecordLoader``
        """
        # Caller-supplied inputs.
        self.fh = fileobj
        self.no_record_parse = no_record_parse

        self.loader = ArcWarcRecordLoader(verify_http=verify_http)

        # Nothing has been read yet: no reader, no detected format,
        # no member info, and the position starts at zero.
        self.reader = self.known_format = self.member_info = None
        self.offset = 0
Exemple #4
0
def load_test_archive(test_file, offset, length):
    """Load one record from a test archive and pretty-print it 160 wide.

    The archive path is ``test_warc_dir`` with ``test_file`` appended;
    ``offset`` and ``length`` are passed unchanged to
    ``ArcWarcRecordLoader.load``.  After loading, the module-level
    ``pywb.utils.statusandheaders.WRAP_WIDTH`` is set to the same width
    that is given to ``pprint.pprint``.
    """
    display_width = 160

    archive_path = test_warc_dir + test_file
    record = ArcWarcRecordLoader().load(archive_path, offset, length)

    # Module-level setting, deliberately assigned after the load as in
    # the original ordering.
    pywb.utils.statusandheaders.WRAP_WIDTH = display_width

    # Printed shape: ((format, rec_type), rec_headers, status_headers)
    summary = ((record.format, record.rec_type),
               record.rec_headers,
               record.status_headers)
    pprint.pprint(summary, indent=1, width=display_width)
Exemple #5
0
    def _init_replay_view(self, config):
        """Build and return the ``WebRecReplayView`` described by ``config``.

        Reads ``cookie_maker``, ``archive_paths`` and
        ``redis_warc_resolver`` through ``config.get``.  When a truthy
        ``redis_warc_resolver`` is configured it is appended to the
        loader's ``path_resolvers`` list.
        """
        loader = ArcWarcRecordLoader(cookie_maker=config.get('cookie_maker'))

        archive_paths = config.get('archive_paths')
        resolvers = PathResolverMapper()(archive_paths)
        resolving_loader = ResolvingLoader(resolvers, record_loader=loader)

        # Optional extra resolver, added after the path-based ones.
        extra_resolver = config.get('redis_warc_resolver')
        if extra_resolver:
            resolving_loader.path_resolvers.append(extra_resolver)

        return WebRecReplayView(resolving_loader, config)
Exemple #6
0
    def __init__(self, query_handler, config=None):
        """Create the handler and its replay view.

        :param query_handler: stored as ``self.index_reader``
        :param config: optional mapping; the ``cookie_maker``,
            ``archive_paths`` and ``fallback`` entries are read with
            ``config.get``.  ``None`` is treated as an empty mapping.
        """
        # The parent class receives the argument exactly as passed.
        super(WBHandler, self).__init__(config)

        # The signature advertises ``config=None``, but every lookup below
        # calls ``config.get``; without this guard the default would raise
        # AttributeError.
        if config is None:
            config = {}

        self.index_reader = query_handler

        cookie_maker = config.get('cookie_maker')
        record_loader = ArcWarcRecordLoader(cookie_maker=cookie_maker)

        paths = config.get('archive_paths')

        resolving_loader = ResolvingLoader(paths=paths,
                                           record_loader=record_loader)

        self.replay = ReplayView(resolving_loader, config)

        # Only the fallback's name is known here; the handler itself is
        # left unset.
        self.fallback_handler = None
        self.fallback_name = config.get('fallback')
Exemple #7
0
def get_rendered_original_stream(warc_filename, warc_offset, compressedendoffset):
    """Fetch one record from a WARC over WebHDFS and return its payload.

    :param warc_filename: path appended to ``WEBHDFS_PREFIX``; ``None``
        means "not found"
    :param warc_offset: value sent as the ``offset`` query parameter
    :param compressedendoffset: when truthy, sent as the ``length``
        query parameter
    :return: ``(record.stream, record.content_type)`` for the parsed
        record, or ``(None, None)`` when ``warc_filename`` is ``None``
    """
    # If not found, say so:
    if warc_filename is None:
        return None, None

    # Grab the payload from the WARC and return it.
    url = "%s%s?op=OPEN&user.name=%s&offset=%s" % (
        WEBHDFS_PREFIX, warc_filename, WEBHDFS_USER, warc_offset)
    if compressedendoffset:
        url = "%s&length=%s" % (url, compressedendoffset)

    # Logger arguments are passed lazily so the message is only formatted
    # when the record is actually emitted.
    logger.info("Requesting copy from HDFS: %s ", url)
    r = requests.get(url, stream=True)
    logger.info("Loading from: %s", r.url)

    # Turn off decoding on the raw stream; decompression is left to
    # DecompressingBufferedReader below.
    r.raw.decode_content = False
    rl = ArcWarcRecordLoader()
    logger.info("Passing response to parser...")
    record = rl.parse_record_stream(DecompressingBufferedReader(stream=r.raw))
    logger.info("RESULT:")
    logger.info(record)

    logger.info("Returning stream...")
    return record.stream, record.content_type
Exemple #8
0
def get_rendered_original(url,
                          type='screenshot',
                          target_timestamp=30001201235900):
    """
    Fetch a rendered resource (for example a screenshot) for ``url``.

    This exists because Wayback does not accept the extended URIs used
    for rendered resources: it rewrites 'screenshot:http://' into
    'http://screenshot:http://'.

    Returns ``None`` when the CDX lookup yields no WARC filename, otherwise
    the tuple ``(record.stream, record.content_type)``.
    """
    # The CDX lookup key is the URL prefixed with the resource type.
    cdx_key = "%s:%s" % (type, url)
    warc_filename, warc_offset, compressedendoffset = lookup_in_cdx(
        cdx_key, target_timestamp)

    # Nothing indexed for this key.
    if warc_filename is None:
        return None

    # WebHDFS connection details come from the environment and are only
    # read once a record has been located.
    hdfs_prefix = os.environ['WEBHDFS_PREFIX']
    hdfs_user = os.environ['WEBHDFS_USER']
    fetch_url = "%s%s?op=OPEN&user.name=%s&offset=%s" % (
        hdfs_prefix, warc_filename, hdfs_user, warc_offset)
    if compressedendoffset:
        fetch_url = "%s&length=%s" % (fetch_url, compressedendoffset)

    response = requests.get(fetch_url, stream=True)
    # Turn off decoding on the raw stream; decompression is left to
    # DecompressingBufferedReader.
    response.raw.decode_content = False

    raw_reader = DecompressingBufferedReader(stream=response.raw)
    record = ArcWarcRecordLoader().parse_record_stream(raw_reader)

    return record.stream, record.content_type
Exemple #9
0
def get_rendered_original():
    """
    Grabs a rendered resource.

    Only reason Wayback can't do this is that it does not like the extended URIs
    i.e. 'screenshot:http://' and replaces them with 'http://screenshot:http://'

    Reads the ``url`` and ``type`` (default ``'screenshot'``) request
    arguments, looks the combined key up via ``lookup_in_cdx``, aborts
    with 404 when no WARC filename is found, and otherwise sends the
    record payload back with the record's content type.
    """
    url = request.args.get('url')
    app.logger.debug("Got URL: %s", url)
    #
    resource_type = request.args.get('type', 'screenshot')
    app.logger.debug("Got type: %s", resource_type)

    # Query URL
    qurl = "%s:%s" % (resource_type, url)
    # Query CDX Server for the item
    (warc_filename, warc_offset) = lookup_in_cdx(qurl)

    # If not found, say so:
    if warc_filename is None:
        abort(404)

    # Grab the payload from the WARC and return it.
    r = requests.get("%s%s%s?op=OPEN&user.name=%s&offset=%s" %
                     (systems().webhdfs, h3().hdfs_root_folder, warc_filename,
                      webhdfs().user, warc_offset))
    app.logger.info("Loading from: %s", r.url)
    r.raw.decode_content = False
    rl = ArcWarcRecordLoader()
    # The whole response body is buffered in memory before parsing.
    record = rl.parse_record_stream(
        DecompressingBufferedReader(stream=io.BytesIO(r.content)))

    # Diagnostics go through the application logger rather than stdout.
    app.logger.debug("Parsed record: %s", record)
    app.logger.debug("Record length: %s", record.length)
    app.logger.debug("Record stream limit: %s", record.stream.limit)

    return send_file(record.stream, mimetype=record.content_type)
Exemple #10
0
 def __init__(self, path_resolvers, record_loader=None, no_record_parse=False):
     """Store the resolvers, record loader and parse flag on the instance.

     :param path_resolvers: stored as ``self.path_resolvers``
     :param record_loader: loader to use; when omitted (``None``) a new
         ``ArcWarcRecordLoader()`` is created for this instance
     :param no_record_parse: stored as ``self.no_record_parse``
     """
     # A default written as ``ArcWarcRecordLoader()`` in the signature is
     # evaluated once at definition time and then shared by every instance;
     # build it per instance instead.
     if record_loader is None:
         record_loader = ArcWarcRecordLoader()

     self.path_resolvers = path_resolvers
     self.record_loader = record_loader
     self.no_record_parse = no_record_parse
Exemple #11
0
def parse_stream_error(**params):
    """Parse a record stream, printing the exception class name on failure.

    All keyword arguments are forwarded to
    ``ArcWarcRecordLoader().parse_record_stream``.  On success the parsed
    record is returned.  If any ``Exception`` is raised, the line
    ``Exception: <ClassName>`` is printed and ``None`` is returned
    implicitly.
    """
    try:
        return ArcWarcRecordLoader().parse_record_stream(**params)
    except Exception as e:
        # Single-argument call form: prints the same text under the
        # Python 2 print statement and the Python 3 print function.
        print('Exception: ' + e.__class__.__name__)