Example #1
0
def test_unicode_url():
    """CDXObject decodes UTF-8 bytes and percent-encodes non-ASCII in the url field."""
    line = u'com,example,cafe)/ 123 {"url": "http://example.com/café/path"}'
    obj = CDXObject(line.encode('utf-8'))

    assert obj['urlkey'] == 'com,example,cafe)/'
    assert obj['timestamp'] == '123'
    assert obj['url'] == 'http://example.com/caf%C3%A9/path'

    expected = 'com,example,cafe)/ 123 {"url": "http://example.com/caf%C3%A9/path"}\n'
    assert obj.to_cdxj() == expected
Example #2
0
    def lookup(self, digest, url, timestamp):
        """Search the sorted-set index for a cdx entry matching `digest` at `url`.

        Returns a ('revisit', url, datetime) tuple for the first match,
        or None if no entry with that digest exists.
        """
        start, end = calc_search_range(url, 'exact')
        members = self.redis.zrangebylex(self.key, '[' + start, '(' + end)

        for member in members:
            cdx = CDXObject(member)
            if cdx.get('digest') == digest:
                return ('revisit', cdx['url'],
                        timestamp_to_datetime(cdx['timestamp']))

        return None
Example #3
0
def test_lt_le():
    """Comparison operators on CDXObject follow the underlying cdx line ordering."""
    a = CDXObject(b'ca,example)/ 2016 {"url": "http://example.com/"}')
    b = CDXObject(b'com,example)/ 2015 {"url": "http://example.com/"}')
    c = CDXObject(b'com,example)/ 2016 {"url": "http://example.com/"}')

    # urlkey dominates the ordering; timestamp breaks ties
    assert a < b
    assert b < c
    assert b >= a
    assert c >= a
    assert a < c
Example #4
0
def load_from_cdx_test(cdx, revisit_func=load_orig_cdx, reraise=False,
                       failed_files=None):
    """Resolve a cdx line against the test warc dir and print the result.

    Prints the headers repr and the first two payload lines; on
    ArchiveLoadFailed, prints the exception class name unless `reraise`.
    """
    loader = ResolvingLoader(PathResolverMapper()(test_warc_dir))
    cdx_obj = CDXObject(cdx.encode('utf-8'))

    try:
        headers, stream = loader(cdx_obj, failed_files, revisit_func)
        print(repr(headers))
        for _ in range(2):
            sys.stdout.write(stream.readline().decode('utf-8'))
    except ArchiveLoadFailed as exc:
        if reraise:
            raise
        print('Exception: ' + exc.__class__.__name__)
Example #5
0
    def cdx_index(self, z_key, stream, filename):
        """Index `stream` as cdxj lines into the redis sorted set `z_key`.

        Returns (min_, max_) — the earliest and latest indexed timestamps
        converted to seconds, or (None, None) if nothing was indexed.
        Skips all writes when self.dry is set.
        """
        buff = BytesIO()
        write_cdx_index(buff, stream, filename, cdxj=True, append_post=True)

        count = 0
        min_ = max_ = None

        for line in buff.getvalue().rstrip().split(b'\n'):
            if not line or self.dry:
                continue

            self.dst_redis.zadd(z_key, 0, line)

            ts = CDXObject(line)['timestamp']

            min_ = ts if not min_ else min(min_, ts)
            max_ = ts if not max_ else max(max_, ts)

            count += 1

        if count:
            min_ = timestamp_to_sec(min_)
            max_ = timestamp_to_sec(max_)

        logging.info('  CDXJ: {0} {1} {2}'.format(count, min_, max_))
        return min_, max_
    def test_anon_download_coll(self):
        """Download the anon temp collection and verify the cdx of the returned warc.

        Fix: the original *assigned* the expected url/mime values onto the
        CDXObjects (``cdx[0]['url'] = ...``) instead of asserting them, so
        the field checks were no-ops; they are real assertions now.
        """
        res = self._get_anon('/temp/$download')

        assert res.headers['Content-Disposition'].startswith("attachment; filename*=UTF-8''temp-")

        warcin = self._get_dechunked(res.body)

        cdxout = BytesIO()
        write_cdx_index(cdxout, warcin, 'temp.warc.gz', include_all=True, cdxj=True)

        cdx = [CDXObject(cdx) for cdx in cdxout.getvalue().rstrip().split(b'\n')]
        assert len(cdx) == 6

        # response
        assert cdx[0]['url'] == 'http://httpbin.org/get?food=bar'
        assert cdx[0]['mime'] == 'application/json'

        # request
        assert cdx[1]['url'] == 'http://httpbin.org/get?food=bar'
        assert cdx[1]['mime'] == '-'

        # response
        assert cdx[2]['url'] == 'http://httpbin.org/get?bood=far'
        assert cdx[2]['mime'] == 'application/json'

        # request
        assert cdx[3]['url'] == 'http://httpbin.org/get?bood=far'
        assert cdx[3]['mime'] == '-'
Example #7
0
    def add_page(self, user, coll, pagedata):
        """Add a page record (JSON) to the user's collection page set.

        Returns False with a printed reason when the user cannot write to
        the collection, the url cannot be canonicalized, or no cdx entry
        exists to derive a missing timestamp from.

        Fix: bare ``except:`` narrowed to ``except Exception`` so that
        KeyboardInterrupt/SystemExit are not silently swallowed.
        """
        if not self.can_write_coll(user, coll):
            print('Cannot Write')
            return False

        url = pagedata['url']

        try:
            key, end_key = calc_search_range(url, 'exact')
        except Exception:
            print('Cannot Canon')
            return False

        if 'ts' not in pagedata:
            # no timestamp supplied: take it from the most recent cdx entry
            cdx_key = self.make_key(user, coll, self.CDX_KEY)
            result = self.redis.zrangebylex(cdx_key, '[' + key, '(' + end_key)
            if not result:
                print('NO CDX')
                return False

            last_cdx = CDXObject(result[-1])

            pagedata['ts'] = last_cdx['timestamp']

        pagedata_json = json.dumps(pagedata)

        key = self.make_key(user, coll, self.PAGE_KEY)

        self.redis.sadd(key, pagedata_json)
Example #8
0
def load_from_cdx_test(cdx,
                       revisit_func=load_orig_cdx,
                       reraise=False,
                       failed_files=None):
    """Load and print the record for a cdx line from the test warc directory.

    On ArchiveLoadFailed, re-raises when `reraise` is set, otherwise
    prints the exception class name.
    """
    resolve_loader = ResolvingLoader(PathResolverMapper()(test_warc_dir))
    cdx_obj = CDXObject(cdx.encode('utf-8'))

    try:
        headers, stream = resolve_loader(cdx_obj, failed_files, revisit_func)
        print(repr(headers))
        sys.stdout.write(stream.readline().decode('utf-8'))
        sys.stdout.write(stream.readline().decode('utf-8'))
    except ArchiveLoadFailed as exc:
        if reraise:
            raise
        print('Exception: ' + exc.__class__.__name__)
Example #9
0
 def index(self):
     """Yield a CDXObject for each line of every source file, with the
     parsed url attached under 'url_parsed'.

     Fix: removed the explicit ``fd.close()`` — it was redundant (the
     ``with`` block already closes the file on exit).
     """
     for filename in self.sources:
         with open_file(filename) as fd:
             for line in fd:
                 idx = CDXObject(line)
                 idx['url_parsed'] = urlparse(idx['url'])
                 yield idx
Example #10
0
    def convert_line(self, line, url):
        """Build an 11-field CDXObject from a tab-separated index line.

        `line` carries timestamp, mime and filename; the remaining fields
        are filled with fixed placeholder values.
        """
        timestamp, mime, filename = line.split('\t')

        cdx = CDXObject()
        # assignment order matters: it defines the output field order
        for field, value in (('urlkey', canonicalize(url)),
                             ('timestamp', timestamp),
                             ('original', url),
                             ('mimetype', mime),
                             ('statuscode', '200'),
                             ('digest', '-'),
                             ('length', '-1'),
                             ('offset', '0'),
                             ('filename', filename)):
            cdx[field] = value

        return cdx
Example #11
0
 def convert_to_cdx(self, item, urlkey, url):
     """Map a source item record to a CDXObject (length always '-')."""
     mime = gettext(item, 'primaryType') + '/' + gettext(item, 'subType')

     cdx = CDXObject()
     # assignment order matters: it defines the output field order
     for field, value in (('urlkey', canonicalize(url)),
                          ('timestamp', gettext(item, 'tstamp')[:14]),
                          ('url', url),
                          ('mime', mime),
                          ('status', '-'),
                          ('digest', gettext(item, 'digest')),
                          ('length', '-'),
                          ('offset', gettext(item, 'arcoffset')),
                          ('filename', gettext(item, 'arcname') + '.arc.gz')):
         cdx[field] = value

     return cdx
Example #12
0
def test_collapseTime_resolveRevisits_reverse(client):
    """A reversed, collapsed, revisit-resolved query yields timestamps descending."""
    resp = query(client,
                 'http://www.iana.org/_css/2013.1/print.css',
                 collapseTime='11',
                 resolveRevisits='true',
                 reverse='true')

    cdxes = [CDXObject(line) for line in resp.body.splitlines()]

    assert len(cdxes) == 3

    # each timestamp must be >= the one that follows it
    for earlier, later in zip(cdxes, cdxes[1:]):
        assert earlier['timestamp'] >= later['timestamp']
    def detect_pages(self, user, coll, rec):
        """Scan the recording's cdxj key and return up to 500 page dicts.

        Each dict has 'url' and 'timestamp' keys; entries that do not
        satisfy self.is_page are skipped.
        """
        key = self.cdxj_key.format(user=user, coll=coll, rec=rec)

        pages = []

        for member in self.manager.redis.zrange(key, 0, -1):
            cdxj = CDXObject(member.encode('utf-8'))

            if len(pages) < 500 and self.is_page(cdxj):
                pages.append({'url': cdxj['url'],
                              'timestamp': cdxj['timestamp']})

        return pages
Example #14
0
def load_from_cdx_test(cdx, revisit_func=load_orig_cdx, reraise=False,
                       failed_files=None):
    """Resolve a cdx record against the test warc dir and print the result.

    Fix: Python 2 print *statements* replaced with the function-call
    form — identical output for these single-argument prints, but also
    valid Python 3 syntax.
    """
    resolve_loader = ResolvingLoader(test_warc_dir)
    cdx = CDXObject(cdx)

    try:
        (headers, stream) = resolve_loader(cdx, failed_files, revisit_func)
        print(headers)
        sys.stdout.write(stream.readline())
        sys.stdout.write(stream.readline())
    except ArchiveLoadFailed as e:
        if reraise:
            raise
        else:
            print('Exception: ' + e.__class__.__name__)
Example #15
0
    def _get_url_ts(self, user, coll, rec, url):
        """Return the timestamp of the most recent cdx entry for `url`, or None.

        Fix: bare ``except:`` narrowed to ``except Exception`` so that
        KeyboardInterrupt/SystemExit are not silently swallowed.
        """
        try:
            key, end_key = calc_search_range(url, 'exact')
        except Exception:
            # url could not be canonicalized into a search range
            return None

        cdx_key = self.cdx_key.format(user=user, coll=coll, rec=rec)

        result = self.redis.zrangebylex(cdx_key, '[' + key, '(' + end_key)
        if not result:
            return None

        last_cdx = CDXObject(result[-1])

        return last_cdx['timestamp']
Example #16
0
    def convert_to_cdxj(self):
        """Rewrite each legacy .cdx file as a .cdxj file, re-canonicalizing urlkeys.

        Output is written to a .tmp file first, then moved into place and
        the original .cdx removed.
        """
        cdxj_writer = CDXJ()
        for src in self.iter_cdx_files():
            dest = src + 'j'

            print('Converting {0} -> {1}'.format(src, dest))

            with open(dest + '.tmp', 'w+b') as out, open(src) as fh:
                for line in fh:
                    # skip the legacy " CDX" header line
                    if line.startswith(' CDX'):
                        continue
                    cdx = CDXObject(line)
                    cdx[URLKEY] = canonicalize(cdx[ORIGINAL])
                    cdxj_writer.write_cdx_line(out, cdx, cdx['filename'])

            shutil.move(dest + '.tmp', dest)
            os.remove(src)
Example #17
0
    def test_convert_cdx(self):
        """ Create non-surt cdx, then convert to cdxj
        """
        # scratch dir holding the legacy .cdx files to migrate
        migrate_dir = os.path.join(self.root_dir, '_migrate')

        os.mkdir(migrate_dir)

        # -u writes non-surt (url-keyed) cdx, per the docstring above
        cdxindexer_main(['-u', migrate_dir, self._get_sample_warc('')])

        # try one file with -9
        cdxindexer_main(['-u', '-9', migrate_dir, self._get_sample_warc('example.warc.gz')])

        cdxs = os.listdir(migrate_dir)
        assert all(x.endswith('.cdx') for x in cdxs)

        # answering anything but 'y' at the prompt must leave files untouched
        @patch('pywb.manager.manager.get_input', lambda x: 'blah')
        def do_migrate_no():
            main(['cdx-convert', migrate_dir])

        do_migrate_no()
        assert os.listdir(migrate_dir) == cdxs

        # answering 'y' performs the conversion
        @patch('pywb.manager.manager.get_input', lambda x: 'y')
        def do_migrate_yes():
            main(['cdx-convert', migrate_dir])

        do_migrate_yes()
        cdxjs = os.listdir(migrate_dir)

        # every .cdx should have been replaced by one .cdxj
        assert len(cdxs) == len(cdxjs)
        assert all(x.endswith('.cdxj') for x in cdxjs)

        # spot-check one converted record
        with open(os.path.join(migrate_dir, 'iana.cdxj'), 'rb') as fh:
            cdx = CDXObject(fh.readline())
            assert cdx['urlkey'] == 'org,iana)/'
            assert cdx['timestamp'] == '20140126200624'
            assert cdx['url'] == 'http://www.iana.org/'
            #assert fh.readline().startswith('org,iana)/ 20140126200624 {"url": "http://www.iana.org/",')

        # Nothing else to migrate
        main(['cdx-convert', migrate_dir])
Example #18
0
 def __parsed(self):
     """Parse the raw cdx line into a fresh CDXObject."""
     raw_line = self.raw
     return CDXObject(raw_line)
Example #19
0
def load_orig_cdx(self):
    """Return a bad cdx record followed by the url-agnostic original record."""
    lines = (BAD_ORIG_CDX, URL_AGNOSTIC_ORIG_CDX)
    return [CDXObject(line) for line in lines]
Example #20
0
def load_orig_bad_cdx(self):
    """Return two (distinct) CDXObjects built from the bad original cdx line."""
    return [CDXObject(BAD_ORIG_CDX) for _ in range(2)]
Example #21
0
def test_invalid_cdx_format():
    """A three-field line is not a recognized cdx format and raises CDXException."""
    with raises(CDXException):
        CDXObject('a b c')
Example #22
0
def test_empty_cdxobject():
    """An empty line parses to a CDXObject with no fields."""
    empty = CDXObject('')
    assert not len(empty)
Example #23
0
def load_orig_cdx(_):
    """Return a bad cdx record followed by the url-agnostic original record."""
    bad = CDXObject(BAD_ORIG_CDX)
    agnostic = CDXObject(URL_AGNOSTIC_ORIG_CDX.encode('utf-8'))
    return [bad, agnostic]
Example #24
0
def test_unicode_url():
    """Non-ASCII characters in the url field are percent-encoded on parse."""
    line = 'com,example,cafe)/ 123 {"url": "http://example.com/café/path"}'
    x = CDXObject(line)

    assert x['urlkey'] == 'com,example,cafe)/'
    assert x['timestamp'] == '123'
    assert x['url'] == 'http://example.com/caf%C3%A9/path'
Example #25
0
    def process_record(self, record, flow):
        """Rewrite a proxied record's content for replay.

        Returns (status_headers, stream): the raw record stream when no
        content rewriter is configured, otherwise the rewritten content —
        streamed as-is if a positive content-length survived rewriting,
        else chunked (HTTP/1.1) or fully buffered so a length can be sent.

        Fixes: bare ``except:`` around ``int(res)`` narrowed to the two
        exceptions int() can raise here; removed the unused local
        ``scheme``.
        """
        headers = flow.response.headers
        url = flow.request.req_url

        # no rewriter configured: pass the record through untouched
        if not self.content_rewriter:
            return record.http_headers, StreamIO(record.raw_stream)

        cookie_rewriter = None

        template_params = flow.extra_data

        environ = {
            'pywb_proxy_magic': self.proxy_magic,
            'webrec.template_params': template_params
        }

        wb_url = WbUrl(url)
        wb_prefix = ''
        host_prefix = flow.request.req_scheme + '://' + self.proxy_magic
        urlrewriter = SchemeOnlyUrlRewriter(wb_url, '')

        if flow.request.headers.get('X-Requested-With',
                                    '').lower() == 'xmlhttprequest':
            urlrewriter.rewrite_opts['is_ajax'] = True

        head_insert_func = (self.head_insert_view.create_insert_func(
            wb_url, wb_prefix, host_prefix, url, environ, False))

        urlkey = canonicalize(wb_url.url)

        # build a minimal cdx record describing this capture
        cdx = CDXObject()
        cdx['urlkey'] = urlkey
        cdx['timestamp'] = http_date_to_timestamp(
            headers.get('Memento-Datetime'))
        cdx['url'] = wb_url.url
        if headers.get('Webagg-Source-Coll') == 'live':
            cdx['is_live'] = 'true'

        result = self.content_rewriter.rewrite_content(
            urlrewriter, record.http_headers, record.raw_stream,
            head_insert_func, urlkey, cdx, cookie_rewriter, environ)

        status_headers, gen, is_rw = result

        status_headers.remove_header('Content-Security-Policy')

        # check for content-length
        res = status_headers.get_header('content-length')
        try:
            if int(res) > 0:
                return status_headers, IterIdent(gen)
        except (TypeError, ValueError):
            # header missing (None) or non-numeric — fall through
            pass

        # need to either chunk or buffer to get content-length
        if flow.request.http_version == 'HTTP/1.1':
            status_headers.remove_header('content-length')
            status_headers.headers.append(('Transfer-Encoding', 'chunked'))
        else:
            gen = buffer_iter(status_headers, gen)

        return status_headers, IterIdent(gen)
Example #26
0
def _make_line(fields):
    """A line of `fields` dash columns parses to that many fields and round-trips."""
    line = ' '.join('-' for _ in range(fields))
    parsed = CDXObject(line)
    assert len(parsed) == fields
    assert str(parsed) == line
Example #27
0
def _make_line(fields):
    """Encoded dash-column line of `fields` fields parses and round-trips via str()."""
    columns = ['-'] * fields
    line = ' '.join(columns)
    parsed = CDXObject(line.encode('utf-8'))
    assert len(parsed) == fields
    assert str(parsed) == line