Example #1
0
    def cdx_index(self, z_key, stream, filename):
        """Index ``stream`` as CDXJ and store each line under ``z_key``.

        The index is generated in memory with ``write_cdx_index`` (CDXJ
        output, ``append_post`` enabled). Every non-empty line is added to
        the ``z_key`` sorted set on ``self.dst_redis`` with score 0. When
        ``self.dry`` is set nothing is written and nothing is counted.

        Returns a ``(min, max)`` tuple of the smallest and largest
        ``timestamp`` fields seen, passed through ``timestamp_to_sec``;
        both are ``None`` when no line was stored.
        """
        index_buff = BytesIO()
        write_cdx_index(index_buff, stream, filename, cdxj=True, append_post=True)

        earliest = latest = None
        num_added = 0

        for line in index_buff.getvalue().rstrip().split(b'\n'):
            # Skip blank lines; in dry mode skip everything.
            if not line or self.dry:
                continue

            self.dst_redis.zadd(z_key, 0, line)
            stamp = CDXObject(line)['timestamp']

            if earliest:
                earliest = min(earliest, stamp)
            else:
                earliest = stamp

            if latest:
                latest = max(latest, stamp)
            else:
                latest = stamp

            num_added += 1

        if num_added:
            earliest = timestamp_to_sec(earliest)
            latest = timestamp_to_sec(latest)

        logging.info('  CDXJ: {0} {1} {2}'.format(num_added, earliest, latest))
        return earliest, latest
Example #2
0
def cdx_index(warc, **options):
    """Index the test WARC named ``warc`` and return the index as bytes.

    The file is read from ``TEST_WARC_DIR``; ``options`` are forwarded
    unchanged to ``write_cdx_index``.
    """
    output = BytesIO()
    warc_path = TEST_WARC_DIR + warc

    with open(warc_path, 'rb') as warc_stream:
        write_cdx_index(output, warc_stream, warc, **options)

    return output.getvalue()
Example #3
0
    def _load_and_index(self, warc_path):
        """Build a gzipped CDX index for ``warc_path`` and upload it.

        The WARC is fetched from ``self.warc_bucket`` into a temporary file,
        indexed with ``write_cdx_index`` (using ``self.index_options``) into
        a gzip-compressed temporary file, and that file is stored in
        ``self.cdx_bucket`` under the path given by
        ``self._conv_warc_to_cdx_path``.

        When ``self.options.skip_existing`` is set and the index key already
        exists, 'Already Exists' is written to stderr and nothing is done.
        """
        source_key = self.warc_bucket.get_key(warc_path)
        cdx_path = self._conv_warc_to_cdx_path(warc_path)

        # Only look up the destination key when skipping is requested.
        if self.options.skip_existing and self.cdx_bucket.get_key(cdx_path):
            sys.stderr.write('Already Exists\n')
            return

        with TemporaryFile(mode='w+b') as warc_tmp:
            # Copy the WARC locally, then rewind so indexing starts at byte 0.
            shutil.copyfileobj(source_key, warc_tmp)
            warc_tmp.seek(0)

            with TemporaryFile(mode='w+b') as cdx_tmp:
                # Index to temp
                with GzipFile(fileobj=cdx_tmp, mode='w+b') as gz_index:
                    write_cdx_index(gz_index, warc_tmp, warc_path,
                                    **self.index_options)

                # Upload temp -- the gzip stream is closed at this point.
                dest_key = self.cdx_bucket.new_key(cdx_path)
                cdx_tmp.flush()
                dest_key.set_contents_from_file(cdx_tmp, rewind=True)
Example #4
0
    def cdx_index(self, z_key, stream, filename):
        """Generate a CDXJ index for ``stream`` and push it into redis.

        ``write_cdx_index`` is run against an in-memory buffer (CDXJ format,
        ``append_post`` on). Unless ``self.dry`` is set, each non-empty index
        line is added to the sorted set ``z_key`` on ``self.dst_redis`` with
        score 0 and its ``timestamp`` field is tracked.

        Returns ``(min, max)`` of the tracked timestamps after conversion by
        ``timestamp_to_sec``, or ``(None, None)`` if no line was stored.
        """
        buff = BytesIO()
        write_cdx_index(buff, stream, filename,
                        cdxj=True, append_post=True)

        lines = buff.getvalue().rstrip().split(b'\n')

        oldest = None
        newest = None
        stored = 0

        for entry in lines:
            # Blank entries are ignored; dry mode stores and counts nothing.
            if not entry or self.dry:
                continue

            self.dst_redis.zadd(z_key, 0, entry)
            ts = CDXObject(entry)['timestamp']

            oldest = min(oldest, ts) if oldest else ts
            newest = max(newest, ts) if newest else ts
            stored += 1

        if stored:
            oldest = timestamp_to_sec(oldest)
            newest = timestamp_to_sec(newest)

        logging.info('  CDXJ: {0} {1} {2}'.format(stored, oldest, newest))
        return oldest, newest
Example #5
0
def cdx_index(warc, **options):
    """Return the index bytes produced for the test WARC file ``warc``.

    ``warc`` is resolved relative to ``TEST_WARC_DIR`` and any keyword
    ``options`` are passed straight through to ``write_cdx_index``.
    """
    index_out = BytesIO()

    with open(TEST_WARC_DIR + warc, 'rb') as source:
        write_cdx_index(index_out, source, warc, **options)

    result = index_out.getvalue()
    return result
    def test_anon_download_coll(self):
        """Download the anonymous 'temp' collection and index the returned WARC.

        Checks the Content-Disposition header of the download response and
        that indexing the de-chunked body as CDXJ (with ``include_all``)
        yields six entries.
        """
        res = self._get_anon('/temp/$download')

        assert res.headers['Content-Disposition'].startswith("attachment; filename*=UTF-8''temp-")

        warcin = self._get_dechunked(res.body)

        cdxout = BytesIO()
        write_cdx_index(cdxout, warcin, 'temp.warc.gz', include_all=True, cdxj=True)

        #print(cdxout.getvalue().decode('utf-8'))

        # One CDXObject per CDXJ line of the generated index.
        cdx = [CDXObject(cdx) for cdx in cdxout.getvalue().rstrip().split(b'\n')]
        assert len(cdx) == 6

        # NOTE(review): the statements below ASSIGN to the parsed entries rather
        # than asserting on them, so they verify nothing. They were presumably
        # meant to be `assert cdx[i][...] == ...` -- confirm expected values
        # and record order before converting.

        # response
        cdx[0]['url'] = 'http://httpbin.org/get?food=bar'
        cdx[0]['mime'] = 'application/json'

        # request
        cdx[1]['url'] = 'http://httpbin.org/get?food=bar'
        cdx[1]['mime'] = '-'

        # response
        cdx[2]['url'] = 'http://httpbin.org/get?bood=far'
        cdx[2]['mime'] = 'application/json'

        # request
        cdx[3]['url'] = 'http://httpbin.org/get?bood=far'
        cdx[3]['mime'] = '-'
Example #7
0
    def create_all_from_link(self, link):
        """Return a ``CDXLine`` for every index line of the link's WARC.

        The WARC at ``link.warc_storage_file()`` is indexed into memory with
        ``write_cdx_index``; the first output line (a header) is skipped and
        each remaining line is passed to ``get_or_create`` together with
        ``link``.
        """
        created = []
        path = link.warc_storage_file()

        with default_storage.open(path, 'rb') as warc_stream, io.BytesIO() as index_buff:
            write_cdx_index(index_buff, warc_stream, path)

            # Rewind and drop the header line before reading the entries.
            index_buff.seek(0)
            next(index_buff)

            for raw_line in index_buff:
                lookup = CDXLine.objects.get_or_create(link=link, raw=raw_line)
                created.append(lookup[0])

        return created
Example #8
0
    def load_cdx(self, query):
        """
            This function accepts a standard CDX request, except with a GUID instead of date, and returns a standard CDX 11 response.

            ``query.params['guid']`` identifies the Link and ``query.url`` the requested URL (falling back to the
            cached URL, then to ``link.submitted_url``).

            Returns an iterable of CDX lines (as ``str``) whose first field matches ``surt(url)``, or an empty list
            when the link, its .warc file, or the URL cannot be found.
        """
        guid = query.params['guid']
        url = query.url

        # We'll first check the key-value store to see if we cached the lookup for this guid on a previous request.
        # This will be common, since each playback triggers lots of requests for the same .warc file.
        cache_key = guid + '-surts'
        url_key = guid+'-url'
        surt_lookup = django_cache.get(cache_key)
        url = url or django_cache.get(url_key)
        if surt_lookup and url:
            surt_lookup = json.loads(surt_lookup)

        else:
            # nothing in cache; find requested link in database
            try:
                link = Link.objects.select_related().get(pk=guid)
            except Link.DoesNotExist:
                return []

            # cache url, which may be blank if this is the first request
            if not url:
                url = link.submitted_url
            django_cache.set(url_key, url, timeout=60*60)

            # get warc file
            for asset in link.assets.all():
                if '.warc' in asset.warc_capture:
                    warc_path = os.path.join(asset.base_storage_path, asset.warc_capture)
                    break
            else:
                return []  # no .warc file -- do something to handle this?

            # now we have to get an index of all the URLs in this .warc file
            # first try fetching it from a .cdx file on disk
            cdx_path = warc_path.replace('.gz', '').replace('.warc', '.cdx')

            if not default_storage.exists(cdx_path):
                # there isn't a .cdx file on disk either -- let's create it
                with default_storage.open(warc_path, 'rb') as warc_file, default_storage.open(cdx_path, 'wb') as cdx_file:
                    write_cdx_index(cdx_file, warc_file, warc_path, sort=True)

            # now load the URL index from disk and stick it in the cache
            # The file is opened in a `with` block so the handle is closed once the lookup is built;
            # the generator is fully consumed by dict() before the block exits.
            with default_storage.open(cdx_path, 'rb') as cdx_in:
                cdx_lines = (line.strip() for line in cdx_in)
                # groupby only merges adjacent lines; the index is written with sort=True above.
                surt_lookup = dict((key, list(val)) for key, val in groupby(cdx_lines, key=lambda line: line.split(' ', 1)[0]))
            django_cache.set(cache_key, json.dumps(surt_lookup), timeout=60*60)

        # find cdx lines for url
        sorted_url = surt(url)
        if sorted_url in surt_lookup:
            return (str(i) for i in surt_lookup[sorted_url])

        # didn't find requested url in this archive
        return []
Example #9
0
def test_cdxj_empty():
    """Indexing an empty stream in CDXJ mode writes nothing to the output."""
    index_out = BytesIO()
    empty_warc = BytesIO()

    write_cdx_index(index_out, empty_warc, 'empty.warc.gz', cdxj=True)

    assert index_out.getvalue() == b''
Example #10
0
def test_cdxj_empty():
    """An empty input stream must yield an empty CDXJ index."""
    source = BytesIO()
    target = BytesIO()
    index_kwargs = {'cdxj': True}

    write_cdx_index(target, source, 'empty.warc.gz', **index_kwargs)

    produced = target.getvalue()
    assert produced == b''
Example #11
0
    def create_all_from_asset(self, asset):
        """Return a ``CDXLine`` for every index line of the asset's WARC.

        The WARC path is ``asset.base_storage_path`` joined with
        ``asset.warc_capture``. The file is indexed into memory with
        ``write_cdx_index``, the leading header line is skipped, and every
        other line is passed to ``get_or_create`` together with ``asset``.
        """
        path = os.path.join(asset.base_storage_path, asset.warc_capture)

        with default_storage.open(path, 'rb') as warc_stream, \
                io.BytesIO() as index_buff:
            write_cdx_index(index_buff, warc_stream, path)

            # Back to the start, then discard the header line.
            index_buff.seek(0)
            next(index_buff)

            return [
                CDXLine.objects.get_or_create(asset=asset, raw=raw_line)[0]
                for raw_line in index_buff
            ]
Example #12
0
    def create_all_from_link(self, link):
        """Index the link's WARC and get-or-create one ``CDXLine`` per entry.

        ``write_cdx_index`` output is buffered in memory; its first line is a
        header and is skipped. The resulting model instances are returned as
        a list in index order.
        """
        rows = []
        storage_path = link.warc_storage_file()

        with default_storage.open(storage_path, 'rb') as warc_in, \
                io.BytesIO() as buff:
            write_cdx_index(buff, warc_in, storage_path)
            buff.seek(0)

            # Consume the header so only real entries are iterated below.
            next(buff)

            for entry in buff:
                found = CDXLine.objects.get_or_create(link=link, raw=entry)
                rows.append(found[0])

        return rows
Example #13
0
    def create_all_from_link(self, link):
        """Create or fetch ``CDXLine`` rows for the link's WARC and sync flags.

        The WARC at ``link.warc_storage_file()`` is indexed into memory and
        the header line skipped. For each remaining line a ``CDXLine`` is
        looked up (or created) by ``raw`` and ``link_id=link.guid``, its
        ``is_unlisted`` / ``is_private`` flags are copied from the link, and
        the row is saved. The saved rows are returned in index order.
        """
        path = link.warc_storage_file()
        saved = []

        with default_storage.open(path, 'rb') as warc_stream, io.BytesIO() as index_buff:
            write_cdx_index(index_buff, warc_stream, path)

            # Rewind and drop the header before reading entries.
            index_buff.seek(0)
            next(index_buff)

            for raw_line in index_buff:
                row = CDXLine.objects.get_or_create(raw=raw_line, link_id=link.guid)[0]
                # Mirror the link's visibility flags onto the index row.
                row.is_unlisted = link.is_unlisted
                row.is_private = link.is_private
                row.save()
                saved.append(row)

        return saved
Example #14
0
    def add_record(self, stream, name=None):
        """Index ``stream`` as CDXJ and add each line to the redis sorted set.

        ``stream`` is rewound first. ``name`` is the filename recorded in the
        index and defaults to ``stream.name``. Every non-empty index line is
        added to ``self.key`` on ``self.redis`` with score 0.

        Returns the last segment produced by splitting the index on
        newlines (the value the original code returned).
        """
        stream.seek(0)
        if not name:
            name = stream.name

        cdxout = BytesIO()
        write_cdx_index(cdxout, stream, name,
                        cdxj=True, append_post=True)

        cdxes = cdxout.getvalue()
        # getvalue() returns bytes, so split on a bytes separator; a str
        # separator raises TypeError on Python 3.
        for cdx in cdxes.split(b'\n'):
            if cdx:
                self.redis.zadd(self.key, 0, cdx)

        # NOTE(review): this is the final split segment, which is empty when
        # the index ends with a newline -- confirm what callers expect.
        return cdx
Example #15
0
def cdx(request):
    """
        This function handles WARC lookups by our warc server (running in warc_server).
        It accepts a standard CDX request, except with a GUID instead of date, and returns a standard CDX 11 response.
        If there's no warc for the requested GUID, or the requested URL isn't stored in that WARC, it returns a 404.

        The ``guid`` GET parameter selects the Link; ``url`` defaults to the link's submitted URL.
        The .cdx file is created next to the .warc (sorted) if it does not exist yet.
    """
    # find requested link and url
    try:
        link = Link.objects.select_related().get(pk=request.GET.get('guid'))
    except Link.DoesNotExist:
        print("COULDN'T FIND LINK")
        raise Http404
    url = request.GET.get('url', link.submitted_url)

    # get warc file
    for asset in link.assets.all():
        if '.warc' in asset.warc_capture:
            warc_path = os.path.join(asset.base_storage_path, asset.warc_capture)
            break
    else:
        print("COULDN'T FIND WARC")
        raise Http404 # no .warc file -- do something to handle this?

    # get cdx file
    cdx_path = warc_path.replace('.gz', '').replace('.warc', '.cdx')
    if not default_storage.exists(cdx_path):
        # if we can't find the CDX file associated with this WARC, create it
        with default_storage.open(warc_path, 'rb') as warc_file, default_storage.open(cdx_path, 'wb') as cdx_file:
            write_cdx_index(cdx_file, warc_file, warc_path, sort=True)

    # find cdx lines for url
    sorted_url = surt.surt(url)
    prefix = sorted_url+" "
    matches = []
    # open inside `with` so the storage handle is always closed, including on early exit
    with default_storage.open(cdx_path, 'rb') as cdx_lines:
        for line in cdx_lines:
            if line.startswith(prefix):
                matches.append(line)
            elif matches:
                # file may contain multiple matching lines in a row; we want to return all of them
                # if we've already found one or more matching lines, and now they're no longer matching, we're done
                break

    if matches:
        return HttpResponse("".join(matches), content_type="text/plain")

    print("COULDN'T FIND URL")
    raise Http404 # didn't find requested url in .cdx file
Example #16
0
def cdx(request):
    """
        This function handles WARC lookups by our warc server (running in warc_server).
        It accepts a standard CDX request, except with a GUID instead of date, and returns a standard CDX 11 response.
        If there's no warc for the requested GUID, or the requested URL isn't stored in that WARC, it returns a 404.

        The ``guid`` GET parameter selects the Link; ``url`` defaults to the link's submitted URL.
        The .cdx file is created next to the .warc (sorted) if it does not exist yet.
    """
    # find requested link and url
    try:
        link = Link.objects.select_related().get(pk=request.GET.get('guid'))
    except Link.DoesNotExist:
        print("COULDN'T FIND LINK")
        raise Http404
    url = request.GET.get('url', link.submitted_url)

    # get warc file
    for asset in link.assets.all():
        if '.warc' in asset.warc_capture:
            warc_path = os.path.join(asset.base_storage_path, asset.warc_capture)
            break
    else:
        print("COULDN'T FIND WARC")
        raise Http404 # no .warc file -- do something to handle this?

    # get cdx file
    cdx_path = warc_path.replace('.gz', '').replace('.warc', '.cdx')
    if not default_storage.exists(cdx_path):
        # if we can't find the CDX file associated with this WARC, create it
        with default_storage.open(warc_path, 'rb') as warc_file, default_storage.open(cdx_path, 'wb') as cdx_file:
            write_cdx_index(cdx_file, warc_file, warc_path, sort=True)

    # find cdx lines for url
    sorted_url = surt.surt(url)
    wanted_prefix = sorted_url+" "
    found = []
    # the context manager closes the storage handle even when the loop breaks early
    with default_storage.open(cdx_path, 'rb') as cdx_lines:
        for line in cdx_lines:
            if line.startswith(wanted_prefix):
                found.append(line)
            elif found:
                # file may contain multiple matching lines in a row; we want to return all of them
                # if we've already found one or more matching lines, and now they're no longer matching, we're done
                break

    if found:
        return HttpResponse("".join(found), content_type="text/plain")

    print("COULDN'T FIND URL")
    raise Http404 # didn't find requested url in .cdx file
Example #17
0
    def add_urls_to_index(self, stream, params, filename, length):
        """Index ``stream`` as CDXJ and add each line to a redis sorted set.

        ``filename`` is made relative to the directory obtained by expanding
        ``self.rel_path_template`` with ``params``; the sorted-set key comes
        from ``self.redis_key_template``. Non-empty index lines are added
        with score 0. ``length`` is not used here.

        Returns the list of index lines (bytes) obtained by splitting the
        right-stripped output on newlines.
        """
        base_dir = res_template(self.rel_path_template, params)
        filename = os.path.relpath(filename, base_dir)

        index_buff = BytesIO()
        write_cdx_index(index_buff, stream, filename, cdxj=True, append_post=True)

        z_key = res_template(self.redis_key_template, params)
        cdx_list = index_buff.getvalue().rstrip().split(b'\n')

        for entry in cdx_list:
            # An empty index still yields one empty entry; skip it.
            if not entry:
                continue
            self.redis.zadd(z_key, 0, entry)

        return cdx_list
Example #18
0
    def test_record_multiple_writes_keep_open(self):
        """Record two requests into one WARC, then close it and record a third.

        Verifies that both records land in a single WARC file under
        ``/warcs/FOO/``, that the redis ``FOO:cdxj`` entries equal a fresh
        sorted CDXJ index of that file, that ``writer.fh_cache`` goes from one
        entry to none after ``close_key``, and that a write after closing
        results in a second WARC file and two ``FOO:warc`` hash entries.
        """
        warc_path = to_path(self.root_dir + '/warcs/FOO/ABC-{hostname}-{timestamp}.warc.gz')

        rel_path = self.root_dir + '/warcs/'

        dedup_index = self._get_dedup_index(user=False)

        writer = MultiFileWARCWriter(warc_path, dedup_index=dedup_index)
        recorder_app = RecorderApp(self.upstream_url, writer)

        # First Record
        resp = self._test_warc_write(recorder_app, 'httpbin.org',
                            '/get?foo=bar', '&param.recorder.coll=FOO')

        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"foo": "bar"' in resp.body


        # Second Record
        resp = self._test_warc_write(recorder_app, 'httpbin.org',
                            '/get?boo=far', '&param.recorder.coll=FOO')

        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"boo": "far"' in resp.body

        # Both writes are expected to share a single WARC file.
        self._test_all_warcs('/warcs/FOO/', 1)

        # Check two records in WARC
        r = FakeStrictRedis.from_url('redis://localhost/2')
        res = r.zrangebylex('FOO:cdxj', '[org,httpbin)/', '(org,httpbin,')
        assert len(res) == 2

        files, coll_dir = self._test_all_warcs('/warcs/FOO/', 1)
        fullname = coll_dir + files[0]

        # Re-index the WARC from disk, using the filename relative to rel_path,
        # to compare against what was stored in redis.
        cdxout = BytesIO()
        with open(fullname, 'rb') as fh:
            filename = os.path.relpath(fullname, rel_path)
            write_cdx_index(cdxout, fh, filename,
                            cdxj=True, append_post=True, sort=True)

        res = [CDXObject(x) for x in res]

        cdxres = cdxout.getvalue().strip()
        cdxres = cdxres.split(b'\n')
        cdxres = [CDXObject(x) for x in cdxres]

        assert cdxres == res

        # One entry in the writer's file-handle cache before closing the key.
        assert len(writer.fh_cache) == 1

        writer.close_key(self.root_dir + '/warcs/FOO/')

        assert len(writer.fh_cache) == 0

        writer.close()

        # Writing again after close should produce a second WARC file.
        resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                '/get?boo=far', '&param.recorder.coll=FOO')

        self._test_all_warcs('/warcs/FOO/', 2)

        warcs = r.hgetall('FOO:warc')
        assert len(warcs) == 2