Beispiel #1
0
def log_invalid_cdx(self, cdxline=b''):
    """Initialize this CDX object from one raw CDX or CDXJ index line.

    Written as an ``__init__``-style initializer: it calls
    ``OrderedDict.__init__(self)`` and then stores the parsed fields as
    items of ``self``. The sections marked "Perma customization" are the
    function-scope import and an error message that includes the offending
    fields.

    Two line layouts are recognized:

    * CDXJ -- ``<urlkey> <timestamp> {json}``: the JSON members become
      fields, renamed through ``self.CDX_ALT_FIELDS`` when listed there.
    * Plain CDX -- space-separated values whose count must equal the
      length of one of the entries in ``self.CDX_FORMATS``.

    :param cdxline: raw index line as bytes; trailing whitespace is
        stripped. An empty line leaves the object without fields so they
        can be filled in later.
    :raises CDXException: if the number of space-separated fields matches
        no entry of ``self.CDX_FORMATS``. This includes lines that start
        like CDXJ but lack the urlkey and/or timestamp column.
    """
    # begin Perma customization
    from pywb.warcserver.index.cdxobject import (OrderedDict, to_native_str,
        json_decode, six, quote, CDXException, URLKEY, TIMESTAMP)
    # end Perma customization

    OrderedDict.__init__(self)

    cdxline = cdxline.rstrip()
    self._from_json = False
    self._cached_json = None

    # Allows for filling the fields later or in a custom way
    if not cdxline:
        self.cdxline = cdxline
        return

    fields = cdxline.split(b' ', 2)

    # Check for CDX JSON.
    # The branch reads fields[0], fields[1] and fields[-1] as three distinct
    # columns, so it is only entered when the split produced all three.
    # Without the length check a one-token line such as b'{...}' raised a
    # bare IndexError, and a two-token line stored the JSON text as the
    # timestamp; both now fall through to the CDXException below.
    if len(fields) == 3 and fields[-1].startswith(b'{'):
        self[URLKEY] = to_native_str(fields[0], 'utf-8')
        self[TIMESTAMP] = to_native_str(fields[1], 'utf-8')
        json_fields = json_decode(to_native_str(fields[-1], 'utf-8'))
        for n, v in six.iteritems(json_fields):
            n = to_native_str(n, 'utf-8')
            n = self.CDX_ALT_FIELDS.get(n, n)

            if n == 'url':
                # Keep pure-ASCII urls untouched; percent-quote the UTF-8
                # bytes of anything else (':' and '/' are left as-is).
                try:
                    v.encode('ascii')
                except UnicodeEncodeError:
                    v = quote(v.encode('utf-8'), safe=':/')

            # Every value except 'filename' is converted to a native str.
            if n != 'filename':
                v = to_native_str(v, 'utf-8')

            self[n] = v

        self.cdxline = cdxline
        self._from_json = True
        return

    # Plain CDX: the first split stopped after two separators, so split the
    # remainder to obtain every individual field.
    more_fields = fields.pop().split(b' ')
    fields.extend(more_fields)

    # Pick the format whose field count matches; if several entries share a
    # length, the last one listed in CDX_FORMATS wins.
    cdxformat = None
    for i in self.CDX_FORMATS:
        if len(i) == len(fields):
            cdxformat = i
    if not cdxformat:
        # begin Perma customization
        msg = 'unknown {0}-field cdx format: {1}'.format(len(fields), fields)
        # end Perma customization
        raise CDXException(msg)

    for header, field in zip(cdxformat, fields):
        self[header] = to_native_str(field, 'utf-8')

    self.cdxline = cdxline
Beispiel #2
0
    def compute_page_range(self, reader, query):
        """Yield the index lines for one page of a query, or a page summary.

        This is a generator. When ``query.page_count`` is truthy it yields a
        single ``self._page_info(...)`` value and stops; otherwise it yields
        the index lines that belong to page ``query.page``.

        ``reader`` is closed when the generator finishes for any reason:
        normal exhaustion, an exception, or the consumer abandoning or
        closing the generator early.

        :param reader: index reader handed to ``search``, ``iter_range`` and
            ``read_last_line``; closed by this method.
        :param query: query object; ``page_size``, ``key``, ``end_key``,
            ``page_count`` and ``page`` are read from it.
        :raises CDXException: if ``query.page`` is negative or not below the
            computed number of pages.
        """
        try:
            pagesize = query.page_size
            if not pagesize:
                pagesize = self.max_blocks
            else:
                pagesize = int(pagesize)

            last_line = None

            # Get End
            end_iter = search(reader, query.end_key, prev_size=1)

            try:
                end_line = six.next(end_iter)
            except StopIteration:
                # Nothing at or after the end key: fall back to the final
                # line of the index.
                last_line = read_last_line(reader)
                end_line = last_line

            # Get Start
            first_iter = iter_range(reader, query.key, query.end_key,
                                    prev_size=1)

            try:
                first_line = six.next(first_iter)
            except StopIteration:
                if end_line == last_line and query.key >= last_line:
                    first_line = last_line
                else:
                    # No matching range at all.
                    if query.page_count:
                        yield self._page_info(0, pagesize, 0)
                    return

            first = IDXObject(first_line)
            end = IDXObject(end_line)

            try:
                blocks = end['lineno'] - first['lineno']
                total_pages = int(blocks / pagesize) + 1
            except Exception:
                # Best effort: if the line numbers are unavailable or the
                # arithmetic fails (e.g. a page size of 0), treat the result
                # as a single page of unknown length. Narrowed from a bare
                # ``except:`` so KeyboardInterrupt/SystemExit propagate.
                blocks = -1
                total_pages = 1

            if query.page_count:
                # same line, so actually need to look at cdx
                # to determine if it exists
                if blocks == 0:
                    try:
                        block_cdx_iter = self.idx_to_cdx([first_line], query)
                        block = six.next(block_cdx_iter)
                        # Only the existence of a first cdx entry matters.
                        six.next(block)
                    except StopIteration:
                        total_pages = 0
                        blocks = -1

                yield self._page_info(total_pages, pagesize, blocks + 1)
                return

            curr_page = query.page
            if curr_page >= total_pages or curr_page < 0:
                msg = 'Page {0} invalid: First Page is 0, Last Page is {1}'
                raise CDXException(msg.format(curr_page, total_pages - 1))

            startline = curr_page * pagesize
            endline = startline + pagesize - 1
            if blocks >= 0:
                endline = min(endline, blocks)

            # first_line has already been consumed from first_iter, so the
            # iterator is one position ahead: page 0 emits first_line
            # directly, later pages shift their start back by one.
            if curr_page == 0:
                yield first_line
            else:
                startline -= 1

            for idx in itertools.islice(first_iter, startline, endline):
                yield idx
        finally:
            reader.close()
Beispiel #3
0
 def url(self):
     """Return the ``url`` entry of ``self.params``.

     :raises CDXException: if ``self.params`` has no ``'url'`` key.
     """
     try:
         requested_url = self.params['url']
     except KeyError:
         raise CDXException(
             'A url= param must be specified to query the cdx server')
     else:
         return requested_url
Beispiel #4
0
 def page(self):
     """Return the ``page`` entry of ``self.params`` as an int.

     A missing ``page`` entry yields ``0``.

     :raises CDXException: if the value cannot be converted by ``int()``,
         whether it is an unparsable string (``ValueError``) or a value of
         an unsupported type such as ``None`` (``TypeError``).
     """
     raw_page = self.params.get('page', 0)
     try:
         return int(raw_page)
     except (TypeError, ValueError):
         # TypeError is caught as well so that every bad page value is
         # reported as CDXException, not only unparsable strings.
         msg = 'Invalid value for page= param: {}'
         raise CDXException(msg.format(raw_page))