Exemple #1
0
    def create_record_iter(self, arcv_iter):
        """Generate index entries for the records read from ``arcv_iter``.

        The following keys are read from ``self.options``: ``append_post``,
        ``include_all``, ``block_size`` (default 16384), ``surt_ordered``
        (default True) and ``minimal``.

        :param arcv_iter: archive iterator; must provide
            ``iter_records(block_size)``, ``read_to_end(record, flag)`` and
            a ``member_info`` attribute.
        :returns: a generator of entries; each yielded entry has had
            ``set_rec_info()`` called and carries the source record in
            ``entry.record``.
        :raises Exception: when both ``append_post`` and ``minimal`` are set.
            Because this is a generator, the error surfaces on the first
            iteration step, not at call time.
        """
        append_post = self.options.get('append_post')
        include_all = self.options.get('include_all')
        block_size = self.options.get('block_size', 16384)
        surt_ordered = self.options.get('surt_ordered', True)
        minimal = self.options.get('minimal')

        if append_post and minimal:
            raise Exception('Sorry, minimal index option and ' +
                            'append POST options can not be used together')

        for record in arcv_iter.iter_records(block_size):
            entry = None

            # Records whose status code is '-' are dropped unless everything
            # is being indexed or a minimal index was requested.
            if not include_all and not minimal and (
                    record.status_headers.get_statuscode() == '-'):
                continue

            if record.format == 'warc':
                # request/warcinfo records are only kept when indexing
                # everything or when POST bodies are being appended.
                if (record.rec_type in ('request', 'warcinfo')
                        and not include_all and not append_post):
                    continue

                elif (not include_all
                      and record.content_type == 'application/warc-fields'):
                    continue

                entry = self.parse_warc_record(record)
            elif record.format == 'arc':
                entry = self.parse_arc_record(record)

            # Unknown formats and records the parsers rejected yield nothing.
            if not entry:
                continue

            if entry.get('url') and not entry.get('urlkey'):
                entry['urlkey'] = canonicalize(entry['url'], surt_ordered)

            compute_digest = False

            # A digest is only requested when the entry has none ('-') and
            # the record type is not revisit/request/warcinfo.
            if (entry.get('digest', '-') == '-' and record.rec_type
                    not in ('revisit', 'request', 'warcinfo')):

                compute_digest = True

            elif not minimal and record.rec_type == 'request' and append_post:
                # NOTE(review): the value of ``status_headers.protocol`` is
                # passed as the method argument; presumably it holds the
                # request verb for request records -- confirm.
                method = record.status_headers.protocol
                len_ = record.status_headers.get_header('Content-Length')

                post_query = extract_post_query(method, entry.get('mime'),
                                                len_, record.stream)

                entry['_post_query'] = post_query

            # member_info is read only after read_to_end() has run.
            arcv_iter.read_to_end(record, compute_digest)
            entry.set_rec_info(*arcv_iter.member_info)
            entry.record = record

            yield entry
Exemple #2
0
def create_record_iter(arcv_iter, options):
    """Generate index entries for the records read from ``arcv_iter``.

    :param arcv_iter: archive iterator; must provide
        ``iter_records(block_size)``, ``read_to_end(record, flag)`` and a
        ``member_info`` attribute.
    :param options: mapping consulted for ``append_post``, ``include_all``,
        ``block_size`` (default 16384) and ``surt_ordered`` (default True).
        All options are read once, before the first record is processed.
    :returns: a generator of entries; each yielded entry has had
        ``set_rec_info()`` called and carries the source record in
        ``entry.record``.
    """
    append_post = options.get('append_post')
    include_all = options.get('include_all')
    block_size = options.get('block_size', 16384)
    surt_ordered = options.get('surt_ordered', True)

    for record in arcv_iter.iter_records(block_size):
        entry = None

        # Records whose status code is '-' are dropped unless everything is
        # being indexed.
        if not include_all and (record.status_headers.get_statuscode() == '-'):
            continue

        if record.format == 'warc':
            # request/warcinfo records are only kept when indexing
            # everything or when POST bodies are being appended.
            if (record.rec_type in ('request', 'warcinfo') and
                    not include_all and
                    not append_post):
                continue

            elif (not include_all and
                  record.content_type == 'application/warc-fields'):
                continue

            entry = parse_warc_record(record)
        elif record.format == 'arc':
            entry = parse_arc_record(record)

        # Unknown formats and records the parsers rejected yield nothing.
        if not entry:
            continue

        if entry.url and not entry.key:
            entry.key = canonicalize(entry.url, surt_ordered)

        compute_digest = False

        # A digest is only requested when the entry has none ('-') and the
        # record type is not revisit/request/warcinfo.
        if (entry.digest == '-' and
                record.rec_type not in ('revisit', 'request', 'warcinfo')):

            compute_digest = True

        elif record.rec_type == 'request' and append_post:
            # NOTE(review): the value of ``status_headers.protocol`` is
            # passed as the method argument; presumably it holds the request
            # verb for request records -- confirm.
            method = record.status_headers.protocol
            len_ = record.status_headers.get_header('Content-Length')

            post_query = extract_post_query(method,
                                            entry.mime,
                                            len_,
                                            record.stream)

            entry.post_query = post_query

        # member_info is read only after read_to_end() has run.
        arcv_iter.read_to_end(record, compute_digest)
        entry.set_rec_info(*arcv_iter.member_info)
        entry.record = record

        yield entry
Exemple #3
0
def create_record_iter(arcv_iter, options):
    """Yield an index entry for each indexable record in ``arcv_iter``.

    :param arcv_iter: archive iterator; must provide
        ``iter_records(block_size)``, ``read_to_end(record, flag)`` and a
        ``member_info`` attribute.
    :param options: mapping consulted for ``append_post``, ``include_all``,
        ``block_size`` (default 16384) and ``surt_ordered`` (default True).
        Every option is looked up a single time, before iteration begins.
    :returns: a generator of entries; each yielded entry has had
        ``set_rec_info()`` called and carries the source record in
        ``entry.record``.
    """
    append_post = options.get('append_post')
    include_all = options.get('include_all')
    block_size = options.get('block_size', 16384)
    surt_ordered = options.get('surt_ordered', True)

    for record in arcv_iter.iter_records(block_size):
        entry = None

        # Skip records with a '-' status code unless indexing everything.
        if not include_all and (record.status_headers.get_statuscode() == '-'):
            continue

        if record.format == 'warc':
            # request/warcinfo records survive only with include_all or
            # append_post.
            if (record.rec_type in ('request', 'warcinfo') and not include_all
                    and not append_post):
                continue

            elif (not include_all
                  and record.content_type == 'application/warc-fields'):
                continue

            entry = parse_warc_record(record)
        elif record.format == 'arc':
            entry = parse_arc_record(record)

        # Nothing to emit for unknown formats or rejected records.
        if not entry:
            continue

        if entry.url and not entry.key:
            entry.key = canonicalize(entry.url, surt_ordered)

        compute_digest = False

        # Request a digest only when the entry lacks one ('-') and the
        # record type is not revisit/request/warcinfo.
        if (entry.digest == '-'
                and record.rec_type not in ('revisit', 'request', 'warcinfo')):

            compute_digest = True

        elif record.rec_type == 'request' and append_post:
            # NOTE(review): ``status_headers.protocol`` is passed as the
            # method argument; presumably it holds the request verb for
            # request records -- confirm.
            method = record.status_headers.protocol
            len_ = record.status_headers.get_header('Content-Length')

            post_query = extract_post_query(method, entry.mime, len_,
                                            record.stream)

            entry.post_query = post_query

        # member_info is read only after read_to_end() has run.
        arcv_iter.read_to_end(record, compute_digest)
        entry.set_rec_info(*arcv_iter.member_info)
        entry.record = record

        yield entry
    def normalize_post_query(self):
        """Append the query extracted from a POST body to ``self.wb_url.url``.

        Returns without doing anything when the request method is not POST
        or when ``self.wb_url`` is not set. Reads ``CONTENT_TYPE``,
        ``CONTENT_LENGTH`` and ``wsgi.input`` from ``self.env``.

        :raises KeyError: if ``self.env`` has no ``'wsgi.input'`` entry.
        """
        if self.method != 'POST':
            return

        if not self.wb_url:
            return

        # CONTENT_TYPE may be missing from the environ; default to '' so a
        # POST without that header does not fail on None.split().
        # Only the media type before the first ';' is kept.
        mime = self.env.get('CONTENT_TYPE', '').split(';')[0]
        length = self.env.get('CONTENT_LENGTH')
        stream = self.env['wsgi.input']

        post_query = extract_post_query('POST', mime, length, stream)

        if post_query:
            self.wb_url.url = append_post_query(self.wb_url.url, post_query)
    def normalize_post_query(self):
        """Fold the query extracted from a POST body into ``self.wb_url.url``.

        No-op for non-POST requests and when ``self.wb_url`` is unset.
        ``CONTENT_TYPE`` (default ``''``), ``CONTENT_LENGTH`` and
        ``wsgi.input`` are taken from ``self.env``.
        """
        if self.method != 'POST' or not self.wb_url:
            return

        # Keep only the media type, i.e. the text before the first ';'.
        content_type = self.env.get('CONTENT_TYPE', '')
        media_type = content_type.split(';')[0]
        body_length = self.env.get('CONTENT_LENGTH')
        body = self.env['wsgi.input']

        query = extract_post_query('POST', media_type, body_length, body)
        if not query:
            return

        self.wb_url.url = append_post_query(self.wb_url.url, query)
    def normalize_post_query(self):
        """Fold the query extracted from a POST body into ``self.wb_url.url``.

        No-op for non-POST requests and when ``self.wb_url`` is unset.
        When a query is extracted, ``self.env['wsgi.input']`` is replaced
        with the ``BytesIO`` buffer that was handed to
        ``extract_post_query``.
        """
        if self.method != 'POST' or not self.wb_url:
            return

        content_type = self.env.get('CONTENT_TYPE', '')
        body_length = self.env.get('CONTENT_LENGTH')
        body = self.env['wsgi.input']

        # NOTE(review): presumably extract_post_query copies the bytes it
        # consumes into this buffer so the body can be read again -- confirm.
        replay = BytesIO()

        query = extract_post_query('POST', content_type, body_length, body,
                                   buffered_stream=replay,
                                   environ=self.env)
        if not query:
            return

        self.env['wsgi.input'] = replay
        self.wb_url.url = append_post_query(self.wb_url.url, query)
    def include_post_query(self, url):
        """Return ``url`` with the query extracted from a POST body appended.

        ``url`` is returned unchanged when it is empty, when the request
        method is not POST, or when no post query could be extracted.
        When a query is extracted, ``self.env['wsgi.input']`` is replaced
        with the ``BytesIO`` buffer that was handed to
        ``extract_post_query``.
        """
        if not url:
            return url

        if self.get_req_method() != 'POST':
            return url

        # The content type is handed to extract_post_query exactly as
        # returned by _get_content_type().
        content_type = self._get_content_type()
        body_length = self._get_content_length()
        body = self.env['wsgi.input']

        # NOTE(review): presumably extract_post_query copies the bytes it
        # consumes into this buffer so the body can be read again -- confirm.
        replay = BytesIO()

        query = extract_post_query('POST', content_type, body_length, body,
                                   buffered_stream=replay,
                                   environ=self.env)

        if not query:
            return url

        self.env['wsgi.input'] = replay
        return append_post_query(url, query)
Exemple #8
0
    def normalize_post_query(self):
        """Rewrite ``self.wb_url.url`` to include the POST body's query.

        Nothing happens unless the request method is POST and
        ``self.wb_url`` is set. If ``extract_post_query`` returns a query,
        it is appended to the URL and ``self.env['wsgi.input']`` is swapped
        for the ``BytesIO`` buffer passed to that helper.
        """
        is_post = self.method == 'POST'
        if not is_post:
            return

        target = self.wb_url
        if not target:
            return

        mime = self.env.get('CONTENT_TYPE', '')
        length = self.env.get('CONTENT_LENGTH')
        stream = self.env['wsgi.input']

        # NOTE(review): presumably filled by extract_post_query with the
        # bytes it reads from ``stream`` -- confirm.
        buffered_stream = BytesIO()

        helper_kwargs = dict(buffered_stream=buffered_stream,
                             environ=self.env)
        post_query = extract_post_query('POST', mime, length, stream,
                                        **helper_kwargs)

        if post_query:
            self.env['wsgi.input'] = buffered_stream
            self.wb_url.url = append_post_query(self.wb_url.url, post_query)
Exemple #9
0
    def create_record_iter(self, arcv_iter):
        """Yield an index entry for each indexable record in ``arcv_iter``.

        Options come from ``self.options``: ``append_post``,
        ``include_all``, ``block_size`` (default 16384), ``surt_ordered``
        (default True) and ``minimal``. For every emitted entry the payload
        hooks run in this order: ``begin_payload``, ``read_to_end`` (with
        ``self.handle_payload``), ``set_rec_info``, ``end_payload``.

        :param arcv_iter: archive iterator; must provide
            ``iter_records(block_size)``, ``read_to_end(record, handler)``
            and a ``member_info`` attribute.
        :raises Exception: when both ``append_post`` and ``minimal`` are
            set; as this is a generator, the error is raised on the first
            iteration step.
        """
        opts = self.options
        append_post = opts.get('append_post')
        include_all = opts.get('include_all')
        block_size = opts.get('block_size', 16384)
        surt_ordered = opts.get('surt_ordered', True)
        minimal = opts.get('minimal')

        if append_post and minimal:
            raise Exception('Sorry, minimal index option and '
                            'append POST options can not be used together')

        filtering = not include_all
        non_payload_types = ('request', 'warcinfo')

        for record in arcv_iter.iter_records(block_size):
            # Records with a '-' status code are dropped unless everything
            # is indexed or a minimal index was requested.
            if (filtering and not minimal and
                    record.status_headers.get_statuscode() == '-'):
                continue

            entry = None

            if record.format == 'warc':
                # request/warcinfo records need include_all or append_post.
                if (record.rec_type in non_payload_types and
                        filtering and not append_post):
                    continue

                if (filtering and
                        record.content_type == 'application/warc-fields'):
                    continue

                entry = self.parse_warc_record(record)
            elif record.format == 'arc':
                entry = self.parse_arc_record(record)

            # Unknown formats and rejected records produce no entry.
            if not entry:
                continue

            if entry.get('url') and not entry.get('urlkey'):
                entry['urlkey'] = canonicalize(entry['url'], surt_ordered)

            # A digest is requested only when the entry has none ('-') and
            # the record type is not revisit/request/warcinfo.
            compute_digest = (
                entry.get('digest', '-') == '-' and
                record.rec_type not in ('revisit', 'request', 'warcinfo'))

            if (not compute_digest and not minimal and
                    record.rec_type == 'request' and append_post):
                headers = record.status_headers
                # NOTE(review): ``protocol`` is passed as the method
                # argument; presumably it holds the request verb -- confirm.
                method = headers.protocol
                len_ = headers.get_header('Content-Length')

                entry['_post_query'] = extract_post_query(method,
                                                          entry.get('mime'),
                                                          len_,
                                                          record.stream)

            entry.record = record

            self.begin_payload(compute_digest, entry)
            arcv_iter.read_to_end(record, self.handle_payload)

            # member_info is read only after read_to_end() has run.
            entry.set_rec_info(*arcv_iter.member_info)
            self.end_payload(entry)

            yield entry