def create_record_iter(self, arcv_iter):
    """Yield an index entry for every indexable record in ``arcv_iter``.

    This is a generator: nothing (including option validation) runs until
    the first item is requested.

    Options read from ``self.options``:

    * ``append_post`` -- when set, WARC ``request``/``warcinfo`` records are
      not filtered out, and for ``request`` records the result of
      ``extract_post_query`` is stored on the entry as ``'_post_query'``.
    * ``include_all`` -- when set, none of the record filters apply.
    * ``block_size`` -- forwarded to ``arcv_iter.iter_records``
      (default 16384).
    * ``surt_ordered`` -- forwarded to ``canonicalize`` when an entry has a
      ``'url'`` but no ``'urlkey'`` (default True).
    * ``minimal`` -- when set, records whose status code is ``'-'`` are
      kept and POST queries are never extracted.

    Each yielded entry has had ``set_rec_info(*arcv_iter.member_info)``
    called on it and carries the source record as ``entry.record``.

    Raises:
        Exception: if both ``append_post`` and ``minimal`` are set.
    """
    append_post = self.options.get('append_post')
    include_all = self.options.get('include_all')
    block_size = self.options.get('block_size', 16384)
    surt_ordered = self.options.get('surt_ordered', True)
    minimal = self.options.get('minimal')

    if append_post and minimal:
        raise Exception('Sorry, minimal index option and ' +
                        'append POST options can not be used together')

    for record in arcv_iter.iter_records(block_size):
        entry = None

        # Records without a status code are only kept in include_all or
        # minimal mode.
        if not include_all and not minimal and (
                record.status_headers.get_statuscode() == '-'):
            continue

        if record.format == 'warc':
            if (record.rec_type in ('request', 'warcinfo') and
                    not include_all and
                    not append_post):
                continue

            elif (not include_all and
                  record.content_type == 'application/warc-fields'):
                continue

            entry = self.parse_warc_record(record)
        elif record.format == 'arc':
            entry = self.parse_arc_record(record)

        # Unknown formats leave entry as None; parsers may also return a
        # falsy value to reject a record.
        if not entry:
            continue

        if entry.get('url') and not entry.get('urlkey'):
            entry['urlkey'] = canonicalize(entry['url'], surt_ordered)

        compute_digest = False

        if (entry.get('digest', '-') == '-' and
                record.rec_type not in ('revisit', 'request', 'warcinfo')):
            # No digest was parsed for this record: ask read_to_end for one.
            compute_digest = True

        elif not minimal and record.rec_type == 'request' and append_post:
            method = record.status_headers.protocol
            len_ = record.status_headers.get_header('Content-Length')

            post_query = extract_post_query(method,
                                            entry.get('mime'),
                                            len_,
                                            record.stream)

            entry['_post_query'] = post_query

        # member_info is read only after read_to_end has been called.
        arcv_iter.read_to_end(record, compute_digest)
        entry.set_rec_info(*arcv_iter.member_info)
        entry.record = record

        yield entry
def create_record_iter(arcv_iter, options):
    """Yield an index entry for every indexable record in ``arcv_iter``.

    This is a generator; all options are read once, when iteration starts.

    Args:
        arcv_iter: object providing ``iter_records(block_size)``,
            ``read_to_end(record, compute_digest)`` and ``member_info``.
        options: mapping of indexing options:

            * ``append_post`` -- do not filter WARC ``request``/``warcinfo``
              records; for ``request`` records store the result of
              ``extract_post_query`` as ``entry.post_query``.
            * ``include_all`` -- disable all record filters.
            * ``block_size`` -- forwarded to ``iter_records``
              (default 16384).
            * ``surt_ordered`` -- forwarded to ``canonicalize`` when an
              entry has a ``url`` but no ``key`` (default True).

    Yields:
        Parsed entries on which ``set_rec_info(*arcv_iter.member_info)`` has
        been called and whose ``record`` attribute is the source record.
    """
    append_post = options.get('append_post')
    include_all = options.get('include_all')
    block_size = options.get('block_size', 16384)
    # Loop-invariant, so looked up once rather than per record.
    surt_ordered = options.get('surt_ordered', True)

    for record in arcv_iter.iter_records(block_size):
        entry = None

        # Records without a status code are only kept in include_all mode.
        if not include_all and (
                record.status_headers.get_statuscode() == '-'):
            continue

        if record.format == 'warc':
            if (record.rec_type in ('request', 'warcinfo') and
                    not include_all and
                    not append_post):
                continue

            elif (not include_all and
                  record.content_type == 'application/warc-fields'):
                continue

            entry = parse_warc_record(record)
        elif record.format == 'arc':
            entry = parse_arc_record(record)

        # Unknown formats leave entry as None; parsers may also return a
        # falsy value to reject a record.
        if not entry:
            continue

        if entry.url and not entry.key:
            entry.key = canonicalize(entry.url, surt_ordered)

        compute_digest = False

        if (entry.digest == '-' and
                record.rec_type not in ('revisit', 'request', 'warcinfo')):
            # No digest was parsed for this record: ask read_to_end for one.
            compute_digest = True

        elif record.rec_type == 'request' and append_post:
            method = record.status_headers.protocol
            len_ = record.status_headers.get_header('Content-Length')

            post_query = extract_post_query(method,
                                            entry.mime,
                                            len_,
                                            record.stream)

            entry.post_query = post_query

        # member_info is read only after read_to_end has been called.
        arcv_iter.read_to_end(record, compute_digest)
        entry.set_rec_info(*arcv_iter.member_info)
        entry.record = record

        yield entry
def normalize_post_query(self):
    """Fold the POST body's query into ``self.wb_url.url``.

    Does nothing unless ``self.method`` is ``'POST'`` and ``self.wb_url`` is
    set. Otherwise the request body (``self.env['wsgi.input']``) is handed
    to ``extract_post_query``; a truthy result is merged into the url via
    ``append_post_query``.

    Raises:
        KeyError: if ``self.env`` has no ``'wsgi.input'`` entry.
    """
    if self.method != 'POST':
        return

    if not self.wb_url:
        return

    # A POST may arrive without a Content-Type header; default to '' so the
    # missing key does not raise AttributeError on None.
    mime = self.env.get('CONTENT_TYPE', '').split(';')[0]
    length = self.env.get('CONTENT_LENGTH')
    stream = self.env['wsgi.input']

    post_query = extract_post_query('POST', mime, length, stream)

    if post_query:
        self.wb_url.url = append_post_query(self.wb_url.url, post_query)
def normalize_post_query(self):
    """Merge the query extracted from a POST body into ``self.wb_url.url``.

    A no-op for non-POST requests and when ``self.wb_url`` is unset. The
    media type passed to ``extract_post_query`` is the ``CONTENT_TYPE``
    value with any ``;``-separated parameters stripped.
    """
    if self.method != 'POST' or not self.wb_url:
        return

    environ = self.env
    content_type = environ.get('CONTENT_TYPE', '').partition(';')[0]
    content_length = environ.get('CONTENT_LENGTH')
    body = environ['wsgi.input']

    query = extract_post_query('POST', content_type, content_length, body)
    if not query:
        return

    self.wb_url.url = append_post_query(self.wb_url.url, query)
def normalize_post_query(self):
    """Merge the query extracted from a POST body into ``self.wb_url.url``.

    A no-op for non-POST requests and when ``self.wb_url`` is unset.
    When ``extract_post_query`` returns a truthy query, ``wsgi.input`` in
    the environ is replaced by the ``BytesIO`` that was handed to it as
    ``buffered_stream`` (presumably so the body can be read again; confirm
    against ``extract_post_query``).
    """
    if self.method != 'POST' or not self.wb_url:
        return

    environ = self.env
    content_type = environ.get('CONTENT_TYPE', '')
    content_length = environ.get('CONTENT_LENGTH')
    body = environ['wsgi.input']
    replay_buffer = BytesIO()

    query = extract_post_query('POST', content_type, content_length, body,
                               buffered_stream=replay_buffer,
                               environ=environ)
    if not query:
        return

    environ['wsgi.input'] = replay_buffer
    self.wb_url.url = append_post_query(self.wb_url.url, query)
def include_post_query(self, url):
    """Return ``url`` with the query extracted from a POST body appended.

    ``url`` is returned unchanged when it is falsy, when the request method
    is not POST, or when ``extract_post_query`` yields nothing. On success
    ``wsgi.input`` in ``self.env`` is replaced by the ``BytesIO`` that was
    handed to ``extract_post_query`` as ``buffered_stream``.
    """
    if not url:
        return url

    if self.get_req_method() != 'POST':
        return url

    content_type = self._get_content_type()
    content_length = self._get_content_length()
    body = self.env['wsgi.input']
    replay_buffer = BytesIO()

    query = extract_post_query('POST', content_type, content_length, body,
                               buffered_stream=replay_buffer,
                               environ=self.env)
    if not query:
        return url

    self.env['wsgi.input'] = replay_buffer
    return append_post_query(url, query)
def create_record_iter(self, arcv_iter):
    """Generate index entries for the records produced by ``arcv_iter``.

    Behaviour is driven by ``self.options`` (``append_post``,
    ``include_all``, ``block_size`` -- default 16384, ``surt_ordered`` --
    default True, ``minimal``). Combining ``append_post`` with ``minimal``
    raises ``Exception`` as soon as iteration starts.

    For every entry that survives filtering, in order: the record is
    attached as ``entry.record``, ``self.begin_payload`` is called with the
    digest flag, the record is read to its end with ``self.handle_payload``
    as the callback, ``entry.set_rec_info`` receives
    ``arcv_iter.member_info``, ``self.end_payload`` is called, and the entry
    is yielded.
    """
    opts = self.options
    with_post = opts.get('append_post')
    keep_all = opts.get('include_all')
    chunk_size = opts.get('block_size', 16384)
    use_surt = opts.get('surt_ordered', True)
    is_minimal = opts.get('minimal')

    if with_post and is_minimal:
        raise Exception('Sorry, minimal index option and ' +
                        'append POST options can not be used together')

    for rec in arcv_iter.iter_records(chunk_size):
        # Status-less records survive only in include_all / minimal mode.
        if (not (keep_all or is_minimal) and
                rec.status_headers.get_statuscode() == '-'):
            continue

        entry = None
        if rec.format == 'warc':
            if (rec.rec_type in ('request', 'warcinfo') and
                    not (keep_all or with_post)):
                continue
            if (not keep_all and
                    rec.content_type == 'application/warc-fields'):
                continue
            entry = self.parse_warc_record(rec)
        elif rec.format == 'arc':
            entry = self.parse_arc_record(rec)

        if not entry:
            continue

        if entry.get('url') and not entry.get('urlkey'):
            entry['urlkey'] = canonicalize(entry['url'], use_surt)

        # A digest is computed only when none was parsed and the record
        # type is not one of revisit/request/warcinfo.
        needs_digest = (
            entry.get('digest', '-') == '-' and
            rec.rec_type not in ('revisit', 'request', 'warcinfo')
        )

        if (not needs_digest and not is_minimal and
                rec.rec_type == 'request' and with_post):
            verb = rec.status_headers.protocol
            body_len = rec.status_headers.get_header('Content-Length')
            entry['_post_query'] = extract_post_query(verb,
                                                      entry.get('mime'),
                                                      body_len,
                                                      rec.stream)

        entry.record = rec

        self.begin_payload(needs_digest, entry)
        arcv_iter.read_to_end(rec, self.handle_payload)

        entry.set_rec_info(*arcv_iter.member_info)
        self.end_payload(entry)

        yield entry