def encode(head, l):
    """
    Build a multipart/form-data body from (key, value) pairs.

    The boundary is read from the "content-type" entry of ``head``.

    Args:
        head: Mapping of header names to values; only "content-type" is read.
        l: Iterable of (key, value) pairs. Keys and values are interpolated
            into bytes templates, so they are expected to be bytes.

    Returns:
        The encoded body as bytes; ``b""`` when the content type has no
        boundary parameter or the boundary is not ASCII; ``None`` when there
        is no content-type header or ``headers.parse_content_type`` returns
        ``None`` for it.

    Raises:
        ValueError: If a value matches the boundary delimiter pattern.
    """
    content_type = head.get("content-type")
    if not content_type:
        return None
    parsed = headers.parse_content_type(content_type)
    if parsed is None:
        return None

    try:
        boundary = quote(parsed[2]["boundary"].encode("ascii"))
    except (KeyError, UnicodeError):
        return b""

    # The boundary is needed as bytes everywhere below; encode it once.
    boundary_bytes = boundary.encode('utf-8')
    delimiter_pattern = rb"^--%b$" % re.escape(boundary_bytes)

    lines = []
    for key, value in l:
        guessed = mimetypes.guess_type(str(key))[0]
        file_type = guessed or "text/plain; charset=utf-8"

        if key:
            disposition = b'form-data; name="%b"' % key
            lines.extend((
                b"--%b" % boundary_bytes,
                b"Content-Disposition: %b" % disposition,
                b"Content-Type: %b" % file_type.encode('utf-8'),
                b'',
                value,
            ))
        lines.append(b'')

        # A value that looks like the delimiter would corrupt the framing.
        if value is not None and re.search(delimiter_pattern, value):
            raise ValueError(b"boundary found in encoded string")

    lines.append(b"--%b--\r\n" % boundary_bytes)
    return b"\r\n".join(lines)
def decode(hdrs, content):
    """
    Takes a multipart boundary encoded string and returns list of (key, value) tuples.

    Args:
        hdrs: Mapping of header names to values; only "content-type" is read.
        content: The multipart body as bytes, or None.

    Returns:
        A list of (name, value) byte pairs. The list is empty when the
        content type is missing, cannot be parsed, has no usable boundary
        parameter, or when ``content`` is None.

    Raises:
        ValueError: If a named part contains no blank line after its headers.
    """
    raw_content_type = hdrs.get("content-type")
    if not raw_content_type:
        return []
    parsed = headers.parse_content_type(raw_content_type)
    if not parsed:
        return []
    try:
        boundary = parsed[2]["boundary"].encode("ascii")
    except (KeyError, UnicodeError):
        return []
    if content is None:
        return []

    name_rx = re.compile(br'\bname="([^"]+)"')
    fields = []
    for chunk in content.split(b"--" + boundary):
        lines = chunk.splitlines()
        # Skip the preamble and the closing "--" that follows the last boundary.
        if len(lines) <= 1 or lines[0][0:2] == b"--":
            continue
        found = name_rx.search(lines[1])
        if not found:
            continue
        # The value starts after the first blank line following the
        # Content-Disposition line.
        value_start = 3 + lines[2:].index(b"")
        fields.append((found.group(1), b"".join(lines[value_start:])))
    return fields
def decode(content_type: Optional[str], content: bytes) -> List[Tuple[bytes, bytes]]:
    """
    Takes a multipart boundary encoded string and returns list of (key, value) tuples.

    Args:
        content_type: Raw Content-Type header value, or None.
        content: The multipart body; None is tolerated and yields no fields.

    Returns:
        A list of (name, value) byte pairs. The list is empty when the
        content type is missing, cannot be parsed, or has no usable boundary
        parameter.

    Raises:
        ValueError: If a named part contains no blank line after its headers.
    """
    if not content_type:
        return []
    parsed = headers.parse_content_type(content_type)
    if not parsed:
        return []
    try:
        boundary = parsed[2]["boundary"].encode("ascii")
    except (KeyError, UnicodeError):
        return []

    fields = []
    if content is None:
        return fields

    name_rx = re.compile(br'\bname="([^"]+)"')
    for chunk in content.split(b"--" + boundary):
        lines = chunk.splitlines()
        is_part = len(lines) > 1 and lines[0][0:2] != b"--"
        found = name_rx.search(lines[1]) if is_part else None
        if found:
            # The value starts after the first blank line following the
            # Content-Disposition line.
            value_start = 3 + lines[2:].index(b"")
            fields.append((found.group(1), b"".join(lines[value_start:])))
    return fields
def decode(hdrs, content):
    """
    Takes a multipart boundary encoded string and returns list of (key, value) tuples.

    Args:
        hdrs: Mapping of header names to values; only "content-type" is read.
        content: The multipart body as bytes, or None.

    Returns:
        A list of (name, value) byte pairs. The list is empty when the
        content type is missing, cannot be parsed, has no usable boundary
        parameter, or when ``content`` is None.

    Raises:
        ValueError: If a named part contains no blank line after its headers.
    """
    v = hdrs.get("content-type")
    if not v:
        return []
    v = headers.parse_content_type(v)
    if not v:
        return []
    try:
        boundary = v[2]["boundary"].encode("ascii")
    except (KeyError, UnicodeError):
        return []

    # A message without a body has no parts; avoid calling .split on None.
    if content is None:
        return []

    rx = re.compile(br'\bname="([^"]+)"')
    r = []
    for i in content.split(b"--" + boundary):
        parts = i.splitlines()
        # Skip the preamble and the closing "--" that follows the last boundary.
        if len(parts) > 1 and parts[0][0:2] != b"--":
            match = rx.search(parts[1])
            if match:
                key = match.group(1)
                # The value starts after the first blank line following the
                # Content-Disposition line.
                value = b"".join(parts[3 + parts[2:].index(b""):])
                r.append((key, value))
    return r
def check_css(content_type, content) -> bool:
    """
    Report whether a Content-Type header value denotes a CSS stylesheet.

    Args:
        content_type: Raw Content-Type header value, passed to
            ``parse_content_type``.
        content: Unused; retained for interface compatibility.

    Returns:
        True when the parsed "type/subtype" is "text/css", otherwise False
        (including when the header cannot be parsed).
    """
    # presumably a (type, subtype, params) tuple or a falsy value, as other
    # call sites of parse_content_type in this file treat it.
    parsed = parse_content_type(content_type)
    if not parsed:
        return False
    # The subtype comes from the parsed header, not from the message body.
    media_type = "{}/{}".format(parsed[0], parsed[1])
    css_list = [
        "text/css",
    ]
    return media_type in css_list
def check_html(content_type, content) -> bool:
    """
    Report whether a Content-Type header value denotes an HTML document.

    Args:
        content_type: Raw Content-Type header value, passed to
            ``parse_content_type``.
        content: Unused; retained for interface compatibility.

    Returns:
        True when the parsed "type/subtype" is "text/html", otherwise False
        (including when the header cannot be parsed).
    """
    # presumably a (type, subtype, params) tuple or a falsy value, as other
    # call sites of parse_content_type in this file treat it.
    parsed = parse_content_type(content_type)
    if not parsed:
        return False
    # The subtype comes from the parsed header, not from the message body.
    media_type = "{}/{}".format(parsed[0], parsed[1])
    html_list = [
        'text/html',
    ]
    return media_type in html_list
def check_form(content_type, content) -> bool:
    """
    Report whether a Content-Type header value denotes submitted form data.

    Args:
        content_type: Raw Content-Type header value, passed to
            ``parse_content_type``.
        content: Unused; retained for interface compatibility.

    Returns:
        True when the parsed "type/subtype" is "multipart/form-data" or
        "application/x-www-form-urlencoded", otherwise False (including when
        the header cannot be parsed).
    """
    # presumably a (type, subtype, params) tuple or a falsy value, as other
    # call sites of parse_content_type in this file treat it.
    parsed = parse_content_type(content_type)
    if not parsed:
        return False
    # The subtype comes from the parsed header, not from the message body.
    media_type = "{}/{}".format(parsed[0], parsed[1])
    form_list = [
        'multipart/form-data',
        'application/x-www-form-urlencoded',
    ]
    return media_type in form_list
def set_text(self, text):
    """
    Store ``text`` as the encoded message content.

    ``None`` clears the content. Otherwise the text is encoded with the
    encoding returned by ``self._guess_encoding()``. If that raises
    ValueError, the content-type header is rewritten with a UTF-8 charset
    and the text is encoded as UTF-8 with the "surrogateescape" handler.

    Args:
        text: The text to store, or None to clear the content.
    """
    if text is None:
        self.content = None
        return

    guessed = self._guess_encoding()
    try:
        encoded = encoding.encode(text, guessed)
    except ValueError:
        # Fall back to UTF-8 and update the content-type header.
        current = self.headers.get("content-type", "")
        ct = headers.parse_content_type(current) or ("text", "plain", {})
        ct[2]["charset"] = "utf-8"
        self.headers["content-type"] = headers.assemble_content_type(*ct)
        encoded = text.encode("utf8", "surrogateescape")
    self.content = encoded
def decode_params_multipart(self, hdrs, content):
    """
    Parse a multipart body into a dict of form fields and uploaded files.

    Args:
        hdrs: Mapping of header names to values; only "content-type" is read.
        content: The multipart body as bytes.

    Returns:
        A dict mapping each field name (bytes) to a list of entries, one per
        occurrence. Each entry is a dict with keys 'type' ('file' when the
        part declares a filename, otherwise 'var'), 'filename' (bytes or
        None) and 'value' (the part's lines joined with b"\\n"). An empty
        dict is returned when the content type is missing, cannot be parsed,
        or has no usable boundary parameter.

    Raises:
        ValueError: If a named part contains no blank line after its headers.
    """
    v = hdrs.get("content-type")
    if not v:
        return {}
    v = headers.parse_content_type(v)
    if not v:
        # Keep the return type consistent: this function always yields a dict.
        return {}
    try:
        boundary = v[2]["boundary"].encode("ascii")
    except (KeyError, UnicodeError):
        return {}

    rx = re.compile(br'\bname="([^"]+)"')
    rxf = re.compile(br'\bname="([^"]+)";\s*filename="(.*?)"')
    r = {}
    for i in content.split(b"--" + boundary):
        parts = i.splitlines()
        # Skip the preamble and the closing "--" that follows the last boundary.
        if len(parts) <= 1 or parts[0][0:2] == b"--":
            continue
        # Prefer the file pattern; fall back to a plain named field.
        file_match = rxf.search(parts[1])
        match = file_match or rx.search(parts[1])
        if not match:
            continue
        # The value starts after the first blank line following the
        # Content-Disposition line.
        value = b"\n".join(parts[3 + parts[2:].index(b""):])
        r.setdefault(match.group(1), []).append({
            'type': 'file' if file_match else 'var',
            'filename': file_match.group(2) if file_match else None,
            'value': value
        })
    return r
def response(self, flow: HTTPFlow):
    """
    Extract advertisement entries from a matching response and publish them.

    Only responses whose request URL starts with ``self.url_pattrn`` are
    processed. The response text is parsed as JSON; each entry of its 'data'
    list carries a JSON document in 'content'. Entries whose 'label' equals
    '广告' are reduced to a small dict, printed, and sent to
    ``self.producer`` on ``self.topic``.

    Args:
        flow: The flow whose request URL and response text are inspected.
    """
    if not flow.request.url.startswith(self.url_pattrn):
        return

    # Load the response body as JSON.
    content_text_json = json.loads(flow.response.text)
    # A missing or null 'data' field means there is nothing to scan.
    for data in content_text_json.get('data') or []:
        # Fetch and decode the entry's embedded content document.
        data_content_json = json.loads(data.get('content'))

        # Only entries labelled as advertisements are forwarded.
        if data_content_json.get('label', "") != '广告':
            continue

        # Guard against a missing or empty image list before indexing it.
        large_image_list = data_content_json.get('large_image_list') or []
        first_image = large_image_list[0] if large_image_list else None
        if first_image is not None and 'url' in first_image:
            image_url = first_image.get('url')
        else:
            image_url = ""

        item = {
            'display_url': data_content_json.get('display_url', ""),
            'large_image_list_url': image_url,
            'title': data_content_json.get('title', ""),
            'source': data_content_json.get('source', ""),
        }

        jstr = json.dumps(item, ensure_ascii=False)
        print(jstr)
        try:
            self.producer.send(topic=self.topic,
                               value=jstr.encode(encoding='utf-8'),
                               partition=random.choice(
                                   list(self.partition)))
        except Exception:
            # Best effort: a failed send is logged and the loop continues.
            self.logger.error('kafka 发送失败')
def check_font(content_type, content) -> bool:
    """
    Report whether a Content-Type header value denotes a font resource.

    Args:
        content_type: Raw Content-Type header value, passed to
            ``parse_content_type``.
        content: Unused; retained for interface compatibility.

    Returns:
        True when the parsed type ends with "font" or the parsed
        "type/subtype" is one of the known font media types, otherwise False
        (including when the header cannot be parsed).
    """
    # presumably a (type, subtype, params) tuple or a falsy value, as other
    # call sites of parse_content_type in this file treat it.
    parsed = parse_content_type(content_type)
    if not parsed:
        return False
    if parsed[0].endswith("font"):
        return True
    font_list = [
        'application/font-cff',
        'application/font-off',
        'application/font-sfnt',
        'application/font-ttf',
        'application/font-woff',
        'application/vnd.ms-fontobject',
        'application/vnd.ms-opentype',
    ]
    # The subtype comes from the parsed header, not from the message body.
    media_type = "{}/{}".format(parsed[0], parsed[1])
    return media_type in font_list
def response(self, flow: HTTPFlow):
    """
    Convert the cards of a matching response into items and publish them.

    Only responses whose request URL starts with ``self.url_pattrn`` are
    processed. The response text is parsed as JSON and each entry of its
    'cards' list is passed to ``self._param_card`` together with a fresh
    ``WeiboItems``. Items that do not serialise to '{}' are sent to
    ``self.producer`` on ``self.topic``. A failure on one card is logged
    with its traceback and does not stop the remaining cards.

    Args:
        flow: The flow whose request URL and response text are inspected.
    """
    import traceback

    if not flow.request.url.startswith(self.url_pattrn):
        return

    # Load the response body as JSON.
    content_text_json = json.loads(flow.response.text)
    # A missing or null 'cards' field means there is nothing to process.
    for card in content_text_json.get('cards') or []:
        item = WeiboItems()
        try:
            self._param_card(card, item)
            jstr = json.dumps(dict(item), ensure_ascii=False)
            if jstr != '{}':
                self.producer.send(topic=self.topic,
                                   value=jstr.encode('utf-8'),
                                   partition=random.choice(
                                       list(self.partition)))
        except Exception:
            # with_traceback() needs a traceback argument, so calling it bare
            # raised TypeError here; format the active exception instead.
            self.logger.error(traceback.format_exc())
            # Dump the item that was being processed when the failure occurred.
            print(item)
def check_js(content_type, content) -> bool:
    """
    Report whether a Content-Type header value denotes a JavaScript resource.

    Args:
        content_type: Raw Content-Type header value, passed to
            ``parse_content_type``.
        content: Unused; retained for interface compatibility.

    Returns:
        True when the parsed "type/subtype" is one of the known script media
        types, otherwise False (including when the header cannot be parsed).
    """
    # presumably a (type, subtype, params) tuple or a falsy value, as other
    # call sites of parse_content_type in this file treat it.
    parsed = parse_content_type(content_type)
    if not parsed:
        return False
    # The subtype comes from the parsed header, not from the message body.
    media_type = "{}/{}".format(parsed[0], parsed[1])
    js_list = [
        "application/ecmascript",
        "application/javascript",
        "application/x-ecmascript",
        "application/x-javascript",
        "text/ecmascript",
        "text/javascript",
        "text/javascript1.0",
        "text/javascript1.1",
        "text/javascript1.2",
        "text/javascript1.3",
        "text/javascript1.4",
        "text/javascript1.5",
        "text/jscript",
        "text/livescript",
        "text/x-ecmascript",
        "text/x-javascript",
    ]
    return media_type in js_list
def _get_content_type_charset(self) -> Optional[str]:
    """
    Return the "charset" parameter of this message's content-type header.

    Returns:
        The charset value, or None when the header cannot be parsed or
        carries no charset parameter.
    """
    header_value = self.headers.get("content-type", "")
    parsed = headers.parse_content_type(header_value)
    if not parsed:
        return None
    params = parsed[2]
    return params.get("charset")
def check_json(content_type, content) -> bool:
    """
    Report whether a Content-Type header value denotes JSON data.

    Args:
        content_type: Raw Content-Type header value, passed to
            ``parse_content_type``.
        content: Unused; retained for interface compatibility.

    Returns:
        True when the parsed subtype ends with "json", otherwise False
        (including when the header cannot be parsed).
    """
    # presumably a (type, subtype, params) tuple or a falsy value, as other
    # call sites of parse_content_type in this file treat it.
    parsed = parse_content_type(content_type)
    if not parsed:
        return False
    return parsed[1].endswith("json")
def check_image(content_type, content) -> bool:
    """
    Report whether a Content-Type header value denotes an image.

    Args:
        content_type: Raw Content-Type header value, passed to
            ``parse_content_type``.
        content: Unused; retained for interface compatibility.

    Returns:
        True when the parsed type ends with "image", otherwise False
        (including when the header cannot be parsed).
    """
    # presumably a (type, subtype, params) tuple or a falsy value, as other
    # call sites of parse_content_type in this file treat it.
    parsed = parse_content_type(content_type)
    if not parsed:
        return False
    return parsed[0].endswith("image")
def _get_content_type_charset(self) -> Optional[str]:
    """
    Return the "charset" parameter of this message's content-type header.

    Returns:
        The charset value, or None when the header cannot be parsed or
        carries no charset parameter.
    """
    parsed = parse_content_type(self.headers.get("content-type", ""))
    # Index 2 of the parsed value holds the header's parameter mapping.
    return parsed[2].get("charset") if parsed else None
def response(self, flow: HTTPFlow):
    """
    Extract advertisement entries from a matching response and store them.

    Only responses whose request URL starts with ``self.url_pattrn`` are
    processed. The response text is parsed as JSON; each entry of its 'data'
    list carries a JSON document in 'content'. For entries whose 'label'
    equals '广告', a reduced item is sent to ``self.topic``, the raw content
    is sent to ``self.meta_topic``, and a row is inserted through ``pool``.

    Args:
        flow: The flow whose request URL and response text are inspected.
    """
    if not flow.request.url.startswith(self.url_pattrn):
        return

    # Load the response body as JSON.
    content_text_json = json.loads(flow.response.text)
    # A missing or null 'data' field means there is nothing to scan.
    for data in content_text_json.get('data') or []:
        # Fetch and decode the entry's embedded content document.
        data_content = data.get('content')
        data_content_json = json.loads(data_content)

        # Only entries labelled as advertisements are stored.
        if data_content_json.get('label', "") != '广告':
            continue

        # Guard against a missing or empty image list before indexing it.
        large_image_list = data_content_json.get('large_image_list') or []
        first_image = large_image_list[0] if large_image_list else None
        if first_image is not None and 'url' in first_image:
            image_url = first_image.get('url')
        else:
            image_url = ""

        item = {
            'display_url': data_content_json.get('display_url', ""),
            'large_image_list_url': image_url,
            'title': data_content_json.get('title', ""),
            'source': data_content_json.get('source', ""),
        }

        # Serialise before the try block so both names are always bound for
        # the log message and the SQL insert below.
        jstr = json.dumps(item, ensure_ascii=False)
        meta_data = json.dumps(data_content_json, ensure_ascii=False)
        try:
            self.producer.send(
                topic=self.topic,
                value=jstr.encode(encoding='utf-8'),
                partition=random.choice(
                    list(self.producer.partitions_for(self.topic))))
            self.producer.send(
                topic=self.meta_topic,
                value=data_content.encode(encoding='utf-8'),
                partition=random.choice(
                    list(self.producer.partitions_for(self.meta_topic))))
        except Exception:
            # Best effort: a failed send is logged and the row is still
            # stored. The format string previously had no placeholder for
            # the payload, so it was silently dropped from the message.
            self.logger.error(
                'kafka send failure:host:{}, topic:{}, values:{}'.format(
                    self.server, self.topic, jstr))

        # NOTE(review): the insert is assumed to run for every advertisement,
        # not only after a failed send — confirm against the original layout.
        sql = 'insert into android_toutiao_app(meta_data, display_url, large_image_list, title, `source`, create_time) value (%s,%s,%s,%s,%s, %s);'
        data_time = time.strftime('%Y-%m-%d %H:%M:%S',
                                  time.localtime(time.time()))
        args = (meta_data, item.get('display_url'),
                item.get('large_image_list_url'), item.get('title'),
                item.get('source'), data_time)
        pool.insert(sql, args)