Example #1
0
def encode(head, l):
    """
        Encode (key, value) pairs as a multipart body.

        The boundary is taken from the "content-type" entry of ``head``. Every
        pair with a truthy key produces a boundary line, a Content-Disposition
        header, a Content-Type header, a blank line and the value. The body is
        terminated by the closing ``--<boundary>--`` line and the pieces are
        joined with CRLF.

        Returns b"" when there is no content-type, when it cannot be parsed,
        or when it has no ASCII-encodable "boundary" parameter.

        Raises ValueError when a value as a whole equals ``--<boundary>`` (the
        pattern is anchored and compiled without re.MULTILINE).
    """
    k = head.get("content-type")
    if not k:
        # Keep the return type consistent: every "nothing to encode" path
        # yields empty bytes rather than an implicit None.
        return b""
    k = headers.parse_content_type(k)
    if k is None:
        return b""
    try:
        # quote() returns str, so encode once here instead of per part.
        boundary = quote(k[2]["boundary"].encode("ascii")).encode('utf-8')
    except (KeyError, UnicodeError):
        return b""

    delimiter = b"--%b" % boundary
    boundary_rx = re.compile(rb"^--%b$" % re.escape(boundary))
    hdrs = []
    for key, value in l:
        if key:
            # NOTE(review): str() of a bytes key yields its repr (b'...'), so
            # the guess presumably always falls back to the default type -
            # confirm before changing, since callers may rely on it.
            file_type = mimetypes.guess_type(
                str(key))[0] or "text/plain; charset=utf-8"
            hdrs.append(delimiter)
            hdrs.append(b'Content-Disposition: form-data; name="%b"' % key)
            hdrs.append(b"Content-Type: %b" % file_type.encode('utf-8'))
            hdrs.append(b'')
            hdrs.append(value)
        hdrs.append(b'')

        # A value that looks like the delimiter would corrupt the framing.
        if value is not None and boundary_rx.search(value):
            raise ValueError(b"boundary found in encoded string")

    hdrs.append(b"--%b--\r\n" % boundary)
    return b"\r\n".join(hdrs)
Example #2
0
def decode(hdrs, content):
    """
        Split a multipart body into a list of (name, value) byte tuples.

        The boundary comes from the "content-type" entry of ``hdrs``. An empty
        list is returned when that entry is missing, cannot be parsed, or has
        no ASCII-encodable "boundary" parameter, and also when ``content`` is
        None.

        Raises ValueError when a named part has no blank line after its
        headers.
    """
    content_type = hdrs.get("content-type")
    if not content_type:
        return []

    parsed = headers.parse_content_type(content_type)
    if not parsed:
        return []

    try:
        boundary = parsed[2]["boundary"].encode("ascii")
    except (KeyError, UnicodeError):
        return []

    fields = []
    if content is None:
        return fields

    name_rx = re.compile(br'\bname="([^"]+)"')
    delimiter = b"--" + boundary
    for chunk in content.split(delimiter):
        lines = chunk.splitlines()
        # Skip fragments that are too short to be a part, and the tail that
        # follows the closing delimiter (it starts with "--").
        if len(lines) <= 1 or lines[0][0:2] == b"--":
            continue
        found = name_rx.search(lines[1])
        if found is None:
            continue
        # The value starts after the first blank line following the headers.
        blank_at = lines[2:].index(b"")
        fields.append((found.group(1), b"".join(lines[3 + blank_at:])))
    return fields
Example #3
0
def decode(content_type: Optional[str],
           content: bytes) -> List[Tuple[bytes, bytes]]:
    """
        Parse a multipart body and return its fields as (key, value) tuples.

        ``content_type`` is the raw Content-Type header value that carries the
        boundary. The result is empty when the header is missing, cannot be
        parsed, or has no ASCII-encodable "boundary" parameter, and when
        ``content`` is None.

        Raises ValueError when a named part has no blank line after its
        headers.
    """
    if not content_type:
        return []

    ct = headers.parse_content_type(content_type)
    if not ct:
        return []

    try:
        boundary = ct[2]["boundary"].encode("ascii")
    except (KeyError, UnicodeError):
        return []

    pairs = []
    segments = content.split(b"--" + boundary) if content is not None else []
    name_pattern = re.compile(br'\bname="([^"]+)"')
    for segment in segments:
        rows = segment.splitlines()
        # A usable part has at least two lines and is not the "--" tail that
        # follows the closing delimiter.
        is_part = len(rows) > 1 and rows[0][0:2] != b"--"
        name = name_pattern.search(rows[1]) if is_part else None
        if name:
            # The value begins after the first blank line past the headers.
            body_start = 3 + rows[2:].index(b"")
            pairs.append((name.group(1), b"".join(rows[body_start:])))
    return pairs
Example #4
0
def decode(hdrs, content):
    """
        Takes a multipart boundary encoded string and returns list of (key, value) tuples.

        The boundary is read from the "content-type" entry of ``hdrs``. An
        empty list is returned when that entry is missing, cannot be parsed,
        or has no ASCII-encodable "boundary" parameter, and when ``content``
        is None.

        Raises ValueError when a named part has no blank line after its
        headers.
    """
    v = hdrs.get("content-type")
    if not v:
        return []
    v = headers.parse_content_type(v)
    if not v:
        return []
    try:
        boundary = v[2]["boundary"].encode("ascii")
    except (KeyError, UnicodeError):
        return []

    # A message without a body has nothing to split; return no fields instead
    # of failing on None.split().
    if content is None:
        return []

    rx = re.compile(br'\bname="([^"]+)"')
    r = []
    for i in content.split(b"--" + boundary):
        parts = i.splitlines()
        # Skip fragments too short to be a part and the "--" tail after the
        # closing delimiter.
        if len(parts) > 1 and parts[0][0:2] != b"--":
            match = rx.search(parts[1])
            if match:
                key = match.group(1)
                # The value starts after the first blank line past the headers.
                value = b"".join(parts[3 + parts[2:].index(b""):])
                r.append((key, value))
    return r
Example #5
0
def check_css(content_type, content) -> bool:
    """
    Report whether a Content-Type header value denotes CSS.

    ``content`` is accepted for signature compatibility and is not inspected.
    Returns False when the header value is empty or cannot be parsed.
    """
    if not content_type:
        return False
    parsed = parse_content_type(content_type)
    if not parsed:
        return False
    # Both halves of the media type come from the parsed header; the body
    # plays no part in the decision.
    media_type = "{}/{}".format(parsed[0], parsed[1])
    return media_type in ("text/css",)
Example #6
0
def check_html(content_type, content) -> bool:
    """
    Report whether a Content-Type header value denotes HTML.

    ``content`` is accepted for signature compatibility and is not inspected.
    Returns False when the header value is empty or cannot be parsed.
    """
    if not content_type:
        return False
    parsed = parse_content_type(content_type)
    if not parsed:
        return False
    # Both halves of the media type come from the parsed header; the body
    # plays no part in the decision.
    media_type = "{}/{}".format(parsed[0], parsed[1])
    return media_type in ('text/html',)
Example #7
0
def check_form(content_type, content) -> bool:
    """
    Report whether a Content-Type header value denotes a submitted form.

    Recognised media types are multipart/form-data and
    application/x-www-form-urlencoded. ``content`` is accepted for signature
    compatibility and is not inspected. Returns False when the header value
    is empty or cannot be parsed.
    """
    if not content_type:
        return False
    parsed = parse_content_type(content_type)
    if not parsed:
        return False
    form_list = (
        'multipart/form-data',
        'application/x-www-form-urlencoded',
    )
    # Both halves of the media type come from the parsed header; the body
    # plays no part in the decision.
    return "{}/{}".format(parsed[0], parsed[1]) in form_list
Example #8
0
    def set_text(self, text):
        """
        Store ``text`` as the encoded message content.

        None clears the content. Otherwise the text is encoded with the
        encoding returned by ``self._guess_encoding()``; if that raises
        ValueError, the content-type header's charset is rewritten to utf-8
        and the text is encoded as UTF-8 with the surrogateescape handler.
        """
        if text is None:
            self.content = None
            return

        guessed = self._guess_encoding()
        try:
            self.content = encoding.encode(text, guessed)
        except ValueError:
            # The guessed encoding cannot represent the text: switch the
            # declared charset to UTF-8 and encode with that instead.
            raw_ct = self.headers.get("content-type", "")
            parsed = headers.parse_content_type(raw_ct) or ("text", "plain", {})
            parsed[2]["charset"] = "utf-8"
            self.headers["content-type"] = headers.assemble_content_type(*parsed)
            self.content = text.encode("utf8", "surrogateescape")
    def decode_params_multipart(self, hdrs, content):
        """
        Parse a multipart body into a dict of form parameters.

        The boundary is read from the "content-type" entry of ``hdrs``. The
        result maps each field name (bytes) to a list of entries, one per
        occurrence, each a dict with the keys 'type' ('file' when the part
        declares a filename, otherwise 'var'), 'filename' (bytes or None) and
        'value' (the part's lines joined with b"\\n").

        Returns an empty dict when the content-type is missing, cannot be
        parsed, or has no ASCII-encodable "boundary" parameter.

        Raises ValueError when a named part has no blank line after its
        headers.
        """
        v = hdrs.get("content-type")
        if not v:
            return {}
        v = headers.parse_content_type(v)
        if not v:
            # Always hand back a dict so callers can treat the result uniformly.
            return {}
        try:
            boundary = v[2]["boundary"].encode("ascii")
        except (KeyError, UnicodeError):
            return {}

        rx = re.compile(br'\bname="([^"]+)"')
        rxf = re.compile(br'\bname="([^"]+)";\s*filename="(.*?)"')
        r = {}

        for i in content.split(b"--" + boundary):
            parts = i.splitlines()
            # Skip fragments too short to be a part and the "--" tail after
            # the closing delimiter.
            if len(parts) <= 1 or parts[0][0:2] == b"--":
                continue
            # Try the file form first; it is a stricter variant of the plain
            # name="..." form.
            file_match = rxf.search(parts[1])
            match = file_match or rx.search(parts[1])
            if not match:
                continue
            # The value starts after the first blank line past the headers.
            value = b"\n".join(parts[3 + parts[2:].index(b""):])
            r.setdefault(match.group(1), []).append({
                'type': 'file' if file_match else 'var',
                'filename': file_match.group(2) if file_match else None,
                'value': value
            })
        return r
Example #10
0
 def response(self, flow: HTTPFlow):
     """
     Forward advertisement entries of a matching JSON response to Kafka.

     Only responses whose request URL starts with ``self.url_pattrn`` are
     processed. The body is parsed as JSON; each element of its 'data' list
     holds a JSON string under 'content'. Entries whose decoded content has
     the label '广告' are reduced to display_url, large_image_list_url, title
     and source, printed, and sent to ``self.topic`` on a partition picked at
     random from ``self.partition``. Send failures are logged, not raised.
     """
     if not flow.request.url.startswith(self.url_pattrn):
         return

     # Load the response body as JSON; a missing or null 'data' entry is
     # treated as an empty list.
     content_text_json = json.loads(flow.response.text)
     for data in content_text_json.get('data') or []:
         # Each entry carries its payload as a JSON string under 'content'.
         data_content_json = json.loads(data.get('content'))
         # Only dict payloads labelled as advertisements are forwarded.
         if not isinstance(data_content_json, dict):
             continue
         if data_content_json.get('label', "") != '广告':
             continue

         # The image list may be absent or empty; fall back to "" instead of
         # indexing into nothing.
         images = data_content_json.get('large_image_list')
         first_image = images[0] if isinstance(images, list) and images else None
         if isinstance(first_image, dict):
             image_url = first_image.get('url', "")
         else:
             image_url = ""

         item = {
             'display_url': data_content_json.get('display_url', ""),
             'large_image_list_url': image_url,
             'title': data_content_json.get('title', ""),
             'source': data_content_json.get('source', ""),
         }
         jstr = json.dumps(item, ensure_ascii=False)
         print(jstr)
         try:
             self.producer.send(topic=self.topic,
                                value=jstr.encode(encoding='utf-8'),
                                partition=random.choice(
                                    list(self.partition)))
         except Exception:
             # Best effort: a failed send must not abort the remaining entries.
             self.logger.error('kafka 发送失败')
Example #11
0
def check_font(content_type, content) -> bool:
    """
    Report whether a Content-Type header value denotes a font.

    A header matches when its top-level type ends with "font" or when its
    full media type is one of the listed application/* font types.
    ``content`` is accepted for signature compatibility and is not inspected.
    Returns False when the header value is empty or cannot be parsed.
    """
    if not content_type:
        return False
    parsed = parse_content_type(content_type)
    if not parsed:
        return False
    if parsed[0].endswith("font"):
        return True
    font_list = (
        'application/font-cff',
        'application/font-off',
        'application/font-sfnt',
        'application/font-ttf',
        'application/font-woff',
        'application/vnd.ms-fontobject',
        'application/vnd.ms-opentype',
    )
    # Both halves of the media type come from the parsed header; the body
    # plays no part in the decision.
    return "{}/{}".format(parsed[0], parsed[1]) in font_list
Example #12
0
    def response(self, flow: HTTPFlow):
        """
        Convert the cards of a matching JSON response to items and send them to Kafka.

        Only responses whose request URL starts with ``self.url_pattrn`` are
        processed. Each element of the body's 'cards' list is passed to
        ``self._param_card`` together with a fresh ``WeiboItems``; a non-empty
        item is serialised to JSON and sent to ``self.topic`` on a partition
        picked at random from ``self.partition``. Every item is printed.
        Failures while handling one card are logged with their traceback and
        do not stop the remaining cards.
        """
        import traceback

        if not flow.request.url.startswith(self.url_pattrn):
            return

        # Load the response body as JSON; a missing or null 'cards' entry is
        # treated as an empty list.
        content_text_json = json.loads(flow.response.text)
        card_list = content_text_json.get('cards') or []

        for card in card_list:
            item = WeiboItems()
            try:
                self._param_card(card, item)
                item_dic = dict(item)
                jstr = json.dumps(item_dic, ensure_ascii=False)
                # An item that stayed empty serialises to '{}' and is skipped.
                if jstr != '{}':
                    self.producer.send(topic=self.topic,
                                       value=jstr.encode('utf-8'),
                                       partition=random.choice(
                                           list(self.partition)))
            except Exception:
                # with_traceback() needs a traceback argument and returns the
                # exception, so it cannot be used to render one; format the
                # active exception instead.
                self.logger.error(traceback.format_exc())
            print(item)
Example #13
0
def check_js(content_type, content) -> bool:
    """
    Report whether a Content-Type header value denotes JavaScript.

    The full media type is compared against the list of script media types
    below. ``content`` is accepted for signature compatibility and is not
    inspected. Returns False when the header value is empty or cannot be
    parsed.
    """
    if not content_type:
        return False
    parsed = parse_content_type(content_type)
    if not parsed:
        return False
    js_list = (
        "application/ecmascript",
        "application/javascript",
        "application/x-ecmascript",
        "application/x-javascript",
        "text/ecmascript",
        "text/javascript",
        "text/javascript1.0",
        "text/javascript1.1",
        "text/javascript1.2",
        "text/javascript1.3",
        "text/javascript1.4",
        "text/javascript1.5",
        "text/jscript",
        "text/livescript",
        "text/x-ecmascript",
        "text/x-javascript",
    )
    # Both halves of the media type come from the parsed header; the body
    # plays no part in the decision.
    return "{}/{}".format(parsed[0], parsed[1]) in js_list
Example #14
0
 def _get_content_type_charset(self) -> Optional[str]:
     """
     Return the "charset" parameter of the content-type header.

     Returns None when the header cannot be parsed or has no charset.
     """
     raw_value = self.headers.get("content-type", "")
     parsed = headers.parse_content_type(raw_value)
     return parsed[2].get("charset") if parsed else None
Example #15
0
def check_json(content_type, content) -> bool:
    """
    Report whether a Content-Type header value denotes JSON.

    A header matches when its subtype ends with "json". ``content`` is
    accepted for signature compatibility and is not inspected. Returns False
    when the header value is empty or cannot be parsed.
    """
    if not content_type:
        return False
    parsed = parse_content_type(content_type)
    if not parsed:
        return False
    return parsed[1].endswith("json")
Example #16
0
def check_image(content_type, content) -> bool:
    """
    Report whether a Content-Type header value denotes an image.

    A header matches when its top-level type ends with "image". ``content``
    is accepted for signature compatibility and is not inspected. Returns
    False when the header value is empty or cannot be parsed.
    """
    if not content_type:
        return False
    parsed = parse_content_type(content_type)
    if not parsed:
        return False
    return parsed[0].endswith("image")
Example #17
0
 def _get_content_type_charset(self) -> Optional[str]:
     """
     Look up the "charset" parameter of the content-type header.

     Returns None when the header cannot be parsed or has no charset.
     """
     header_value = self.headers.get("content-type", "")
     parsed = parse_content_type(header_value)
     if not parsed:
         return None
     return parsed[2].get("charset")
Example #18
0
 def response(self, flow: HTTPFlow):
     """
     Forward advertisement entries of a matching JSON response to Kafka and the database.

     Only responses whose request URL starts with ``self.url_pattrn`` are
     processed. The body is parsed as JSON; each element of its 'data' list
     holds a JSON string under 'content'. For entries whose decoded content
     has the label '广告':

     * a reduced item (display_url, large_image_list_url, title, source) is
       sent to ``self.topic`` and the raw content string to
       ``self.meta_topic``, each on a randomly chosen partition of that
       topic; send failures are logged, not raised;
     * a row with the re-serialised content, the item fields and the current
       local time is inserted through ``pool.insert``.
     """
     if not flow.request.url.startswith(self.url_pattrn):
         return

     # Load the response body as JSON; a missing or null 'data' entry is
     # treated as an empty list.
     content_text_json = json.loads(flow.response.text)
     for data in content_text_json.get('data') or []:
         # Each entry carries its payload as a JSON string under 'content'.
         data_content = data.get('content')
         data_content_json = json.loads(data_content)
         # Only dict payloads labelled as advertisements are forwarded.
         if not isinstance(data_content_json, dict):
             continue
         if data_content_json.get('label', "") != '广告':
             continue

         # The image list may be absent or empty; fall back to "" instead of
         # indexing into nothing.
         images = data_content_json.get('large_image_list')
         first_image = images[0] if isinstance(images, list) and images else None
         if isinstance(first_image, dict):
             image_url = first_image.get('url', "")
         else:
             image_url = ""

         item = {
             'display_url': data_content_json.get('display_url', ""),
             'large_image_list_url': image_url,
             'title': data_content_json.get('title', ""),
             'source': data_content_json.get('source', ""),
         }

         # Serialise before the Kafka calls so the database insert below
         # always sees this entry's values, even when sending fails.
         jstr = json.dumps(item, ensure_ascii=False)
         meta_data = json.dumps(data_content_json, ensure_ascii=False)
         try:
             self.producer.send(
                 topic=self.topic,
                 value=jstr.encode(encoding='utf-8'),
                 partition=random.choice(
                     list(self.producer.partitions_for(self.topic))))
             self.producer.send(
                 topic=self.meta_topic,
                 value=data_content.encode(encoding='utf-8'),
                 partition=random.choice(
                     list(self.producer.partitions_for(self.meta_topic))))
         except Exception:
             # Best effort: a failed send must not prevent the insert.
             self.logger.error(
                 'kafka send failure:host:{}, topic:{}, values:{}'.format(
                     self.server, self.topic, jstr))

         sql = 'insert into android_toutiao_app(meta_data, display_url, large_image_list, title, `source`, create_time) value (%s,%s,%s,%s,%s, %s);'
         # strftime() without a time tuple formats the current local time.
         data_time = time.strftime('%Y-%m-%d %H:%M:%S')
         args = (meta_data, item.get('display_url'),
                 item.get('large_image_list_url'),
                 item.get('title'), item.get('source'), data_time)
         pool.insert(sql, args)