def parse(self, response):
    if not response.body:
        logger.error(msg='there is no response body, please go and check it')
        return
    json_object = json.loads(response.body_as_unicode())
    if not json_object:
        logger.error(msg='there is no json object')
        return
    result = json_object.get('result', None)
    if not result:
        return
    item_list = result.get('list')
    if not item_list:
        return
    for tmp_item in item_list:
        item = items.CnbetaspiderItem()
        item['catid'] = tmp_item.get('catid', None)
        item['comments'] = tmp_item.get('comments', None)
        item['counter'] = tmp_item.get('counter', None)
        item['mview'] = tmp_item.get('mview', None)
        item['rate_sum'] = tmp_item.get('rate_sum', None)
        item['source'] = tmp_item.get('source', None)
        item['score'] = tmp_item.get('score', None)
        item['thumb'] = tmp_item.get('thumb', None)
        item['topic'] = tmp_item.get('topic', None)
        item['inputtime'] = tmp_item.get('inputtime', None)
        item['hometext'] = tmp_item.get('hometext', None)
        item['title'] = tmp_item.get('title', None)
        item['url_show'] = 'http://www.cnbeta.com' + tmp_item.get('url_show', '')
        item['crawled_datetime'] = datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')
        yield item
def parse(self, response):
    if not response.body:
        logger.error(msg='there is no response body, please go and check it')
        return
    # Some pages use upper-case DL/DT tags; fall back to lower-case if nothing matches.
    nodes = response.xpath('//div[@class="artlist clearfix"]/DL/DT')
    if not nodes:
        nodes = response.xpath('//div[@class="artlist clearfix"]/dl/dt')
    for node in nodes:
        pubdate = node.xpath('.//span/text()').extract_first()
        pubdate = re.sub('日期:', '', pubdate)
        title = node.xpath('.//a/text()').extract_first()
        url = node.xpath('.//a/@href').extract_first()
        full_url = 'https://www.jb51.net{}'.format(url)
        item = SandboxItem()
        item['pubdate'] = pubdate
        item['url'] = full_url
        item['title'] = title
        item['category'] = self.category
        yield item
def parse(self, response):
    if not response.body:
        logger.error(msg='there is no response body, please go and check it')
        return
    all_items = Selector(response).xpath('//ul[@class = "item-lists"]/li')
    for i in range(len(all_items)):
        item = items.XianyuItem()
        item['title'] = all_items[i].xpath(
            '//h4[@class ="item-title"]//a[@target = "_blank"]/text()').extract()[i]
        item['price'] = all_items[i].xpath(
            '//span[@class ="price"]//em/text()').extract()[i]
        item['description'] = all_items[i].xpath(
            '//div[@class = "item-description"]/text()').extract()[i]
        item['pic'] = ("https:" + str(all_items[i].xpath(
            '//div[@class = "item-pic sh-pic120"]//img/@src').extract()[i])
        ).replace('_120x120', '').strip()
        item['area'] = all_items[i].xpath(
            '//div[@class="seller-location"]/text()').extract()[i]
        item["info"] = "https:" + all_items[i].xpath(
            '//a[@target = "_blank"]/@href').extract()[i]
        yield item
def parse_weibo_context(self, soup, uid):
    weibo_info = WeiboItem()
    if self.first_flag_home:
        self.first_flag_home = False
        return None
    else:
        contexts = soup.find_all("div", class_="c")
        for item in contexts:
            try:
                context = item.find("span", class_="ctt")
                if not context:
                    continue
                # Strip whitespace and a few emoji byte sequences from the post text.
                weibo_text = context.text.encode("utf-8", "ignore").replace(" ", "").\
                    replace("\n", "").replace("\xc2\xa0", "").replace("\xF0\x9F\x91\x8A", "").\
                    replace("\xF0\x9F\x91\xBC", "").replace("\xF0\x9F\x8C\xB8\xF0\x9F", "")
                parent_ele = context.parent.parent
                like_ele = parent_ele.find(text=re.compile(u"^赞\[\d*\]$"))
                relay_ele = parent_ele.find(text=re.compile(u"^转发\[\d*\]$"))
                comment_ele = parent_ele.find(text=re.compile(u"^评论\[\d*\]$"))
                issue_time_ele = parent_ele.find("span", class_="ct")
                issue_time = issue_time_ele.text
                issue_time = issue_time.encode("utf-8")
                issue = issue_time.split("来自")
                issue_datetime = ""
                if len(issue) > 0:
                    if "分钟" in issue[0]:
                        # Posted "N minutes ago": subtract the offset from now.
                        min = filter(str.isdigit, issue[0])
                        t = datetime.datetime.now() - datetime.timedelta(minutes=int(min))
                        issue_datetime = t.strftime("%Y-%m-%d %H:%M:%S")
                    elif "今天" in issue[0]:
                        # Posted "today": prepend the current date.
                        time = issue[0].replace("今天 ", "").replace("\xc2\xa0", "")
                        issue_datetime = datetime.datetime.now().strftime("%Y-%m-%d ") + time
                    else:
                        issue_datetime = issue[0].replace("月", "-").replace("日", "").replace("\xc2\xa0", "")
                        if issue[0].count("-") < 2:
                            issue_datetime = datetime.datetime.now().strftime("%Y-") + issue_datetime
                issue_device = issue[1] if len(issue) > 1 else None
                weibo_info["context"] = weibo_text
                weibo_info["user_id"] = uid
                weibo_info["issue_time"] = issue_datetime.strip()
                weibo_info["get_time"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                weibo_info["like_count"] = filter(str.isdigit, like_ele.encode("utf-8"))
                weibo_info["relay_count"] = filter(str.isdigit, relay_ele.encode("utf-8"))
                weibo_info["comment_count"] = filter(str.isdigit, comment_ele.encode("utf-8"))
                weibo_info["device"] = issue_device
                # print issue_datetime, issue_device, weibo_text
                # print like_ele.encode("utf-8"), relay_ele.encode("utf-8"), comment_ele.encode("utf-8")
                return weibo_info  # only crawl the first weibo post
            except Exception as e:
                logger.error(e)
def get_doc_list(self, response):
    """
    Handle pagination, decode the doc_id, and yield basic case information.
    :param response:
    :return:
    """
    cookie = response.meta['cookie']
    vjkl5 = response.meta['vjkl5']
    Param = response.meta['Param']
    try:
        result = json.loads(json.loads(response.text))
    except BaseException as exc:
        logger.error(exc)
        yield response.request
    else:
        if not response.meta.get('key'):
            format_key_str = result[0]['RunEval'].encode('utf-8')
            key = getkey(format_key_str).encode('utf-8')
            # Ceiling division: the list endpoint returns 20 records per page.
            page_count = int(result[0]['Count']) / 20 if int(
                result[0]['Count']) % 20 == 0 else int(
                    result[0]['Count']) / 20 + 1
            for page in range(2, page_count + 1):
                data = {
                    'Param': Param,
                    'Index': str(page),
                    'Page': '20',
                    'Order': u'法院层级',
                    'Direction': 'asc',
                    'vl5x': vjkl5
                }
                yield scrapy.FormRequest(
                    'http://wenshu.court.gov.cn/List/ListContent',
                    headers={'Cookie': cookie},
                    callback=self.get_doc_list,
                    formdata=data,
                    meta={
                        'cookie': cookie,
                        'vjkl5': vjkl5,
                        'Param': Param,
                        'type_list': [],
                        'key': key
                    })
        else:
            key = response.meta['key']
        for x in result[1:]:
            iid = x[u'文书ID'].encode('utf-8')
            docid = decode_docid(iid, key)
            item = DocInfo()
            item['doc_id'] = docid
            item['doc_name'] = x[u'案件名称']
            item['doc_date'] = x[u'裁判日期']
            yield item
def get_tree_list(self, response):
    """
    Loop over the category counts to find query conditions whose result count
    is at most 200, then request the corresponding case lists.
    :param response:
    :return:
    """
    cookie = response.meta['cookie']
    vjkl5 = response.meta['vjkl5']
    try:
        html = json.loads(json.loads(response.text))
    except BaseException as exc:
        logger.error(exc)
        yield response.request
    else:
        for d in html:
            if d['Key'] == response.meta['type_list'][0]:
                for dd in d['Child']:
                    if not dd['Key'] or not dd['IntValue']:
                        continue
                    Param = response.meta['Param'] + u',{}:{}'.format(
                        d['Key'], dd['Key'])
                    data = {
                        'Param': Param,
                        'Index': '1',
                        'Page': '20',
                        'Order': u'法院层级',
                        'Direction': 'asc',
                        'vl5x': vjkl5
                    }
                    if dd['IntValue'] <= 200 or len(
                            response.meta['type_list']) == 1:
                        # Few enough results: fetch the document list directly.
                        yield scrapy.FormRequest(
                            'http://wenshu.court.gov.cn/List/ListContent',
                            headers={'Cookie': cookie},
                            callback=self.get_doc_list,
                            formdata=data,
                            meta={
                                'cookie': cookie,
                                'vjkl5': vjkl5,
                                'Param': Param
                            })
                    else:
                        # Too many results: refine by the next category level.
                        yield scrapy.FormRequest(
                            'http://wenshu.court.gov.cn/List/TreeContent',
                            headers={'Cookie': cookie},
                            callback=self.get_tree_list,
                            formdata=data,
                            meta={
                                'cookie': cookie,
                                'vjkl5': vjkl5,
                                'Param': Param,
                                'type_list': response.meta['type_list'][1:]
                            })
def process_social_data(self, item):
    sql = '''
        INSERT INTO tab_social_network (user_id, weibo_count, follows_count, fans_count)
        VALUES (%s, %s, %s, %s)
        ON DUPLICATE KEY UPDATE weibo_count = %s, follows_count = %s, fans_count = %s
    '''
    cursor = self.__conn.cursor()
    try:
        cursor.execute(sql, (item["user_id"], item["weibo"], item["follow"],
                             item["fans"], item["weibo"], item["follow"],
                             item["fans"]))
    except Exception as e:
        logger.error("social data insert error %s" % e)
def process_weibo_context(self, item):
    sql = '''
        INSERT INTO tab_context_info (user_id, issue_time, get_time, context, like_count, relay_count, comment_count, device)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
    '''
    cursor = self.__conn.cursor()
    try:
        cursor.execute("SET NAMES utf8mb4")
        cursor.execute(sql, (item["user_id"], item["issue_time"],
                             item["get_time"], item["context"],
                             item["like_count"], item["relay_count"],
                             item["comment_count"], item["device"]))
    except Exception as e:
        logger.error("weibo context data insert error %s" % e)
def parse(self, response):
    if not response.body:
        logger.error(msg='there is no response body, please go and check it')
        return
    html = str(response.body)
    # item = items.jdItem()
    # item['html'] = html
    # yield item
    # The context manager closes the file automatically.
    with open("page.txt", "a") as f:
        f.write(html)
def parse(self, response):
    if not response.body:
        logger.error(msg='there is no response body, please go and check it')
        return
    all_item_urls = Selector(response).xpath('//div[@class = "hd"]/a')
    for i in all_item_urls:
        item = items.Top250ItemUrl()
        item['url'] = i.xpath('./@href').extract()[0]
        yield item
def convert_api(pattarn, tag_name, css_url):
    # Normalise the glyph reference to the "uniXXXX" form used in the font XML.
    pattarn = 'uni' + repr(pattarn).strip("'")[-4:]
    pathes = get_all_path()
    # print(pathes)
    if css_url[-10:] not in pathes:
        write_file(css_url, 'w')
    font_url = get_font_url(css_url[-10:], tag_name)
    if font_url[-10:] not in pathes:
        write_file(font_url, 'wb')
    xml_tag_list = font_xml(font_url[-10:])
    # print(pattarn, xml_tag_list)
    if pattarn in xml_tag_list:
        p_index = xml_tag_list.index(pattarn)
        woffs = get_woffs(woff_string)
        print('{}>>{}'.format(pattarn, woffs[p_index]))
        return woffs[p_index]
    else:
        logger.error('解析出错')
def parse(self, response):
    t1 = time.time()
    html = scrapy.Selector(text=response.text)
    divs = html.css("#content_left > div .f13 .c-tools::attr(data-tools)")
    for div in divs:
        data_str = div.extract()
        data_dict = json.loads(data_str)
        url = None
        try:
            url = requests.get(data_dict['url'], timeout=5).url
            schame = urllib.parse.urlparse(url).netloc
            sql = f"insert into seed(url,title,site_name,type) values('{url}','{data_dict['title']}','{schame}',1)"
            self.mysql.excute_sql(sql)
        except Exception as e:
            logger.error(f"requests.get(data_dict['url']).url ===>>> {str(e)}")
    t2 = time.time()
    logger.info(f"执行===>>> {response.url} 花费时间{str(t2-t1)}")
def process_user_info(self, item):
    sql = '''
        INSERT INTO tab_base_info (user_id, user_name, sex, province, city, birthday, abstract)
        VALUES (%s, %s, %s, %s, %s, %s, %s)
        ON DUPLICATE KEY UPDATE user_name = %s, sex = %s, province = %s, city = %s, birthday = %s, abstract = %s
    '''
    cursor = self.__conn.cursor()
    try:
        province = item.get("province", None)
        city = item.get("city", None)
        birthday = item.get("birthday", None)
        abstract = item.get("abstract", None)
        sex = item.get("sex", "-1")
        cursor.execute("SET NAMES utf8mb4")
        cursor.execute(sql, (item["user_id"], item["user_name"], sex, province,
                             city, birthday, abstract, item["user_name"], sex,
                             province, city, birthday, abstract))
    except Exception as e:
        logger.error("user info data insert error %s" % e)
def init(self):
    logger.debug("Initialize the database")
    try:
        CREDIT = json.loads(os.environ.get("VCAP_SERVICES"))['cloudantNoSQLDB'][0]['credentials']
        couch = couchdb.Server("https://%s.cloudant.com" % setting.CREDIT['username'])
        couch.resource.credentials = (setting.CREDIT['username'],
                                      setting.CREDIT['password'])
        try:
            db = couch.create(self.db_name)
            logger.debug("Create a new database " + self.db_name)
        except:
            # The database already exists; open it instead.
            db = couch[self.db_name]
            logger.debug("Use existing database " + self.db_name)
        logger.debug("Create database successfully")
        self.__dict__.update(db.__dict__)
    except:
        logger.error('cannot find the credentials, please bind a CloudantDB service')
    return self
def parse(self, response):
    t1 = time.time()
    html = scrapy.Selector(text=response.text)
    divs = html.css("div.results > div")
    for div in divs:
        vrwrap = div.css("div.vrwrap")
        if len(vrwrap) == 0:
            title = "".join(div.css("div.rb h3 a::text").extract())
            url = "https://www.sogou.com" + div.css("div.rb h3 a::attr(href)").extract()[0]
        else:
            title = "".join(div.css("div.vrwrap h3 a::text").extract())
            url = "https://www.sogou.com" + div.css("div.vrwrap h3 a::attr(href)").extract()[0]
        try:
            # Follow the redirect page and pull the real URL out of window.location.replace(...).
            _html = scrapy.Selector(text=requests.get(url, verify=False).text)
            url = _html.re("window.location.replace\(\"(.*?)\"\)")[0]
            schame = urllib.parse.urlparse(url).netloc
            sql = f"insert into seed(url,title,site_name,type) values('{url}','{title}','{schame}',1)"
            self.mysql.excute_sql(sql)
        except Exception as e:
            logger.error(f"requests.get(data_dict['url']).url ===>>> {str(e)}")
    t2 = time.time()
    logger.info(f"执行===>>> {response.url} 花费时间{str(t2-t1)}")
def parse(self, response):
    if not response.body:
        logger.error(msg='there is no response body, please go and check it')
        return
    i = Selector(response)
    item = items.Top250Item()
    item['rank'] = i.xpath('//span[@class = "top250-no"]/text()').extract()[0]
    item['title'] = i.xpath('//span[@property="v:itemreviewed"]/text()').extract()[0]
    item['year'] = i.xpath('//span[@class = "year"]/text()').extract()[0]
    item['area'] = i.xpath(
        '//div[@id = "info"]/br[4]/following-sibling::text()').extract()[1]
    item['rating'] = i.xpath('//strong[@class="ll rating_num"]/text()').extract()[0]
    item['rating_people'] = i.xpath('//span[@property="v:votes"]/text()').extract()[0]
    item['intro'] = i.xpath('//span[@property="v:summary"]/text()').extract()
    item['style'] = i.xpath('//span[@property="v:genre"]/text()').extract()
    yield item
def start_requests(self):
    driver = webdriver.Chrome(
        executable_path="/Users/yuanlang/work/javascript/chromedriver")
    driver.get(
        "https://www.toutiao.com/search/?keyword=2018%E5%B9%B48%E6%9C%88%E5%9B%9B%E5%B7%9D%E8%BE%BE%E5%B7%9E%E5%B8%82%E5%87%BA%E7%A7%9F%E8%BD%A6%E7%BD%A2%E5%B7%A5%E4%BA%8B%E4%BB%B6"
    )
    time.sleep(2)
    for url in self.start_urls:
        for page in range(0, 8):
            driver.get(
                url=f"{url}&offset={20*page}&timestamp={'%d'%(time.time()*1000)}")
            time.sleep(5)
            html = scrapy.Selector(text=driver.page_source)
            content = html.css("body > pre::text").extract_first()
            data = json.loads(content)["data"]
            for item in data:
                try:
                    if "article_url" not in item:
                        if "display" not in item:
                            print(item)
                            continue
                        print(item["display"])
                        _url = item["display"]["info"]["url"]
                        title = item["display"]["emphasized"]["title"]
                    else:
                        title = item["abstract"]
                        _url = item["article_url"]
                    schame = urllib.parse.urlparse(_url).netloc
                    sql = f"insert into seed(url,title,site_name,type) values('{_url}','{title}','{schame}',1)"
                    self.mysql.excute_sql(sql)
                except Exception as e:
                    logger.error(
                        f"requests.get(data_dict['url']).url ===>>> {str(e)}")
def parse_error(self, response):
    logger.error("post:%s" % response.url)
def get_char(js):
    all_var = {}

    # Obfuscation pattern: no-arg if/else wrapper functions, constant returned in the if branch.
    if_else_no_args_return_constant_function_functions = []
    """
    function zX_() { function _z() { return '09'; }; if (_z() == '09,') { return 'zX_'; } else { return _z(); } }
    """
    constant_function_regex4 = re.compile(
        """
        function\s+\w+\(\)\s*\{\s* function\s+\w+\(\)\s*\{\s* return\s+[\'\"][^\'\"]+[\'\"];\s* \};\s*
        if\s*\(\w+\(\)\s*==\s*[\'\"][^\'\"]+[\'\"]\)\s*\{\s* return\s*[\'\"][^\'\"]+[\'\"];\s*
        \}\s*else\s*\{\s* return\s*\w+\(\);\s* \}\s* \}
        """, re.X)
    l = constant_function_regex4.findall(js)
    for i in l:
        function_name = re.search(
            """
            function\s+(\w+)\(\)\s*\{\s* function\s+\w+\(\)\s*\{\s* return\s+[\'\"]([^\'\"]+)[\'\"];\s* \};\s*
            if\s*\(\w+\(\)\s*==\s*[\'\"]([^\'\"]+)[\'\"]\)\s*\{\s* return\s*[\'\"]([^\'\"]+)[\'\"];\s*
            \}\s*else\s*\{\s* return\s*\w+\(\);\s* \}\s* \}
            """, i, re.X)
        if_else_no_args_return_constant_function_functions.append(
            function_name.groups())
        js = js.replace(i, "")  # remove the function from the script
        a, b, c, d = function_name.groups()
        all_var["%s()" % a] = d if b == c else b

    # Obfuscation pattern: no-arg if/else wrapper functions, constant returned in the else branch.
    if_else_no_args_return_function_constant_functions = []
    """
    function wu_() { function _w() { return 'wu_'; }; if (_w() == 'wu__') { return _w(); } else { return '5%'; } }
    """
    constant_function_regex5 = re.compile(
        """
        function\s+\w+\(\)\s*\{\s* function\s+\w+\(\)\s*\{\s* return\s+[\'\"][^\'\"]+[\'\"];\s* \};\s*
        if\s*\(\w+\(\)\s*==\s*[\'\"][^\'\"]+[\'\"]\)\s*\{\s* return\s*\w+\(\);\s*
        \}\s*else\s*\{\s* return\s*[\'\"][^\'\"]+[\'\"];\s* \}\s* \}
        """, re.X)
    l = constant_function_regex5.findall(js)
    for i in l:
        function_name = re.search(
            """
            function\s+(\w+)\(\)\s*\{\s* function\s+\w+\(\)\s*\{\s* return\s+[\'\"]([^\'\"]+)[\'\"];\s* \};\s*
            if\s*\(\w+\(\)\s*==\s*[\'\"]([^\'\"]+)[\'\"]\)\s*\{\s* return\s*\w+\(\);\s*
            \}\s*else\s*\{\s* return\s*[\'\"]([^\'\"]+)[\'\"];\s* \}\s* \}
            """, i, re.X)
        if_else_no_args_return_function_constant_functions.append(
            function_name.groups())
        js = js.replace(i, "")  # remove the function from the script
        a, b, c, d = function_name.groups()
        all_var["%s()" % a] = b if b == c else d

    # var-assigned identity functions that simply return their argument.
    var_args_equal_value_functions = []
    """
    var ZA_ = function(ZA__) { 'return ZA_'; return ZA__; };
    """
    constant_function_regex1 = re.compile(
        "var\s+[^=]+=\s*function\(\w+\)\{\s*[\'\"]return\s*\w+\s*[\'\"];\s*return\s+\w+;\s*\};")
    l = constant_function_regex1.findall(js)
    for i in l:
        function_name = re.search("var\s+([^=]+)", i).group(1)
        var_args_equal_value_functions.append(function_name)
        js = js.replace(i, "")  # remove the function from the script
        a = function_name
        # Replace every call a(x) with its argument x.
        js = re.sub("%s\(([^\)]+)\)" % a, r"\1", js)

    # var-assigned no-arg functions that return a constant.
    var_no_args_return_constant_functions = []
    """
    var Qh_ = function() { 'return Qh_'; return ';'; };
    """
    constant_function_regex2 = re.compile(
        """
        var\s+[^=]+=\s*function\(\)\{\s* [\'\"]return\s*\w+\s*[\'\"];\s* return\s+[\'\"][^\'\"]+[\'\"];\s* \};
        """, re.X)
    l = constant_function_regex2.findall(js)
    for i in l:
        function_name = re.search(
            """
            var\s+([^=]+)=\s*function\(\)\{\s* [\'\"]return\s*\w+\s*[\'\"];\s* return\s+[\'\"]([^\'\"]+)[\'\"];\s* \};
            """, i, re.X)
        var_no_args_return_constant_functions.append(function_name.groups())
        js = js.replace(i, "")  # remove the function from the script
        a, b = function_name.groups()
        all_var["%s()" % a] = b

    # No-arg functions that return a constant.
    no_args_return_constant_functions = []
    """
    function ZP_() { 'return ZP_'; return 'E'; }
    """
    constant_function_regex3 = re.compile(
        """
        function\s*\w+\(\)\s*\{\s* [\'\"]return\s*[^\'\"]+[\'\"];\s* return\s*[\'\"][^\'\"]+[\'\"];\s* \}\s*
        """, re.X)
    l = constant_function_regex3.findall(js)
    for i in l:
        function_name = re.search(
            """
            function\s*(\w+)\(\)\s*\{\s* [\'\"]return\s*[^\'\"]+[\'\"];\s* return\s*[\'\"]([^\'\"]+)[\'\"];\s* \}\s*
            """, i, re.X)
        no_args_return_constant_functions.append(function_name.groups())
        js = js.replace(i, "")  # remove the function from the script
        a, b = function_name.groups()
        all_var["%s()" % a] = b

    # No-arg functions that return a constant, with no obfuscation code in between.
    no_args_return_constant_sample_functions = []
    """
    function do_() { return ''; }
    """
    constant_function_regex3 = re.compile(
        """
        function\s*\w+\(\)\s*\{\s* return\s*[\'\"][^\'\"]*[\'\"];\s* \}\s*
        """, re.X)
    l = constant_function_regex3.findall(js)
    for i in l:
        function_name = re.search(
            """
            function\s*(\w+)\(\)\s*\{\s* return\s*[\'\"]([^\'\"]*)[\'\"];\s* \}\s*
            """, i, re.X)
        no_args_return_constant_sample_functions.append(function_name.groups())
        js = js.replace(i, "")  # remove the function from the script
        a, b = function_name.groups()
        all_var["%s()" % a] = b

    # No-arg constant IIFEs used inside string concatenation.
    """
    (function() { 'return sZ_'; return '1' })()
    """
    constant_function_regex6 = re.compile(
        """
        \(function\(\)\s*\{\s* [\'\"]return[^\'\"]+[\'\"];\s* return\s*[\'\"][^\'\"]*[\'\"];? \}\)\(\)
        """, re.X)
    l = constant_function_regex6.findall(js)
    for i in l:
        function_name = re.search(
            """
            \(function\(\)\s*\{\s* [\'\"]return[^\'\"]+[\'\"];\s* return\s*([\'\"][^\'\"]*[\'\"]);? \}\)\(\)
            """, i, re.X)
        js = js.replace(i, function_name.group(1))

    # Identity IIFEs (return their argument) used inside string concatenation.
    """
    (function(iU__) { 'return iU_'; return iU__; })('9F')
    """
    constant_function_regex6 = re.compile(
        """
        \(function\(\w+\)\s*\{\s* [\'\"]return[^\'\"]+[\'\"];\s* return\s*\w+; \}\)\([\'\"][^\'\"]*[\'\"]\)
        """, re.X)
    l = constant_function_regex6.findall(js)
    for i in l:
        function_name = re.search(
            """
            \(function\(\w+\)\s*\{\s* [\'\"]return[^\'\"]+[\'\"];\s* return\s*\w+; \}\)\(([\'\"][^\'\"]*[\'\"])\)
            """, i, re.X)
        js = js.replace(i, function_name.group(1))

    # Collect all variables.
    var_regex = "var\s+(\w+)=(.*?);\s"
    for var_name, var_value in re.findall(var_regex, js):
        var_value = var_value.strip("\'\"").strip()
        if "(" in var_value:
            var_value = ";"
        all_var[var_name] = var_value
    # Commented out: this regex may delete essential JS statements.
    # js = re.sub(var_regex, "", js)
    for var_name, var_value in all_var.items():
        js = js.replace(var_name, var_value)
    js = re.sub("[\s+']", "", js)

    # Find the dense URL-encoded region, e.g. %E4%B8%AD%E5%80%92%E...
    # string_region = re.findall("((?:%\w\w)+)", js)
    string_region = re.findall("((?:%\w\w|[A-Za-z\d])+)", js)
    # Deduplicate.
    string_region = set(string_region)
    # Check whether any URL-encoded (Chinese) characters are present.
    chinese_flag = 0
    for string_ in string_region:
        if re.search("%\w\w", string_):
            chinese_flag = 1
    if not chinese_flag:
        # The obfuscated characters may be pure English letters ... not handled yet.
        return []
    string_str = ""
    for string_ in string_region:
        if not re.search("%\w\w", string_):
            continue
        # Skip candidates that cannot be unquoted.
        try:
            string = unquote(string_)
        except:
            continue
        if len(string_) > len(string_str):
            string_str = string_
    string = unquote(string_str)
    string_list = list(string)
    # Look for the index region that follows the dense string region.
    index_m = re.search("([\d,]+(;[\d,]+)+)",
                        js[js.find(string_str) + len(string_str):])
    index_list = index_m.group(1).split(";")
    _word_list = []
    for word_index_list in index_list:
        _word = ""
        if "," in word_index_list:
            word_index_list = word_index_list.split(",")
            word_index_list = [int(x) for x in word_index_list]
        else:
            word_index_list = [int(word_index_list)]
        for word_index in word_index_list:
            try:
                _word += string_list[word_index]
            except IndexError:
                logger.error('IndexError found: {} {} {}'.format(
                    string_list, len(string_list), word_index))
        _word_list.append(_word)
    return _word_list