def get_encrypt_json(*, start_date: str, end_date: str, keywords: List[List[str]],
                     type: str, area: int, cookies: str) -> Dict:
    # For type='search'/'news'/'feed', start_date/end_date are date-like objects
    # (``.strftime`` is used below); for type='live' empty strings are passed in.
    pre_url_map = {
        'search': 'http://index.baidu.com/api/SearchApi/index?',
        'live': 'http://index.baidu.com/api/LiveApi/getLive?',
        'news': 'http://index.baidu.com/api/NewsApi/getNewsIndex?',
        'feed': 'http://index.baidu.com/api/FeedSearchApi/getFeedIndex?'
    }
    pre_url = pre_url_map[type]
    word_list = [
        [{'name': keyword, 'wordType': 1} for keyword in keyword_list]
        for keyword_list in keywords
    ]
    if type == 'live':
        request_args = {'word': json.dumps(word_list), 'region': area}
    else:
        request_args = {
            'word': json.dumps(word_list),
            'startDate': start_date.strftime('%Y-%m-%d'),
            'endDate': end_date.strftime('%Y-%m-%d'),
            'area': area
        }
    url = pre_url + urlencode(request_args)
    html = http_get(url, cookies)
    datas = json.loads(html)
    if datas['status'] == 10000:
        raise QdataError(ErrorCode.NO_LOGIN)
    if datas['status'] != 0:
        raise QdataError(ErrorCode.UNKNOWN)
    return datas
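# For reference, the 'word' payload built above for keywords=[['python', 'java'], ['golang']]
# is serialized by json.dumps(word_list) as:
#   [[{"name": "python", "wordType": 1}, {"name": "java", "wordType": 1}],
#    [{"name": "golang", "wordType": 1}]]
# Each inner list is sent to Baidu Index as one keyword group; the grouping
# semantics themselves are defined by the API, not by this module.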
def http_get(url: str, cookies: str) -> str:
    """Send a GET request; every GET in this program goes through this function.

    If you want multi-cookie crawling or request retries, add them here yourself.
    """
    _headers = headers.copy()
    _headers['Cookie'] = cookies
    try:
        response = requests.get(url, headers=_headers, timeout=5)
    except requests.Timeout:
        raise QdataError(ErrorCode.NETWORK_ERROR)
    if response.status_code != 200:
        raise QdataError(ErrorCode.NETWORK_ERROR)
    return response.text
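# The docstring above leaves retries and multi-cookie support to the caller.
# Below is a minimal, optional retry sketch (not part of the original module);
# the retry count and the fixed one-second back-off are arbitrary choices.
def _http_get_with_retry(url: str, cookies: str, retries: int = 3) -> str:
    import time  # local import so the sketch stays self-contained
    last_error = QdataError(ErrorCode.NETWORK_ERROR)
    for _ in range(retries):
        try:
            return http_get(url, cookies)
        except QdataError as error:
            last_error = error
            time.sleep(1)  # simple fixed back-off between attempts
    raise last_error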
def get_search_index(*, keywords_list: List[List[str]], start_date: str,
                     end_date: str, cookies: str, area: int = 0):
    if len(keywords_list) > 5:
        raise QdataError(ErrorCode.KEYWORD_LIMITED)
    for start_date, end_date in common.get_time_range_list(start_date, end_date):
        encrypt_json = common.get_encrypt_json(
            start_date=start_date,
            end_date=end_date,
            keywords=keywords_list,
            type='search',
            area=area,
            cookies=cookies
        )
        encrypt_datas = encrypt_json['data']['userIndexes']
        uniqid = encrypt_json['data']['uniqid']
        key = common.get_key(uniqid, cookies)
        for encrypt_data in encrypt_datas:
            for kind in ALL_KIND:
                encrypt_data[kind]['data'] = common.decrypt_func(
                    key, encrypt_data[kind]['data'])
            for formated_data in format_data(encrypt_data):
                yield formated_data
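# Example (hedged): a minimal sketch of how get_search_index might be consumed.
# The cookie string is a placeholder; keywords and dates are illustrative only.
def _example_search_index():
    cookies = 'BDUSS=...; BAIDUID=...'  # placeholder: paste real Baidu Index cookies
    for row in get_search_index(
            keywords_list=[['python'], ['java']],
            start_date='2023-01-01',
            end_date='2023-01-31',
            cookies=cookies):
        print(row)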
def get_extended_index(*, keywords_list: List[List[str]], start_date: str,
                       end_date: str, cookies: str, area: int, type: str):
    if len(keywords_list) > 5:
        raise QdataError(ErrorCode.KEYWORD_LIMITED)
    for start_date, end_date in common.get_time_range_list(start_date, end_date):
        encrypt_json = common.get_encrypt_json(
            start_date=start_date,
            end_date=end_date,
            keywords=keywords_list,
            type=type,
            area=area,
            cookies=cookies
        )
        encrypt_datas = encrypt_json['data']['index']
        uniqid = encrypt_json['data']['uniqid']
        key = common.get_key(uniqid, cookies)
        for encrypt_data in encrypt_datas:
            encrypt_data['data'] = common.decrypt_func(key, encrypt_data['data'])
            for formated_data in format_data(encrypt_data):
                formated_data['type'] = type
                yield formated_data
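# Example (hedged): get_extended_index covers the non-search indexes; judging by
# the URL map in get_encrypt_json, type is presumably 'news' or 'feed'. The
# cookie string, dates and keywords below are placeholders.
def _example_news_index():
    cookies = 'BDUSS=...; BAIDUID=...'  # placeholder: paste real Baidu Index cookies
    for row in get_extended_index(
            keywords_list=[['python']],
            start_date='2023-01-01',
            end_date='2023-01-31',
            cookies=cookies,
            area=0,
            type='news'):
        print(row)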
def get_live_search_index(*, keywords_list: List[List[str]], cookies: str,
                          area: int = 0):
    if len(keywords_list) > 5:
        raise QdataError(ErrorCode.KEYWORD_LIMITED)
    encrypt_json = common.get_encrypt_json(
        start_date='',
        end_date='',
        keywords=keywords_list,
        type='live',
        area=area,
        cookies=cookies,
    )
    encrypt_datas = encrypt_json['data']['result']
    uniqid = encrypt_json['data']['uniqid']
    key = common.get_key(uniqid, cookies)
    for encrypt_data in encrypt_datas:
        keyword = [
            keyword_info['name'] for keyword_info in encrypt_data['key']
        ]
        if area != 0:
            encrypt_data = encrypt_data['index'][str(area)]
        else:
            encrypt_data = encrypt_data['index'][0]
        for kind in ALL_KIND:
            encrypt_data[kind] = common.decrypt_func(key, encrypt_data[kind])
        for formated_data in format_data(encrypt_data, keyword):
            yield formated_data
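# Example (hedged): reading the live (real-time) index; area=0 mirrors the
# nationwide default above, and the cookie string is a placeholder.
def _example_live_index():
    cookies = 'BDUSS=...; BAIDUID=...'  # placeholder: paste real Baidu Index cookies
    for row in get_live_search_index(keywords_list=[['python']], cookies=cookies):
        print(row)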
def get_cookie_by_qr_login() -> str:
    print("Remember to close the pop-up image window after scanning the QR code...")
    try:
        qrcode_link, sign, callback = get_qrcode_info()
        show_qrcode(qrcode_link)
    except Exception:
        raise QdataError(ErrorCode.GET_QR_FAIL)
    try:
        bduss = get_bduss(sign, callback)
        cookies = get_login_cookie(bduss)
    except Exception:
        raise QdataError(ErrorCode.LOGIN_FAIL)
    try:
        cookies = cookies + get_exin()
    except Exception:
        raise QdataError(ErrorCode.INDEX_LOGIN_FAIL)
    return cookies
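# Example (hedged): the cookie string returned by the QR login flow above is
# what the index fetchers expect in their cookies= parameter.
def _example_qr_login_cookies():
    cookies = get_cookie_by_qr_login()  # scan the QR code, then close the image window
    # pass `cookies` to get_search_index / get_live_search_index / get_extended_index
    return cookies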
def get_exin() -> str:
    """Fetch the nasty token (the __yjs* verification cookie fragment)."""
    url = "https://miao.baidu.com/abdr"
    resp = session.post(url, data=EXIN_TOKEN, headers=HEADERS)
    resp_data = json.loads(resp.text)
    if isinstance(resp_data['data'], dict):
        return "; __yjsv5_shitong={}_{}_{}_{}_{}_{}_{}".format(
            resp_data['data']['ver'],
            resp_data['key_id'],
            resp_data['data']['lid'],
            resp_data['data']['ret_code'],
            resp_data['data']['server_time'],
            resp_data['data']['ip'],
            resp_data['sign']
        )
    elif isinstance(resp_data['data'], str):
        __yjs_st = b64encode(
            quote("_".join([
                resp_data['data'],
                resp_data['key_id'],
                resp_data['sign']
            ])).encode()
        ).decode()
        return "; __yjs_st=2_{}".format(__yjs_st)
    else:
        raise QdataError(ErrorCode.LOGIN_FAIL)
def get_company_count(*,
                      area_code: List[str] = None,
                      category: List[str] = None,
                      reg_capital_range: List[Tuple[int, int]] = None,
                      establish_time_range: List[Tuple[int, int]] = None,
                      reg_status: List[str] = None,
                      capital_unit: List[str] = None,
                      company_type: List[str] = None,
                      institution_type: List[str] = None,
                      staff_num_range: List[Tuple[int, int]] = None,
                      financing_round: List[str] = None,
                      listed_type: List[str] = None,
                      has_phone: bool = None,
                      has_mobile: bool = None,
                      has_email: bool = None,
                      has_brand: bool = None,
                      has_dishonest: bool = None,
                      has_website: bool = None,
                      has_chattel_mortage: bool = None,
                      has_copyright: bool = None,
                      has_soft_copyright: bool = None,
                      is_high_tech_company: bool = None,
                      is_tax_a_level: bool = None,
                      is_general_taxpayer: bool = None,
                      has_bid: bool = None) -> int:
    """
    area_code: location (region codes)
    category: industry category
    reg_capital_range: registered-capital range (10k CNY)
    establish_time_range: founding-time range (milliseconds)
    reg_status: registration status
    capital_unit: capital type / currency unit
    company_type: company type
    institution_type: institution type
    staff_num_range: insured-staff count range (people)
    financing_round: financing round
    listed_type: listing type
    has_phone: has contact phone
    has_mobile: has mobile number
    has_email: has email address
    has_brand: has trademark
    has_dishonest: has dishonesty (credit default) record
    has_website: has website
    has_chattel_mortage: has chattel mortgage
    has_copyright: has work copyright
    has_soft_copyright: has software copyright
    is_high_tech_company: is a high-tech enterprise
    is_tax_a_level: tax credit rating is A
    is_general_taxpayer: is a general taxpayer
    has_bid: has bidding/tender records
    """
    # Flatten (min, max) tuples into the flat lists the API expects.
    if reg_capital_range:
        reg_capital_range = [
            num for num_tuple in reg_capital_range for num in num_tuple
        ]
    if establish_time_range:
        establish_time_range = [
            num for num_tuple in establish_time_range for num in num_tuple
        ]
    if staff_num_range:
        staff_num_range = [
            num for num_tuple in staff_num_range for num in num_tuple
        ]
    query = {
        "areaCodeSet": area_code,
        "categoryGuobiao2017Set": category,
        "regCapitalRangeSet": reg_capital_range,
        "establishTimeRangeSet": establish_time_range,
        "regStatusSet": reg_status,
        "capitalUnitSet": capital_unit,
        "companyTypeSet": company_type,
        "institutionTypeSet": institution_type,
        "staffNumRangeSet": staff_num_range,
        "financingRoundList": financing_round,
        "listedTypeSet": listed_type,
        "hasPhone": has_phone,
        "hasMobile": has_mobile,
        "hasEmail": has_email,
        "hasBrand": has_brand,
        "hasDishonest": has_dishonest,
        "hasWebSite": has_website,
        "hasChattelMortage": has_chattel_mortage,
        "hasCopyright": has_copyright,
        "hasSoftCopyright": has_soft_copyright,
        "isHighTechCompany": is_high_tech_company,
        "taxLevel": is_tax_a_level,
        "isGeneralTaxpayer": is_general_taxpayer,
        "hasBid": has_bid
    }
    # Drop unset filters; booleans are sent as "0"/"1" strings.
    final_query = {"searchType": 2}
    for key, value in query.items():
        if value is None:
            continue
        if isinstance(value, bool):
            final_query[key] = str(int(value))
        else:
            final_query[key] = value
    url = "https://capi.tianyancha.com/cloud-tempest/advance"
    try:
        resp = requests.post(url, json=final_query, headers=headers)
    except Exception:
        raise QdataError(ErrorCode.TYC_COMPANY_COUNT_FAIL)
    resp_data = json.loads(resp.text)
    return int(resp_data['data']['realTotal'])
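# Example (hedged): counting companies with a few filters. The values are
# illustrative; area and category codes follow Tianyancha's own coding and are
# not documented in this module.
def _example_company_count():
    count = get_company_count(
        reg_capital_range=[(100, 500)],   # registered capital between 100 and 500 (10k CNY)
        staff_num_range=[(50, 99)],       # 50-99 insured employees
        is_high_tech_company=True,
        has_website=True,
    )
    print(count)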