def request_detail(keyword):
    # asyncio variant (debug version): fan the Baidu result links out as
    # coroutine tasks and just print whatever each fetch returns.
    list_jd = []
    list_1688 = []
    list_b2bbaidu = []
    tasks_list_result = []
    url = "http://www.baidu.com/s?wd=" + keyword
    try:
        r = requests.get(url, headers=Header, timeout=6)
    except Exception as e:
        logging.info("search request failed: {}".format(e))
    else:
        # zloop = asyncio.set_event_loop(zloop)
        zloop = asyncio.get_event_loop()
        tasks = []
        if r:
            # logging.info("keyword request succeeded: {}".format(url))
            print("keyword request succeeded: {}".format(url))
            selector = getXpath(r.text)
            node_list = selector.xpath('''//div[contains(@class,"c-container")]''')
            for l in node_list:
                url_baidu_detail = l.xpath('''h3/a[1]/@href''')
                lingshi_title = l.xpath('''string(h3/a[1])''')
                title = str(lingshi_title)
                if len(url_baidu_detail) > 0:
                    url_baidu_detail = url_baidu_detail[0]
                if url_baidu_detail == "" or len(url_baidu_detail) < 3:
                    continue
                if str(url_baidu_detail).startswith("/sf/"):
                    continue
                index_id = l.xpath("@id")
                index_id = int(index_id[0]) if len(index_id) > 0 else 0
                # asyncio.ensure_future(fetch(url_baidu_detail, title, keyword, index_id))
                tasks.append(asyncio.ensure_future(
                    fetch(url_baidu_detail, title, keyword, index_id)))
            if tasks:  # asyncio.wait() raises ValueError on an empty task set
                zloop.run_until_complete(asyncio.wait(tasks))
            for val in tasks:
                values = val.result()
                print("xjl")
                print(values)
                print("xjl2")
def requests_baidu_keyword(q):
    # gevent variant: results are collected through the global flag_* dicts
    # filled in by the global-dict version of fetch() further below.
    # flag = True
    # while flag:
    keyword = q
    tasks_list = []
    url = "http://www.baidu.com/s?wd=" + keyword
    r = requests.get(url, headers=Header)
    if r:
        logging.info("keyword request succeeded: {}".format(url))
        selector = getXpath(r.text)
        node_list = selector.xpath('''//div[contains(@class,"c-container")]''')
        global flag_jd
        flag_jd = {}
        global flag_1688
        flag_1688 = {}
        global flag_b2bbaidu
        flag_b2bbaidu = {}
        for l in node_list:
            url_baidu_detail = l.xpath('''h3/a[1]/@href''')
            lingshi_title = l.xpath('''string(h3/a[1])''')
            title = str(lingshi_title)
            if len(url_baidu_detail) > 0:
                url_baidu_detail = url_baidu_detail[0]
            if url_baidu_detail == "" or len(url_baidu_detail) < 3:
                continue
            if str(url_baidu_detail).startswith("/sf/"):
                continue
            index_id = l.xpath("@id")
            index_id = int(index_id[0]) if len(index_id) > 0 else 0
            tasks_list.append(gevent.spawn(fetch, url_baidu_detail, title, keyword, index_id))
        gevent.joinall(tasks_list)
        # flag = False
        print(flag_jd)
        print(flag_1688)
        print(flag_b2bbaidu)
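# --- hedged usage sketch (not part of the original code) ---------------------
# The gevent.spawn() version above only overlaps its network requests if the
# standard library is monkey-patched early, before requests opens any sockets;
# without this, each greenlet simply blocks in turn. The keyword below is an
# example for illustration only.
from gevent import monkey
monkey.patch_all()

if __name__ == "__main__":
    requests_baidu_keyword("python")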
def fetch(url, title, keyword, index_id):
    # Synchronous fetch: follow one Baidu redirect link and, if the final URL
    # belongs to one of the monitored sites (supervisory), return a result dict.
    try:
        r2 = requests.get(url, headers=Header, timeout=5)
    except Exception as e:
        logging.info("detail request failed: {}".format(e))
    else:
        if r2 and r2.status_code < 400:
            print(r2.url)
            if (r2.apparent_encoding == "utf-8"
                    or r2.apparent_encoding.startswith("UTF-8")
                    or r2.apparent_encoding == "utf8"):
                r2.encoding = "utf-8"
            elif (r2.apparent_encoding == "GB2312"
                    or r2.apparent_encoding.startswith("ISO-8859")
                    or r2.apparent_encoding.startswith("Windows")):
                r2.encoding = "gbk"
            if supervisory[0] in r2.url:
                logging.info("keyword {} -- jd detail page url: {}".format(keyword, r2.url))
                if "..." in title:
                    # Baidu truncated the result title; fall back to the page <title>.
                    html = getXpath(r2.text)
                    title = html.xpath('''//head/title/text()''')
                    if len(title) > 0:
                        title = title[0]
                dicts = {
                    "tag": "jd",
                    "keyword": keyword,
                    "title": title,
                    "url": r2.url,
                    "index": index_id,
                }
                return dicts
            if supervisory[1] in r2.url:
                logging.info("keyword {} -- 1688 detail page url: {}".format(keyword, r2.url))
                if "..." in title:
                    html = getXpath(r2.text)
                    title = html.xpath('''//head/title/text()''')
                    if len(title) > 0:
                        title = title[0]
                dicts = {
                    "tag": "1688",
                    "keyword": keyword,
                    "title": title,
                    "url": r2.url,
                    "index": index_id,
                }
                return dicts
            if supervisory[2] in r2.url:
                logging.info("keyword {} -- b2bbaidu detail page url: {}".format(keyword, r2.url))
                if "..." in title:
                    html = getXpath(r2.text)
                    title = html.xpath('''//head/title/text()''')
                    if len(title) > 0:
                        title = title[0]
                dicts = {
                    "tag": "b2b.baidu.com",
                    "keyword": keyword,
                    "title": title,
                    "url": r2.url,
                    "index": index_id,
                }
                return dicts
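# --- hedged sketch of the module-level helpers the functions above assume ----
# None of these definitions appear in the snippets themselves; the names and
# values below are guesses kept only so the fragments can run in isolation.
import logging
import requests
from lxml import etree

logging.basicConfig(level=logging.INFO)

Header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36",
}

# Substrings of the monitored shops' URLs; order matters because the code
# indexes supervisory[0] / [1] / [2] as jd, 1688 and b2b.baidu.com.
supervisory = ["jd.com", "1688.com", "b2b.baidu.com"]   # assumed values


def getXpath(text):
    # Stand-in for the project's requests_url.getXpath helper: parse HTML and
    # return an object exposing .xpath(), here via lxml.
    return etree.HTML(text)


def insert_db(record):
    # Placeholder for the real database writer used by the other variants.
    print("would insert:", record)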
def request_detail(keyword):
    # gevent pool variant: spawn fetch() on the shared pool_task pool and keep,
    # per monitored site, the hit with the smallest Baidu result index.
    list_jd = []
    list_1688 = []
    list_b2bbaidu = []
    tasks_list_result = []
    url = "http://www.baidu.com/s?wd=" + keyword
    try:
        r = requests.get(url, headers=Header, timeout=6)
    except Exception as e:
        logging.info("search request failed: {}".format(e))
    else:
        if r:
            tasks_list = []
            logging.info("keyword request succeeded: {}".format(url))
            selector = getXpath(r.text)
            node_list = selector.xpath('''//div[contains(@class,"c-container")]''')
            for l in node_list:
                url_baidu_detail = l.xpath('''h3/a[1]/@href''')
                lingshi_title = l.xpath('''string(h3/a[1])''')
                title = str(lingshi_title)
                if len(url_baidu_detail) > 0:
                    url_baidu_detail = url_baidu_detail[0]
                if url_baidu_detail == "" or len(url_baidu_detail) < 3:
                    continue
                if str(url_baidu_detail).startswith("/sf/"):
                    continue
                index_id = l.xpath("@id")
                index_id = int(index_id[0]) if len(index_id) > 0 else 0
                tasks_list.append(pool_task.spawn(fetch, url_baidu_detail, title, keyword, index_id))
            gevent.joinall(tasks_list)
            for tasks_result in tasks_list:
                result = tasks_result.value
                if result:
                    if result["tag"] == "jd":
                        list_jd.append(result)
                    if result["tag"] == "1688":
                        list_1688.append(result)
                    if result["tag"] == "b2b.baidu.com":
                        list_b2bbaidu.append(result)
    # Keep only the best-ranked (lowest index) hit per site.
    if len(list_jd) > 0:
        tasks_list_result.append(min(list_jd, key=lambda x: x['index']))
    if len(list_1688) > 0:
        tasks_list_result.append(min(list_1688, key=lambda x: x['index']))
    if len(list_b2bbaidu) > 0:
        tasks_list_result.append(min(list_b2bbaidu, key=lambda x: x['index']))
    # for dict_param in tasks_list_result:
    #     executor_thread.submit(insert_db, dict_param)
    # print(tasks_list_result)
    return tasks_list_result
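# --- hedged driver sketch for the pool_task variant above --------------------
# pool_task is referenced but never defined in the snippet; a plausible setup,
# with gevent monkey-patching applied as in the earlier sketch, is a shared
# gevent.pool.Pool capping the number of concurrent detail fetches.
from gevent.pool import Pool

pool_task = Pool(10)        # assumed pool size

if __name__ == "__main__":
    for result in request_detail("python"):     # example keyword only
        insert_db(result)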
def requests_baidu_keyword(q):
    # Queue-driven worker: take keywords off the queue and, for each monitored
    # site, insert only the first (highest-ranked) matching result.
    while True:
        keyword = q.get()
        flag_jd = False
        flag_1688 = False
        flag_b2bbaidu = False
        url = "http://www.baidu.com/s?wd=" + keyword
        r = requests.get(url, headers=Header)
        if r:
            logging.info("keyword request succeeded: {}".format(url))
            selector = getXpath(r.text)
            node_list = selector.xpath('''//div[contains(@class,"c-container")]''')
            for l in node_list:
                url_baidu_detail = l.xpath('''h3/a[1]/@href''')
                lingshi_title = l.xpath('''string(h3/a[1])''')
                title = str(lingshi_title)
                if len(url_baidu_detail) > 0:
                    url_baidu_detail = url_baidu_detail[0]
                if url_baidu_detail == "" or len(url_baidu_detail) < 3:
                    continue
                if str(url_baidu_detail).startswith("/sf/"):
                    continue
                try:
                    r2 = requests.get(url_baidu_detail, headers=Header, verify=False, timeout=8)
                except Exception as e:
                    logging.warning(e)
                else:
                    if r2:
                        print(r2.url)
                        if (r2.apparent_encoding == "utf-8"
                                or r2.apparent_encoding.startswith("UTF-8")
                                or r2.apparent_encoding == "utf8"):
                            r2.encoding = "utf-8"
                        elif (r2.apparent_encoding == "GB2312"
                                or r2.apparent_encoding.startswith("ISO-8859")
                                or r2.apparent_encoding.startswith("Windows")):
                            r2.encoding = "gbk"
                        if supervisory[0] in r2.url and not flag_jd:
                            flag_jd = True
                            index_id = l.xpath("@id")
                            index_id = int(index_id[0]) if len(index_id) > 0 else 0
                            logging.info("keyword {} -- jd detail page url: {}".format(keyword, r2.url))
                            if "..." in title:
                                html = getXpath(r2.text)
                                title = html.xpath('''//head/title/text()''')
                                if len(title) > 0:
                                    title = title[0]
                            dicts = {"tag": "jd", "keyword": keyword, "title": title,
                                     "url": r2.url, "index": index_id}
                            insert_db(dicts)
                        if supervisory[1] in r2.url and not flag_1688:
                            flag_1688 = True
                            index_id = l.xpath("@id")
                            index_id = int(index_id[0]) if len(index_id) > 0 else 0
                            logging.info("keyword {} -- 1688 detail page url: {}".format(keyword, r2.url))
                            if "..." in title:
                                html = getXpath(r2.text)
                                title = html.xpath('''//head/title/text()''')
                                if len(title) > 0:
                                    title = title[0]
                            dicts = {"tag": "1688", "keyword": keyword, "title": title,
                                     "url": r2.url, "index": index_id}
                            insert_db(dicts)
                        if supervisory[2] in r2.url and not flag_b2bbaidu:
                            flag_b2bbaidu = True
                            index_id = l.xpath("@id")
                            index_id = int(index_id[0]) if len(index_id) > 0 else 0
                            logging.info("keyword {} -- b2bbaidu detail page url: {}".format(keyword, r2.url))
                            if "..." in title:
                                html = getXpath(r2.text)
                                title = html.xpath('''//head/title/text()''')
                                if len(title) > 0:
                                    title = title[0]
                            dicts = {"tag": "b2b.baidu.com", "keyword": keyword, "title": title,
                                     "url": r2.url, "index": index_id}
                            insert_db(dicts)
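# --- hedged driver sketch for the queue-based worker above -------------------
# The worker loops on q.get() forever, so it is presumably meant to run in
# background threads fed by a producer. Worker count and keywords below are
# illustrative assumptions only.
import threading
import time
from queue import Queue

if __name__ == "__main__":
    q = Queue()
    for _ in range(4):                                   # assumed worker count
        threading.Thread(target=requests_baidu_keyword, args=(q,), daemon=True).start()
    for kw in ["phone case", "laptop stand"]:            # example keywords only
        q.put(kw)
    time.sleep(60)   # crude: give the daemon workers time before the process exits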
# -*- coding: utf-8 -*-
# **************************file desc*****************************
__author__ = 'yushanshan'
# createTime : 2019/7/18 19:54
# desc : this is new py file, please write your desc for this file
# ****************************************************************
from requests_url import getXpath
import requests

Header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36",
}

r = requests.get("https://www.aizhan.com/cha/xinnet.com/", headers=Header, verify=False)
html = getXpath(r.text)
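# --- optional: silence the InsecureRequestWarning triggered by verify=False ---
# Not in the original file; shown only as a common companion to verify=False.
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)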
def fetch(url, title, keyword, index_id):
    # Global-dict variant: instead of returning a result, record hits into the
    # module-level flag_jd / flag_1688 / flag_b2bbaidu dicts keyed by result index.
    r2 = requests.get(url, headers=Header)
    if r2 and r2.status_code < 400:
        print(r2.url)
        if (r2.apparent_encoding == "utf-8"
                or r2.apparent_encoding.startswith("UTF-8")
                or r2.apparent_encoding == "utf8"):
            r2.encoding = "utf-8"
        elif (r2.apparent_encoding == "GB2312"
                or r2.apparent_encoding.startswith("ISO-8859")
                or r2.apparent_encoding.startswith("Windows")):
            r2.encoding = "gbk"
        if supervisory[0] in r2.url:
            logging.info("keyword {} -- jd detail page url: {}".format(keyword, r2.url))
            if "..." in title:
                html = getXpath(r2.text)
                title = html.xpath('''//head/title/text()''')
                if len(title) > 0:
                    title = title[0]
            global flag_jd
            flag_jd[str(index_id)] = {"tag": "jd", "title": title, "url": r2.url}
        if supervisory[1] in r2.url:
            logging.info("keyword {} -- 1688 detail page url: {}".format(keyword, r2.url))
            if "..." in title:
                html = getXpath(r2.text)
                title = html.xpath('''//head/title/text()''')
                if len(title) > 0:
                    title = title[0]
            global flag_1688
            flag_1688[str(index_id)] = {"tag": "1688", "title": title, "url": r2.url}
        if supervisory[2] in r2.url:
            logging.info("keyword {} -- b2bbaidu detail page url: {}".format(keyword, r2.url))
            if "..." in title:
                html = getXpath(r2.text)
                title = html.xpath('''//head/title/text()''')
                if len(title) > 0:
                    title = title[0]
            global flag_b2bbaidu
            flag_b2bbaidu[str(index_id)] = {"tag": "b2bbaidu", "title": title, "url": r2.url}
def request_detail(keyword):
    # asyncio variant: fetch() here is a coroutine that, judging by the
    # unpacking below, returns a tuple
    #   (page_text_or_"1", final_url, title, keyword, index_id, tag).
    list_jd = []
    list_1688 = []
    list_b2bbaidu = []
    tasks_list_result = []
    url = "http://www.baidu.com/s?wd=" + keyword
    try:
        r = requests.get(url, headers=Header, timeout=6)
    except Exception as e:
        logging.info("search request failed: {}".format(e))
    else:
        # zloop = asyncio.set_event_loop(zloop)
        zloop = asyncio.get_event_loop()
        tasks = []
        if r:
            # logging.info("keyword request succeeded: {}".format(url))
            print("keyword request succeeded: {}".format(url))
            selector = getXpath(r.text)
            node_list = selector.xpath('''//div[contains(@class,"c-container")]''')
            for l in node_list:
                url_baidu_detail = l.xpath('''h3/a[1]/@href''')
                lingshi_title = l.xpath('''string(h3/a[1])''')
                title = str(lingshi_title)
                if len(url_baidu_detail) > 0:
                    url_baidu_detail = url_baidu_detail[0]
                if url_baidu_detail == "" or len(url_baidu_detail) < 3:
                    continue
                if str(url_baidu_detail).startswith("/sf/"):
                    continue
                index_id = l.xpath("@id")
                index_id = int(index_id[0]) if len(index_id) > 0 else 0
                # asyncio.ensure_future(fetch(url_baidu_detail, title, keyword, index_id))
                tasks.append(asyncio.ensure_future(
                    fetch(url_baidu_detail, title, keyword, index_id)))
            if tasks:  # asyncio.wait() raises ValueError on an empty task set
                zloop.run_until_complete(asyncio.wait(tasks))
            for val in tasks:
                values = val.result()
                if values[0] != "1":  # "1" in the first slot marks a failed fetch
                    title = values[2]
                    if "..." in title:
                        # Baidu truncated the title; recover it from the page HTML.
                        html = getXpath(values[0])
                        title = html.xpath('''//head/title/text()''')
                        if len(title) > 0:
                            title = title[0]
                    dicts = {
                        "tag": values[5],
                        "keyword": values[3],
                        "title": title,
                        "url": values[1],
                        "index": values[4],
                    }
                    if dicts["tag"] == "jd":
                        list_jd.append(dicts)
                    if dicts["tag"] == "1688":
                        list_1688.append(dicts)
                    if dicts["tag"] == "b2b.baidu.com":
                        list_b2bbaidu.append(dicts)
                else:
                    # logging.info("error2 stop " + str(values[1]))
                    print("error2 stop " + str(values[1]))
            # Keep only the best-ranked (lowest index) hit per site.
            if len(list_jd) > 0:
                tasks_list_result.append(min(list_jd, key=lambda x: x['index']))
            if len(list_1688) > 0:
                tasks_list_result.append(min(list_1688, key=lambda x: x['index']))
            if len(list_b2bbaidu) > 0:
                tasks_list_result.append(min(list_b2bbaidu, key=lambda x: x['index']))
            # for d in tasks_list_result:
            #     insert_db(d)
            print(tasks_list_result)
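# --- hedged sketch of the coroutine this asyncio variant expects -------------
# The async fetch() used above is not shown in the snippets. Judging by how the
# results are unpacked, it returns a 6-tuple
#   (page_text_or_"1", final_url, title, keyword, index_id, tag)
# where "1" marks a failure. The aiohttp-based version below is only a guess at
# what such a coroutine might look like, reusing the assumed Header/supervisory.
import asyncio
import logging

import aiohttp


async def fetch(url, title, keyword, index_id):
    tag = ""
    try:
        timeout = aiohttp.ClientTimeout(total=8)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.get(url, headers=Header) as resp:
                text = await resp.text(errors="ignore")
                final_url = str(resp.url)
    except Exception as e:
        logging.info("async fetch failed: {}".format(e))
        return "1", url, title, keyword, index_id, tag
    if supervisory[0] in final_url:
        tag = "jd"
    elif supervisory[1] in final_url:
        tag = "1688"
    elif supervisory[2] in final_url:
        tag = "b2b.baidu.com"
    return text, final_url, title, keyword, index_id, tag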
def requests_baidu_keyword(keyword):
    # ThreadPoolExecutor variant. Note that calling future.result() right after
    # submit() blocks until that task finishes, so the detail requests still run
    # one at a time here.
    executor = ThreadPoolExecutor(max_workers=16)
    tasks_list = []
    list_jd = []
    list_1688 = []
    list_b2bbaidu = []
    url = "http://www.baidu.com/s?wd=" + keyword
    try:
        r = requests.get(url, headers=Header, timeout=6)
    except Exception as e:
        logging.info("search request failed: {}".format(e))
    else:
        if r:
            logging.info("keyword request succeeded: {}".format(url))
            selector = getXpath(r.text)
            node_list = selector.xpath('''//div[contains(@class,"c-container")]''')
            for l in node_list:
                url_baidu_detail = l.xpath('''h3/a[1]/@href''')
                lingshi_title = l.xpath('''string(h3/a[1])''')
                title = str(lingshi_title)
                if len(url_baidu_detail) > 0:
                    url_baidu_detail = url_baidu_detail[0]
                if url_baidu_detail == "" or len(url_baidu_detail) < 3:
                    continue
                if str(url_baidu_detail).startswith("/sf/"):
                    continue
                index_id = l.xpath("@id")
                index_id = int(index_id[0]) if len(index_id) > 0 else 0
                # tasks_list.append(gevent.spawn(fetch, url_baidu_detail, title, keyword, index_id))
                # gevent.joinall(tasks_list)
                future = executor.submit(fetch, url_baidu_detail, title, keyword, index_id)
                lingshi = future.result()
                if lingshi:
                    if lingshi["tag"] == "jd":
                        list_jd.append(lingshi)
                    if lingshi["tag"] == "1688":
                        list_1688.append(lingshi)
                    if lingshi["tag"] == "b2b.baidu.com":
                        list_b2bbaidu.append(lingshi)
            # Keep only the best-ranked (lowest index) hit per site.
            if len(list_jd) > 0:
                tasks_list.append(min(list_jd, key=lambda x: x['index']))
            if len(list_1688) > 0:
                tasks_list.append(min(list_1688, key=lambda x: x['index']))
            if len(list_b2bbaidu) > 0:
                tasks_list.append(min(list_b2bbaidu, key=lambda x: x['index']))
            for dict_param in tasks_list:
                executor.submit(insert_db, dict_param)
    executor.shutdown(wait=True)   # wait for any pending insert_db calls to finish
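# --- hedged note on the executor usage above ---------------------------------
# Because future.result() is called inside the submit loop, the detail pages are
# still fetched sequentially. If parallel fetches are wanted, one option (not in
# the original code) is to submit everything first and collect afterwards:
from concurrent.futures import ThreadPoolExecutor, as_completed


def fetch_all(items, max_workers=16):
    # items: iterable of (url, title, keyword, index_id) tuples for fetch().
    results = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(fetch, *item) for item in items]
        for future in as_completed(futures):
            result = future.result()
            if result:
                results.append(result)
    return results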