def parse_qianyan_item(url):
    # NOTE: this cookie is session-bound and will expire; kept verbatim from the source.
    sess.headers["Cookie"] = "UM_distinctid=16bf50a8450476-00b7d0ed2a109b-e343166-1fa400-16bf50a8451895; _gscu_1516296093=631843228bhl3k13; _gscbrs_1516296093=1; Hm_lvt_062d51b4dcc0576135b683257033659a=1563184338; Hm_lpvt_062d51b4dcc0576135b683257033659a=1563242618; _gscs_1516296093=t6324261780strk14|pv:1"
    r = sess.get(url)
    soup = BeautifulSoup(r.text, "lxml")
    table = soup.find_all("table", {"cellpadding": "0", "cellspacing": "0", "width": "98%"})
    trs = table[0].find_all("tr")
    for item in trs:
        res = {}
        tds = item.find_all("td")
        res["title"] = tds[1].a.text
        res["publishDate"] = tds[2].span.text.strip()
        de_url = qianyan_home + tds[1].a.get("href")
        res["url"] = de_url
        ess = es_search("govnews", de_url)
        if ess[0] and ess[1]:
            continue  # already indexed with status 1: skip this article
        res.update(parse_qianyan_detail(de_url))
        gw = GoverNews(**res)
        with session_scope() as sess1:
            sess1.add(gw)
        EsBackends("govnews").index_data({
            "link": de_url,
            "status": 1,
            "date": time.time()
        })
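# --- Hedged sketch (illustration only, not part of the original source) ---
# Every parser in this module branches on the 3-tuple returned by the
# project-internal es_search(index, url). Judging from the call sites, the
# assumed contract is (found, status_ok, doc_id): "found" means a document for
# this link already exists in the index, "status_ok" means its status field is
# 1 (scraped successfully), and "doc_id" is the Elasticsearch _id later passed
# to update_data. A minimal stand-in under those assumptions (the
# EsBackends.search call below is hypothetical):
def _es_search_sketch(index, link):
    hits = EsBackends(index).search(body={"query": {"term": {"link": link}}})
    if not hits:
        return False, False, None
    doc = hits[0]
    return True, doc["_source"].get("status") == 1, doc["_id"]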
def parse_jiehun_item(session, url):
    ess = es_search("meituan", url)
    if ess[0] and ess[1]:
        return  # already scraped successfully
    time.sleep(random.uniform(1, 3))
    print("parse jiehun url {}".format(url))
    jiehun_url = "https://www.meituan.com/jiehun/{}/"
    r = session.get(url, timeout=5)
    # the shop list is embedded in the page as a JS global: window.AppData = {...};
    rule = r'window.AppData = (.+?);</script>'
    slot_list = re.findall(rule, r.text)
    if slot_list:
        res = json.loads(slot_list[0])
        shoplist = res.get("searchResult").get("searchResult")
        for item in shoplist:
            resu = {}  # fresh dict per shop so fields do not leak between rows
            resu["score"] = item.get("avgscore")
            resu["shop"] = item.get("title")
            resu["address"] = item.get("address")
            shop_id = item.get("id")
            target = jiehun_url.format(shop_id)
            resu["url"] = target
            resu.update(parse_jiehun_phone(session, target))
            mt = MeiTuanShop(**resu)
            with session_scope() as session1:
                session1.add(mt)
        # mark the listing URL as scraped once the whole page is processed
        if ess[0] and not ess[1]:
            EsBackends("meituan").update_data(id=ess[2], body={
                "link": url, "status": 1, "date": time.time()
            })
        elif not ess[0]:
            EsBackends("meituan").index_data({
                "link": url, "status": 1, "date": time.time()
            })
    else:
        print("no AppData found for {}".format(url))
        if not ess[0]:
            EsBackends("meituan").index_data({
                "link": url, "status": 0, "date": time.time()
            })
        else:
            EsBackends("meituan").update_data(id=ess[2], body={
                "link": url, "status": 0, "date": time.time()
            })
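# --- Hedged sketch (illustration only) ---
# parse_jiehun_item, get_hotel_detail, parse_peixun and parse_shop all repeat
# the same index-or-update dance when recording a crawl result. A small helper
# built on the same es_search tuple convention would remove that duplication;
# the name below is hypothetical, the EsBackends calls mirror the ones used above.
def mark_crawl_status(index, url, ess, status):
    body = {"link": url, "status": status, "date": time.time()}
    if not ess[0]:
        # no document for this link yet: create one
        EsBackends(index).index_data(body)
    elif not ess[1]:
        # document exists but was not marked done: update it in place
        EsBackends(index).update_data(id=ess[2], body=body)
    # e.g. mark_crawl_status("meituan", url, ess, 1) after a successful save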
def gov_news(self, url):
    r = self.session.get(url)
    soup = BeautifulSoup(r.text, "lxml")
    ul = soup.find("ul", class_="pList01")
    lis = ul.find_all("li")
    for item in lis:
        res = {}  # fresh dict per item so detail fields do not leak between rows
        date = item.span.text
        res["publishDate"] = date[1:-1]  # strip the surrounding brackets
        href = item.a.get("href")
        new_url = href if "http" in href else self.home + href
        ess = es_search("govnews", new_url)
        if ess[0] and ess[1]:
            continue  # already indexed with status 1: skip
        try:
            resu = self.parse_detail(new_url)
        except Exception as e:
            print(e)
            continue
        res.update(resu)
        gw = GoverNews(**res)
        with session_scope() as sess:
            sess.add(gw)
        EsBackends("govnews").index_data({
            "link": new_url,
            "status": 1,
            "date": time.time()
        })
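# --- Hedged sketch (illustration only) ---
# session_scope is used throughout as "with session_scope() as s: s.add(obj)".
# It is presumably the standard SQLAlchemy commit-or-rollback context manager;
# a typical definition looks like the following (Session standing in for the
# project's sessionmaker instance, which is not shown in this section):
from contextlib import contextmanager

@contextmanager
def _session_scope_sketch():
    s = Session()  # hypothetical: the project's sessionmaker
    try:
        yield s
        s.commit()
    except Exception:
        s.rollback()
        raise
    finally:
        s.close()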
def get_hotel_detail(url):
    ess = es_search("meituan", url)
    if ess[0] and ess[1]:
        return  # already scraped successfully
    result = {}
    time.sleep(random.uniform(1, 3))
    r = session.get(url)
    print("parse hotel {}".format(url))
    r.encoding = "utf-8"
    soup = BeautifulSoup(r.text, "lxml")
    naspan = soup.find("div", {"class": "breadcrumb-nav"})
    result["shop"] = naspan.text.strip()
    result["url"] = url
    result["openTime"] = "全天"  # hotels are listed as open all day
    div = soup.find("div", {"class": "mb10"})
    span = div.find("span")
    result["address"] = span.text.strip()
    li = soup.find("li", {"class": "fs14"})
    divs = li.find_all("div", {"class": "mb10"})
    item = divs[-1]
    if "电话" in item.text:
        result["phone"] = item.text[item.text.find(":") + 1:]
    score = soup.find("div", {"class": "other-detail-line1-score"})
    result["score"] = score.text.strip()
    if result:
        mt = MeiTuanShop(**result)  # build the row only once we have data
        with session_scope() as session1:
            session1.add(mt)
        if ess[0] and not ess[1]:
            EsBackends("meituan").update_data(id=ess[2], body={
                "link": url, "status": 1, "date": time.time()
            })
        elif not ess[0]:
            EsBackends("meituan").index_data({
                "link": url, "status": 1, "date": time.time()
            })
    else:
        print("empty result for {}".format(url))
        if not ess[0]:
            EsBackends("meituan").index_data({
                "link": url, "status": 0, "date": time.time()
            })
        else:
            EsBackends("meituan").update_data(id=ess[2], body={
                "link": url, "status": 0, "date": time.time()
            })
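# --- Hedged sketch (illustration only) ---
# get_hotel_detail and parse_shop slice phone/address values on an ASCII ":".
# Chinese pages frequently use the full-width "：" instead, in which case
# find(":") returns -1 and the label is silently kept. A tolerant splitter
# (hypothetical helper, not in the original code):
def after_colon(text):
    for sep in ("：", ":"):
        i = text.find(sep)
        if i != -1:
            return text[i + 1:].strip()
    return text.strip()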
def parse_detail(self, url):
    ess = es_search("qiyeminglu", url)
    if ess[0] and ess[1]:
        return  # already scraped successfully
    time.sleep(random.uniform(1.5, 2))
    print("parse url {}".format(url))
    r = self.session.get(url)
    soup = BeautifulSoup(r.text, "lxml")
    fs = soup.find("fieldset", {"class": "ad_biger"})
    lis = fs.div.find_all("li")
    res = {}
    for li in lis:
        name = li.find("span", {"class": "field-label"}).text.strip()
        value = li.find("span", {"class": "field-item"}).text.strip()
        if "点击" in value:
            # drop the trailing "点击…" (click-count) suffix and the char before it
            value = value[:value.find("点击") - 1]
        if "统一社会信用" in name:
            res["socialCreditCode"] = re.findall(patern, value)[0]
        if "名称" in name:
            res["enterpriseName"] = value
        if "地址" in name:
            res["address"] = value
        if "地区" in name:
            res["area"] = value.strip()
        if "日期" in name:
            res["registerDate"] = value
        if "范围" in name:
            res["businessScope"] = value
        if "代表人" in name:
            res["legalRepresentative"] = value
        if "资金" in name:
            res["registeredFunds"] = value
        if "类型" in name:
            if value:
                res["enterpriseType"] = value
            else:
                # the type is sometimes nested one <span> deeper on the last row
                span = lis[-1].find("span", {"class": "field-item"}).span
                if span:
                    res["enterpriseType"] = span.text.strip()
    ecq = EnterpriseCq(**res)
    with session_scope() as session1:
        session1.add(ecq)
    if not ess[0]:
        EsBackends("qiyeminglu").index_data({
            "link": url, "status": 1, "date": time.time()
        })
    else:
        # the original passed only the body here; update_data needs the doc id
        EsBackends("qiyeminglu").update_data(id=ess[2], body={
            "link": url, "status": 1, "date": time.time()
        })
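# --- Hedged sketch (illustration only) ---
# parse_detail references a module-level "patern" (sic) to pull the 18-character
# unified social credit code out of the field text. It is not defined in this
# section; a plausible definition, assuming the standard USCC alphabet (digits
# plus uppercase letters excluding I, O, S, V, Z), would be:
_patern_sketch = re.compile(r"[0-9A-HJ-NPQRTUWXY]{18}")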
def parse_peixun(driver, url):
    ess = es_search("meituan", url)
    if ess[0] and ess[1]:
        return  # already scraped successfully
    time.sleep(random.uniform(2, 4))
    res = {"url": url}
    driver.get(url)
    xpath = ('//*[@id="lego-widget-mtpc-shop-head-001-000"]'
             '/div/div[1]/div[3]/p[2]/span')
    try:
        comment_link = WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.XPATH, xpath)))
    except TimeoutException:
        # reload once and repeat the wait before giving up
        driver.get(url)
        comment_link = WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.XPATH, xpath)))
    comment_link.click()
    soup = BeautifulSoup(driver.page_source, "lxml")
    div = soup.find("div", {"class": "mb-flex-1"})
    name = div.find("h1", {"class": "shop-name-title"})
    rank = div.find("div", {"class": "shop-review"})
    ap = div.find("div", class_="shop-address")
    intro_items = soup.find_all("div", class_="merchant-intro-item clear-both")
    for item in intro_items:
        ti = item.find("div", class_="merchant-intro-title")
        valu = item.find("div", class_="merchant-intro-content")
        if "营业时间" in ti.text:
            res["openTime"] = " ".join(valu.text.strip().split())
    temp = ap.text.split()
    res["address"] = temp[0][temp[0].find(":") + 1:].strip()
    res["phone"] = temp[1][temp[1].find(":") + 1:].strip()
    res["score"] = rank.text.strip()
    res["shop"] = name.text.strip()
    ms = MeiTuanShop(**res)
    print(res)
    with session_scope() as sess:
        sess.add(ms)
    if ess[0] and not ess[1]:
        EsBackends("meituan").update_data(id=ess[2], body={
            "link": url, "status": 1, "date": time.time()
        })
    elif not ess[0]:
        EsBackends("meituan").index_data({
            "link": url, "status": 1, "date": time.time()
        })
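# --- Hedged sketch (illustration only) ---
# parse_peixun retries a timed-out explicit wait exactly once by repeating the
# driver.get + WebDriverWait pair. A small wrapper makes that pattern reusable;
# the name below is hypothetical, the Selenium calls are the same ones used above.
def wait_for(driver, url, xpath, timeout=15, retries=1):
    for attempt in range(retries + 1):
        try:
            return WebDriverWait(driver, timeout).until(
                EC.presence_of_element_located((By.XPATH, xpath)))
        except TimeoutException:
            if attempt == retries:
                raise
            driver.get(url)  # reload once and try the wait again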
def parse_shop(url):
    ess = es_search("meituan", url)
    if ess[0] and ess[1]:
        return  # already scraped successfully

    def extract(url):
        # one fetch-and-parse attempt; returns whatever fields it could find
        result = {}
        r = session.get(url, timeout=5)
        soup = BeautifulSoup(r.text, "lxml")
        head = soup.find("div", {"class": "seller-info-head"})
        if not head:
            result.update(parse_shop2(url))  # fall back to the alternate layout
            return result
        name = head.find("h1", {"class": "seller-name"})
        result["shop"] = name.text.strip()
        score = head.find("span", {"class": "score"})
        result["score"] = score.text.split()[0]
        div = head.find("div", {"class": "seller-info-body"})
        for item in div.find_all("div", {"class": "item"}):
            text = item.text
            if "地址" in text.strip():
                result["address"] = text[text.find(":") + 1:]
            if "电话" in text:
                result["phone"] = text[text.find(":") + 1:]
            if "时间" in text:
                result["openTime"] = " ".join(text[text.find(":") + 1:].split())
        return result

    print("parse shop url {}".format(url))
    success = False
    for count in range(4):  # first attempt plus up to 3 retries
        if count:
            print("retry #{} for {}".format(count, url))
            session.headers["User-Agent"] = random.choice(USERAGETNS)
        time.sleep(random.uniform(1, 3))
        try:
            result = extract(url)
        except Exception as e:
            print(count, e)
            break
        if result:
            result["url"] = url
            mt = MeiTuanShop(**result)
            with session_scope() as session1:
                session1.add(mt)
            if ess[0] and not ess[1]:
                EsBackends("meituan").update_data(id=ess[2], body={
                    "link": url, "status": 1, "date": time.time()
                })
            elif not ess[0]:
                EsBackends("meituan").index_data({
                    "link": url, "status": 1, "date": time.time()
                })
            success = True
            break  # stop retrying once the shop is saved
    if not success:
        print("empty result for {}".format(url))
        if not ess[0]:
            EsBackends("meituan").index_data({
                "link": url, "status": 0, "date": time.time()
            })
        else:
            EsBackends("meituan").update_data(id=ess[2], body={
                "link": url, "status": 0, "date": time.time()
            })
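# --- Hedged sketch (illustration only) ---
# The retry loop in parse_shop (rotate User-Agent, bounded attempts, random
# back-off) generalizes to any of the requests-based fetchers above. A compact
# version under the same assumptions (session and USERAGETNS defined at module
# level, as the code above expects; the function name is hypothetical):
def fetch_with_retry(url, attempts=4):
    for attempt in range(attempts):
        if attempt:  # keep the first request on the current UA, rotate afterwards
            session.headers["User-Agent"] = random.choice(USERAGETNS)
        time.sleep(random.uniform(1, 3))
        try:
            return session.get(url, timeout=5)
        except Exception:  # broad catch, mirroring the loose handling above
            if attempt == attempts - 1:
                raise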