Example #1
0
def parse_qianyann_item(url):
    """Parse one qianyan news list page.

    For every table row, extract the title, publish date and detail
    link; links already marked done in ES are skipped, otherwise the
    detail page is fetched, stored as a ``GoverNews`` row and the link
    is indexed in Elasticsearch with status 1.

    NOTE(review): relies on module-level ``sess``, ``qianyan_home`` and
    the ES/DB helpers defined elsewhere in this module.
    """
    # Hard-coded session cookie the site requires; consider moving it
    # to configuration instead of the source.
    sess.headers[
        "Cookie"] = "UM_distinctid=16bf50a8450476-00b7d0ed2a109b-e343166-1fa400-16bf50a8451895; _gscu_1516296093=631843228bhl3k13; _gscbrs_1516296093=1; Hm_lvt_062d51b4dcc0576135b683257033659a=1563184338; Hm_lpvt_062d51b4dcc0576135b683257033659a=1563242618; _gscs_1516296093=t6324261780strk14|pv:1"
    r = sess.get(url)
    soup = BeautifulSoup(r.text, "lxml")
    table = soup.find_all("table", {
        "cellpadding": "0",
        "cellspacing": "0",
        "width": "98%"
    })
    for item in table[0].find_all("tr"):
        res = {}
        tds = item.find_all("td")
        res["title"] = tds[1].a.text
        res["publishDate"] = tds[2].span.text.strip()
        de_url = qianyan_home + tds[1].a.get("href")
        res["url"] = de_url
        # ess appears to be (found, status_ok, es_id) — TODO confirm
        # against es_search; skip links that are already done.
        ess = es_search("govnews", de_url)
        if ess[0] and ess[1]:
            continue
        res.update(parse_qianyan_detail(de_url))
        gw = GoverNews(**res)
        with session_scope() as sess1:
            sess1.add(gw)
        EsBackends("govnews").index_data({
            "link": de_url,
            "status": 1,
            "date": time.time()
        })
Example #2
0
def parse_jiehun_item(session, url):
    """Parse a meituan wedding (jiehun) search-result page.

    URLs already marked done in ES are skipped. Otherwise the embedded
    ``window.AppData`` JSON is extracted, each listed shop is stored as
    a ``MeiTuanShop`` row (phone fetched via ``parse_jiehun_phone``),
    and the list URL is recorded in ES with status 1 — or status 0 when
    no data could be found on the page.
    """
    # ess appears to be (found, status_ok, es_id) — TODO confirm.
    ess = es_search("meituan", url)
    if ess[0] and ess[1]:
        return
    time.sleep(random.uniform(1, 3))  # polite crawl delay
    print("pase jiehun url {}".format(url))
    resu = {}
    jiehun_url = "https://www.meituan.com/jiehun/{}/"
    r = session.get(url, timeout=5)
    # Shop data is embedded in the page as "window.AppData = {...};".
    rule = r'window.AppData = (.+?);</script>'
    slotList = re.findall(rule, r.text)
    if slotList:
        res = json.loads(slotList[0])
        shoplist = res.get("searchResult").get("searchResult")
        for item in shoplist:
            resu["score"] = item.get("avgscore")
            resu["shop"] = item.get("title")
            resu["address"] = item.get("address")
            target = jiehun_url.format(item.get("id"))
            resu["url"] = target
            resu.update(parse_jiehun_phone(session, target))
            mt = MeiTuanShop(**resu)
            with session_scope() as session1:
                session1.add(mt)
            # Mark the list URL done. NOTE(review): this runs once per
            # shop with an identical payload — it could be hoisted
            # after the loop.
            if not ess[1] and ess[0]:
                EsBackends("meituan").update_data(id=ess[2],
                                                  body={
                                                      "link": url,
                                                      "status": 1,
                                                      "date": time.time()
                                                  })
            if not ess[0]:
                EsBackends("meituan").index_data({
                    "link": url,
                    "status": 1,
                    "date": time.time()
                })
    else:
        print("获取不到值 {}".format(url))
        # Record the failure (status 0) so the URL can be retried later.
        if not ess[0]:
            EsBackends("meituan").index_data({
                "link": url,
                "status": 0,
                "date": time.time()
            })
        else:
            EsBackends("meituan").update_data(id=ess[2],
                                              body={
                                                  "link": url,
                                                  "status": 0,
                                                  "date": time.time()
                                              })
Example #3
0
 def gov_news(self, url):
     """Scrape one government news list page (``ul.pList01``).

     Each ``<li>`` yields a publish date and a link; links already
     marked done in ES are skipped, otherwise the detail page is parsed
     via ``self.parse_detail``, saved as a ``GoverNews`` row and the
     link indexed in ES with status 1. Per-item parse failures are
     printed and skipped so one bad page does not abort the listing.
     """
     r = self.session.get(url)
     soup = BeautifulSoup(r.text, "lxml")
     res = {}
     ul = soup.find("ul", class_="pList01")
     for item in ul.find_all("li"):
         # Strip the first and last character of the date text —
         # presumably surrounding brackets; confirm against the site.
         res["publishDate"] = item.span.text[1:-1]
         href = item.a.get("href")
         # Relative links need the site root prepended.
         new_url = href if "http" in href else self.home + href
         ess = es_search("govnews", new_url)
         if ess[0] and ess[1]:
             continue
         try:
             resu = self.parse_detail(new_url)
         except Exception as e:
             print(e)
             continue
         res.update(resu)
         gw = GoverNews(**res)
         with session_scope() as sess:
             sess.add(gw)
         EsBackends("govnews").index_data({
             "link": new_url,
             "status": 1,
             "date": time.time()
         })
Example #4
0
def get_hotel_detail(url):
    """Scrape one meituan hotel detail page.

    URLs already marked done in ES are skipped. Otherwise the shop
    name, address, phone and score are extracted, stored as a
    ``MeiTuanShop`` row and the URL recorded in ES with status 1
    (status 0 when nothing usable could be parsed).
    """
    # ess appears to be (found, status_ok, es_id) — TODO confirm.
    ess = es_search("meituan", url)
    if ess[0] and ess[1]:
        return
    result = {}
    time.sleep(random.uniform(1, 3))  # polite crawl delay
    r = session.get(url)
    print("parse hotel {}".format(url))
    r.encoding = "utf-8"
    soup = BeautifulSoup(r.text, 'lxml')
    naspan = soup.find("div", {"class": "breadcrumb-nav"})
    result["shop"] = naspan.text.strip()
    result["url"] = url
    result["openTime"] = "全天"  # hotels treated as open around the clock
    div = soup.find("div", {"class": "mb10"})
    result["address"] = div.find("span").text.strip()
    li = soup.find("li", {"class": "fs14"})
    # The last "mb10" div inside the fs14 item holds the phone number
    # when present; the value follows a ":" separator.
    item = li.find_all("div", {"class": "mb10"})[-1]
    if "电话" in item.text:
        result["phone"] = item.text[item.text.find(":") + 1:]
    score = soup.find("div", {"class": "other-detail-line1-score"})
    result["score"] = score.text.strip()
    if result:
        # FIX: the model instance used to be built before this emptiness
        # check; build it only when there is data to save.
        result["url"] = url
        mt = MeiTuanShop(**result)
        with session_scope() as session1:
            session1.add(mt)
        if not ess[1] and ess[0]:
            EsBackends("meituan").update_data(id=ess[2],
                                              body={
                                                  "link": url,
                                                  "status": 1,
                                                  "date": time.time()
                                              })
        if not ess[0]:
            EsBackends("meituan").index_data({
                "link": url,
                "status": 1,
                "date": time.time()
            })
    else:
        # NOTE(review): unreachable in practice — "shop"/"url"/"openTime"
        # are always set above — kept to preserve the original contract.
        if not ess[0]:
            EsBackends("meituan").index_data({
                "link": url,
                "status": 0,
                "date": time.time()
            })
        else:
            EsBackends("meituan").update_data(id=ess[2],
                                              body={
                                                  "link": url,
                                                  "status": 0,
                                                  "date": time.time()
                                              })
        print("获取值为空 {}".format(url))
Example #5
0
 def parse_detail(self, url):
     """Parse one enterprise-directory detail page and persist it.

     Extracts labelled fields (credit code, name, address, area, dates,
     business scope, legal representative, registered funds, type) from
     the page's fieldset, saves an ``EnterpriseCq`` row and records the
     URL in ES. URLs already indexed with a good status are skipped.
     """
     # ess appears to be (found, status_ok, es_id) — TODO confirm.
     ess = es_search("qiyeminglu", url)
     if not ess[1] or not ess[0]:
         time.sleep(random.uniform(1.5, 2))  # polite crawl delay
         print("parse url {}".format(url))
         r = self.session.get(url)
         soup = BeautifulSoup(r.text, "lxml")
         fs = soup.find("fieldset", {"class": "ad_biger"})
         lis = fs.div.find_all("li")
         res = {}
         for li in lis:
             name = li.find("span", {"class": "field-label"}).text.strip()
             value = li.find("span", {"class": "field-item"}).text.strip()
             # Drop a trailing "点击..." (click-counter) suffix if present.
             if "点击" in value:
                 index = value.find("点击")
                 value = value[:index - 1]
             if "统一社会信用" in name:
                 # `patern` is a module-level regex for the credit code.
                 value = re.findall(patern, value)[0]
                 res["socialCreditCode"] = value
             if "名称" in name:
                 res["enterpriseName"] = value
             if "地址" in name:
                 res["address"] = value
             if "地区" in name:
                 res["area"] = value.strip()
             if "日期" in name:
                 res["registerDate"] = value
             if "范围" in name:
                 res["businessScope"] = value
             if "代表人" in name:
                 res["legalRepresentative"] = value
             if "资金" in name:
                 res["registeredFunds"] = value
             if "类型" in name:
                 if value:
                     res["enterpriseType"] = value
                 else:
                     # Empty value: fall back to the nested <span> of
                     # the last field-item on the page.
                     value = lis[-1].find("span", {
                         "class": "field-item"
                     }).span
                     if value:
                         res["enterpriseType"] = value.text.strip()
                         print(value.text)
         ecq = EnterpriseCq(**res)
         with session_scope() as session1:
             session1.add(ecq)
         if not ess[0]:
             EsBackends("qiyeminglu").index_data({
                 "link": url,
                 "status": 1,
                 "date": time.time()
             })
         else:
             # FIX: update_data was called with a bare positional dict;
             # every sibling caller passes id= and body= keywords.
             EsBackends("qiyeminglu").update_data(id=ess[2],
                                                  body={
                                                      "link": url,
                                                      "status": 1,
                                                      "date": time.time()
                                                  })
Example #6
0
def parse_peixun(driver, url):
    """Scrape a meituan training (peixun) shop page with Selenium.

    URLs already marked done in ES are skipped. Otherwise the page is
    loaded (with one retry on timeout), the comment link clicked, and
    name/score/address/phone/open hours extracted and stored as a
    ``MeiTuanShop`` row; the URL is then recorded in ES with status 1.
    """
    # ess appears to be (found, status_ok, es_id) — TODO confirm.
    ess = es_search("meituan", url)
    if ess[0] and ess[1]:
        return
    time.sleep(random.uniform(2, 4))  # polite crawl delay
    res = {"url": url}
    # The same locator is used for the initial wait and the retry.
    comment_xpath = ('//*[@id="lego-widget-mtpc-shop-head-001-000"]'
                     '/div/div[1]/div[3]/p[2]/span')
    driver.get(url)
    try:
        comment_link = WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.XPATH, comment_xpath)))
    except TimeoutException:
        # One retry: reload the page and wait again.
        driver.get(url)
        comment_link = WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.XPATH, comment_xpath)))
    comment_link.click()
    soup = BeautifulSoup(driver.page_source, "lxml")
    div = soup.find("div", {"class": "mb-flex-1"})
    name = div.find("h1", {"class": "shop-name-title"})
    rank = div.find("div", {"class": "shop-review"})
    ap = div.find("div", class_="shop-address")
    for item in soup.find_all("div",
                              class_="merchant-intro-item clear-both"):
        ti = item.find("div", class_="merchant-intro-title")
        valu = item.find('div', class_="merchant-intro-content")
        if "营业时间" in ti.text:
            # Collapse internal whitespace to single spaces.
            res["openTime"] = " ".join(valu.text.strip().split())
    # Address block holds address and phone, each after a ":" label.
    temp = ap.text.split()
    res["address"] = temp[0][temp[0].find(":") + 1:].strip()
    res["phone"] = temp[1][temp[1].find(":") + 1:].strip()
    res["score"] = rank.text.strip()
    res["shop"] = name.text.strip()
    ms = MeiTuanShop(**res)
    print(res)
    with session_scope() as sess:
        sess.add(ms)
    if not ess[1] and ess[0]:
        EsBackends("meituan").update_data(id=ess[2],
                                          body={
                                              "link": url,
                                              "status": 1,
                                              "date": time.time()
                                          })
    if not ess[0]:
        EsBackends("meituan").index_data({
            "link": url,
            "status": 1,
            "date": time.time()
        })
Example #7
0
def _extract_shop_fields(soup, url):
    """Pull shop name/score/address/phone/open-time from a seller page.

    Falls back to ``parse_shop2`` when the seller-info header is
    missing. Returns a possibly-empty dict of extracted fields.
    """
    result = {}
    head = soup.find("div", {"class": "seller-info-head"})
    if not head:
        result.update(parse_shop2(url))
        return result
    name = head.find("h1", {"class": "seller-name"})
    result["shop"] = name.text.strip()
    score = head.find("span", {"class": "score"})
    result["score"] = score.text.split()[0]
    div = head.find("div", {"class": "seller-info-body"})
    for item in div.find_all("div", {"class": "item"}):
        # Each field value follows a ":" label.
        if "地址" in item.text.strip():
            result["address"] = item.text[item.text.find(":") + 1:]
        if "电话" in item.text:
            result["phone"] = item.text[item.text.find(":") + 1:]
        if "时间" in item.text:
            time1 = item.text[item.text.find(":") + 1:]
            result["openTime"] = " ".join(time1.split())
    return result


def _save_shop(result, url, ess):
    """Persist a parsed shop and mark the URL done (status 1) in ES."""
    result["url"] = url
    mt = MeiTuanShop(**result)
    with session_scope() as session1:
        session1.add(mt)
    if not ess[1] and ess[0]:
        EsBackends("meituan").update_data(id=ess[2],
                                          body={
                                              "link": url,
                                              "status": 1,
                                              "date": time.time()
                                          })
    if not ess[0]:
        EsBackends("meituan").index_data({
            "link": url,
            "status": 1,
            "date": time.time()
        })


def parse_shop(url):
    """Scrape one meituan shop page, retrying up to 4 times on empty data.

    URLs already indexed in ES with a good status are skipped. On a
    successful parse the shop is stored and the URL indexed with
    status 1; after exhausting retries (or on an unexpected error) the
    URL is recorded with status 0 so it can be revisited.
    """
    # ess appears to be (found, status_ok, es_id) — TODO confirm.
    ess = es_search("meituan", url)
    if ess[0] and ess[1]:
        return
    print("parse shop url {}".format(url))
    time.sleep(random.uniform(1, 3))  # polite crawl delay
    r = session.get(url, timeout=5)
    soup = BeautifulSoup(r.text, 'lxml')
    result = _extract_shop_fields(soup, url)
    if result:
        _save_shop(result, url, ess)
        return
    # Retry with a rotated User-Agent; give up after 4 attempts.
    count = 0
    while True:
        print("第{}次重试 {}".format(count + 1, url))
        time.sleep(random.uniform(1, 3))
        session.headers["User-Agent"] = random.choices(USERAGETNS)[0]
        r = session.get(url, timeout=5)
        soup = BeautifulSoup(r.text, 'lxml')
        try:
            result = _extract_shop_fields(soup, url)
            if result:
                _save_shop(result, url, ess)
                # FIX: the original never left the retry loop after a
                # successful parse, re-scraping the same URL forever.
                return
            if count >= 3:
                break
            count = count + 1
        except Exception as e:
            print(count, e)
            break
    # All retries failed: record status 0 so the URL can be retried later.
    if not ess[0]:
        EsBackends("meituan").index_data({
            "link": url,
            "status": 0,
            "date": time.time()
        })
    else:
        EsBackends("meituan").update_data(id=ess[2],
                                          body={
                                              "link": url,
                                              "status": 0,
                                              "date": time.time()
                                          })
    print("获取值为空{}".format(url))