コード例 #1
0
def input_city():
    """Prompt for a city name, search mafengwo.cn for it, then dispatch
    a scrape based on an interactive menu choice.

    Side effects: reads from stdin, performs an HTTP request via ``req``,
    and calls ``get_json_page`` (choice 1) or ``get_allcity`` (choice 2).
    Choices 3 and 4 are not implemented. Raises ValueError if the menu
    input is not an integer.
    """
    print("输入要查询的城市:")
    city = input()
    data = {
        "q": city,
        "t": "mdd",
        "seid": "F2D12C42-3811-4E9F-AAE7-121E9F9FE148"
    }
    url = "http://www.mafengwo.cn/search/s.php?" + urlencode(data)
    resp = req.get_html_content(url)
    bs = BeautifulSoup(resp, "lxml")
    # First "lst-nub" result block holds the destination anchors; only
    # anchors 1 (destination page) and 2 (hotel page) are actually used.
    # (The original also read anchors 3 and 4 into unused locals, which
    # could raise IndexError on short result lists — removed.)
    anchors = bs.find_all("div", attrs={"class": "lst-nub"})[0].find_all("a")
    url1 = anchors[1]['href']
    url2 = anchors[2]['href']
    # The destination id is the second-to-last path segment of the URL.
    iMddid = str(url1).split("/")[-2]
    print("选择要查询的具体内容:1、所有景点 2、酒店 3、机场+酒店 4、当地游")
    c = int(input())
    if c == 1:
        get_json_page(iMddid)
    elif c == 2:
        get_allcity(url2)
    elif c == 3:
        # Not implemented yet.
        pass
    else:
        # Not implemented yet.
        pass
コード例 #2
0
ファイル: tieba_spider.py プロジェクト: xuzhifeng24/spider
def get_province_school(href):
    """Fetch one province directory page and scrape every school listed.

    Downloads ``href``, locates the "dir_content_main" table, skips the
    first <td> (header cell) and hands each remaining cell's anchor URL
    to ``parse_html``.
    """
    page = req.get_html_content(href)
    soup = BeautifulSoup(page, "lxml")
    main_div = soup.find_all("div", attrs={"id": "dir_content_main"})[0]
    for cell in main_div.find_all("td")[1:]:
        parse_html(cell.find('a')['href'])
コード例 #3
0
ファイル: tieba_spider.py プロジェクト: xuzhifeng24/spider
def get_page(href):
    """Discover how many paginated listing pages sit behind ``href`` and
    scrape each one concurrently with a process pool.

    The last pagination anchor is the "尾页" (last page) link; its ``pn``
    query parameter carries the total page count. Each page URL is then
    submitted to ``get_province_school`` via a 15-worker pool.

    Raises AttributeError if the "尾页" pattern is not found on the page.
    """
    # Removed the original's dead `threads = []` local — it was never used.
    resp = req.get_html_content(href)
    bs = BeautifulSoup(resp, "lxml")
    last_anchor = bs.find_all("div", class_="pagination")[0].find_all('a')[-1]
    # str() of the tag re-escapes entities, hence matching on "&amp;pn=".
    page = re.search(r"&amp;pn=([0-9]*)\">尾页</a>", str(last_anchor)).group(1)
    pool = mp.Pool(15)
    for i in range(1, int(page) + 1):
        link = href + "&pn={}".format(i)
        pool.apply_async(get_province_school, args=(link, ))
    pool.close()
    pool.join()  # block until every page scrape has finished
コード例 #4
0
ファイル: tieba_spider.py プロジェクト: xuzhifeng24/spider
def parse_html(href):
    """Scrape one tieba listing page and persist each school's stats.

    For every "card_title" block on the page, extracts the school name
    (with the trailing "吧" removed), the follower count and the post
    count, prints them, and inserts a row into the ``gz_db`` table.

    Raises AttributeError if a card lacks the expected count spans.
    """
    resp = req.get_html_content(href)
    bs = BeautifulSoup(resp, "lxml")
    cards = bs.find_all("div", attrs={"class": "card_title"})
    if not cards:
        return
    # Open ONE connection for the whole page. The original opened a new
    # connection per card and never closed any of them (connection leak);
    # it also called cursor.close() on a cursor the `with` block had
    # already closed.
    conn = get_connect()
    try:
        for card in cards:
            school = str(card.find("a").get_text()).strip().replace("吧", "")
            guanzhu = re.search(r'<span class="card_menNum">(.*)</span>',
                                str(card)).group(1)
            tiezi = re.search(r'<span class="card_infoNum">(.*)</span>',
                              str(card)).group(1)
            print(school, "关注:", guanzhu, "帖子:", tiezi)
            with conn.cursor() as cursor:
                sql = "insert into gz_db(school,guanzhu,tieshu) values(%s,%s,%s)"
                cursor.execute(sql, (school, guanzhu, tiezi))
            # Commit per row, as the original did.
            conn.commit()
    finally:
        conn.close()
コード例 #5
0
ファイル: tieba_spider.py プロジェクト: xuzhifeng24/spider
def get_province_name(href):
    """Return the province name found in the page's second <h2> heading.

    Fetches ``href`` and reads the text of the <span> inside the second
    <h2> element of the document.
    """
    html = req.get_html_content(href)
    soup = BeautifulSoup(html, "lxml")
    second_heading = soup.find_all("h2")[1]
    return second_heading.find("span").text
コード例 #6
0
ファイル: tieba_spider.py プロジェクト: xuzhifeng24/spider
def get_province_allschool(province_school):
    """Fetch the tieba directory page listing all schools of a province.

    Builds the "高等院校" (higher-education) directory query for the given
    province and returns the raw HTML fetched via ``req``.
    """
    query = urlencode({"fd": "高等院校", "ie": "utf-8", "sd": province_school})
    return req.get_html_content("http://tieba.baidu.com/f/fdir?" + query)