Example 1
def get_comments(initFilePath, filePath, ip_num):
    # Fetch the full text of each review through a pool of threads and append it to filePath.
    list_loc = get_starter(initFilePath, filePath)
    total_number = len(list_loc)

    ip_list = getProxy(ip_num)

    url_base = "https://cn.tripadvisor.com/OverlayWidgetAjax?Mode=EXPANDED_SUR_REVIEWS_RESP&metaReferer=ShowUserReviewsAttractions&reviewId="

    threading_num = len(ip_list)
    post_url_pool = []
    infor_pool = []
    for i in range(total_number):
        url = url_base + list_loc[i][2][1:]
        if len(post_url_pool) < threading_num and i != total_number - 1:
            post_url_pool.append(url)
            infor_pool.append((list_loc[i][0], list_loc[i][1], list_loc[i][2]))
            print("准备爬取第 {} 个的评论全文 共有 {} 个评论".format(i, total_number))
            continue

        # The last item must both be added to the URL pool and trigger the threads
        if i == total_number - 1:
            print("Preparing to fetch the full text of review {} of {}".format(i, total_number))
            post_url_pool.append(url)
            infor_pool.append((list_loc[i][0], list_loc[i][1], list_loc[i][2]))  # keep infor_pool aligned with post_url_pool

        threads = multi_threading(post_url_pool, ip_list)

        with open(filePath, 'a+', newline='', encoding='utf-8-sig') as csvfile:
            writer = csv.writer(csvfile)
            num = 0
            change_ip = False
            for thread in threads:
                try:
                    (title, content) = thread.get_result()
                    print("title: {}".format(title))
                    # print("content: {}".format(content))
                    writer.writerow([
                        infor_pool[num][0], infor_pool[num][1],
                        infor_pool[num][2], title, content
                    ])
                except Exception as e:
                    print("[Get_List]Error ", e)
                    change_ip = True
                num += 1
            if change_ip:
                ip_list = getProxy(ip_num)

        # Prepare the URLs to feed to the threads on the next pass
        if i != total_number - 1:
            print("Preparing to fetch the full text of review {} of {}".format(i, total_number))
            post_url_pool = [url]
            infor_pool = [(list_loc[i][0], list_loc[i][1], list_loc[i][2])]
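Example 1 relies on a multi_threading(post_url_pool, ip_list) helper whose threads expose get_result(); that helper is not shown in this collection (a three-argument variant also appears later in multi_thread). The sketch below is one plausible minimal implementation of the two-argument form, assuming each thread fetches a single URL through a randomly chosen proxy and hands the page to a caller-supplied parse callable returning a (title, content) pair. FetchThread, the parse parameter and the timeout are assumptions, not the original code.

import threading
from random import choice

import requests


class FetchThread(threading.Thread):
    """Fetch one URL through a proxy and keep the parsed result for the caller."""

    def __init__(self, url, proxies, parse):
        super().__init__()
        self.url = url
        self.proxies = proxies
        self.parse = parse          # callable: html text -> (title, content)
        self.result = None

    def run(self):
        try:
            response = requests.get(self.url, proxies=self.proxies, timeout=10)
            self.result = self.parse(response.text)
        except Exception:
            self.result = None      # get_result() will then raise, which Example 1 treats as a signal to rotate proxies

    def get_result(self):
        if self.result is None:
            raise RuntimeError("no result for " + self.url)
        return self.result


def multi_threading(post_url_pool, ip_list, parse=lambda html: ("", html)):
    # One thread per URL, each routed through a randomly chosen proxy from ip_list.
    threads = [FetchThread(url, choice(ip_list), parse) for url in post_url_pool]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
    return threads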
Example 2
def get_list(province_id, filePath):
    # Crawl the paginated attraction listing for one region and append the parsed links to filePath.
    ip_list = getProxy()
    proxies = choice(ip_list)
    print("[Get_List]The valid IP: ", ip_list)

    # Note: the city slug in url_base_2 is hard-coded, so province_id must correspond to New York City.
    url_base_1 = 'https://cn.tripadvisor.com/Attractions-' + str(
        province_id) + '-Activities-oa'
    url_base_2 = '-New_York_City_New_York.html'

    page = get_starter(filePath)

    while page <= 50:
        num = (page - 1) * 30
        url = url_base_1 + str(num) + url_base_2

        # Send the request through the currently selected proxy
        try:
            response = requests.get(url, proxies=proxies)
        except Exception as e:
            print("[Get_List]Error ", e)
            print("[Get_List]False to spider page " + str(page))
            page -= 1
            ip_list = getProxy()
            proxies = choice(ip_list)
        else:
            print("[Get_List]Success to spider page " + str(page))

            html_code = response.text
            # print(html_code)
            hp = MyHTMLParser()
            hp.feed(html_code)
            hp.close()
            print(hp.links)
            print("[Get_List]Success to spider poi " + str(len(hp.links)))

            # Stop once a page returns no more links
            if len(hp.links) == 0:
                break

            with open(filePath, 'a+', encoding='utf-8') as f:
                for loc in hp.links:
                    loc_list = loc.split("-")
                    # print(loc_list[4])
                    f.write(
                        str(loc_list[2]) + "\t" + str(loc_list[4]) + "\t" +
                        str(loc) + "\t" + str(page) + "\n")
        page += 1
    print("[Get_List]Done spider all the list")
Example 3
def get_comments(initFilePath, filePath):
    # For each POI in the list, collect the links of its review pages and append them to the CSV.
    list_loc = get_starter(initFilePath, filePath)
    total_number = len(list_loc)
    ip_list = getProxy(10)
    print("[Get_List]The valid IP: ", ip_list)

    for i in range(total_number):
        with open(filePath, 'a+', newline='', encoding='utf-8-sig') as csvfile:
            writer = csv.writer(csvfile)
            base_url = list_loc[i][2]
            links = getPoiComments_id(base_url, ip_list)
            for link in links:
                link_sec = link.split("-")
                writer.writerow([link_sec[1], link_sec[2], link_sec[3], link])
            print("[Get_Comments]Done write file " + str(list_loc[i][1]) +
                  " page number is " + str(list_loc[i][3]))
Example 4
            result = thread.get_result()
            print(result)
            try:
                if len(result) == 0:
                    return comments_id
            except Exception as e:
                print("[Get_List]Error ", e)
                continue
            comments_id.extend(result)
            print(len(comments_id))
            print("已经爬到了 {} 个的评论".format(len(comments_id)))

        # Prepare the URLs to feed to the threads on the next pass
        if page != page_num:
            print("准备爬从第 {} 页开始的评论".format(page))
            post_url_pool = [origin + base_sec[0] + "-Reviews-or" + str(num) + base_sec[0]]

        page += 1

    return comments_id


if __name__ == '__main__':
    base_url = "/Attraction_Review-g60763-d1687489-Reviews-The_National_9_11_Memorial_Museum-New_York_City_New_York.html	"
    ip_list = getProxy(10)
    print("[Get_List]The valid IP: ", ip_list)
    links = getPoiComments_id(base_url, ip_list)
    print(len(links))
    for item in links:
        print(item)
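Every snippet calls getProxy() or getProxy(n) and treats each element of the returned list as a proxy mapping that works both as the proxies= argument of requests.get and as the argument to urllib.request.ProxyHandler. The implementation is not included; the sketch below assumes candidate proxies are kept one per line ("ip:port") in a local proxies.txt (a hypothetical file) and are validated against a test URL before being returned.

from random import shuffle

import requests


def getProxy(ip_num=5, source_file="proxies.txt",
             test_url="https://cn.tripadvisor.com"):
    # Read candidate "ip:port" entries from the (hypothetical) source file.
    with open(source_file, encoding='utf-8') as f:
        candidates = [line.strip() for line in f if line.strip()]
    shuffle(candidates)

    ip_list = []
    for address in candidates:
        proxies = {"http": "http://" + address, "https": "http://" + address}
        try:
            # Keep only proxies that can actually reach the target site.
            requests.get(test_url, proxies=proxies, timeout=5)
        except requests.RequestException:
            continue
        ip_list.append(proxies)
        if len(ip_list) >= ip_num:
            break
    return ip_list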
get_url_base = "http://wxapp.mafengwo.cn/gonglve/poi/?jsondata="

json_base_1 = "{%22data_style%22:%22comment_list%22,%22filter_style%22:%22comment%22,%22filter%22:{%22poiid%22:%22"
json_base_2 = "%22,%22tag%22:0},%22page%22:{%22no%22:"
json_base_3 = ",%22num%22:30}}"
poiid = 3474
page = 1

json_str = json_base_1 + str(poiid) + json_base_2 + str(page) + json_base_3

get_url = get_url_base + json_str
print(get_url)


ip_list = getProxy()


filePath = "./data/comment_all.txt"
with open(filePath, 'a+',encoding='utf-8') as f:
    headers['User-Agent'] = choice(userAgent)
    req = urllib.request.Request(get_url, headers=headers)
    proxy_handler = urllib.request.ProxyHandler(choice(ip_list))
    opener = urllib.request.build_opener(proxy_handler)
    try:
        response = opener.open(req)
    except:
        print("[Get_Comments]False to spider")
    else:
        print("[Get_Comments]Success to spider")
        # 返回的是一个json格式的字符串,将字符串转为dict对象
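get_subpoi below calls prepare_request(poiid, sub_page, ip_list, userAgent, headers), which is not shown. Combining the jsondata template at the top of this snippet with the urllib pattern just above gives one plausible sketch; note that the real helper presumably requests a sub-POI listing rather than comment_list, so reusing the json_base_* fragments here is an assumption.

import urllib.request
from random import choice


def prepare_request(poiid, page, ip_list, userAgent, headers):
    # Reuse the json_base_* fragments defined above; the actual data_style for
    # sub-POI pages is unknown and assumed here to follow the same shape.
    json_str = json_base_1 + str(poiid) + json_base_2 + str(page) + json_base_3
    get_url = get_url_base + json_str

    headers['User-Agent'] = choice(userAgent)
    req = urllib.request.Request(get_url, headers=headers)
    opener = urllib.request.build_opener(
        urllib.request.ProxyHandler(choice(ip_list)))
    return opener, req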
def get_subpoi(province_id, initFilePath, filePath):
    # For every POI in the province list, crawl its sub-POIs (if any) and append them to filePath.
    headers, userAgent = head_useragent()

    # Note: the path arguments are rebuilt from province_id, overriding whatever was passed in.
    initFilePath = "./data/" + province_id + "_list_all.txt"
    filePath = "./data/" + province_id + "_list_all_sub.txt"

    list_loc, fatherId, sub_page = get_starter(initFilePath, filePath)

    ip_list = getProxy()

    # For each POI, check whether it has sub-POIs
    for poi in list_loc:
        print("[Get_List]Start to spider: " + str(poi[2]) + " Page " + str(poi[-1]))
        with open(filePath, 'a+', encoding='utf-8') as f:
            # A clever trick:
            # if this is not the first pass through this loop (fatherId is just the first item of the list),
            # the parent-node info needs to be written;
            # if it is the first pass but the file only contains the header row (fatherId == 0),
            # the parent-node info also needs to be written.
            if poi[2] != str(fatherId):
                f.write(str(poi[0]) + "\t" + str(poi[1]) + "\t" +
                        str(poi[2]) + "\t" + str(poi[3]) + "\t" +
                        str(poi[4]) + "\t" + str(poi[5]) + "\t0\t0\t0\n")
        hasMore = True  # whether this parent POI still has another page of sub-POIs
        while hasMore:
            # Build the request payload and request URL
            sub_page += 1
            opener, req = prepare_request(poi[2], sub_page, ip_list, userAgent, headers)

            # Check whether this (parent) POI has sub-POIs; continue if so, otherwise skip it
            try:
                response = opener.open(req)
            except Exception as e:
                print("[Get_List]Error ", e)
                print("[Get_List]False to spider " + str(poi[2]) + " Page " + str(poi[-1]) + " sub_page " + str(
                    sub_page))
                sub_page -= 1
                ip_list = getProxy()
            else:
                # The response is a JSON string; convert it to a dict
                data_json = json.loads(response.read().decode("utf8"))
                data_all = data_json.get("data")
                html = data_all["html"]
                # If there is no controller_data, this POI (one not categorized as an attraction) has no sub-POIs
                if "controller_data" not in data_all:
                    print("[Get_List]This location has no sub-locations")
                    break
                controller_data = data_all["controller_data"]
                # Whether this parent POI still has another page
                hasMore = controller_data["hasMore"]
                # The current page of sub-POIs crawled for this parent POI
                curPage = controller_data["curPage"]
                hp = MyHTMLParser()
                hp.feed(html)
                hp.close()
                # POIs categorized as attractions but without sub-POIs still return controller_data, but the html holds no matching data
                if len(hp.href) == 0:
                    print("[Get_List]This location has no sub-locations")
                    break
                # Sanity check to guard against mismatched fields
                if not len(hp.href) == len(hp.target) == len(hp.people):
                    raise Exception("[Get_List]Sub-POI fields do not match")

                for i in range(len(hp.href)):
                    # Skip sub-POIs visited by no more than 5 people
                    if int(hp.people[i]) <= 5:
                        continue
                    pattern = re.compile("/poi/(.*).html", re.IGNORECASE)
                    sub_poi = pattern.findall(hp.href[i])[0]
                    with open(filePath, 'a+', encoding='utf-8') as f:
                        f.write(str(hp.target[i]) + "\t" + str(poi[1]) + "\t" +
                                str(sub_poi) + "\t" + str(poi[3]) + "\t" +
                                str(poi[4]) + "\t" + str(poi[5]) + "\t" +
                                str(poi[0]) + "\t" + str(poi[2]) + "\t" +
                                str(sub_page) + "\n")

                print("[Get_List]Success to spider " + str(poi[2]) +
                      " Page " + str(poi[-1]) +
                      " sub_page " + str(sub_page))
        # Reset sub_page to zero after all sub-POIs of this parent have been crawled; this cannot go at the top of the loop because sub_page is non-zero on the first iteration.
        sub_page = 0
        print("[Get_List]Finish to spider:" + str(poi[2] + " Page " + str(poi[-1])))
    print("[Get_List]Done spider all the list")

    # A POI may be both a standalone POI and a sub-POI of some other POI, in which case it appears twice
    clean_file(filePath)
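clean_file is referenced above but not defined in this collection. Given the comment about a POI appearing both as a standalone POI and as a sub-POI, the sketch below deduplicates on the POI id column (assumed to be the third tab-separated field), keeping the first occurrence; the real rule may differ.

def clean_file(filePath):
    with open(filePath, encoding='utf-8') as f:
        lines = f.readlines()

    seen = set()
    unique_lines = []
    for line in lines:
        fields = line.rstrip("\n").split("\t")
        # Assumption: the third column holds the POI id and is the dedup key.
        key = fields[2] if len(fields) > 2 else line
        if key in seen:
            continue
        seen.add(key)
        unique_lines.append(line)

    with open(filePath, 'w', encoding='utf-8') as f:
        f.writelines(unique_lines)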
def multi_thread(user_cut, token_pool, Flag, Flag_geo):
    # Batch the segmented user texts, send each batch to the lexical-analysis API through threads,
    # and split the returned nouns into geo and non-geo vocabularies.
    thread_num = len(token_pool) * 2
    geo = set()
    non_geo = set()
    num = len(user_cut)
    batch_string = ""
    user_last = []
    batch_string_list = []
    i = 0
    ip_list = getProxy(7)
    for single in user_cut:
        i += 1
        batch_string_after = '分割'.join([batch_string, single])  # the literal '分割' is used as a record separator inside a batch
        if len(batch_string_after) <= 10000:
            batch_string = batch_string_after
        else:
            batch_string_list.append(batch_string)
            batch_string = single
            if len(batch_string_list) == thread_num:
                print("[仅仅留下名词,构建地理与非地理名词词表] {}/{}".format(i, num))
                try:
                    threads = multi_threading(batch_string_list, token_pool,
                                              ip_list)
                except Exception:
                    print("本次请求失败")
                    # break
                    continue

                k = 0
                for thread in threads:
                    k += 1
                    data_fenci = thread.get_result()
                    # print(data_fenci)

                    if data_fenci is None:
                        print("[解析词表] 第 {} 个线程抓取失败".format(k))
                        ip_list = getProxy(7)
                        continue

                    if "items" not in data_fenci:
                        print(data_fenci)
                        print("[Parse vocab] Thread {} returned no result".format(k))
                        ip_list = getProxy(7)
                        continue

                    # print(data_fenci["text"])
                    print("[解析词表] 正在解析第 {} 个线程抓取的结果".format(k))

                    seg_save = []
                    for item in data_fenci["items"]:
                        ci = item["item"]
                        if len(ci) <= 1 or ci in stopwords or is_uchar(
                                ci) is False:
                            # if len(ci) <= 1 or ci in stopwords:
                            continue
                        if ci == "文章":
                            user_last.append("文章")
                            continue
                        elif item["pos"] in Flag:
                            seg_save.append(ci)
                            non_geo.add(ci)
                        elif item["ne"] in Flag_geo or item["pos"] in Flag_geo:
                            seg_save.append(ci)
                            geo.add(ci)
                        elif ci == "分割":
                            if seg_save:
                                comment = '/'.join(seg_save)
                                user_last.append(comment)
                                seg_save = []
                batch_string_list = []
                # break

    # Note: a final partial batch that never fills batch_string_list is not sent, so trailing input is dropped.
    return user_last, geo, non_geo
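multi_thread filters tokens with an is_uchar helper that is not shown; it presumably checks that a token consists of Chinese characters. A common minimal version is sketched below; the exact ranges accepted by the original are unknown.

def is_uchar(token):
    # True only if every character is in the basic CJK unified ideograph range.
    # Assumption: the original may also accept digits or extended ranges.
    return all('\u4e00' <= ch <= '\u9fa5' for ch in token)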