import csv

# getProxy(), get_starter() and multi_threading() are helpers defined elsewhere in this project.


def get_comments(initFilePath, filePath, ip_num):
    list_loc = get_starter(initFilePath, filePath)
    total_number = len(list_loc)
    ip_list = getProxy(ip_num)
    url_base = ("https://cn.tripadvisor.com/OverlayWidgetAjax?Mode=EXPANDED_SUR_REVIEWS_RESP"
                "&metaReferer=ShowUserReviewsAttractions&reviewId=")
    threading_num = len(ip_list)
    post_url_pool = []
    infor_pool = []
    for i in range(total_number):
        # list_loc[i][2] holds the review id with a leading "r"; strip it for the reviewId= parameter
        url = url_base + list_loc[i][2][1:]
        if len(post_url_pool) < threading_num and i != total_number - 1:
            post_url_pool.append(url)
            infor_pool.append((list_loc[i][0], list_loc[i][1], list_loc[i][2]))
            print("Preparing to fetch the full text of review {} ({} reviews in total)".format(i, total_number))
            continue
        # The last item must both join the URL pool and trigger the worker threads
        if i == total_number - 1:
            print("Preparing to fetch the full text of review {} ({} reviews in total)".format(i, total_number))
            post_url_pool.append(url)
            infor_pool.append((list_loc[i][0], list_loc[i][1], list_loc[i][2]))  # keep infor_pool aligned with post_url_pool
        threads = multi_threading(post_url_pool, ip_list)
        with open(filePath, 'a+', newline='', encoding='utf-8-sig') as csvfile:
            writer = csv.writer(csvfile)
            num = 0
            change_ip = False
            for thread in threads:
                try:
                    (title, content) = thread.get_result()
                    print("title: {}".format(title))
                    # print("content: {}".format(content))
                    writer.writerow([
                        infor_pool[num][0], infor_pool[num][1],
                        infor_pool[num][2], title, content
                    ])
                except Exception as e:
                    print("[Get_List]Error ", e)
                    change_ip = True
                num += 1
            if change_ip:
                ip_list = getProxy(ip_num)
        # Prepare the URLs to feed to the threads in the next batch
        if i != total_number - 1:
            print("Preparing to fetch the full text of review {} ({} reviews in total)".format(i, total_number))
            post_url_pool = [url]
            infor_pool = [(list_loc[i][0], list_loc[i][1], list_loc[i][2])]
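# multi_threading() and the .get_result() calls above come from the project's own helper module,
# which is not part of this excerpt. Purely as an illustrative sketch of the interface that
# get_comments() relies on -- not the project's actual implementation -- a result-capturing
# Thread wrapper could look like this (ResultThread and multi_threading_sketch are hypothetical names):
import threading

import requests


class ResultThread(threading.Thread):
    """Thread that stores its target's return value so it can be read after join()."""

    def __init__(self, url, proxies):
        super().__init__()
        self.url = url
        self.proxies = proxies
        self.result = None

    def run(self):
        # Fetch the page through the assigned proxy. Parsing out (title, content) is left
        # out here because it depends on the project's HTML parsing helpers.
        response = requests.get(self.url, proxies=self.proxies, timeout=10)
        self.result = response.text

    def get_result(self):
        # None signals that run() raised before a result was stored
        return self.result


def multi_threading_sketch(url_pool, ip_list):
    """Start one thread per URL, round-robining over the proxy list, and wait for all of them."""
    threads = [ResultThread(url, ip_list[i % len(ip_list)]) for i, url in enumerate(url_pool)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    return threads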
import requests
from random import choice

# getProxy(), get_starter() and MyHTMLParser are helpers defined elsewhere in this project.


def get_list(province_id, filePath):
    ip_list = getProxy()
    proxies = choice(ip_list)
    print("[Get_List]The valid IP: ", ip_list)
    # NOTE: the geo slug is currently hard-coded to New York City
    url_base_1 = 'https://cn.tripadvisor.com/Attractions-' + str(province_id) + '-Activities-oa'
    url_base_2 = '-New_York_City_New_York.html'
    page = get_starter(filePath)
    while page <= 50:
        num = (page - 1) * 30
        url = url_base_1 + str(num) + url_base_2
        # Send the request through the chosen proxy
        try:
            response = requests.get(url, proxies=proxies)
        except Exception as e:
            print("[Get_List]Error ", e)
            print("[Get_List]Failed to spider page " + str(page))
            page -= 1  # retry the same page with a fresh proxy
            ip_list = getProxy()
            proxies = choice(ip_list)
        else:
            print("[Get_List]Succeeded in spidering page " + str(page))
            html_code = response.text
            # print(html_code)
            hp = MyHTMLParser()
            hp.feed(html_code)
            hp.close()
            print(hp.links)
            print("[Get_List]Succeeded in spidering " + str(len(hp.links)) + " POIs")
            # Stop once a page yields no more attraction links
            if len(hp.links) == 0:
                break
            for loc in hp.links:
                with open(filePath, 'a+', encoding='utf-8') as f:
                    # loc_list[2] is the location id (d...), loc_list[4] the attraction name slug
                    loc_list = loc.split("-")
                    # print(loc_list[4])
                    f.write(str(loc_list[2]) + "\t" + str(loc_list[4]) + "\t" +
                            str(loc) + "\t" + str(page) + "\n")
        page += 1
    print("[Get_List]Done spidering the whole list")
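# MyHTMLParser is defined elsewhere in the repo and is not shown here. As a rough, assumed
# approximation of what get_list() needs from it -- collecting the "/Attraction_Review-..."
# hrefs of a listing page into a .links attribute -- one could write (AttractionLinkParser
# is a hypothetical name, not the project's class):
from html.parser import HTMLParser


class AttractionLinkParser(HTMLParser):
    """Collect unique /Attraction_Review-... hrefs from a TripAdvisor listing page."""

    def __init__(self):
        super().__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag != "a":
            return
        for name, value in attrs:
            if (name == "href" and value and value.startswith("/Attraction_Review-")
                    and value not in self.links):
                self.links.append(value)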
import csv

# get_starter(), getProxy() and getPoiComments_id() are helpers defined elsewhere in this project.


def get_comments(initFilePath, filePath):
    list_loc = get_starter(initFilePath, filePath)
    total_number = len(list_loc)
    ip_list = getProxy(10)
    print("[Get_Comments]The valid IP: ", ip_list)
    for i in range(total_number):
        with open(filePath, 'a+', newline='', encoding='utf-8-sig') as csvfile:
            writer = csv.writer(csvfile)
            base_url = list_loc[i][2]
            links = getPoiComments_id(base_url, ip_list)
            for link in links:
                # Each review link splits into geo id, location id and review id
                link_sec = link.split("-")
                writer.writerow([link_sec[1], link_sec[2], link_sec[3], link])
            print("[Get_Comments]Finished writing " + str(list_loc[i][1]) +
                  ", page number is " + str(list_loc[i][3]))
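# Example invocation (the file names are hypothetical; get_starter() is expected to resume
# from whatever has already been written to the output csv):
if __name__ == '__main__':
    get_comments("./data/poi_list.txt", "./data/comments_id.csv")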
# Fragment: tail of getPoiComments_id(), inside its paging loop, after a batch of
# worker threads has been started for the current review page.
            result = thread.get_result()
            print(result)
            try:
                # A failed thread may return None; len(None) then raises and the thread is skipped
                if len(result) == 0:
                    return comments_id
            except Exception as e:
                print("[Get_List]Error ", e)
                continue
            comments_id.extend(result)
            print(len(comments_id))
            print("Collected {} review links so far".format(len(comments_id)))
        # Prepare the URLs for the next batch of threads
        if page != page_num:
            print("Preparing to crawl reviews starting from page {}".format(page))
            # Paginated review pages use the "-Reviews-or<offset>-" pattern; base_sec is assumed
            # to come from splitting base_url on "-Reviews", so the suffix is base_sec[1]
            post_url_pool = [origin + base_sec[0] + "-Reviews-or" + str(num) + base_sec[1]]
        page += 1
    return comments_id


if __name__ == '__main__':
    base_url = "/Attraction_Review-g60763-d1687489-Reviews-The_National_9_11_Memorial_Museum-New_York_City_New_York.html"
    ip_list = getProxy(10)
    print("[Get_List]The valid IP: ", ip_list)
    links = getPoiComments_id(base_url, ip_list)
    print(len(links))
    for item in links:
        print(item)
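# The review links collected above are later split on "-" (see get_comments in the listing
# script): fields 1-3 are the geo id, the location id and the review id, and the leading "r"
# of the review id is stripped before it is appended to the OverlayWidgetAjax reviewId=
# parameter. A quick illustration with an assumed link layout (the slug is hypothetical):
example_link = "/ShowUserReviews-g60763-d1687489-r123456789-Some_Review_Title-New_York_City_New_York.html"
geo_id, loc_id, review_id = example_link.split("-")[1:4]
print(geo_id, loc_id, review_id)  # g60763 d1687489 r123456789
print(review_id[1:])              # 123456789 -> value used for reviewId=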
import json
import urllib.request
from random import choice

# getProxy() and head_useragent() are helpers defined elsewhere in this project.
headers, userAgent = head_useragent()

get_url_base = "http://wxapp.mafengwo.cn/gonglve/poi/?jsondata="
json_base_1 = "{%22data_style%22:%22comment_list%22,%22filter_style%22:%22comment%22,%22filter%22:{%22poiid%22:%22"
json_base_2 = "%22,%22tag%22:0},%22page%22:{%22no%22:"
json_base_3 = ",%22num%22:30}}"
poiid = 3474
page = 1
json_str = json_base_1 + str(poiid) + json_base_2 + str(page) + json_base_3
get_url = get_url_base + json_str
print(get_url)
ip_list = getProxy()
filePath = "./data/comment_all.txt"
with open(filePath, 'a+', encoding='utf-8') as f:
    headers['User-Agent'] = choice(userAgent)
    req = urllib.request.Request(get_url, headers=headers)
    proxy_handler = urllib.request.ProxyHandler(choice(ip_list))
    opener = urllib.request.build_opener(proxy_handler)
    try:
        response = opener.open(req)
    except Exception as e:
        print("[Get_Comments]Failed to spider: ", e)
    else:
        print("[Get_Comments]Succeeded in spidering")
        # The response body is a JSON string; turn it into a dict
        data_json = json.loads(response.read().decode("utf8"))
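# For reference, the %22-escaped jsondata parameter built above is just URL-encoded JSON.
# A quick way to inspect it with the standard library (nothing project-specific assumed):
from urllib.parse import unquote

example = ("{%22data_style%22:%22comment_list%22,%22filter_style%22:%22comment%22,"
           "%22filter%22:{%22poiid%22:%223474%22,%22tag%22:0},"
           "%22page%22:{%22no%22:1,%22num%22:30}}")
print(unquote(example))
# -> {"data_style":"comment_list","filter_style":"comment",
#     "filter":{"poiid":"3474","tag":0},"page":{"no":1,"num":30}}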
import json
import re

# head_useragent(), get_starter(), getProxy(), prepare_request(), MyHTMLParser and clean_file()
# are helpers defined elsewhere in this project.


def get_subpoi(province_id, initFilePath, filePath):
    headers, userAgent = head_useragent()
    # NOTE: the passed-in paths are immediately replaced by province-specific defaults
    initFilePath = "./data/" + province_id + "_list_all.txt"
    filePath = "./data/" + province_id + "_list_all_sub.txt"
    list_loc, fatherId, sub_page = get_starter(initFilePath, filePath)
    ip_list = getProxy()
    # Check every POI for sub-POIs
    for poi in list_loc:
        print("[Get_List]Start to spider: " + str(poi[2]) + " Page " + str(poi[-1]))
        with open(filePath, 'a+', encoding='utf-8') as f:
            # Clever trick:
            # if this is not the first pass through the loop (fatherId is simply the first item of the
            # list), the parent-node row must be written; if it is the first pass but the file only
            # contains the header (fatherId == 0), the parent row must be written as well.
            if poi[2] != str(fatherId):
                f.write(str(poi[0]) + "\t" + str(poi[1]) + "\t" + str(poi[2]) + "\t" +
                        str(poi[3]) + "\t" + str(poi[4]) + "\t" + str(poi[5]) + "\t0\t0\t0\n")
        hasMore = True
        # Keep going while the parent POI still has another page of sub-POIs
        while hasMore:
            # Build the request payload and URL
            sub_page += 1
            opener, req = prepare_request(poi[2], sub_page, ip_list, userAgent, headers)
            # Check whether this (parent) POI has sub-POIs; continue if so, skip it otherwise
            try:
                response = opener.open(req)
            except Exception as e:
                print("[Get_List]Error ", e)
                print("[Get_List]Failed to spider " + str(poi[2]) + " Page " + str(poi[-1]) +
                      " sub_page " + str(sub_page))
                sub_page -= 1  # retry this sub-page with a fresh proxy list
                ip_list = getProxy()
            else:
                # The response body is a JSON string; turn it into a dict
                data_json = json.loads(response.read().decode("utf8"))
                data_all = data_json.get("data")
                html = data_all["html"]
                # Missing controller_data means this POI (one not categorised as an attraction) has no sub-POIs
                if "controller_data" not in data_all:
                    print("[Get_List]This location has no sub-locations")
                    break
                controller_data = data_all["controller_data"]
                # Whether the parent POI has a further page of sub-POIs
                hasMore = controller_data["hasMore"]
                # Page index of the sub-POIs just crawled for the current parent (not used further)
                curPage = controller_data["curPage"]
                hp = MyHTMLParser()
                hp.feed(html)
                hp.close()
                # A POI categorised as an attraction but without sub-POIs still has controller_data,
                # yet its html carries no matching entries
                if len(hp.href) == 0:
                    print("[Get_List]This location has no sub-locations")
                    break
                # Sanity check to keep the three parsed lists aligned
                if not len(hp.href) == len(hp.target) == len(hp.people):
                    raise Exception("[Get_List]Sub-POI fields do not match")
                for i in range(len(hp.href)):
                    # Skip sub-POIs that five or fewer people have visited
                    if int(hp.people[i]) <= 5:
                        continue
                    pattern = re.compile("/poi/(.*).html", re.IGNORECASE)
                    sub_poi = pattern.findall(hp.href[i])[0]
                    with open(filePath, 'a+', encoding='utf-8') as f:
                        f.write(str(hp.target[i]) + "\t" + str(poi[1]) + "\t" + str(sub_poi) + "\t" +
                                str(poi[3]) + "\t" + str(poi[4]) + "\t" + str(poi[5]) + "\t" +
                                str(poi[0]) + "\t" + str(poi[2]) + "\t" + str(sub_page) + "\n")
                print("[Get_List]Succeeded in spidering " + str(poi[2]) + " Page " + str(poi[-1]) +
                      " sub_page " + str(sub_page))
        # Reset sub_page once all sub-POIs of this parent are done. It is not reset at the top of
        # the loop because sub_page is non-zero the first time the loop is entered.
        sub_page = 0
        print("[Get_List]Finished spidering: " + str(poi[2]) + " Page " + str(poi[-1]))
    print("[Get_List]Done spidering the whole list")
    # A POI can be both an independent POI and a sub-POI of another one, in which case it appears twice
    clean_file(filePath)
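# clean_file() itself is not included in this excerpt. A minimal sketch of a de-duplication
# pass consistent with the comment above -- purely an illustrative assumption, not the
# project's actual implementation (clean_file_sketch is a hypothetical name):
def clean_file_sketch(filePath):
    """Drop exact duplicate lines while preserving the original order."""
    seen = set()
    unique_lines = []
    with open(filePath, 'r', encoding='utf-8') as f:
        for line in f:
            if line not in seen:
                seen.add(line)
                unique_lines.append(line)
    with open(filePath, 'w', encoding='utf-8') as f:
        f.writelines(unique_lines)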
# getProxy(), multi_threading(), stopwords and is_uchar() are defined elsewhere in this project.


def multi_thread(user_cut, token_pool, Flag, Flag_geo):
    thread_num = len(token_pool) * 2
    geo = set()
    non_geo = set()
    num = len(user_cut)
    batch_string = ""
    user_last = []
    batch_string_list = []
    i = 0
    ip_list = getProxy(7)
    for single in user_cut:
        i += 1
        # Join comments with the literal marker "分割" so they can be split apart again after segmentation
        batch_string_after = '分割'.join([batch_string, single])
        if len(batch_string_after) <= 10000:
            batch_string = batch_string_after
        else:
            batch_string_list.append(batch_string)
            batch_string = single
        # ">=" rather than "==" so a batch that failed below and was never cleared cannot stall the pipeline
        if len(batch_string_list) >= thread_num:
            print("[Keeping nouns only; building geo / non-geo noun vocabularies] {}/{}".format(i, num))
            try:
                threads = multi_threading(batch_string_list, token_pool, ip_list)
            except Exception:
                print("This batch of requests failed")
                # break
                continue
            k = 0
            for thread in threads:
                k += 1
                data_fenci = thread.get_result()
                # print(data_fenci)
                if data_fenci is None:
                    print("[Parsing results] thread {} failed to fetch".format(k))
                    ip_list = getProxy(7)
                    continue
                if "items" not in data_fenci:
                    print(data_fenci)
                    print("[Parsing results] thread {} returned no result".format(k))
                    ip_list = getProxy(7)
                    continue
                # print(data_fenci["text"])
                print("[Parsing results] parsing the result fetched by thread {}".format(k))
                seg_save = []
                for item in data_fenci["items"]:
                    ci = item["item"]
                    # Drop single characters, stopwords and non-Chinese tokens
                    if len(ci) <= 1 or ci in stopwords or is_uchar(ci) is False:
                        # if len(ci) <= 1 or ci in stopwords:
                        continue
                    if ci == "文章":
                        user_last.append("文章")
                        continue
                    elif item["pos"] in Flag:
                        seg_save.append(ci)
                        non_geo.add(ci)
                    elif item["ne"] in Flag_geo or item["pos"] in Flag_geo:
                        seg_save.append(ci)
                        geo.add(ci)
                    elif ci == "分割":
                        # End of one original comment: flush the tokens collected for it
                        if seg_save:
                            comment = '/'.join(seg_save)
                            user_last.append(comment)
                            seg_save = []
            batch_string_list = []
            # break
    # NOTE: any final partial batch left in batch_string / batch_string_list is not sent
    return user_last, geo, non_geo
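# is_uchar() is referenced above but not defined in this excerpt. A common implementation
# checks that every character is a CJK Unified Ideograph; the version below is an assumed
# sketch (is_uchar_sketch is a hypothetical name), not necessarily the project's own:
def is_uchar_sketch(text):
    """Return True only if every character in text is a CJK Unified Ideograph."""
    return all('\u4e00' <= ch <= '\u9fa5' for ch in text)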