def tp_rest_list_page_num(self, index_url, city_id, part):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    print "Now Proxy is " + PROXY
    headers = {
        'User-agent': GetUserAgent()
    }
    page = requests.get(index_url, proxies=proxies, headers=headers)
    page.encoding = 'utf8'
    # A near-empty body means the proxy was blocked: mark it dead and retry.
    if len(page.text) < 100:
        update_proxy('Platform', PROXY, x, '23')
        self.retry()
    doc = PyQuery(page.text)
    doc.make_links_absolute(index_url)
    # Collect the '-oaNN' result offsets from the pagination links.
    num_list = []
    for item in doc('.pageNumbers a').items():
        num = int(rest_oa_pattern.findall(item.attr.href)[0])
        num_list.append(num)
    # Page one has no '-oa' offset in its URL.
    tp_rest_detail_page_url.delay(index_url, city_id, part)
    try:
        g_num = rest_g_pattern.findall(index_url)[0]
        for page_num in range(30, max(num_list) + 30, 30):
            tp_rest_detail_page_url.delay(
                index_url.replace('-g' + g_num,
                                  '-g{0}-oa{1}'.format(g_num, page_num)),
                city_id, part)
    except (ValueError, IndexError):
        # max() on an empty num_list (or no '-g' id): single-page listing.
        pass

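# The two regexes used above are not defined in this file. A minimal sketch of
# the shapes the code assumes (hypothetical definitions, inferred from usage):
# rest_g_pattern pulls the geo id out of a TripAdvisor list URL, and
# rest_oa_pattern pulls the '-oaNN' result offset out of a pagination href.
import re

rest_g_pattern = re.compile(r'-g(\d+)')    # '...-g187147-Paris.html'      -> '187147'
rest_oa_pattern = re.compile(r'-oa(\d+)')  # '...-g187147-oa30-Paris.html' -> '30'
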
def tp_rest_detail_page_url(self, page_num_url, city_id, part):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    print "Now Proxy is " + PROXY
    headers = {
        'User-agent': GetUserAgent()
    }
    page = requests.get(page_num_url, proxies=proxies, headers=headers)
    page.encoding = 'utf8'
    if len(page.text) < 100:
        update_proxy('Platform', PROXY, x, '23')
        self.retry()
    doc = PyQuery(page.text)
    doc.make_links_absolute(page_num_url)
    # Queue one detail-page task per restaurant link on this list page.
    data = []
    worker = u'daodao_poi_base_data'
    for item in doc('.property_title').items():
        href = item.attr.href
        if 'Restaurant_Review' in href:
            args = json.dumps({u'target_url': unicode(href),
                               u'city_id': unicode(city_id),
                               u'type': u'rest'})
            task_id = get_task_id(worker, args=args)
            data.append((task_id, worker, args,
                         unicode(part).replace(u'list', u'detail')))
    print insert_task(data=data)

def qyer_country_spider(self, country_id, country_link, debug=False, **kwargs):
    """Crawl the per-country city index on Qyer and record its page count.

    country_id: int, index of the country info
    country_link: str, URL of the country page
    """
    http_tools = init_qyer_session(debug=debug)  # honour the caller's debug flag
    x = time.time()
    spider_proxy = "socks5://" + get_proxy(source="Platform")
    qyer_db = QyerModel(**save_db_config)
    try:
        spider_ret = http_tools(country_link, proxy=spider_proxy)
        status_code = spider_ret[1]
        if status_code != 200 and status_code != 404:
            raise Exception(str(status_code))
        page_html = etree.HTML(spider_ret[0])
        country_max_page = find_max_page(page_html)
        save_data = [country_max_page, country_id]
        qyer_db.update_country_page(save_data)
        update_task(kwargs['task_id'])
    except Exception as exc:
        update_proxy('Platform', spider_proxy, x, '23')
        # retry() wants the exception itself; traceback.format_exc() takes no
        # exception argument.
        self.retry(exc=exc)

def get_cities(self, gid, country_id, offset):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {'http': 'socks5://' + PROXY, 'https': 'socks5://' + PROXY}
    headers = {'User-agent': GetUserAgent()}
    try:
        target_url = 'http://www.tripadvisor.cn/TourismChildrenAjax?geo={0}&offset={1}&desktop=true'.format(
            gid, offset)
        page = requests.get(target_url, headers=headers, proxies=proxies)
        page.encoding = 'utf8'
        content = page.text
        # The page embeds the pagination bound in a JS call. findall returns a
        # list (never None), so an empty list means no bound was found.
        res = re.findall(
            r"ta.store\('tourism.popularCitiesMaxPage', '(\d+)'\);", content)
        has_next = bool(res) and offset < int(res[0])
        result = []
        for line in _parse_city(content=content, target_url=target_url):
            per_city = list(line)
            per_city.append(country_id)
            result.append(per_city)
        print insert_db(result)
        if has_next:
            get_cities.delay(gid, country_id, offset + 1)
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=exc)

def tripadvisor_city_query_task(self, city_name, **kwargs):
    x = time.time()
    PROXY = get_proxy(source="Platform")
    proxies = {'http': 'socks5://' + PROXY, 'https': 'socks5://' + PROXY}
    headers = {'User-agent': GetUserAgent()}
    try:
        conn = pymysql.connect(host='10.10.180.145', user='******',
                               passwd='hourong', db='SuggestName',
                               charset="utf8")
        with conn as cursor:
            print(city_name)
            quote_string = quote(city_name.encode('utf8'))
            page = requests.get(
                'http://www.tripadvisor.cn/TypeAheadJson?interleaved=true&types=geo%2Ctheme_park%2Cair&neighborhood_geos=true&link_type=geo&details=true&max=6&hglt=true&query={0}&action=API&uiOrigin=GEOSCOPE&source=GEOSCOPE'
                .format(quote_string),
                proxies=proxies, headers=headers)
            page.encoding = 'utf8'
            # Strip the anti-hijacking prefix before parsing the JSON body.
            content = page.text.replace('while(1);', '')
            for line in get_query_data(content=content, query_string=city_name):
                cursor.execute(
                    'insert into TripAdvisorSuggestCity (`QueryName`,`Name`,`coords`,`Url`) VALUES (%s,%s,%s,%s)',
                    line)
        conn.close()
        update_task(kwargs['task_id'])
        print "Success with " + PROXY + ' CODE 0'
        update_proxy('Platform', PROXY, x, '0')
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=exc)

def detail_page(self, pid, page_num, city_id, part):
    x = time.time()
    PROXY = get_proxy(source="Platform")
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    headers = {
        'User-agent': GetUserAgent(),
    }
    try:
        data = {
            u'page': unicode(page_num),
            u'type': u'city',
            u'pid': unicode(pid),
            u'sort': u'32',
            u'subsort': u'all',
            u'isnominate': u'-1',
            u'haslastm': u'false',
            u'rank': u'6'
        }
        json_page = requests.post(u'http://place.qyer.com/poi.php?action=list_json',
                                  data=data, proxies=proxies, headers=headers)
        json_page.encoding = u'utf8'
        j_data = json.loads(json_page.text)
        task_data = []
        url_result = []
        for attr in j_data[u'data'][u'list']:
            worker = u'qyer_poi_task'
            # The API returns protocol-relative URLs ('//place.qyer.com/...').
            args = json.dumps(
                {u'target_url': unicode(u'http:' + attr[u'url']),
                 u'city_id': unicode(city_id)})
            task_id = get_task_id(worker=worker, args=args)
            task_data.append((task_id, worker, args,
                              unicode(part.replace('list', 'detail'))))
            url_result.append(u'http:' + attr[u'url'])  # was u'http' + ...: missing colon
        result = insert_task(data=task_data)
        print result
        print url_result
        return result
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=exc)

def tp_rest_city_page(self, city_url, city_id, part):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    print "Now Proxy is " + PROXY
    headers = {
        'User-agent': GetUserAgent()
    }
    page = requests.get(city_url, proxies=proxies, headers=headers)
    page.encoding = 'utf8'
    if len(page.text) < 100:
        update_proxy('Platform', PROXY, x, '23')
        self.retry()
    doc = PyQuery(page.text)
    doc.make_links_absolute(city_url)
    for item in doc('.restaurants.twoLines a').items():
        tp_rest_list_page_num.delay(item.attr.href, city_id, part)

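# How the three TripAdvisor restaurant tasks chain together (usage sketch;
# the sample URL and ids below are hypothetical):
#
#   tp_rest_city_page.delay(          # city page: find restaurant list links
#       'http://www.tripadvisor.cn/Restaurants-g187147-Paris.html',
#       123, 'rest_list_part1')
#   # -> tp_rest_list_page_num: walks the pagination, one task per '-oa' offset
#   # -> tp_rest_detail_page_url: inserts one daodao_poi_base_data task per POI
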
def get_long_comment(self, target_url, language, miaoji_id, special_str):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    headers = {
        'User-agent': GetUserAgent()
    }
    try:
        page = requests.get(target_url, headers=headers, proxies=proxies,
                            timeout=120)
        page.encoding = 'utf8'
        data = long_comment_parse(page.content, target_url, language, miaoji_id)
        update_proxy('Platform', PROXY, x, '0')
        print "Success with " + PROXY + ' CODE 0'
        return insert_db((data,), 'tp_comment_' + special_str)
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=exc)

def venere_comment(self, target_url, **kwargs):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    headers = {
        'User-agent': GetUserAgent()
    }
    try:
        page = requests.get(target_url, proxies=proxies, headers=headers,
                            timeout=120)
        page.encoding = 'utf8'
        result = venere_comment_parser(page.text, target_url)
        if not result:
            update_proxy('Platform', PROXY, x, '23')
            self.retry()
        else:
            update_task(kwargs['mongo_task_id'])
            print "Success with " + PROXY + ' CODE 0'
            update_proxy('Platform', PROXY, x, '0')
            return result
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=exc)

def yelp_price_level(self, target_url, mid):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    headers = {
        'User-agent': GetUserAgent()
    }
    try:
        page = requests.get(target_url, proxies=proxies, headers=headers,
                            timeout=120)
        # Check for an empty body before parsing, not after.
        if not page.text:
            update_proxy('Platform', PROXY, x, '23')
            self.retry()
        price_level = get_yelp_price_level(page)
        print "Success with " + PROXY + ' CODE 0'
        update_proxy('Platform', PROXY, x, '0')
        print yelp_price_level_update_db((price_level, mid))
        return price_level
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=exc)

def hotel_base_data(self, source, url, other_info, **kwargs):
    x = time.time()
    PROXY = get_proxy(source="Platform")
    proxies = {'http': 'socks5://' + PROXY, 'https': 'socks5://' + PROXY}
    headers = {'User-agent': GetUserAgent()}
    try:
        page = requests.get(url, proxies=proxies, headers=headers, timeout=240)
        page.encoding = 'utf8'
        content = page.text
        # Agoda special case: the hotel description lives in a separate
        # AboutHotel fragment, fetched here and passed along in other_info.
        url_about = 'https://www.agoda.com/NewSite/zh-cn/Hotel/AboutHotel?hotelId={0}&languageId=8&hasBcomChildPolicy=False'.format(
            other_info['source_id'])
        page_about = requests.get(url=url_about, headers=headers)
        page_about.encoding = 'utf8'
        other_info['about_content'] = page_about.text
        # Agoda special case end
        result = parse_hotel(content=content, url=url, other_info=other_info,
                             source=source)
        if not result:
            update_proxy('Platform', PROXY, x, '23')
            self.retry()
        else:
            update_task(kwargs['task_id'])
            print "Success with " + PROXY + ' CODE 0'
            update_proxy('Platform', PROXY, x, '0')
            return result
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=exc)

def get_lost_rest_new(self, target_url, city_id, **kwargs):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    headers = {
        'User-agent': GetUserAgent()
    }
    try:
        page = requests.get(target_url, headers=headers, proxies=proxies,
                            timeout=15)
        page.encoding = 'utf8'
        result = rest_parser(page.content, target_url, city_id)
        if result == 'Error':
            update_proxy('Platform', PROXY, x, '23')
            self.retry()
        else:
            update_task(task_id=kwargs['mongo_task_id'])
            # Success path: report code '0', not the failure code '23'.
            update_proxy('Platform', PROXY, x, '0')
            return result
    except Exception as exc:
        self.retry(exc=exc)

def shutter_spider(self, vid, search_kw, debug=False, **kwargs):
    """Crawl Shutterstock image search results for one keyword."""
    if search_kw is None or search_kw == "null":
        # todo: log null keywords
        return None
    x = time.time()
    spider_proxy = 'socks5://' + get_proxy(source="Platform")
    try:
        spider = ShutterShockPicSpider(search_kw, spider_proxy, debug)
        pic_ret = spider.pic_search()
        pic_save_data = shutter_pic_data_assembly(vid, search_kw, pic_ret)
        spider_db = PicModel(**save_db_config)
        for _, save_data_map in pic_save_data.items():
            spider_db.insert_pic_many(save_data_map["table"],
                                      save_data_map["fields"],
                                      save_data_map["values"])
        update_task(kwargs['task_id'])
    except Exception as exc:
        update_proxy('Platform', spider_proxy, x, '23')
        self.retry(exc=exc)

def get_pid_total_page(self, target_url, city_id, part):
    x = time.time()
    PROXY = get_proxy(source="Platform")
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    headers = {
        'User-agent': GetUserAgent()
    }
    try:
        html_page = requests.get(target_url, proxies=proxies, headers=headers)
        html_page.encoding = u'utf8'
        content = html_page.text
        # Pull the place id and the attraction count ("景点(N)") out of the
        # page, then fan out one detail_page task per 15-item page.
        pid = re.findall(u'PID :\'(\d+)\'', content)[0]
        total_attr = re.findall(u'景点\((\d+)\)', content)[0]
        print pid, total_attr
        for page_num in range(1, (int(total_attr) // 15) + 2):
            detail_page.delay(pid, page_num, city_id, part)
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=exc)

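# Usage sketch for the Qyer POI chain (the city URL and ids are hypothetical):
#
#   get_pid_total_page.delay(            # city page: extract PID + POI count
#       'http://place.qyer.com/tokyo/', 456, 'list_part1')
#   # -> detail_page: posts to poi.php?action=list_json once per page and
#   #    inserts one qyer_poi_task per POI returned.
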
def qyer_city_query_task(self, city_name, **kwargs):
    x = time.time()
    PROXY = get_proxy(source="Platform")
    proxies = {'http': 'socks5://' + PROXY, 'https': 'socks5://' + PROXY}
    headers = {
        'User-agent': GetUserAgent(),
        'Referer': "http://www.qyer.com/",
    }
    try:
        conn = pymysql.connect(host='10.10.180.145', user='******',
                               passwd='hourong', db='SuggestName',
                               charset="utf8")
        with conn as cursor:
            print(city_name)
            quote_string = quote(city_name.encode('utf8'))
            page = requests.get(
                'http://www.qyer.com/qcross/home/ajax?action=search&keyword={0}'
                .format(quote_string),
                proxies=proxies, headers=headers)
            page.encoding = 'utf8'
            content = page.text.replace('while(1);', '')
            for line in get_query_data(content=content, query_string=city_name):
                cursor.execute(
                    'insert into QyerSuggestCity (`QueryName`,`Name`,`BelongName`,`Url`) VALUES (%s,%s,%s,%s)',
                    line)
        conn.close()
        update_task(kwargs['task_id'])
        print "Success with " + PROXY + ' CODE 0'
        update_proxy('Platform', PROXY, x, '0')
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=exc)

def qyer_city_spider(self, country_id, country_en, country_link, debug=False, **kwargs):
    """Crawl the city data for one country on Qyer.

    country_id: int, index of the country info
    country_en: str, English country name
    country_link: str, URL of the country page
    """
    # City-states get a dedicated parser; everything else is a plain city list.
    if country_en in city_state:
        country_type = "city_state"
    else:
        country_type = "city_list"
    http_tools = init_qyer_session(debug=debug)  # honour the caller's debug flag
    x = time.time()
    country_args = {"country_en": country_en, "country_id": country_id}
    spider_proxy = "socks5://" + get_proxy(source="Platform")
    qyer_db = QyerModel(**save_db_config)
    try:
        spider_ret = http_tools(country_link, proxy=spider_proxy)
        status_code = spider_ret[1]
        if status_code != 200 and status_code != 404:
            raise Exception(str(status_code))
        save_data = platform_page_parse(country_type, spider_ret[0], **country_args)
        qyer_db.insert_many_data(*save_data)
    except Exception as exc:
        update_proxy('Platform', spider_proxy, x, '23')
        self.retry(exc=exc)

def get_site_url(self, target_url, source_id, table_name):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    try:
        res = _get_site_url(target_url)
        if res == 'Error':
            update_proxy('Platform', PROXY, x, '23')
            self.retry()
        else:
            print "Success with " + PROXY + ' CODE 0'
            update_proxy('Platform', PROXY, x, '0')
            update_site_url(res, source_id, table_name=table_name)
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=exc)

def booking_detail_task(self, url, task):
    PROXY = get_proxy(source="Platform")  # was referenced below but never defined
    x = time.time()
    try:
        # Assumption: booking_detail_crawl takes the detail-page URL; the
        # original called it with no arguments and ignored both parameters.
        result = booking_detail_crawl(url)
        if not result:
            update_proxy('Platform', PROXY, x, '23')
            self.retry()
        else:
            print "Success with " + PROXY + ' CODE 0'
            update_proxy('Platform', PROXY, x, '0')
            return result
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=exc)

def get_comment(self, target_url, language, miaoji_id, special_str, **kwargs):
    if language == 'en':
        data = {
            'mode': 'filterReviews',
            'filterLang': 'en'
        }
    elif language == 'zhCN':
        data = {
            'mode': 'filterReviews',
            'filterLang': 'zh_CN'
        }
    else:
        return "Error, no such language"
    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    headers = {
        'User-agent': GetUserAgent()
    }
    # data is always a dict here (the unsupported-language case returned
    # above), so the original `if data != ''` guard was dropped.
    try:
        page = requests.post(target_url, data, headers=headers,
                             proxies=proxies, timeout=120)
        page.encoding = 'utf8'
        res = parse(page.text, target_url, language, miaoji_id, special_str)
        if res == 0:
            update_proxy('Platform', PROXY, x, '23')
            self.retry(countdown=120)
        else:
            # update_task(kwargs['mongo_task_id'])
            update_proxy('Platform', PROXY, x, '0')
            print "Success with " + PROXY + ' CODE 0'
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=exc, countdown=120)

def get_daodao_image_url(self, source_url, mid, **kwargs):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    print "Now Proxy is " + PROXY
    headers = {
        'User-agent': GetUserAgent()
    }
    try:
        # The photo album endpoint only needs the numeric '-d' detail id.
        detail_id = re.findall('-d(\d+)', source_url)[0]
        target_url = 'http://www.tripadvisor.cn/LocationPhotoAlbum?detail=' + detail_id
        page = requests.get(target_url, proxies=proxies, headers=headers,
                            timeout=240)
        page.encoding = 'utf8'
        if not page.text:
            update_proxy('Platform', PROXY, x, '23')
            self.retry()
        else:
            print "Success with " + PROXY + ' CODE 0'
            root = PyQuery(page.text)
            images_list = []
            for div in root('.photos.inHeroList div').items():
                images_list.append(div.attr['data-bigurl'])
            img_list = '|'.join(images_list)
            if img_list == '':
                self.retry()
            data = (mid, source_url, img_list)
            print insert_daodao_image_list(data)
            update_proxy('Platform', PROXY, x, '0')
            update_task(kwargs['mongo_task_id'])
            return data
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=exc)

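# These tasks are written against a shared Celery app that is not shown in
# this file. A minimal sketch of the assumed scaffolding (the app name,
# broker URL and retry limit are hypothetical; get_proxy/update_proxy,
# GetUserAgent, get_task_id, insert_task/update_task, insert_db and the
# various parsers come from the project's own modules):
#
#   # -*- coding: utf-8 -*-
#   import json
#   import re
#   import time
#   import traceback
#
#   import pymysql
#   import requests
#   from celery import Celery
#   from lxml import etree
#   from pyquery import PyQuery
#   from urllib import quote
#
#   app = Celery('proxy_tasks', broker='redis://localhost:6379/0')
#
#   @app.task(bind=True, max_retries=5)   # bind=True provides `self.retry`
#   def tp_rest_city_page(self, city_url, city_id, part):
#       ...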