def GetData(tripType, orig, dest, deptDate, retDate):
    searchURL = "https://www.bookryanair.com/SkySales/Search.aspx"
    refererURL = "https://www.bookryanair.com/SkySales/booking.aspx?culture=en-gb&lc=en-gb&cmpid2=Google"
    data = {"fromaction": "Search.aspx",
            "SearchInput$TripType": tripType,
            "SearchInput$Orig": orig,
            "SearchInput$Dest": dest,
            "SearchInput$DeptDate": deptDate,
            "SearchInput$RetDate": retDate,
            "SearchInput$IsFlexible": "on",
            "SearchInput$PaxTypeADT": 1,
            "SearchInput$PaxTypeCHD": 0,
            "SearchInput$PaxTypeINFANT": 0,
            "SearchInput$AcceptTerms": "on",
            "__EVENTTARGET": "SearchInput$ButtonSubmit",
            }
    #p = get_proxy()
    p = get_proxy(type='f')
    resp = request_post_data(searchURL, data, referer=refererURL, proxy=p)
    if resp is None or len(resp) == 0:
        #invalid_proxy(p)
        pass
    else:
        return resp
    return resp
def smartfares_task_parser(taskcontent):
    result = {}
    flights = {}
    tickets = []
    result['para'] = {'flight': flights, 'ticket': tickets}
    result['error'] = 0

    taskcontent = taskcontent.encode('utf-8')
    try:
        dept_id, dest_id, dept_day, dest_day = taskcontent.strip().split('&')[:4]
    except:
        logger.error('smartfaresFlight::Wrong Content Format with %s' % taskcontent)
        result['error'] = TASK_ERROR
        return result

    p = get_proxy(source='smartfaresFlight')
    if p is None:
        result['error'] = PROXY_NONE
        return result

    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)

    try:
        search_url = get_search_url(dept_day, dest_day, dept_id, dest_id)
        content = crawl_single_page(search_url, proxy=p, referer=HOST)
        search_id = get_search_id(content)
        if search_id == '' or search_id is None:
            logger.error('smartfares::Parse search id failed')
            result['error'] = PROXY_INVALID
            return result
    except:
        logger.error('smartfares::Parse search id failed')
        result['error'] = PROXY_INVALID
        return result

    url_real = URL % search_id
    i = 0
    content_len = 0
    while i < 3 and content_len < CONTENT_LEN:
        content_real = crawl_single_page(url=url_real, proxy=p, referer=search_url)
        content_len = len(content_real)
        i += 1

    if len(content_real) > 100:
        parser_result = parsePage(content_real)
        tickets = parser_result['ticket']
        flights = parser_result['flight']
        result['para'] = {'flight': flights, 'ticket': tickets}
        return result
    else:
        result['error'] = DATA_NONE
        return result
def yelp_price_level(self, target_url, mid):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    headers = {
        'User-agent': GetUserAgent()
    }
    try:
        page = requests.get(target_url, proxies=proxies, headers=headers, timeout=120)
        price_level = get_yelp_price_level(page)
        if not page.text:
            update_proxy('Platform', PROXY, x, '23')
            self.retry()
        else:
            print "Success with " + PROXY + ' CODE 0'
            update_proxy('Platform', PROXY, x, '0')
            print yelp_price_level_update_db((price_level, mid))
            return price_level
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=traceback.format_exc(exc))
def booking_list_crawl(task):
    # Split the task into the source's Chinese city name, the source city id
    # and the search type (city, region, landmark).
    # e.g. 黄石国家公园西门&6406&region, 大雾山国家公园&255516&landmark
    # e.g. 福森&-1773182
    city_name_zh, source_city_id, search_type = task.content.encode('utf8').split('&')
    # URL-encode the Chinese city name
    city_name_zh = urllib.quote(city_name_zh)

    check_in_year = task.check_in[0:7]
    check_in_day = task.check_in[8:]
    check_out_year = task.check_out[0:7]
    check_out_day = task.check_out[8:]

    # Build the first-page search URL.
    # url = get_search_url(check_in, check_out, source_city_id, city_name_zh, 1)
    # NOTE: most crawled dest_types are city; 黄石国家公园西门 is a region,
    # 大雾山国家公园 and 大峡谷国家公园 are landmarks.
    Id = source_city_id
    dest_type = search_type
    destination = city_name_zh
    if is_alp(Id[0]):
        url = ('http://www.booking.com/searchresults.zh-cn.html?aid=397647;'
               'label=bai408jc-index-XX-XX-XX-unspec-cn-com-L%3Azh-O%3Aabn-B%3Achrome-N%3Ayes-S%3Abo-U%3Asalo;'
               'sid=4cb8e58619e9a15fe212e5b9fbec271b;dcid=12;'
               'checkin_monthday=' + check_in_day + ';checkin_year_month=' + check_in_year +
               ';checkout_monthday=' + check_out_day + ';checkout_year_month=' + check_out_year +
               ';class_interval=1;dest_id=' + Id + ';dest_type=' + dest_type +
               ';dtdisc=0;group_adults=2;group_children=0;hlrd=0;hyb_red=0;inac=0;label_click=undef;'
               'nha_red=0;no_rooms=1;offset=0;postcard=0;qrhpp=9f9582988e3752a8d34a7f85874afc39-city-0;'
               'redirected_from_city=0;redirected_from_landmark=0;redirected_from_region=0;'
               'review_score_group=empty;room1=A%2CA;sb_price_type=total;score_min=0;src=index;src_elem=sb;'
               'ss=' + destination + ';ss_all=0;ss_raw=' + destination +
               ';ssb=empty;sshis=0;origin=search;srpos=1&place_id=' + Id)
    else:
        url = ('http://www.booking.com/searchresults.zh-cn.html?aid=397647;'
               'label=bai408jc-index-XX-XX-XX-unspec-cn-com-L%3Azh-O%3Aabn-B%3Achrome-N%3Ayes-S%3Abo-U%3Asalo;'
               'sid=4cb8e58619e9a15fe212e5b9fbec271b;dcid=12;'
               'checkin_monthday=' + check_in_day + ';checkin_year_month=' + check_in_year +
               ';checkout_monthday=' + check_out_day + ';checkout_year_month=' + check_out_year +
               ';class_interval=1;dest_id=' + Id + ';dest_type=' + dest_type +
               ';dtdisc=0;group_adults=2;group_children=0;hlrd=0;hyb_red=0;inac=0;label_click=undef;'
               'nha_red=0;no_rooms=1;offset=0;postcard=0;qrhpp=9f9582988e3752a8d34a7f85874afc39-city-0;'
               'redirected_from_city=0;redirected_from_landmark=0;redirected_from_region=0;'
               'review_score_group=empty;room1=A%2CA;sb_price_type=total;score_min=0;src=index;src_elem=sb;'
               'ss=' + destination + ';ss_all=0;ss_raw=' + destination +
               ';ssb=empty;sshis=0;origin=search;srpos=1')
    print url, '================='

    PROXY = get_proxy(source="Platform")
    headers = {'User-agent': GetUserAgent()}
    proxies = {'http': 'socks5://' + PROXY, 'https': 'socks5://' + PROXY}
    page = requests.get(url, proxies=proxies, headers=headers)
    page.encoding = 'utf8'
    content = page.text

    root = HTML.fromstring(content)
    hotel = root.xpath('//*[@class="sr_header "]/h1/text()')[0].encode('utf-8').replace(',', '').strip()
    # print hotel

    # Hotel count: the number of hotels with availability for the given dates.
    # When two numbers appear, take the latter.
    temp_count = hotelcount_pat.findall(hotel)
    hotel_count = temp_count[-1]
    crawl_page = int(hotel_count) / 15 + 1

    # TODO: crawl data from the first page
    # parse_each_page(page, city_id, continent)
    result = list()
    result.append(url)

    # Paginate through the remaining result pages
    for page_index in range(1, crawl_page):
        offset = 14 + (page_index - 1) * 15
        each_page_url = get_search_url(task.check_in, task.check_out,
                                       source_city_id, city_name_zh, offset, search_type)
        result.append(each_page_url)
    return result
def tp_rest_detail_page_url(self, page_num_url, city_id, part):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    print "Now Proxy is " + PROXY
    headers = {
        'User-agent': GetUserAgent()
    }
    page = requests.get(page_num_url, proxies=proxies, headers=headers)
    page.encoding = 'utf8'
    if len(page.text) < 100:
        update_proxy('Platform', PROXY, x, '23')
        self.retry()

    doc = PyQuery(page.text)
    doc.make_links_absolute(page_num_url)
    data = []
    worker = u'daodao_poi_base_data'
    for item in doc('.property_title').items():
        href = item.attr.href
        if 'Restaurant_Review' in href:
            args = json.dumps({u'target_url': unicode(href),
                               u'city_id': unicode(city_id),
                               u'type': u'rest'})
            task_id = get_task_id(worker, args=args)
            data.append((task_id, worker, args, unicode(part).replace(u'list', u'detail')))
    print insert_task(data=data)
def tp_rest_list_page_num(self, index_url, city_id, part):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    print "Now Proxy is " + PROXY
    headers = {
        'User-agent': GetUserAgent()
    }
    page = requests.get(index_url, proxies=proxies, headers=headers)
    page.encoding = 'utf8'
    if len(page.text) < 100:
        update_proxy('Platform', PROXY, x, '23')
        self.retry()

    doc = PyQuery(page.text)
    doc.make_links_absolute(index_url)
    num_list = []
    for item in doc('.pageNumbers a').items():
        num = int(rest_oa_pattern.findall(item.attr.href)[0])
        num_list.append(num)

    tp_rest_detail_page_url.delay(index_url, city_id, part)
    try:
        for page_num in range(30, max(num_list) + 30, 30):
            g_num = rest_g_pattern.findall(index_url)[0]
            tp_rest_detail_page_url.delay(
                index_url.replace('-g' + g_num, '-g{0}-oa{1}'.format(g_num, page_num)),
                city_id, part)
    except:
        pass
def qyer_country_spider(self, country_id, country_link, debug=False, **kwargs):
    """Crawl city data from Qyer.

    country_id: int, index of the country info
    country_link: str
    """
    http_tools = init_qyer_session(debug=True)
    x = time.time()
    spider_proxy = "socks5://" + get_proxy(source="Platform")
    qyer_db = QyerModel(**save_db_config)
    try:
        spider_ret = http_tools(country_link, proxy=spider_proxy)
        status_code = spider_ret[1]
        if status_code != 200 and status_code != 404:
            raise Exception(str(status_code))
        page_html = etree.HTML(spider_ret[0])
        country_max_page = find_max_page(page_html)
        save_data = [country_max_page, country_id]
        qyer_db.update_country_page(save_data)
        update_task(kwargs['task_id'])
    except Exception as exc:
        update_proxy('Platform', spider_proxy, x, '23')
        self.retry(exc=traceback.format_exc(exc))
def hotel_base_data(self, source, url, other_info, **kwargs):
    x = time.time()
    PROXY = get_proxy(source="Platform")
    proxies = {'http': 'socks5://' + PROXY, 'https': 'socks5://' + PROXY}
    headers = {'User-agent': GetUserAgent()}
    try:
        page = requests.get(url, proxies=proxies, headers=headers, timeout=240)
        page.encoding = 'utf8'
        content = page.text

        # Agoda special case: fetch the AboutHotel fragment separately
        url_about = 'https://www.agoda.com/NewSite/zh-cn/Hotel/AboutHotel?hotelId={0}&languageId=8&hasBcomChildPolicy=False'.format(
            other_info['source_id'])
        page_about = requests.get(url=url_about, headers=headers)
        page_about.encoding = 'utf8'
        about_content = page_about.text
        other_info['about_content'] = about_content
        # Agoda special case end

        result = parse_hotel(content=content, url=url, other_info=other_info, source=source)
        if not result:
            update_proxy('Platform', PROXY, x, '23')
            self.retry()
        else:
            update_task(kwargs['task_id'])
            print "Success with " + PROXY + ' CODE 0'
            update_proxy('Platform', PROXY, x, '0')
            return result
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=traceback.format_exc(exc))
def GetData(tripType, orig, dest, deptDate, retDate):
    searchURL = "https://www.bookryanair.com/SkySales/Search.aspx"
    refererURL = "https://www.bookryanair.com/SkySales/booking.aspx?culture=en-gb&lc=en-gb&cmpid2=Google"
    data = {"fromaction": "Search.aspx",
            "SearchInput$TripType": tripType,
            "SearchInput$Orig": orig,
            "SearchInput$Dest": dest,
            "SearchInput$DeptDate": deptDate,
            "SearchInput$RetDate": retDate,
            "SearchInput$IsFlexible": "on",
            "SearchInput$PaxTypeADT": 1,
            "SearchInput$PaxTypeCHD": 0,
            "SearchInput$PaxTypeINFANT": 0,
            "SearchInput$AcceptTerms": "on",
            "__EVENTTARGET": "SearchInput$ButtonSubmit",
            }
    # On failure, mark the proxy invalid, switch to another one and retry (up to 3 times)
    for i in range(3):
        p = get_proxy()
        resp = request_post_data(searchURL, data, referer=refererURL, proxy=p)
        if resp is None or len(resp) == 0:
            invalid_proxy(p)
        else:
            return resp
    return resp
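# Illustrative call of GetData (the parameter values below are assumptions for
# the example, not taken from this module): fetch a one-way availability page
# and log when every proxy attempt came back empty.
# resp = GetData("OneWay", "STN", "DUB", "01/07/2014", "")
# if not resp:
#     logger.error("ryanair: all proxy attempts failed")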
def airfrance_parser(postData, dept_city, dest_city, year, month, day):
    tickets = []
    flights = {}
    result = {}
    result['para'] = {'ticket': tickets, 'flight': flights}
    result['error'] = 0

    mc = MC()
    mc.set_debug(True)

    p = get_proxy(source='airfranceFlight')
    result['proxy'] = p
    if p is None or p == '':
        result['error'] = PROXY_NONE
        return result
    #mc.set_proxy(p)

    try:
        url0 = 'http://www.airfrance.com.cn/'
        page0 = mc.req('get', url0, html_flag=True)

        mc.add_referer(RefererURL_0)
        url1 = SearchURL_0
        page1 = mc.req('post', url1, postData, paras_type=2, html_flag=True)

        mc.add_referer(RefererURL)
        url2 = SearchURL
        page2 = mc.req('post', url2, postData, paras_type=2, html_flag=True)
    except Exception as e:
        result['error'] = PROXY_INVALID
        return result
def venere_comment(self, target_url, **kwargs):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    headers = {
        'User-agent': GetUserAgent()
    }
    try:
        page = requests.get(target_url, proxies=proxies, headers=headers, timeout=120)
        page.encoding = 'utf8'
        result = venere_comment_parser(page.text, target_url)
        if not result:
            update_proxy('Platform', PROXY, x, '23')
            self.retry()
        else:
            update_task(kwargs['mongo_task_id'])
            print "Success with " + PROXY + ' CODE 0'
            update_proxy('Platform', PROXY, x, '0')
            return result
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=traceback.format_exc(exc))
def get_cities(self, gid, country_id, offset):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {'http': 'socks5://' + PROXY, 'https': 'socks5://' + PROXY}
    headers = {'User-agent': GetUserAgent()}
    try:
        target_url = 'http://www.tripadvisor.cn/TourismChildrenAjax?geo={0}&offset={1}&desktop=true'.format(
            gid, offset)
        page = requests.get(target_url, headers=headers, proxies=proxies)
        page.encoding = 'utf8'
        content = page.text

        # Check whether more pages of popular cities remain
        res = re.findall('ta.store\(\'tourism.popularCitiesMaxPage\', \'(\d+)\'\);', content)
        has_next = False
        if res is not None and res != []:
            if offset < int(res[0]):
                has_next = True

        result = []
        for line in _parse_city(content=content, target_url=target_url):
            per_city = list(line)
            per_city.append(country_id)
            result.append(per_city)
        print insert_db(result)

        if has_next:
            get_cities.delay(gid, country_id, offset + 1)
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=traceback.format_exc(exc))
def GetData(tripType, orig, dest, deptDate, retDate):
    searchURL = "https://www.bookryanair.com/SkySales/Search.aspx"
    refererURL = "https://www.bookryanair.com/SkySales/booking.aspx?culture=en-gb&lc=en-gb&cmpid2=Google"
    data = {"fromaction": "Search.aspx",
            "SearchInput$TripType": tripType,
            "SearchInput$Orig": orig,
            "SearchInput$Dest": dest,
            "SearchInput$DeptDate": deptDate,
            "SearchInput$RetDate": retDate,
            "SearchInput$IsFlexible": "on",
            "SearchInput$PaxTypeADT": 1,
            "SearchInput$PaxTypeCHD": 0,
            "SearchInput$PaxTypeINFANT": 0,
            "SearchInput$AcceptTerms": "on",
            "__EVENTTARGET": "SearchInput$ButtonSubmit",
            }
    p = get_proxy()
    p = '221.181.104.11:8080'  # hardcoded proxy; overrides get_proxy()
    resp = request_post_data(searchURL, data, referer=refererURL, proxy=p,
                             Accept="text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
    if resp is None or len(resp) == 0:
        #invalid_proxy(p)
        pass
    else:
        return resp
    return resp
def tripadvisor_city_query_task(self, city_name, **kwargs):
    x = time.time()
    PROXY = get_proxy(source="Platform")
    proxies = {'http': 'socks5://' + PROXY, 'https': 'socks5://' + PROXY}
    headers = {'User-agent': GetUserAgent()}
    try:
        conn = pymysql.connect(host='10.10.180.145', user='******', passwd='hourong',
                               db='SuggestName', charset="utf8")
        with conn as cursor:
            print(city_name)
            quote_string = quote(city_name.encode('utf8'))
            page = requests.get(
                'http://www.tripadvisor.cn/TypeAheadJson?interleaved=true&types=geo%2Ctheme_park%2Cair&neighborhood_geos=true&link_type=geo&details=true&max=6&hglt=true&query={0}&action=API&uiOrigin=GEOSCOPE&source=GEOSCOPE'
                .format(quote_string),
                proxies=proxies, headers=headers)
            page.encoding = 'utf8'
            content = page.text.replace('while(1);', '')
            for line in get_query_data(content=content, query_string=city_name):
                cursor.execute(
                    'insert into TripAdvisorSuggestCity (`QueryName`,`Name`,`coords`,`Url`) VALUES (%s,%s,%s,%s)',
                    line)
        conn.close()
        update_task(kwargs['task_id'])
        print "Success with " + PROXY + ' CODE 0'
        update_proxy('Platform', PROXY, x, '0')
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=traceback.format_exc(exc))
def elong_task_parser(content):
    contents = content.split('&')
    if len(contents) != 2:
        logger.error('elongFlight: wrong content format with %s' % content)
        return None
    location, origdate = contents[0].strip(), contents[1].strip()

    origday = datetime.datetime(string.atoi(origdate[0:4]), string.atoi(origdate[4:6]),
                                string.atoi(origdate[6:]))
    urlday = (origday - datetime.datetime.today()).days
    dept_date = str(origday).split(' ')[0].strip()

    url = URL % (location, urlday)
    p = get_proxy()
    htmlcontent = crawl_single_page(url, proxy=p)
    if htmlcontent == '':
        invalid_proxy(p)
        logger.error('elongFlight: Proxy Error: htmlcontent is null with proxy: %s' % p)
        return []

    # If a navigation page is returned, the content has no flight info;
    # if no flight info is found, return [].
    temp_flight_list = flightPattern.findall(htmlcontent)
    if len(temp_flight_list) == 1:
        logger.error('elongFlight: Parser Error: cannot find flights with %s' % location)
        return []

    flights = []
    flight_list = temp_flight_list[:-1]
    typ = 0
    for item in flight_list:
        typ = len(typePattern.findall(item))
        if typ == 0:
            pass
        elif typ != 1:
            # connecting flight
            transfer_info = transferFlight_parser(item, dept_date, airports_dict)
            if transfer_info != []:
                flights.append(transfer_info)
        else:
            # direct flight
            direct_info = directFlight_parser(item, dept_date, airports_dict)
            if direct_info != []:
                flights.append(direct_info)

    # deduplicate
    flights = list(set(flights))
    #logger.info('Find %d airlines with %s' % (len(flights), location))
    return flights
def easyjet_task_parser(taskcontent):
    result = {}
    flights = {}
    tickets = []
    result['para'] = {'flight': flights, 'ticket': tickets}
    result['error'] = 0

    try:
        dept_id, dest_id, dept_day_temp = taskcontent.strip().split('&')[:3]
    except:
        logger.error('easyjet::Wrong Content Format with %s' % taskcontent)
        result['error'] = TASK_ERROR
        return result

    search_url = get_search_url(dept_id, dest_id, dept_day_temp)
    p = get_proxy(source='easyjet')
    time_zone_A = airport[dept_id]
    time_zone_B = airport[dest_id]
    #print p
    #print search_url
    if p is None:
        result['error'] = PROXY_NONE
        return result

    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)

    i = 0
    content_len = 0
    while i < 3 and content_len < CONTENT_LEN:
        content = crawl_single_page(url=search_url, proxy=p, n=1, referer=HOST)
        content_len = len(content)
        i += 1

    if content == '' or content is None or len(content) < CONTENT_LEN:
        result['error'] = PROXY_INVALID
        return result

    para = parsePage(content, p, time_zone_A, time_zone_B)
    if para == {'flight': {}, 'ticket': []}:
        result['error'] = DATA_NONE
        return result
    else:
        flights = para['flight']
        tickets = para['ticket']
        result['para'] = {'ticket': tickets, 'flight': flights}
        return result
def wrapper(self, *args, **kw):
    if not self.flag:
        return func(self, *args, **kw)
    else:
        while self.times < 3:
            print "retry attempt %d" % self.times
            self.html, self.error = func(self, *args, **kw)
            if self.error == '':
                break
            else:
                p = get_proxy(source=self.source)
                self.set_proxy(p)
                self.times += 1
        return self.html, self.error
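# A minimal sketch of how wrapper() above is presumably wired up: the
# attributes it reads (self.flag, self.times, self.source, self.set_proxy)
# suggest it decorates a fetch method on a crawler object. The names
# retry_with_proxy and PageCrawler below are illustrative assumptions, not
# the original implementation.
def retry_with_proxy(func):
    def wrapper(self, *args, **kw):
        if not self.flag:
            return func(self, *args, **kw)
        while self.times < 3:
            self.html, self.error = func(self, *args, **kw)
            if self.error == '':
                break
            self.set_proxy(get_proxy(source=self.source))  # swap proxy and retry
            self.times += 1
        return self.html, self.error
    return wrapper


class PageCrawler(object):
    def __init__(self, source, flag=True):
        self.source = source   # proxy pool name passed to get_proxy()
        self.flag = flag       # False disables the retry loop
        self.times = 0         # retry counter consumed by wrapper()
        self.html = ''
        self.error = ''
        self.proxy = None

    def set_proxy(self, p):
        self.proxy = p

    @retry_with_proxy
    def fetch(self, url):
        # would perform the HTTP request via self.proxy and
        # return (html, error_message)
        return self.html, self.error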
def get_site_url(self, target_url, source_id, table_name):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    try:
        res = _get_site_url(target_url)
        if res == 'Error':
            update_proxy('Platform', PROXY, x, '23')
            self.retry()
        else:
            print "Success with " + PROXY + ' CODE 0'
            update_proxy('Platform', PROXY, x, '0')
            update_site_url(res, source_id, table_name=table_name)
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=traceback.format_exc(exc))
def GetInterPricePage(queryLogTransNo, cookie, referer, use_proxy=True):
    priceURL = "http://flights.ctrip.com/international/GetSubstepSearchResults.aspx?IsJSON=T&queryLogTransNo=%s&QueryType=1&cityPairAirline=first&withDirectAirline=T&RdNo=2103213618&ind=347,359,356,370" % queryLogTransNo
    #if use_proxy:
    # On failure, mark the proxy invalid and switch to another one
    # (currently a single attempt).
    for i in range(1):
        p = get_proxy()
        resp = crawl_single_page(priceURL, referer=referer, proxy=p, cookie=cookie)
        if resp is None or len(resp) == 0:
            invalid_proxy(p)
        else:
            return resp
    #else:
    #    resp = crawl_single_page(searchURL, cookie=cookie)
    return
def crawl(url):
    global PROXY
    mc = MC()
    #mc.set_debug(True)
    mc.set_proxy(PROXY)
    content = mc.req('get', url, html_flag=True, time_out=15)
    count = 0
    while len(content) < 1000:
        invalid_proxy(PROXY, 'Platform')
        PROXY = get_proxy(source='Platform')
        mc.set_proxy(PROXY)
        print 'proxy: %s' % PROXY
        content = mc.req('get', url, html_flag=True, time_out=15)
        count += 1
        if count > 10:
            break
    return content
def _get_site_url(target_url):
    PROXY = get_proxy(source="Platform")
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    headers = {
        'User-agent': GetUserAgent()
    }
    # Fetch without following redirects so the Location header reveals the
    # source site URL.
    page = requests.get(target_url, proxies=proxies, headers=headers, allow_redirects=False)
    source_site_url = page.headers.get('location')
    print source_site_url
    # source_site_url = page.url
    if source_site_url != '' and source_site_url is not None:
        return source_site_url.replace('#_=_', '')
    else:
        return "Error"
def crawl(url):
    global p
    mc = MC()
    #mc.set_debug(True)
    mc.set_proxy(p)
    print 'proxy:', p
    content = mc.req('get', url, html_flag=True, time_out=20)
    count = 0
    while len(content) < 2000:
        invalid_proxy(p, 'Platform')
        p = get_proxy(source='Platform')
        mc.set_proxy(p)
        print p
        content = mc.req('get', url, html_flag=True, time_out=20)
        count += 1
        if count > 5:
            break
    return content
def crawl(url):
    global PROXY
    mc = MC()
    mc.set_proxy(PROXY)
    content = mc.req('get', url, html_flag=True)
    count = 0
    while len(content) < 1000:
        invalid_proxy(PROXY, 'Platform')
        PROXY = get_proxy(source='Platform')
        mc.set_proxy(PROXY)
        print 'proxy: %s' % PROXY
        content = mc.req('get', url, html_flag=True)
        count += 1
        if count > 10:
            break
    #open('test.html', 'w').write(content)
    #content = open('test.html', 'r').read()
    return content
def detail_page(self, pid, page_num, city_id, part):
    x = time.time()
    PROXY = get_proxy(source="Platform")
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    headers = {
        'User-agent': GetUserAgent(),
    }
    try:
        data = {
            u'page': unicode(page_num),
            u'type': u'city',
            u'pid': unicode(pid),
            u'sort': u'32',
            u'subsort': u'all',
            u'isnominate': u'-1',
            u'haslastm': u'false',
            u'rank': u'6'
        }
        json_page = requests.post(u'http://place.qyer.com/poi.php?action=list_json',
                                  data=data, proxies=proxies, headers=headers)
        json_page.encoding = u'utf8'
        content = json_page.text
        j_data = json.loads(content)

        task_data = []
        url_result = []
        for attr in j_data[u'data'][u'list']:
            worker = u'qyer_poi_task'
            args = json.dumps({u'target_url': unicode(u'http:' + attr[u'url']),
                               u'city_id': unicode(city_id)})
            task_id = get_task_id(worker=worker, args=args)
            task_data.append((task_id, worker, args, unicode(part.replace('list', 'detail'))))
            url_result.append(u'http:' + attr[u'url'])
        result = insert_task(data=task_data)
        print result
        print url_result
        return result
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=traceback.format_exc(exc))
def vote(self):
    import httplib
    httplib.HTTPConnection.debuglevel = 1
    httplib.HTTPSConnection.debuglevel = 1

    PROXY = get_proxy(source="Platform")
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    print "Now Proxy is " + PROXY
    headers = {
        'User-agent': GetUserAgent(),
        'Referer': 'http://www.travelmeetingsawards-china.com/Events/Awards2015Business/Readers-Voting/?cat=5',
        'Host': 'www.travelmeetingsawards-china.com',
        'Origin': 'http://www.travelmeetingsawards-china.com',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
    }
    # Alternative payload voting for the ctl02 candidate:
    # data = {
    #     '__VIEWSTATE': '/wEPDwUKLTQ0MDg4MzI3MWRkhc6az5DCGMMce+MYab5BPdm3oOCc0QhMXjgPO+KlHJc=',
    #     '__VIEWSTATEGENERATOR': 'C57773B4',
    #     '__EVENTVALIDATION': '/wEdAApdhN7azgIf7udjNG5rBO36uJWyBmoVrn+KGuzxsc+IdAhrj7iGCUNTOfLFH3a+X2zXZyb9ZhM4Agf2PTEzU0NRt9vByiAtAO532pQGgxLMkPxQ4KIC5CcITHzHErIOKsL+X/4YFsqB/WKj97Ohz20ZIOo7mLBzjoLYCKAW/gNPwcKu4LFvmYccMsvGxcqsoFFypiSNmMf2UIdcHp3gKJUE1+/bEdftTH+meRV6Ro2Ps7Lou2EFvxJCcav33eyACAc=',
    #     'ctl00$cphMain$ucVoting$rptVotingList$ctl02$rptTopThreeList$ctl02$btnVote': 'Vote 投票'
    # }
    data = {
        '__VIEWSTATE': '/wEPDwUKLTQ0MDg4MzI3MWRkhc6az5DCGMMce+MYab5BPdm3oOCc0QhMXjgPO+KlHJc=',
        '__VIEWSTATEGENERATOR': 'C57773B4',
        '__EVENTVALIDATION': '/wEdAApdhN7azgIf7udjNG5rBO36uJWyBmoVrn+KGuzxsc+IdAhrj7iGCUNTOfLFH3a+X2zXZyb9ZhM4Agf2PTEzU0NRt9vByiAtAO532pQGgxLMkPxQ4KIC5CcITHzHErIOKsL+X/4YFsqB/WKj97Ohz20ZIOo7mLBzjoLYCKAW/gNPwcKu4LFvmYccMsvGxcqsoFFypiSNmMf2UIdcHp3gKJUE1+/bEdftTH+meRV6Ro2Ps7Lou2EFvxJCcav33eyACAc=',
        'ctl00$cphMain$ucVoting$rptVotingList$ctl02$rptTopThreeList$ctl00$btnVote': 'Vote 投票'
    }

    session = requests.session()
    session.proxies = proxies
    session.headers.update(headers)

    # Record the exit IP this proxy presents
    ip_page = requests.get('https://api.ipify.org?format=json', proxies=proxies)
    out_ip = json.loads(ip_page.text)['ip']

    page = session.get('http://www.travelmeetingsawards-china.com/Events/Awards2015Business/Readers-Voting/?cat=5')
    page = session.post('http://www.travelmeetingsawards-china.com/Events/Awards2015Business/Readers-Voting/?cat=5',
                        data=data)

    save_ip(out_ip, PROXY)
    return out_ip
def tp_rest_city_page(self, city_url, city_id, part):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    print "Now Proxy is " + PROXY
    headers = {
        'User-agent': GetUserAgent()
    }
    page = requests.get(city_url, proxies=proxies, headers=headers)
    page.encoding = 'utf8'
    if len(page.text) < 100:
        update_proxy('Platform', PROXY, x, '23')
        self.retry()

    doc = PyQuery(page.text)
    doc.make_links_absolute(city_url)
    for item in doc('.restaurants.twoLines a').items():
        tp_rest_list_page_num.delay(item.attr.href, city_id, part)
def get_long_comment(self, target_url, language, miaoji_id, special_str):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    headers = {
        'User-agent': GetUserAgent()
    }
    try:
        page = requests.get(target_url, headers=headers, proxies=proxies, timeout=120)
        page.encoding = 'utf8'
        data = long_comment_parse(page.content, target_url, language, miaoji_id)
        update_proxy('Platform', PROXY, x, '0')
        print "Success with " + PROXY + ' CODE 0'
        return insert_db((data,), 'tp_comment_' + special_str)
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=traceback.format_exc(exc))
def get_comment(self, target_url, language, miaoji_id, special_str, **kwargs):
    if language == 'en':
        data = {
            'mode': 'filterReviews',
            'filterLang': 'en'
        }
    elif language == 'zhCN':
        data = {
            'mode': 'filterReviews',
            'filterLang': 'zh_CN'
        }
    else:
        return "Error, no such language"

    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    headers = {
        'User-agent': GetUserAgent()
    }
    if data != '':
        try:
            page = requests.post(target_url, data, headers=headers, proxies=proxies, timeout=120)
            page.encoding = 'utf8'
            res = parse(page.text, target_url, language, miaoji_id, special_str)
            if res == 0:
                update_proxy('Platform', PROXY, x, '23')
                self.retry(countdown=120)
            else:
                # update_task(kwargs['mongo_task_id'])
                update_proxy('Platform', PROXY, x, '0')
                print "Success with " + PROXY + ' CODE 0'
        except Exception as exc:
            update_proxy('Platform', PROXY, x, '23')
            self.retry(exc=traceback.format_exc(exc), countdown=120)
def get_lost_rest_new(self, target_url, city_id, **kwargs):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    headers = {
        'User-agent': GetUserAgent()
    }
    try:
        page = requests.get(target_url, headers=headers, proxies=proxies, timeout=15)
        page.encoding = 'utf8'
        result = rest_parser(page.content, target_url, city_id)
        if result == 'Error':
            self.retry()
        else:
            update_task(task_id=kwargs['mongo_task_id'])
            # record the proxy as good on success
            update_proxy('Platform', PROXY, x, '0')
            return result
    except Exception as exc:
        self.retry(exc=traceback.format_exc(exc))
def shutter_spider(self, vid, search_kw, debug=False, **kwargs):
    """Crawl Shutterstock image search results."""
    if search_kw is None or search_kw == "null":
        # todo logging null key words
        return None

    x = time.time()
    spider_proxy = 'socks5://' + get_proxy(source="Platform")
    try:
        spider = ShutterShockPicSpider(search_kw, spider_proxy, debug)
        pic_ret = spider.pic_search()
        pic_save_data = shutter_pic_data_assembly(vid, search_kw, pic_ret)
        spider_db = PicModel(**save_db_config)
        for _, save_data_map in pic_save_data.items():
            spider_db.insert_pic_many(save_data_map["table"], save_data_map["fields"],
                                      save_data_map["values"])
        update_task(kwargs['task_id'])
    except Exception as exc:
        update_proxy('Platform', spider_proxy, x, '23')
        self.retry(exc=traceback.format_exc(exc))
def get_pid_total_page(self, target_url, city_id, part):
    x = time.time()
    PROXY = get_proxy(source="Platform")
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    headers = {
        'User-agent': GetUserAgent()
    }
    try:
        html_page = requests.get(target_url, proxies=proxies, headers=headers)
        html_page.encoding = u'utf8'
        content = html_page.text
        pid = re.findall(u'PID :\'(\d+)\'', content)[0]
        total_attr = re.findall(u'景点\((\d+)\)', content)[0]
        # return pid, (int(total_attr) // 15) + 1
        print pid, total_attr
        for page_num in range(1, (int(total_attr) // 15) + 2):
            detail_page.delay(pid, page_num, city_id, part)
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=traceback.format_exc(exc))
def qyer_city_spider(self, country_id, country_en, country_link, debug=False, **kwargs):
    """Crawl city data from Qyer.

    country_id: int, index of the country info
    country_en: str, country name in English
    country_link: str
    """
    if country_en in city_state:
        country_type = "city_state"
    else:
        country_type = "city_list"

    http_tools = init_qyer_session(debug=True)
    x = time.time()
    country_args = {"country_en": country_en, "country_id": country_id}
    spider_proxy = "socks5://" + get_proxy(source="Platform")
    qyer_db = QyerModel(**save_db_config)
    try:
        spider_ret = http_tools(country_link, proxy=spider_proxy)
        status_code = spider_ret[1]
        if status_code != 200 and status_code != 404:
            raise Exception(str(status_code))
        save_data = platform_page_parse(country_type, spider_ret[0], **country_args)
        qyer_db.insert_many_data(*save_data)
    except Exception as exc:
        update_proxy('Platform', spider_proxy, x, '23')
        self.retry(exc=traceback.format_exc(exc))
def qyer_city_query_task(self, city_name, **kwargs):
    x = time.time()
    PROXY = get_proxy(source="Platform")
    proxies = {'http': 'socks5://' + PROXY, 'https': 'socks5://' + PROXY}
    headers = {
        'User-agent': GetUserAgent(),
        'Referer': "http://www.qyer.com/",
    }
    try:
        conn = pymysql.connect(host='10.10.180.145', user='******', passwd='hourong',
                               db='SuggestName', charset="utf8")
        with conn as cursor:
            print(city_name)
            quote_string = quote(city_name.encode('utf8'))
            page = requests.get(
                'http://www.qyer.com/qcross/home/ajax?action=search&keyword={0}'.format(quote_string),
                proxies=proxies, headers=headers)
            page.encoding = 'utf8'
            content = page.text.replace('while(1);', '')
            for line in get_query_data(content=content, query_string=city_name):
                cursor.execute(
                    'insert into QyerSuggestCity (`QueryName`,`Name`,`BelongName`,`Url`) VALUES (%s,%s,%s,%s)',
                    line)
        conn.close()
        update_task(kwargs['task_id'])
        print "Success with " + PROXY + ' CODE 0'
        update_proxy('Platform', PROXY, x, '0')
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=traceback.format_exc(exc))
def get_daodao_image_url(self, source_url, mid, **kwargs):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    print "Now Proxy is " + PROXY
    headers = {
        'User-agent': GetUserAgent()
    }
    try:
        detail_id = re.findall('-d(\d+)', source_url)[0]
        target_url = 'http://www.tripadvisor.cn/LocationPhotoAlbum?detail=' + detail_id
        page = requests.get(target_url, proxies=proxies, headers=headers, timeout=240)
        page.encoding = 'utf8'
        if not page.text:
            update_proxy('Platform', PROXY, x, '23')
            self.retry()
        else:
            print "Success with " + PROXY + ' CODE 0'
            root = PyQuery(page.text)
            images_list = []
            for div in root('.photos.inHeroList div').items():
                images_list.append(div.attr['data-bigurl'])
            img_list = '|'.join(images_list)
            if img_list == '':
                self.retry()
            data = (mid, source_url, img_list)
            print insert_daodao_image_list(data)
            update_proxy('Platform', PROXY, x, '0')
            update_task(kwargs['mongo_task_id'])
            return data
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=traceback.format_exc(exc))
def youzhan_task_parser(taskcontent):
    all_info = []
    room_list = []

    taskcontent = taskcontent.encode('utf-8').strip()
    hotel_id = taskcontent.split('&')[0]
    ipathid = taskcontent.split('&')[1]
    star = taskcontent.split('&')[2]
    city = taskcontent.split('&')[3]
    country = taskcontent.split('&')[4]
    #room_type = taskcontent.split('&')[3]
    from_date_temp = taskcontent.split('&')[5]
    from_date = from_date_temp[:4] + '-' + from_date_temp[4:6] + '-' + from_date_temp[6:]
    to_date_temp = datetime.datetime(int(from_date_temp[:4]), int(from_date_temp[4:6]),
                                     int(from_date_temp[6:]))
    to_date = str(to_date_temp + datetime.timedelta(days=1))[:10]

    # Get a proxy
    p = get_proxy()
    #if p == "":
    #    logger.error("get proxy failed")
    #    return None

    hotel = Hotel()
    room = Room()

    rating_url = get_rating_url(hotel_id)
    rating_page = crawl_single_page(rating_url, proxy=p)
    grade_str = grade_parser(rating_page)
    if grade_str != '':
        hotel.grade = grade_str[:-1]
    else:
        pass
        #logger.error('Error: No grade_str found!')

    map_url = get_map_url(hotel_id)
    map_page = crawl_single_page(map_url, proxy=p)
    #print map_page
    map_info_list = staticmap_parser(map_page)
    if map_info_list != []:
        hotel.hotel_name = map_info_list[1]
        if is_alphabet(hotel.hotel_name.decode('utf-8')):
            hotel.hotel_name_en = hotel.hotel_name
        else:
            hotel.hotel_name_en = 'NULL'
        hotel.map_info = map_info_list[0]
    else:
        logger.error('youzhanHotel: Map info do not have hotel name and map_info')
        return []

    info_url = get_info_url(hotel_id, from_date, to_date)
    info_page = crawl_single_page(info_url, proxy=p)
    if info_page == '':
        #invalid_proxy(p)
        return []

    info_list = info_parser(info_page)
    if info_list != []:
        hotel.country = country
        hotel.city = city
        hotel.address = info_list[1]
        hotel_desc_temp = info_list[3].replace('<br/>', '').replace('&#39;', '')
        if hotel_desc_temp != '':
            hotel.description = hotel_desc_temp
        else:
            hotel.description = 'NULL'
        hotel.service = info_list[4]
        if '停车场' in hotel.service:
            hotel.has_parking = 'Yes'
        if '无线网络' in hotel.service or 'wifi' in hotel.service:
            hotel.has_wifi = 'Yes'
    else:
        return []

    hotel.source = 'youzhan'
    hotel.source_id = hotel_id
    hotel.star = star

    price_url = get_price_url(hotel_id, ipathid, from_date, to_date)
    price_page = crawl_single_page(price_url, proxy=p)
    price_list = price_parser(price_page, hotel_id)
    #print price_list
    if price_list != []:
        for each_room in price_list:
            if len(each_room) > 3:
                room.city = city
                room.occupancy = 2
                room.hotel_name = hotel.hotel_name
                room.room_desc = each_room[3]
                room.real_source = each_room[2]
                # Take the text before the first '-' as the room type when it
                # is short enough to be a name
                num = each_room[3].find('-')
                if num > 0:
                    if len(each_room[3][:num]) < 20:
                        room.room_type = each_room[3][:num]
                    else:
                        room.room_type = 'NULL'
                else:
                    if len(each_room[3]) < 20:
                        room.room_type = each_room[3]
                    else:
                        room.room_type = 'NULL'
                if each_room[0] != u'nbsp;':
                    room.price = each_room[0]
                room.has_breakfast = each_room[1]
                if '免费WiFi' in room.room_desc:
                    hotel.is_wifi_free = 'Yes'
                if '免费取消' in room.room_desc:
                    hotel.is_cancel_free = 'Yes'
                room.currency = 'CNY'
                room.source = 'youzhan'
                room.source_hotelid = hotel_id
                room.check_in = from_date
                room.check_out = to_date

                room_tuple = (room.hotel_name, room.city, room.source, room.source_hotelid,
                              room.source_roomid, room.real_source, room.room_type, room.occupancy,
                              room.bed_type, room.size, room.floor, room.check_in, room.check_out,
                              room.price, room.tax, room.currency, room.is_extrabed,
                              room.is_extrabed_free, room.has_breakfast, room.is_breakfast_free,
                              room.is_cancel_free, room.room_desc)
                room_list.append(room_tuple)

    hotel_tuple = (hotel.hotel_name, hotel.hotel_name_en, hotel.source, hotel.source_id,
                   hotel.brand_name, hotel.map_info, hotel.address, hotel.city, hotel.country,
                   hotel.postal_code, hotel.star, hotel.grade, hotel.has_wifi, hotel.is_wifi_free,
                   hotel.has_parking, hotel.is_parking_free, hotel.service, hotel.img_items,
                   hotel.description)

    hotel_list = []
    hotel_list.append(hotel_tuple)

    all_info.append(hotel_list)
    all_info.append(room_list)
    return all_info
    try:
        dept_id = infos[0]
        dest_id = infos[1]
        day, month, year = infos[2][6:], infos[2][4:6], infos[2][0:4]
        dept_date = month + '/' + day + '/' + year
        rday, rmonth, ryear = infos[3][6:], infos[3][4:6], infos[3][0:4]
        dest_date = rmonth + '/' + rday + '/' + ryear
    except Exception as e:
        logger.error('feiquanqiuRoundFlight: Wrong Content Format with %s' % content)
        result['error'] = TASK_ERROR
        return result

    url = URL % (dept_id, dest_id, dept_date, dest_date)
    referer = REFERER % (dept_id, dest_id, dept_date, dest_date)

    p = get_proxy(source='feiquanqiuRoundFlight')
    if p is None:
        result['error'] = PROXY_NONE
        return result

    mc = MechanizeCrawler(p=p, referer=referer)
    page = mc.get(url, html_flag=True)
    if page is None:
        logger.info('feiquanqiuRoundFlight: htmlcontent is null with %s' % p)
        result['error'] = PROXY_INVALID
        return result

    flights = parsePage(page)
    if flights is None:
        result['error'] = DATA_NONE
    try:
        dest_id = infos[1]  # airport three-letter code
        day, month, year = infos[2][6:], infos[2][4:6], infos[2][0:4]
        dept_date = year + '-' + month + '-' + day
        dept_date_url = year[-2:] + month + day  # e.g. 140627
    except Exception as e:
        logger.error('ceairFlight: Wrong Content Format with %s' % content)
        result['error'] = TASK_ERROR
        return result

    if dept_id not in AIRPORT_CITY_DICT or dest_id not in AIRPORT_CITY_DICT:
        logger.warning('ceairFlight: airport not in AIRPORT_CITY_DICT')
        result['error'] = DATA_NONE
        return result

    p = get_proxy(source='ceairFlight')
    if p is None:
        result['error'] = PROXY_NONE
        return result

    postdata = getPostData(dept_id, dest_id, dept_date)
    if postdata == '':
        result['error'] = UNKNOWN_TYPE
        return result

    rand = str(random.random())
    referer = RefererURL % (AIRPORT_CITY_DICT[dept_id].lower(),
                            AIRPORT_CITY_DICT[dest_id].lower(), dept_date_url)
    searchurl = SearchURL % rand
    # Parse the departure/arrival city three-letter codes and the departure
    # date from taskcontent
    try:
        dept_code, dest_code, dept_date = taskcontent.strip().split('&')[:3]
        dept_day = dept_date[:4] + '-' + dept_date[4:6] + '-' + dept_date[6:]
        dept_year = dept_date[:4]
    except Exception as e:
        logger.error('feifanFlight: wrong content format with %s' % taskcontent)
        result['error'] = TASK_ERROR
        return result

    # Get a proxy
    p = get_proxy(source='feifanFlight')
    if p is None:
        result['error'] = PROXY_NONE
        return result

    # Build the URL and check that it is usable
    url = get_url(dept_code, dest_code, dept_day)
    if url == '' or url is None:
        logger.error('feifanFlight: Get url failed!')
        result['error'] = UNKNOWN_TYPE
        return result

    # Crawl the page and check that it is usable.
    # feifan often needs a refresh before content appears, so crawl up to 3 times.
    try:
        hotel_name = taskcontent.strip().split('&')[1]
        map_info = taskcontent.strip().split('&')[2]
        city_name_zh = taskcontent.strip().split('&')[3]
        city_name_en = taskcontent.strip().split('&')[4]
        country_name_zh = taskcontent.strip().split('&')[5]
        check_in_day_temp = taskcontent.strip().split('&')[6]
        check_in_day = check_in_day_temp[:4] + '-' + check_in_day_temp[4:6] + '-' + check_in_day_temp[6:]
        check_out_day_temp = datetime.datetime(int(check_in_day_temp[:4]), int(check_in_day_temp[4:6]),
                                               int(check_in_day_temp[6:]))
        check_out_day = str(check_out_day_temp + datetime.timedelta(days=1))[:10]
    except Exception as e:
        logger.error('biyiHotel: Wrong Content Format with %s' % taskcontent)
        result['error'] = TASK_ERROR
        return result

    p = get_proxy(source='biyiHotel')
    print p
    if p is None:
        result['error'] = PROXY_NONE
        return result

    first_url = 'http://www.biyi.cn/'
    url = get_url(hotel_name, city_name_en, check_in_day, check_out_day)

    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)

    resp = crawl_single_page(url=first_url, proxy=p, Accept=accept, referer=first_url, n=1)
    #for x in cj:
    #    print x
    result['error'] = 0

    # Parse the task string
    content = content.encode('UTF-8').strip()
    try:
        info = content.split('&')
        dept_id = info[0]
        arr_id = info[1]
        dept_date = info[2][:4] + '-' + info[2][4:6] + '-' + info[2][6:]
    except Exception as e:
        logger.error('wegoFlight Content Error: cannot extract information from %s' % content)
        result['error'] = TASK_ERROR
        return result

    # Get a proxy
    p = get_proxy(source='wegoFlight')
    if p is None:
        result['error'] = PROXY_NONE
        return result

    # Build the initial url and fetch the search id
    url_temp = get_url(dept_id, arr_id, dept_date)
    search_id = get_search_id(url_temp, proxy=p)
    if search_id == '':
        logger.error('Search_Id Error: get Search_Id failed')
        invalid_proxy(proxy=p, source='wegoFlight')
        result['error'] = PROXY_FORBIDDEN
        return result
            taskcontent.split('&&')[2], taskcontent.split('&&')[3], \
            taskcontent.split('&&')[4]
    except Exception as e:
        logger.error('haodingHotel::Cannot parse task content with error: ' + str(e))
        return {'para': [], 'error': TASK_ERROR}

    check_in = check_in_temp[:4] + '-' + check_in_temp[4:6] + '-' + check_in_temp[6:]
    check_out_temp = datetime.datetime(int(check_in_temp[:4]), int(check_in_temp[4:6]),
                                       int(check_in_temp[6:]))
    check_out = str(check_out_temp + datetime.timedelta(days=1))[:10]

    hotel_url = get_hotel_url(city_name_zh, city_id, hotel_id, check_in, check_out)

    #p = get_proxy()
    #print p
    p = get_proxy(source='haodingHotel')
    if p == '' or p is None:
        return {'para': [], 'error': NO_PROXY}

    i = 0
    content_len = 0
    while i < 3 and content_len < CONTENT_LEN:
        content = crawl_single_page(hotel_url, p)
        content_len = len(content)
        i += 1

    if content == '' or content is None:
        invalid_proxy(proxy=p, source='haodingHotel')
        return {'para': [], 'error': NO_CONTENT}

    if len(content) < CONTENT_LEN:
    taskcontent = taskcontent.encode('utf-8')
    try:
        dept_city_zh, dept_city_en, dest_city_zh, dest_city_en, dept_day_temp = \
            taskcontent.strip().split('&')[0], \
            taskcontent.strip().split('&')[1], \
            taskcontent.strip().split('&')[2], \
            taskcontent.strip().split('&')[3], \
            taskcontent.strip().split('&')[4]
        dept_day = dept_day_temp[:4] + '-' + dept_day_temp[4:6] + '-' + dept_day_temp[6:]
    except Exception as e:
        logger.error('jijitongFlight:Wrong Content Format with %s' % taskcontent)
        result['error'] = TASK_ERROR
        return result

    p = get_proxy(source='jijitongFlight')
    if p is None:
        result['error'] = PROXY_NONE
        return result

    first_url = FIRST_URL % (dept_city_en, dest_city_en, dept_day_temp)

    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)

    resp = crawl_single_page(first_url, proxy=p,
                             Accept='text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                             n=1)
    if resp.find('404错误') < 0:
        url = get_url(dept_city_zh, dest_city_zh, dept_day)
        page = crawl_single_page(url, proxy=p, referer=first_url)
def csair_task_parser(taskcontent):
    result = {}
    multi_ticket = []
    one_flight = {}
    result['para'] = {'flight': one_flight, 'ticket': multi_ticket}
    result['error'] = 0

    try:
        param_list = taskcontent.strip().split('&')
        url = 'http://b2c.csair.com/B2C40/detail-' + param_list[0] + param_list[1] + '-' \
              + param_list[2] + '-1-0-0-0-1-0-1-0-1-0.g2c'
    except:
        logger.info('url param is not valid\n')
        result['error'] = TASK_ERROR
        return result

    # Initialize all params
    dic_flightdate = {}
    multi_price = []
    select_time = 0
    Flag1 = False
    Flag2 = False
    page_flag = False

    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)

    task_content_proxy = get_proxy(source='csairFlight')
    if task_content_proxy is None:
        result['error'] = PROXY_NONE
        return result

    html = crawl_single_page(url, proxy=task_content_proxy)
    if html == '' or html is None:
        result['error'] = PROXY_INVALID
        return result

    pattern = re.compile(r'\s*<FLIGHTS>\s*')
    match = pattern.search(html)
    if match and len(html) > CONTENT_LEN:
        dom = etree.fromstring(html)
        etree.tostring(dom)
        for ele in dom.iter():
            if ele.tag is not None:
                if ele.tag in word_flightdate:
                    #print ele.tag, ele.text
                    dic_flightdate[ele.tag] = ele.text
                elif ele.tag in word_parent_list:
                    # DateFlight node
                    page_flag = True
                    multi_flight = []
                    Flight = nanhang_flight()
                    select_time += 1
                    flight_num = 0
                    ticket_dur_list = []
                    for word in ele:
                        if word.tag in word_list[0]:
                            flight_num += 1
                            dic_flight = {}
                            EachFlight = nanhang_eachflight()
                            for word_child in word:
                                if word_child.tag in word_child_list[0]:
                                    Flag1 = True
                                    dic_flight[word_child.tag] = word_child.text
                            # each flight
                            if Flag1:
                                try:
                                    Flag1 = False
                                    EachFlight.flight_no = dic_flight[word_child_list[0][0]]
                                    EachFlight.dept_id = dic_flight[word_child_list[0][1]]
                                    EachFlight.dest_id = dic_flight[word_child_list[0][2]]
                                    EachFlight.flight_key = EachFlight.flight_no + '_' + EachFlight.dept_id + '_' + EachFlight.dest_id
                                    dept_time = dic_flight[word_child_list[0][3]]
                                    EachFlight.dept_time = dept_time[0:10] + 'T' + dept_time[-5:len(dept_time)]
                                    dest_time = dic_flight[word_child_list[0][4]]
                                    EachFlight.dest_time = dest_time[0:10] + 'T' + dest_time[-5:len(dest_time)]
                                    EachFlight.dur = get_duration(dest_time, EachFlight.dest_id,
                                                                  dept_time, EachFlight.dept_id)
                                    EachFlight.dept_time = EachFlight.dept_time + ':00'
                                    EachFlight.dest_time = EachFlight.dest_time + ':00'
                                    ticket_dur_list.append(EachFlight.dur)
                                    EachFlight.airline = '南方航空公司'
                                    EachFlight.plane_no = dic_flight[word_child_list[0][5]]
                                # rebuild and compute flight
                                except KeyError as e:
                                    print e
                                else:
                                    one_flight[EachFlight.flight_key] = (
                                        EachFlight.flight_no, EachFlight.airline, EachFlight.plane_no,
                                        EachFlight.dept_id, EachFlight.dest_id, EachFlight.dept_time,
                                        EachFlight.dest_time, EachFlight.dur)
                                    # list of multi flight
                                    multi_flight.append((
                                        EachFlight.flight_key, EachFlight.flight_no, EachFlight.airline,
                                        EachFlight.plane_no, EachFlight.dept_id, EachFlight.dest_id,
                                        EachFlight.dept_time, EachFlight.dest_time, EachFlight.dur))
                        elif word.tag in word_list[1]:
                            # price node
                            multi_price = []
                            for word_child in word:
                                if word_child.tag in word_next_list:
                                    dic_ticket = {}
                                    for word_next_child in word_child:
                                        if word_next_child.tag in word_child_list[1]:
                                            Flag2 = True
                                            dic_ticket[word_next_child.tag] = word_next_child.text
                                    if Flag2:
                                        try:
                                            Flag2 = False
                                            Flight.price = string.atof(dic_ticket[word_child_list[1][0]])
                                            Flight.tax = string.atof(dic_ticket[word_child_list[1][1]]) \
                                                + string.atof(dic_ticket[word_child_list[1][2]]) \
                                                + string.atof(dic_ticket[word_child_list[1][3]])
                                            Flight.currency = dic_ticket[word_child_list[1][4]]
                                            Flight.seat_type = dic_ticket[word_child_list[1][5]]
                                            # Map cabin codes to Chinese names
                                            if Flight.seat_type == 'ECONOMY':
                                                Flight.seat_type = '经济舱'
                                            if Flight.seat_type == 'BUSINESS':
                                                Flight.seat_type = '商务舱'
                                            if Flight.seat_type == 'FIRST':
                                                Flight.seat_type = '头等舱'
                                            if Flight.seat_type == 'PREMIUMECONOMY':
                                                Flight.seat_type = '超经济舱'
                                            Flight.return_rule = 'NULL'
                                            Flight.stop = flight_num - 1
                                            Flight.surcharge = -1
                                            Flight.source = 'csair::csair'
                                        except KeyError as e:
                                            print e
                                        else:
                                            multi_price.append((Flight.price, Flight.tax, Flight.surcharge,
                                                                Flight.currency, Flight.seat_type, Flight.source,
                                                                Flight.return_rule, Flight.stop))
                    if select_time != 0:
                        if multi_flight != []:
                            new_flight_no = []
                            Flight.fight_no = '_'.join([item[1] for item in multi_flight])
                            Flight.plane_no = '_'.join([item[3] for item in multi_flight])
                            Flight.airline = '_'.join([item[2] for item in multi_flight])
                            Flight.dept_id = multi_flight[0][4]
                            Flight.dest_id = multi_flight[len(multi_flight) - 1][5]
                            Flight.dept_day = dic_flightdate[word_flightdate[0]][0:4] + '-' \
                                + dic_flightdate[word_flightdate[0]][4:6] + '-' \
                                + dic_flightdate[word_flightdate[0]][6:8]
                            Flight.dept_time = multi_flight[0][6]
                            Flight.dest_time = multi_flight[len(multi_flight) - 1][7]
                            Flight.dur = get_duration(Flight.dest_time, Flight.dest_id,
                                                      Flight.dept_time, Flight.dept_id)
                            for i in range(len(multi_price)):
                                multi_ticket.append((Flight.fight_no, Flight.plane_no, Flight.airline,
                                                     Flight.dept_id, Flight.dest_id, Flight.dept_day,
                                                     Flight.dept_time, Flight.dest_time, Flight.dur,
                                                     multi_price[i][0], multi_price[i][1], multi_price[i][2],
                                                     multi_price[i][3], multi_price[i][4], multi_price[i][5],
                                                     multi_price[i][6], multi_price[i][7]))
    result['error'] = 0

    try:
        dept_city, dest_city = taskcontent.split('&')[0].strip(), taskcontent.split('&')[1].strip()
        dept_time = taskcontent.split('&')[2].strip()
        dept_time = dept_time[0:4] + '/' + dept_time[4:6] + '/' + dept_time[6:8]
    except Exception as e:
        logger.info('url id wrong: ' + str(e))
        result['error'] = TASK_ERROR
        return result

    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)

    url_res = get_json_url(dept_city, dest_city, dept_time)
    if url_res != False:
        url = 'http://www.expedia.com.hk/Flight-Search-Outbound?c=' + url_res + '&_=' + str(time.time())
        task_content_proxy = get_proxy(source='expediaFlight')
        if task_content_proxy is None:
            result['error'] = PROXY_NONE
            return result
        html_res = crawl_single_page(url, proxy=task_content_proxy)
        if html_res == '' or html_res is None:
            result['error'] = PROXY_INVALID
            return result
    else:
        result['error'] = TASK_ERROR
        return result

    try:
        json_list = json.loads(html_res)
        if json_list[key_list[0]] is None:
            result['error'] = DATA_NONE
            return result
    try:
        check_in = check_in_temp[:4] + '-' + check_in_temp[4:6] + '-' + check_in_temp[6:]
        check_out_temp = datetime.datetime(int(check_in_temp[:4]), int(check_in_temp[4:6]),
                                           int(check_in_temp[6:]))
        check_out = str(check_out_temp + datetime.timedelta(days=1))[:10]
        hotel_id_temp = hotel_id.split('_')[1]
    except Exception as e:
        logger.error('elongHotelParser: Wrong Content Format with %s' % taskcontent)
        result['error'] = TASK_ERROR
        return result

    if hotel_id_temp == '0':
        result['error'] = TASK_ERROR
        return result

    p = get_proxy(source='elongHotel')
    if p is None:
        result['error'] = PROXY_NONE
        return result

    post_data = get_post_data(hotel_id_temp, check_in, check_out)
    page = request_post_data(request_url, data=post_data, proxy=p)
    if page is None or page == '':
        invalid_proxy(proxy=p, source='elongHotel')
        result['error'] = PROXY_INVALID
        return result

    room_list = parseRoom(page, hotel_name, city_name_zh, check_in, check_out, hotel_id)
    if room_list != []:
def crawl(city_url, city_id):
    global p
    source = 'daodao'
    #city_url = city_url.replace('Tourism', 'Restaurants')
    print city_url

    mc = MC()
    mc.set_proxy(p)
    print 'proxy: %s' % p
    page1 = mc.req('get', city_url, html_flag=True, time_out=10)
    count = 0
    while len(page1) < 1000:
        invalid_proxy(p, 'Platform')
        p = get_proxy(source='Platform')
        print 'proxy: %s' % p
        mc.set_proxy(p)
        page1 = mc.req('get', city_url, html_flag=True, time_out=10)
        count += 1
        if count > 20:
            break

    source_city_id = re.compile(r'-g(\d+)').findall(city_url)[0]
    root = html.fromstring(page1)

    # Total number of restaurants in the city
    rating_info = root.find_class('listing')[0].find_class('popIndexDefault')[0] \
        .xpath('text()')[0].encode('utf-8').strip().split('(')[1].replace(',', '')
    nums = re.compile(r'(\d+)').findall(rating_info)
    res_total = int(nums[0])
    print "total: %s " % res_total

    # Restaurant list on the first page
    items = root.find_class('listing')
    data_list = []
    for item in items:
        res_url = 'http://www.tripadvisor.cn' + item.find_class('title')[0].xpath('a/@href')[0].strip().encode('utf-8')
        res_id = re.compile(r'd(\d+)').findall(res_url)[0].encode('utf-8')
        print res_url
        data = (source, city_id, res_id, res_url)
        print data
        data_list.append(data)
    print 'insert', insert_db(data_list)

    print '------------next page------------'
    itag = '10591'  # category id for restaurants
    page = 2
    for offset in range(30, res_total + 1, 30):
        print '-----------page %s-------' % page
        page += 1
        next_url = 'http://www.tripadvisor.cn/RestaurantSearch?Action=PAGE&geo=%s&ajax=1&itags=%s&sortOrder=popularity&o=a%s&availSearchEnabled=false' % (source_city_id, itag, offset)
        print next_url
        content2 = mc.req('get', next_url, html_flag=True)
        while len(content2) < 1000:
            p = get_proxy(source='Platform')
            print 'proxy: %s' % p
            mc.set_proxy(p)
            content2 = mc.req('get', next_url, html_flag=True)

        # If most entries read "该餐馆暂无点评,来写第一条" (no reviews yet, write
        # the first one), stop paging.
        no_count = len(re.compile(r'(该餐馆暂无点评,来写第一条)').findall(content2))
        if int(no_count) > 29:
            break

        root2 = html.fromstring(content2)
        items = root2.find_class('listing')
        data_list2 = []
        for item in items:
            res_url = 'http://www.tripadvisor.cn' + item.find_class('title')[0].xpath('a/@href')[0].strip().encode('utf-8')
            res_id = re.compile(r'd(\d+)').findall(res_url)[0].encode('utf-8')
            print res_url
            data2 = (source, city_id, res_id, res_url)
            print data2
            data_list2.append(data2)
        print 'insert', insert_db(data_list2)
    print 'city %s ok' % city_id
POST_DATA_STRING = 'CONTROLGROUPAVAILABILTYSEARCHINPUTSCHEDULESELECTVIEW$AvailabilityScheduleSelectView$'

def vueling_task_parser(taskcontent):
    try:
        fields = taskcontent.split('&')
        dept_id, dest_id, dept_date_temp = fields[0].strip(), fields[1].strip(), fields[2].strip()
    except Exception, e:
        logger.error('VuelingFlight: Content Error, wrong content format: ' + str(e))
        return None
    dept_time = dept_date_temp[:4] + '-' + dept_date_temp[4:6] + '-' + dept_date_temp[6:]
    postdata = getPostData(dept_time, dept_id, dest_id)
    p = get_proxy(source='vuelingFlight')
    url = 'http://tickets.vueling.com/ScheduleSelect.aspx'
    Referer = 'http://tickets.vueling.com/ScheduleSelect.aspx'
    content = request_post_data(url, postdata, referer=Referer, proxy=p,
                                Accept="text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
    allinfos = []
    if content != '' and len(content) > 100:
        allinfos = vuelingparser(content)
    else:
        logger.error('VuelingFlight: Get web content failed!')
    return allinfos
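getPostData is not shown in this fragment; POST_DATA_STRING looks like the usual ASP.NET control-group prefix that gets prepended to every form field. A minimal sketch of how such a payload could be assembled — all field names here are assumptions for illustration, not the real Vueling form:

def getPostData(dept_time, dept_id, dest_id):
    # Hypothetical field names; the actual ScheduleSelect form may differ.
    prefix = POST_DATA_STRING
    return {
        prefix + 'MarketStructure': 'OneWay',
        prefix + 'MarketOrigin1': dept_id,
        prefix + 'MarketDestination1': dest_id,
        prefix + 'MarketDay1': dept_time,
    }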
        dept_id = cities_dict[dept_id]
        dest_id = cities_dict[dest_id]
        location = dept_id + '-' + dest_id
        origday = datetime.datetime(int(dept_date[0:4]), int(dept_date[5:7]), int(dept_date[8:]))
        # Days from today until departure; used as an offset in the url.
        urlday = (origday - datetime.datetime.today()).days
        #dept_date = orig_date
        #logger.info('contents: %s %s %s %s ' % (location, flight_no, dept_date, str(urlday)))
    except Exception, e:
        logger.error(str(e))
        logger.error('Content Error: Wrong content format with %s' % content)
        return result

    url = URL % (location, urlday)
    p = get_proxy(source='elongFlight')
    htmlcontent = crawl_single_page(url, n=1, proxy=p)
    if htmlcontent == '':
        invalid_proxy(p)
        logger.error('Proxy Error: htmlcontent is null with proxy: %s' % p)
        return result

    # Check whether a navigation page came back instead of results: a navigation
    # page means the content carries no flight information, and if no flights
    # are found the pattern yields too few matches, so return the empty result.
    temp_flight_list = flightPattern.findall(htmlcontent)
    if len(temp_flight_list) == 1:
        logger.error('Parser Error: cannot find flights with %s' % location)
        return result
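One subtlety in the urlday arithmetic above: datetime.datetime.today() carries the current clock time, so the timedelta falls short of a whole day and .days truncates downward. A quick standalone check of that behavior:

import datetime

origday = datetime.datetime(2024, 3, 15)    # departure date at midnight
now = datetime.datetime(2024, 3, 1, 13, 0)  # "today" at 13:00
print (origday - now).days                  # 13, not 14

Using datetime.date.today() with a date-only origday would avoid the truncation, if whole calendar days are what the URL expects.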
logger.error( "pageParser Error: %s" %str(e)) return page_num def wego_task_parser(content): content = content.encode('UTF-8').strip() try: info = content.split('&') dept_id = info[0] arr_id = info[1] dept_date = info[2][:4] + '-' + info[2][4:6] + '-' + info[2][6:] except Exception,e: logger.error('wegoFlight Content Error: cannot extract information from %s'%content) return None #获取代理 p = get_proxy(type = '') #获取初始url url_temp = get_url(dept_id,arr_id,dept_date) search_id = get_search_id(url_temp,proxy = p) if search_id == '': logger.error('Search_Id Error: get Search_Id failed') return None trip_id = get_trip_id(dept_id,arr_id,dept_date) #使用初始url,获取要爬取的页面,page表示一共有多少页 start_url = get_start_url(search_id,trip_id) content_temp = crawl_single_page(start_url,proxy = p, Host="www.wego.cn", Accept="text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8") if content_temp == "":
        dest_id = infos[1]  # three-letter airport code
        dept_day = infos[2]
        dept_date = dept_day[0:4] + '-' + dept_day[4:6] + '-' + dept_day[6:]
    except Exception, e:
        logger.error('lcairFlight: Wrong Content Format with %s' % content)
        result['error'] = TASK_ERROR
        return result

    if dept_id not in AIRPORT_CITY_DICT or dest_id not in AIRPORT_CITY_DICT:
        logger.warning('lcairFlight: airport not in AIRPORT_CITY_DICT')
        logger.info(dept_id)
        logger.info(dest_id)
        result['error'] = DATA_NONE
        return result

    p = get_proxy(source='lcairFlight')
    if p == None:
        result['error'] = PROXY_NONE
        return result

    postdata = getPostData(dept_id, dest_id, dept_date)
    if postdata == None:
        result['error'] = UNKNOWN_TYPE
        return result

    #referer = Referer % (AIRPORT_CITY_CN_DICT[dept_id], AIRPORT_CITY_DICT[dept_id], AIRPORT_CITY_CN_DICT[dest_id], AIRPORT_CITY_DICT[dest_id], dept_date)
    uc = UrllibCrawler(p=p)
    #uc.get(referer)
    trip_way = 'Oneway'
    searchURL = "http://flights.ctrip.com/booking/%s-%s-day-1.html?DCity1=%s&ACity1=%s&DDate1=%s&passengerQuantity=1&SendTicketCity=undefined&PassengerType=ADU&SearchType=S&RouteIndex=1&RelDDate=&RelRDate="
    interSearchURL = "http://flights.ctrip.com/international/ShowFareFirst.aspx?flighttype=S&relddate=%s&dcity=%s&acity=%s"
    is_inter = False
    if dept_id.lower() in CN_AIRPORTS and dest_id.lower() in CN_AIRPORTS:
        searchURL = searchURL % (dept_id, dest_id, dept_id, dest_id, dept_date)
    else:
        searchURL = interSearchURL % (dept_date, dept_id, dest_id)
        is_inter = True
    refererURL = "http://flights.ctrip.com/booking/"
    cookie = {}
    p = get_proxy()
    resp = crawl_single_page(searchURL, proxy=p, cookie=cookie)
    if resp == None or len(resp) == 0:
        invalid_proxy(p)
        return None

    # 2. Parse the page.
    tree = etree.HTML(resp)
    if is_inter or GetTextByXpath(tree, "//title/text()").endswith("携程国际机票"):
        # International tickets: fetch the fare page keyed by the query log id.
        queryLogTransNo = tree.xpath("//input[@id='queryLogTransNo']")[0].get("value")
        resp = GetInterPricePage(queryLogTransNo, cookie, searchURL)  #, use_proxy)
        return ParseInterPage(resp)
    else:
        # Domestic tickets.
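GetTextByXpath is used above but not defined in this fragment. A minimal sketch of the obvious shape, assuming lxml trees and text-producing xpaths like "//title/text()" (the real helper may behave differently):

def GetTextByXpath(tree, xpath):
    # Return the first result of a text-producing xpath, stripped, or ''
    # when nothing matches, so callers can safely chain endswith() etc.
    nodes = tree.xpath(xpath)
    if nodes:
        return nodes[0].strip()
    return ''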
reload(sys)
sys.setdefaultencoding('utf-8')

# -------------------------------
# Table to update.
BASIC_TABLE = 'qyer'
# Task table to read urls from.
TASK_TABLE = 'qyer'
# Whether to write results to the database: True inserts, False skips.
IS_INSERT = True
# Debug switch: when True, process only one URL, then break.
DEBUG = True
# ---------------------------------

PROXY = get_proxy(source="Platform")

def get_task():
    #sql = "select url from " + TASK_TABLE + " where map_info is null"
    #sql = "select * from " + TASK_TABLE
    sql = "select url from qyer where cateid='景点观光' and map_info is null"
    return db_add.QueryBySQL(sql)

class QyerParser():
    def __init__(self):
        pass

    def crawl(self, url):
        to_date_temp = datetime.datetime(int(from_date_temp[:4]), int(from_date_temp[4:6]),
                                         int(from_date_temp[6:8]))
        to_date = str(to_date_temp + datetime.timedelta(days=1))[:10]
    except Exception, e:
        logger.error('youzhanHotel: Wrong Content Format with %s' % taskcontent)
        result['error'] = TASK_ERROR
        return result

    room = Room()
    price_url = get_price_url(hotel_id, ipathid, from_date, to_date)
    i = 0
    content_len = 0
    while i < 5 and content_len < CONTENT_LEN:
        p = get_proxy(source='youzhanHotel')
        if p == None:
            result['error'] = PROXY_NONE
            return result
        # Append a millisecond timestamp as a cache-buster.
        url = price_url + str(int(time.time() * 1000))
        price_page = crawl_single_page(url, proxy=p, n=1)
        content_len = len(price_page)
        i += 1

    if price_page == None or price_page == '':
        invalid_proxy(proxy=p, source='youzhanHotel')
        result['error'] = PROXY_INVALID
        return result
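The fetch loop above — retry a few times until the body reaches CONTENT_LEN, rebuilding a cache-busting url each pass — recurs across several of these parsers. A generic sketch of the pattern (a hypothetical helper, not in the original codebase):

def fetch_until(build_url, fetch, min_len, attempts=5):
    # Retry fetch(build_url()) until the response is at least min_len
    # characters long or the attempts run out; returns the last response.
    page = ''
    for _ in range(attempts):
        page = fetch(build_url())
        if page and len(page) >= min_len:
            break
    return page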
    result = {}
    result['para'] = None
    result['error'] = 0
    try:
        contents = content.split('&')
        dept_id = contents[0]
        dest_id = contents[1]
        dept_date = contents[2][:4] + '-' + contents[2][4:6] + '-' + contents[2][6:]
        # Placeholder return date ten days out; per the original note, do not use this value.
        ret_date = str(datetime.datetime.strptime(dept_date[2:], '%y-%m-%d') + datetime.timedelta(10)).split(' ')[0].strip()
    except Exception, e:
        logger.error('ryanairFlight: wrong content format with %s' % content)
        result['error'] = TASK_ERROR
        return result

    p = get_proxy(source='ryanairFlight')
    if p == None:
        result['error'] = PROXY_NONE
        return result

    trip_type = 'Oneway'
    page = GetData(trip_type, dept_id, dest_id, dept_date, ret_date, proxy=p)
    if page == None:
        invalid_proxy(proxy=p, source='ryanairFlight')  # bug fix: source previously said 'ctripFlight'
        result['error'] = PROXY_INVALID
        return result

    data = ParsePage(page)
    if data == None:
import sys
import re
from lxml import html
import codecs
import db_add
import urllib
import json
import math
import httplib
from common.common import get_proxy, invalid_proxy
import time

reload(sys)
sys.setdefaultencoding('utf-8')

CITY_TABLE = 'tp_city'
p = get_proxy(source='Platform')
#p = ''

def insert_db(args):
    sql = 'insert ignore into tp_rest_basic_0707(source, city_id, id, res_url) values(%s,%s,%s,%s)'
    return db_add.ExecuteSQLs(sql, args)
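insert_db batches (source, city_id, id, res_url) tuples into tp_rest_basic_0707; for reference, a minimal call shape with placeholder values (the return value is whatever db_add.ExecuteSQLs reports, presumably rows affected):

rows = [
    ('daodao', '1', '1234567', 'http://www.tripadvisor.cn/Restaurant_Review-g294212-d1234567.html'),
]
print insert_db(rows)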
        dest_id = infos[1]  # three-letter airport code
        dept_day = infos[2]
        return_day = infos[3]
        dept_date = dept_day[0:4] + '-' + dept_day[4:6] + '-' + dept_day[6:]
        return_date = return_day[0:4] + '-' + return_day[4:6] + '-' + return_day[6:]
    except Exception, e:
        logger.error('lcairRoundFlight: Wrong Content Format with %s' % content)
        return result

    if dept_id not in AIRPORT_CITY_DICT or dest_id not in AIRPORT_CITY_DICT:
        logger.warning('lcairRoundFlight: airport not in AIRPORT_CITY_DICT')
        result['error'] = DATA_NONE
        return result

    p = get_proxy(source='lcairRoundFlight')
    if p == None:
        result['error'] = PROXY_NONE
        return result

    postdata = getPostData(dept_id, dest_id, dept_date, return_date)
    if postdata == None:
        result['error'] = UNKNOWN_TYPE
        return result

    uc = UrllibCrawler(p=p)
    html = uc.post(SearchURL, postdata, html_flag=True)
    #print html