def GetData(tripType, orig, dest, deptDate, retDate, proxy=None):
    """POST a Ryanair availability search and return the response body.

    Args:
        tripType: e.g. 'Oneway'/'Return' (passed through to the form).
        orig, dest: origin/destination airport codes.
        deptDate, retDate: departure/return dates in the site's format.
        proxy: optional proxy to use for the first attempt; callers elsewhere
            pass proxy=p, so accept it here (backward-compatible default).

    Returns the page text, or the last failed response (None/'') after
    three attempts.
    """
    searchURL = "https://www.bookryanair.com/SkySales/Search.aspx"
    refererURL = "https://www.bookryanair.com/SkySales/booking.aspx?culture=en-gb&lc=en-gb&cmpid2=Google"
    data = {"fromaction": "Search.aspx",
            "SearchInput$TripType": tripType,
            "SearchInput$Orig": orig,
            "SearchInput$Dest": dest,
            "SearchInput$DeptDate": deptDate,
            "SearchInput$RetDate": retDate,
            "SearchInput$IsFlexible": "on",
            "SearchInput$PaxTypeADT": 1,
            "SearchInput$PaxTypeCHD": 0,
            "SearchInput$PaxTypeINFANT": 0,
            "SearchInput$AcceptTerms": "on",
            "__EVENTTARGET": "SearchInput$ButtonSubmit",
            }
    # If the fetch fails, mark the proxy bad, switch to a new one and retry.
    resp = None
    for attempt in range(3):
        p = proxy if (attempt == 0 and proxy) else get_proxy()
        resp = request_post_data(searchURL, data, referer=refererURL, proxy=p)
        if not resp:  # None or empty body -> proxy considered dead
            invalid_proxy(p)
        else:
            return resp
    return resp
def elong_task_parser(content):
    """Parse an elong flight task string "LOCATION&YYYYMMDD" and scrape flights.

    Returns:
        None  -- malformed task string;
        []    -- page fetch failed or no flights found;
        list  -- de-duplicated flight tuples otherwise.
    """
    contents = content.split('&')
    if len(contents) != 2:
        logger.error('elongFlight: wrong content format with %s' % content)
        return None
    location, origdate = contents[0].strip(), contents[1].strip()
    # int() replaces the Python2-only string.atoi; identical for base-10 input.
    origday = datetime.datetime(int(origdate[0:4]),
                                int(origdate[4:6]),
                                int(origdate[6:]))
    urlday = (origday - datetime.datetime.today()).days
    dept_date = str(origday).split(' ')[0].strip()
    url = URL % (location, urlday)
    p = get_proxy()
    htmlcontent = crawl_single_page(url, proxy=p)
    if htmlcontent == '':
        invalid_proxy(p)
        logger.error(
            'elongFlight: Proxy Error: htmlcontent is null with proxy: %s' % p)
        return []
    # A single match means the site served its navigation page,
    # i.e. there is no flight information for this task.
    temp_flight_list = flightPattern.findall(htmlcontent)
    if len(temp_flight_list) == 1:
        # typo 'elongFilght' fixed in the log tag
        logger.error('elongFlight: Parser Error: cannot find flights with %s'
                     % location)
        return []
    flights = []
    for item in temp_flight_list[:-1]:
        typ = len(typePattern.findall(item))
        if typ == 0:
            continue  # no type marker -> not a flight row
        if typ == 1:
            direct_info = directFlight_parser(item, dept_date, airports_dict)
            if direct_info != []:
                flights.append(direct_info)
        else:
            transfer_info = transferFlight_parser(item, dept_date, airports_dict)
            if transfer_info != []:
                flights.append(transfer_info)
    # de-duplicate while keeping a list return type
    return list(set(flights))
def elong_task_parser(content):
    """Fetch and parse elong flights for a "LOCATION&YYYYMMDD" task string.

    Returns None for a malformed task, [] when the page is unavailable or
    holds no flights, otherwise a de-duplicated list of flight tuples.
    """
    parts = [piece.strip() for piece in content.split('&')]
    if len(parts) != 2:
        logger.error('elongFlight: wrong content format with %s' % content)
        return None
    location, origdate = parts
    year = string.atoi(origdate[0:4])
    month = string.atoi(origdate[4:6])
    day = string.atoi(origdate[6:])
    origday = datetime.datetime(year, month, day)
    urlday = (origday - datetime.datetime.today()).days
    dept_date = str(origday).split(' ')[0].strip()
    proxy = get_proxy()
    htmlcontent = crawl_single_page(URL % (location, urlday), proxy=proxy)
    if htmlcontent == '':
        invalid_proxy(proxy)
        logger.error('elongFlight: Proxy Error: htmlcontent is null with proxy: %s' % proxy)
        return []
    # A lone match means the site returned its navigation page:
    # there is no flight information for this task.
    matches = flightPattern.findall(htmlcontent)
    if len(matches) == 1:
        logger.error('elongFilght: Parser Error: cannot find flights with %s' % location)
        return []
    flights = []
    for row in matches[:-1]:
        markers = len(typePattern.findall(row))
        if markers == 1:
            info = directFlight_parser(row, dept_date, airports_dict)
        elif markers > 1:
            info = transferFlight_parser(row, dept_date, airports_dict)
        else:
            continue
        if info != []:
            flights.append(info)
    return [flight for flight in set(flights)]
def GetInterPricePage(queryLogTransNo, cookie, referer, use_proxy=True):
    """Fetch ctrip's international sub-step price page for a query-log id.

    Args:
        queryLogTransNo: transaction id interpolated into the price URL.
        cookie, referer: forwarded to the page fetcher.
        use_proxy: currently unused; kept for interface compatibility.

    Returns the page body on success, or None when the fetch failed.
    """
    priceURL = "http://flights.ctrip.com/international/GetSubstepSearchResults.aspx?IsJSON=T&queryLogTransNo=%s&QueryType=1&cityPairAirline=first&withDirectAirline=T&RdNo=2103213618&ind=347,359,356,370" % queryLogTransNo
    # On failure, mark the proxy bad and retry with a new one.
    # The retry budget is currently 1, i.e. a single attempt.
    for _ in range(1):
        p = get_proxy()
        resp = crawl_single_page(priceURL, referer=referer, proxy=p, cookie=cookie)
        if not resp:  # None or empty page
            invalid_proxy(p)
        else:
            return resp
    return None
def crawl(url):
    """Fetch *url* through the module-level PROXY.

    A response shorter than 1000 bytes is treated as a bad/blocked proxy:
    the proxy is reported invalid and a fresh 'Platform' proxy is swapped
    in, up to 11 attempts total.  Returns the (possibly still short)
    page content.
    """
    global PROXY
    mc = MC()
    mc.set_proxy(PROXY)
    content = mc.req('get', url, html_flag=True, time_out=15)
    count = 0
    while len(content) < 1000:
        invalid_proxy(PROXY, 'Platform')
        PROXY = get_proxy(source='Platform')
        mc.set_proxy(PROXY)
        # parenthesized print: same output on Python 2 and Python 3
        print('proxy: %s' % PROXY)
        content = mc.req('get', url, html_flag=True, time_out=15)
        count += 1
        if count > 10:
            break
    return content
def crawl(url):
    """Fetch *url* through the module-level proxy ``p``.

    While the response is shorter than 2000 bytes (assumed to mean the
    proxy failed or the request was blocked), report the proxy invalid,
    rotate to a new 'Platform' proxy and retry -- giving up after 6
    attempts.  Returns the (possibly still short) page content.
    Mutates the global ``p`` as a side effect.
    """
    global p
    mc = MC()
    #mc.set_debug(True)
    mc.set_proxy(p)
    print 'proxy:',p
    content = mc.req('get', url, html_flag = True,time_out=20)
    count = 0
    # retry loop: short page -> proxy presumed bad, swap and refetch
    while len(content) < 2000:
        invalid_proxy(p,'Platform')
        p = get_proxy(source = 'Platform')
        mc.set_proxy(p)
        print p
        content = mc.req('get', url, html_flag = True,time_out=20)
        count += 1
        if count>5:
            break
    return content
def crawl(url):
    """Fetch *url* via the module-level PROXY, rotating proxies on failure.

    Any response under 1000 bytes is taken as a dead proxy: it is reported
    invalid and replaced with a fresh 'Platform' proxy, for at most 11
    attempts in total.  Returns the page content and updates the global
    PROXY in place.
    """
    global PROXY
    fetcher = MC()
    fetcher.set_proxy(PROXY)
    content = fetcher.req('get', url, html_flag=True)
    attempts = 0
    while len(content) < 1000:
        invalid_proxy(PROXY, 'Platform')
        PROXY = get_proxy(source='Platform')
        fetcher.set_proxy(PROXY)
        print('proxy: %s' % PROXY)
        content = fetcher.req('get', url, html_flag=True)
        attempts += 1
        if attempts > 10:
            break
    return content
def GetInterPricePage(queryLogTransNo, cookie, referer, use_proxy=True):
    """Retrieve ctrip's international sub-step price page.

    Returns the page body on success, None when the fetch failed.
    ``use_proxy`` is accepted for interface compatibility.
    """
    priceURL = "http://flights.ctrip.com/international/GetSubstepSearchResults.aspx?IsJSON=T&queryLogTransNo=%s&QueryType=1&cityPairAirline=first&withDirectAirline=T&RdNo=2103213618&ind=347,359,356,370" % queryLogTransNo
    # retry budget: currently a single attempt; a bad proxy is reported
    for _ in range(1):
        proxy = get_proxy()
        page = crawl_single_page(priceURL, referer=referer, proxy=proxy, cookie=cookie)
        if page:
            return page
        invalid_proxy(proxy)
    return
return result if hotel_id_temp == '0': result['error'] = TASK_ERROR return result p = get_proxy(source='elongHotel') if p == None: result['error'] = PROXY_NONE return result post_data = get_post_data(hotel_id_temp, check_in, check_out) page = request_post_data(request_url,data=post_data,proxy=p) if page == None or page == '': invalid_proxy(proxy=p, source='elongHotel') result['error'] = PROXY_INVALID return result room_list = parseRoom(page,hotel_name,city_name_zh,check_in,check_out,hotel_id) if room_list != []: result['para'] = room_list return result else: result['error'] = DATA_NONE return result def elong_room_request_parser(content):
except Exception, e: logger.error('ryanairFlight: wrong content format with %s' % content) result['error'] = TASK_ERROR return result p = get_proxy(source='ryanairFlight') if p == None: result['error'] = PROXY_NONE return result trip_type = 'Oneway' page = GetData(trip_type, dept_id, dest_id, dept_date, ret_date, proxy=p) if page == None: invalid_proxy(proxy=p, source='ctripFlight') result['error'] = PROXY_INVALID return result data = ParsePage(page) if data == None: result['error'] = DATA_NONE return result currency = GetCurrency(page) allinfo = [] data = jsonlib.read(data) for k, v in data.items(): for one_day_flights in v: for one_day_flight in one_day_flights[1]:
if url == '' or url == None: logger.error('feifanFlight: Get url failed!') result['error'] = UNKNOWN_TYPE return result #抓取页面并判断其是否可用 #feifan常常要刷新才能获取内容,所以爬取3次 for i in range(3): page = crawl_single_page(url, proxy=p) if page != '' and len(page) > 300: flights = parsePage(page, dept_year) if flights == []: if page.find('非凡旅行网-抱歉,您没有权限访问') != -1: invalid_proxy(proxy=p, source='feifanFlight') result['error'] = PROXY_FORBIDDEN else: result['error'] = DATA_NONE return result else: result['para'] = flights return result else: continue invalid_proxy(proxy=p, source='feifanFlight') logger.error('feifanFlight: Get page content failed!') result['error'] = PROXY_INVALID return result
(taskcontent)) result['error'] = TASK_ERROR return result p = get_proxy(source='tongchengFlight') #p = crawl_single_page('http://114.215.168.168:8086/proxy') #p = None if p == None: result['error'] = PROXY_NONE return result url = get_url(dept_city, dest_city, dept_day, dept_id, dest_id, p) if url == 'proxy_forbidden': invalid_proxy(proxy=p, source='tongchengFlight') result['error'] = PROXY_FORBIDDEN return result if url != '': page = crawl_single_page(url, proxy=p) print page else: logger.error('tongchengFlight: Get url failed!') invalid_proxy(proxy=p, source='tongchengFlight') result['error'] = PROXY_INVALID return result if page != '' and len(page) > CONTENT_LEN: flights = ParsePage(page) else:
Flight.dept_id) for i in range(len(multi_price)): multi_ticket.append((Flight.fight_no,Flight.plane_no,Flight.airline,Flight.dept_id,Flight.dest_id,\ Flight.dept_day,Flight.dept_time, Flight.dest_time,Flight.dur, multi_price[i][0], multi_price[i][1],\ multi_price[i][2], multi_price[i][3],multi_price[i][4],multi_price[i][5],multi_price[i][6], multi_price[i][7])) # print every ticket and ervery flight if page_flag == True: print 'the num of tickets is ' + str(len(multi_ticket)) result['error'] = 0 return result else: result['error'] = UNKNOWN_TYPE return result else: if html.find('NEEDVERIFY') != -1: invalid_proxy(proxy=task_content_proxy, source='csairFlight') return { 'para': { 'flight': {}, 'ticket': [] }, 'error': PROXY_INVALID } else: return {'para': {'flight': {}, 'ticket': []}, 'error': DATA_NONE} def csair_request_parser(content): result = -1 return result
except Exception,e: logger.error('bookingHotel: Wrong Content Format with %s'%taskcontent) result['error'] = TASK_ERROR return result hotel_url = get_hotel_url(url_hotel_name,check_in,check_out) p = get_proxy(source='bookingHotel') if p == None: result['error'] = PROXY_NONE return result page = crawl_single_page(hotel_url, proxy = p) if page == None or page == '': invalid_proxy(proxy=p, source='bookingHotel') result['error'] = PROXY_INVALID return result if len(page) > CONTENT_LEN: room_info = parseRoom(page, check_in, check_out, hotel_id) else: result['error'] = UNKNOWN_TYPE return result if room_info != []: result['para'] = room_info return result else: result['error'] = DATA_NONE
postdata = getPostData(dept_time, dept_id, dest_id) p = get_proxy(source='vuelingFlight') if p == None: result['error'] = PROXY_NONE return result url = 'http://tickets.vueling.com/ScheduleSelect.aspx' Referer = 'http://tickets.vueling.com/ScheduleSelect.aspx' content = request_post_data(url,postdata,referer=Referer,proxy=p,\ Accept="text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8") if content == '' or content == None: invalid_proxy(proxy=p, source='vuelingFlight') result['error'] = PROXY_INVALID allinfos = vuelingparser(content) if allinfos == []: result['error'] = DATA_NONE return result else: result['para'] = allinfos return result def vueling_request_parser(content): result = -1
def crawl(city_url,city_id):
    """Scrape the TripAdvisor ('daodao') restaurant listing for one city.

    Fetches the first listing page at *city_url* (rotating the global
    proxy ``p`` on short responses, up to 21 attempts), extracts the total
    restaurant count and the first page of restaurant URLs/ids, inserts
    them via insert_db, then pages through the RestaurantSearch AJAX
    endpoint 30 results at a time, inserting each page's records.
    Mutates the global ``p`` as a side effect; returns None.

    NOTE(review): reconstructed indentation -- the original source was
    collapsed onto a single line; loop nesting of the final statements is
    inferred and should be confirmed against the original file.
    """
    global p
    source = 'daodao'
    #city_url = city_url.replace('Tourism','Restaurants')
    print city_url
    mc = MC()
    mc.set_proxy(p)
    print 'proxy: %s' % p
    page1 = ''
    page1 = mc.req('get',city_url,html_flag=True, time_out=10)
    count =0
    # retry with fresh proxies while the page looks too short to be real
    while len(page1)<1000:
        invalid_proxy(p,'Platform')
        p = get_proxy(source='Platform')
        print 'proxy: %s' % p
        mc.set_proxy(p)
        page1 = mc.req('get',city_url,html_flag=True , time_out=10)
        count += 1
        if count > 20:
            break
    source_city_id = re.compile(r'-g(\d+)').findall(city_url)[0]
    root = html.fromstring(page1)
    # total number of restaurants in the city
    rating_info = root.find_class('listing')[0].find_class('popIndexDefault')[0].xpath('text()')[0].encode('utf-8').strip().split('(')[1].replace(',','')
    nums = re.compile(r'(\d+)').findall(rating_info)
    res_total = int(nums[0])
    print "total: %s " % res_total
    # restaurant list on the first page
    items = root.find_class('listing')
    data_list = []
    for item in items:
        res_url = 'http://www.tripadvisor.cn' + item.find_class('title')[0].xpath('a/@href')[0].strip().encode('utf-8')
        res_id = re.compile(r'd(\d+)').findall(res_url)[0].encode('utf-8')
        print res_url
        data = (source,city_id,res_id,res_url)
        print data
        data_list.append(data)
    print 'insert',insert_db(data_list)
    print '------------next page------------'
    itag = '10591'  # restaurant category id
    page = 2
    data_list = []
    # page through the AJAX listing endpoint, 30 results per page
    for offset in range(30,res_total+1,30):
        print '-----------page %s-------' % page
        page += 1
        next_url = 'http://www.tripadvisor.cn/RestaurantSearch?Action=PAGE&geo=%s&ajax=1&itags=%s&sortOrder=popularity&o=a%s&availSearchEnabled=false' % (source_city_id,itag,offset)
        print next_url
        content2 = ''
        content2 = mc.req('get',next_url,html_flag = True)
        # NOTE(review): this retry loop fetches a new proxy but never calls
        # mc.set_proxy(p), never reports the old proxy invalid, and has no
        # attempt limit -- possibly unintentional; confirm before relying on it.
        while (len(content2) < 1000):
            p = get_proxy(source='Platform')
            print 'proxy: %s' % p
            content2 = mc.req('get',next_url,html_flag = True)
        no_count = len( re.compile(r'(该餐馆暂无点评,来写第一条)').findall(content2) )
        # stop paging once (nearly) every entry is the "no reviews yet" placeholder
        if int(no_count) >29:
            break
        root2 = html.fromstring(content2)
        items = root2.find_class('listing')
        data_list2 = []
        for item in items:
            res_url = 'http://www.tripadvisor.cn' + item.find_class('title')[0].xpath('a/@href')[0].strip().encode('utf-8')
            res_id = re.compile(r'd(\d+)').findall(res_url)[0].encode('utf-8')
            print res_url
            data2 = (source,city_id,res_id,res_url)
            print data2
            data_list2.append(data2)
        print 'insert',insert_db(data_list2)
    print 'city %s ok' % city_id
return result if hotel_id_temp == '0': result['error'] = TASK_ERROR return result p = get_proxy(source='elongHotel') if p == None: result['error'] = PROXY_NONE return result post_data = get_post_data(hotel_id_temp, check_in, check_out) page = request_post_data(request_url, data=post_data, proxy=p) if page == None or page == '': invalid_proxy(proxy=p, source='elongHotel') result['error'] = PROXY_INVALID return result room_list = parseRoom(page, hotel_name, city_name_zh, check_in, check_out, hotel_id) if room_list != []: result['para'] = room_list return result else: result['error'] = DATA_NONE return result
return result url = URL % (location, urlday) p = get_proxy(source='elongFlight') if p == None: result['error'] = PROXY_NONE return result mc = MechanizeCrawler(p='') page = mc.get(url, html_flag=True) if page == None: invalid_proxy(proxy=p, source='elongFlight') logger.error( 'elongFlight: Proxy Error: htmlcontent is null with proxy: %s' % p) result['error'] = PROXY_INVALID return result tickets, flights = elong_page_parser(page) if tickets == [] or tickets == None: result['error'] = DATA_NONE return result result['para']['flight'] = flights result['para']['ticket'] = tickets return result
origday = datetime.datetime(string.atoi(dept_date[0:4]),string.atoi(dept_date[5:7]),string.atoi(dept_date[8:])) urlday = (origday - datetime.datetime.today()).days #dept_date = orig_date #logger.info('contents: %s %s %s %s '%(location,flight_no,dept_date,str(urlday))) except Exception,e: logger.error(str(e)) logger.error('Content Error: Wrong content format with %s'%content) return result url = URL%(location,urlday) p = get_proxy(source='elongFlight') htmlcontent = crawl_single_page(url,n=1,proxy = p) if htmlcontent == '': invalid_proxy(p) logger.error('Proxy Error: htmlcontent is null with proxy: %s'%p) return result #判断是否返回导航页,返回导航页说明content没有航班信息 #判断是否找到航班信息,没有返回[] temp_flight_list = flightPattern.findall(htmlcontent) if len(temp_flight_list) == 1: logger.error('Parser Error: cannot find flights with %s'%location) return result flight_list = temp_flight_list[:-1] #flights = [] typ = 0
except Exception, e: logger.error('bookingHotel: Wrong Content Format with %s' % taskcontent) result['error'] = TASK_ERROR return result hotel_url = get_hotel_url(url_hotel_name, check_in, check_out) p = get_proxy(source='bookingHotel') if p == None: result['error'] = PROXY_NONE return result page = crawl_single_page(hotel_url, proxy=p) if page == None or page == '': invalid_proxy(proxy=p, source='bookingHotel') result['error'] = PROXY_INVALID return result if len(page) > CONTENT_LEN: room_info = parseRoom(page, check_in, check_out, hotel_id) else: result['error'] = UNKNOWN_TYPE return result if room_info != []: result['para'] = room_info return result else: result['error'] = DATA_NONE
return result #获取代理 p = get_proxy(source='wegoFlight') if p == None: result['error'] = PROXY_NONE return result #获取初始url url_temp = get_url(dept_id, arr_id, dept_date) search_id = get_search_id(url_temp, proxy=p) if search_id == '': logger.error('Search_Id Error: get Search_Id failed') invalid_proxy(proxy=p, source='wegoFlight') result['error'] = PROXY_FORBIDDEN return result trip_id = get_trip_id(dept_id, arr_id, dept_date) #使用初始url,获取要爬取的页面,page表示一共有多少页 start_url = get_start_url(search_id, trip_id) content_temp = crawl_single_page( start_url, proxy=p, Host="www.wego.cn", Accept= "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8" ) if content_temp == "":
return result url = URL%(location,urlday) p = get_proxy(source='elongFlight') if p == None: result['error'] = PROXY_NONE return result mc = MechanizeCrawler(p = '') page = mc.get(url, html_flag = True) if page == None: invalid_proxy(proxy = p, source='elongFlight') logger.error('elongFlight: Proxy Error: htmlcontent is null with proxy: %s'%p) result['error'] = PROXY_INVALID return result tickets, flights = elong_page_parser(page) if tickets == [] or tickets == None: result['error'] = DATA_NONE return result result['para']['flight'] = flights result['para']['ticket'] = tickets return result
except Exception,e: logger.error('ryanairFlight: wrong content format with %s'%content) result['error'] = TASK_ERROR return result p = get_proxy(source = 'ryanairFlight') if p == None: result['error'] = PROXY_NONE return result trip_type = 'Oneway' page = GetData(trip_type, dept_id, dest_id, dept_date, ret_date, proxy = p) if page == None: invalid_proxy(proxy = p, source='ctripFlight') result['error'] = PROXY_INVALID return result data = ParsePage(page) if data == None: result['error'] = DATA_NONE return result currency = GetCurrency(page) allinfo = [] data = jsonlib.read(data) for k, v in data.items(): for one_day_flights in v: for one_day_flight in one_day_flights[1]:
Flight.dur = get_duration(Flight.dest_time,Flight.dest_id,Flight.dept_time,Flight.dept_id) for i in range(len(multi_price)): multi_ticket.append((Flight.fight_no,Flight.plane_no,Flight.airline,Flight.dept_id,Flight.dest_id,\ Flight.dept_day,Flight.dept_time, Flight.dest_time,Flight.dur, multi_price[i][0], multi_price[i][1],\ multi_price[i][2], multi_price[i][3],multi_price[i][4],multi_price[i][5],multi_price[i][6], multi_price[i][7])) # print every ticket and ervery flight if page_flag == True: print 'the num of tickets is '+ str(len(multi_ticket)) result['error'] = 0 return result else: result['error'] = UNKNOWN_TYPE return result else: if html.find('NEEDVERIFY') != -1: invalid_proxy(proxy=task_content_proxy,source='csairFlight') return {'para':{'flight':{},'ticket':[]},'error':PROXY_INVALID} else: return {'para':{'flight':{},'ticket':[]},'error':DATA_NONE} def csair_request_parser(content): result = -1 return result '''if __name__ == '__main__': proxy_flag = False proxy = None taskcontent = 'PEK&PAR&20140730' result = csair_task_parser(taskcontent)
content_len = 0 while i < 5 and content_len < CONTENT_LEN: #p = get_proxy() p = get_proxy(source='youzhanHotel') #print p if p == None: result['error'] = PROXY_NONE return result url = price_url + str(int(time.time() * 1000)) price_page = crawl_single_page(url,proxy=p,n=1) content_len = len(price_page) i += 1 if price_page == None or price_page == '': invalid_proxy(proxy=p, source='youzhanHotel') result['error'] = PROXY_INVALID return result #print price_page price_list = price_parser(price_page,hotel_id) if price_list != []: for each_room in price_list: if len(each_room) > 3: room.city = city room.occupancy = 2 #room.hotel_name = hotel.hotel_name #print '******' #print each_room room.room_desc = each_room[3] room.real_source = each_room[2]
#p = get_proxy() #print p p = get_proxy(source='haodingHotel') if p == '' or p == None: return {'para':[], 'error':NO_PROXY} i = 0 content_len = 0 while i < 3 and content_len < CONTENT_LEN: content = crawl_single_page(hotel_url, p) content_len = len(content) i += 1 if content == '' or content == None: invalid_proxy(proxy = p, source='haodingHotel') return {'para':[], 'error':NO_CONTENT} if len(content) < CONTENT_LEN: return {'para':[], 'error':NO_INFO} room_list = parseRoom(content,city_name_zh,country_name_zh,hotel_id,check_in,check_out) if room_list == [] or room_list == None: return {'para':[], 'error':NO_RESULT} return {'para':room_list, 'error':0} def haoding_room_request_parser(content):
#p = '116.228.55.217:8000' p = get_proxy() url = 'http://tickets.vueling.com/ScheduleSelect.aspx' Referer = 'http://tickets.vueling.com/ScheduleSelect.aspx' content = request_post_data(url,postdata,referer=Referer,proxy=p,\ Accept="text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8") result = -1 if content != '' and len(content) > 100: result = vuelingparser(content, flight_no, req_dept_time) else: invalid_proxy(p) logger.error('Get web content failed!') return result def vuelingparser(content, flight_no, req_dept_time): #allinfos = [] #get flight num flight_num_list = [] flight_num_info_temp = flight_no_pat.findall(content) if flight_num_info_temp != []: for flight_num_info in flight_num_info_temp: flight_num_temp_1 = flight_num_info.find('|') flight_num_temp_2 = flight_num_info.rfind('~^')
return result #获取代理 p = get_proxy(source = 'wegoFlight') if p == None: result['error'] = PROXY_NONE return result #获取初始url url_temp = get_url(dept_id,arr_id,dept_date) search_id = get_search_id(url_temp,proxy = p) if search_id == '': logger.error('Search_Id Error: get Search_Id failed') invalid_proxy(proxy=p, source='wegoFlight') result['error'] = PROXY_FORBIDDEN return result trip_id = get_trip_id(dept_id,arr_id,dept_date) #使用初始url,获取要爬取的页面,page表示一共有多少页 start_url = get_start_url(search_id,trip_id) content_temp = crawl_single_page(start_url,proxy = p, Host="www.wego.cn", Accept="text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8") if content_temp == "": logger.error('Proxy Error: htmlcontent is null with proxy: %s'%p) #反馈代理无效 invalid_proxy(proxy=p, source='wegoFlight') result['error'] = PROXY_INVALID return result