def GetData(tripType, orig, dest, deptDate, retDate):
    """Submit the Ryanair search form and return the response body.

    The form is posted through a proxy obtained from ``get_proxy()``.  Up to
    three attempts are made; whenever an attempt yields ``None`` or an empty
    response, the proxy used is reported through ``invalid_proxy()`` and a
    fresh proxy is requested for the next attempt.

    Args:
        tripType: Value sent as ``SearchInput$TripType``.
        orig: Value sent as ``SearchInput$Orig``.
        dest: Value sent as ``SearchInput$Dest``.
        deptDate: Value sent as ``SearchInput$DeptDate``.
        retDate: Value sent as ``SearchInput$RetDate``.

    Returns:
        The first non-empty result of ``request_post_data``.  If all three
        attempts fail, the last result (``None`` or empty) is returned, so
        callers must still check for an empty response.
    """
    searchURL = "https://www.bookryanair.com/SkySales/Search.aspx"
    refererURL = "https://www.bookryanair.com/SkySales/booking.aspx?culture=en-gb&lc=en-gb&cmpid2=Google"
    data = {
        "fromaction": "Search.aspx",
        "SearchInput$TripType": tripType,
        "SearchInput$Orig": orig,
        "SearchInput$Dest": dest,
        "SearchInput$DeptDate": deptDate,
        "SearchInput$RetDate": retDate,
        "SearchInput$IsFlexible": "on",
        "SearchInput$PaxTypeADT": 1,
        "SearchInput$PaxTypeCHD": 0,
        "SearchInput$PaxTypeINFANT": 0,
        "SearchInput$AcceptTerms": "on",
        "__EVENTTARGET": "SearchInput$ButtonSubmit",
    }

    # If the fetch fails, switch to another proxy IP and retry.
    resp = None
    for _ in range(3):
        p = get_proxy()
        resp = request_post_data(searchURL, data, referer=refererURL, proxy=p)
        if resp is not None and len(resp) > 0:
            return resp
        # Empty or missing response: report this proxy before retrying.
        invalid_proxy(p)
    return resp
def GetData(tripType, orig, dest, deptDate, retDate):
    """Submit the Ryanair search form once and return the response body.

    A single request is made through the proxy returned by
    ``get_proxy(type='f')``.  There is no retry, and the proxy is not
    reported as invalid when the response is empty.

    Args:
        tripType: Value sent as ``SearchInput$TripType``.
        orig: Value sent as ``SearchInput$Orig``.
        dest: Value sent as ``SearchInput$Dest``.
        deptDate: Value sent as ``SearchInput$DeptDate``.
        retDate: Value sent as ``SearchInput$RetDate``.

    Returns:
        Whatever ``request_post_data`` returns, unchanged.  This may be
        ``None`` or empty on failure, so callers must check it.
    """
    searchURL = "https://www.bookryanair.com/SkySales/Search.aspx"
    refererURL = "https://www.bookryanair.com/SkySales/booking.aspx?culture=en-gb&lc=en-gb&cmpid2=Google"
    data = {
        "fromaction": "Search.aspx",
        "SearchInput$TripType": tripType,
        "SearchInput$Orig": orig,
        "SearchInput$Dest": dest,
        "SearchInput$DeptDate": deptDate,
        "SearchInput$RetDate": retDate,
        "SearchInput$IsFlexible": "on",
        "SearchInput$PaxTypeADT": 1,
        "SearchInput$PaxTypeCHD": 0,
        "SearchInput$PaxTypeINFANT": 0,
        "SearchInput$AcceptTerms": "on",
        "__EVENTTARGET": "SearchInput$ButtonSubmit",
    }

    p = get_proxy(type='f')
    # NOTE: the previous empty-response branch only contained a disabled
    # invalid_proxy(p) call and then returned the same value as the success
    # branch, so the response is returned directly.
    return request_post_data(searchURL, data, referer=refererURL, proxy=p)
def GetData(tripType, orig, dest, deptDate, retDate):
    """Submit the Ryanair search form once and return the response body.

    A single request is made with an explicit ``Accept`` header through a
    hard-coded proxy address.  There is no retry, and the proxy is not
    reported as invalid when the response is empty.

    Args:
        tripType: Value sent as ``SearchInput$TripType``.
        orig: Value sent as ``SearchInput$Orig``.
        dest: Value sent as ``SearchInput$Dest``.
        deptDate: Value sent as ``SearchInput$DeptDate``.
        retDate: Value sent as ``SearchInput$RetDate``.

    Returns:
        Whatever ``request_post_data`` returns, unchanged.  This may be
        ``None`` or empty on failure, so callers must check it.
    """
    searchURL = "https://www.bookryanair.com/SkySales/Search.aspx"
    refererURL = "https://www.bookryanair.com/SkySales/booking.aspx?culture=en-gb&lc=en-gb&cmpid2=Google"
    data = {
        "fromaction": "Search.aspx",
        "SearchInput$TripType": tripType,
        "SearchInput$Orig": orig,
        "SearchInput$Dest": dest,
        "SearchInput$DeptDate": deptDate,
        "SearchInput$RetDate": retDate,
        "SearchInput$IsFlexible": "on",
        "SearchInput$PaxTypeADT": 1,
        "SearchInput$PaxTypeCHD": 0,
        "SearchInput$PaxTypeINFANT": 0,
        "SearchInput$AcceptTerms": "on",
        "__EVENTTARGET": "SearchInput$ButtonSubmit",
    }

    # NOTE(review): the proxy handed out by get_proxy() is discarded and a
    # fixed address is used instead -- this looks like a debugging leftover;
    # confirm which proxy is intended.  The call itself is kept in case
    # get_proxy() has side effects that callers rely on.
    get_proxy()
    p = '221.181.104.11:8080'

    # The previous empty-response branch only contained a disabled
    # invalid_proxy(p) call and then returned the same value as the success
    # branch, so the response is returned directly.
    return request_post_data(
        searchURL,
        data,
        referer=refererURL,
        proxy=p,
        Accept="text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    )
req_dept_time = dept_date + 'T' + dept_hour + ':00' except Exception, e: logger.error('Parse taskcontent failed!' + str(e)) return -1 postdata = getPostData(dept_date, dept_id, dest_id) #获取代理 #p = '116.228.55.217:8000' p = get_proxy() url = 'http://tickets.vueling.com/ScheduleSelect.aspx' Referer = 'http://tickets.vueling.com/ScheduleSelect.aspx' content = request_post_data(url,postdata,referer=Referer,proxy=p,\ Accept="text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8") result = -1 if content != '' and len(content) > 100: result = vuelingparser(content, flight_no, req_dept_time) else: invalid_proxy(p) logger.error('Get web content failed!') return result def vuelingparser(content, flight_no, req_dept_time): #allinfos = [] #get flight num
taskcontent) result['error'] = TASK_ERROR return result if hotel_id_temp == '0': result['error'] = TASK_ERROR return result p = get_proxy(source='elongHotel') if p == None: result['error'] = PROXY_NONE return result post_data = get_post_data(hotel_id_temp, check_in, check_out) page = request_post_data(request_url, data=post_data, proxy=p) if page == None or page == '': invalid_proxy(proxy=p, source='elongHotel') result['error'] = PROXY_INVALID return result room_list = parseRoom(page, hotel_name, city_name_zh, check_in, check_out, hotel_id) if room_list != []: result['para'] = room_list return result else: result['error'] = DATA_NONE return result
url = get_url(dept_city_zh, dest_city_zh, dept_day, dest_day) i = 0 content_len = 0 while i < 3 and content_len < CONTENT_LEN: page = crawl_single_page(url, proxy=p, referer=first_url, n=1) content_len = len(page) i += 1 if page != '' and page != None and len(page) > CONTENT_LEN: post_data = get_post_data(page, dept_day, dest_day) price_url = PRICE_URL % str(time.time() * 1000) i = 0 content_len = 0 while i < 3 and content_len < CONTENT_LEN: price_page = request_post_data(url=price_url, data=post_data, referer=first_url, \ n=1, proxy=p) content_len = len(price_page) i += 1 price_dict = parsePrice(price_page) flights = parse_page(page, price_dict) result['para'] = flights else: result['error'] = PROXY_INVALID return result else: result['error'] = DATA_NONE return result return result
logger.error('elongHotelParser: Wrong Content Format with %s'%taskcontent) result['error'] = TASK_ERROR return result if hotel_id_temp == '0': result['error'] = TASK_ERROR return result p = get_proxy(source='elongHotel') if p == None: result['error'] = PROXY_NONE return result post_data = get_post_data(hotel_id_temp, check_in, check_out) page = request_post_data(request_url,data=post_data,proxy=p) if page == None or page == '': invalid_proxy(proxy=p, source='elongHotel') result['error'] = PROXY_INVALID return result room_list = parseRoom(page,hotel_name,city_name_zh,check_in,check_out,hotel_id) if room_list != []: result['para'] = room_list return result else: result['error'] = DATA_NONE return result
for flight_content in flights_content: try: flight_adding_id = flight_to_add_state_pat.findall(flight_content)[0] except: continue post_data = { 'flightToAddState':flight_adding_id, 'flightSearchSession':search_session, 'basketOptions':backet_option, 'flightOptionsState':'Visible', '__BasketState':backet_state } i = 0 content_len = 0 while i < 3 and content_len < CONTENT_LEN: content = request_post_data(url=REQUEST_URL, data=post_data, proxy=proxy, n=1) content_len = len(content) i += 1 if len(content) < 100 or content == '' or content == None: continue para = parseFlightAndTicket(content, time_zone_A, time_zone_B) if para['flight'] != {}: flights.update(para['flight']) if para['ticket'] != []: tickets += para['ticket'] result = {'flight':flights, 'ticket':tickets} return result
req_dept_time = dept_date + 'T' + dept_hour + ':00' except Exception,e: logger.error('Parse taskcontent failed!' + str(e)) return -1 postdata = getPostData(dept_date,dept_id,dest_id) #获取代理 #p = '116.228.55.217:8000' p = get_proxy() url = 'http://tickets.vueling.com/ScheduleSelect.aspx' Referer = 'http://tickets.vueling.com/ScheduleSelect.aspx' content = request_post_data(url,postdata,referer=Referer,proxy=p,\ Accept="text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8") result = -1 if content != '' and len(content) > 100: result = vuelingparser(content,flight_no,req_dept_time) else: invalid_proxy(p) logger.error('Get web content failed!') return result def vuelingparser(content,flight_no,req_dept_time): #allinfos = [] #get flight num
check_in = check_in_temp[:4] + '-' + check_in_temp[4:6] + '-' + \ check_in_temp[6:] check_out_temp = datetime.datetime(int(check_in_temp[:4]), int(check_in_temp[4:6]), \ int(check_in_temp[6:])) check_out = str(check_out_temp + datetime.timedelta(days=1))[:10] hotel_id_temp = hotel_id.split('_')[1] if hotel_id_temp == '0': return room_list i = 0 content_len = 0 while i < 3 and content_len < 100: p = get_proxy() logger.info('Proxy: ' + p) post_data = get_post_data(hotel_id_temp, check_in, check_out) content = request_post_data(request_url, data=post_data, proxy=p) content_len = len(content) i += 1 room_list = parseRoom(content, hotel_name, city_name_zh, check_in, check_out, hotel_id) return room_list def parseRoom(content, hotel_name, city_name_zh, check_in, check_out, hotel_id): room_list = [] if content == '' or len(content) < 100: return room_list try:
cj = cookielib.CookieJar() opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj)) urllib2.install_opener(opener) resp = crawl_single_page(first_url,proxy=p, \ Accept='text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', n = 1) if resp.find('404错误') < 0: url = get_url(dept_city_zh,dest_city_zh,dept_day) page = crawl_single_page(url, proxy = p, referer = first_url) if page != '' and len(page) > 300: post_data = get_post_data(page, dept_day) price_url = PRICE_URL%str(time.time()*1000) price_page = request_post_data(price_url, data=post_data, referer=first_url, \ n=1, proxy=p) price_dict = parsePrice(price_page) #print price_dict time.sleep(5) flights = parse_page(page, price_dict) result['para'] = flights else: result['error'] = PROXY_INVALID return result else: result['error'] = DATA_NONE return result return result
flight_adding_id = flight_to_add_state_pat.findall( flight_content)[0] except: continue post_data = { 'flightToAddState': flight_adding_id, 'flightSearchSession': search_session, 'basketOptions': backet_option, 'flightOptionsState': 'Visible', '__BasketState': backet_state } i = 0 content_len = 0 while i < 3 and content_len < CONTENT_LEN: content = request_post_data(url=REQUEST_URL, data=post_data, proxy=proxy, n=1) content_len = len(content) i += 1 if len(content) < 100 or content == '' or content == None: continue para = parseFlightAndTicket(content, time_zone_A, time_zone_B) if para['flight'] != {}: flights.update(para['flight']) if para['ticket'] != []: tickets += para['ticket'] result = {'flight': flights, 'ticket': tickets} return result
check_in = check_in_temp[:4] + '-' + check_in_temp[4:6] + '-' + \ check_in_temp[6:] check_out_temp = datetime.datetime(int(check_in_temp[:4]), int(check_in_temp[4:6]), \ int(check_in_temp[6:])) check_out = str(check_out_temp + datetime.timedelta(days=1))[:10] hotel_id_temp = hotel_id.split('_')[1] if hotel_id_temp == '0': return room_list i = 0 content_len = 0 while i < 3 and content_len < 100: p = get_proxy() logger.info('Proxy: ' + p) post_data = get_post_data(hotel_id_temp, check_in, check_out) content = request_post_data(request_url,data=post_data,proxy=p) content_len = len(content) i += 1 room_list = parseRoom(content,hotel_name,city_name_zh,check_in,check_out,hotel_id) return room_list def parseRoom(content,hotel_name,city_name_zh,check_in,check_out,hotel_id): room_list = [] if content == '' or len(content) < 100: return room_list try: content_json = json.loads(content)['value']['hotelRoomList'] except Exception, e: