def smartfares_task_parser(taskcontent):
    result = {}
    flights = {}
    tickets = []
    result['para'] = {'flight': flights, 'ticket': tickets}
    result['error'] = 0
    taskcontent = taskcontent.encode('utf-8')
    # task content format: dept_id&dest_id&dept_day&dest_day
    parts = taskcontent.strip().split('&')
    try:
        dept_id, dest_id, dept_day, dest_day = parts[0], parts[1], parts[2], parts[3]
    except IndexError:
        logger.error('smartfaresFlight::Wrong Content Format with %s' % taskcontent)
        result['error'] = TASK_ERROR
        return result

    p = get_proxy(source='smartfaresFlight')
    if p is None:
        result['error'] = PROXY_NONE
        return result

    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)

    try:
        search_url = get_search_url(dept_day, dest_day, dept_id, dest_id)
        content = crawl_single_page(search_url, proxy=p, referer=HOST)
        search_id = get_search_id(content)
        if not search_id:
            logger.error('smartfares::Parse search id failed')
            result['error'] = PROXY_INVALID
            return result
    except Exception:
        logger.error('smartfares::Parse search id failed')
        result['error'] = PROXY_INVALID
        return result

    url_real = URL % search_id
    i = 0
    content_len = 0
    content_real = ''
    while i < 3 and content_len < CONTENT_LEN:
        content_real = crawl_single_page(url=url_real, proxy=p, referer=search_url)
        content_len = len(content_real or '')
        i += 1
    if content_real and len(content_real) > 100:
        parser_result = parsePage(content_real)
        tickets = parser_result['ticket']
        flights = parser_result['flight']
        result['para'] = {'flight': flights, 'ticket': tickets}
        return result
    else:
        result['error'] = DATA_NONE
        return result
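# Usage sketch for the task parser above (assumptions: the task string is
# '&'-separated as dept_id&dest_id&dept_day&dest_day, the error constants are
# the module-level codes referenced above, and the values below are made up):
if __name__ == '__main__':
    demo_task = 'LAX&SEA&20140601&20140605'  # hypothetical airports and dates
    res = smartfares_task_parser(demo_task)
    if res['error'] == 0:
        print 'parsed %d tickets' % len(res['para']['ticket'])
    else:
        print 'failed with error code %s' % res['error']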
def get_url(dept_city, dest_city, dept_date, dept_id, dest_id, proxy=None):
    parser_url = ''
    url_temp = ('http://www.ly.com/iflight/flightinterajax.aspx?action=SEARCHURL'
                '&airplaneInternatType=1&iOrgPort=' + dept_city +
                '&iArvPort=' + dest_city +
                '&idtGoDate=' + dept_date +
                '&idtBackDate=时间/日期&sel_inCabinType=Y&sel_inPassengersType=1'
                '&sel_inAdult=1&sel_inChild=0'
                # six multi-leg placeholder segments, exactly as in the original query string
                + '&iOrgPortMult=城市名&iArvPortMult=城市名&idtGoDateMult=时间/日期' * 6
                + '&callback=tc10805565235')
    page1 = crawl_single_page(url_temp, proxy=proxy, referer='http://www.ly.com')
    print page1
    try:
        # the response is a JSONP payload: strip the callback wrapper
        num01 = page1.find('(')
        num02 = page1.rfind(')')
        json_content_temp = page1[num01 + 1:num02]
        json_temp1 = json.loads(json_content_temp)
        if json_temp1['state'] == 100:
            url_temp1 = json_temp1['href']
        else:
            return parser_url
    except Exception as e:
        # an 'a5' marker in the raw page means the proxy was blocked
        if page1.find('a5') != -1:
            parser_url = 'proxy_forbidden'
        return parser_url
def GetInterPricePage(queryLogTransNo, cookie, referer, proxy):
    priceURL = ('http://flights.ctrip.com/international/GetSubstepSearchResults.aspx'
                '?IsJSON=T&queryLogTransNo=%s&QueryType=1&cityPairAirline=first'
                '&withDirectAirline=T&RdNo=2103213618&ind=347,359,356,370') % queryLogTransNo
    # the proxy argument is currently bypassed: requests go out directly
    # (the original retry-with-new-proxy scaffolding was commented out)
    p = None
    resp = crawl_single_page(priceURL, n=1, referer=referer, proxy=p, cookie=cookie)
    if resp:
        return resp
    return None
def elong_task_parser(content):
    contents = content.split('&')
    if len(contents) != 2:
        logger.error('elongFlight: wrong content format with %s' % content)
        return None
    location, origdate = contents[0].strip(), contents[1].strip()
    origday = datetime.datetime(string.atoi(origdate[0:4]), string.atoi(origdate[4:6]),
                                string.atoi(origdate[6:]))
    urlday = (origday - datetime.datetime.today()).days
    dept_date = str(origday).split(' ')[0].strip()
    url = URL % (location, urlday)
    p = get_proxy()
    htmlcontent = crawl_single_page(url, proxy=p)
    if htmlcontent == '':
        invalid_proxy(p)
        logger.error('elongFlight: Proxy Error: htmlcontent is null with proxy: %s' % p)
        return []
    # A navigation page in the response means the content has no flight info;
    # if no flights are found, return [].
    temp_flight_list = flightPattern.findall(htmlcontent)
    if len(temp_flight_list) == 1:
        logger.error('elongFlight: Parser Error: cannot find flights with %s' % location)
        return []
    flights = []
    flight_list = temp_flight_list[:-1]
    for item in flight_list:
        typ = len(typePattern.findall(item))
        if typ == 0:
            continue
        elif typ != 1:
            transfer_info = transferFlight_parser(item, dept_date, airports_dict)
            if transfer_info != []:
                flights.append(transfer_info)
        else:
            direct_info = directFlight_parser(item, dept_date, airports_dict)
            if direct_info != []:
                flights.append(direct_info)
    # drop duplicates
    flights = list(set(flights))
    return flights
def getPage(url, proxy=None):
    # try twice; anything under ~100 bytes is treated as a failed fetch
    for i in range(2):
        page = crawl_single_page(url, proxy=proxy, n=1)
        if page and len(page) > 100:
            return page
    return None
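# getPage hard-codes two attempts and a 100-byte floor; the same
# retry-until-plausible pattern, parameterized (a sketch, not part of the
# original module):
def crawl_with_retries(url, proxy=None, tries=2, min_len=100):
    for _ in range(tries):
        page = crawl_single_page(url, proxy=proxy, n=1)
        if page and len(page) > min_len:
            return page
    return None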
def get_proxy():
    i = 0
    proxy_len = 0
    proxy = ''
    while i < 3 and proxy_len < 5:
        proxy = crawl_single_page(proxy_url)
        proxy_len = len(proxy)
        i += 1
    if proxy == '':
        return None
    return proxy
def get_json_url(dept_city=None, dest_city=None, dept_time=None):
    html_url = ('http://www.expedia.com.hk/Flights-Search?trip=oneway&leg1=from:' + dept_city +
                ',to:' + dest_city + ',departure:' + dept_time +
                'TANYT&passengers=children:0,adults:1,seniors:0,infantinlap:Y'
                '&options=cabinclass:coach&mode=search&')
    html_res = crawl_single_page(html_url)
    regex = re.compile(r'<div id="originalContinuationId">(.*?)</div>', re.M | re.S | re.I)
    match_id = re.search(regex, html_res)
    if match_id:
        # plain strip(): the original strip('\s') stripped the literal
        # characters '\' and 's', not whitespace
        return match_id.group(1).strip()
    else:
        logger.info('not catch the originalContinuationId of json data')
        return False
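# What the originalContinuationId extraction matches, on a hypothetical
# response snippet (self-contained check):
import re
_sample = '<div id="originalContinuationId">  abc-123  </div>'
_m = re.search(r'<div id="originalContinuationId">(.*?)</div>', _sample,
               re.M | re.S | re.I)
assert _m is not None and _m.group(1).strip() == 'abc-123'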
def easyjet_task_parser(taskcontent):
    result = {}
    flights = {}
    tickets = []
    result['para'] = {'flight': flights, 'ticket': tickets}
    result['error'] = 0
    # task content format: dept_id&dest_id&dept_day
    parts = taskcontent.strip().split('&')
    try:
        dept_id, dest_id, dept_day_temp = parts[0], parts[1], parts[2]
    except IndexError:
        logger.error('easyjet::Wrong Content Format with %s' % taskcontent)
        result['error'] = TASK_ERROR
        return result

    search_url = get_search_url(dept_id, dest_id, dept_day_temp)
    p = get_proxy(source='easyjet')
    time_zone_A = airport[dept_id]
    time_zone_B = airport[dest_id]
    if p is None:
        result['error'] = PROXY_NONE
        return result

    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)

    i = 0
    content_len = 0
    content = ''
    while i < 3 and content_len < CONTENT_LEN:
        content = crawl_single_page(url=search_url, proxy=p, n=1, referer=HOST)
        content_len = len(content or '')
        i += 1
    if content == '' or content is None or len(content) < CONTENT_LEN:
        result['error'] = PROXY_INVALID
        return result

    para = parsePage(content, p, time_zone_A, time_zone_B)
    if para == {'flight': {}, 'ticket': []}:
        result['error'] = DATA_NONE
        return result
    flights = para['flight']
    tickets = para['ticket']
    result['para'] = {'ticket': tickets, 'flight': flights}
    return result
def GetInterPricePage(queryLogTransNo, cookie, referer, use_proxy=True):
    priceURL = ('http://flights.ctrip.com/international/GetSubstepSearchResults.aspx'
                '?IsJSON=T&queryLogTransNo=%s&QueryType=1&cityPairAirline=first'
                '&withDirectAirline=T&RdNo=2103213618&ind=347,359,356,370') % queryLogTransNo
    # if the crawl fails, switch to another proxy IP and retry
    # (the retry budget is currently a single attempt)
    for i in range(1):
        p = get_proxy()
        resp = crawl_single_page(priceURL, referer=referer, proxy=p, cookie=cookie)
        if resp is None or len(resp) == 0:
            invalid_proxy(p)
        else:
            return resp
    return
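# The range(1) loop above allows one proxy swap; the same rotate-on-failure
# idea with a configurable budget (a sketch; assumes invalid_proxy marks the
# proxy bad so the next get_proxy hands out a fresh one):
def fetch_with_proxy_rotation(url, referer=None, cookie=None, retries=3):
    for _ in range(retries):
        p = get_proxy()
        resp = crawl_single_page(url, referer=referer, proxy=p, cookie=cookie)
        if resp:
            return resp
        invalid_proxy(p)
    return None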
def get_url(dept_city, dest_city, dept_date, dept_id, dest_id, proxy=None):
    parser_url = ''
    url_temp = ('http://www.ly.com/iflight/flightinterajax.aspx?action=SEARCHURL'
                '&airplaneInternatType=1&iOrgPort=' + dept_city +
                '&iArvPort=' + dest_city +
                '&idtGoDate=' + dept_date +
                '&idtBackDate=时间/日期&sel_inCabinType=Y&sel_inPassengersType=1'
                '&sel_inAdult=1&sel_inChild=0'
                + '&iOrgPortMult=城市名&iArvPortMult=城市名&idtGoDateMult=时间/日期' * 6
                + '&callback=tc10805565235')
    page1 = crawl_single_page(url_temp, proxy=proxy, n=1,
                              Accept='text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8')
    try:
        num01 = page1.find('(')
        num02 = page1.rfind(')')
        json_content_temp = page1[num01 + 1:num02]
        json_temp1 = json.loads(json_content_temp)
        if json_temp1['state'] == 100:
            url_temp1 = json_temp1['href']
        else:
            return parser_url
    except Exception as e:
        logger.error('Can not get url temp 1!')
        return parser_url
def get_url(dept_city, dest_city, dept_date, dept_id, dest_id, proxy):
    parser_url = ''
    url_temp = ('http://www.ly.com/iflight/flightinterajax.aspx?action=SEARCHURL'
                '&airplaneInternatType=1&iOrgPort=' + dept_city +
                '&iArvPort=' + dest_city +
                '&idtGoDate=' + dept_date +
                '&idtBackDate=时间/日期&sel_inCabinType=Y&sel_inPassengersType=1'
                '&sel_inAdult=1&sel_inChild=0'
                + '&iOrgPortMult=城市名&iArvPortMult=城市名&idtGoDateMult=时间/日期' * 6
                + '&callback=tc10805565235')
    page1 = crawl_single_page(url_temp, proxy=proxy)
    try:
        num01 = page1.find('(')
        num02 = page1.rfind(')')
        json_content_temp = page1[num01 + 1:num02]
        json_temp1 = json.loads(json_content_temp)
        if json_temp1['state'] == 100:
            url_temp1 = json_temp1['href']
        else:
            return parser_url
    except Exception as e:
        return parser_url
def csair_task_parser(taskcontent):
    result = {}
    multi_ticket = []
    one_flight = {}
    result['para'] = {'flight': one_flight, 'ticket': multi_ticket}
    result['error'] = 0
    try:
        param_list = taskcontent.strip().split('&')
        url = 'http://b2c.csair.com/B2C40/detail-' + param_list[0] + param_list[1] + \
              '-' + param_list[2] + '-1-0-0-0-1-0-1-0-1-0.g2c'
    except Exception:
        logger.info('url param is not valid\n')
        result['error'] = TASK_ERROR
        return result

    # initialize all params
    dic_flightdate = {}
    multi_price = []
    select_time = 0
    Flag1 = False
    Flag2 = False
    page_flag = False

    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)

    task_content_proxy = get_proxy(source='csairFlight')
    if task_content_proxy is None:
        result['error'] = PROXY_NONE
        return result

    html = crawl_single_page(url, proxy=task_content_proxy)
    if html == '' or html is None:
        result['error'] = PROXY_INVALID
        return result

    pattern = re.compile(r'\s*<FLIGHTS>\s*')
    match = pattern.search(html)
    if match and len(html) > CONTENT_LEN:
        dom = etree.fromstring(html)
        for ele in dom.iter():
            if ele.tag is None:
                continue
            if ele.tag in word_flightdate:
                dic_flightdate[ele.tag] = ele.text
            elif ele.tag in word_parent_list:
                # node of DateFLIGHT
                page_flag = True
                multi_flight = []
                Flight = nanhang_flight()
                select_time += 1
                flight_num = 0
                ticket_dur_list = []
                for word in ele:
                    if word.tag in word_list[0]:
                        flight_num += 1
                        dic_flight = {}
                        EachFlight = nanhang_eachflight()
                        for word_child in word:
                            if word_child.tag in word_child_list[0]:
                                Flag1 = True
                                dic_flight[word_child.tag] = word_child.text
                        # each flight
                        if Flag1 == True:
                            try:
                                Flag1 = False
                                EachFlight.flight_no = dic_flight[word_child_list[0][0]]
                                EachFlight.dept_id = dic_flight[word_child_list[0][1]]
                                EachFlight.dest_id = dic_flight[word_child_list[0][2]]
                                EachFlight.flight_key = EachFlight.flight_no + '_' + \
                                    EachFlight.dept_id + '_' + EachFlight.dest_id
                                dept_time = dic_flight[word_child_list[0][3]]
                                EachFlight.dept_time = dept_time[0:10] + 'T' + dept_time[-5:len(dept_time)]
                                dest_time = dic_flight[word_child_list[0][4]]
                                EachFlight.dest_time = dest_time[0:10] + 'T' + dest_time[-5:len(dest_time)]
                                EachFlight.dur = get_duration(dest_time, EachFlight.dest_id,
                                                              dept_time, EachFlight.dept_id)
                                EachFlight.dept_time = EachFlight.dept_time + ':00'
                                EachFlight.dest_time = EachFlight.dest_time + ':00'
                                ticket_dur_list.append(EachFlight.dur)
                                EachFlight.airline = '南方航空公司'
                                EachFlight.plane_no = dic_flight[word_child_list[0][5]]
                            except KeyError as e:
                                print e
                            else:
                                # rebuild and record the flight
                                one_flight[EachFlight.flight_key] = (
                                    EachFlight.flight_no, EachFlight.airline,
                                    EachFlight.plane_no, EachFlight.dept_id,
                                    EachFlight.dest_id, EachFlight.dept_time,
                                    EachFlight.dest_time, EachFlight.dur)
                                # list of multi flight
                                multi_flight.append((
                                    EachFlight.flight_key, EachFlight.flight_no,
                                    EachFlight.airline, EachFlight.plane_no,
                                    EachFlight.dept_id, EachFlight.dest_id,
                                    EachFlight.dept_time, EachFlight.dest_time,
                                    EachFlight.dur))
                    elif word.tag in word_list[1]:
                        # node of price
                        multi_price = []
                        for word_child in word:
                            if word_child.tag in word_next_list:
                                dic_ticket = {}
                                for word_next_child in word_child:
                                    if word_next_child.tag in word_child_list[1]:
                                        Flag2 = True
                                        dic_ticket[word_next_child.tag] = word_next_child.text
                                if Flag2 == True:
                                    try:
                                        Flag2 = False
                                        Flight.price = string.atof(dic_ticket[word_child_list[1][0]])
                                        Flight.tax = string.atof(dic_ticket[word_child_list[1][1]]) + \
                                            string.atof(dic_ticket[word_child_list[1][2]]) + \
                                            string.atof(dic_ticket[word_child_list[1][3]])
                                        Flight.currency = dic_ticket[word_child_list[1][4]]
                                        Flight.seat_type = dic_ticket[word_child_list[1][5]]
                                        if Flight.seat_type == 'ECONOMY':
                                            Flight.seat_type = '经济舱'
                                        if Flight.seat_type == 'BUSINESS':
                                            Flight.seat_type = '商务舱'
                                        if Flight.seat_type == 'FIRST':
                                            Flight.seat_type = '头等舱'
                                        if Flight.seat_type == 'PREMIUMECONOMY':
                                            Flight.seat_type = '超经济舱'
                                        Flight.return_rule = 'NULL'
                                        Flight.stop = flight_num - 1
                                        Flight.surcharge = -1
                                        Flight.source = 'csair::csair'
                                    except KeyError as e:
                                        print e
                                    else:
                                        multi_price.append((
                                            Flight.price, Flight.tax, Flight.surcharge,
                                            Flight.currency, Flight.seat_type, Flight.source,
                                            Flight.return_rule, Flight.stop))
                if select_time != 0 and multi_flight != []:
                    Flight.fight_no = '_'.join([item[1] for item in multi_flight])
                    Flight.plane_no = '_'.join([item[3] for item in multi_flight])
                    Flight.airline = '_'.join([item[2] for item in multi_flight])
                    Flight.dept_id = multi_flight[0][4]
                    Flight.dest_id = multi_flight[-1][5]
                    flightdate = dic_flightdate[word_flightdate[0]]
                    Flight.dept_day = flightdate[0:4] + '-' + flightdate[4:6] + '-' + flightdate[6:8]
                    Flight.dept_time = multi_flight[0][6]
                    Flight.dest_time = multi_flight[-1][7]
                    Flight.dur = get_duration(Flight.dest_time, Flight.dest_id,
                                              Flight.dept_time, Flight.dept_id)
                    for price in multi_price:
                        multi_ticket.append((
                            Flight.fight_no, Flight.plane_no, Flight.airline,
                            Flight.dept_id, Flight.dest_id, Flight.dept_day,
                            Flight.dept_time, Flight.dest_time, Flight.dur,
                            price[0], price[1], price[2], price[3],
                            price[4], price[5], price[6], price[7]))
    return result
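# The cabin-name translation above is an if-chain; the same mapping as a
# lookup table (a sketch with equivalent behavior for the four known codes):
SEAT_TYPE_CN = {
    'ECONOMY': '经济舱',
    'BUSINESS': '商务舱',
    'FIRST': '头等舱',
    'PREMIUMECONOMY': '超经济舱',
}
# usage: Flight.seat_type = SEAT_TYPE_CN.get(Flight.seat_type, Flight.seat_type)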
    except Exception as e:
        logger.error('Parse taskcontent failed with ' + str(e))
        return []
    check_in = check_in_temp[:4] + '-' + check_in_temp[4:6] + '-' + check_in_temp[6:]
    check_out_temp = datetime.datetime(int(check_in_temp[:4]), int(check_in_temp[4:6]),
                                       int(check_in_temp[6:]))
    check_out = str(check_out_temp + datetime.timedelta(days=1))[:10]
    hotel_url = get_hotel_url(url_hotel_name, check_in, check_out)
    print hotel_url
    p = get_proxy()
    i = 0
    content_len = 0
    content = ''
    while i < 3 and content_len < CONTENT_LEN:
        content = crawl_single_page(hotel_url, p)
        # the length check was commented out in the original, so the loop's
        # exit condition never fired; measure the page so the retry works
        content_len = len(content or '')
        if content_len > CONTENT_LEN:
            break
        i += 1
    print 'Content len :' + str(content_len)
    room_info = parseRoom(content, check_in, check_out, hotel_id)
    return room_info


def parseRoom(content, check_in, check_out, hotel_id):
        return result

    p = get_proxy(source='biyiHotel')
    print p
    if p is None:
        result['error'] = PROXY_NONE
        return result
    first_url = 'http://www.biyi.cn/'
    url = get_url(hotel_name, city_name_en, check_in_day, check_out_day)
    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)
    # warm up the cookie jar on the landing page, then fetch the search page
    resp = crawl_single_page(url=first_url, proxy=p, Accept=accept,
                             referer=first_url, n=1)
    resp2 = crawl_single_page(url=url, proxy=p, Accept=accept,
                              referer=first_url, n=1)
    for y in cj:
        print y
    print '----------------'
    i = 0
    content_len = 0
def youzhan_task_parser(taskcontent):
    all_info = []
    room_list = []
    taskcontent = taskcontent.encode('utf-8').strip()
    # task content format: hotel_id&ipathid&star&city&country&from_date
    parts = taskcontent.split('&')
    hotel_id = parts[0]
    ipathid = parts[1]
    star = parts[2]
    city = parts[3]
    country = parts[4]
    from_date_temp = parts[5]
    from_date = from_date_temp[:4] + '-' + from_date_temp[4:6] + '-' + from_date_temp[6:]
    to_date_temp = datetime.datetime(int(from_date_temp[:4]), int(from_date_temp[4:6]),
                                     int(from_date_temp[6:]))
    to_date = str(to_date_temp + datetime.timedelta(days=1))[:10]

    # fetch a proxy
    p = get_proxy()

    hotel = Hotel()
    room = Room()

    rating_url = get_rating_url(hotel_id)
    rating_page = crawl_single_page(rating_url, proxy=p)
    grade_str = grade_parser(rating_page)
    if grade_str != '':
        hotel.grade = grade_str[:-1]

    map_url = get_map_url(hotel_id)
    map_page = crawl_single_page(map_url, proxy=p)
    map_info_list = staticmap_parser(map_page)
    if map_info_list != []:
        hotel.hotel_name = map_info_list[1]
        if is_alphabet(hotel.hotel_name.decode('utf-8')) == True:
            hotel.hotel_name_en = hotel.hotel_name
        else:
            hotel.hotel_name_en = 'NULL'
        hotel.map_info = map_info_list[0]
    else:
        logger.error('youzhanHotel: Map info do not have hotel name and map_info')
        return []

    info_url = get_info_url(hotel_id, from_date, to_date)
    info_page = crawl_single_page(info_url, proxy=p)
    if info_page == '':
        return []
    info_list = info_parser(info_page)
    if info_list != []:
        hotel.country = country
        hotel.city = city
        hotel.address = info_list[1]
        hotel_desc_temp = info_list[3].replace('<br/>', '').replace('&#39;', '')
        if hotel_desc_temp != '':
            hotel.description = hotel_desc_temp
        else:
            hotel.description = 'NULL'
        hotel.service = info_list[4]
        if '停车场' in hotel.service:
            hotel.has_parking = 'Yes'
        if '无线网络' in hotel.service or 'wifi' in hotel.service:
            hotel.has_wifi = 'Yes'
    else:
        return []

    hotel.source = 'youzhan'
    hotel.source_id = hotel_id
    hotel.star = star

    price_url = get_price_url(hotel_id, ipathid, from_date, to_date)
    price_page = crawl_single_page(price_url, proxy=p)
    price_list = price_parser(price_page, hotel_id)
    if price_list != []:
        for each_room in price_list:
            if len(each_room) > 3:
                room.city = city
                room.occupancy = 2
                room.hotel_name = hotel.hotel_name
                room.room_desc = each_room[3]
                room.real_source = each_room[2]
                # the text before the first '-' is the room type, if short enough
                num = each_room[3].find('-')
                if num > 0:
                    room.room_type = each_room[3][:num] if len(each_room[3][:num]) < 20 else 'NULL'
                else:
                    room.room_type = each_room[3] if len(each_room[3]) < 20 else 'NULL'
                if each_room[0] != u'nbsp;':
                    room.price = each_room[0]
                room.has_breakfast = each_room[1]
                if '免费WiFi' in room.room_desc:
                    hotel.is_wifi_free = 'Yes'
                if '免费取消' in room.room_desc:
                    hotel.is_cancel_free = 'Yes'
                room.currency = 'CNY'
                room.source = 'youzhan'
                room.source_hotelid = hotel_id
                room.check_in = from_date
                room.check_out = to_date
                room_tuple = (room.hotel_name, room.city, room.source, room.source_hotelid,
                              room.source_roomid, room.real_source, room.room_type,
                              room.occupancy, room.bed_type, room.size, room.floor,
                              room.check_in, room.check_out, room.price, room.tax,
                              room.currency, room.is_extrabed, room.is_extrabed_free,
                              room.has_breakfast, room.is_breakfast_free,
                              room.is_cancel_free, room.room_desc)
                room_list.append(room_tuple)

    hotel_tuple = (hotel.hotel_name, hotel.hotel_name_en, hotel.source, hotel.source_id,
                   hotel.brand_name, hotel.map_info, hotel.address, hotel.city,
                   hotel.country, hotel.postal_code, hotel.star, hotel.grade,
                   hotel.has_wifi, hotel.is_wifi_free, hotel.has_parking,
                   hotel.is_parking_free, hotel.service, hotel.img_items,
                   hotel.description)
    hotel_list = [hotel_tuple]
    all_info.append(hotel_list)
    all_info.append(room_list)
    return all_info
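# The check-out date above is always check-in plus one day; the derivation
# isolated (assumes 'YYYYMMDD' input and returns 'YYYY-MM-DD'):
import datetime
def next_day(yyyymmdd):
    d = datetime.datetime(int(yyyymmdd[:4]), int(yyyymmdd[4:6]), int(yyyymmdd[6:]))
    return str(d + datetime.timedelta(days=1))[:10]
# next_day('20140505') -> '2014-05-06'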
    p = get_proxy(source='jijitongFlight')
    if p is None:
        result['error'] = PROXY_NONE
        return result
    first_url = FIRST_URL % (dept_city_en, dest_city_en, dept_day_temp, dest_day_temp)
    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)
    resp = crawl_single_page(first_url, proxy=p, Accept=accept, referer=HOST, n=1)
    # a '404错误' marker in the response means a dead landing page
    if resp.find('404错误') < 0:
        url = get_url(dept_city_zh, dest_city_zh, dept_day, dest_day)
        i = 0
        content_len = 0
        while i < 3 and content_len < CONTENT_LEN:
            page = crawl_single_page(url, proxy=p, referer=first_url, n=1)
            content_len = len(page or '')
            i += 1
        if page and len(page) > CONTENT_LEN:
            post_data = get_post_data(page, dept_day, dest_day)
    if p is None:
        result['error'] = PROXY_NONE
        return result
    url = get_url(dept_city, dest_city, dept_day, dept_id, dest_id, p)
    if url == 'proxy_forbidden':
        invalid_proxy(proxy=p, source='tongchengFlight')
        result['error'] = PROXY_FORBIDDEN
        return result
    if url != '':
        page = crawl_single_page(url, proxy=p)
    else:
        logger.error('tongchengFlight: Get url failed!')
        invalid_proxy(proxy=p, source='tongchengFlight')
        result['error'] = PROXY_INVALID
        return result
    if page != '' and len(page) > CONTENT_LEN:
        flights = ParsePage(page)
    else:
        logger.error('tongchengFlight: Crawl page failed!')
        invalid_proxy(proxy=p, source='tongchengFlight')
        result['error'] = PROXY_INVALID
        return result
    except Exception as e:
        logger.error('jijitongFlight::Wrong Content Format with %s' % taskcontent)
        result['error'] = TASK_ERROR
        return result
    p = get_proxy(source='jijitongFlight')
    if p is None:
        result['error'] = PROXY_NONE
        return result
    first_url = FIRST_URL % (dept_city_en, dest_city_en, dept_day_temp)
    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)
    resp = crawl_single_page(first_url, proxy=p, n=1,
                             Accept='text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8')
    if resp.find('404错误') < 0:
        url = get_url(dept_city_zh, dest_city_zh, dept_day)
        page = crawl_single_page(url, proxy=p, referer=first_url)
        if page != '' and len(page) > 300:
            post_data = get_post_data(page, dept_day)
            price_url = PRICE_URL % str(time.time() * 1000)
            price_page = request_post_data(price_url, data=post_data, referer=first_url,
                                           n=1, proxy=p)
            price_dict = parsePrice(price_page)
            time.sleep(5)
            flights = parse_page(page, price_dict)
        location = dept_id + '-' + dest_id
        origday = datetime.datetime(string.atoi(dept_date[0:4]), string.atoi(dept_date[5:7]),
                                    string.atoi(dept_date[8:]))
        urlday = (origday - datetime.datetime.today()).days
    except Exception as e:
        logger.error(str(e))
        logger.error('Content Error: Wrong content format with %s' % content)
        return result
    url = URL % (location, urlday)
    p = get_proxy(source='elongFlight')
    htmlcontent = crawl_single_page(url, n=1, proxy=p)
    if htmlcontent == '':
        invalid_proxy(p)
        logger.error('Proxy Error: htmlcontent is null with proxy: %s' % p)
        return result
    # A navigation page in the response means there is no flight info;
    # if no flights are found, return the empty result.
    temp_flight_list = flightPattern.findall(htmlcontent)
    if len(temp_flight_list) == 1:
        logger.error('Parser Error: cannot find flights with %s' % location)
        return result
    flight_list = temp_flight_list[:-1]
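# Worked example of the urlday offset above: it is the whole-day distance
# from now to the departure date, and goes negative for past dates
# (values made up):
import datetime
_origday = datetime.datetime(2014, 6, 1)
_urlday = (_origday - datetime.datetime.today()).days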
    searchURL = ('http://flights.ctrip.com/booking/%s-%s-day-1.html?DCity1=%s&ACity1=%s&DDate1=%s'
                 '&passengerQuantity=1&SendTicketCity=undefined&PassengerType=ADU&SearchType=S'
                 '&RouteIndex=1&RelDDate=&RelRDate=')
    interSearchURL = ('http://flights.ctrip.com/international/ShowFareFirst.aspx'
                      '?flighttype=S&relddate=%s&dcity=%s&acity=%s')
    is_inter = False
    if dept_id.lower() in CN_AIRPORTS and dest_id.lower() in CN_AIRPORTS:
        searchURL = searchURL % (dept_id, dest_id, dept_id, dest_id, dept_date)
    else:
        searchURL = interSearchURL % (dept_date, dept_id, dest_id)
        is_inter = True
    refererURL = "http://flights.ctrip.com/booking/"
    cookie = {}
    p = get_proxy()
    resp = crawl_single_page(searchURL, proxy=p, cookie=cookie)
    if resp is None or len(resp) == 0:
        invalid_proxy(p)
        return -1
    # 2. parse the page
    try:
        tree = etree.HTML(resp)
    except Exception as e:
        logger.info('etree error: %s' % str(e))
        return -1
    if is_inter or GetTextByXpath(tree, "//title/text()").endswith("携程国际机票"):
        # international ticket
        if len(tree.xpath("//input[@id='queryLogTransNo']")) > 0:
            queryLogTransNo = tree.xpath("//input[@id='queryLogTransNo']")[0].get("value")
    # fetch a proxy
    p = get_proxy(type='')
    # build the initial url
    url_temp = get_url(dept_id, arr_id, dept_date)
    search_id = get_search_id(url_temp, proxy=p)
    if search_id == '':
        logger.error('Search_Id Error: get Search_Id failed')
        return None
    trip_id = get_trip_id(dept_id, arr_id, dept_date)
    # crawl the landing page from the initial url; it reports how many result
    # pages there are
    start_url = get_start_url(search_id, trip_id)
    content_temp = crawl_single_page(start_url, proxy=p, Host="www.wego.cn",
                                     Accept="text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
    if content_temp == "":
        logger.error('Proxy Error: htmlcontent is null with proxy: %s' % p)
        # report the proxy as invalid
        invalid_proxy(p)
        return None
    page_num = pageParser(content_temp)
    page_num_get = 0
    if page_num == 0:
        logger.info('Parser Error: cannot find flights with %s - %s' % (dept_id, arr_id))
        return None
    # assemble the urls to crawl
    searchURL = ('http://flights.ctrip.com/booking/%s-%s-day-1.html?DCity1=%s&ACity1=%s&DDate1=%s'
                 '&passengerQuantity=1&SendTicketCity=undefined&PassengerType=ADU&SearchType=S'
                 '&RouteIndex=1&RelDDate=&RelRDate=')
    interSearchURL = ('http://flights.ctrip.com/international/ShowFareFirst.aspx'
                      '?flighttype=S&relddate=%s&dcity=%s&acity=%s')
    is_inter = False
    if dept_id.lower() in CN_AIRPORTS and dest_id.lower() in CN_AIRPORTS:
        searchURL = searchURL % (dept_id, dest_id, dept_id, dest_id, dept_date)
    else:
        searchURL = interSearchURL % (dept_date, dept_id, dest_id)
        is_inter = True
    refererURL = "http://flights.ctrip.com/booking/"
    cookie = {}
    p = get_proxy()
    resp = crawl_single_page(searchURL, proxy=p, cookie=cookie)
    if resp is None or len(resp) == 0:
        invalid_proxy(p)
        return None
    # 2. parse the page
    tree = etree.HTML(resp)
    if is_inter or GetTextByXpath(tree, "//title/text()").endswith("携程国际机票"):
        # international ticket
        queryLogTransNo = tree.xpath("//input[@id='queryLogTransNo']")[0].get("value")
        # fetch the fare page
        resp = GetInterPricePage(queryLogTransNo, cookie, searchURL)
        return ParseInterPage(resp)
    else:
        # domestic ticket
        return []
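# GetTextByXpath is called above but not defined in this file; a plausible
# minimal version (an assumption, not the original implementation), returning
# '' when the xpath matches nothing:
def GetTextByXpath(tree, xpath):
    nodes = tree.xpath(xpath)
    return nodes[0].strip() if nodes else ''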
        dept_time = dept_time[0:4] + '/' + dept_time[4:6] + '/' + dept_time[6:8]
    except Exception as e:
        # str(e): concatenating the exception object itself raises TypeError
        logger.info('url id wrong :' + str(e))
        result['error'] = TASK_ERROR
        return result

    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)

    url_res = get_json_url(dept_city, dest_city, dept_time)
    if url_res != False:
        url = 'http://www.expedia.com.hk/Flight-Search-Outbound?c=' + url_res + \
              '&_=' + str(time.time())
        task_content_proxy = get_proxy(source='expediaFlight')
        if task_content_proxy is None:
            result['error'] = PROXY_NONE
            return result
        html_res = crawl_single_page(url, proxy=task_content_proxy)
        if html_res == '' or html_res is None:
            result['error'] = PROXY_INVALID
            return result
    else:
        result['error'] = TASK_ERROR
        return result

    try:
        json_list = json.loads(html_res)
        if json_list[key_list[0]] is None:
            result['error'] = DATA_NONE
            return result
        search_legs = json_list[key_list[0]][key_list[1]]
        for legs_list in search_legs:
            for legs_key in legs_list:
                if legs_key == key_list[2]:
                    # legs list
        room_type = infos[2]
        checkin_date = infos[3].split('-')[0]   # format: 2014-05-05
        checkout_date = infos[3].split('-')[1]  # format: 2014-05-06
        real_source = infos[4].split('::')[-1]
    except Exception as e:
        logger.error('wrong content format' + str(e))
        return -1
    p = get_proxy()
    room = Room()
    price_url = get_price_url(hotel_id, ipathid, checkin_date, checkout_date)
    price_page = crawl_single_page(price_url, n=1, proxy=p)
    price_list = price_parser(price_page, hotel_id)
    result = 1000000  # sentinel: a very large price
    if price_list != []:
        for each_room in price_list:
            if len(each_room) > 3:
                room.real_source = each_room[2]
                num = each_room[3].find('-')
        dept_city = city_dict[dept_id]
        dest_city = city_dict[dest_id]
        dept_city_cn = city_dict_cn[dept_id].encode('utf-8')
        dest_city_cn = city_dict_cn[dest_id].encode('utf-8')
    except Exception as e:
        logger.error('tongchengFlight: wrong content format with %s' % taskcontent + str(e))
        return -1
    dept_date = dept_day[:4] + '-' + dept_day[4:6] + '-' + dept_day[6:]  # e.g. 2014-05-10
    dept_time = dept_date + 'T' + dept_minute
    url = get_url(dept_city_cn, dest_city_cn, dept_date, dept_city, dest_city)
    if url != '':
        page = crawl_single_page(url, proxy=None,
                                 Accept='text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8')
    else:
        return -1
    if page != '' and len(page) > 100:
        result = ParsePage(page, flight_no, dept_time)
    else:
        return -1
    return result


def get_url(dept_city, dest_city, dept_date, dept_id, dest_id, proxy=None):
    parser_url = ''
    url_temp = ('http://www.ly.com/iflight/flightinterajax.aspx?action=SEARCHURL'
                '&airplaneInternatType=1&iOrgPort=' + dept_city +
                '&iArvPort=' + dest_city +
                '&idtGoDate=' + dept_date +
                '&idtBackDate=时间/日期&sel_inCabinType=Y&sel_inPassengersType=1'
                '&sel_inAdult=1&sel_inChild=0'
                # six multi-leg placeholder segments, exactly as in the original query string
                + '&iOrgPortMult=城市名&iArvPortMult=城市名&idtGoDateMult=时间/日期' * 6
                + '&callback=tc10805565235')
    # build the initial url
    url_temp = get_url(dept_id, arr_id, dept_date)
    search_id = get_search_id(url_temp, proxy=p)
    if search_id == '':
        logger.error('Search_Id Error: get Search_Id failed')
        invalid_proxy(proxy=p, source='wegoFlight')
        result['error'] = PROXY_FORBIDDEN
        return result
    trip_id = get_trip_id(dept_id, arr_id, dept_date)
    # crawl the landing page from the initial url; it reports how many result
    # pages there are
    start_url = get_start_url(search_id, trip_id)
    content_temp = crawl_single_page(start_url, proxy=p, Host="www.wego.cn",
                                     Accept="text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
    if content_temp == "":
        logger.error('Proxy Error: htmlcontent is null with proxy: %s' % p)
        # report the proxy as invalid
        invalid_proxy(proxy=p, source='wegoFlight')
        result['error'] = PROXY_INVALID
        return result
    page_num = pageParser(content_temp)
    page_num_get = 0
    if page_num == 0:
        logger.info('Parser Error: cannot find flights with %s - %s' % (dept_id, arr_id))
        result['error'] = DATA_NONE
        return result
        check_out_temp = datetime.datetime(int(check_in_temp[:4]), int(check_in_temp[4:6]),
                                           int(check_in_temp[6:]))
        check_out = str(check_out_temp + datetime.timedelta(days=1))[:10]
    except Exception as e:
        logger.error('bookingHotel: Wrong Content Format with %s' % taskcontent)
        result['error'] = TASK_ERROR
        return result

    hotel_url = get_hotel_url(url_hotel_name, check_in, check_out)
    p = get_proxy(source='bookingHotel')
    if p is None:
        result['error'] = PROXY_NONE
        return result
    page = crawl_single_page(hotel_url, proxy=p)
    if page is None or page == '':
        invalid_proxy(proxy=p, source='bookingHotel')
        result['error'] = PROXY_INVALID
        return result
    if len(page) > CONTENT_LEN:
        room_info = parseRoom(page, check_in, check_out, hotel_id)
    else:
        result['error'] = UNKNOWN_TYPE
        return result
    if room_info != []:
        result['para'] = room_info
        return result
    else:
    if p is None:
        result['error'] = PROXY_NONE
        return result
    # build the URL and check it is usable
    url = get_url(dept_code, dest_code, dept_day)
    if url == '' or url is None:
        logger.error('feifanFlight: Get url failed!')
        result['error'] = UNKNOWN_TYPE
        return result
    # crawl the page and check it is usable; feifan often needs a refresh
    # before it returns content, so try up to 3 times
    for i in range(3):
        page = crawl_single_page(url, proxy=p)
        if page != '' and len(page) > 300:
            flights = parsePage(page, dept_year)
            if flights == []:
                if page.find('非凡旅行网-抱歉,您没有权限访问') != -1:
                    invalid_proxy(proxy=p, source='feifanFlight')
                    result['error'] = PROXY_FORBIDDEN
                else:
                    result['error'] = DATA_NONE
                return result
            else:
                result['para'] = flights
                return result
        else:
            continue
        if len(info_list) < 5:
            return []
    except Exception as e:
        logger.error('tongchengFlight, wrong content format with %s' % taskcontent)
    dept_id, dest_id, dept_city, dest_city, dept_date_temp = info_list[0], info_list[1], \
        info_list[2], info_list[3], info_list[4]
    dept_day = dept_date_temp[:4] + '-' + dept_date_temp[4:6] + '-' + dept_date_temp[6:]
    p = get_proxy()
    url = get_url(dept_city, dest_city, dept_day, dept_id, dest_id, p)
    if url != '':
        page = crawl_single_page(url, proxy=p)
    else:
        logger.error('tongchengFlight: Get url failed!')
        return flights
    if page != '' and len(page) > 100:
        flights = ParsePage(page)
    else:
        logger.error('tongchengFlight: Crawl page failed!')
        return flights
    return flights


def get_url(dept_city, dest_city, dept_date, dept_id, dest_id, proxy):
    parser_url = ''
    room = Room()
    price_url = get_price_url(hotel_id, ipathid, from_date, to_date)
    i = 0
    content_len = 0
    while i < 5 and content_len < CONTENT_LEN:
        p = get_proxy(source='youzhanHotel')
        if p is None:
            result['error'] = PROXY_NONE
            return result
        url = price_url + str(int(time.time() * 1000))
        price_page = crawl_single_page(url, proxy=p, n=1)
        content_len = len(price_page or '')
        i += 1
    if price_page is None or price_page == '':
        invalid_proxy(proxy=p, source='youzhanHotel')
        result['error'] = PROXY_INVALID
        return result
    price_list = price_parser(price_page, hotel_id)
    if price_list != []:
        for each_room in price_list:
            if len(each_room) > 3:
                room.city = city
                room.occupancy = 2
        dept_date = dept_day[0:4] + '-' + dept_day[4:6] + '-' + dept_day[6:]
        dept_year = dept_date[:4]
        orig_dept_time = dept_date + 'T' + dept_hour + ':00'
    except Exception as e:
        logger.error('feifanFlight: wrong content format with %s' % content + str(e))
        return -1
    # build the URL and check it is usable
    url = get_url(dept_id, dest_id, dept_date)
    if url != '' and url is not None:
        page = crawl_single_page(url, proxy=None)
    else:
        logger.error('feifanFlight: Get url failed!')
        return -1
    # check the crawled page is usable
    if page != '' and len(page) > 300:
        result = parsePage(page, dept_year, flight_no, orig_dept_time)
    else:
        logger.error('feifanFlight: Get page content failed!')
        return -1
    return result


def parsePage(content, dept_year, flight_no, orig_dept_time):
    result = -1