def GetData(tripType, orig, dest, deptDate, retDate, proxy=None):
    """POST a Ryanair availability search and return the response body.

    Args:
        tripType: e.g. 'Oneway'/'Return' (passed through to the form).
        orig, dest: origin/destination airport codes.
        deptDate, retDate: departure/return dates in the site's format.
        proxy: optional proxy to use for the first attempt; callers elsewhere
            pass proxy=p, so accept it here (backward-compatible default).

    Returns the page text, or the last failed response (None/'') after
    three attempts.
    """
    searchURL = "https://www.bookryanair.com/SkySales/Search.aspx"
    refererURL = "https://www.bookryanair.com/SkySales/booking.aspx?culture=en-gb&lc=en-gb&cmpid2=Google"
    data = {"fromaction": "Search.aspx",
            "SearchInput$TripType": tripType,
            "SearchInput$Orig": orig,
            "SearchInput$Dest": dest,
            "SearchInput$DeptDate": deptDate,
            "SearchInput$RetDate": retDate,
            "SearchInput$IsFlexible": "on",
            "SearchInput$PaxTypeADT": 1,
            "SearchInput$PaxTypeCHD": 0,
            "SearchInput$PaxTypeINFANT": 0,
            "SearchInput$AcceptTerms": "on",
            "__EVENTTARGET": "SearchInput$ButtonSubmit",
            }
    # If the fetch fails, mark the proxy bad, switch to a new one and retry.
    resp = None
    for attempt in range(3):
        p = proxy if (attempt == 0 and proxy) else get_proxy()
        resp = request_post_data(searchURL, data, referer=refererURL, proxy=p)
        if not resp:  # None or empty body -> proxy considered dead
            invalid_proxy(p)
        else:
            return resp
    return resp
def elong_task_parser(content):
    """Parse an elong flight task string "LOCATION&YYYYMMDD" and scrape flights.

    Returns:
        None  -- malformed task string;
        []    -- page fetch failed or no flights found;
        list  -- de-duplicated flight tuples otherwise.
    """
    contents = content.split('&')
    if len(contents) != 2:
        logger.error('elongFlight: wrong content format with %s' % content)
        return None
    location, origdate = contents[0].strip(), contents[1].strip()
    # int() replaces the Python2-only string.atoi; identical for base-10 input.
    origday = datetime.datetime(int(origdate[0:4]),
                                int(origdate[4:6]),
                                int(origdate[6:]))
    urlday = (origday - datetime.datetime.today()).days
    dept_date = str(origday).split(' ')[0].strip()
    url = URL % (location, urlday)
    p = get_proxy()
    htmlcontent = crawl_single_page(url, proxy=p)
    if htmlcontent == '':
        invalid_proxy(p)
        logger.error(
            'elongFlight: Proxy Error: htmlcontent is null with proxy: %s' % p)
        return []
    # A single match means the site served its navigation page,
    # i.e. there is no flight information for this task.
    temp_flight_list = flightPattern.findall(htmlcontent)
    if len(temp_flight_list) == 1:
        # typo 'elongFilght' fixed in the log tag
        logger.error('elongFlight: Parser Error: cannot find flights with %s'
                     % location)
        return []
    flights = []
    for item in temp_flight_list[:-1]:
        typ = len(typePattern.findall(item))
        if typ == 0:
            continue  # no type marker -> not a flight row
        if typ == 1:
            direct_info = directFlight_parser(item, dept_date, airports_dict)
            if direct_info != []:
                flights.append(direct_info)
        else:
            transfer_info = transferFlight_parser(item, dept_date, airports_dict)
            if transfer_info != []:
                flights.append(transfer_info)
    # de-duplicate while keeping a list return type
    return list(set(flights))
def elong_task_parser(content):
    """Fetch and parse elong flights for a "LOCATION&YYYYMMDD" task string.

    Returns None for a malformed task, [] when the page is unavailable or
    holds no flights, otherwise a de-duplicated list of flight tuples.
    """
    parts = [piece.strip() for piece in content.split('&')]
    if len(parts) != 2:
        logger.error('elongFlight: wrong content format with %s' % content)
        return None
    location, origdate = parts
    year = string.atoi(origdate[0:4])
    month = string.atoi(origdate[4:6])
    day = string.atoi(origdate[6:])
    origday = datetime.datetime(year, month, day)
    urlday = (origday - datetime.datetime.today()).days
    dept_date = str(origday).split(' ')[0].strip()
    proxy = get_proxy()
    htmlcontent = crawl_single_page(URL % (location, urlday), proxy=proxy)
    if htmlcontent == '':
        invalid_proxy(proxy)
        logger.error('elongFlight: Proxy Error: htmlcontent is null with proxy: %s' % proxy)
        return []
    # A lone match means the site returned its navigation page:
    # there is no flight information for this task.
    matches = flightPattern.findall(htmlcontent)
    if len(matches) == 1:
        logger.error('elongFilght: Parser Error: cannot find flights with %s' % location)
        return []
    flights = []
    for row in matches[:-1]:
        markers = len(typePattern.findall(row))
        if markers == 1:
            info = directFlight_parser(row, dept_date, airports_dict)
        elif markers > 1:
            info = transferFlight_parser(row, dept_date, airports_dict)
        else:
            continue
        if info != []:
            flights.append(info)
    return [flight for flight in set(flights)]
def GetInterPricePage(queryLogTransNo, cookie, referer, use_proxy=True):
    """Fetch ctrip's international sub-step price page for a query-log id.

    Args:
        queryLogTransNo: transaction id interpolated into the price URL.
        cookie, referer: forwarded to the page fetcher.
        use_proxy: currently unused; kept for interface compatibility.

    Returns the page body on success, or None when the fetch failed.
    """
    priceURL = "http://flights.ctrip.com/international/GetSubstepSearchResults.aspx?IsJSON=T&queryLogTransNo=%s&QueryType=1&cityPairAirline=first&withDirectAirline=T&RdNo=2103213618&ind=347,359,356,370" % queryLogTransNo
    # On failure, mark the proxy bad and retry with a new one.
    # The retry budget is currently 1, i.e. a single attempt.
    for _ in range(1):
        p = get_proxy()
        resp = crawl_single_page(priceURL, referer=referer, proxy=p, cookie=cookie)
        if not resp:  # None or empty page
            invalid_proxy(p)
        else:
            return resp
    return None
def crawl(url):
    """Fetch *url* through the module-level PROXY.

    A response shorter than 1000 bytes is treated as a bad/blocked proxy:
    the proxy is reported invalid and a fresh 'Platform' proxy is swapped
    in, up to 11 attempts total.  Returns the (possibly still short)
    page content.
    """
    global PROXY
    mc = MC()
    mc.set_proxy(PROXY)
    content = mc.req('get', url, html_flag=True, time_out=15)
    count = 0
    while len(content) < 1000:
        invalid_proxy(PROXY, 'Platform')
        PROXY = get_proxy(source='Platform')
        mc.set_proxy(PROXY)
        # parenthesized print: same output on Python 2 and Python 3
        print('proxy: %s' % PROXY)
        content = mc.req('get', url, html_flag=True, time_out=15)
        count += 1
        if count > 10:
            break
    return content
def crawl(url):
    """Fetch *url* through the module-level proxy ``p``.

    While the response is shorter than 2000 bytes (assumed to mean the
    proxy failed or the request was blocked), report the proxy invalid,
    rotate to a new 'Platform' proxy and retry -- giving up after 6
    attempts.  Returns the (possibly still short) page content.
    Mutates the global ``p`` as a side effect.
    """
    global p
    mc = MC()
    #mc.set_debug(True)
    mc.set_proxy(p)
    print 'proxy:',p
    content = mc.req('get', url, html_flag = True,time_out=20)
    count = 0
    # retry loop: short page -> proxy presumed bad, swap and refetch
    while len(content) < 2000:
        invalid_proxy(p,'Platform')
        p = get_proxy(source = 'Platform')
        mc.set_proxy(p)
        print p
        content = mc.req('get', url, html_flag = True,time_out=20)
        count += 1
        if count>5:
            break
    return content
def crawl(url):
    """Fetch *url* via the module-level PROXY, rotating proxies on failure.

    Any response under 1000 bytes is taken as a dead proxy: it is reported
    invalid and replaced with a fresh 'Platform' proxy, for at most 11
    attempts in total.  Returns the page content and updates the global
    PROXY in place.
    """
    global PROXY
    fetcher = MC()
    fetcher.set_proxy(PROXY)
    content = fetcher.req('get', url, html_flag=True)
    attempts = 0
    while len(content) < 1000:
        invalid_proxy(PROXY, 'Platform')
        PROXY = get_proxy(source='Platform')
        fetcher.set_proxy(PROXY)
        print('proxy: %s' % PROXY)
        content = fetcher.req('get', url, html_flag=True)
        attempts += 1
        if attempts > 10:
            break
    return content
def GetInterPricePage(queryLogTransNo, cookie, referer, use_proxy=True):
    """Retrieve ctrip's international sub-step price page.

    Returns the page body on success, None when the fetch failed.
    ``use_proxy`` is accepted for interface compatibility.
    """
    priceURL = "http://flights.ctrip.com/international/GetSubstepSearchResults.aspx?IsJSON=T&queryLogTransNo=%s&QueryType=1&cityPairAirline=first&withDirectAirline=T&RdNo=2103213618&ind=347,359,356,370" % queryLogTransNo
    # retry budget: currently a single attempt; a bad proxy is reported
    for _ in range(1):
        proxy = get_proxy()
        page = crawl_single_page(priceURL, referer=referer, proxy=proxy, cookie=cookie)
        if page:
            return page
        invalid_proxy(proxy)
    return
return result if hotel_id_temp == '0': result['error'] = TASK_ERROR return result p = get_proxy(source='elongHotel') if p == None: result['error'] = PROXY_NONE return result post_data = get_post_data(hotel_id_temp, check_in, check_out) page = request_post_data(request_url,data=post_data,proxy=p) if page == None or page == '': invalid_proxy(proxy=p, source='elongHotel') result['error'] = PROXY_INVALID return result room_list = parseRoom(page,hotel_name,city_name_zh,check_in,check_out,hotel_id) if room_list != []: result['para'] = room_list return result else: result['error'] = DATA_NONE return result def elong_room_request_parser(content):
except Exception, e: logger.error('ryanairFlight: wrong content format with %s' % content) result['error'] = TASK_ERROR return result p = get_proxy(source='ryanairFlight') if p == None: result['error'] = PROXY_NONE return result trip_type = 'Oneway' page = GetData(trip_type, dept_id, dest_id, dept_date, ret_date, proxy=p) if page == None: invalid_proxy(proxy=p, source='ctripFlight') result['error'] = PROXY_INVALID return result data = ParsePage(page) if data == None: result['error'] = DATA_NONE return result currency = GetCurrency(page) allinfo = [] data = jsonlib.read(data) for k, v in data.items(): for one_day_flights in v: for one_day_flight in one_day_flights[1]:
if url == '' or url == None: logger.error('feifanFlight: Get url failed!') result['error'] = UNKNOWN_TYPE return result #抓取页面并判断其是否可用 #feifan常常要刷新才能获取内容,所以爬取3次 for i in range(3): page = crawl_single_page(url, proxy=p) if page != '' and len(page) > 300: flights = parsePage(page, dept_year) if flights == []: if page.find('非凡旅行网-抱歉,您没有权限访问') != -1: invalid_proxy(proxy=p, source='feifanFlight') result['error'] = PROXY_FORBIDDEN else: result['error'] = DATA_NONE return result else: result['para'] = flights return result else: continue invalid_proxy(proxy=p, source='feifanFlight') logger.error('feifanFlight: Get page content failed!') result['error'] = PROXY_INVALID return result
(taskcontent)) result['error'] = TASK_ERROR return result p = get_proxy(source='tongchengFlight') #p = crawl_single_page('http://114.215.168.168:8086/proxy') #p = None if p == None: result['error'] = PROXY_NONE return result url = get_url(dept_city, dest_city, dept_day, dept_id, dest_id, p) if url == 'proxy_forbidden': invalid_proxy(proxy=p, source='tongchengFlight') result['error'] = PROXY_FORBIDDEN return result if url != '': page = crawl_single_page(url, proxy=p) print page else: logger.error('tongchengFlight: Get url failed!') invalid_proxy(proxy=p, source='tongchengFlight') result['error'] = PROXY_INVALID return result if page != '' and len(page) > CONTENT_LEN: flights = ParsePage(page) else:
Flight.dept_id) for i in range(len(multi_price)): multi_ticket.append((Flight.fight_no,Flight.plane_no,Flight.airline,Flight.dept_id,Flight.dest_id,\ Flight.dept_day,Flight.dept_time, Flight.dest_time,Flight.dur, multi_price[i][0], multi_price[i][1],\ multi_price[i][2], multi_price[i][3],multi_price[i][4],multi_price[i][5],multi_price[i][6], multi_price[i][7])) # print every ticket and ervery flight if page_flag == True: print 'the num of tickets is ' + str(len(multi_ticket)) result['error'] = 0 return result else: result['error'] = UNKNOWN_TYPE return result else: if html.find('NEEDVERIFY') != -1: invalid_proxy(proxy=task_content_proxy, source='csairFlight') return { 'para': { 'flight': {}, 'ticket': [] }, 'error': PROXY_INVALID } else: return {'para': {'flight': {}, 'ticket': []}, 'error': DATA_NONE} def csair_request_parser(content): result = -1 return result
except Exception,e: logger.error('bookingHotel: Wrong Content Format with %s'%taskcontent) result['error'] = TASK_ERROR return result hotel_url = get_hotel_url(url_hotel_name,check_in,check_out) p = get_proxy(source='bookingHotel') if p == None: result['error'] = PROXY_NONE return result page = crawl_single_page(hotel_url, proxy = p) if page == None or page == '': invalid_proxy(proxy=p, source='bookingHotel') result['error'] = PROXY_INVALID return result if len(page) > CONTENT_LEN: room_info = parseRoom(page, check_in, check_out, hotel_id) else: result['error'] = UNKNOWN_TYPE return result if room_info != []: result['para'] = room_info return result else: result['error'] = DATA_NONE
postdata = getPostData(dept_time, dept_id, dest_id) p = get_proxy(source='vuelingFlight') if p == None: result['error'] = PROXY_NONE return result url = 'http://tickets.vueling.com/ScheduleSelect.aspx' Referer = 'http://tickets.vueling.com/ScheduleSelect.aspx' content = request_post_data(url,postdata,referer=Referer,proxy=p,\ Accept="text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8") if content == '' or content == None: invalid_proxy(proxy=p, source='vuelingFlight') result['error'] = PROXY_INVALID allinfos = vuelingparser(content) if allinfos == []: result['error'] = DATA_NONE return result else: result['para'] = allinfos return result def vueling_request_parser(content): result = -1
def crawl(city_url,city_id):
    """Scrape the TripAdvisor ('daodao') restaurant listing for one city.

    Fetches the first listing page at *city_url* (rotating the global
    proxy ``p`` on short responses, up to 21 attempts), extracts the total
    restaurant count and the first page of restaurant URLs/ids, inserts
    them via insert_db, then pages through the RestaurantSearch AJAX
    endpoint 30 results at a time, inserting each page's records.
    Mutates the global ``p`` as a side effect; returns None.

    NOTE(review): reconstructed indentation -- the original source was
    collapsed onto a single line; loop nesting of the final statements is
    inferred and should be confirmed against the original file.
    """
    global p
    source = 'daodao'
    #city_url = city_url.replace('Tourism','Restaurants')
    print city_url
    mc = MC()
    mc.set_proxy(p)
    print 'proxy: %s' % p
    page1 = ''
    page1 = mc.req('get',city_url,html_flag=True, time_out=10)
    count =0
    # retry with fresh proxies while the page looks too short to be real
    while len(page1)<1000:
        invalid_proxy(p,'Platform')
        p = get_proxy(source='Platform')
        print 'proxy: %s' % p
        mc.set_proxy(p)
        page1 = mc.req('get',city_url,html_flag=True , time_out=10)
        count += 1
        if count > 20:
            break
    source_city_id = re.compile(r'-g(\d+)').findall(city_url)[0]
    root = html.fromstring(page1)
    # total number of restaurants in the city
    rating_info = root.find_class('listing')[0].find_class('popIndexDefault')[0].xpath('text()')[0].encode('utf-8').strip().split('(')[1].replace(',','')
    nums = re.compile(r'(\d+)').findall(rating_info)
    res_total = int(nums[0])
    print "total: %s " % res_total
    # restaurant list on the first page
    items = root.find_class('listing')
    data_list = []
    for item in items:
        res_url = 'http://www.tripadvisor.cn' + item.find_class('title')[0].xpath('a/@href')[0].strip().encode('utf-8')
        res_id = re.compile(r'd(\d+)').findall(res_url)[0].encode('utf-8')
        print res_url
        data = (source,city_id,res_id,res_url)
        print data
        data_list.append(data)
    print 'insert',insert_db(data_list)
    print '------------next page------------'
    itag = '10591'  # restaurant category id
    page = 2
    data_list = []
    # page through the AJAX listing endpoint, 30 results per page
    for offset in range(30,res_total+1,30):
        print '-----------page %s-------' % page
        page += 1
        next_url = 'http://www.tripadvisor.cn/RestaurantSearch?Action=PAGE&geo=%s&ajax=1&itags=%s&sortOrder=popularity&o=a%s&availSearchEnabled=false' % (source_city_id,itag,offset)
        print next_url
        content2 = ''
        content2 = mc.req('get',next_url,html_flag = True)
        # NOTE(review): this retry loop fetches a new proxy but never calls
        # mc.set_proxy(p), never reports the old proxy invalid, and has no
        # attempt limit -- possibly unintentional; confirm before relying on it.
        while (len(content2) < 1000):
            p = get_proxy(source='Platform')
            print 'proxy: %s' % p
            content2 = mc.req('get',next_url,html_flag = True)
        no_count = len( re.compile(r'(该餐馆暂无点评,来写第一条)').findall(content2) )
        # stop paging once (nearly) every entry is the "no reviews yet" placeholder
        if int(no_count) >29:
            break
        root2 = html.fromstring(content2)
        items = root2.find_class('listing')
        data_list2 = []
        for item in items:
            res_url = 'http://www.tripadvisor.cn' + item.find_class('title')[0].xpath('a/@href')[0].strip().encode('utf-8')
            res_id = re.compile(r'd(\d+)').findall(res_url)[0].encode('utf-8')
            print res_url
            data2 = (source,city_id,res_id,res_url)
            print data2
            data_list2.append(data2)
        print 'insert',insert_db(data_list2)
    print 'city %s ok' % city_id
return result if hotel_id_temp == '0': result['error'] = TASK_ERROR return result p = get_proxy(source='elongHotel') if p == None: result['error'] = PROXY_NONE return result post_data = get_post_data(hotel_id_temp, check_in, check_out) page = request_post_data(request_url, data=post_data, proxy=p) if page == None or page == '': invalid_proxy(proxy=p, source='elongHotel') result['error'] = PROXY_INVALID return result room_list = parseRoom(page, hotel_name, city_name_zh, check_in, check_out, hotel_id) if room_list != []: result['para'] = room_list return result else: result['error'] = DATA_NONE return result
return result url = URL % (location, urlday) p = get_proxy(source='elongFlight') if p == None: result['error'] = PROXY_NONE return result mc = MechanizeCrawler(p='') page = mc.get(url, html_flag=True) if page == None: invalid_proxy(proxy=p, source='elongFlight') logger.error( 'elongFlight: Proxy Error: htmlcontent is null with proxy: %s' % p) result['error'] = PROXY_INVALID return result tickets, flights = elong_page_parser(page) if tickets == [] or tickets == None: result['error'] = DATA_NONE return result result['para']['flight'] = flights result['para']['ticket'] = tickets return result
origday = datetime.datetime(string.atoi(dept_date[0:4]),string.atoi(dept_date[5:7]),string.atoi(dept_date[8:])) urlday = (origday - datetime.datetime.today()).days #dept_date = orig_date #logger.info('contents: %s %s %s %s '%(location,flight_no,dept_date,str(urlday))) except Exception,e: logger.error(str(e)) logger.error('Content Error: Wrong content format with %s'%content) return result url = URL%(location,urlday) p = get_proxy(source='elongFlight') htmlcontent = crawl_single_page(url,n=1,proxy = p) if htmlcontent == '': invalid_proxy(p) logger.error('Proxy Error: htmlcontent is null with proxy: %s'%p) return result #判断是否返回导航页,返回导航页说明content没有航班信息 #判断是否找到航班信息,没有返回[] temp_flight_list = flightPattern.findall(htmlcontent) if len(temp_flight_list) == 1: logger.error('Parser Error: cannot find flights with %s'%location) return result flight_list = temp_flight_list[:-1] #flights = [] typ = 0
except Exception, e: logger.error('bookingHotel: Wrong Content Format with %s' % taskcontent) result['error'] = TASK_ERROR return result hotel_url = get_hotel_url(url_hotel_name, check_in, check_out) p = get_proxy(source='bookingHotel') if p == None: result['error'] = PROXY_NONE return result page = crawl_single_page(hotel_url, proxy=p) if page == None or page == '': invalid_proxy(proxy=p, source='bookingHotel') result['error'] = PROXY_INVALID return result if len(page) > CONTENT_LEN: room_info = parseRoom(page, check_in, check_out, hotel_id) else: result['error'] = UNKNOWN_TYPE return result if room_info != []: result['para'] = room_info return result else: result['error'] = DATA_NONE
return result #获取代理 p = get_proxy(source='wegoFlight') if p == None: result['error'] = PROXY_NONE return result #获取初始url url_temp = get_url(dept_id, arr_id, dept_date) search_id = get_search_id(url_temp, proxy=p) if search_id == '': logger.error('Search_Id Error: get Search_Id failed') invalid_proxy(proxy=p, source='wegoFlight') result['error'] = PROXY_FORBIDDEN return result trip_id = get_trip_id(dept_id, arr_id, dept_date) #使用初始url,获取要爬取的页面,page表示一共有多少页 start_url = get_start_url(search_id, trip_id) content_temp = crawl_single_page( start_url, proxy=p, Host="www.wego.cn", Accept= "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8" ) if content_temp == "":
return result url = URL%(location,urlday) p = get_proxy(source='elongFlight') if p == None: result['error'] = PROXY_NONE return result mc = MechanizeCrawler(p = '') page = mc.get(url, html_flag = True) if page == None: invalid_proxy(proxy = p, source='elongFlight') logger.error('elongFlight: Proxy Error: htmlcontent is null with proxy: %s'%p) result['error'] = PROXY_INVALID return result tickets, flights = elong_page_parser(page) if tickets == [] or tickets == None: result['error'] = DATA_NONE return result result['para']['flight'] = flights result['para']['ticket'] = tickets return result
except Exception,e: logger.error('ryanairFlight: wrong content format with %s'%content) result['error'] = TASK_ERROR return result p = get_proxy(source = 'ryanairFlight') if p == None: result['error'] = PROXY_NONE return result trip_type = 'Oneway' page = GetData(trip_type, dept_id, dest_id, dept_date, ret_date, proxy = p) if page == None: invalid_proxy(proxy = p, source='ctripFlight') result['error'] = PROXY_INVALID return result data = ParsePage(page) if data == None: result['error'] = DATA_NONE return result currency = GetCurrency(page) allinfo = [] data = jsonlib.read(data) for k, v in data.items(): for one_day_flights in v: for one_day_flight in one_day_flights[1]:
Flight.dur = get_duration(Flight.dest_time,Flight.dest_id,Flight.dept_time,Flight.dept_id) for i in range(len(multi_price)): multi_ticket.append((Flight.fight_no,Flight.plane_no,Flight.airline,Flight.dept_id,Flight.dest_id,\ Flight.dept_day,Flight.dept_time, Flight.dest_time,Flight.dur, multi_price[i][0], multi_price[i][1],\ multi_price[i][2], multi_price[i][3],multi_price[i][4],multi_price[i][5],multi_price[i][6], multi_price[i][7])) # print every ticket and ervery flight if page_flag == True: print 'the num of tickets is '+ str(len(multi_ticket)) result['error'] = 0 return result else: result['error'] = UNKNOWN_TYPE return result else: if html.find('NEEDVERIFY') != -1: invalid_proxy(proxy=task_content_proxy,source='csairFlight') return {'para':{'flight':{},'ticket':[]},'error':PROXY_INVALID} else: return {'para':{'flight':{},'ticket':[]},'error':DATA_NONE} def csair_request_parser(content): result = -1 return result '''if __name__ == '__main__': proxy_flag = False proxy = None taskcontent = 'PEK&PAR&20140730' result = csair_task_parser(taskcontent)
content_len = 0 while i < 5 and content_len < CONTENT_LEN: #p = get_proxy() p = get_proxy(source='youzhanHotel') #print p if p == None: result['error'] = PROXY_NONE return result url = price_url + str(int(time.time() * 1000)) price_page = crawl_single_page(url,proxy=p,n=1) content_len = len(price_page) i += 1 if price_page == None or price_page == '': invalid_proxy(proxy=p, source='youzhanHotel') result['error'] = PROXY_INVALID return result #print price_page price_list = price_parser(price_page,hotel_id) if price_list != []: for each_room in price_list: if len(each_room) > 3: room.city = city room.occupancy = 2 #room.hotel_name = hotel.hotel_name #print '******' #print each_room room.room_desc = each_room[3] room.real_source = each_room[2]
#p = get_proxy() #print p p = get_proxy(source='haodingHotel') if p == '' or p == None: return {'para':[], 'error':NO_PROXY} i = 0 content_len = 0 while i < 3 and content_len < CONTENT_LEN: content = crawl_single_page(hotel_url, p) content_len = len(content) i += 1 if content == '' or content == None: invalid_proxy(proxy = p, source='haodingHotel') return {'para':[], 'error':NO_CONTENT} if len(content) < CONTENT_LEN: return {'para':[], 'error':NO_INFO} room_list = parseRoom(content,city_name_zh,country_name_zh,hotel_id,check_in,check_out) if room_list == [] or room_list == None: return {'para':[], 'error':NO_RESULT} return {'para':room_list, 'error':0} def haoding_room_request_parser(content):
#p = '116.228.55.217:8000' p = get_proxy() url = 'http://tickets.vueling.com/ScheduleSelect.aspx' Referer = 'http://tickets.vueling.com/ScheduleSelect.aspx' content = request_post_data(url,postdata,referer=Referer,proxy=p,\ Accept="text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8") result = -1 if content != '' and len(content) > 100: result = vuelingparser(content, flight_no, req_dept_time) else: invalid_proxy(p) logger.error('Get web content failed!') return result def vuelingparser(content, flight_no, req_dept_time): #allinfos = [] #get flight num flight_num_list = [] flight_num_info_temp = flight_no_pat.findall(content) if flight_num_info_temp != []: for flight_num_info in flight_num_info_temp: flight_num_temp_1 = flight_num_info.find('|') flight_num_temp_2 = flight_num_info.rfind('~^')
return result #获取代理 p = get_proxy(source = 'wegoFlight') if p == None: result['error'] = PROXY_NONE return result #获取初始url url_temp = get_url(dept_id,arr_id,dept_date) search_id = get_search_id(url_temp,proxy = p) if search_id == '': logger.error('Search_Id Error: get Search_Id failed') invalid_proxy(proxy=p, source='wegoFlight') result['error'] = PROXY_FORBIDDEN return result trip_id = get_trip_id(dept_id,arr_id,dept_date) #使用初始url,获取要爬取的页面,page表示一共有多少页 start_url = get_start_url(search_id,trip_id) content_temp = crawl_single_page(start_url,proxy = p, Host="www.wego.cn", Accept="text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8") if content_temp == "": logger.error('Proxy Error: htmlcontent is null with proxy: %s'%p) #反馈代理无效 invalid_proxy(proxy=p, source='wegoFlight') result['error'] = PROXY_INVALID return result