Esempio n. 1
0
    def __init__(self, type=1, status=1):
        """Launch a selenium browser and record its pool bookkeeping fields.

        :param type: browser kind — 1 launches Chrome (optionally behind a
            proxy taken from ``spider_config``), 2 launches Firefox.
        :param status: initial pool status flag (see ``acquire_browser``:
            0 appears to mean "free", 1 "in use" — confirm against the pool).
        """
        # BUG FIX: the original hard-coded ``self.status = 1`` and silently
        # ignored the ``status`` parameter; honour it (the default value 1
        # keeps the previous behaviour for existing callers).
        self.status = status
        self.type = type

        if self.type == 1:
            chrome_options = webdriver.ChromeOptions()
            # Proxy support is optional and driven entirely by configuration.
            if spider_config.getint('proxyconf', 'switch') >= 1:
                proxy_type = spider_config.getint('proxyconf', 'type')
                if proxy_type == 1:
                    # Type 1: proxy credentials packaged as a Chrome extension.
                    chrome_options.add_extension(
                        get_chrome_proxy_extension(spider_config.get('proxy1', 'dynamicProxy')))
                if proxy_type == 2:
                    # Type 2: ask a proxy-dispatch service for an IP:port pair.
                    data = json.dumps({"targetSiteName": spider_config.get("proxy2", "targetSiteName")})
                    request = urllib2.Request(url=spider_config.get("proxy2", "dynamicProxy"), data=data)
                    response = urllib2.urlopen(request)
                    proxy_data = json.loads(response.read())
                    logging.info("返回代理:" + json.dumps(proxy_data))
                    proxy_ip = proxy_data["proxy"]["IP"]
                    proxy_port = proxy_data["proxy"]["port"]
                    chrome_options.add_argument('--proxy-server=http://' + proxy_ip + ":" + proxy_port)

            # Disable image loading to speed up page fetches.
            prefs = {"profile.managed_default_content_settings.images": 2}
            chrome_options.add_experimental_option("prefs", prefs)

            self.browser = webdriver.Chrome(spider_config.get('browser', 'chromedriver'), chrome_options=chrome_options)
        elif self.type == 2:
            self.browser = webdriver.Firefox()
Esempio n. 2
0
def switch_proxy_ip():
    """Ask the proxy-1 service to rotate to a fresh proxy IP.

    No-op unless proxy type 1 is configured and the proxy switch is enabled.
    Failures are logged and swallowed: rotation is best-effort.
    """
    try:
        if (spider_config.getint("proxyconf", "type") == 1
                and spider_config.getint("proxyconf", "switch") > 0):
            req = urllib2.Request(spider_config.get('proxy1', 'switchProxy'))
            res_data = urllib2.urlopen(req)
            res = res_data.read()
            logger.info('切换代理IP:' + res)
    # FIX: the bare ``except:`` also swallowed SystemExit/KeyboardInterrupt;
    # catch only ordinary errors, and keep the traceback in the log.
    except Exception:
        logger.exception('切换代理IP失败')
Esempio n. 3
0
    def get_shopping_result(self, browser, req):
        """Load the shopping page for ``req`` and scrape the fare table.

        Polls the page (1s interval) until the results table appears or the
        configured timeout elapses; returns the scraped dict with
        ``status = 0`` on success, or ``{"status": 1}`` on timeout.
        """
        browser.get(self.get_shopping_url(req))

        # JS probe: non-null once the results table ("table-0") exists.
        if_loaded = 'if (typeof document.getElementById("table-0") == "undefined") return null; else return document.getElementById("table-0")'

        probe = browser.execute_script(if_loaded)

        # Enforce a floor of 10 seconds on the configured timeout.
        timeout = max(spider_config.getint('spider', 'timeout'), 10)

        waited = 0
        while not probe and waited <= timeout:
            waited += 1
            self.logger.info('sleep {0}'.format(waited))
            probe = browser.execute_script(if_loaded)
            time.sleep(1)

        if probe:
            outcome = browser.execute_script(js_tiger_shopping)
            outcome["status"] = 0
        else:
            outcome = {"status": 1}

        self.logger.debug(outcome)
        return outcome
Esempio n. 4
0
    def get_shopping_result(self, browser, req):
        """Load the shopping page for ``req`` and scrape the availability data.

        Picks the mobile or PC readiness probe based on ``req.entry``, polls
        (1s interval) until the page is ready or the configured timeout
        elapses, then dispatches to the python parser or the matching scraper
        script. Returns the scraped dict with ``status = 0`` on success, or
        ``{"status": 1}`` on timeout.
        """
        browser.get(self.get_shopping_url(req))

        # JS probe: non-null once the entry point's results container exists.
        if req.entry == 'mobile':
            if_loaded = 'if (typeof document.getElementById("fareflight") == "undefined") return null; else return document.getElementById("fareflight")'
        else:
            if_loaded = 'if (typeof document.getElementsByClassName("js_availability_container") == "undefined") return null; else return document.getElementsByClassName("js_availability_container")'
        probe = browser.execute_script(if_loaded)

        # Enforce a floor of 10 seconds on the configured timeout.
        timeout = max(spider_config.getint('spider', 'timeout'), 10)

        waited = 0
        while not probe and waited <= timeout:
            waited += 1
            self.logger.info('sleep {0}'.format(waited))
            probe = browser.execute_script(if_loaded)
            time.sleep(1)

        if probe:
            # Parser priority: explicit python parser, then entry-specific JS.
            if req.parser == 'python':
                outcome = parse_pc_shopping(browser, req)
            elif req.entry == 'mobile':
                outcome = browser.execute_script(js_mobile_shopping)
            else:
                outcome = browser.execute_script(js_pc_shopping)
            outcome["status"] = 0
        else:
            outcome = {"status": 1}

        self.logger.debug(outcome)
        return outcome
Esempio n. 5
0
    def get_shopping_result(proxies, req):
        """Run a Tigerair availability search over plain HTTP.

        :param proxies: requests-style proxies dict, or None for direct.
        :param req: search request exposing flightOption, fromCity, toCity,
            startDate, adultNumber and childNumber.
        :return: parsed result from ``TigerHttp.start``; ``{"status": 1}``
            when the POST times out.
        """
        url = "https://booking.tigerair.com.au/TigerAirIBE/Booking/Search"
        # First GET obtains the session cookies and the anti-CSRF token that
        # the search POST must echo back.
        r = requests.get(url, proxies=proxies)
        doc = pq(r.text)
        temp_token = doc("[name='__RequestVerificationToken']").val()
        trip_kind = "oneWay" if req.flightOption == 1 else "roundTrip"

        data = {'__RequestVerificationToken': temp_token,
                'TripKind': trip_kind, 'Destination': req.toCity,
                'Origin': req.fromCity,
                'DepartureDate': TigerHttp.date_format(req.startDate),
                'AdultCount': str(req.adultNumber),
                'ChildCount': str(req.childNumber)}
        try:
            r = requests.post(url=url, data=data, proxies=proxies,
                              timeout=spider_config.getint("spider", "timeout"), cookies=r.cookies)
            doc = pq(r.text)
            return TigerHttp.start(doc, req)
        # FIX: ConnectTimeout is a subclass of Timeout, so the two identical
        # handlers collapse into this single one.
        except requests.exceptions.Timeout:
            return {"status": 1}
Esempio n. 6
0
class SchedulerConfig(object):
    """Flask-APScheduler configuration: a single interval job that rotates
    the proxy IP via ``switch_proxy_ip`` in the ``__main__`` module."""

    JOBS = [
        {
            'id': 'proxy',
            'func': '__main__:switch_proxy_ip',
            'args': None,
            'trigger': 'interval',
            # Rotation period (seconds) comes from the proxy-1 config section.
            'seconds': spider_config.getint('proxy1', 'switchTime'),
        },
    ]
Esempio n. 7
0
    def get_proxies(self):
        """Return a requests-style proxies dict, or None when disabled/failed.

        Resolution order:
          1. Rotate through the cached pool (``self._ProxiesPool``), if any.
          2. Proxy type 1: fixed endpoint read straight from config.
          3. Proxy type 2: ask the dispatch service for an IP:port pair.
        """
        proxies = None
        if spider_config.getint("proxyconf", "switch") > 0:

            if len(self._ProxiesPool) > 0:
                # NOTE(review): set.pop() removes an arbitrary element and the
                # pop/add pair re-inserts it — a rough rotation, not a strict
                # round-robin.
                cache_proxy = self._ProxiesPool.pop()
                self._ProxiesPool.add(cache_proxy)
                # Strip the scheme ("http://", ...) and keep host:port.
                cache_proxy = cache_proxy.split("://")[1]
                return {
                    "http": "http://" + cache_proxy,
                    "https": "http://" + cache_proxy
                }

            try:
                if spider_config.getint("proxyconf", "type") == 1:
                    proxy_inf = spider_config.get("proxy1", "dynamicProxy")
                    proxies = {
                        "http": "http://" + proxy_inf,
                        "https": "http://" + proxy_inf
                    }

                if spider_config.getint("proxyconf", "type") == 2:
                    data = json.dumps({
                        "targetSiteName":
                        spider_config.get("proxy2", "targetSiteName")
                    })
                    request = urllib2.Request(url=spider_config.get(
                        "proxy2", "dynamicProxy"),
                                              data=data)
                    response = urllib2.urlopen(request)
                    proxy_data = json.loads(response.read())
                    logging.info("ip" + json.dumps(proxy_data))
                    proxy_ip = proxy_data["proxy"]["IP"]
                    proxy_port = proxy_data["proxy"]["port"]

                    proxies = {
                        "http": "http://" + proxy_ip + ":" + proxy_port,
                        "https": "http://" + proxy_ip + ":" + proxy_port
                    }
            # FIX: narrowed from a bare ``except:`` (which also swallowed
            # SystemExit/KeyboardInterrupt); failure stays best-effort and
            # falls through to ``return None``.
            except Exception:
                self._logger.error("获取动态代理失败")
        return proxies
Esempio n. 8
0
    def acquire_browser(self, type):
        """Check a free browser of the given type out of the pool.

        Creates a new ``WebDriverInfo`` when no free browser exists and the
        pool is below the configured ``maxThread`` cap.

        :param type: WebDriverInfo browser kind (1 = Chrome, 2 = Firefox).
        :return: a selenium browser instance, or None when the pool is
            saturated.
        """
        # FIX: the original called acquire()/release() without try/finally,
        # so an exception while starting a browser (WebDriverInfo can raise)
        # left the pool lock held forever.  ``with`` guarantees release.
        with self._lock:
            result = None

            # Reuse an idle browser of the requested type if one exists.
            for bs in self._driverPool:
                if bs.status == 0 and bs.type == type:
                    bs.status = 1  # mark as in use
                    result = bs.browser
                    break

            # Otherwise grow the pool, up to the configured thread cap.
            max_thread = spider_config.getint('spider', 'maxThread')
            if result is None and len(self._driverPool) < max_thread:
                info = WebDriverInfo(type=type)
                self._driverPool.append(info)
                result = info.browser

            return result
Esempio n. 9
0
        str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(startTime))) +
        '-----结束时间:' +
        str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(endTime))) +
        '--------------------共消耗' + str((endTime - startTime) * 1000) + 'ms')
    return result


def switch_proxy_ip():
    """Ask the proxy-1 service to rotate to a fresh proxy IP.

    No-op unless proxy type 1 is configured and the proxy switch is enabled.
    Failures are logged and swallowed: rotation is best-effort.
    """
    try:
        if (spider_config.getint("proxyconf", "type") == 1
                and spider_config.getint("proxyconf", "switch") > 0):
            req = urllib2.Request(spider_config.get('proxy1', 'switchProxy'))
            res_data = urllib2.urlopen(req)
            res = res_data.read()
            logger.info('切换代理IP:' + res)
    # FIX: the bare ``except:`` also swallowed SystemExit/KeyboardInterrupt;
    # catch only ordinary errors, and keep the traceback in the log.
    except Exception:
        logger.exception('切换代理IP失败')


if __name__ == '__main__':
    # Scheduled proxy rotation is currently disabled; re-enable by
    # uncommenting the APScheduler wiring below.
    # scheduler = APScheduler()
    # app.config.from_object(SchedulerConfig())
    # scheduler.init_app(app)
    # scheduler.start()

    # Serve the Flask app on the configured port.
    port = spider_config.getint('server', 'port')
    logger.info('get port: {0}'.format(port))

    # Bind on all interfaces; debug mode off for production use.
    app.run(host="0.0.0.0", port=port, debug=False)