Example 1
    def get_video_src(self):

        # check whether the room URL's host has a matching rule
        o = urlparse(self.room_url)
        rule_key = o.netloc

        if rule_key in self.rules:
            cap = dict(webdriver.DesiredCapabilities.PHANTOMJS)  # copy, so the shared defaults are not mutated

            cap["phantomjs.page.settings.resourceTimeout"] = 1000
            cap["phantomjs.page.settings.loadImages"] = True
            cap["phantomjs.page.settings.disk-cache"] = True
            cap["phantomjs.page.settings.userAgent"] = "Mozilla/5.0(iPhone;CPU iPhone OS 9_1 like Mac OSX) AppleWebKit / 601.1"
            ".46(KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1"
            driver = PhantomJS(self.driverPath["PhantomJS"],
                               desired_capabilities=cap)
            # specify which browser to drive (a local Firefox could be used instead, see below)
            # driver = webdriver.Firefox()
            driver.implicitly_wait(10)
            my_rule = self.rules[rule_key]

            url_prefix = my_rule["url_prefix"]
            driver.get("%s%s" % (url_prefix, o.path))
            try:
                result_video = driver.find_element_by_tag_name(
                    'video').get_attribute('src')
                driver.close()
                return result_video

            except Exception:
                # could not retrieve the live-stream address for this room
                driver.close()
                return "主播不在了"  # i.e. "the streamer is offline"
                # return " "
        else:
            return "不支持的网站(not support url)"  # unsupported site
Example 2
# Assumed imports for this Python 2 example; SmartRedirectHandler is a
# project-specific urllib2 redirect handler and is not defined here.
import threading
import cookielib
import urllib
import urllib2

from selenium.webdriver import PhantomJS
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities


class RequestUtil:
    __browserAgent = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0'

    def __init__(self):
        self.cookies = ''
        self._lock = threading.RLock()

    def http_get_request(self, url, referer, timeout=''):
        self._lock.acquire()
        cookie = cookielib.CookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie),
                                      SmartRedirectHandler())
        urllib2.install_opener(opener)
        headers = {
            'User-Agent': self.__browserAgent,
            'Referer': referer,
            'Cache-Control': 'max-age=0',
            'Accept': '*/*',
            'Connection': 'Keep-Alive',
            'Accept-encoding': 'gzip'
        }
        req = urllib2.Request(url=url, headers=headers)
        if timeout == '':
            resp = urllib2.urlopen(req)
        else:
            resp = urllib2.urlopen(req, timeout=timeout)
        if self.cookies == '':
            for item in cookie:
                self.cookies = self.cookies + item.name + '=' + item.value + ';'
            self.cookies = self.cookies[:-1]
        if url != resp.url:
            req = urllib2.Request(url=resp.url, headers=headers)
        self._lock.release()
        return (resp, req)

    def http_post_request(self, url, datas, referer, timeout=''):
        self._lock.acquire()
        postdata = urllib.urlencode(datas)
        headers = {
            'User-Agent': self.__browserAgent,
            'Referer': referer,
            'Content-Type': 'application/x-www-form-urlencoded',
            'Cache-Control': 'no-cache',
            'Accept': '*/*',
            'Connection': 'Keep-Alive',
            'Accept-encoding': 'gzip',
            'Cookie': self.cookies
        }
        req = urllib2.Request(url=url, data=postdata, headers=headers)
        req.get_host()
        if timeout == '':
            resp = urllib2.urlopen(req)
        else:
            resp = urllib2.urlopen(req, timeout=timeout)
        if url != resp.url:
            req = urllib2.Request(url=resp.url, headers=headers)
        self._lock.release()
        return (resp, req)

    def http_get(self, url, refer='https://www.baidu.com'):
        return self.http_get_request(url, refer, 60)

    def http_post(self, url, datas, refer='https://www.baidu.com'):
        return self.http_post_request(url, datas, refer, 60)

    def http_post_request2(self, url, datas, timeout=''):
        if timeout == '':
            resp = urllib2.urlopen(url, datas)
        else:
            resp = urllib2.urlopen(url, datas, timeout=timeout)
        data = resp.read()
        return data

    def http_post2(self, url, datas):
        return self.http_post_request2(url, datas, 300)

    def create_phandomjs(self, service_args, caps, timeout=30):
        self.driver = PhantomJS(desired_capabilities=caps,
                                service_args=service_args)
        self.driver.set_page_load_timeout(timeout)
        self.driver.set_script_timeout(timeout)
        self.driver.implicitly_wait(timeout)

    def close_phandomjs(self):
        try:
            self.driver.quit()
        except:
            pass

    def http_get_phandomjs(self,
                           url,
                           refer='https://www.baidu.com',
                           timeout=1000):
        caps = dict(DesiredCapabilities.PHANTOMJS)
        caps['browserName'] = 'chrome'
        caps["phantomjs.page.settings.resourceTimeout"] = timeout
        caps["phantomjs.page.settings.loadImages"] = False
        caps["phantomjs.page.settings.userAgent"] = (self.__browserAgent)
        caps["phantomjs.page.customHeaders.Referer"] = (refer)

        service_args = []
        service_args.append('--load-images=no')
        service_args.append('--disk-cache=yes')
        service_args.append('--cookies-file=')

        self.create_phandomjs(timeout=timeout,
                              service_args=service_args,
                              caps=caps)
        self.driver.get(url)
        return self.driver.page_source
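A short usage sketch for RequestUtil (the URLs are placeholders); http_get and http_post return a (response, request) pair, where the response is the urllib2 handle:

util = RequestUtil()
resp, req = util.http_get("http://example.com/", refer="http://example.com/")
print(resp.getcode())  # HTTP status of the GET
body = resp.read()     # may be gzip-compressed, since 'Accept-encoding: gzip' is sent

resp, req = util.http_post("http://example.com/login",
                           {"user": "demo", "password": "demo"})
print(util.cookies)    # cookies captured on the first GET and replayed on later POSTs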
Example 3
class PagesCrawler(BaseSpider):

    name = 'pages'
    link_extractor = RegexpLinkExtractor(canonicalize=False, deny_extensions=[])
    ignored_exts = set(['.' + e for e in IGNORED_EXTENSIONS])

    def __init__(self, **kw):
        args = DEFAULT_INPUT.copy()
        args.update(kw)
        self.args = args
        self.start_urls = to_list(args['start_urls'])
        self.maxdepth = int(args['maxdepth'])
        self.follow_prefixes = to_list(args['follow_prefixes'])
        self.nofollow_prefixes = to_list(args['nofollow_prefixes'])
        self.discover_prefixes = [url_to_lru_clean("http%s://%s" % (https, u.replace('http://', '').replace('https://', ''))) for u in to_list(args['discover_prefixes']) for https in ['', 's']]
        self.resolved_links = {}
        self.user_agent = args['user_agent']
        self.phantom = 'phantom' in args and args['phantom'] and args['phantom'].lower() != "false"
        if self.phantom:
            self.ph_timeout = int(args.get('phantom_timeout', PHANTOM['TIMEOUT']))
            self.ph_idle_timeout = int(args.get('phantom_idle_timeout', PHANTOM['IDLE_TIMEOUT']))
            self.ph_ajax_timeout = int(args.get('phantom_ajax_timeout', PHANTOM['AJAX_TIMEOUT']))
        self.errors = 0
        dispatcher.connect(self.closed, spider_closed)
        dispatcher.connect(self.crashed, spider_error)

    def start_requests(self):
        self.log("Starting crawl task - jobid: %s" % self.crawler.settings['JOBID'], log.INFO)
        self.log("ARGUMENTS : "+str(self.args), log.INFO)
        if self.phantom:
            self.init_phantom()
        for url in self.start_urls:
            yield self._request(url)

    def init_phantom(self):
        self.prefixfiles = os.path.join(
            scrapyd_config().get('logs_dir'),
            HYPHE_PROJECT,
            self.name,
            self.crawler.settings['JOBID']
        )
        self.log("Using path %s for PhantomJS crawl" % self.prefixfiles, log.INFO)
        phantom_args = []
        if PROXY and not PROXY.startswith(':'):
            phantom_args.append('--proxy=%s' % PROXY)
        phantom_args.append('--cookies-file=%s-phantomjs-cookie.txt' % self.prefixfiles)
        phantom_args.append('--ignore-ssl-errors=true')
        phantom_args.append('--load-images=false')
        self.capabilities = dict(DesiredCapabilities.PHANTOMJS)
        self.capabilities['phantomjs.page.settings.userAgent'] = self.user_agent
        self.capabilities['takesScreenshot'] = False
        self.capabilities['phantomjs.page.settings.javascriptCanCloseWindows'] = False
        self.capabilities['phantomjs.page.settings.javascriptCanOpenWindows'] = False
        self.phantom = PhantomJS(
            executable_path=PHANTOM['PATH'],
            service_args=phantom_args,
            desired_capabilities=self.capabilities,
            service_log_path="%s-phantomjs.log" % self.prefixfiles
        )
        self.phantom.implicitly_wait(10)
        self.phantom.set_page_load_timeout(60)
        self.phantom.set_script_timeout(self.ph_timeout + 15)

    def crashed(self, spider):
        self.errors += 1
        self.closed("CRASH")

    def closed(self, reason):
        if self.errors:
            self.log("%s error%s encountered during the crawl." %
                (self.errors, 's' if self.errors > 1 else ''), log.ERROR)
        if self.phantom:
            self.phantom.quit()
            if not self.errors:
                for f in ["phantomjs-cookie.txt", "phantomjs.log"]:
                    fi = "%s-%s" % (self.prefixfiles, f)
                    if os.path.exists(fi) and not self.errors:
                        os.remove(fi)

    def handle_response(self, response):
        lru = url_to_lru_clean(response.url)

        if self.phantom:
            self.phantom.get(response.url)

            # Collect whole DOM of the webpage including embedded iframes
            with open(os.path.join(PHANTOM["JS_PATH"], "get_iframes_content.js")) as js:
                get_bod_w_iframes = js.read()
            bod_w_iframes = self.phantom.execute_script(get_bod_w_iframes)
            response._set_body(bod_w_iframes.encode('utf-8'))

            # Try to scroll and unfold page
            self.log("Start PhantomJS scrolling and unfolding", log.INFO)
            with open(os.path.join(PHANTOM["JS_PATH"], "scrolldown_and_unfold.js")) as js:
                try:
                    signal.signal(signal.SIGALRM, timeout_alarm)
                    signal.alarm(self.ph_timeout + 30)
                    timedout = self.phantom.execute_async_script(
                        js.read(), self.ph_timeout,
                        self.ph_idle_timeout, self.ph_ajax_timeout)
                    signal.alarm(0)
                    if timedout:
                        raise SeleniumTimeout
                    self.log("Scrolling/Unfolding finished", log.INFO)
                except SeleniumTimeout:
                    self.log("Scrolling/Unfolding timed-out (%ss)" % self.ph_timeout, log.WARNING)
                    self.errors += 1
                except WebDriverException as e:
                    err = json.loads(e.msg)['errorMessage']
                    self.log("Scrolling/Unfolding crashed: %s" % err, log.ERROR)
                    self.errors += 1
                except Exception as e:
                    self.log("Scrolling/Unfolding crashed: %s %s" % (type(e), e), log.ERROR)
                    self.errors += 1
                    return self._make_raw_page(response, lru)
            bod_w_iframes = self.phantom.execute_script(get_bod_w_iframes)
            response._set_body(bod_w_iframes.encode('utf-8'))

        # Clean up pages with embedded base64 images that make Scrapy treat them as non-HTML responses
        if response.status == 200 and not isinstance(response, HtmlResponse):
            try:
                flags = response.flags
                if "partial" in flags:
                    flags.remove("partial")
                flags.append("cleaned")
                response = HtmlResponse(response.url, headers=response.headers, body=cleanupbase64images(response.body), flags=flags, request=response.request)
                self.log("WARNING: page with base64 embedded images was cleaned-up for links extraction")
            except:
                pass

        if 300 < response.status < 400 or isinstance(response, HtmlResponse):
            return self.parse_html(response, lru)
        else:
            return self._make_raw_page(response, lru)

    def handle_error(self, failure, response=None):
        if response:
            p = self._make_raw_page(response, failure.request.url)
            p['error'] = error_name(failure.value)
            return p
        elif not "://www" in failure.request.url:
            return self._request(failure.request.url.replace('://', '://www.'))
        self.log("ERROR : %s" % failure.getErrorMessage(), log.ERROR)
        self.errors += 1
        return

    def parse_html(self, response, lru):
        lrulinks = []
        # handle redirects
        realdepth = response.meta['depth']
        if 300 < response.status < 400:
            redir_url = response.headers['Location']
            if redir_url.startswith('/'):
                redir_url = "%s%s" % (lru_get_host_url(lru).strip('/'), redir_url)
            elif redir_url.startswith('./') or not redir_url.startswith('http'):
                redir_url = "%s%s" % (lru_get_path_url(lru).strip('/'), redir_url[1:])
            links = [{'url': redir_url}]
            response.meta['depth'] -= 1
        else:
            try:
                links = self.link_extractor.extract_links(response)
            except Exception as e:
                self.log("ERROR: links extractor crashed on %s: %s %s" % (response, type(e), e), log.ERROR)
                links = []
                self.errors += 1
        for link in links:
            try:
                url = link.url
            except AttributeError:
                url = link['url']
            try:
                lrulink = url_to_lru_clean(url)
            except ValueError as e:
                self.log("Error converting URL %s to LRU: %s" % (url, e), log.ERROR)
                continue
            lrulinks.append((url, lrulink))
            if self._should_follow(response.meta['depth'], lru, lrulink) and \
                    not url_has_any_extension(url, self.ignored_exts):
                yield self._request(url)
        response.meta['depth'] = realdepth
        yield self._make_html_page(response, lru, lrulinks)
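The spider delegates every fetch to a _request helper that is not part of this excerpt; Example 6 below includes a later revision of it, which looks roughly like the sketch here (the exact keyword arguments may differ in this version):

    def _request(self, url, **kw):
        # route every page through the same callback/errback pair
        kw['meta'] = {'handle_httpstatus_all': True}
        kw['callback'] = self.handle_response
        kw['errback'] = self.handle_error
        if self.phantom:
            kw['method'] = 'HEAD'  # PhantomJS re-fetches the body itself, so only a HEAD via Scrapy
        return Request(url, **kw)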
Example 4
class PagesCrawler(BaseSpider):

    name = 'pages'
    link_extractor = RegexpLinkExtractor(canonicalize=False,
                                         deny_extensions=[])
    ignored_exts = set(['.' + e for e in IGNORED_EXTENSIONS])

    def __init__(self, **kw):
        args = DEFAULT_INPUT.copy()
        args.update(kw)
        self.args = args
        self.start_urls = to_list(args['start_urls'])
        self.maxdepth = int(args['maxdepth'])
        self.follow_prefixes = to_list(args['follow_prefixes'])
        self.nofollow_prefixes = to_list(args['nofollow_prefixes'])
        self.discover_prefixes = [
            url_to_lru_clean(
                "http%s://%s" %
                (https, u.replace('http://', '').replace('https://', '')))
            for u in to_list(args['discover_prefixes']) for https in ['', 's']
        ]
        self.resolved_links = {}
        self.user_agent = args['user_agent']
        self.phantom = 'phantom' in args and args[
            'phantom'] and args['phantom'].lower() != "false"
        if self.phantom:
            self.ph_timeout = int(
                args.get('phantom_timeout', PHANTOM['TIMEOUT']))
            self.ph_idle_timeout = int(
                args.get('phantom_idle_timeout', PHANTOM['IDLE_TIMEOUT']))
            self.ph_ajax_timeout = int(
                args.get('phantom_ajax_timeout', PHANTOM['AJAX_TIMEOUT']))
        self.errors = 0
        dispatcher.connect(self.closed, spider_closed)
        dispatcher.connect(self.crashed, spider_error)

    def start_requests(self):
        self.log(
            "Starting crawl task - jobid: %s" % self.crawler.settings['JOBID'],
            log.INFO)
        self.log("ARGUMENTS : " + str(self.args), log.INFO)
        if self.phantom:
            self.init_phantom()
        for url in self.start_urls:
            yield self._request(url)

    def init_phantom(self):
        self.prefixfiles = os.path.join(scrapyd_config().get('logs_dir'),
                                        HYPHE_PROJECT, self.name,
                                        self.crawler.settings['JOBID'])
        self.log("Using path %s for PhantomJS crawl" % self.prefixfiles,
                 log.INFO)
        phantom_args = []
        if PROXY and not PROXY.startswith(':'):
            phantom_args.append('--proxy=%s' % PROXY)
        phantom_args.append('--cookies-file=%s-phantomjs-cookie.txt' %
                            self.prefixfiles)
        phantom_args.append('--ignore-ssl-errors=true')
        phantom_args.append('--load-images=false')
        self.capabilities = dict(DesiredCapabilities.PHANTOMJS)
        self.capabilities[
            'phantomjs.page.settings.userAgent'] = self.user_agent
        self.capabilities['takesScreenshot'] = False
        self.capabilities[
            'phantomjs.page.settings.javascriptCanCloseWindows'] = False
        self.capabilities[
            'phantomjs.page.settings.javascriptCanOpenWindows'] = False
        self.phantom = PhantomJS(executable_path=PHANTOM['PATH'],
                                 service_args=phantom_args,
                                 desired_capabilities=self.capabilities,
                                 service_log_path="%s-phantomjs.log" %
                                 self.prefixfiles)
        self.phantom.implicitly_wait(10)
        self.phantom.set_page_load_timeout(60)
        self.phantom.set_script_timeout(self.ph_timeout + 15)

    def crashed(self, spider):
        self.errors += 1
        self.closed("CRASH")

    def closed(self, reason):
        if self.errors:
            self.log(
                "%s error%s encountered during the crawl." %
                (self.errors, 's' if self.errors > 1 else ''), log.ERROR)
        if self.phantom:
            self.phantom.quit()
            if not self.errors:
                for f in ["phantomjs-cookie.txt", "phantomjs.log"]:
                    fi = "%s-%s" % (self.prefixfiles, f)
                    if os.path.exists(fi) and not self.errors:
                        os.remove(fi)

    def handle_response(self, response):
        lru = url_to_lru_clean(response.url)

        if self.phantom:
            self.phantom.get(response.url)

            # Collect whole DOM of the webpage including embedded iframes
            with open(
                    os.path.join(PHANTOM["JS_PATH"],
                                 "get_iframes_content.js")) as js:
                get_bod_w_iframes = js.read()
            bod_w_iframes = self.phantom.execute_script(get_bod_w_iframes)
            response._set_body(bod_w_iframes.encode('utf-8'))

            # Try to scroll and unfold page
            self.log("Start PhantomJS scrolling and unfolding", log.INFO)
            with open(
                    os.path.join(PHANTOM["JS_PATH"],
                                 "scrolldown_and_unfold.js")) as js:
                try:
                    signal.signal(signal.SIGALRM, timeout_alarm)
                    signal.alarm(self.ph_timeout + 30)
                    timedout = self.phantom.execute_async_script(
                        js.read(), self.ph_timeout, self.ph_idle_timeout,
                        self.ph_ajax_timeout)
                    signal.alarm(0)
                    if timedout:
                        raise SeleniumTimeout
                    self.log("Scrolling/Unfolding finished", log.INFO)
                except SeleniumTimeout:
                    self.log(
                        "Scrolling/Unfolding timed-out (%ss)" %
                        self.ph_timeout, log.WARNING)
                    self.errors += 1
                except WebDriverException as e:
                    err = json.loads(e.msg)['errorMessage']
                    self.log("Scrolling/Unfolding crashed: %s" % err,
                             log.ERROR)
                    self.errors += 1
                except Exception as e:
                    self.log(
                        "Scrolling/Unfolding crashed: %s %s" % (type(e), e),
                        log.ERROR)
                    self.errors += 1
                    return self._make_raw_page(response, lru)
            bod_w_iframes = self.phantom.execute_script(get_bod_w_iframes)
            response._set_body(bod_w_iframes.encode('utf-8'))

        # Clean up pages with embedded base64 images that make Scrapy treat them as non-HTML responses
        if response.status == 200 and not isinstance(response, HtmlResponse):
            try:
                flags = response.flags
                if "partial" in flags:
                    flags.remove("partial")
                flags.append("cleaned")
                response = HtmlResponse(response.url,
                                        headers=response.headers,
                                        body=cleanupbase64images(
                                            response.body),
                                        flags=flags,
                                        request=response.request)
                self.log(
                    "WARNING: page with base64 embedded images was cleaned-up for links extraction"
                )
            except:
                pass

        if 300 < response.status < 400 or isinstance(response, HtmlResponse):
            return self.parse_html(response, lru)
        else:
            return self._make_raw_page(response, lru)

    def handle_error(self, failure, response=None):
        if response:
            p = self._make_raw_page(response, failure.request.url)
            p['error'] = error_name(failure.value)
            return p
        elif not "://www" in failure.request.url:
            return self._request(failure.request.url.replace('://', '://www.'))
        error = failure.getErrorMessage()
        self.log("ERROR : %s" % error, log.ERROR)
        if PROXY and not PROXY.startswith(
                ':') and "OpenSSL.SSL.Error" in error:
            return self._request(failure.request.url, noproxy=True)
        self.errors += 1
        return

    def parse_html(self, response, lru):
        lrulinks = []
        # handle redirects
        realdepth = response.meta['depth']
        if 300 < response.status < 400:
            redir_url = response.headers['Location']
            if redir_url.startswith('/'):
                redir_url = "%s%s" % (lru_get_host_url(lru).strip('/'),
                                      redir_url)
            elif redir_url.startswith(
                    './') or not redir_url.startswith('http'):
                redir_url = "%s%s" % (lru_get_path_url(lru).strip('/'),
                                      redir_url[1:])
            links = [{'url': redir_url}]
            response.meta['depth'] -= 1
        else:
            try:
                links = self.link_extractor.extract_links(response)
            except Exception as e:
                self.log(
                    "ERROR: links extractor crashed on %s: %s %s" %
                    (response, type(e), e), log.ERROR)
                links = []
                self.errors += 1
        for link in links:
            try:
                url = link.url
            except AttributeError:
                url = link['url']
            try:
                lrulink = url_to_lru_clean(url)
            except ValueError as e:
                self.log("Error converting URL %s to LRU: %s" % (url, e),
                         log.ERROR)
                continue
            lrulinks.append((url, lrulink))
            if self._should_follow(response.meta['depth'], lru, lrulink) and \
                    not url_has_any_extension(url, self.ignored_exts):
                yield self._request(url)
        response.meta['depth'] = realdepth
        yield self._make_html_page(response, lru, lrulinks)
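timeout_alarm, registered with signal.SIGALRM in handle_response, is defined elsewhere in the project. A plausible minimal handler, assuming it simply raises the SeleniumTimeout that the surrounding try/except already catches:

def timeout_alarm(signum, frame):
    # fired when signal.alarm() expires, i.e. the PhantomJS script has been stuck too long
    raise SeleniumTimeout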
Example 5
class plugin:
    def __init__(self):
        APP_ROOT = os.path.dirname(os.path.abspath(__file__))
        print(APP_ROOT)
        self.req = 0
        self.driver = PhantomJS(APP_ROOT + "/phantomjs",
                                service_log_path=os.path.devnull)
        self.driver.implicitly_wait(3)

    def restart(self):
        self.__init__()

    def frame_search(self, path):
        framedict = {}
        for child_frame in self.driver.find_elements_by_tag_name('frame'):
            child_frame_name = child_frame.get_attribute('name')
            framedict[child_frame_name] = {'framepath': path, 'children': {}}
            xpath = '//frame[@name="{}"]'.format(child_frame_name)
            self.driver.switch_to.frame(
                self.driver.find_element_by_xpath(xpath))
            framedict[child_frame_name]['children'] = self.frame_search(
                framedict[child_frame_name]['framepath'] + [child_frame_name])

            self.driver.switch_to.default_content()
            if len(framedict[child_frame_name]['framepath']) > 0:
                for parent in framedict[child_frame_name]['framepath']:
                    parent_xpath = '//frame[@name="{}"]'.format(parent)
                    self.driver.switch_to.frame(
                        self.driver.find_element_by_xpath(parent_xpath))
        return framedict

    def tmon(self):
        self.driver.get(
            "https://login.ticketmonster.co.kr/user/loginform?return_url=")
        self.driver.find_element_by_name('userid').send_keys(
            config['ACCOUNT']['tmon_id'])
        self.driver.find_element_by_name('password').send_keys(
            config['ACCOUNT']['tmon_pw'])
        self.driver.find_element_by_xpath('//*[@id="loginFrm"]/a[2]').click()
        self.driver.get(
            'http://m.benefit.ticketmonster.co.kr/promotions/page/attendance?view_mode=app'
        )
        self.driver.find_element_by_xpath(
            '//*[@id="attn_wrap"]/div/div/div[3]/div[2]/div[1]/button').click(
            )

        self.tmon_ret = self.driver.find_element_by_class_name('content').text
        print(self.tmon_ret)

    def ondisk(self):
        try:
            self.driver.get("http://ondisk.co.kr/index.php")
            self.driver.implicitly_wait(3)
            self.driver.find_element_by_xpath('//*[@id="mb_id"]').send_keys(
                config['ACCOUNT']['ondisk_id'])
            self.driver.find_element_by_xpath(
                '//*[@id="page-login"]/form/div[2]/p[2]/input').send_keys(
                    config['ACCOUNT']['ondisk_pw'])
            self.driver.find_element_by_xpath(
                '//*[@id="page-login"]/form/div[2]/p[3]/input').click()
            self.driver.get(
                "http://ondisk.co.kr/index.php?mode=eventMarge&sm=event&action=view&idx=746&event_page=1"
            )
            self.driver.switch_to_frame(1)
            self.driver.execute_script(
                "window.alert = function(msg){ window.msg = msg; };")
            self.driver.find_element_by_class_name('button').click()

            alert_text = self.driver.execute_script("return window.msg;")
            print(alert_text)
        except Exception:
            alert_text = None  # avoid a NameError in the assignment below when login fails
            print("ERR")
            print(self.driver.page_source)
        self.ondisk_ret = alert_text

    def ok_cash_bag(self):
        today = datetime.datetime.now().strftime("%Y%m%d")
        sess = requests.session()
        getdata = sess.get(
            "https://www.facebook.com/login.php?login_attempt=1&next=https%3A%2F%2Fwww.facebook.com%2Fv2.6%2Fdialog%2Foauth%3Fredirect_uri%3Dhttps%253A%252F%252Fmember.okcashbag.com%252Focb%252FsocialId%252FfacebookProcessor%26scope%3Dpublic_profile%252Cuser_birthday%252Cemail%26client_id%3D645711852239977%26ret%3Dlogin%26logger_id%3D91698e1d-fe1e-b325-4c13-b62636843a9e&lwv=101"
        )
        param = {
            "lsd": "AVpmy4vJ",
            "api_key": "645711852239977",
            "cancel_url":
            "https://member.okcashbag.com/ocb/socialId/facebookProcessor?error=access_denied&error_code=200&error_description=Permissions+error&error_reason=user_denied#_=_",
            "display": "page",
            "enable_profile_selector": "",
            "isprivate": "",
            "legacy_return": "0",
            "profile_selector_ids": "",
            "return_session": "",
            "skip_api_login": "******",
            "signed_next": "1",
            "trynum": "1",
            "timezone": "-540",
            "lgndim":
            "eyJ3IjoxOTIwLCJoIjoxMDgwLCJhdyI6MTkyMCwiYWgiOjEwNDAsImMiOjI0fQ==",
            "lgnrnd": "173648_UqkK",
            "lgnjs": "1528418208",
            "email": config['ACCOUNT']['fb_id'],
            "pass": config['ACCOUNT']['fb_pw'],
            "prefill_contact_point": config['ACCOUNT']['fb_id'],
            "prefill_source": "last_login",
            "prefill_type": "contact_point",
            "first_prefill_source": "last_login",
            "first_prefill_type": "contact_point",
            "had_cp_prefilled": "true",
            "had_password_prefilled": "false"
        }
        postdata = sess.post(
            "https://www.facebook.com/login.php?login_attempt=1&next=https%3A%2F%2Fwww.facebook.com%2Fv2.6%2Fdialog%2Foauth%3Fredirect_uri%3Dhttps%253A%252F%252Fmember.okcashbag.com%252Focb%252FsocialId%252FfacebookProcessor%26scope%3Dpublic_profile%252Cuser_birthday%252Cemail%26client_id%3D645711852239977%26ret%3Dlogin%26logger_id%3D91698e1d-fe1e-b325-4c13-b62636843a9e&lwv=101",
            data=param)

        # print(postdata.text)
        postdata = sess.post(
            "https://member.okcashbag.com//ocb/socialId/socialIdLoginProcess/42100/687474703A2F2F7777772e6f6b636173686261672e636f6d2F696e6465782e646f3F6c6f67696e3D59"
        )
        samlResponse = postdata.text.split("samlResponse.value = \"")[1].split(
            "\"")[0]
        # print(samlResponse)
        param = {"samlResponse": samlResponse, "sst_cd": "", "return_url": ""}
        postdata = sess.post("http://www.okcashbag.com/index.do?login=Y",
                             data=param)
        print(
            postdata.text.split('<span id="profileNickname" class="name">')
            [1].split("</span>")[0] + "님 로그인")  # "<nickname> logged in"
        print(
            postdata.text.split('<span id="spanUsablePoint">')[1].split(
                '</span>')[0] + "포인트")  # "<n> points"
        getdata = sess.get(
            "http://www.okcashbag.com/life/event/attend/attendMain.do")
        param = {"method": "", "myUrl": "", "recommUser": "", "today": today}
        postdata = sess.post(
            "http://www.okcashbag.com/life/event/attend/attend.do", data=param)
        print(postdata.text)
        if len(postdata.text.split('<i class="win-point">')) > 1:
            print(postdata.text.split('<i class="win-point">')[1] + "포인트 적립")  # "points credited"
        elif len(postdata.text.split("success")) > 1:
            print("출석체크 완료 ")  # attendance check completed
            self.ok_ret = "출석체크 완료"
        else:
            print('이미 출석체크 완료')  # attendance already checked today
            self.ok_ret = "이미 출석체크 완료"
Example 6
class PagesCrawler(Spider):

    name = 'pages'
    link_extractor = RegexpLinkExtractor(canonicalize=False,
                                         deny_extensions=[])
    ignored_exts = set(['.' + e for e in IGNORED_EXTENSIONS])

    def __init__(self, **kwargs):
        mongo = MongoClient(MONGO_HOST, MONGO_PORT)[MONGO_DB][MONGO_JOBS_COL]
        job = mongo.find_one({"_id": kwargs["job_id"]})
        args = job["crawl_arguments"]
        self.args = args
        self.start_urls = to_list(args['start_urls'])
        self.maxdepth = int(args['max_depth'])
        self.follow_prefixes = to_list(args['follow_prefixes'])
        self.nofollow_prefixes = to_list(args['nofollow_prefixes'])
        self.prefixes_trie = LRUTrie()
        for p in self.follow_prefixes:
            self.prefixes_trie.set_lru(p, True)
        for p in self.nofollow_prefixes:
            self.prefixes_trie.set_lru(p, False)
        self.discover_prefixes = [
            url_to_lru_clean(
                "http%s://%s" %
                (https, u.replace('http://', '').replace('https://', '')),
                TLDS_TREE) for u in to_list(args['discover_prefixes'])
            for https in ['', 's']
        ]
        self.resolved_links = {}
        self.user_agent = args['user_agent']
        self.phantom = 'phantom' in args and args[
            'phantom'] and args['phantom'].lower() != "false"
        self.cookies = None
        if 'cookies' in args and args["cookies"]:
            self.cookies = dict(
                cookie.split('=', 1)
                for cookie in re.split(r'\s*;\s*', args['cookies'])
                if '=' in cookie)
        if self.phantom:
            self.ph_timeout = int(
                args.get('phantom_timeout', PHANTOM['TIMEOUT']))
            self.ph_idle_timeout = int(
                args.get('phantom_idle_timeout', PHANTOM['IDLE_TIMEOUT']))
            self.ph_ajax_timeout = int(
                args.get('phantom_ajax_timeout', PHANTOM['AJAX_TIMEOUT']))
        self.errors = 0

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(PagesCrawler, cls).from_crawler(crawler, *args,
                                                       **kwargs)
        crawler.signals.connect(spider.spider_closed, signal=spider_closed)
        crawler.signals.connect(spider.spider_crashed, signal=spider_error)
        return spider

    def start_requests(self):
        self.log(
            "Starting crawl task - jobid: %s" % self.crawler.settings['JOBID'],
            logging.INFO)
        self.log("ARGUMENTS : " + str(self.args), logging.INFO)
        if self.phantom:
            self.init_phantom()
        for url in self.start_urls:
            yield self._request(url)

    def init_phantom(self):
        self.prefixfiles = os.path.join(scrapyd_config().get('logs_dir'),
                                        HYPHE_PROJECT, self.name,
                                        self.crawler.settings['JOBID'])
        self.log("Using path %s for PhantomJS crawl" % self.prefixfiles,
                 logging.INFO)
        phantom_args = []
        if PROXY and not PROXY.startswith(':'):
            phantom_args.append('--proxy=%s' % PROXY)
        phantom_args.append('--cookies-file=%s-phantomjs-cookie.txt' %
                            self.prefixfiles)
        phantom_args.append('--ignore-ssl-errors=true')
        phantom_args.append('--load-images=false')
        self.capabilities = dict(DesiredCapabilities.PHANTOMJS)
        self.capabilities[
            'phantomjs.page.settings.userAgent'] = self.user_agent
        self.capabilities['takesScreenshot'] = False
        self.capabilities[
            'phantomjs.page.settings.javascriptCanCloseWindows'] = False
        self.capabilities[
            'phantomjs.page.settings.javascriptCanOpenWindows'] = False
        self.phantom = PhantomJS(executable_path=PHANTOM['PATH'],
                                 service_args=phantom_args,
                                 desired_capabilities=self.capabilities,
                                 service_log_path="%s-phantomjs.log" %
                                 self.prefixfiles)
        self.phantom.implicitly_wait(10)
        self.phantom.set_page_load_timeout(60)
        self.phantom.set_script_timeout(self.ph_timeout + 15)

    def spider_crashed(self, spider):
        self.errors += 1
        self.spider_closed(spider, reason="CRASH")

    def spider_closed(self, spider, reason=""):
        if self.errors:
            self.log(
                "%s error%s encountered during the crawl (%s)." %
                (self.errors, 's' if self.errors > 1 else '', reason),
                logging.ERROR)
        if self.phantom:
            self.phantom.quit()
            if not self.errors:
                for f in ["phantomjs-cookie.txt", "phantomjs.log"]:
                    fi = "%s-%s" % (self.prefixfiles, f)
                    if os.path.exists(fi) and not self.errors:
                        os.remove(fi)

    def handle_response(self, response):
        lru = url_to_lru_clean(response.url, TLDS_TREE)

        if self.phantom:
            self.phantom.get(response.url)

            # Collect whole DOM of the webpage including embedded iframes
            with open(
                    os.path.join(PHANTOM["JS_PATH"],
                                 "get_iframes_content.js")) as js:
                get_bod_w_iframes = js.read()
            bod_w_iframes = self.phantom.execute_script(get_bod_w_iframes)
            response._set_body(bod_w_iframes.encode('utf-8'))

            # Try to scroll and unfold page
            self.log("Start PhantomJS scrolling and unfolding", logging.INFO)
            with open(
                    os.path.join(PHANTOM["JS_PATH"],
                                 "scrolldown_and_unfold.js")) as js:
                try:
                    signal.signal(signal.SIGALRM, timeout_alarm)
                    signal.alarm(self.ph_timeout + 30)
                    timedout = self.phantom.execute_async_script(
                        js.read(), self.ph_timeout, self.ph_idle_timeout,
                        self.ph_ajax_timeout)
                    signal.alarm(0)
                    if timedout:
                        raise SeleniumTimeout
                    self.log("Scrolling/Unfolding finished", logging.INFO)
                except SeleniumTimeout:
                    self.log(
                        "Scrolling/Unfolding timed-out (%ss)" %
                        self.ph_timeout, logging.WARNING)
                    self.errors += 1
                except WebDriverException as e:
                    err = json.loads(e.msg)['errorMessage']
                    self.log("Scrolling/Unfolding crashed: %s" % err,
                             logging.ERROR)
                    self.errors += 1
                except Exception as e:
                    self.log(
                        "Scrolling/Unfolding crashed: %s %s" % (type(e), e),
                        logging.ERROR)
                    self.errors += 1
                    return self._make_raw_page(response, lru)
            bod_w_iframes = self.phantom.execute_script(get_bod_w_iframes)
            response._set_body(bod_w_iframes.encode('utf-8'))

        # Clean up pages with embedded base64 images that make Scrapy treat them as non-HTML responses
        if response.status == 200 and not isinstance(response, HtmlResponse):
            try:
                flags = response.flags
                if "partial" in flags:
                    flags.remove("partial")
                flags.append("cleaned")
                response = HtmlResponse(response.url,
                                        headers=response.headers,
                                        body=cleanupbase64images(
                                            response.body),
                                        flags=flags,
                                        request=response.request)
                self.log(
                    "WARNING: page with base64 embedded images was cleaned-up for links extraction"
                )
            except:
                pass

        if 300 < response.status < 400 or isinstance(response, HtmlResponse):
            return self.parse_html(response, lru)
        else:
            return self._make_raw_page(response, lru)

    def handle_error(self, failure, response=None):
        if response:
            p = self._make_raw_page(response, failure.request.url)
            p['error'] = error_name(failure.value)
            return p
        elif not "://www" in failure.request.url:
            return self._request(failure.request.url.replace('://', '://www.'))
        error = failure.getErrorMessage()
        self.log("ERROR : %s" % error, logging.ERROR)
        if PROXY and not PROXY.startswith(
                ':') and "OpenSSL.SSL.Error" in error:
            return self._request(failure.request.url, noproxy=True)
        self.errors += 1
        return

    def parse_html(self, response, lru):
        lrulinks = []
        # handle redirects
        realdepth = response.meta['depth']
        if 300 < response.status < 400:
            redir_url = response.headers['Location']
            if redir_url.startswith('/'):
                redir_url = "%s%s" % (lru_get_host_url(lru).strip('/'),
                                      redir_url)
            elif redir_url.startswith('../'):
                lrustart = lru[:lru.rfind('|p:')]
                while redir_url.startswith('../'):
                    lrustart = lrustart[:lrustart.rfind('|p:')]
                    redir_url = redir_url[3:]
                redir_url = "%s/%s" % (lru_to_url(lrustart + '|'), redir_url)
            elif redir_url.startswith(
                    './') or not redir_url.startswith('http'):
                redir_url = "%s%s" % (lru_get_path_url(lru).strip('/'),
                                      redir_url[1:])
            links = [{'url': redir_url}]
            response.meta['depth'] -= 1
        else:
            try:
                links = self.link_extractor.extract_links(response)
            except Exception as e:
                self.log(
                    "ERROR: links extractor crashed on %s: %s %s" %
                    (response, type(e), e), logging.ERROR)
                links = []
                self.errors += 1
        for link in links:
            try:
                url = link.url
            except AttributeError:
                url = link['url']
            try:
                lrulink = url_to_lru_clean(url, TLDS_TREE)
            except (ValueError, IndexError) as e:
                self.log("Error converting URL %s to LRU: %s" % (url, e),
                         logging.ERROR)
                continue
            lrulinks.append((url, lrulink))
            if self._should_follow(response.meta['depth'], lrulink) and \
                    not url_has_any_extension(url, self.ignored_exts):
                yield self._request(url)
        response.meta['depth'] = realdepth
        yield self._make_html_page(response, lru, lrulinks)

    def _make_html_page(self, response, lru, lrulinks):
        p = self._make_raw_page(response, lru)
        if STORE_HTML:
            p['body'] = Binary(response.body.encode('zip'))
        p['lrulinks'] = lrulinks
        return p

    def _make_raw_page(self, response, lru):
        p = self._new_page(response.url, lru)
        p['status'] = response.status
        p['size'] = len(response.body)
        if isinstance(response, HtmlResponse):
            p['encoding'] = response.encoding
        if response.meta.get('depth'):
            p['depth'] = response.meta['depth']
        if response.headers.get('content-type'):
            p['content_type'] = response.headers.get('content-type').partition(
                ';')[0]
        p['error'] = None
        return p

    def _new_page(self, url, lru=None):
        if lru is None:
            lru = url_to_lru_clean(url, TLDS_TREE)
        p = Page()
        p['url'] = url
        p['lru'] = lru
        p['depth'] = 0
        p['timestamp'] = int(time.time() * 1000)
        return p

    def _should_follow(self, depth, tolru):
        c1 = depth < self.maxdepth
        c2 = self.prefixes_trie.match_lru(tolru)
        return c1 and c2

    def _request(self, url, noproxy=False, **kw):
        kw['meta'] = {'handle_httpstatus_all': True, 'noproxy': noproxy}
        kw['callback'] = self.handle_response
        kw['errback'] = self.handle_error
        if self.cookies:
            kw['cookies'] = self.cookies
        if self.phantom:
            kw['method'] = 'HEAD'
        return Request(url, **kw)
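cleanupbase64images is a project helper that none of these examples define; judging from the comment at its call site, a hypothetical minimal version would blank out inline base64 data URIs so Scrapy no longer misclassifies the body:

import re

def cleanupbase64images(body):
    # hypothetical sketch: strip base64-encoded inline images from src attributes
    return re.sub(r'src="data:image/[^;]+;base64,[^"]*"', 'src=""', body)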