Example #1
class Parser(object):

    def __init__(self):
        self.browser = PhantomJS()

    def cleanup(self):
        self.browser.quit()
Example #2
    def selenium(self, webdriverOption=0):
        """
        # 调用浏览器下载,适用于任何情形
        :return:
        """
        if not self.url[:4] == "http":
            return None

        driver = None
        if webdriverOption == 0:
            from selenium.webdriver import PhantomJS

            driver = PhantomJS()
        elif webdriverOption == 1:
            from selenium.webdriver import Chrome

            driver = Chrome()
        elif webdriverOption == 2:
            from selenium.webdriver import Firefox

            driver = Firefox()

        if not driver:
            print(u"-->DownLoader->Selenium driver initialization failed; "
                  u"check the runtime environment or the webdriverOption argument")
            return None

        driver.get(self.url)
        src = driver.page_source
        driver.quit()
        self.pageSource = src
        return src
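
A hedged usage sketch (not from the original source), assuming the method above belongs to a downloader class (called DownLoader in its error message) that carries url and pageSource attributes:

downloader = DownLoader()                      # hypothetical enclosing class
downloader.url = 'http://example.com'
html = downloader.selenium(webdriverOption=1)  # 0 = PhantomJS, 1 = Chrome, 2 = Firefox
if html:
    print(len(html))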
Example #3
def main():
    steam_id, api, return_amount, user_categories = read_config_values()
    print("SteamID:", steam_id)
    print("API key:", api)
    print("Return amount:", return_amount)
    if len(user_categories):  # > 0
        check_user_categories_validity(user_categories)
        print("Categories:", "; ".join(user_categories))
    print()

    print("Fetching your Steam library..")
    user_library = fetch_user_library(api, steam_id)
    print("Found {} in your library.".format(len(user_library)))

    print("Opening PhantomJS..")
    driver = PhantomJS(cwd + r"\dependencies\phantomJS\phantomjs.exe",
                       service_log_path=cwd +
                       r"\dependencies\phantomJS\ghostdriver.log")

    print("Opening SteamDB..")
    output = fetch_sales(driver, user_library, return_amount, user_categories)

    driver.quit()
    with open("games.txt", 'w', encoding='utf-8') as file:
        file.write(output)
    input("\nDone. I also wrote the games to a text file.")
Example #4
class SeleniumTestCase(LiveServerTestCase):
    def _pre_setup(self):
        super(SeleniumTestCase, self)._pre_setup()
        self.driver = PhantomJS()

    def _post_teardown(self):
        self.driver.quit()
        super(SeleniumTestCase, self)._post_teardown()

    def login(self, username='******', password='******', url='login'):
        """
        Login to the server and be authenticated
        """
        self.open(reverse(url))
        self.driver.find_element_by_id("id_username").clear()
        self.driver.find_element_by_id("id_username").send_keys(username)
        self.driver.find_element_by_id("id_password").clear()
        self.driver.find_element_by_id("id_password").send_keys(password)
        self.driver.find_element_by_id("submit-id-login").click()

    def open(self, url):
        self.driver.get("%s%s" %(self.live_server_url, url))

    def is_element_present(self, how, what):
        try:
            self.driver.find_element(by=how, value=what)
        except NoSuchElementException:
            return False
        return True
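
A hedged sketch (not from the original source) of a concrete test built on this base class; the test name is illustrative, and the 'login' URL name mirrors the default used in login() above:

class ExampleLoginTest(SeleniumTestCase):

    def test_login_form_is_present(self):
        # open the view named 'login' and check the username field exists
        # ("id" is the same locator string as selenium's By.ID)
        self.open(reverse('login'))
        self.assertTrue(self.is_element_present("id", "id_username"))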
Example #5
class Translator(threading.Thread):
    def __init__(self, queue, executable_path=None,
                              desired_capabilities=None,
                              service_args=None,
                              google_translate_url=config['google_translate_url'],
                              window_size=config['window_size']):

        super(self.__class__, self).__init__()
        self._queue = queue

        kwargs = {}

        if executable_path is not None:
            kwargs['executable_path'] = executable_path
        if desired_capabilities is not None:
            kwargs['desired_capabilities'] = desired_capabilities
        if service_args is not None:
            kwargs['service_args'] = service_args

        self._driver = PhantomJS(**kwargs)
        self._driver.set_window_size(*window_size)
        self._driver.get(google_translate_url)

    def run(self):
        while True:
            task = self._queue.get()

            if task is None:
                self._queue.task_done()
                self._driver.quit()
                break

            task.do(self._driver)
            self._queue.task_done()
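
A hedged usage sketch for the worker above: tasks are any objects exposing a do(driver) method (as the run() loop shows), and a None sentinel shuts the thread down.

import queue

q = queue.Queue()
translator = Translator(q)   # starts PhantomJS and opens Google Translate
translator.start()
# q.put(some_task)           # any object exposing do(driver)
q.put(None)                  # sentinel: quits the driver and ends the thread
q.join()
translator.join()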
Example #6
def main():
    global HEAD
    if len(sys.argv) > 1:
        try:
            HEAD = int(sys.argv[1])
        except ValueError:
            HEAD = 10
    # test mirror list
    mirror_list = read_mirrors()
    for i in mirror_list:
        try:
            cururl = i
            print("Testing:",i)
            res = request.urlopen(i)
        except Exception:
            print("Testing on", i, "failed")
            continue
        try:
            update_mirrors(cururl)
            break
        except Exception:
            continue

    try:
        res
    except NameError:
        raise Warning('All mirrors unavailable!')
    print('Available mirror:',cururl)

    # get vpn table
    countries = dict()
    dr = PhantomJS()
    dr.get(cururl)
    page = Selector(text=dr.page_source)\
            .xpath('.//td[@id="vpngate_inner_contents_td"]/'
                    'table[@id="vg_hosts_table_id"]//tr')

    if HEAD < len(page): page = page[:HEAD]

    print('Pagelen:',len(page))

    for vpn in page:
        if len(vpn.xpath('./td[@class="vg_table_header"]')) > 0:
            continue

        row = vpn.xpath('./td')
        country = row[0].xpath('./text()').extract_first()
        country = '_'.join(country.split(' '))
        ovpn = row[6].xpath('./a/@href').extract_first()

        if ovpn:
            if country in countries:
                countries[country] += 1
                get_ovpn(url=cururl+ovpn, save_to=country+'/'+str(countries[country]))
            else:
                countries[country] = 0
                if not os.path.exists(country):
                    os.mkdir(country)
                get_ovpn(url=cururl+ovpn, save_to=country+'/'+str(countries[country]))

    dr.quit()
Example #7
class Crawler:
    def __init__(self, timeout=20, phantomjs_cfg_file='python-utils/config/phantomjs_cfg.json', use_cfg_file=False, proxy_pool_server='http://127.0.0.1:15110'):
        self.timeout = timeout
        if use_cfg_file:
            phantomjs_service_args = ['--config={}'.format(phantomjs_cfg_file)]
        else:
            _, proxy_type, proxy, proxy_auth = get_proxy(proxy_pool_server)
            phantomjs_service_args = [
                '--proxy-type={}'.format(proxy_type),
                '--proxy={}'.format(proxy),
                '--proxy-auth={}'.format(proxy_auth),
            ]

        self.driver = PhantomJS(
            desired_capabilities=self.new_desired_capabilities(),
            service_args=phantomjs_service_args)
        self.check_client_info()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        self.driver.quit()

    @contextmanager
    def wait_for_page_load(self, old_element):
        yield
        WebDriverWait(self.driver, self.timeout).until(EC.staleness_of(old_element))

    def new_desired_capabilities(self, user_agent=default_ua):
        desired_capabilities = DesiredCapabilities.PHANTOMJS.copy()
        if not user_agent:
            user_agent = ua.random
        desired_capabilities["phantomjs.page.settings.userAgent"] = user_agent
        return desired_capabilities

    def check_client_info(self):
        url='http://www.whoishostingthis.com/tools/user-agent/'
        self.driver.get(url)
        ip_addr = get_xpath_element(self.driver, '//*[@id="user-agent"]/div[2]/span').text.strip()
        user_agent = get_xpath_element(self.driver, '//*[@id="user-agent"]/div[1]').text.strip()
        logger.info('IP: {}, User-Agent: {}'.format(ip_addr, user_agent))
        if self.wrong_ip(ip_addr):
            logger.error('Proxy not set correctly!')
            sys.exit(-1)

    def wrong_ip(self, ip_addr):
        if ip_addr.startswith('166.111.') or ip_addr.startswith('59.66.') or ip_addr.startswith('101.5.') or ip_addr.startswith('101.6.'):
            return True
        else:
            return False
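
A hedged sketch using the context-manager protocol defined above; the URL is the same one check_client_info already visits, and use_cfg_file=True avoids needing the proxy pool:

with Crawler(use_cfg_file=True) as crawler:
    crawler.driver.get('http://www.whoishostingthis.com/tools/user-agent/')
    print(crawler.driver.title)
# __exit__ calls close(), which quits the PhantomJS driver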
Example #8
def main():
    driver = PhantomJS()
    scraper = NLScraper(driver, year=2014)
    print(sys.argv[1])
    writer = unicodecsv.DictWriter(open(sys.argv[1], 'w'), ('amount', 'scheme', 'year',
        'country', 'currency', 'recipient_name', 'recipient_postcode',
        'recipient_id', 'recipient_location'))
    writer.writeheader()
    try:
        scraper.start(writer)
    finally:
        driver.quit()
Example #9
 def onegoogolePR(self, url):
     '''Return the Google PageRank for a single URL'''
     prUrl = 'http://pr.chinaz.com'  # Google PR lookup service
     driver = PhantomJS()
     driver.get(prUrl)
     driver.find_element_by_id('PRAddress').send_keys(url)
     driver.find_element_by_class_name('search-write-btn').click()
     try:
         imgsrc = driver.find_element_by_css_selector('span#pr>img').get_attribute('src')
         pr = search(r'\d', imgsrc).group()
     except Exception:
         pr = 'no data'
     driver.quit()
     return pr
Example #10
def catalog_url(url='http://www.meitun.com/'):
    # the catalog links are loaded via AJAX, so use PhantomJS
    driver = PhantomJS()
    driver.get(url)
    driver.maximize_window()
    mov_ele = driver.find_element_by_css_selector('.nav>ul>li:nth-child(1)')
    # move the mouse to the lazily-loaded menu element and perform the hover
    ActionChains(driver).move_to_element(mov_ele).perform()
    time.sleep(3)
    response = driver.page_source
    driver.quit()
    # use pyquery to parse the page source; it is faster
    d = pq(response)
    return map(lambda x: 'http:' + pq(x).attr('href'), d.find('.cg-pdts a'))
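
A hedged usage sketch: catalog_url() returns a lazy map of absolute category URLs, printed here one per line.

for link in catalog_url():
    print(link)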
Example #11
class AdvertisementAdvancedViewTests(LiveServerTestCase):
    def setUp(self):
        self.driver = PhantomJS()

        self.user = User.objects.create_user('admin', '*****@*****.**', 'pass')
        self.user.save()

        self.provider = Provider(
            name='provider',
            user=self.user,
        )
        self.provider.save()

        self.provider_adverts = mommy.make(Advertisement, _quantity=20, provider=self.provider)

    def tearDown(self):
        self.driver.quit()

    def open(self, url):
        self.driver.get("%s%s" % (self.live_server_url, url))

    def test_side_ad_display(self):
        """
        Test that the side ads display properly
        """
        self.open(reverse('advertisements.views.side_ads'))

        self.assertEqual(len(self.driver.find_elements_by_xpath("//a")), 4)

        self.driver.find_element_by_xpath("//a[1]/img")
        self.driver.find_element_by_xpath("//a[2]/img")
        self.driver.find_element_by_xpath("//a[3]/img")
        self.driver.find_element_by_xpath("//a[4]/img")

        self.assertNotEqual(self.driver.find_element_by_xpath("//a[1]").get_attribute("href"), '')
        self.assertNotEqual(self.driver.find_element_by_xpath("//a[2]").get_attribute("href"), '')
        self.assertNotEqual(self.driver.find_element_by_xpath("//a[3]").get_attribute("href"), '')
        self.assertNotEqual(self.driver.find_element_by_xpath("//a[4]").get_attribute("href"), '')

    def test_top_ad_display(self):
        """
        Test that the top ad displays properly
        """
        self.open(reverse('advertisements.views.top_ad'))

        self.assertEqual(len(self.driver.find_elements_by_xpath("//a")), 1)
        self.driver.find_element_by_xpath("//a/img")
        self.assertNotEqual(self.driver.find_element_by_xpath("//a").get_attribute("href"), '')
Example #12
 def on_start_again(self, url):
     driver = PhantomJS()
     driver.get(url)
     time.sleep(2)
     driver.maximize_window()
     t = driver.find_element_by_css_selector('.page-txt').text
     res_t = []
     if t:
         t = int(t.split('/')[1][:-1]) - 1  # total page count
         # the number of "next" clicks needed is page count - 1
         while t:
             t -= 1
             move_ele = driver.find_element_by_css_selector('#next')
             ActionChains(driver).move_to_element(move_ele).click().perform()
             time.sleep(1)
             res_t.append(driver.page_source)
     driver.quit()
     for item in res_t:
         self.step_first(item)
Example #13
class Premiumgeneratorlink(object):
	def __init__(self, url):
		self.url = url
		self.browser = PhantomJS()

	def get_link(self):
		try:
			self.browser.get('http://premiumgeneratorlink.com/')
			self.browser.find_element_by_name('link').send_keys(self.url)
			self.browser.find_element_by_xpath('//a[@class="input"]').click()
			wdw = WebDriverWait(self.browser, 10)
			wdw.until(EC.element_to_be_clickable((By.ID, 'check'))).click()
			wdw.until(EC.element_to_be_clickable((By.ID, 'generate'))).click()
			link = wdw.until(EC.visibility_of_element_located((By.XPATH, '//form[@class="center"]'))).get_attribute('action')
		except (WebDriverException, NoSuchElementException, TimeoutException):
			return False
		finally:
			self.browser.quit()
		return link
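
A hedged usage sketch for the class above; the input URL is only a placeholder.

pgl = Premiumgeneratorlink('http://example.com/some-file')  # placeholder URL
link = pgl.get_link()
print(link if link else 'generation failed')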
Example #14
def check_agree(link, soup):
    # Agree if asked to (click on accept)

    if soup.find('input',
                 {'id': 'ctl00_mainContentArea_disclaimerContent_yesButton'}):
        print("Agreeing the terms of use - please wait...")
        driver = PhantomJS(r'.\phantomjs.exe' if platform.startswith('win32')
                           else './phantomjs')
        driver.get(link)
        driver.find_element_by_id(
            'ctl00_mainContentArea_disclaimerContent_yesButton').click()
        for cookie in driver.get_cookies():
            s.cookies.set(cookie['name'], cookie['value'])
        driver.quit()
        resp_inner = s.get(link)
        soup = Soup(resp_inner.text, features="lxml")
        print("Done, now let's get back to the scraping process.")

    return soup
Example #15
class Leecherus(object):
	def __init__(self, url):
		self.url = url
		self.browser = PhantomJS()

	def get_link(self):
		try:
			self.browser.get('http://leecher.us')
			wdw = WebDriverWait(self.browser, 10)
			wdw.until(EC.visibility_of_element_located((By.NAME, 'link'))).send_keys(self.url)
			wdw.until(EC.element_to_be_clickable((By.XPATH, '//button[@class="subscribe"]'))).click()
			wdw.until(EC.element_to_be_clickable((By.XPATH, '//input[@class="subscribe"]'))).click()
			self.browser.switch_to_window(self.browser.window_handles[1])
			onclick = wdw.until(EC.element_to_be_clickable((By.ID, 'get_link'))).get_attribute('onclick')
		except (WebDriverException, NoSuchElementException, TimeoutException, IndexError):
			return False
		finally:
			self.browser.quit()
		m = re.search("'(http://[^']+)'", onclick)
		return m.group(1) if m else False
Example #16
class PagesCrawler(BaseSpider):

    name = 'pages'
    link_extractor = RegexpLinkExtractor(canonicalize=False,
                                         deny_extensions=[])
    ignored_exts = set(['.' + e for e in IGNORED_EXTENSIONS])

    def __init__(self, **kw):
        args = DEFAULT_INPUT.copy()
        args.update(kw)
        self.args = args
        self.start_urls = to_list(args['start_urls'])
        self.maxdepth = int(args['maxdepth'])
        self.follow_prefixes = to_list(args['follow_prefixes'])
        self.nofollow_prefixes = to_list(args['nofollow_prefixes'])
        self.discover_prefixes = [
            url_to_lru_clean(
                "http%s://%s" %
                (https, u.replace('http://', '').replace('https://', '')))
            for u in to_list(args['discover_prefixes']) for https in ['', 's']
        ]
        self.resolved_links = {}
        self.user_agent = args['user_agent']
        self.phantom = 'phantom' in args and args[
            'phantom'] and args['phantom'].lower() != "false"
        if self.phantom:
            self.ph_timeout = int(
                args.get('phantom_timeout', PHANTOM['TIMEOUT']))
            self.ph_idle_timeout = int(
                args.get('phantom_idle_timeout', PHANTOM['IDLE_TIMEOUT']))
            self.ph_ajax_timeout = int(
                args.get('phantom_ajax_timeout', PHANTOM['AJAX_TIMEOUT']))
        self.errors = 0
        dispatcher.connect(self.closed, spider_closed)
        dispatcher.connect(self.crashed, spider_error)

    def start_requests(self):
        self.log(
            "Starting crawl task - jobid: %s" % self.crawler.settings['JOBID'],
            log.INFO)
        self.log("ARGUMENTS : " + str(self.args), log.INFO)
        if self.phantom:
            self.init_phantom()
        for url in self.start_urls:
            yield self._request(url)

    def init_phantom(self):
        self.prefixfiles = os.path.join(scrapyd_config().get('logs_dir'),
                                        HYPHE_PROJECT, self.name,
                                        self.crawler.settings['JOBID'])
        self.log("Using path %s for PhantomJS crawl" % self.prefixfiles,
                 log.INFO)
        phantom_args = []
        if PROXY and not PROXY.startswith(':'):
            phantom_args.append('--proxy=%s' % PROXY)
        phantom_args.append('--cookies-file=%s-phantomjs-cookie.txt' %
                            self.prefixfiles)
        phantom_args.append('--ignore-ssl-errors=true')
        phantom_args.append('--load-images=false')
        self.capabilities = dict(DesiredCapabilities.PHANTOMJS)
        self.capabilities[
            'phantomjs.page.settings.userAgent'] = self.user_agent
        self.capabilities['takesScreenshot'] = False
        self.capabilities[
            'phantomjs.page.settings.javascriptCanCloseWindows'] = False
        self.capabilities[
            'phantomjs.page.settings.javascriptCanOpenWindows'] = False
        self.phantom = PhantomJS(executable_path=PHANTOM['PATH'],
                                 service_args=phantom_args,
                                 desired_capabilities=self.capabilities,
                                 service_log_path="%s-phantomjs.log" %
                                 self.prefixfiles)
        self.phantom.implicitly_wait(10)
        self.phantom.set_page_load_timeout(60)
        self.phantom.set_script_timeout(self.ph_timeout + 15)

    def crashed(self, spider):
        self.errors += 1
        self.closed("CRASH")

    def closed(self, reason):
        if self.errors:
            self.log(
                "%s error%s encountered during the crawl." %
                (self.errors, 's' if self.errors > 1 else ''), log.ERROR)
        if self.phantom:
            self.phantom.quit()
            if not self.errors:
                for f in ["phantomjs-cookie.txt", "phantomjs.log"]:
                    fi = "%s-%s" % (self.prefixfiles, f)
                    if os.path.exists(fi) and not self.errors:
                        os.remove(fi)

    def handle_response(self, response):
        lru = url_to_lru_clean(response.url)

        if self.phantom:
            self.phantom.get(response.url)

            # Collect whole DOM of the webpage including embedded iframes
            with open(
                    os.path.join(PHANTOM["JS_PATH"],
                                 "get_iframes_content.js")) as js:
                get_bod_w_iframes = js.read()
            bod_w_iframes = self.phantom.execute_script(get_bod_w_iframes)
            response._set_body(bod_w_iframes.encode('utf-8'))

            # Try to scroll and unfold page
            self.log("Start PhantomJS scrolling and unfolding", log.INFO)
            with open(
                    os.path.join(PHANTOM["JS_PATH"],
                                 "scrolldown_and_unfold.js")) as js:
                try:
                    signal.signal(signal.SIGALRM, timeout_alarm)
                    signal.alarm(self.ph_timeout + 30)
                    timedout = self.phantom.execute_async_script(
                        js.read(), self.ph_timeout, self.ph_idle_timeout,
                        self.ph_ajax_timeout)
                    signal.alarm(0)
                    if timedout:
                        raise SeleniumTimeout
                    self.log("Scrolling/Unfolding finished", log.INFO)
                except SeleniumTimeout:
                    self.log(
                        "Scrolling/Unfolding timed-out (%ss)" %
                        self.ph_timeout, log.WARNING)
                    self.errors += 1
                except WebDriverException as e:
                    err = json.loads(e.msg)['errorMessage']
                    self.log("Scrolling/Unfolding crashed: %s" % err,
                             log.ERROR)
                    self.errors += 1
                except Exception as e:
                    self.log(
                        "Scrolling/Unfolding crashed: %s %s" % (type(e), e),
                        log.ERROR)
                    self.errors += 1
                    return self._make_raw_page(response, lru)
            bod_w_iframes = self.phantom.execute_script(get_bod_w_iframes)
            response._set_body(bod_w_iframes.encode('utf-8'))

        # Clean up pages with embedded base64 images that make Scrapy treat them as non-HTML responses
        if response.status == 200 and not isinstance(response, HtmlResponse):
            try:
                flags = response.flags
                if "partial" in flags:
                    flags.remove("partial")
                flags.append("cleaned")
                response = HtmlResponse(response.url,
                                        headers=response.headers,
                                        body=cleanupbase64images(
                                            response.body),
                                        flags=flags,
                                        request=response.request)
                self.log(
                    "WARNING: page with base64 embedded images was cleaned-up for links extraction"
                )
            except Exception:
                pass

        if 300 < response.status < 400 or isinstance(response, HtmlResponse):
            return self.parse_html(response, lru)
        else:
            return self._make_raw_page(response, lru)

    def handle_error(self, failure, response=None):
        if response:
            p = self._make_raw_page(response, failure.request.url)
            p['error'] = error_name(failure.value)
            return p
        elif not "://www" in failure.request.url:
            return self._request(failure.request.url.replace('://', '://www.'))
        error = failure.getErrorMessage()
        self.log("ERROR : %s" % error, log.ERROR)
        if PROXY and not PROXY.startswith(
                ':') and "OpenSSL.SSL.Error" in error:
            return self._request(failure.request.url, noproxy=True)
        self.errors += 1
        return

    def parse_html(self, response, lru):
        lrulinks = []
        # handle redirects
        realdepth = response.meta['depth']
        if 300 < response.status < 400:
            redir_url = response.headers['Location']
            if redir_url.startswith('/'):
                redir_url = "%s%s" % (lru_get_host_url(lru).strip('/'),
                                      redir_url)
            elif redir_url.startswith(
                    './') or not redir_url.startswith('http'):
                redir_url = "%s%s" % (lru_get_path_url(lru).strip('/'),
                                      redir_url[1:])
            links = [{'url': redir_url}]
            response.meta['depth'] -= 1
        else:
            try:
                links = self.link_extractor.extract_links(response)
            except Exception as e:
                self.log(
                    "ERROR: links extractor crashed on %s: %s %s" %
                    (response, type(e), e), log.ERROR)
                links = []
                self.errors += 1
        for link in links:
            try:
                url = link.url
            except AttributeError:
                url = link['url']
            try:
                lrulink = url_to_lru_clean(url)
            except ValueError as e:
                self.log("Error converting URL %s to LRU: %s" % (url, e),
                         log.ERROR)
                continue
            lrulinks.append((url, lrulink))
            if self._should_follow(response.meta['depth'], lru, lrulink) and \
                    not url_has_any_extension(url, self.ignored_exts):
                yield self._request(url)
        response.meta['depth'] = realdepth
        yield self._make_html_page(response, lru, lrulinks)
Example #17
def selenium_opener(url):
    driver = PhantomJS(executable_path='path/to/phantomjs')  # placeholder path to the PhantomJS binary
    driver.get(url)
    html = driver.page_source
    driver.quit()
    return html
Example #18
def get_tickers(stock_list):

    list = pd.read_csv(stock_list)
    try:
        tickers = [c for c in list[list.columns[1]]]
    except Exception as e:
        print(e)
        sys.exit(1)

    #List for the stocks that had some error being collected
    error_stocks = []

    # Creates directory for the stock data CSV files
    if not os.path.exists('stock_dfs'):
        os.makedirs('stock_dfs')

    global source

    # When data collecting will start and end for the Dates
    global start
    global end

    print(f'>>>Getting Stock Data from {source} up to {end}')

    #Iterating through each ticker
    for ticker in tqdm.tqdm(tickers):

        # Read data for the stock. If grabbing today's data fails, try yesterday's instead
        try:
            df = web.DataReader(ticker, source, start, end)
        except Exception:
            # Change the end date to yesterday
            end = (dt.datetime.now() - dt.timedelta(1)).strftime('%Y-%m-%d')
            df = web.DataReader(ticker, source, start, end)

        #High/Low Open/Close percentage
        df['HL_pct'] = ((df['High'] - df['Low']) / df['Low']) * 100
        df['OC_pct'] = ((df['Close'] - df['Open']) / df['Open']) * 100

        # Bollinger Bands
        df['Middle Bollinger'] = df['Adj Close'].rolling(20).mean()
        df['Sup_Bollinger'] = df['Middle Bollinger'] + (
            2 * df['Adj Close'].rolling(20).std())
        df['Inf_Bollinger'] = df['Middle Bollinger'] - (
            2 * df['Adj Close'].rolling(20).std())

        #Exponential Moving Mean
        df['Exp20_Close'] = df['Adj Close'].ewm(span=20, adjust=False).mean()

        # Expansion/Contraction of the stock price
        df['Deviation_band'] = df['Adj Close'].rolling(20).std()

        #RSI
        change = df['Adj Close'].diff(1)

        gain = change.mask(change < 0, 0)
        loss = change.mask(change > 0, 0)
        avg_gain = gain.ewm(min_periods=rsi_period, com=rsi_period - 1).mean()
        avg_loss = loss.ewm(min_periods=rsi_period, com=rsi_period - 1).mean()
        rs = abs(avg_gain / avg_loss)
        df['RSI'] = 100 - (100 / (1 + rs))
        '''
            Next the code scrapes a few Yahoo Finance pages for more details and info. It does this by
            reading tables, or by reading span strings on pages that have no tables. Table reading is
            straightforward, but span reading needs the reactID of each line we want, which is somewhat
            hardcoded: the span lines were read through manually and the useful ones written down.
        '''

        #Reading into page
        resp = requests.get(
            f'https://finance.yahoo.com/quote/{ticker}/financials')
        #BeautifulSoup scrapes the page in TXT form
        soup = bs.BeautifulSoup(resp.text, 'lxml')

        #Number of span lines we got
        length = int(np.array(soup.find_all('span')).shape[0])

        #All lines with the span class, which has the info we want
        lines = np.array(soup.find_all('span'))

        #List to store the span lines that have the reactID codes we want
        spans = []

        #Dates we want to find
        find_dates = ['12/30/2019', '12/30/2018', '12/30/2017', '12/30/2016']

        #List for the dates we actually find
        dates = []

        #Iterating through the lines and grabbing all lines from the span class
        for line in range(0, length):
            spans.append(BeautifulSoup(str(lines[line]), features='lxml').span)

        #Iterating through each date we want to find in the website
        for date in find_dates:

            #Iterating through each span-class line
            for line in range(0, length):

                #If the text line and date match then put the date in the found dates list
                if spans[line].string == date:
                    dates.append(spans[line].string)
                    break

        # Change the date format so it can index into the web-reader dataframe
        for index, date in enumerate(dates):

            # If a string doesn't match the format, it isn't a date and will be removed
            try:
                dates[index] = dt.datetime.strptime(
                    date, "%m/%d/%Y").strftime("%Y-%m-%d")
            except ValueError:

                # dates.remove raises once no more such entries are left in the list, stopping the loop
                removed = False
                while not removed:
                    try:
                        dates.remove(dates[index])
                    except (ValueError, IndexError):
                        removed = True

        # Add 3 days to the dates, because most stocks don't operate on the last day of the year,
        # which is the timestamp the website uses for the data.
        for index, date in enumerate(dates):
            dates[index] = (dt.datetime.strptime(date, '%Y-%m-%d') +
                            dt.timedelta(3)).strftime('%Y-%m-%d')

        #Info we want to get from the website
        interesting_lines = [
            'Total Revenue', 'Cost of Revenue', 'Gross Profit',
            'Selling General and Administrative', 'Total Operating Expenses',
            'Operating Income or Loss', 'Interest Expense',
            'Total Other Income/Expenses Net', 'Income Before Tax',
            'Income Tax Expense', 'Income from Continuing Operations',
            'Net Income', 'Net Income available to common shareholders',
            'EBITDA'
        ]

        #List for the info we actually find on the website
        infos = []

        #List for the ReactIDs of the lines that have the data about the infos above
        number_ids = []

        #Column renaming
        column_names = [
            'Total Revenue (TTM)', 'Cost of Revenue (TTM)',
            'Gross Profit (TTM)',
            'Selling General and Administrative Expenses (TTM)',
            'Total Operating Expenses (TTM)', 'Operating Income or Loss (TTM)',
            'Interest Expense (TTM)', 'Total Other Income/Expenses Net',
            'Income Before Tax (TTM)', 'Income Tax Expense (TTM)',
            'Income from Coninuing Operations (TTM)', 'Net Income (TTM)',
            'Net Income available to Shareholders (TTM)', 'EBITDA (TTM)'
        ]

        #Iterating through the informations we want
        for index, info in enumerate(interesting_lines):

            #Boolean for if the information was found
            check = False

            #Iterating through the span lines
            for line in range(0, length):

                #If line contains the information we want, appends it to the found infos list.
                if spans[line].string == info:
                    infos.append(spans[line].string)

                    #Appends the info's reactID +5, one line below, where the numbers and data are
                    number_ids.append(
                        str(int(spans[line]['data-reactid']) + 5))
                    check = True
                    pass

            #In case the information isn't found, the respective column name is changed to a NAN, to be removed later
            if check == False:
                column_names[index] = np.nan

        #Removing NANs from column name list
        column_names = [c for c in column_names if str(c) != 'nan']

        #Creating the columns for the information
        for column in column_names:
            df[f'{column}'] = np.nan

        #Iterating through dates, with indexing
        for index, date in enumerate(dates):

            #Iterating through new columns, with indexing
            for column, string in enumerate(column_names):

                #Iterating through span lines
                for line in range(0, length):

                    #Fetching data for the respective information column in order
                    if spans[line]['data-reactid'] == number_ids[column]:

                        #Locates the date in the dataframe index, formats the data string, turns it into an
                        #integer and puts the data in its correct place in time.
                        try:
                            df[f'{string}'].loc[dates[index]] = int(
                                (spans[line].string).replace(',', ''))
                        except Exception as e:
                            print(e)
                            print(
                                f'Error formatting/allocating string to int for stock {ticker}'
                            )

                            #Appending to stocks with errors list
                            error_stocks.append(ticker)
                            continue
            #Adding 2 to the IDs for each iteration so we get the lines of previous dates for the information
            number_ids = [int(c) for c in number_ids]
            number_ids = [c + 2 for c in number_ids]
            number_ids = [str(c) for c in number_ids]

        #Page URL that we will pass to PhantomJS
        url = f'https://finance.yahoo.com/quote/{ticker}/key-statistics'

        #Initiating PhantomJS
        driver = PhantomJS(executable_path=r'phantomjs.exe')

        #Opening URL with PhantomJS to fully load the page
        driver.get(url)

        #Returning page source after all the JavaScript codes have been loaded
        resp = driver.page_source

        #Closing PhantomJS
        driver.quit()

        #List of tables that Pandas found in the web page
        dfs = pd.read_html(resp)

        #Dataframe to put all the tables in just one
        key_stats = pd.DataFrame()

        #Iterating through the tables
        for dframe in dfs:

            #If dataframe is empty, passes the first table
            if key_stats.empty:
                key_stats = dframe

            #If it already has a table, appends the new ones
            else:
                key_stats = key_stats.append(dframe)

        #Fixing dataframe index, with numbers from 0 to length of dataframe
        key_stats.index = [c for c in range(0, key_stats.shape[0])]

        #There's some info we're not interested in, so we drop what we don't need
        stats = key_stats.loc[:8]

        #Removing columns 0 and 1
        stats = stats.drop([0, 1], axis=1)

        #Passing the information names as the dataframe index
        stats.index = [c for c in stats['Unnamed: 0'].values]

        #Removing the column with information names, since it's all in the index
        stats = stats.drop(['Unnamed: 0'], axis=1)

        #Transposing the dataframe, so that the Dates become the index and the information names become the column
        stats = stats.transpose()

        #Creating the new columns in the main dataframe
        for column in stats.columns:
            df[f'{column}'] = np.nan

        #Putting all the dates in a list
        dates = [c for c in stats.index]

        #Iterating through the dates
        for index, date in enumerate(dates):
            #Changing date format
            try:
                dates[index] = dt.datetime.strptime(
                    date, "%m/%d/%Y").strftime("%Y-%m-%d")
            except ValueError:
                #One of the dates has extra text besides the date, so we strip it out
                date = date.replace('As of Date: ', '')
                date = date.replace('Current', '')
                dates[index] = dt.datetime.strptime(
                    date, "%m/%d/%Y").strftime("%Y-%m-%d")

        #Adding 3 days because stocks don't operate on the last day of the year
        for index, date in enumerate(dates):
            dates[index] = (dt.datetime.strptime(date, '%Y-%m-%d') +
                            dt.timedelta(3)).strftime('%Y-%m-%d')

        #Passing changed dates back into the dataframe's index
        stats.index = dates

        #Iterating through dates again
        for date in stats.index:

            #Iterating through the new columns
            for column in stats.columns:

                #Locating the dates and columns in the main dataframe and putting the respective data in its place
                try:
                    df[f'{column}'].loc[date] = stats[f'{column}'].loc[date]

                #If any error occurs in this process, show the error for the respective stock and add it to the
                #stocks-with-error list
                except Exception as e:
                    print(e)
                    print(
                        f'Error formatting/allocating string to int for stock {ticker}'
                    )

                    #Appending to stocks with errors list
                    error_stocks.append(ticker)
        '''
        Since we only have yearly info and .loc only writes data at the specific index, we need to fill
        the NaNs with the previous non-NaN value (ffill method). That way each allocated value is carried
        forward to all later rows until a newer value appears, and the process repeats.
        '''
        df.fillna(method='ffill', inplace=True)

        # Saving csv file
        df.to_csv('stock_dfs/{}.csv'.format(ticker))

    #Showing any stocks with errors if there are any
    if error_stocks != []:
        print('\n ------ Inspect Errors ------- \n')
        print([c for c in error_stocks])
Example #19
class gmail(Thread):
    def __init__(self, account):
        name = account['name']

        super().__init__(name=name)  # Thread __init__

        lg.warning('{0[name]}, proxy: {0[Proxy]}'.format(account))

        self.account = account
        self.solved = 0

        if 0:  # Getting cookies snippet
            print(self.driver.get_cookies())
            cookies = {
                _['name']: _['value']
                for _ in self.driver.get_cookies()
            }
            with open('cookies.json', 'w') as f:
                dump(cookies, f, indent=4)

    def verify(self, el):
        '''Verifies the account. May be non-trivial :('''

        text = el.text  # get_attribute('value')
        lg.info('Text: {}'.format(text))
        if text == "Verify it's you":
            lg.debug('Verify')
            #el=self.driver.find_element_by_id('identifierNext')
            el = self.driver.find_element_by_xpath(
                '//div[.="Confirm your recovery email"]')
            print(el)
            el.click()
            el = WebDriverWait(self.driver, 3).until(
                EC.visibility_of_element_located(
                    (By.NAME, 'knowledgePreregisteredEmailResponse')))
            el.send_keys(self.account[2])  # recovery email

    def login(self):
        if 0:  # to test
            #'https://www.whoishostingthis.com/tools/user-agent/'
            self.driver.get('about:about')
            sleep(1000)
        #self.driver.get('https://mail.google.com')
        self.driver.get(
            'https://accounts.google.com/signin/v2/identifier?continue=https%3A%2F%2Fmail.google.com%2Fmail%2F&service=mail&sacu=1&rip=1&flowName=GlifWebSignIn&flowEntry=ServiceLogin'
        )
        prefilled = False

        lg.debug('Logging in with {}'.format(self.account))
        try:
            el = WebDriverWait(self.driver, 2).until(
                EC.visibility_of_element_located((By.ID, 'identifierId')))
        except TimeoutException:
            prefilled = True

        if prefilled:
            lg.info('Username prefilled already')
        else:
            lg.debug('Entering username')
            el.send_keys(self.account['name'])  # username
            nxt = self.driver.find_element_by_id('identifierNext')
            nxt.click()

        logged_in = False
        try:
            el = WebDriverWait(self.driver, 20).until(
                EC.visibility_of_element_located((By.NAME, 'password')))
        except TimeoutException:  # We're logged in?
            # TODO: Check for something visible after being logged in
            # Because we may genuinely be in timeout
            logged_in = True

        if logged_in:
            lg.info('Logged in already')
        else:
            lg.debug('Entering password')
            el.send_keys(self.account['Second Password'])
            nxt = WebDriverWait(self.driver, 5).until(
                EC.element_to_be_clickable((By.ID, 'passwordNext')))
            nxt.click()

            # WebDriverWait(self.driver, 60).until(
            #     EC.frame_to_be_available_and_switch_to_it((By.ID, 'tab1_1'))
            # )

            try:
                el = WebDriverWait(self.driver, 3).until(
                    EC.visibility_of_element_located((By.ID, 'headingText')))
                #open('1.html','w').write(self.driver.page_source)
                self.verify(el)
            except TimeoutException:  # We're in
                pass

    def screenshot(self, name):
        self.driver.save_screenshot('{}/{}-{}.png'.format(
            getcwd(), self.account['name'], name))

    def solve(self):
        '''Solve the captcha one time'''
        WebDriverWait(self.driver, 30).until(
            EC.frame_to_be_available_and_switch_to_it(
                (By.XPATH, '//iframe[@title="recaptcha widget"]')))

        el = WebDriverWait(self.driver, 20).until(
            EC.element_to_be_clickable(
                (By.CSS_SELECTOR, 'div.recaptcha-checkbox-checkmark')))
        #lg.info(el
        el.click()

        lg.debug('Clicked solve box')

        def check_style(driver, el):
            '''Now need to see what happened there. Check an attribute to see if we're successful.'''
            attr = el.get_attribute('aria-checked')
            lg.debug(attr)
            return attr == 'true'

        lg.debug('Before check_style')
        timeout = False
        try:
            WebDriverWait(self.driver, 20).until(lambda driver: check_style(
                driver, self.driver.find_element_by_id('recaptcha-anchor')))
        except TimeoutException:
            timeout = True  # Next (very soon) we'll see what happened

        lg.debug('Final: ' + self.driver.find_element_by_id(
            'recaptcha-anchor').get_attribute('aria-checked'))

        self.driver.switch_to.default_content()
        if timeout:
            lg.warning('Timeout')
            self.screenshot('timeout')
            el = self.driver.find_element_by_xpath(
                '//iframe[@title="recaptcha challenge"]')
            #set_trace()
            self.driver.switch_to.frame(el)
            l = len(self.driver.page_source)
            lg.debug(l)
            with open('recaptcha_main.html', 'w') as f:
                f.write(self.driver.page_source)
            if l > 10000:
                lg.warning('Captcha')
                self.screenshot('captcha')
                return True  # Need to quit
            self.driver.switch_to.default_content()
            self.driver.refresh()
        else:
            el = self.driver.find_element_by_id('submit')
            el.click()  # Submit button
            lg.info('Clicked submit')

            lg.debug('Before staleness')
            WebDriverWait(self.driver, 10,
                          poll_frequency=0.1).until(EC.staleness_of(el))
            lg.debug('After staleness')

    def create_driver(self):
        if 1:
            caps = DesiredCapabilities().FIREFOX.copy()

            profile_path = path.expanduser(
                '~') + '/.mozilla/firefox/' + self.account['name']

            # caps['proxy'] = {
            caps['moz:firefoxOptions'] = {
                "args": ["-profile", profile_path],  # geckodriver 0.18+
            }

            profile = FirefoxProfile(profile_path)
            #profile.set_preference("general.useragent.override", 'Mozilla/5.0 (X11; Linux x86_64; rv:56.0) Gecko/20100101 Firefox/56.0')

            self.driver = Firefox(profile, capabilities=caps)
            #self.driver = Firefox(profile)
        else:  # PhantomJS
            # https://github.com/detro/ghostdriver
            caps = DesiredCapabilities().PHANTOMJS
            caps["phantomjs.page.settings.userAgent"] = \
                'Mozilla/5.0 (X11; Linux x86_64; rv:56.0) Gecko/20100101 Firefox/56.0'
            service_args = [
                '--proxy={}'.format(':'.join(
                    self.account['Proxy'].split(':')[:2])),
                '--proxy-type=http',
            ]
            print(service_args)
            self.driver = PhantomJS(service_args=service_args,
                                    capabilities=caps)
            self.driver.set_window_size(1120, 550)
            #profile.set_preference("general.useragent.override","Mozilla/5.0 (iPhone; U; CPU iPhone OS 3_0 like Mac OS X; en-us) AppleWebKit/528.18 (KHTML, like Gecko) Version/4.0 Mobile/7A341 Safari/528.16")
        #profile.set_preference("general.useragent.override","Mozilla/5.0 (X11; Linux x86_64; rv:47.0) Gecko/20100101 Firefox/47.0")
        # profile.set_preference("browser.startup.homepage_override.mstone", "ignore");
        # profile.set_preference("startup.homepage_welcome_url.additional",  "about:blank");
        # profile.set_preference("xpinstall.signatures.required", "false");
        # profile.set_preference("toolkit.telemetry.reportingpolicy.firstRun", "false");

    def run(self):
        '''Login and run in cycle'''

        self.create_driver()

        try:
            self.login()

            tosleep=datetime.combine(
                date.today(), dt_time(drophour,00,5,tzinfo=timezone.utc))-\
                datetime.now(timezone.utc)
            tosleep = tosleep.seconds
            lg.info('Sleeping for {}'.format(tosleep))
            if '/pooh/' in path.expanduser('~'):
                tosleep = 0  # don't sleep on developer's host
            if not debug: sleep(tosleep)

            # Creating new window to work in (otherwise sometimes the page will ask whether we're ok to leave it)
            self.driver.execute_script(
                '''window.open('{}',"_blank");'''.format(solve_url))
            self.driver.switch_to.window(self.driver.window_handles[-1])
            lg.debug('Created new window')

            # Cycle here getting tokens until there are no more nocaptcha
            start_time = end_time = time()  # In case we have exception
            while True:
                #for i in range(1):
                if self.solve(): break
                self.solved += 1
            end_time = time()
        except:
            lg.exception('In run')
            self.screenshot('exception')
        finally:
            lg.warning('Closing driver')
            with suppress(WebDriverException):
                self.driver.quit()
        rate = (end_time - start_time) / self.solved if self.solved else 0
        lg.warning('Solved: {} ({:.2f})'.format(self.solved, rate))
Example #20
def get_url_files(retail, invoice_doc_type, invoice_id, invoice_date,
                  invoice_amount):
    retail_invoice_url = RETAIL_INVOICE_URL[retail]

    driver = PhantomJS()
    driver.get(retail_invoice_url)

    # 1 Set doc_type 'select'
    try:
        select_doc_type = Select(driver.find_element_by_name('txtTipoDte'))
        value = RETAIL_INVOICE_DOC_TYPES[retail][invoice_doc_type]['value']
        select_doc_type.select_by_value(value)
        # name = RETAIL_INVOICE_DOC_TYPES[retail][invoice_doc_type]['name']
        # select_doc_type.select_by_visible_text(name)
    except Exception:
        print('ERROR: set doc_type select as Boleta')
        driver.save_screenshot('screen.png')
        return '', ''

    time.sleep(5)

    # 2 Get recaptcha img url
    try:
        recaptcha_img = driver.find_element_by_id('recaptcha_challenge_image')
        recaptcha_img_url = recaptcha_img.get_attribute('src')
    except Exception:
        print('ERROR: get recaptcha image url')
        driver.save_screenshot('screen.png')
        return '', ''

    # 3 Solve recaptcha
    v = VisionApi()
    recaptcha_value = v.detect_text_from_url(recaptcha_img_url)

    if recaptcha_value is None:
        print('ERROR: solving recaptcha image')
        driver.save_screenshot('screen.png')
        return '', ''

    # 4 Fill form
    script = u"""
        document.getElementsByName('txtFolio')[0].value = '{invoice_id}';
        document.getElementsByName('txtFechaEmision')[0].value = '{invoice_date}';
        document.getElementsByName('txtMontoTotal')[0].value = '{invoice_amount}';
        document.getElementsByName('recaptcha_response_field')[0].value = '{recaptcha_value}';
    """.format(
        invoice_id=invoice_id,
        invoice_date=invoice_date,
        invoice_amount=invoice_amount,
        recaptcha_value=recaptcha_value,
    )
    driver.execute_script(script)

    # 5 Submit form
    try:
        driver.find_element_by_name('frmDatos').submit()
    except Exception:
        print('ERROR: submitting form')
        driver.save_screenshot('screen.png')
        return '', ''

    # 6 Get url files
    try:
        xml_a_tag = driver.find_element_by_xpath(
            '//*[@id="Tabla_01"]/tbody/tr[1]/td[2]/p/a[2]')
        pdf_a_tag = driver.find_element_by_xpath(
            '//*[@id="Tabla_01"]/tbody/tr[1]/td[2]/p/a[1]')

        xml_url = xml_a_tag.get_attribute('href')
        pdf_url = pdf_a_tag.get_attribute('href')
    except Exception:
        print('ERROR: getting url files')
        driver.save_screenshot('screen.png')
        return '', ''

    # 8 Delete driver session
    driver.close()
    driver.quit()

    return xml_url, pdf_url
Example #21
class ProviderAdvancedViewTests(LiveServerTestCase):
    def setUp(self):
        self.driver = PhantomJS()

        self.user = User.objects.create_user('admin', '*****@*****.**', 'password')
        self.user.save()

        self.provider = Provider(
            name='provider',
            user=self.user,
        )
        self.provider.save()

        self.provider_adverts = mommy.make(Advertisement, _quantity=20, provider=self.provider)

        self.login()

    def tearDown(self):
        self.driver.quit()

    def open(self, url):
        self.driver.get("%s%s" % (self.live_server_url, url))

    def login(self):
        self.open(settings.LOGIN_URL)
        self.driver.find_element_by_id("id_username").send_keys("admin")
        self.driver.find_element_by_id("id_password").send_keys("password")
        self.driver.find_element_by_css_selector("button.btn.btn-default").click()

        self.assertEqual(
            self.driver.current_url,
            self.live_server_url + reverse('advertisements.views.view_provider_statistics', args=[self.provider.pk]),
        )

    def test_can_login(self):
        """
        Test that the user can login
        """
        pass

    def test_provider_page_has_all_data(self):
        """
        Test that the provider statistics page has all the correct data
        """
        self.open(reverse('advertisements.views.view_provider_statistics', args=[self.provider.pk]))

        self.assertEqual("Open Ads", self.driver.title)
        self.assertIn(
            "{0} advertisements".format(self.provider.name),
            self.driver.find_element_by_css_selector("h1.page-header").text
        )
        self.assertIn(
            "{0} advertisements in rotation".format(20),
            self.driver.find_element_by_css_selector("h1.page-header").text
        )

    def test_advertisement_page_has_all_data(self):
        """
        Test that the advertisement page has all the correct data
        """

        for advert in self.provider_adverts:
            self.open(reverse('advertisements.views.view_advert_statistics', args=[advert.pk]))

            self.assertIn(
                "ID number: {0}".format(advert.pk),
                self.driver.find_element_by_css_selector("h1.page-header").text,
            )
            self.driver.find_element_by_css_selector("img")
            self.assertEqual("Active", self.driver.find_element_by_xpath("//td[2]/span").text)
            self.assertEqual(advert.url, self.driver.find_element_by_link_text(advert.url).text)
            self.driver.find_element_by_link_text("Edit URL").click()
            self.assertEqual(advert.url, self.driver.find_element_by_id("id_url").get_attribute("value"))
Example #22
from selenium.webdriver import PhantomJS
from selenium.webdriver.common.keys import Keys
from time import sleep

driver = PhantomJS()
driver.set_window_size(1120, 550)
driver.get("https://duckduckgo.com/")
driver.find_element_by_id('search_form_input_homepage').send_keys("realpython")
driver.find_element_by_id("search_button_homepage").click()
print(driver.current_url)
driver.quit()
Example #23
def main():
    os.makedirs(dlDir, exist_ok=True)
    startCatIdx = int(sys.argv[1]) if len(sys.argv) > 1 else 0
    startFamIdx = int(sys.argv[2]) if len(sys.argv) > 2 else 0
    startPrdIdx = int(sys.argv[3]) if len(sys.argv) > 3 else 0
    executor = ThreadPoolExecutor()
    PhantomJS.waitClickable = waitClickable
    driver = PhantomJS()
    # harvest_utils.driver = driver
    with open('netgear_filelist.csv', 'w') as fout:
        cw = csv.writer(fout)
        cw.writerow([
            'model', 'fw_ver', 'fileName', 'fw_url', 'fw_date', 'fileSize',
            'sha1', 'md5'
        ])
    driver.get('http://downloadcenter.netgear.com/')
    # click DrillDown
    driver.waitClickable(
        '#ctl00_ctl00_ctl00_mainContent_localizedContent_bodyCenter_BasicSearchPanel_btnAdvancedSearch'
    ).click()  # noqa
    ctl00 = "#ctl00_ctl00_ctl00_mainContent_localizedContent_bodyCenter_adsPanel_"  # noqa ignore=E501
    #
    # wait Page2
    try:
        catSel = Select(driver.waitClickable(ctl00 + "lbProductCategory"))
        numCat = len(catSel.options)
        for catIdx in range(startCatIdx, numCat):
            catSel = Select(driver.waitClickable(ctl00 + "lbProductCategory"))
            print('catIdx=', catIdx)
            catTxt = catSel.options[catIdx].text
            uprint('catTxt= ' + catTxt)
            oldText = driver.getText(ctl00 + "lbProductFamily")
            catSel.select_by_index(catIdx)
            driver.waitTextChanged(ctl00 + "lbProductFamily", oldText)
            famSel = Select(driver.waitClickable(ctl00 + "lbProductFamily"))
            numFam = len(famSel.options)
            for famIdx in range(startFamIdx, numFam):
                famSel = Select(
                    driver.waitClickable(ctl00 + "lbProductFamily"))  # noqa
                print('famIdx=', famIdx)
                startFamIdx = 0
                famTxt = famSel.options[famIdx].text
                uprint('famTxt= ' + famTxt)
                oldText = driver.getText(ctl00 + "lbProduct")
                famSel.select_by_index(famIdx)
                driver.waitTextChanged(ctl00 + "lbProduct", oldText)
                prdSel = Select(driver.waitClickable(ctl00 + "lbProduct"))
                numPrd = len(prdSel.options)
                for prdIdx in range(startPrdIdx, numPrd):
                    prdSel = Select(driver.waitClickable(ctl00 + "lbProduct"))
                    startPrdIdx = 0
                    print("catIdx,famIdx,prdIdx=%d, %d, %d" %
                          (catIdx, famIdx, prdIdx))
                    prdTxt = prdSel.options[prdIdx].text
                    uprint('cat,fam,prd="%s","%s","%s"' %
                           (catTxt, famTxt, prdTxt))  # noqa ignore=E501
                    prdWaiting = driver.waitElem(
                        ctl00 +
                        "upProgProductLoader > div > img")  # noqa ignore=E501
                    prdSel.select_by_index(prdIdx)
                    try:
                        WebDriverWait(driver, 1, 0.5).\
                            until(lambda x: prdWaiting.is_displayed() is True)
                    except TimeoutException:
                        pass
                    try:
                        WebDriverWait(driver, 5, 0.5).\
                            until(lambda x: prdWaiting.is_displayed() is False)
                    except TimeoutException as ex:
                        pass
                    numResults = driver.waitText(
                        ctl00 + "lvwAllDownload_lblAllDownloadResult", 3,
                        0.5)  # noqa ignore=E501
                    if numResults is None:
                        continue
                    numResults = int(re.search(r"\d+", numResults).group(0))
                    print('numResults=', numResults)
                    if numResults > 10:
                        driver.waitClickable("#lnkAllDownloadMore", 3).click()
                    try:
                        erItems = driver.getElems(
                            'a.register-product.navlistsearch', 3, 0.5)  # noqa
                    except TimeoutException:
                        erItems = driver.getElems(
                            'div#LargeFirmware > ul > li > div > p > a.navlistsearch',
                            3)  # noqa ignore=E501

                    if len(erItems) != numResults:
                        print('Error, numResults=%d, but len(erItems)=%d' %
                              (numResults, len(erItems)))
                    for itemIdx, erItem in enumerate(erItems):
                        if not erItem.is_displayed():
                            print('itemIdx=%d is not displayed()' % itemIdx)
                            continue
                        erItem.getItemText = getItemText
                        desc = erItem.getItemText(erItem)
                        uprint('desc="%s"' % desc)
                        if 'firmware' not in desc.lower():
                            continue
                        fw_url = erItem.get_attribute('data-durl')
                        if not fw_url:
                            fw_url = erItem.get_attribute('fw_url')
                        print('fw_url=', fw_url)
                        if not fw_url:
                            continue
                        if not fw_url.startswith('http'):
                            print('Error: fw_url=', fw_url)
                            continue
                        executor.submit(download_file, prdTxt, desc, fw_url)
    except BaseException as ex:
        traceback.print_exc()
        import pdb
        pdb.set_trace()
        driver.save_screenshot("netgear_crawler2")
    finally:
        driver.quit()
        executor.shutdown(True)
Beispiel #24
0
def get_url_files(retail, invoice_doc_type, invoice_id, invoice_date, invoice_amount):
    retail_invoice_url = RETAIL_INVOICE_URL[retail]

    driver = PhantomJS()
    driver.get(retail_invoice_url)

    # 1 Set doc_type 'select'
    try:
        select_doc_type = Select(driver.find_element_by_name('txtTipoDte'))
        value = RETAIL_INVOICE_DOC_TYPES[retail][invoice_doc_type]['value']
        select_doc_type.select_by_value(value)
        # name = RETAIL_INVOICE_DOC_TYPES[retail][invoice_doc_type]['name']
        # select_doc_type.select_by_visible_text(name)
    except Exception:
        print 'ERROR: set doc_type select as Boleta'
        driver.save_screenshot('screen.png')
        return '', ''

    time.sleep(5)

    # 2 Get recaptcha img url
    try:
        recaptcha_img = driver.find_element_by_id('recaptcha_challenge_image')
        recaptcha_img_url = recaptcha_img.get_attribute('src')
    except Exception:
        print 'ERROR: get recaptcha image url'
        driver.save_screenshot('screen.png')
        return '', ''

    # 3 Solve recaptcha
    v = VisionApi()
    recaptcha_value = v.detect_text_from_url(recaptcha_img_url)

    if recaptcha_value is None:
        print 'ERROR: solving recaptcha image'
        driver.save_screenshot('screen.png')
        return '', ''

    # 4 Fill form
    script = u"""
        document.getElementsByName('txtFolio')[0].value = '{invoice_id}';
        document.getElementsByName('txtFechaEmision')[0].value = '{invoice_date}';
        document.getElementsByName('txtMontoTotal')[0].value = '{invoice_amount}';
        document.getElementsByName('recaptcha_response_field')[0].value = '{recaptcha_value}';
    """.format(
        invoice_id=invoice_id,
        invoice_date=invoice_date,
        invoice_amount=invoice_amount,
        recaptcha_value=recaptcha_value,
    )
    driver.execute_script(script)

    # 5 Submit form
    try:
        driver.find_element_by_name('frmDatos').submit()
    except Exception:
        print 'ERROR: submitting form'
        driver.save_screenshot('screen.png')
        return '', ''

    # 6 Get url files
    try:
        xml_a_tag = driver.find_element_by_xpath('//*[@id="Tabla_01"]/tbody/tr[1]/td[2]/p/a[2]')
        pdf_a_tag = driver.find_element_by_xpath('//*[@id="Tabla_01"]/tbody/tr[1]/td[2]/p/a[1]')

        xml_url = xml_a_tag.get_attribute('href')
        pdf_url = pdf_a_tag.get_attribute('href')
    except Exception:
        print 'ERROR: getting url files'
        driver.save_screenshot('screen.png')
        return '', ''

    # 7 Delete driver session
    driver.close()
    driver.quit()

    return xml_url, pdf_url
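
# A minimal usage sketch (not part of the original snippet). RETAIL_INVOICE_URL,
# RETAIL_INVOICE_DOC_TYPES and VisionApi are assumed to be defined elsewhere in
# the original module, and the argument values below are purely illustrative:
#
#     xml_url, pdf_url = get_url_files(
#         retail='some_retail_key',        # key into RETAIL_INVOICE_URL
#         invoice_doc_type='boleta',       # key into RETAIL_INVOICE_DOC_TYPES[retail]
#         invoice_id='123456',
#         invoice_date='01-01-2020',
#         invoice_amount='19990',
#     )
#     if xml_url and pdf_url:
#         print 'XML: %s, PDF: %s' % (xml_url, pdf_url)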
Beispiel #25
0
        # Copy it to a local file: 'w' = write, 'b' = binary, so 'wb' writes binary data
        path = Path('./pic')
        path.mkdir(exist_ok=True)
        path = path / url_info[1]
        with open(path, 'wb') as f:
            f.write(img)
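
# Imports this fragment relies on (presumably declared at the top of the original
# script, not shown here):
#     from pathlib import Path
#     from selenium.webdriver import PhantomJS
#     from selenium.webdriver.common.action_chains import ActionChains
#     import time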


driver = PhantomJS()  # Create the PhantomJS driver object.
# Drive the browser through this object.
driver.get('http://zxgk.court.gov.cn/shixin/')  # Open the page with an HTTP GET request.
i = 1
j = 10000

while j >= 0:
    a = driver.find_element_by_id("captchaImg")
    url = (a.get_attribute('src'))
    pic_name = f"{i}.png"
    try:
        download_img([url, pic_name])
    except Exception as e:
        print(e)
        continue
    print(f"{pic_name}已经下载成功,共成功下载{i}张验证码")
    i += 1
    j -= 1
    ActionChains(driver).move_to_element(a).click().perform()
    time.sleep(2)  # throttle requests to avoid getting the IP banned

driver.quit()  # When finished, remember to quit the browser, otherwise the phantomjs process stays in memory.
Beispiel #26
0
class PagesCrawler(BaseSpider):

    name = 'pages'
    link_extractor = RegexpLinkExtractor(canonicalize=False, deny_extensions=[])
    ignored_exts = set(['.' + e for e in IGNORED_EXTENSIONS])

    def __init__(self, **kw):
        args = DEFAULT_INPUT.copy()
        args.update(kw)
        self.args = args
        self.start_urls = to_list(args['start_urls'])
        self.maxdepth = int(args['maxdepth'])
        self.follow_prefixes = to_list(args['follow_prefixes'])
        self.nofollow_prefixes = to_list(args['nofollow_prefixes'])
        self.discover_prefixes = [url_to_lru_clean("http%s://%s" % (https, u.replace('http://', '').replace('https://', ''))) for u in to_list(args['discover_prefixes']) for https in ['', 's']]
        self.resolved_links = {}
        self.user_agent = args['user_agent']
        self.phantom = 'phantom' in args and args['phantom'] and args['phantom'].lower() != "false"
        if self.phantom:
            self.ph_timeout = int(args.get('phantom_timeout', PHANTOM['TIMEOUT']))
            self.ph_idle_timeout = int(args.get('phantom_idle_timeout', PHANTOM['IDLE_TIMEOUT']))
            self.ph_ajax_timeout = int(args.get('phantom_ajax_timeout', PHANTOM['AJAX_TIMEOUT']))
        self.errors = 0
        dispatcher.connect(self.closed, spider_closed)
        dispatcher.connect(self.crashed, spider_error)

    def start_requests(self):
        self.log("Starting crawl task - jobid: %s" % self.crawler.settings['JOBID'], log.INFO)
        self.log("ARGUMENTS : "+str(self.args), log.INFO)
        if self.phantom:
            self.init_phantom()
        for url in self.start_urls:
            yield self._request(url)

    def init_phantom(self):
        self.prefixfiles = os.path.join(
            scrapyd_config().get('logs_dir'),
            HYPHE_PROJECT,
            self.name,
            self.crawler.settings['JOBID']
        )
        self.log("Using path %s for PhantomJS crawl" % self.prefixfiles, log.INFO)
        phantom_args = []
        if PROXY and not PROXY.startswith(':'):
            phantom_args.append('--proxy=%s' % PROXY)
        phantom_args.append('--cookies-file=%s-phantomjs-cookie.txt' % self.prefixfiles)
        phantom_args.append('--ignore-ssl-errors=true')
        phantom_args.append('--load-images=false')
        self.capabilities = dict(DesiredCapabilities.PHANTOMJS)
        self.capabilities['phantomjs.page.settings.userAgent'] = self.user_agent
        self.capabilities['takesScreenshot'] = False
        self.capabilities['phantomjs.page.settings.javascriptCanCloseWindows'] = False
        self.capabilities['phantomjs.page.settings.javascriptCanOpenWindows'] = False
        self.phantom = PhantomJS(
            executable_path=PHANTOM['PATH'],
            service_args=phantom_args,
            desired_capabilities=self.capabilities,
            service_log_path="%s-phantomjs.log" % self.prefixfiles
        )
        self.phantom.implicitly_wait(10)
        self.phantom.set_page_load_timeout(60)
        self.phantom.set_script_timeout(self.ph_timeout + 15)

    def crashed(self, spider):
        self.errors += 1
        self.closed("CRASH")

    def closed(self, reason):
        if self.errors:
            self.log("%s error%s encountered during the crawl." %
                (self.errors, 's' if self.errors > 1 else ''), log.ERROR)
        if self.phantom:
            self.phantom.quit()
            if not self.errors:
                for f in ["phantomjs-cookie.txt", "phantomjs.log"]:
                    fi = "%s-%s" % (self.prefixfiles, f)
                    if os.path.exists(fi) and not self.errors:
                        os.remove(fi)

    def handle_response(self, response):
        lru = url_to_lru_clean(response.url)

        if self.phantom:
            self.phantom.get(response.url)

            # Collect whole DOM of the webpage including embedded iframes
            with open(os.path.join(PHANTOM["JS_PATH"], "get_iframes_content.js")) as js:
                get_bod_w_iframes = js.read()
            bod_w_iframes = self.phantom.execute_script(get_bod_w_iframes)
            response._set_body(bod_w_iframes.encode('utf-8'))

            # Try to scroll and unfold page
            self.log("Start PhantomJS scrolling and unfolding", log.INFO)
            with open(os.path.join(PHANTOM["JS_PATH"], "scrolldown_and_unfold.js")) as js:
                try:
                    signal.signal(signal.SIGALRM, timeout_alarm)
                    signal.alarm(self.ph_timeout + 30)
                    timedout = self.phantom.execute_async_script(
                        js.read(), self.ph_timeout,
                        self.ph_idle_timeout, self.ph_ajax_timeout)
                    signal.alarm(0)
                    if timedout:
                        raise SeleniumTimeout
                    self.log("Scrolling/Unfolding finished", log.INFO)
                except SeleniumTimeout:
                    self.log("Scrolling/Unfolding timed-out (%ss)" % self.ph_timeout, log.WARNING)
                    self.errors += 1
                except WebDriverException as e:
                    err = json.loads(e.msg)['errorMessage']
                    self.log("Scrolling/Unfolding crashed: %s" % err, log.ERROR)
                    self.errors += 1
                except Exception as e:
                    self.log("Scrolling/Unfolding crashed: %s %s" % (type(e), e), log.ERROR)
                    self.errors += 1
                    return self._make_raw_page(response, lru)
            bod_w_iframes = self.phantom.execute_script(get_bod_w_iframes)
            response._set_body(bod_w_iframes.encode('utf-8'))

        # Cleanup pages with base64 images embedded that make scrapy consider them not htmlresponses
        if response.status == 200 and not isinstance(response, HtmlResponse):
            try:
                flags = response.flags
                if "partial" in flags:
                    flags.remove("partial")
                flags.append("cleaned")
                response = HtmlResponse(response.url, headers=response.headers, body=cleanupbase64images(response.body), flags=flags, request=response.request)
                self.log("WARNING: page with base64 embedded images was cleaned-up for links extraction")
            except:
                pass

        if 300 < response.status < 400 or isinstance(response, HtmlResponse):
            return self.parse_html(response, lru)
        else:
            return self._make_raw_page(response, lru)

    def handle_error(self, failure, response=None):
        if response:
            p = self._make_raw_page(response, failure.request.url)
            p['error'] = error_name(failure.value)
            return p
        elif not "://www" in failure.request.url:
            return self._request(failure.request.url.replace('://', '://www.'))
        self.log("ERROR : %s" % failure.getErrorMessage(), log.ERROR)
        self.errors += 1
        return

    def parse_html(self, response, lru):
        lrulinks = []
        # handle redirects
        realdepth = response.meta['depth']
        if 300 < response.status < 400:
            redir_url = response.headers['Location']
            if redir_url.startswith('/'):
                redir_url = "%s%s" % (lru_get_host_url(lru).strip('/'), redir_url)
            elif redir_url.startswith('./') or not redir_url.startswith('http'):
                redir_url = "%s%s" % (lru_get_path_url(lru).strip('/'), redir_url[1:])
            links = [{'url': redir_url}]
            response.meta['depth'] -= 1
        else:
            try:
                links = self.link_extractor.extract_links(response)
            except Exception as e:
                self.log("ERROR: links extractor crashed on %s: %s %s" % (response, type(e), e), log.ERROR)
                links = []
                self.errors += 1
        for link in links:
            try:
                url = link.url
            except AttributeError:
                url = link['url']
            try:
                lrulink = url_to_lru_clean(url)
            except ValueError, e:
                self.log("Error converting URL %s to LRU: %s" % (url, e), log.ERROR)
                continue
            lrulinks.append((url, lrulink))
            if self._should_follow(response.meta['depth'], lru, lrulink) and \
                    not url_has_any_extension(url, self.ignored_exts):
                yield self._request(url)
        response.meta['depth'] = realdepth
        yield self._make_html_page(response, lru, lrulinks)
Beispiel #27
0
class PagesCrawler(Spider):

    name = 'pages'
    link_extractor = RegexpLinkExtractor(canonicalize=False,
                                         deny_extensions=[])
    ignored_exts = set(['.' + e for e in IGNORED_EXTENSIONS])

    def __init__(self, **kwargs):
        mongo = MongoClient(MONGO_HOST, MONGO_PORT)[MONGO_DB][MONGO_JOBS_COL]
        job = mongo.find_one({"_id": kwargs["job_id"]})
        args = job["crawl_arguments"]
        self.args = args
        self.start_urls = to_list(args['start_urls'])
        self.maxdepth = int(args['max_depth'])
        self.follow_prefixes = to_list(args['follow_prefixes'])
        self.nofollow_prefixes = to_list(args['nofollow_prefixes'])
        self.prefixes_trie = LRUTrie()
        for p in self.follow_prefixes:
            self.prefixes_trie.set_lru(p, True)
        for p in self.nofollow_prefixes:
            self.prefixes_trie.set_lru(p, False)
        self.discover_prefixes = [
            url_to_lru_clean(
                "http%s://%s" %
                (https, u.replace('http://', '').replace('https://', '')),
                TLDS_TREE) for u in to_list(args['discover_prefixes'])
            for https in ['', 's']
        ]
        self.resolved_links = {}
        self.user_agent = args['user_agent']
        self.phantom = 'phantom' in args and args[
            'phantom'] and args['phantom'].lower() != "false"
        self.cookies = None
        if 'cookies' in args and args["cookies"]:
            self.cookies = dict(
                cookie.split('=', 1)
                for cookie in re.split(r'\s*;\s*', args['cookies'])
                if '=' in cookie)
        if self.phantom:
            self.ph_timeout = int(
                args.get('phantom_timeout', PHANTOM['TIMEOUT']))
            self.ph_idle_timeout = int(
                args.get('phantom_idle_timeout', PHANTOM['IDLE_TIMEOUT']))
            self.ph_ajax_timeout = int(
                args.get('phantom_ajax_timeout', PHANTOM['AJAX_TIMEOUT']))
        self.errors = 0

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(PagesCrawler, cls).from_crawler(crawler, *args,
                                                       **kwargs)
        crawler.signals.connect(spider.spider_closed, signal=spider_closed)
        crawler.signals.connect(spider.spider_crashed, signal=spider_error)
        return spider

    def start_requests(self):
        self.log(
            "Starting crawl task - jobid: %s" % self.crawler.settings['JOBID'],
            logging.INFO)
        self.log("ARGUMENTS : " + str(self.args), logging.INFO)
        if self.phantom:
            self.init_phantom()
        for url in self.start_urls:
            yield self._request(url)

    def init_phantom(self):
        self.prefixfiles = os.path.join(scrapyd_config().get('logs_dir'),
                                        HYPHE_PROJECT, self.name,
                                        self.crawler.settings['JOBID'])
        self.log("Using path %s for PhantomJS crawl" % self.prefixfiles,
                 logging.INFO)
        phantom_args = []
        if PROXY and not PROXY.startswith(':'):
            phantom_args.append('--proxy=%s' % PROXY)
        phantom_args.append('--cookies-file=%s-phantomjs-cookie.txt' %
                            self.prefixfiles)
        phantom_args.append('--ignore-ssl-errors=true')
        phantom_args.append('--load-images=false')
        self.capabilities = dict(DesiredCapabilities.PHANTOMJS)
        self.capabilities[
            'phantomjs.page.settings.userAgent'] = self.user_agent
        self.capabilities['takesScreenshot'] = False
        self.capabilities[
            'phantomjs.page.settings.javascriptCanCloseWindows'] = False
        self.capabilities[
            'phantomjs.page.settings.javascriptCanOpenWindows'] = False
        self.phantom = PhantomJS(executable_path=PHANTOM['PATH'],
                                 service_args=phantom_args,
                                 desired_capabilities=self.capabilities,
                                 service_log_path="%s-phantomjs.log" %
                                 self.prefixfiles)
        self.phantom.implicitly_wait(10)
        self.phantom.set_page_load_timeout(60)
        self.phantom.set_script_timeout(self.ph_timeout + 15)

    def spider_crashed(self, spider):
        self.errors += 1
        self.spider_closed(spider, reason="CRASH")

    def spider_closed(self, spider, reason=""):
        if self.errors:
            self.log(
                "%s error%s encountered during the crawl (%s)." %
                (self.errors, 's' if self.errors > 1 else '', reason),
                logging.ERROR)
        if self.phantom:
            self.phantom.quit()
            if not self.errors:
                for f in ["phantomjs-cookie.txt", "phantomjs.log"]:
                    fi = "%s-%s" % (self.prefixfiles, f)
                    if os.path.exists(fi) and not self.errors:
                        os.remove(fi)

    def handle_response(self, response):
        lru = url_to_lru_clean(response.url, TLDS_TREE)

        if self.phantom:
            self.phantom.get(response.url)

            # Collect whole DOM of the webpage including embedded iframes
            with open(
                    os.path.join(PHANTOM["JS_PATH"],
                                 "get_iframes_content.js")) as js:
                get_bod_w_iframes = js.read()
            bod_w_iframes = self.phantom.execute_script(get_bod_w_iframes)
            response._set_body(bod_w_iframes.encode('utf-8'))

            # Try to scroll and unfold page
            self.log("Start PhantomJS scrolling and unfolding", logging.INFO)
            with open(
                    os.path.join(PHANTOM["JS_PATH"],
                                 "scrolldown_and_unfold.js")) as js:
                try:
                    signal.signal(signal.SIGALRM, timeout_alarm)
                    signal.alarm(self.ph_timeout + 30)
                    timedout = self.phantom.execute_async_script(
                        js.read(), self.ph_timeout, self.ph_idle_timeout,
                        self.ph_ajax_timeout)
                    signal.alarm(0)
                    if timedout:
                        raise SeleniumTimeout
                    self.log("Scrolling/Unfolding finished", logging.INFO)
                except SeleniumTimeout:
                    self.log(
                        "Scrolling/Unfolding timed-out (%ss)" %
                        self.ph_timeout, logging.WARNING)
                    self.errors += 1
                except WebDriverException as e:
                    err = json.loads(e.msg)['errorMessage']
                    self.log("Scrolling/Unfolding crashed: %s" % err,
                             logging.ERROR)
                    self.errors += 1
                except Exception as e:
                    self.log(
                        "Scrolling/Unfolding crashed: %s %s" % (type(e), e),
                        logging.ERROR)
                    self.errors += 1
                    return self._make_raw_page(response, lru)
            bod_w_iframes = self.phantom.execute_script(get_bod_w_iframes)
            response._set_body(bod_w_iframes.encode('utf-8'))

        # Cleanup pages with base64 images embedded that make scrapy consider them not htmlresponses
        if response.status == 200 and not isinstance(response, HtmlResponse):
            try:
                flags = response.flags
                if "partial" in flags:
                    flags.remove("partial")
                flags.append("cleaned")
                response = HtmlResponse(response.url,
                                        headers=response.headers,
                                        body=cleanupbase64images(
                                            response.body),
                                        flags=flags,
                                        request=response.request)
                self.log(
                    "WARNING: page with base64 embedded images was cleaned-up for links extraction"
                )
            except:
                pass

        if 300 < response.status < 400 or isinstance(response, HtmlResponse):
            return self.parse_html(response, lru)
        else:
            return self._make_raw_page(response, lru)

    def handle_error(self, failure, response=None):
        if response:
            p = self._make_raw_page(response, failure.request.url)
            p['error'] = error_name(failure.value)
            return p
        elif not "://www" in failure.request.url:
            return self._request(failure.request.url.replace('://', '://www.'))
        error = failure.getErrorMessage()
        self.log("ERROR : %s" % error, logging.ERROR)
        if PROXY and not PROXY.startswith(
                ':') and "OpenSSL.SSL.Error" in error:
            return self._request(failure.request.url, noproxy=True)
        self.errors += 1
        return

    def parse_html(self, response, lru):
        lrulinks = []
        # handle redirects
        realdepth = response.meta['depth']
        if 300 < response.status < 400:
            redir_url = response.headers['Location']
            if redir_url.startswith('/'):
                redir_url = "%s%s" % (lru_get_host_url(lru).strip('/'),
                                      redir_url)
            elif redir_url.startswith('../'):
                lrustart = lru[:lru.rfind('|p:')]
                while redir_url.startswith('../'):
                    lrustart = lrustart[:lrustart.rfind('|p:')]
                    redir_url = redir_url[3:]
                redir_url = "%s/%s" % (lru_to_url(lrustart + '|'), redir_url)
            elif redir_url.startswith(
                    './') or not redir_url.startswith('http'):
                redir_url = "%s%s" % (lru_get_path_url(lru).strip('/'),
                                      redir_url[1:])
            links = [{'url': redir_url}]
            response.meta['depth'] -= 1
        else:
            try:
                links = self.link_extractor.extract_links(response)
            except Exception as e:
                self.log(
                    "ERROR: links extractor crashed on %s: %s %s" %
                    (response, type(e), e), logging.ERROR)
                links = []
                self.errors += 1
        for link in links:
            try:
                url = link.url
            except AttributeError:
                url = link['url']
            try:
                lrulink = url_to_lru_clean(url, TLDS_TREE)
            except (ValueError, IndexError) as e:
                self.log("Error converting URL %s to LRU: %s" % (url, e),
                         logging.ERROR)
                continue
            lrulinks.append((url, lrulink))
            if self._should_follow(response.meta['depth'], lrulink) and \
                    not url_has_any_extension(url, self.ignored_exts):
                yield self._request(url)
        response.meta['depth'] = realdepth
        yield self._make_html_page(response, lru, lrulinks)

    def _make_html_page(self, response, lru, lrulinks):
        p = self._make_raw_page(response, lru)
        if STORE_HTML:
            p['body'] = Binary(response.body.encode('zip'))
        p['lrulinks'] = lrulinks
        return p

    def _make_raw_page(self, response, lru):
        p = self._new_page(response.url, lru)
        p['status'] = response.status
        p['size'] = len(response.body)
        if isinstance(response, HtmlResponse):
            p['encoding'] = response.encoding
        if response.meta.get('depth'):
            p['depth'] = response.meta['depth']
        if response.headers.get('content-type'):
            p['content_type'] = response.headers.get('content-type').partition(
                ';')[0]
        p['error'] = None
        return p

    def _new_page(self, url, lru=None):
        if lru is None:
            lru = url_to_lru_clean(url, TLDS_TREE)
        p = Page()
        p['url'] = url
        p['lru'] = lru
        p['depth'] = 0
        p['timestamp'] = int(time.time() * 1000)
        return p

    def _should_follow(self, depth, tolru):
        c1 = depth < self.maxdepth
        c2 = self.prefixes_trie.match_lru(tolru)
        return c1 and c2

    def _request(self, url, noproxy=False, **kw):
        kw['meta'] = {'handle_httpstatus_all': True, 'noproxy': noproxy}
        kw['callback'] = self.handle_response
        kw['errback'] = self.handle_error
        if self.cookies:
            kw['cookies'] = self.cookies
        if self.phantom:
            kw['method'] = 'HEAD'
        return Request(url, **kw)
Beispiel #28
0
    def get_applications_in_page(self, scroll_script):
        applications = []
        driver = None
        try:
            desired_capabilities = dict(DesiredCapabilities.PHANTOMJS)
            desired_capabilities["phantomjs.page.settings.userAgent"] = useragent.get_random_agent(google_prop.user_agent_list_url)
            service_args = ['--load-images=no', '--proxy=%s' % (proxy.get_random_proxy(google_prop.proxy_list_url))]
            driver = PhantomJS(desired_capabilities=desired_capabilities, service_args=service_args)
            # driver = Firefox(firefox_profile=self.fp, proxy=self.proxy)

            if self.proxy_test:
                driver.get('http://curlmyip.com/')
                ip = driver.find_element_by_xpath('//body//pre').text
                print('ip : [ ' + ip + ' ]')
                pass
            else:
                driver.get(self.url)
                driver.execute_script(scroll_script)

                acknowledge = 0
                done = False
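                # Poll the scraperLoadCompleted flag set by the injected
                # scroll_script and only proceed once it has stayed true for
                # self.acknowledgements consecutive checks, so late-loading
                # cards are not missed.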
                while not done:
                    scroll_finished = driver.execute_script("return scraperLoadCompleted")
                    if scroll_finished:
                        if acknowledge == self.acknowledgements:
                            done = driver.execute_script("return scraperLoadCompleted")
                            pass
                        else:
                            acknowledge += 1
                            pass
                        pass
                    else:
                        acknowledge = 0
                        pass
                    time.sleep(5)  # Wait before retry
                    pass

                product_matrix = driver.find_elements_by_class_name("card")
                for application in product_matrix:
                    extracted_application = self.extract_application_data(application)
                    # if extracted_application['app_price'] != -1:
                    applications.append(extracted_application)
                    #pass
                    pass
                pass
            driver.quit()
            pass

        except Exception as e:
            if driver is not None:
                driver.quit()
                pass

            if self.attempt < self.retries:
                self.attempt += 1
                time.sleep(10)
                print 'retry : url [ ' + self.url + ' ] + | attempt [ ' + str(self.attempt) + ' ] | error [ ' + str(e) + ' ]'
                applications = self.get_applications_in_page(scroll_script)
                pass
            else:
                print('fail : url [ ' + self.url + ' ] | error [ ' + str(e) + ' ]')
                pass
            pass
        return applications
        pass
class CamaraCGCrawler(object):
    """ Camara CG Ementa Crawler """
    def __init__(self, starting_year):
        self.base_url = "http://187.115.174.90:8080/ScanLexWeb"
        self.starting_year = starting_year
        self.browser = None

    @staticmethod
    def get_ementa_id(published_date, ementa_type, ementa_doc_number,
                      ementa_situation):
        """ Return the Ementa Unique Id """
        return "%s#%s#%s#%s" % (datetime.strftime(
            published_date,
            "%Y-%m-%d"), ementa_type, ementa_doc_number, ementa_situation)

    def get_all_ementas_summary(self):
        """ Yield the next ementa information row """

        browser_table = self.browser.find_element_by_id(
            "frmMenu:tabEmentas_data")
        bs_ementa_table = BeautifulSoup(
            browser_table.get_attribute("innerHTML"))

        for row in bs_ementa_table.find_all("tr"):
            cols = row.find_all("td")
            if len(cols) == 6:
                published_date = datetime.strptime(
                    cols[0].span.text.encode("utf-8"), "%d/%m/%Y")
                doc_number = int(cols[1].span.text.encode("utf-8"))
                title = cols[2].span.text.encode("utf-8")
                ementa_type = cols[3].span.text.encode("utf-8")
                ementa_situation = cols[4].span.text.encode("utf-8")
                details_js = cols[5].a['onclick'].encode("utf-8")

                if published_date > datetime.now():
                    continue

                yield published_date, doc_number, title, ementa_type, ementa_situation, details_js

    def get_ementa_details(self, ementa_details_js):
        """ Crawl the second ementa page """

        # Waiting...
        _ = WebDriverWait(self.browser, 30).until(
            EC.visibility_of_element_located(
                (By.ID, "frmfuncao:j_idt13_content")))
        _ = WebDriverWait(self.browser, 30).until(
            EC.visibility_of_element_located(
                (By.ID, "frmfuncao:tabProponentes")))

        # Get Ementail Details
        bs_ementa_details = BeautifulSoup(self.browser \
            .find_element_by_id("frmfuncao:j_idt13_content").get_attribute("innerHTML"))

        rows = bs_ementa_details.find_all("tr")

        source = rows[3].td.text
        main_theme = rows[7].td.text
        sys_enter_date = datetime.strptime(rows[9].td.text, "%d/%m/%Y")
        approval_date = datetime.strptime(rows[11].td.text, "%d/%m/%Y")
        process_number = int(rows[15].td.text or "-1")
        autograph_number = int(rows[19].td.text or "-1")
        process_year = int(rows[21].td.text or "-1")
        has_image = rows[23].td.text == "Sim"

        # Get Proponent names
        bs_proponent = BeautifulSoup(
            self.browser.find_element_by_id(
                "frmfuncao:tabProponentes").get_attribute("innerHTML"))

        proponents = ",".join(
            [col.text for col in bs_proponent.find_all("td")])

        return source, proponents, main_theme, sys_enter_date, approval_date, process_number, \
            autograph_number, process_year, has_image

    def next_ementa(self, select_curs):
        """ Iterate in the years onwards and collect all the ementas """

        try:
            LOGGER.info("Opening Browser")
            self.browser = PhantomJS()

            LOGGER.info("GET [%s]", self.base_url)
            self.browser.maximize_window()

            cur_year = int(datetime.now().year)

            # Define the initial collection year
            select_curs.execute(
                "SELECT EXTRACT (YEAR FROM MAX(published_date)) FROM ementas;")
            last_exec_year = select_curs.fetchone()
            if last_exec_year:
                collection_year = max(self.starting_year, last_exec_year[0])
            else:
                collection_year = self.starting_year

            all_proponents = [
                "ANDERSON MAIA", "Afonso Alexandre Régis",
                "Alcides Cavalcante", "Alcindor Villarim", "Aldo Cabral",
                "Alexandre do Sindicato", "Antonio Pereira",
                "Antônio Alves Pimentel Filho", "Aragão Júnior",
                "Bruno Cunha Lima Branco", "Bruno Gaudêncio", "Buchada",
                "Cassiano Pascoal", "Cozete Babosa",
                "Cássio Murilo Galdino de Araujo", "Daniella Ribeiro",
                "Dr. Nunes", "Executivo", "Fabrinni Brito",
                "Fernando carvalho", "Francisco Dantas Lira",
                "Galego do Leite", "Inacio Falcao", "Ivan Batista",
                "Ivonete Ludgerio", "Joao Dantas", "Josimar Henrique da Silva",
                "José Marcos Raia ", "José Ribamar", "João Dantas",
                "Jóia Germano", "Laelson Patricio", "Lafite",
                "Lindaci Medeiros Nápolis", "Lourdes Costa", "Lula Cabral",
                "Marcos Marinho", "Maria Lopes Barbosa", "Marinaldo Cardoso",
                "Metuselá Agra", "Miguel Rodrigues da Silva",
                "Miguel da Construção", "Napoleão Maracajá",
                "Nelson Gomes Filho", "Olimpio Oliveira", "Orlandino Farias",
                "Paulo Muniz", "Paulo de Tarso", "Peron Ribeiro Japiassú",
                "Renato Feliciano", "Rodolfo Rodrigues",
                "Rodrigo Ramos Victor", "Romero Rodrigues", "Rostand Paraíba",
                "Rômulo Gouveia", "Saulo Germano", "Saulo Noronha", "Tia Mila",
                "Tovar Correia Lima", "Vaninho Aragão",
                "Veneziano Vital do rego", "Walter Brito Neto", "Todos"
            ]

            while collection_year <= cur_year:

                for i_prop in range(len(all_proponents)):

                    ementa_prop = all_proponents[i_prop].decode("utf-8")

                    self.browser.get(self.base_url)

                    # Waiting...
                    WebDriverWait(self.browser, 30).until(
                        EC.element_to_be_clickable((By.ID, "frmMenu:button1")))

                    LOGGER.info("Collecting Ementas from [%d][%s - %d/%d]",
                                collection_year, ementa_prop, i_prop + 1,
                                len(all_proponents))

                    # Set Year
                    year_field = self.browser.find_element_by_id("frmMenu:ano")
                    year_field.send_keys(collection_year)

                    # Set Proponent
                    proponent_field = self.browser.find_element_by_id(
                        "frmMenu:autoridade")
                    proponent_field.send_keys(ementa_prop)

                    # Submit the form
                    self.browser.find_element_by_id("frmMenu:button1").click()

                    # Waiting...
                    # _ = WebDriverWait(self.browser, 60).until(EC.visibility_of_element_located((By.ID, "frmMenu:tabEmentas_data")))
                    time.sleep(3)

                    for published_date, document_number, title, ementa_type, ementa_situation, ementa_details_js in self.get_all_ementas_summary(
                    ):
                        ementa_id = self.get_ementa_id(published_date,
                                                       ementa_type,
                                                       document_number,
                                                       ementa_situation)

                        select_curs.execute("""
                            SELECT ementa_id
                            FROM ementas
                            WHERE ementa_id = '%s';
                            """ % ementa_id)

                        if not select_curs.fetchone():
                            # Run the details script
                            self.browser.execute_script(ementa_details_js)
                            ementa_source, proponents, main_theme, sys_enter_date, approval_date, \
                                process_number, autograph_number, process_year, has_image = self.get_ementa_details(ementa_details_js)

                            # Come back to the table page
                            self.browser.back()

                            # Waiting...
                            _ = WebDriverWait(self.browser, 60).until(
                                EC.visibility_of_element_located(
                                    (By.ID, "frmMenu:tabEmentas_data")))

                            yield ementa_id, published_date, ementa_type, document_number, title, \
                                ementa_source, proponents, ementa_situation, main_theme, sys_enter_date, \
                                approval_date, process_number, autograph_number, process_year, has_image

                LOGGER.info("DONE [%d]", collection_year)

                self.browser.back()

                collection_year += 1

        finally:
            if self.browser:
                self.browser.quit()
Beispiel #30
0
class Client(object):
    """Client HTTP pour tester fonctionnellement Strass

    Adapteur du pilote Selenium, avec une interface inspirée de Nightwatch.js,
    et quelques paramètres spécifiques à Strass."""

    def __init__(self):
        self.driver = PhantomJS()
        self.driver.set_window_size(1120, 550)

    def __del__(self):
        self.driver.quit()

    def get(self, query=None):
        server = os.environ.get('STRASS_TEST_SERVER', 'http://localhost:8000')
        url = server + (query or '/')
        self.driver.get(url)
        return self

    def find(self, selector):
        return self.driver.find_element_by_css_selector(selector)

    def click(self, selector):
        self.find(selector).click()
        return self

    def fill(self, selector, value):
        if isinstance(value, datetime.date):
            self.fill(selector + ' input.day', str(value.day))
            self.fill(selector + ' input.month', str(value.month))
            self.fill(selector + ' input.year', str(value.year))
        else:
            control = self.find(selector)
            try:
                control.clear()
            except selexc.InvalidElementStateException:
                # We are presumably trying to clear an input[type=file], which is not allowed; skip it.
                pass
            control.send_keys(value)
        return self

    def select(self, selector, value):
        Select(self.find(selector)).select_by_value(value)
        return self

    def submit(self, selector='#document button[type=submit]'):
        return self.click(selector)

    def close(self):
        self.driver.close()
        if self.driver.window_handles:
            self.driver.switch_to.window(self.driver.window_handles[0])
        self.driver.set_window_size(1120, 550)
        return self

    def screenshot(self, filename):
        self.driver.get_screenshot_as_file(filename)
        sys.stderr.write("Capture d'écran enregistrée dans %r\n" % (filename,))
        return self

    def save(self, filename):
        with open(filename, 'w') as fo:
            fo.write(self.driver.page_source)
        sys.stderr.write("HTML enregistré dans %r\n" % (filename,))
        return self

    def __getattr__(self, name):
        return getattr(self.driver, name)
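
# A minimal usage sketch (not from the original source); the selectors and file
# name below are hypothetical and only illustrate the fluent, Nightwatch.js-style
# chaining this Client exposes:
#
#     client = Client()
#     client.get('/members/new') \
#           .fill('#id_name', 'Jean Dupont') \
#           .select('#id_role', 'leader') \
#           .submit()
#     client.screenshot('new_member.png')
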
class RequestUtil:
    __browserAgent = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0'

    def __init__(self):
        self.cookies = ''
        self._lock = threading.RLock()

    def http_get_request(self, url, referer, timeout=''):
        self._lock.acquire()
        cookie = cookielib.CookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie),
                                      SmartRedirectHandler())
        urllib2.install_opener(opener)
        headers = {
            'User-Agent': self.__browserAgent,
            'Referer': referer,
            'Cache-Control': 'max-age=0',
            'Accept': '*/*',
            'Connection': 'Keep-Alive',
            'Accept-encoding': 'gzip'
        }
        req = urllib2.Request(url=url, headers=headers)
        if timeout == '':
            open = urllib2.urlopen(req)
        else:
            open = urllib2.urlopen(req, timeout=timeout)
        if self.cookies == '':
            for item in cookie:
                self.cookies = self.cookies + item.name + '=' + item.value + ';'
            self.cookies = self.cookies[:-1]
        if url != open.url:
            req = urllib2.Request(url=open.url, headers=headers)
        self._lock.release()
        return (open, req)

    def http_post_request(self, url, datas, referer, timeout=''):
        self._lock.acquire()
        postdata = urllib.urlencode(datas)
        headers = {
            'User-Agent': self.__browserAgent,
            'Referer': referer,
            'Content-Type': 'application/x-www-form-urlencoded',
            'Cache-Control': 'no-cache',
            'Accept': '*/*',
            'Connection': 'Keep-Alive',
            'Accept-encoding': 'gzip',
            'Cookie': self.cookies
        }
        req = urllib2.Request(url=url, data=postdata, headers=headers)
        req.get_host()
        if timeout == '':
            open = urllib2.urlopen(req)
        else:
            open = urllib2.urlopen(req, timeout=timeout)
        if url != open.url:
            req = urllib2.Request(url=open.url, headers=headers)
        self._lock.release()
        return (open, req)

    def http_get(self, url, refer='https://www.baidu.com'):
        return self.http_get_request(url, refer, 60)

    def http_post(self, url, datas, refer='https://www.baidu.com'):
        return self.http_post_request(url, datas, refer, 60)

    def http_post_request2(self, url, datas, timeout=''):
        if timeout == '':
            open = urllib2.urlopen(url, datas)
        else:
            open = urllib2.urlopen(url, datas, timeout=timeout)
        data = open.read()
        return data

    def http_post2(self, url, datas):
        return self.http_post_request2(url, datas, 300)

    def create_phandomjs(self, service_args, caps, timeout=30):
        self.driver = PhantomJS(desired_capabilities=caps,
                                service_args=service_args)
        self.driver.set_page_load_timeout(timeout)
        self.driver.set_script_timeout(timeout)
        self.driver.implicitly_wait(timeout)

    def close_phandomjs(self):
        try:
            self.driver.quit()
        except:
            pass

    def http_get_phandomjs(self,
                           url,
                           refer='https://www.baidu.com',
                           timeout=1000):
        caps = dict(DesiredCapabilities.PHANTOMJS)
        caps['browserName'] = 'chrome'
        caps["phantomjs.page.settings.resourceTimeout"] = timeout
        caps["phantomjs.page.settings.loadImages"] = False
        caps["phantomjs.page.settings.userAgent"] = (self.__browserAgent)
        caps["phantomjs.page.customHeaders.Referer"] = (refer)

        service_args = []
        service_args.append('--load-images=no')
        service_args.append('--disk-cache=yes')
        service_args.append('--cookies-file=')

        self.create_phandomjs(timeout=timeout,
                              service_args=service_args,
                              caps=caps)
        self.driver.get(url)
        return self.driver.page_source
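
# Usage sketch (not in the original snippet): fetch a JavaScript-rendered page
# through PhantomJS and a plain page through urllib2; the URL is illustrative.
#
#     util = RequestUtil()
#     html = util.http_get_phandomjs('https://example.com/')
#     util.close_phandomjs()
#     response, request = util.http_get('https://example.com/')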
Beispiel #32
0
class RouteStatistic(object):
    def __init__(self,
                 url,
                 phantomjs=None,
                 resolution=None,
                 ya_class=None,
                 screen_path=None,
                 screen_pattern=None,
                 csv_path=None):
        self.url = url

        self.phantomjs = phantomjs or DEFAULT_PHANTOMJS
        assert os.path.isfile(self.phantomjs), "phantomjs binary not found"

        resolution = resolution or FULLHD
        assert isinstance(resolution, (list, tuple))
        assert len(resolution) == 2

        self.ya_class = ya_class or DEFAULT_YA_CLASS
        self.screen_path = screen_path or PATH

        self.screen_pattern = screen_pattern or '%s.png'
        assert '%s' in self.screen_pattern

        self.csv_path = csv_path or os_join(PATH, 'statistic.csv')

        self.driver = PhantomJS(self.phantomjs)
        self.driver.set_window_size(*resolution)

    def track(self):
        self.driver.get(self.url)
        WebDriverWait(self.driver, 5).until(is_class_exist(self.ya_class))
        time = self.driver.find_element_by_class_name(self.ya_class).text
        now = datetime.now()
        self._save_screenshot(now)
        self._update_file(now, *[t.strip() for t in time.split(',')])

    def _save_screenshot(self, now):
        if '%s' in self.screen_pattern:
            file_name = self.screen_pattern % (now, )
        else:
            file_name = self.screen_pattern
        file_name = os_join(self.screen_path, file_name)
        self.driver.save_screenshot(file_name)

    def _update_file(self, now, time, distance):
        with open(self.csv_path, 'a') as csvfile:
            writer = csv.writer(csvfile, delimiter=str('\t'))
            writer.writerow([
                now,
                time,
                distance,
            ])

    def __call__(self):
        return self.track()

    def __del__(self):
        if hasattr(self, 'driver') and self.driver:
            self.driver.service.process.send_signal(signal.SIGTERM)
            self.driver.quit()
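
# Usage sketch (not part of the original class): every constructor argument other
# than url falls back to a module-level default (DEFAULT_PHANTOMJS, FULLHD,
# DEFAULT_YA_CLASS, PATH, and a csv_path of statistic.csv), so a minimal run only
# needs the page URL; the URL below is hypothetical.
#
#     stat = RouteStatistic('https://example.com/route')
#     stat()  # saves a screenshot and appends (timestamp, time, distance) to the csv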