Example #1
class Translator(threading.Thread):
    def __init__(self, queue, executable_path=None,
                              desired_capabilities=None,
                              service_args=None,
                              google_translate_url=config['google_translate_url'],
                              window_size=config['window_size']):

        super(Translator, self).__init__()
        self._queue = queue

        kwargs = {}

        if executable_path is not None:
            kwargs['executable_path'] = executable_path
        if desired_capabilities is not None:
            kwargs['desired_capabilities'] = desired_capabilities
        if service_args is not None:
            kwargs['service_args'] = service_args

        self._driver = PhantomJS(**kwargs)
        self._driver.set_window_size(*window_size)
        self._driver.get(google_translate_url)

    def run(self):
        while True:
            task = self._queue.get()

            if task is None:
                self._queue.task_done()
                self._driver.quit()
                break

            task.do(self._driver)
            self._queue.task_done()
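A minimal driver sketch for the worker above, assuming the module-level config used by the constructor defaults is available; the PrintTitleTask class and the queue sizes below are illustrative assumptions, and the None sentinel is the example's own shutdown protocol:
import queue

class PrintTitleTask:
    # Hypothetical task: read the current page title from the shared driver.
    def do(self, driver):
        print(driver.title)

task_queue = queue.Queue()
workers = [Translator(task_queue) for _ in range(2)]
for worker in workers:
    worker.start()

for _ in range(5):
    task_queue.put(PrintTitleTask())

# One None sentinel per worker: run() marks it done, quits the driver and exits.
for _ in workers:
    task_queue.put(None)
task_queue.join()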
Example #2
class SeleniumTestCase(LiveServerTestCase):
    def _pre_setup(self):
        super(SeleniumTestCase, self)._pre_setup()
        self.driver = PhantomJS()

    def _post_teardown(self):
        self.driver.quit()
        super(SeleniumTestCase, self)._post_teardown()

    def login(self, username='******', password='******', url='login'):
        """
        Login to the server and be authenticated
        """
        self.open(reverse(url))
        self.driver.find_element_by_id("id_username").clear()
        self.driver.find_element_by_id("id_username").send_keys(username)
        self.driver.find_element_by_id("id_password").clear()
        self.driver.find_element_by_id("id_password").send_keys(password)
        self.driver.find_element_by_id("submit-id-login").click()

    def open(self, url):
        self.driver.get("%s%s" %(self.live_server_url, url))

    def is_element_present(self, how, what):
        try:
            self.driver.find_element(by=how, value=what)
        except NoSuchElementException:
            return False
        return True
Example #3
    def _init_robot(self, id):
        robot = WDriver()
        logging.debug("initialize")
        self.robots.update({str(id): robot})
        logging.debug("get facebook.com")
        robot.get('http://fb.com')

        logging.debug("login")
        robot.find_element_by_name('email').send_keys('*****@*****.**')
        robot.find_element_by_name('pass').send_keys('2855930022040')
        robot.find_element_by_name('pass').send_keys(Keys.RETURN)

        for index in range(len(self.remain_ids)):
            self.lock.acquire()
            user_id = self.remain_ids.pop()
            self.lock.release()
            try:
                self.get_name_for_id(robot, user_id)
            except:
                logging.debug("error while updating record with id=%s" % str(user_id))
                self.error_ids.add(user_id)
            else:
                self.done_ids.add(user_id)
        robot.close()
        return
Example #4
def export(plot, filename, width=800, height=600):
    """
    Export plot to file.

    Args:
        plot (quorra.Plot): Quorra plot object to export.
        filename (str): Filename to export to (a .png extension is appended).
        width (int): Width for plot (pixels).
        height (int): Height for plot (pixels).
    """
    global _phantom, __templates__, __cwd__
    if _phantom is None:
        from selenium.webdriver import PhantomJS
        _phantom = PhantomJS(service_log_path=os.path.devnull)
    tmpl = os.path.join(__templates__, 'export.html')
    exp = os.path.join(__cwd__, '.' + str(uuid.uuid1()) + '.html')
    try:
        with open(tmpl, 'r') as fi, open(exp, 'w') as fo:
            dat = fi.read()
            dat = dat.replace('var plot = undefined;', 'var plot = {};'.format(str(plot)))
            dat = dat.replace('width: 800px;', 'width: {}px;'.format(width))
            dat = dat.replace('height: 500px;', 'height: {}px;'.format(height))
            fo.write(dat)
        _phantom.get('file://' + exp)
        _phantom.save_screenshot(filename.replace('.png', '') + '.png')
    finally:
        if os.path.exists(exp):
            os.remove(exp)
    return
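A usage sketch; export() only needs an object whose str() is the JavaScript plot definition substituted into the template's var plot = ...; line, so the plot construction below is a placeholder assumption:
plot = build_quorra_plot()  # hypothetical helper returning a quorra.Plot built elsewhere
export(plot, 'figure', width=1024, height=768)  # writes figure.png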
Example #5
    def selenium(self, webdriverOption=0):
        """
        # 调用浏览器下载,适用于任何情形
        :return:
        """
        if not self.url[:4] == "http":
            return None

        driver = None
        if webdriverOption == 0:
            from selenium.webdriver import PhantomJS

            driver = PhantomJS()
        elif webdriverOption == 1:
            from selenium.webdriver import Chrome

            driver = Chrome()
        elif webdriverOption == 2:
            from selenium.webdriver import Firefox

            driver = Firefox()

        if not driver:
            print(u"-->DownLoader->Selenium driver初始化出错,请检查运行环境或webdriverOption选项")

        driver.get(self.url)
        src = driver.page_source
        driver.quit()
        self.pageSource = src
        return src
Example #6
def test_plotly(remove_build):
    """Tests plotly."""
    viz = Plotly()
    ctrl = Nouislider()
    ctrl2 = Button()

    path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'build')
    layout = Layout(directory=path)
    layout.add(viz)
    layout.add_sidebar(ctrl)
    layout.add_sidebar(ctrl2)
    layout.subscribe(callback, ctrl.on_change)
    layout.subscribe(callback, ctrl2.on_click)
    layout.build()

    env = os.environ
    env['PYTHONPATH'] = '{}:{}'.format(os.getcwd(),
                                       os.environ.get('PYTHONPATH', ''))
    server = subprocess.Popen(os.path.join(path, 'src/server.py'), env=env)
    time.sleep(5)

    driver = PhantomJS()
    driver.get('http://localhost:9991')

    assert driver.title == 'Bowtie App'

    driver.quit()
    server.kill()
Example #7
    def scrape_statuses(self):
        headless_browser = PhantomJS()
        headless_browser.get(MTA_URL)
        soup = BeautifulSoup(headless_browser.page_source, "html.parser")
        for line_name in LINES:
            line = self.get_line(soup, line_name)
            self.lines.append(line)
Example #8
def generate_image(structure):
    image_path = os.path.join(mkdtemp(), 'okc.png')
    html_path = os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
        'okc.html',
    )
    url = 'file://{}'.format(html_path)
    driver = PhantomJS(service_log_path=mkstemp()[1])
    driver.set_window_size(2000, 500)
    driver.get(url)
    driver.execute_script('setText({});'.format(json.dumps(structure)))

    if random() > 0.4:
        driver.execute_script('hideForm();')
    elif random() > 0.5:
        driver.execute_script('uncheckForm();')

    driver.set_window_size(*driver.execute_script('return getSize();'))
    driver.save_screenshot(image_path)

    # Twitter will convert our beautiful screenshot to a JPEG unless we make it
    # think that we're using transparency for a reason, so set one pixel's alpha.
    img = Image.open(image_path)
    origin = img.getpixel((0, 0))
    new_origin = origin[:3] + (254,)
    img.putpixel((0, 0), new_origin)
    img.save(image_path)

    subprocess.check_call(['optipng', '-quiet', image_path])

    return image_path
Example #9
class KeywordTool(object):
    sources = {'google', 'youtube', 'bing', 'amazon', 'ebay', 'app-store'}

    def __init__(self, source='google', timeout=5):
        self.source = source
        self.base_url = None
        self.timeout = timeout
        self.driver = PhantomJS()
        self.driver.get(self.base_url)

    def search(self, search_term):
        if self.current_url != self.base_url:
            self.source = self.source  # forces page load
        self.driver.find_element_by_xpath(
            '//input[@id="edit-keyword"]').send_keys(search_term)
        self.driver.find_element_by_xpath(
            '//button[@id="edit-submit"]').click()
        """Wait for at least one element to load. In practice, most of them load. You can't get them all without scrolling."""
        element_not_present = EC.invisibility_of_element_located(
            (By.XPATH, '//td[@class="col-keywords"]//div'))
        WebDriverWait(self.driver, self.timeout).until(element_not_present)

    def parse(self):
        tree = html.fromstring(self.driver.page_source)
        L = tree.xpath('//td[@class="col-keywords"]//text()')
        L = map(lambda s: s.strip(), ''.join(L).split('\n'))
        return [s for s in L if s]

    def get_keywords(self, search_term, source='google'):
        if self.source != source:
            self.source = source
        self.search(search_term)
        return self.parse()

    @property
    def source(self):
        return self._source

    @source.setter
    def source(self, val):
        self._source = val if val in self.sources else 'google'
        if 'driver' in self.__dict__:
            self.driver.get(self.base_url)

    @property
    def base_url(self):
        return ''.join(['https://keywordtool.io/', self.source])

    @base_url.setter
    def base_url(self, val):
        pass

    @property
    def current_url(self):
        return self.driver.current_url

    @current_url.setter
    def current_url(self, val):
        pass
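A brief usage sketch for the class above; the search term is arbitrary and the driver is quit explicitly since the class never closes it itself:
kt = KeywordTool(timeout=8)
suggestions = kt.get_keywords('selenium phantomjs', source='youtube')
print(suggestions[:10])
kt.driver.quit()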
Example #10
def is_logged_in(browser: PhantomJS):
    browser.get('https://tiki.vn/sales/order/history/')
    full_name = browser.find_element_by_css_selector('.profiles > h6:nth-child(3)')
    if full_name.text:
        logger.info("You has been login with name : {}".format(full_name.text))
        return True
    else:
        return False
Example #11
def main():
    global HEAD
    if len(sys.argv) > 1:
        try: HEAD = int(sys.argv[1])
        except: HEAD = 10
    # test mirror list
    mirror_list = read_mirrors()
    for i in mirror_list:
        try:
            cururl = i
            print("Testing:",i)
            res = request.urlopen(i)
        except:
            print("Testing on",i,"failed")
            continue
        try:
            update_mirrors(cururl)
            break
        except:
            continue

    try: res
    except: raise Warning('All mirrors unavailable!')
    print('Available mirror:',cururl)

    # get vpn table
    countries = dict()
    dr = PhantomJS()
    dr.get(cururl)
    page = Selector(text=dr.page_source)\
            .xpath('.//td[@id="vpngate_inner_contents_td"]/'
                    'table[@id="vg_hosts_table_id"]//tr')

    if HEAD < len(page): page = page[:HEAD]

    print('Pagelen:',len(page))

    for vpn in page:
        if len(vpn.xpath('./td[@class="vg_table_header"]')) > 0:
            continue

        row = vpn.xpath('./td')
        country = row[0].xpath('./text()').extract_first()
        country = '_'.join(country.split(' '))
        ovpn = row[6].xpath('./a/@href').extract_first()

        if ovpn:
            if country in countries:
                countries[country] += 1
                get_ovpn(url=cururl+ovpn, save_to=country+'/'+str(countries[country]))
            else:
                countries[country] = 0
                if not os.path.exists(country):
                    os.mkdir(country)
                get_ovpn(url=cururl+ovpn, save_to=country+'/'+str(countries[country]))

    dr.quit()
Example #12
def getHtmlSource(url, time=10):
    driver = PhantomJS(service_args=[
        '--ignore-ssl-errors=true', '--ssl-protocol=any',
        '--web-security=false'
    ])
    driver.get(url)
    # Note: WebDriverWait only blocks once .until() is called with a condition,
    # so this line does not actually wait.
    WebDriverWait(driver, timeout=time)
    source = driver.page_source
    # driver.save_screenshot('a.png')
    driver.quit()
    return source
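Constructing a WebDriverWait by itself does not block; a variant that actually waits for a specific element before grabbing the source might look like this (the default CSS selector is a placeholder, and the expected_conditions/By imports are spelled out):
from selenium.webdriver import PhantomJS
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

def getHtmlSourceWhenReady(url, timeout=10, ready_selector='body'):
    driver = PhantomJS(service_args=[
        '--ignore-ssl-errors=true', '--ssl-protocol=any',
        '--web-security=false'
    ])
    try:
        driver.get(url)
        # Block until the element matching ready_selector is present, or raise TimeoutException.
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, ready_selector)))
        return driver.page_source
    finally:
        driver.quit()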
Example #13
class Crawler:
    def __init__(self, timeout=20, phantomjs_cfg_file='python-utils/config/phantomjs_cfg.json', use_cfg_file=False, proxy_pool_server='http://127.0.0.1:15110'):
        self.timeout = timeout
        if use_cfg_file:
            phantomjs_service_args = ['--config={}'.format(phantomjs_cfg_file)]
        else:
            _, proxy_type, proxy, proxy_auth = get_proxy(proxy_pool_server)
            phantomjs_service_args = [
                '--proxy-type={}'.format(proxy_type),
                '--proxy={}'.format(proxy),
                '--proxy-auth={}'.format(proxy_auth),
            ]

        self.driver = PhantomJS(
            desired_capabilities=self.new_desired_capabilities(),
            service_args=phantomjs_service_args)
        self.check_client_info()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        self.driver.quit()

    @contextmanager
    def wait_for_page_load(self, old_element):
        yield
        WebDriverWait(self.driver, self.timeout).until(EC.staleness_of(old_element))

    def new_desired_capabilities(self, user_agent=default_ua):
        desired_capabilities = DesiredCapabilities.PHANTOMJS.copy()
        if not user_agent:
            user_agent = ua.random
        desired_capabilities["phantomjs.page.settings.userAgent"] = user_agent
        return desired_capabilities

    def check_client_info(self):
        url='http://www.whoishostingthis.com/tools/user-agent/'
        self.driver.get(url)
        ip_addr = get_xpath_element(self.driver, '//*[@id="user-agent"]/div[2]/span').text.strip()
        user_agent = get_xpath_element(self.driver, '//*[@id="user-agent"]/div[1]').text.strip()
        logger.info('IP: {}, User-Agent: {}'.format(ip_addr, user_agent))
        if self.wrong_ip(ip_addr):
            logger.error('Proxy not set correctly!')
            sys.exit(-1)

    def wrong_ip(self, ip_addr):
        if ip_addr.startswith('166.111.') or ip_addr.startswith('59.66.') or ip_addr.startswith('101.5.') or ip_addr.startswith('101.6.'):
            return True
        else:
            return False
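The wait_for_page_load context manager above implements the usual "wait for the old element to go stale" pattern; a hedged usage sketch, assuming the PhantomJS config file referenced in __init__ exists (the URL and link lookup are placeholders):
with Crawler(use_cfg_file=True) as crawler:
    crawler.driver.get('http://example.com/')                 # placeholder URL
    old_body = crawler.driver.find_element_by_tag_name('body')
    link = crawler.driver.find_element_by_xpath('//a[1]')     # placeholder link
    with crawler.wait_for_page_load(old_body):
        link.click()
    print(crawler.driver.current_url)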
Example #14
def main(argv=sys.argv[1:]):
    parser = argparse.ArgumentParser()
    parser.add_argument('--url', default='http://127.0.0.1:8000/static/index.html')
    args = parser.parse_args(argv)

    url = args.url

    browser = WebDriver()
    browser.get(url)
    tags = browser.find_elements_by_css_selector('li')
    for tag in tags:
        print(tag.text)
    browser.close()
Example #15
def catalog_url(url='http://www.meitun.com/'):
    # the catalog is loaded via AJAX, so use PhantomJS
    driver = PhantomJS()
    driver.get(url)
    driver.maximize_window()
    mov_ele = driver.find_element_by_css_selector('.nav>ul>li:nth-child(1)')
    # move the mouse to the lazily loaded menu element and perform the hover
    ActionChains(driver).move_to_element(mov_ele).perform()
    time.sleep(3)
    response = driver.page_source
    driver.quit()
    # use pyquery to parse the page source; it is faster
    d = pq(response)
    return map(lambda x: 'http:' + pq(x).attr('href'), d.find('.cg-pdts a'))
Example #16
def main(argv=sys.argv[1:]):
    parser = argparse.ArgumentParser()
    parser.add_argument('--url',
                        default='http://127.0.0.1:8000/static/index.html')
    args = parser.parse_args(argv)

    url = args.url

    browser = WebDriver()
    browser.get(url)
    tags = browser.find_elements_by_css_selector('li')
    for tag in tags:
        print(tag.text)
    browser.close()
Example #17
    def onegoogolePR(self, url):
        '''Return the Google PageRank for a single URL.'''
        prUrl = 'http://pr.chinaz.com'  # Google PR lookup page
        driver = PhantomJS()
        driver.get(prUrl)
        driver.find_element_by_id('PRAddress').send_keys(url)
        driver.find_element_by_class_name('search-write-btn').click()
        try:
            imgsrc = driver.find_element_by_css_selector('span#pr>img').get_attribute('src')
            pr = search(r'\d', imgsrc).group()
        except:
            pr = 'no data'
        driver.quit()
        return pr
Example #18
class AdvertisementAdvancedViewTests(LiveServerTestCase):
    def setUp(self):
        self.driver = PhantomJS()

        self.user = User.objects.create_user('admin', '*****@*****.**', 'pass')
        self.user.save()

        self.provider = Provider(
            name='provider',
            user=self.user,
        )
        self.provider.save()

        self.provider_adverts = mommy.make(Advertisement, _quantity=20, provider=self.provider)

    def tearDown(self):
        self.driver.quit()

    def open(self, url):
        self.driver.get("%s%s" % (self.live_server_url, url))

    def test_side_ad_display(self):
        """
        Test that the side ads display properly
        """
        self.open(reverse('advertisements.views.side_ads'))

        self.assertEqual(len(self.driver.find_elements_by_xpath("//a")), 4)

        self.driver.find_element_by_xpath("//a[1]/img")
        self.driver.find_element_by_xpath("//a[2]/img")
        self.driver.find_element_by_xpath("//a[3]/img")
        self.driver.find_element_by_xpath("//a[4]/img")

        self.assertNotEqual(self.driver.find_element_by_xpath("//a[1]").get_attribute("href"), '')
        self.assertNotEqual(self.driver.find_element_by_xpath("//a[2]").get_attribute("href"), '')
        self.assertNotEqual(self.driver.find_element_by_xpath("//a[3]").get_attribute("href"), '')
        self.assertNotEqual(self.driver.find_element_by_xpath("//a[4]").get_attribute("href"), '')

    def test_top_ad_display(self):
        """
        Test that the top ad displays properly
        """
        self.open(reverse('advertisements.views.top_ad'))

        self.assertEqual(len(self.driver.find_elements_by_xpath("//a")), 1)
        self.driver.find_element_by_xpath("//a/img")
        self.assertNotEqual(self.driver.find_element_by_xpath("//a").get_attribute("href"), '')
Example #19
class Client:
    def __init__(self, ig_id):
        self.b = PhantomJS()
        self.ig_id = ig_id
        self.b.get('https://instagram.com/%s' % ig_id)

    def close(self):
        self.b.close()

    def get_media(self) -> list:
        js = self.b.execute_script('return window._sharedData;')
        ed = js['entry_data']
        pp = ed['PostPage'][0]
        g = pp['graphql']
        sc = g['shortcode_media']
        if sc['__typename'] == 'GraphSidecar':
            edges = sc['edge_sidecar_to_children']['edges']
            medias = list(
                map(
                    lambda x: {
                        'id': x['node']['id'],
                        'url': x['node']['display_url'],
                        'caption': x['node']['accessibility_caption']
                    }, edges))
        elif sc['__typename'] == 'GraphImage':
            medias = [{
                'id': sc['id'],
                'url': sc['display_url'],
                'caption': sc['accessibility_caption']
            }]
        else:
            # other media types (e.g. GraphVideo) would otherwise leave medias unbound
            medias = []
        return list(
            filter(
                lambda x: 'person' in x['caption'] or 'people' in x['caption'],
                medias))

    def get_user(self) -> dict:
        js = self.b.execute_script('return window._sharedData;')
        ed = js['entry_data']
        pp = ed['ProfilePage'][0]
        g = pp['graphql']
        return g['user']

    def get_posts(self) -> set:
        ps = self.b.find_elements_by_css_selector('a[href^="/p/"]')
        return set(map(lambda x: x.get_attribute('href'), ps))

    def scroll(self):
        self.b.execute_script('window.scroll(0, document.body.scrollHeight);')
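A usage sketch for the profile-level helpers; the profile id is a placeholder, and note that get_media() expects window._sharedData to hold a PostPage entry, so it only applies after navigating to an individual post URL:
client = Client('instagram')     # placeholder profile id
user = client.get_user()
client.scroll()                  # trigger lazy-loading of more thumbnails
post_urls = client.get_posts()
print(user.get('username'), len(post_urls))
client.close()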
Example #20
def run_get_logic(driver: PhantomJS, command_id, token):
    if not token:
        return {"code": 103, "public": "Session troubles!"}
    driver.add_cookie({
        'name': 'token',
        'value': token,
        'domain': "." + command_id.split(":")[0],
        'path': '/'
    })
    driver.get("http://{}/cabinet".format(command_id))
    try:
        flag_there = driver.find_element_by_xpath('//html//body//div//h5//i')
        flag_container = flag_there.get_attribute('innerHTML')
        return flag_container
    except NoSuchElementException as e:
        return "error_no_flag_in_cabinet"
Example #21
def start(n, comic_url):
    urllists.append(comic_url)
    driver = PhantomJS()
    driver.get(comic_url)
    get_images_url(n, driver, comic_url)
    while True:
        try:
            driver.find_element_by_xpath(
                "//li[@id='next_item']/a[@id='mainControlNext']").click()
            comic_url = driver.current_url
            if comic_url not in urllists:
                urllists.append(comic_url)
                get_images_url(n, driver, comic_url)
                driver.find_element_by_xpath(
                    "//li[@id='next_item']/a[@id='mainControlNext']").click()
                # print n + '\t' + comic_url
        except:
            print('All done!')
            break
Example #22
def check_agree(link, soup):
    # Agree if asked to (click on accept)

    if soup.find('input',
                 {'id': 'ctl00_mainContentArea_disclaimerContent_yesButton'}):
        print("Agreeing the terms of use - please wait...")
        driver = PhantomJS('.\phantomjs.exe' if platform.
                           startswith('win32') else './phantomjs')
        driver.get(link)
        driver.find_element_by_id(
            'ctl00_mainContentArea_disclaimerContent_yesButton').click()
        for cookie in driver.get_cookies():
            s.cookies.set(cookie['name'], cookie['value'])
        driver.quit()
        resp_inner = s.get(link)
        soup = Soup(resp_inner.text, features="lxml")
        print("Done, now let's get back to the scraping process.")

    return soup
Example #23
    def on_start_again(self, url):
        driver = PhantomJS()
        driver.get(url)
        time.sleep(2)
        driver.maximize_window()
        t = driver.find_element_by_css_selector('.page-txt').text
        res_t = []
        if t:
            t = int(t.split('/')[1][:-1]) - 1  # get the page count
            # the number of page turns needed is the page count minus one
            while t:
                t -= 1
                move_ele = driver.find_element_by_css_selector('#next')
                # .perform() is required, otherwise the queued actions never run
                ActionChains(driver).move_to_element(move_ele).click().perform()
                time.sleep(1)
                res_t.append(driver.page_source)
        driver.quit()
        for item in res_t:
            self.step_first(item)
Example #24
class Premiumgeneratorlink(object):
	def __init__(self, url):
		self.url = url
		self.browser = PhantomJS()

	def get_link(self):
		try:
			self.browser.get('http://premiumgeneratorlink.com/')
			self.browser.find_element_by_name('link').send_keys(self.url)
			self.browser.find_element_by_xpath('//a[@class="input"]').click()
			wdw = WebDriverWait(self.browser, 10)
			wdw.until(EC.element_to_be_clickable((By.ID, 'check'))).click()
			wdw.until(EC.element_to_be_clickable((By.ID, 'generate'))).click()
			link = wdw.until(EC.visibility_of_element_located((By.XPATH, '//form[@class="center"]'))).get_attribute('action')
		except (WebDriverException, NoSuchElementException, TimeoutException):
			return False
		finally:
			self.browser.quit()
		return link
Example #25
def run_get_logic(driver: PhantomJS, comand_id, post, flag, cookies):
    if 'sessions' not in cookies:
        return {"code": MUMBLE, "public": "Session troubles!"}
    driver.add_cookie({
        'name': 'sessions',
        'value': cookies['sessions'],
        'domain': "." + comand_id.split(":")[0],
        'path': '/'
    })
    driver.get("http://{}/{}".format(comand_id, post))
    try:
        flag_there = driver.find_element_by_xpath('//li/a[@href="#"]')
        flag_container = flag_there.get_attribute('innerHTML')
        if flag in flag_container:
            return {"code": OK}
        else:
            return {"code": CORRUPT, "public": "Can't find my private data!"}
    except NoSuchElementException:
        return {"code": CORRUPT, "public": "Can't find my private data!"}
Example #26
def get_item(browser: PhantomJS, item: dict, check_inverter):
    price_expect = get_price(item['price'])
    max_retry = 5
    retry = 0
    while retry < max_retry:
        retry += 1
        browser.get(item.get('url'))
        item_title = browser.find_element_by_css_selector('#product-name')
        item_name = item_title.text

        item_price = browser.find_element_by_css_selector('#span-price')
        logger.info("{} -> {}".format(item_price.text, item_name))
        price_seller = get_price(item_price.text)
        screen_shot(browser=browser, file_name='buy.png', item_name=item_name)
        if price_seller <= price_expect:
            browser.find_element_by_css_selector('#\#mainAddToCart').click()
            return item_name
        else:
            logger.info("Retry : {}. {}".format(retry, item_title.text))
            time.sleep(check_inverter)
Example #27
class Leecherus(object):
	def __init__(self, url):
		self.url = url
		self.browser = PhantomJS()

	def get_link(self):
		try:
			self.browser.get('http://leecher.us')
			wdw = WebDriverWait(self.browser, 10)
			wdw.until(EC.visibility_of_element_located((By.NAME, 'link'))).send_keys(self.url)
			wdw.until(EC.element_to_be_clickable((By.XPATH, '//button[@class="subscribe"]'))).click()
			wdw.until(EC.element_to_be_clickable((By.XPATH, '//input[@class="subscribe"]'))).click()
			self.browser.switch_to_window(self.browser.window_handles[1])
			onclick = wdw.until(EC.element_to_be_clickable((By.ID, 'get_link'))).get_attribute('onclick')
		except (WebDriverException, NoSuchElementException, TimeoutException, IndexError):
			return False
		finally:
			self.browser.quit()
		m = re.search("'(http://[^']+)'", onclick)
		return m.group(1) if m else False
Example #28
    def post(self):
        id = request.values['page']
        page = Page.objects.get_or_404(id=id)
        # html = requests.get(page.baseurl).text
        screenshot = None
        try:
            phantom = PhantomJS(desired_capabilities={'acceptSslCerts': True},
                                service_args=['--web-security=false',
                                              '--ssl-protocol=any',
                                              '--ignore-ssl-errors=true'], port=8888)
            phantom.set_window_size(1024, 768)
            phantom.get(page.baseurl)
            html = phantom.page_source
            screenshot = phantom.get_screenshot_as_png()
            phantom.close()
        except Exception as ex:
            html = "error when i snap your page ... %s" % ex
        # save the snap once, whether the page loaded or an error message was recorded
        snap = Snap(html, datetime.datetime.now(), screenshot).save()
        page.update(push__snaps=snap)
        return jsonify({'id': "%s" % snap.id})
Example #29
def render(gist_id, commit):
    block_url = 'http://bl.ocks.org/' + gist_id
    d3_block_rec = {'gist_id': gist_id}
    try:
        driver = PhantomJS()
        driver.get(block_url)
        time.sleep(RENDER_DELAY)  # let it render
        fullpage_im = Image.open(BytesIO(driver.get_screenshot_as_png()))
        fimb = BytesIO()
        fullpage_im.save(fimb, 'png')
        d3_block_rec['fullpage_base64'] = base64.b64encode(fimb.getvalue())
        d3_block_rec['block_url'] = driver.current_url
    except Exception as e:
        # we got nothing
        with LittlePGer('dbname=' + DB_NAME, commit=commit) as pg:
            d3_block_rec['error'] = str(e)
            pg.insert('d3_block', values=d3_block_rec)
        exit(10)

    try:
        f = driver.find_element_by_xpath('//iframe')
        x, y = int(f.location['x']), int(f.location['y'])
        w, h = x + int(f.size['width']), y + int(f.size['height'])
        block_im = fullpage_im.crop((x, y, w, h))
        bimb = BytesIO()
        block_im.save(bimb, 'png')
        d3_block_rec['block_base64'] = base64.b64encode(bimb.getvalue())
        d3_block_rec['block_size'] = list(block_im.size)
    except Exception as e:
        # at least we got the fullpage im, save it
        with LittlePGer('dbname=' + DB_NAME, commit=commit) as pg:
            d3_block_rec['error'] = str(e)
            pg.insert('d3_block', values=d3_block_rec)
        exit(11)

    # all good, save everything
    with LittlePGer('dbname=' + DB_NAME, commit=commit) as pg:
        pg.insert('d3_block', values=d3_block_rec)
Example #30
def run_get_logic(driver: PhantomJS, team_ip, user, password):
    session_cookie = beacons_api.sign_in(team_ip, user, password)
    print('cookie: ' + session_cookie)
    beacons = beacons_api.get_all_user_beacons(team_ip, session_cookie)
    if not beacons:
        print('no beacons')
        return {"code": 103}
    print('beacons: ' + str(beacons))
    driver.get(f"http://{team_ip}:{SERVICE_PORT}/")
    driver.add_cookie({
        'name': 'session',
        'value': session_cookie,
        'domain': "." + team_ip,
        'path': '/'
    })
    for beacon_id in beacons:
        driver.get(f'http://{team_ip}:{SERVICE_PORT}/Beacon/{beacon_id}')
        print(driver.current_url)
        # print(driver.page_source)
        if beacon_id in driver.current_url:
            print('fine')
            return {"code": 101}
    return {'code': 110}
Example #31
    def get_video_src(self):

        #room url check
        o = urlparse(self.room_url)
        rule_key = o.netloc

        if rule_key in self.rules:
            cap = webdriver.DesiredCapabilities.PHANTOMJS

            cap["phantomjs.page.settings.resourceTimeout"] = 1000
            cap["phantomjs.page.settings.loadImages"] = True
            cap["phantomjs.page.settings.disk-cache"] = True
            cap["phantomjs.page.settings.userAgent"] = "Mozilla/5.0(iPhone;CPU iPhone OS 9_1 like Mac OSX) AppleWebKit / 601.1"
            ".46(KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1"
            driver = PhantomJS(self.driverPath["PhantomJS"],
                               desired_capabilities=cap)
            # 指定使用的浏览器
            # driver = webdriver.Firefox()
            driver.implicitly_wait(10)
            my_rule = self.rules[rule_key]

            url_prefix = my_rule["url_prefix"]
            driver.get("%s%s" % (url_prefix, o.path))
            try:
                result_video = driver.find_element_by_tag_name(
                    'video').get_attribute('src')
                driver.close()
                return result_video

            except:
                # return "could not obtain the stream URL for this live room"
                driver.close()
                return "the streamer is offline"
                # return " "
        else:

            return "不支持的网站(not support url)"
Example #32
class CNStock(SentimentCrawler):
    def __init__(self):
        super().__init__(init=False)
        self.driver = PhantomJS()
        self.driver.maximize_window()
        self.wait = WebDriverWait(self.driver, 15)
        self.url = 'http://www.cnstock.com/'
        self.name = '中国证券网'

    def crawl_main_page(self, keyword):
        self.driver.set_page_load_timeout(10)
        try:
            self.driver.get(self.url)
        except TimeoutException:
            self.driver.execute_script('window.stop();')

        try:
            self.wait.until(
                ec.presence_of_element_located((By.ID, 'nav_keywords')))
        except:
            CustomLogging.log_to_file('failed to open cnstock.com', LogType.ERROR)

        self.driver.find_element_by_id('nav_keywords').clear()
        self.driver.find_element_by_id('nav_keywords').send_keys(keyword +
                                                                 Keys.ENTER)

        return self.crawl_search_results()

    def crawl_search_results(self):
        search_results = []
        self.driver.switch_to.window(self.driver.window_handles[-1])
        self.driver.maximize_window()

        exit_flag = 0
        while True:
            try:
                self.wait.until(
                    ec.presence_of_element_located(
                        (By.CLASS_NAME, 'result-cont')))
            except TimeoutException:
                CustomLogging.log_to_file('cnstock search results page error', LogType.ERROR)
                break

            try:
                result_articles = self.driver.find_elements_by_class_name(
                    'result-article')

                for each_article in result_articles:
                    item = Entity()

                    publish_date = each_article.find_element_by_class_name(
                        'g').text
                    item.publish_date = re.search(
                        re.compile(
                            '[1-9]\d{3}-(0[1-9]|1[0-2])-(0[1-9]|[1-2][0-9]|3[0-1])\s+(20|21|22|23|[0-1]\d):[0-5]\d'
                        ), publish_date).group()

                    if not in_date_range(
                            conv_pub_date(item.publish_date, 'cnstock'),
                            self.year_range):
                        exit_flag = 1
                        # break out of the for loop
                        break
                    item.short_description = each_article.find_element_by_class_name(
                        'des').text
                    item.title = each_article.find_element_by_tag_name(
                        'a').text
                    if self.keyword not in item.short_description and self.keyword not in item.title:
                        continue

                    if item.title in self.titles:
                        continue
                    else:
                        self.titles.append(item.title)

                    item.url = each_article.find_element_by_tag_name(
                        'a').get_attribute('href')
                    threading.Thread(target=super().download_and_save_item,
                                     args=(item, )).start()

                if exit_flag == 1:
                    break
            except NoSuchElementException:
                CustomLogging.log_to_file('no search results', LogType.INFO)
                break

            try:
                next_page = self.driver.find_element_by_xpath(
                    '//div[@class="pagination pagination-centered"]//a[contains(text(), "下一页")]'
                )
                self.driver.get(next_page.get_attribute('href'))
                # next_page.click()
            except NoSuchElementException:
                break

        return search_results

    def parse_html(self, url, html):
        bs = BeautifulSoup(html, 'lxml')
        try:
            full_content = bs.find('div', attrs={'id': 'qmt_content_div'}).text
            return full_content
        except Exception:
            CustomLogging.log_to_file('page parsing error: {0}|{1}'.format(self.name, url),
                                      LogType.ERROR)
            pass
Example #33
def get_tickers(stock_list):

    list = pd.read_csv(stock_list)
    try:
        tickers = [c for c in list[list.columns[1]]]
    except Exception as e:
        print(e)
        sys.exit(1)

    #List for the stocks that had some error being collected
    error_stocks = []

    # Creates directory for the stock data CSV files
    if not os.path.exists('stock_dfs'):
        os.makedirs('stock_dfs')

    global source

    # When data collecting will start and end for the Dates
    global start
    global end

    print(f'>>>Getting Stock Data from {source} from {end}')

    #Iterating through each ticker
    for ticker in tqdm.tqdm(tickers):

        # Read data on the stock. If grabbing today's data fails, try yesterday's data
        try:
            df = web.DataReader(ticker, source, start, end)
        except:
            #Changing end date to yesterday
            end = (dt.datetime.now() - dt.timedelta(1)).strftime('%Y-%m-%d')
            df = web.DataReader(ticker, source, start, end)

        #High/Low Open/Close percentage
        df['HL_pct'] = ((df['High'] - df['Low']) / df['Low']) * 100
        df['OC_pct'] = ((df['Close'] - df['Open']) / df['Open']) * 100

        # Bollinger Bands
        df['Middle Boolinger'] = df['Adj Close'].rolling(20).mean()
        df['Sup_Boolinger'] = df['Middle Boolinger'] + (
            2 * df['Adj Close'].rolling(20).std())
        df['Inf_Boolinger'] = df['Middle Boolinger'] - (
            2 * df['Adj Close'].rolling(20).std())

        #Exponential Moving Mean
        df['Exp20_Close'] = df['Adj Close'].ewm(span=20, adjust=False).mean()

        # Expansion/contraction of the stock price
        df['Deviation_band'] = df['Adj Close'].rolling(20).std()

        #RSI
        change = df['Adj Close'].diff(1)

        gain = change.mask(change < 0, 0)
        loss = change.mask(change > 0, 0)
        avg_gain = gain.ewm(min_periods=rsi_period, com=rsi_period - 1).mean()
        avg_loss = loss.ewm(min_periods=rsi_period, com=rsi_period - 1).mean()
        rs = abs(avg_gain / avg_loss)
        df['RSI'] = 100 - (100 / (1 + rs))
        '''
            Now the code scrapes some pages on Yahoo Finance to get more details and info. It does this either by
            table reading or by span-string reading, since some pages don't have tables. Table reading is
            straightforward, but with span reading we need the reactID of each line we want, and that part is
            somewhat hardcoded: I read through all the span lines and wrote down the useful ones.
        '''

        #Reading into page
        resp = requests.get(
            f'https://finance.yahoo.com/quote/{ticker}/financials')
        #BeautifulSoup scrapes the page in TXT form
        soup = bs.BeautifulSoup(resp.text, 'lxml')

        #Number of span lines we got
        length = int(np.array(soup.find_all('span')).shape[0])

        #All lines with the span class, which has the info we want
        lines = np.array(soup.find_all('span'))

        #List to store the span lines that have the reactID codes we want
        spans = []

        #Dates we want to find
        find_dates = ['12/30/2019', '12/30/2018', '12/30/2017', '12/30/2016']

        #List for the dates we actually find
        dates = []

        #Iterating through the lines and grabbing all lines from the span class
        for line in range(0, length):
            spans.append(BeautifulSoup(str(lines[line]), features='lxml').span)

        #Iterating through each date we want to find in the website
        for date in find_dates:

            #Iterating through each span-class line
            for line in range(0, length):

                #If the text line and date match then put the date in the found dates list
                if spans[line].string == date:
                    dates.append(spans[line].string)
                    break

        #Changes date format for indexing with the webreader dataframe
        for index, date in enumerate(dates):

            # If a string doesn't match the format then it's not a date and will be removed
            try:
                dates[index] = dt.datetime.strptime(
                    date, "%m/%d/%Y").strftime("%Y-%m-%d")
            except:

                # dates.remove will raise an exception when no such item remains in the list, stopping the loop
                removed = False
                while (removed == False):
                    try:
                        dates.remove(dates[index])
                    except:
                        removed = True

        # Add 3 days to the dates, because most stocks don't trade on the last day of the year, which is
        # the date the data is stamped with on the website.
        for index, date in enumerate(dates):
            dates[index] = (dt.datetime.strptime(date, '%Y-%m-%d') +
                            dt.timedelta(3)).strftime('%Y-%m-%d')

        # Info we want to get from the website
        interesting_lines = [
            'Total Revenue', 'Cost of Revenue', 'Gross Profit',
            'Selling General and Administrative', 'Total Operating Expenses',
            'Operating Income or Loss', 'Interest Expense',
            'Total Other Income/Expenses Net', 'Income Before Tax',
            'Income Tax Expense', 'Income from Continuing Operations',
            'Net Income', 'Net Income available to common shareholders',
            'EBITDA'
        ]

        #List for the info we actually find on the website
        infos = []

        #List for the ReactIDs of the lines that have the data about the infos above
        number_ids = []

        #Column renaming
        column_names = [
            'Total Revenue (TTM)', 'Cost of Revenue (TTM)',
            'Gross Profit (TTM)',
            'Selling General and Administrative Expenses (TTM)',
            'Total Operating Expenses (TTM)', 'Operating Income or Loss (TTM)',
            'Interest Expense (TTM)', 'Total Other Income/Expenses Net',
            'Income Before Tax (TTM)', 'Income Tax Expense (TTM)',
            'Income from Coninuing Operations (TTM)', 'Net Income (TTM)',
            'Net Income available to Shareholders (TTM)', 'EBITDA (TTM)'
        ]

        #Iterating through the informations we want
        for index, info in enumerate(interesting_lines):

            #Boolean for if the information was found
            check = False

            #Iterating through the span lines
            for line in range(0, length):

                #If line contains the information we want, appends it to the found infos list.
                if spans[line].string == info:
                    infos.append(spans[line].string)

                    #Appends the info's reactID +5, one line below, where the numbers and data are
                    number_ids.append(
                        str(int(spans[line]['data-reactid']) + 5))
                    check = True
                    pass

            #In case the information isn't found, the respective column name is changed to a NAN, to be removed later
            if check == False:
                column_names[index] = np.nan

        #Removing NANs from column name list
        column_names = [c for c in column_names if str(c) != 'nan']

        #Creating the columns for the information
        for column in column_names:
            df[f'{column}'] = np.nan

        #Iterating through dates, with indexing
        for index, date in enumerate(dates):

            #Iterating through new columns, with indexing
            for column, string in enumerate(column_names):

                #Iterating through span lines
                for line in range(0, length):

                    #Fetching data for the respective information column in order
                    if spans[line]['data-reactid'] == number_ids[column]:

                        # Locate the date in the dataframe index, format the data string, turn it into an integer and
                        # put the data in its correct place in time.
                        try:
                            df[f'{string}'].loc[dates[index]] = int(
                                (spans[line].string).replace(',', ''))
                        except Exception as e:
                            print(e)
                            print(
                                f'Error formatting/allocating string to int for stock {ticker}'
                            )

                            #Appending to stocks with errors list
                            error_stocks.append(ticker)
                            continue
            #Adding 2 to the IDs for each iteration so we get the lines of previous dates for the information
            number_ids = [int(c) for c in number_ids]
            number_ids = [c + 2 for c in number_ids]
            number_ids = [str(c) for c in number_ids]

        #Page URL that we will pass to PhantomJS
        url = f'https://finance.yahoo.com/quote/{ticker}/key-statistics'

        #Initiating PhantomJS
        driver = PhantomJS(executable_path=r'phantomjs.exe')

        #Opening URL with PhantomJS to fully load the page
        driver.get(url)

        #Returning page source after all the JavaScript codes have been loaded
        resp = driver.page_source

        #Closing PhantomJS
        driver.quit()

        #List of tables that Pandas found in the web page
        dfs = pd.read_html(resp)

        #Dataframe to put all the tables in just one
        key_stats = pd.DataFrame()

        #Iterating through the tables
        for dframe in dfs:

            #If dataframe is empty, passes the first table
            if key_stats.empty:
                key_stats = dframe

            #If it already has a table, appends the new ones
            else:
                key_stats = key_stats.append(dframe)

        #Fixing dataframe index, with numbers from 0 to length of dataframe
        key_stats.index = [c for c in range(0, key_stats.shape[0])]

        # There's some info we're not interested in, so we drop what we don't need
        stats = key_stats.loc[:8]

        #Removing columns 0 and 1
        stats = stats.drop([0, 1], axis=1)

        #Passing the information names as the dataframe index
        stats.index = [c for c in stats['Unnamed: 0'].values]

        # Remove the column with the information names, since it's all in the index
        stats = stats.drop(['Unnamed: 0'], axis=1)

        # Transpose the dataframe, so that the dates become the index and the information names become the columns
        stats = stats.transpose()

        # Create the new columns in the main dataframe
        for column in stats.columns:
            df[f'{column}'] = np.nan

        #Putting all the dates in a list
        dates = [c for c in stats.index]

        #Iterating through the dates
        for index, date in enumerate(dates):
            #Changing date format
            try:
                dates[index] = dt.datetime.strptime(
                    date, "%m/%d/%Y").strftime("%Y-%m-%d")
            except:
                # One of the dates actually contains more than just the date, so we strip the extra text
                date = date.replace('As of Date: ', '')
                date = date.replace('Current', '')
                dates[index] = dt.datetime.strptime(
                    date, "%m/%d/%Y").strftime("%Y-%m-%d")

        # Add 3 days because stocks don't trade on the last day of the year
        for index, date in enumerate(dates):
            dates[index] = (dt.datetime.strptime(date, '%Y-%m-%d') +
                            dt.timedelta(3)).strftime('%Y-%m-%d')

        # Pass the changed dates back into the dataframe's index
        stats.index = dates

        #Iterating through dates again
        for date in stats.index:

            #Iterating through the new columns
            for column in stats.columns:

                # Locate the dates and columns in the main dataframe and put the respective data in its place
                try:
                    df[f'{column}'].loc[date] = stats[f'{column}'].loc[date]

                # If any error occurs in this process, show the error for the respective stock and add it to the
                # stocks-with-error list
                except Exception as e:
                    print(e)
                    print(
                        f'Error formatting/allocating string to int for stock {ticker}'
                    )

                    #Appending to stocks with errors list
                    error_stocks.append(ticker)
        '''
        Since we only have info year by year and the .loc function only puts the data at the specific index, we need to
        fill the NANs with the previous data that isn't a NAN (ffill method). This way, from each data alocated, all future 
        lines will have this exact data, until a new data (the most recent) appears, and the process repeats.
        '''
        df.fillna(method='ffill', inplace=True)

        # Saving csv file
        df.to_csv('stock_dfs/{}.csv'.format(ticker))

    #Showing any stocks with errors if there are any
    if error_stocks != []:
        print('\n ------ Inspect Errors ------- \n')
        print([c for c in error_stocks])
Example #34
from selenium.webdriver import PhantomJS

driver = PhantomJS(
    executable_path=
    r'E:\Documents\Apps\phantomjs-2.1.1-windows\bin\phantomjs.exe')
url = 'http://cxwh.kexing100.com:82/?app_act=detail&id=328&from=groupmessage'
driver.get(url)
while True:
    driver.refresh()
    print driver.find_element_by_xpath("//div[@class='xinfo']").text
    # driver.execute_script("return localStorage.setItem('toupiao','0')")
    driver.execute_script("return localStorage.removeItem('toupiao')")
    driver.delete_all_cookies()
    driver.refresh()
    vote = driver.find_element_by_xpath("//span/input[@class='btn1']").click()
    # break
Example #35
def get_url_files(retail, invoice_doc_type, invoice_id, invoice_date, invoice_amount):
    retail_invoice_url = RETAIL_INVOICE_URL[retail]

    driver = PhantomJS()
    driver.get(retail_invoice_url)

    # 1 Set doc_type 'select'
    try:
        select_doc_type = Select(driver.find_element_by_name('txtTipoDte'))
        value = RETAIL_INVOICE_DOC_TYPES[retail][invoice_doc_type]['value']
        select_doc_type.select_by_value(value)
        # name = RETAIL_INVOICE_DOC_TYPES[retail][invoice_doc_type]['name']
        # select_doc_type.select_by_visible_text(name)
    except Exception:
        print 'ERROR: set doc_type select as Boleta'
        driver.save_screenshot('screen.png')
        return '', ''

    time.sleep(5)

    # 2 Get recaptcha img url
    try:
        recaptcha_img = driver.find_element_by_id('recaptcha_challenge_image')
        recaptcha_img_url = recaptcha_img.get_attribute('src')
    except Exception:
        print 'ERROR: get recaptcha image url'
        driver.save_screenshot('screen.png')
        return '', ''

    # 3 Solve recaptcha
    v = VisionApi()
    recaptcha_value = v.detect_text_from_url(recaptcha_img_url)

    if recaptcha_value is None:
        print 'ERROR: solving recaptcha image'
        driver.save_screenshot('screen.png')
        return '', ''

    # 4 Fill form
    script = u"""
        document.getElementsByName('txtFolio')[0].value = '{invoice_id}';
        document.getElementsByName('txtFechaEmision')[0].value = '{invoice_date}';
        document.getElementsByName('txtMontoTotal')[0].value = '{invoice_amount}';
        document.getElementsByName('recaptcha_response_field')[0].value = '{recaptcha_value}';
    """.format(
        invoice_id=invoice_id,
        invoice_date=invoice_date,
        invoice_amount=invoice_amount,
        recaptcha_value=recaptcha_value,
    )
    driver.execute_script(script)

    # 5 Submit form
    try:
        driver.find_element_by_name('frmDatos').submit()
    except Exception:
        print 'ERROR: submitting form'
        driver.save_screenshot('screen.png')
        return '', ''

    # 6 Get url files
    try:
        xml_a_tag = driver.find_element_by_xpath('//*[@id="Tabla_01"]/tbody/tr[1]/td[2]/p/a[2]')
        pdf_a_tag = driver.find_element_by_xpath('//*[@id="Tabla_01"]/tbody/tr[1]/td[2]/p/a[1]')

        xml_url = xml_a_tag.get_attribute('href')
        pdf_url = pdf_a_tag.get_attribute('href')
    except Exception:
        print 'ERROR: getting url files'
        driver.save_screenshot('screen.png')
        return '', ''

    # 8 Delete driver session
    driver.close()
    driver.quit()

    return xml_url, pdf_url
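A usage sketch; the retailer key and invoice fields are placeholders for whatever the RETAIL_INVOICE_URL and RETAIL_INVOICE_DOC_TYPES dictionaries are configured with:
xml_url, pdf_url = get_url_files(
    retail='example_retail',        # placeholder key in RETAIL_INVOICE_URL
    invoice_doc_type='boleta',      # placeholder key in RETAIL_INVOICE_DOC_TYPES[retail]
    invoice_id='123456',
    invoice_date='01-01-2017',
    invoice_amount='19990',
)
if xml_url:
    print('XML: ' + xml_url)
    print('PDF: ' + pdf_url)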
Example #36
urls = [
    'http://www.greatschools.org/north-carolina/morrisville/3360-Cedar-Fork-Elementary/',
]

indicators = ['EquityRaceEthnicity', 'EquityLowIncome', 'EquityDisabilities']
wd = PhantomJS()

output_cols = [
    'school', 'url', 'students_per_grade', 'teachers_to_student', 'counselors_to_student', 'reading', 'math',
    'science'
]
output_df = DataFrame(columns=output_cols)
output_ind = 0

for url in urls:
    t1 = time()
    wd.get(url)
    school_name = wd.title.split(' -')[0]
    print school_name,

    school_info = wd.find_elements_by_class_name('school-info__item')
    for s in school_info:
        inner_html = sub(r'<.*?>|\n', ' ', s.get_attribute('innerHTML'))
        inner_html = sub(r'\s+', ' ', inner_html).strip()
        if 'grades' in inner_html.lower():
            min_grade, max_grade = inner_html.split(' ')[-1].split('-')
            if min_grade.lower() == 'pk':
                min_grade = -1
            elif min_grade.lower() == 'k':
                min_grade = 0
            n_grades = int(max_grade) - int(min_grade) + 1
        elif 'students' in inner_html.lower():
Example #37
class PagesCrawler(BaseSpider):

    name = 'pages'
    link_extractor = RegexpLinkExtractor(canonicalize=False, deny_extensions=[])
    ignored_exts = set(['.' + e for e in IGNORED_EXTENSIONS])

    def __init__(self, **kw):
        args = DEFAULT_INPUT.copy()
        args.update(kw)
        self.args = args
        self.start_urls = to_list(args['start_urls'])
        self.maxdepth = int(args['maxdepth'])
        self.follow_prefixes = to_list(args['follow_prefixes'])
        self.nofollow_prefixes = to_list(args['nofollow_prefixes'])
        self.discover_prefixes = [url_to_lru_clean("http%s://%s" % (https, u.replace('http://', '').replace('https://', ''))) for u in to_list(args['discover_prefixes']) for https in ['', 's']]
        self.resolved_links = {}
        self.user_agent = args['user_agent']
        self.phantom = 'phantom' in args and args['phantom'] and args['phantom'].lower() != "false"
        if self.phantom:
            self.ph_timeout = int(args.get('phantom_timeout', PHANTOM['TIMEOUT']))
            self.ph_idle_timeout = int(args.get('phantom_idle_timeout', PHANTOM['IDLE_TIMEOUT']))
            self.ph_ajax_timeout = int(args.get('phantom_ajax_timeout', PHANTOM['AJAX_TIMEOUT']))
        self.errors = 0
        dispatcher.connect(self.closed, spider_closed)
        dispatcher.connect(self.crashed, spider_error)

    def start_requests(self):
        self.log("Starting crawl task - jobid: %s" % self.crawler.settings['JOBID'], log.INFO)
        self.log("ARGUMENTS : "+str(self.args), log.INFO)
        if self.phantom:
            self.init_phantom()
        for url in self.start_urls:
            yield self._request(url)

    def init_phantom(self):
        self.prefixfiles = os.path.join(
            scrapyd_config().get('logs_dir'),
            HYPHE_PROJECT,
            self.name,
            self.crawler.settings['JOBID']
        )
        self.log("Using path %s for PhantomJS crawl" % self.prefixfiles, log.INFO)
        phantom_args = []
        if PROXY and not PROXY.startswith(':'):
            phantom_args.append('--proxy=%s' % PROXY)
        phantom_args.append('--cookies-file=%s-phantomjs-cookie.txt' % self.prefixfiles)
        phantom_args.append('--ignore-ssl-errors=true')
        phantom_args.append('--load-images=false')
        self.capabilities = dict(DesiredCapabilities.PHANTOMJS)
        self.capabilities['phantomjs.page.settings.userAgent'] = self.user_agent
        self.capabilities['takesScreenshot'] = False
        self.capabilities['phantomjs.page.settings.javascriptCanCloseWindows'] = False
        self.capabilities['phantomjs.page.settings.javascriptCanOpenWindows'] = False
        self.phantom = PhantomJS(
            executable_path=PHANTOM['PATH'],
            service_args=phantom_args,
            desired_capabilities=self.capabilities,
            service_log_path="%s-phantomjs.log" % self.prefixfiles
        )
        self.phantom.implicitly_wait(10)
        self.phantom.set_page_load_timeout(60)
        self.phantom.set_script_timeout(self.ph_timeout + 15)

    def crashed(self, spider):
        self.errors += 1
        self.closed("CRASH")

    def closed(self, reason):
        if self.errors:
            self.log("%s error%s encountered during the crawl." %
                (self.errors, 's' if self.errors > 1 else ''), log.ERROR)
        if self.phantom:
            self.phantom.quit()
            if not self.errors:
                for f in ["phantomjs-cookie.txt", "phantomjs.log"]:
                    fi = "%s-%s" % (self.prefixfiles, f)
                    if os.path.exists(fi) and not self.errors:
                        os.remove(fi)

    def handle_response(self, response):
        lru = url_to_lru_clean(response.url)

        if self.phantom:
            self.phantom.get(response.url)

          # Collect whole DOM of the webpage including embedded iframes
            with open(os.path.join(PHANTOM["JS_PATH"], "get_iframes_content.js")) as js:
                get_bod_w_iframes = js.read()
            bod_w_iframes = self.phantom.execute_script(get_bod_w_iframes)
            response._set_body(bod_w_iframes.encode('utf-8'))

          # Try to scroll and unfold page
            self.log("Start PhantomJS scrolling and unfolding", log.INFO)
            with open(os.path.join(PHANTOM["JS_PATH"], "scrolldown_and_unfold.js")) as js:
                try:
                    signal.signal(signal.SIGALRM, timeout_alarm)
                    signal.alarm(self.ph_timeout + 30)
                    timedout = self.phantom.execute_async_script(
                        js.read(), self.ph_timeout,
                        self.ph_idle_timeout, self.ph_ajax_timeout)
                    signal.alarm(0)
                    if timedout:
                        raise SeleniumTimeout
                    self.log("Scrolling/Unfolding finished", log.INFO)
                except SeleniumTimeout:
                    self.log("Scrolling/Unfolding timed-out (%ss)" % self.ph_timeout, log.WARNING)
                    self.errors += 1
                except WebDriverException as e:
                    err = json.loads(e.msg)['errorMessage']
                    self.log("Scrolling/Unfolding crashed: %s" % err, log.ERROR)
                    self.errors += 1
                except Exception as e:
                    self.log("Scrolling/Unfolding crashed: %s %s" % (type(e), e), log.ERROR)
                    self.errors += 1
                    return self._make_raw_page(response, lru)
            bod_w_iframes = self.phantom.execute_script(get_bod_w_iframes)
            response._set_body(bod_w_iframes.encode('utf-8'))

        # Cleanup pages with base64 images embedded that make scrapy consider them not htmlresponses
        if response.status == 200 and not isinstance(response, HtmlResponse):
            try:
                flags = response.flags
                if "partial" in flags:
                    flags.remove("partial")
                flags.append("cleaned")
                response = HtmlResponse(response.url, headers=response.headers, body=cleanupbase64images(response.body), flags=flags, request=response.request)
                self.log("WARNING: page with base64 embedded images was cleaned-up for links extraction")
            except:
                pass

        if 300 < response.status < 400 or isinstance(response, HtmlResponse):
            return self.parse_html(response, lru)
        else:
            return self._make_raw_page(response, lru)

    def handle_error(self, failure, response=None):
        if response:
            p = self._make_raw_page(response, failure.request.url)
            p['error'] = error_name(failure.value)
            return p
        elif not "://www" in failure.request.url:
            return self._request(failure.request.url.replace('://', '://www.'))
        self.log("ERROR : %s" % failure.getErrorMessage(), log.ERROR)
        self.errors += 1
        return

    def parse_html(self, response, lru):
        lrulinks = []
        # handle redirects
        realdepth = response.meta['depth']
        if 300 < response.status < 400:
            redir_url = response.headers['Location']
            if redir_url.startswith('/'):
                redir_url = "%s%s" % (lru_get_host_url(lru).strip('/'), redir_url)
            elif redir_url.startswith('./') or not redir_url.startswith('http'):
                redir_url = "%s%s" % (lru_get_path_url(lru).strip('/'), redir_url[1:])
            links = [{'url': redir_url}]
            response.meta['depth'] -= 1
        else:
            try:
                links = self.link_extractor.extract_links(response)
            except Exception as e:
                self.log("ERROR: links extractor crashed on %s: %s %s" % (response, type(e), e), log.ERROR)
                links = []
                self.errors += 1
        for link in links:
            try:
                url = link.url
            except AttributeError:
                url = link['url']
            try:
                lrulink = url_to_lru_clean(url)
            except ValueError, e:
                self.log("Error converting URL %s to LRU: %s" % (url, e), log.ERROR)
                continue
            lrulinks.append((url, lrulink))
            if self._should_follow(response.meta['depth'], lru, lrulink) and \
                    not url_has_any_extension(url, self.ignored_exts):
                yield self._request(url)
        response.meta['depth'] = realdepth
        yield self._make_html_page(response, lru, lrulinks)
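The handle_response method above aborts a hung scroll/unfold script by arming a SIGALRM handler (timeout_alarm) and raising a SeleniumTimeout exception, neither of which is defined in this excerpt. A minimal sketch of what those two pieces might look like, as an assumption rather than the project's actual definitions:

class SeleniumTimeout(Exception):
    """Raised when the PhantomJS scroll/unfold script exceeds its time budget."""

def timeout_alarm(signum, frame):
    # signal.alarm() delivers SIGALRM after the configured delay; turning it
    # into an exception lets the surrounding try/except abandon the blocked
    # execute_async_script() call.
    raise SeleniumTimeout("PhantomJS script took too long")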
def get_ratio_data():
    import re
    import time
    from bs4 import BeautifulSoup

    from app.models import Company, Indicators
    from app.utils import cash_to_float, depercentize


    # Dict item with list: element attribute, attribute value to look for, optional transform function
    indicators = {'roe': {
                          'attribute': 'data-reactid',
                          'value': re.compile(".*RETURN_ON_EQUITY\.1$"),
                          'transform': depercentize,
                          },
                  'fcf': {
                          'attribute': 'data-reactid',
                          'value': re.compile(".*LEVERED_FREE_CASH_FLOW\.1$"),
                          'transform': cash_to_float,
                          },
                  'ev2ebitda': {
                          'attribute': 'data-reactid',
                          'value': re.compile(".*ENTERPRISE_VALUE_TO_EBITDA\.1$"),
                          },
                  }


    companies = Company.query.with_entities(Company.symbol).all()
    symbols = [company[0] for company in companies]

    print("Iterate through symbols")

    for symbol in symbols:
        print("{} Fetching {}  :".format(time.strftime("%H:%M:%S"), symbol))

        #driver = MyWebDriver()
        retry_current = 0
        retry_limit = 5
        driver = None
        while retry_current < retry_limit:
            try:
                driver = PhantomJS()
                break
            except URLError:
                time.sleep(retry_current**2)
            retry_current += 1
        if driver is None:
            print("Could not start PhantomJS for {}, skipping".format(symbol))
            continue

        driver.set_window_size(1120, 550)
        driver.get("http://finance.yahoo.com/quote/{}/key-statistics?p={}".format(symbol, symbol))
        try:
            #WebDriverWait(driver, 10).until(EC.title_contains("AAPL Key Statistics | Apple Inc. Stock - Yahoo Finance"))
            #element = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "td[reactid]")))
            #element = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, "//td[@data-reactid[ends-with(., 'RETURN_ON_EQUITY.1')]]")))

            # these two seem to work...
            element = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, "//*[substring(@data-reactid, string-length(@data-reactid) - string-length('RETURN_ON_EQUITY.1') +1) = 'RETURN_ON_EQUITY.1']")))
            #element = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, "//*[contains(@data-reactid,'RETURN_ON_EQUITY.1')]")))

            #"//input[@id[ends-with(.,'register')]]"
        except TimeoutException as e:
            print  "Caught", e
            print driver.title
            continue

        #time.sleep(5)

        #with open("{}.out".format(symbol), "w") as f:
        #    f.write(driver.page_source.encode('utf-8'))

        soup = BeautifulSoup(driver.page_source, "lxml")

        d = {'symbol': symbol}
        for indicator in indicators.keys():
            curr_ind = indicators[indicator]
            s = soup.find_all(attrs={curr_ind['attribute']: curr_ind['value']})
            print indicator, s

            for element in s:
                if curr_ind.has_key('transform'):
                    f = curr_ind['transform']
                    #print(f(element.text))
                    d[indicator] = f(element.text)
                else:
                    #print(element.text)
                    d[indicator] = element.text

        try:
            db.session.add(Indicators.from_json(d))
            db.session.commit()
        except (IntegrityError, UnmappedInstanceError) as e:
            print "Caught", e
            db.session.rollback()

        print "indicators", d
Example #39
0
class Client(object):
    """Client HTTP pour tester fonctionnellement Strass

    Adapteur du pilote Selenium, avec une interface inspirée de Nightwatch.js,
    et quelques paramètres spécifiques à Strass."""

    def __init__(self):
        self.driver = PhantomJS()
        self.driver.set_window_size(1120, 550)

    def __del__(self):
        self.driver.quit()

    def get(self, query=None):
        server = os.environ.get('STRASS_TEST_SERVER', 'http://localhost:8000')
        url = server + (query or '/')
        self.driver.get(url)
        return self

    def find(self, selector):
        return self.driver.find_element_by_css_selector(selector)

    def click(self, selector):
        self.find(selector).click()
        return self

    def fill(self, selector, value):
        if isinstance(value, datetime.date):
            self.fill(selector + ' input.day', str(value.day))
            self.fill(selector + ' input.month', str(value.month))
            self.fill(selector + ' input.year', str(value.year))
        else:
            control = self.find(selector)
            try:
                control.clear()
            except selexc.InvalidElementStateException:
                # We may be trying to clear an input[type=file]; skip it.
                pass
            control.send_keys(value)
        return self

    def select(self, selector, value):
        Select(self.find(selector)).select_by_value(value)
        return self

    def submit(self, selector='#document button[type=submit]'):
        return self.click(selector)

    def close(self):
        self.driver.close()
        if self.driver.window_handles:
            self.driver.switch_to.window(self.driver.window_handles[0])
        self.driver.set_window_size(1120, 550)
        return self

    def screenshot(self, filename):
        self.driver.get_screenshot_as_file(filename)
        sys.stderr.write("Capture d'écran enregistrée dans %r\n" % (filename,))
        return self

    def save(self, filename):
        with open(filename, 'w') as fo:
            fo.write(self.driver.page_source)
        sys.stderr.write("HTML enregistré dans %r\n" % (filename,))
        return self

    def __getattr__(self, name):
        return getattr(self.driver, name)
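The Client class above offers a chainable, Nightwatch.js-style API. A short usage sketch; the route and form selectors are illustrative assumptions, not part of the original test suite:

import datetime

client = Client()
client.get('/members/new') \
      .fill('#id_name', 'Jane Doe') \
      .fill('#id_birthdate', datetime.date(1990, 5, 1)) \
      .submit() \
      .screenshot('member_created.png')
client.close()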
outputfile = open(cityName + '-instagram-output.csv', 'a', 0)

print colorama.Back.RED+colorama.Fore.YELLOW+str(len(setUrlDefined))+' URLs already defined! Lets Rock more now...'+colorama.Back.RESET+colorama.Fore.RESET

driver = PhantomJS('./phantomjs')  # if PhantomJS is not available, Firefox can be used instead
for line in tqdm(inputfile, total=numLines, desc='Crawling Instagram', leave=True):
	try:
		idtweet, url = line.replace('\n', '').split(',')
		if idtweet in setUrlDefined:
			continue
	except IndexError:
		print colorama.Fore.RED, 'Corrupted Line', colorama.Fore.RESET
		continue
	try:
		driver.get(url)
		placetag = driver.find_element_by_class_name('_kul9p')
		placeurl = placetag.get_attribute('href').encode('utf-8')
		placename = placetag.get_attribute('title').encode('utf-8')

		usernametag = driver.find_element_by_class_name('_4zhc5')
		username = usernametag.get_attribute('title').encode('utf-8')
		
	except selenium.common.exceptions.NoSuchElementException:
		try:
			error = driver.find_element_by_class_name('error-container')
			print colorama.Fore.RED, 'Sample Not Available Anymore', colorama.Fore.RESET
			outputfile.write(idtweet + ',' + url + ',404\n')
			continue
		except selenium.common.exceptions.NoSuchElementException:
			print colorama.Fore.RED, 'No Coords Available', colorama.Fore.RESET
Example #41
0
class ProviderAdvancedViewTests(LiveServerTestCase):
    def setUp(self):
        self.driver = PhantomJS()

        self.user = User.objects.create_user('admin', '*****@*****.**', 'password')
        self.user.save()

        self.provider = Provider(
            name='provider',
            user=self.user,
        )
        self.provider.save()

        self.provider_adverts = mommy.make(Advertisement, _quantity=20, provider=self.provider)

        self.login()

    def tearDown(self):
        self.driver.quit()

    def open(self, url):
        self.driver.get("%s%s" % (self.live_server_url, url))

    def login(self):
        self.open(settings.LOGIN_URL)
        self.driver.find_element_by_id("id_username").send_keys("admin")
        self.driver.find_element_by_id("id_password").send_keys("password")
        self.driver.find_element_by_css_selector("button.btn.btn-default").click()

        self.assertEqual(
            self.driver.current_url,
            self.live_server_url + reverse('advertisements.views.view_provider_statistics', args=[self.provider.pk]),
        )

    def test_can_login(self):
        """
        Test that the user can login
        """
        pass

    def test_provider_page_has_all_data(self):
        """
        Test that the provider statistics page has all the correct data
        """
        self.open(reverse('advertisements.views.view_provider_statistics', args=[self.provider.pk]))

        self.assertEqual("Open Ads", self.driver.title)
        self.assertIn(
            "{0} advertisements".format(self.provider.name),
            self.driver.find_element_by_css_selector("h1.page-header").text
        )
        self.assertIn(
            "{0} advertisements in rotation".format(20),
            self.driver.find_element_by_css_selector("h1.page-header").text
        )

    def test_advertisement_page_has_all_data(self):
        """
        Test that the advertisement page has all the correct data
        """

        for advert in self.provider_adverts:
            self.open(reverse('advertisements.views.view_advert_statistics', args=[advert.pk]))

            self.assertIn(
                "ID number: {0}".format(advert.pk),
                self.driver.find_element_by_css_selector("h1.page-header").text,
            )
            self.driver.find_element_by_css_selector("img")
            self.assertEqual("Active", self.driver.find_element_by_xpath("//td[2]/span").text)
            self.assertEqual(advert.url, self.driver.find_element_by_link_text(advert.url).text)
            self.driver.find_element_by_link_text("Edit URL").click()
            self.assertEqual(advert.url, self.driver.find_element_by_id("id_url").get_attribute("value"))
Example #42
0
class WeixinPhantomjs(Base):
    def __init__(self):
        self.start_page = START_PAGE
        self.end_page = END_PAGE
        self.weixin_url = REFER_FIRST

        # self.driver = Firefox()
        if hasattr(config, 'PHANTOMJS_PATH'):
            self.driver = PhantomJS(executable_path=getattr(config, 'PHANTOMJS_PATH'))
        else:
            self.driver = PhantomJS()

        self.client = MongoClient(HOST, PORT)
        self.collection = self.client[DB][COLLECTION]
        self.all_uids = self.uids

    def open_weixin_browser(self, word):
        try:
            self.driver.get(self.weixin_url)
            self.driver.set_page_load_timeout(3)

            self.driver.find_element_by_id('upquery').send_keys(word)
            self.driver.find_element_by_class_name('swz').click()
            time.sleep(3)

            urls_uids = self.extract_urls_uids(word=word)
            Article(urls_uids=urls_uids, word=word).extract()
        except Exception as e:
            storage_word.append([word, 0])
            self.logger.info('Open weixin error: type <{}>, msg <{}>'.format(e.__class__, e))
            self.close_browser()
            return True
        return False

    def get_total_pages_to_word(self):
        pages = []
        page_id_css = 'pagebar_container'

        try:
            e = self.driver.find_element_by_id(page_id_css)
            for _p in e.text.split():
                _p = _p.strip()

                if not _p.isdigit():
                    return DEFAULT_PAGES if DEFAULT_PAGES <= pages[-1] else pages[-1]
                else:
                    pages.append(int(_p))
            return 1
        except (NoSuchElementException, NoSuchWindowException, TypeError, IndexError):
            pass
        return 1

    def get_query_words(self, word):
        query_words = []

        for docs in self.collection.find({}, {'rel': 1, 'conp': 1}).sort([('_id', 1)]):
            w = docs['conp']

            if w not in query_words:
                query_words.append(w)

            for item in docs['rel']:
                if item not in query_words:
                    query_words.append(item)

        self.client.close()

        return self.query_index(query_words, word)

    @property
    def uids(self):
        return {docs['uid'] for docs in in_collection.find({}, {'uid': 1}) if 'uid' in docs}

    def extract_urls_uids(self, word):
        urls_uids = []
        timestamp = [_t.get_attribute('t') for _t in self.driver.find_elements_by_css_selector('div.s-p')]
        urls_tits = [(t.get_attribute('href'), self.trim(t.text))
                     for t in self.driver.find_elements_by_css_selector('h4 a')]

        if len(urls_tits) != len(timestamp):
            return urls_uids

        for index, url_tit in enumerate(urls_tits):
            try:
                uid = self.md5(timestamp[index] + url_tit[1] + word)

                if uid not in self.all_uids:
                    self.all_uids.add(uid)
                    urls_uids.append({'url': url_tit[0], 'uid': uid})
            except (TypeError, IndexError):
                pass
        return urls_uids

    @staticmethod
    def query_index(words, cut_word):
        temp_words = words[START_INDEX:END_INDEX]

        try:
            index = temp_words.index(cut_word)
            return temp_words[index:], index + START_INDEX
        except ValueError:
            pass
        return temp_words, START_INDEX

    @property
    def is_forbidden(self):
        css_id = 'seccodeForm'

        try:
            if self.driver.find_element_by_id(css_id):
                return True
        except NoSuchElementException:
            pass
        return False

    def appear_element(self, by):
        try:
            # Wait until the element with this id is present, then click it
            tag = WebDriverWait(self.driver, 20).until(lambda driver: driver.find_element_by_id(by))
            tag.click()
            return True
        except (TimeoutException, NoSuchWindowException, NoSuchElementException):
            pass
        return False

    def crawl(self, word=None, go=0):
        is_go = True
        is_break = False
        go_page = int(go)
        next_page_css = 'sogou_page_%s'
        query_words, ind = self.get_query_words(word)

        for index, word in enumerate(query_words, 1):
            next_ind = ind + index
            is_break = self.open_weixin_browser(word)
            pages = self.get_total_pages_to_word()

            for page in range(self.start_page + 1, (pages or self.end_page) + 1):
                if is_go and page < go_page:
                    continue
                else:
                    is_go = False

                if not self.appear_element(by=next_page_css % page):
                    is_break = True
                    msg = '\tNot appear next page element, will break'
                elif self.is_forbidden:
                    is_break = True
                    msg = '\tSpider was forbidden, crawling again after sleeping a moment!'

                if is_break:
                    storage_word.append([word, page])
                    self.logger.info(msg)
                    break

                urls_uids = self.extract_urls_uids(word=word)
                Article(urls_uids=urls_uids, word=word).extract()

                # self.driver.find_element_by_id(next_page_css % page).click()
                wt = randint(10, 40) if page % 3 == 0 else randint(5, 18)
                self.logger.info('Index <{}>, Word <{}>, Page <{}> Done, sleeping {}s!'.format(next_ind, word, page, wt))
                # self.driver.implicitly_wait(wt)
                time.sleep(wt)

            if is_break:
                break

        in_client.close()
        self.close_browser()

    def close_browser(self):
        try:
            self.driver.close()
        except (NoSuchWindowException,):
            pass
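WeixinPhantomjs calls self.md5() and self.trim(), which come from the Base class that is not part of this excerpt. A plausible sketch of those helpers, inferred from how they are used above (an assumption, not the original Base implementation):

import hashlib
import re

class Base(object):
    @staticmethod
    def md5(text):
        # Build a stable uid from timestamp + title + query word.
        return hashlib.md5(text.encode('utf-8')).hexdigest()

    @staticmethod
    def trim(text):
        # Collapse runs of whitespace in scraped link titles.
        return re.sub(r'\s+', ' ', text or '').strip()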
Example #43
0
class PagesCrawler(BaseSpider):

    name = 'pages'
    link_extractor = RegexpLinkExtractor(canonicalize=False,
                                         deny_extensions=[])
    ignored_exts = set(['.' + e for e in IGNORED_EXTENSIONS])

    def __init__(self, **kw):
        args = DEFAULT_INPUT.copy()
        args.update(kw)
        self.args = args
        self.start_urls = to_list(args['start_urls'])
        self.maxdepth = int(args['maxdepth'])
        self.follow_prefixes = to_list(args['follow_prefixes'])
        self.nofollow_prefixes = to_list(args['nofollow_prefixes'])
        self.discover_prefixes = [
            url_to_lru_clean(
                "http%s://%s" %
                (https, u.replace('http://', '').replace('https://', '')))
            for u in to_list(args['discover_prefixes']) for https in ['', 's']
        ]
        self.resolved_links = {}
        self.user_agent = args['user_agent']
        self.phantom = 'phantom' in args and args[
            'phantom'] and args['phantom'].lower() != "false"
        if self.phantom:
            self.ph_timeout = int(
                args.get('phantom_timeout', PHANTOM['TIMEOUT']))
            self.ph_idle_timeout = int(
                args.get('phantom_idle_timeout', PHANTOM['IDLE_TIMEOUT']))
            self.ph_ajax_timeout = int(
                args.get('phantom_ajax_timeout', PHANTOM['AJAX_TIMEOUT']))
        self.errors = 0
        dispatcher.connect(self.closed, spider_closed)
        dispatcher.connect(self.crashed, spider_error)

    def start_requests(self):
        self.log(
            "Starting crawl task - jobid: %s" % self.crawler.settings['JOBID'],
            log.INFO)
        self.log("ARGUMENTS : " + str(self.args), log.INFO)
        if self.phantom:
            self.init_phantom()
        for url in self.start_urls:
            yield self._request(url)

    def init_phantom(self):
        self.prefixfiles = os.path.join(scrapyd_config().get('logs_dir'),
                                        HYPHE_PROJECT, self.name,
                                        self.crawler.settings['JOBID'])
        self.log("Using path %s for PhantomJS crawl" % self.prefixfiles,
                 log.INFO)
        phantom_args = []
        if PROXY and not PROXY.startswith(':'):
            phantom_args.append('--proxy=%s' % PROXY)
        phantom_args.append('--cookies-file=%s-phantomjs-cookie.txt' %
                            self.prefixfiles)
        phantom_args.append('--ignore-ssl-errors=true')
        phantom_args.append('--load-images=false')
        self.capabilities = dict(DesiredCapabilities.PHANTOMJS)
        self.capabilities[
            'phantomjs.page.settings.userAgent'] = self.user_agent
        self.capabilities['takesScreenshot'] = False
        self.capabilities[
            'phantomjs.page.settings.javascriptCanCloseWindows'] = False
        self.capabilities[
            'phantomjs.page.settings.javascriptCanOpenWindows'] = False
        self.phantom = PhantomJS(executable_path=PHANTOM['PATH'],
                                 service_args=phantom_args,
                                 desired_capabilities=self.capabilities,
                                 service_log_path="%s-phantomjs.log" %
                                 self.prefixfiles)
        self.phantom.implicitly_wait(10)
        self.phantom.set_page_load_timeout(60)
        self.phantom.set_script_timeout(self.ph_timeout + 15)

    def crashed(self, spider):
        self.errors += 1
        self.closed("CRASH")

    def closed(self, reason):
        if self.errors:
            self.log(
                "%s error%s encountered during the crawl." %
                (self.errors, 's' if self.errors > 1 else ''), log.ERROR)
        if self.phantom:
            self.phantom.quit()
            if not self.errors:
                for f in ["phantomjs-cookie.txt", "phantomjs.log"]:
                    fi = "%s-%s" % (self.prefixfiles, f)
                    if os.path.exists(fi) and not self.errors:
                        os.remove(fi)

    def handle_response(self, response):
        lru = url_to_lru_clean(response.url)

        if self.phantom:
            self.phantom.get(response.url)

            # Collect whole DOM of the webpage including embedded iframes
            with open(
                    os.path.join(PHANTOM["JS_PATH"],
                                 "get_iframes_content.js")) as js:
                get_bod_w_iframes = js.read()
            bod_w_iframes = self.phantom.execute_script(get_bod_w_iframes)
            response._set_body(bod_w_iframes.encode('utf-8'))

            # Try to scroll and unfold page
            self.log("Start PhantomJS scrolling and unfolding", log.INFO)
            with open(
                    os.path.join(PHANTOM["JS_PATH"],
                                 "scrolldown_and_unfold.js")) as js:
                try:
                    signal.signal(signal.SIGALRM, timeout_alarm)
                    signal.alarm(self.ph_timeout + 30)
                    timedout = self.phantom.execute_async_script(
                        js.read(), self.ph_timeout, self.ph_idle_timeout,
                        self.ph_ajax_timeout)
                    signal.alarm(0)
                    if timedout:
                        raise SeleniumTimeout
                    self.log("Scrolling/Unfolding finished", log.INFO)
                except SeleniumTimeout:
                    self.log(
                        "Scrolling/Unfolding timed-out (%ss)" %
                        self.ph_timeout, log.WARNING)
                    self.errors += 1
                except WebDriverException as e:
                    err = json.loads(e.msg)['errorMessage']
                    self.log("Scrolling/Unfolding crashed: %s" % err,
                             log.ERROR)
                    self.errors += 1
                except Exception as e:
                    self.log(
                        "Scrolling/Unfolding crashed: %s %s" % (type(e), e),
                        log.ERROR)
                    self.errors += 1
                    return self._make_raw_page(response, lru)
            bod_w_iframes = self.phantom.execute_script(get_bod_w_iframes)
            response._set_body(bod_w_iframes.encode('utf-8'))

    # Cleanup pages with base64 images embedded that make scrapy consider them not htmlresponses
        if response.status == 200 and not isinstance(response, HtmlResponse):
            try:
                flags = response.flags
                if "partial" in flags:
                    flags.remove("partial")
                flags.append("cleaned")
                response = HtmlResponse(response.url,
                                        headers=response.headers,
                                        body=cleanupbase64images(
                                            response.body),
                                        flags=flags,
                                        request=response.request)
                self.log(
                    "WARNING: page with base64 embedded images was cleaned-up for links extraction"
                )
            except:
                pass

        if 300 < response.status < 400 or isinstance(response, HtmlResponse):
            return self.parse_html(response, lru)
        else:
            return self._make_raw_page(response, lru)

    def handle_error(self, failure, response=None):
        if response:
            p = self._make_raw_page(response, failure.request.url)
            p['error'] = error_name(failure.value)
            return p
        elif not "://www" in failure.request.url:
            return self._request(failure.request.url.replace('://', '://www.'))
        error = failure.getErrorMessage()
        self.log("ERROR : %s" % error, log.ERROR)
        if PROXY and not PROXY.startswith(
                ':') and "OpenSSL.SSL.Error" in error:
            return self._request(failure.request.url, noproxy=True)
        self.errors += 1
        return

    def parse_html(self, response, lru):
        lrulinks = []
        # handle redirects
        realdepth = response.meta['depth']
        if 300 < response.status < 400:
            redir_url = response.headers['Location']
            if redir_url.startswith('/'):
                redir_url = "%s%s" % (lru_get_host_url(lru).strip('/'),
                                      redir_url)
            elif redir_url.startswith(
                    './') or not redir_url.startswith('http'):
                redir_url = "%s%s" % (lru_get_path_url(lru).strip('/'),
                                      redir_url[1:])
            links = [{'url': redir_url}]
            response.meta['depth'] -= 1
        else:
            try:
                links = self.link_extractor.extract_links(response)
            except Exception as e:
                self.log(
                    "ERROR: links extractor crashed on %s: %s %s" %
                    (response, type(e), e), log.ERROR)
                links = []
                self.errors += 1
        for link in links:
            try:
                url = link.url
            except AttributeError:
                url = link['url']
            try:
                lrulink = url_to_lru_clean(url)
            except ValueError, e:
                self.log("Error converting URL %s to LRU: %s" % (url, e),
                         log.ERROR)
                continue
            lrulinks.append((url, lrulink))
            if self._should_follow(response.meta['depth'], lru, lrulink) and \
                    not url_has_any_extension(url, self.ignored_exts):
                yield self._request(url)
        response.meta['depth'] = realdepth
        yield self._make_html_page(response, lru, lrulinks)
Example #44
0
def get_ratio_data():
    import socket
    import re
    import time
    import dryscrape
    import webkit_server
    from random import randint
    from fake_useragent import UserAgent
    from bs4 import BeautifulSoup
    from selenium.webdriver import PhantomJS


    from app.models import Company, Indicators
    from app.utils import cash_to_float, depercentize


    # Dict item with list: element attribute, attribute value to look for, optional transform function
    indicators = {'roe': {
                          'attribute': 'data-reactid',
                          'value': re.compile(".*RETURN_ON_EQUITY\.1$"),
                          'transform': depercentize,
                          },
                  'fcf': {
                          'attribute': 'data-reactid',
                          'value': re.compile(".*LEVERED_FREE_CASH_FLOW\.1$"),
                          'transform': cash_to_float,
                          },
                  }


    ua = UserAgent()


    #with open("10.csv", "r") as f:
    #with open("sp500-2.csv", "r") as f:
    with open("10_stocks", "r") as f:
        data = f.read()

    symbols = []
    for i in data.split("\n"):
        if i:

            symbols.append(i.split(",")[0])

    print("Iterate through symbols")
    ## dryscrape
    #session = dryscrape.Session()
    #session.set_header('User-Agent', ua.random)
    #session.set_timeout(5)

    for symbol in symbols:
        print("{} Fetching {}  :".format(time.strftime("%H:%M:%S"), symbol))

        import pdb; pdb.set_trace()
        #driver = MyWebDriver()
        driver = PhantomJS()
        driver.set_window_size(1120, 550)
        driver.get("http://finance.yahoo.com/quote/{}/key-statistics?p={}".format(symbol, symbol))

        ##try:
        ##    session = dryscrape.Session()
        ##except socket.error as e:
        ##    print("Failed to configure session {}".format(e))
        ##    continue

        ##session.set_header('User-Agent', ua.random)
        ##session.set_timeout(30)
        #try:
        #    #session.visit("http://finance.yahoo.com/quote/{}/key-statistics?p={}".format(symbol, symbol))
        #except Exception as e:
        #    print e, "try once more......"
        #    session.reset()
        #    time.sleep(5)
        #    session = dryscrape.Session()
        #    #session.set_header('User-Agent', ua.random)
        #    try:
        #        session.set_timeout(5)
        #        session.visit("http://finance.yahoo.com/quote/{}/key-statistics?p={}".format(symbol, symbol))
        #    except Exception as e:
        #        print e, "done trying..."
        #        session.reset()
        #        time.sleep(2)
        #        session = dryscrape.Session()
        #        continue


        #except socket.error as e:
        #    print("Failed to get {}, {} (1)".format(symbol, e))
        #    continue
        #except webkit_server.EndOfStreamError as e:
        #    print("Failed to get {}, {}, breaking (2)".format(symbol, e))
        #    continue
        #except webkit_server.InvalidResponseError as e:
        #    print("Failed to get {}, {}, breaking (3)".format(symbol, e))
        #    continue

        #response = session.body()
        #soup = BeautifulSoup(response, "lxml")

        with open("{}.out".format(symbol), "w") as f:
            f.write(driver.page_source.encode('utf-8'))

        soup = BeautifulSoup(driver.page_source, "lxml")

        d = {'symbol': symbol}
        for indicator in indicators.keys():
            curr_ind = indicators[indicator]
            s = soup.find_all(attrs={curr_ind['attribute']: curr_ind['value']})
            print indicator, s

            for element in s:
                if curr_ind.has_key('transform'):
                    f = curr_ind['transform']
                    #print(f(element.text))
                    d[indicator] = f(element.text)
                else:
                    #print(element.text)
                    d[indicator] = element.text

        try:
            db.session.add(Indicators.from_json(d))
            db.session.commit()
        except (IntegrityError, UnmappedInstanceError) as e:
            print "Caught", e
            db.session.rollback()

        print "indicators", d
Example #45
0
def selenium_opener(url):
    driver = PhantomJS(executable_path='path to phantomjs')  # set this to the local PhantomJS binary
    driver.get(url)
    html = driver.page_source
    driver.quit()
    return html
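A brief usage sketch for selenium_opener; the target URL is an arbitrary placeholder:

from bs4 import BeautifulSoup

html = selenium_opener('http://example.com/')
soup = BeautifulSoup(html, 'lxml')
print(soup.title.string if soup.title else 'no <title> found')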
Example #46
0
def main():
    os.makedirs(dlDir, exist_ok=True)
    startCatIdx = int(sys.argv[1]) if len(sys.argv) > 1 else 0
    startFamIdx = int(sys.argv[2]) if len(sys.argv) > 2 else 0
    startPrdIdx = int(sys.argv[3]) if len(sys.argv) > 3 else 0
    executor = ThreadPoolExecutor()
    PhantomJS.waitClickable = waitClickable
    driver = PhantomJS()
    # harvest_utils.driver = driver
    with open('netgear_filelist.csv', 'w') as fout:
        cw = csv.writer(fout)
        cw.writerow([
            'model', 'fw_ver', 'fileName', 'fw_url', 'fw_date', 'fileSize',
            'sha1', 'md5'
        ])
    driver.get('http://downloadcenter.netgear.com/')
    # click DrillDown
    driver.waitClickable(
        '#ctl00_ctl00_ctl00_mainContent_localizedContent_bodyCenter_BasicSearchPanel_btnAdvancedSearch'
    ).click()  # noqa
    ctl00 = "#ctl00_ctl00_ctl00_mainContent_localizedContent_bodyCenter_adsPanel_"  # noqa ignore=E501
    #
    # wait Page2
    try:
        catSel = Select(driver.waitClickable(ctl00 + "lbProductCategory"))
        numCat = len(catSel.options)
        for catIdx in range(startCatIdx, numCat):
            catSel = Select(driver.waitClickable(ctl00 + "lbProductCategory"))
            print('catIdx=', catIdx)
            catTxt = catSel.options[catIdx].text
            uprint('catTxt= ' + catTxt)
            oldText = driver.getText(ctl00 + "lbProductFamily")
            catSel.select_by_index(catIdx)
            driver.waitTextChanged(ctl00 + "lbProductFamily", oldText)
            famSel = Select(driver.waitClickable(ctl00 + "lbProductFamily"))
            numFam = len(famSel.options)
            for famIdx in range(startFamIdx, numFam):
                famSel = Select(
                    driver.waitClickable(ctl00 + "lbProductFamily"))  # noqa
                print('famIdx=', famIdx)
                startFamIdx = 0
                famTxt = famSel.options[famIdx].text
                uprint('famTxt= ' + famTxt)
                oldText = driver.getText(ctl00 + "lbProduct")
                famSel.select_by_index(famIdx)
                driver.waitTextChanged(ctl00 + "lbProduct", oldText)
                prdSel = Select(driver.waitClickable(ctl00 + "lbProduct"))
                numPrd = len(prdSel.options)
                for prdIdx in range(startPrdIdx, numPrd):
                    prdSel = Select(driver.waitClickable(ctl00 + "lbProduct"))
                    startPrdIdx = 0
                    print("catIdx,famIdx,prdIdx=%d, %d, %d" %
                          (catIdx, famIdx, prdIdx))
                    prdTxt = prdSel.options[prdIdx].text
                    uprint('cat,fam,prd="%s","%s","%s"' %
                           (catTxt, famTxt, prdTxt))  # noqa ignore=E501
                    prdWaiting = driver.waitElem(
                        ctl00 +
                        "upProgProductLoader > div > img")  # noqa ignore=E501
                    prdSel.select_by_index(prdIdx)
                    try:
                        WebDriverWait(driver, 1, 0.5).\
                            until(lambda x: prdWaiting.is_displayed() is True)
                    except TimeoutException:
                        pass
                    try:
                        WebDriverWait(driver, 5, 0.5).\
                            until(lambda x: prdWaiting.is_displayed() is False)
                    except TimeoutException as ex:
                        pass
                    numResults = driver.waitText(
                        ctl00 + "lvwAllDownload_lblAllDownloadResult", 3,
                        0.5)  # noqa ignore=E501
                    if numResults is None:
                        continue
                    numResults = int(re.search(r"\d+", numResults).group(0))
                    print('numResults=', numResults)
                    if numResults > 10:
                        driver.waitClickable("#lnkAllDownloadMore", 3).click()
                    try:
                        erItems = driver.getElems(
                            'a.register-product.navlistsearch', 3, 0.5)  # noqa
                    except TimeoutException:
                        erItems = driver.getElems(
                            'div#LargeFirmware > ul > li > div > p > a.navlistsearch',
                            3)  # noqa ignore=E501

                    if len(erItems) != numResults:
                        print('Error, numResults=%d, but len(erItems)=%d' %
                              (numResults, len(erItems)))
                    for itemIdx, erItem in enumerate(erItems):
                        if not erItem.is_displayed():
                            print('itemIdx=%d is not displayed()' % itemIdx)
                            continue
                        erItem.getItemText = getItemText
                        desc = erItem.getElemText(erItem)
                        uprint('desc="%s"' % desc)
                        if 'firmware' not in desc.lower():
                            continue
                        fw_url = erItem.get_attribute('data-durl')
                        if not fw_url:
                            fw_url = erItem.get_attribute('fw_url')
                        print('fw_url=', fw_url)
                        if not fw_url:
                            continue
                        if not fw_url.startswith('http'):
                            print('Error: fw_url=', fw_url)
                            continue
                        executor.submit(download_file, prdTxt, desc, fw_url)
    except BaseException as ex:
        traceback.print_exc()
        import pdb
        pdb.set_trace()
        driver.save_screenshot("netgear_crawler2")
    finally:
        driver.quit()
        executor.shutdown(True)
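The crawler above monkey-patches PhantomJS with helpers such as waitClickable, waitElem, getText, waitText and waitTextChanged (see PhantomJS.waitClickable = waitClickable near the top of main), but their definitions are not included here. A sketch of how waitClickable could be implemented with standard Selenium waits, as an assumption about the missing helper:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def waitClickable(self, css_selector, timeout=10):
    # Bound onto the driver class via `PhantomJS.waitClickable = waitClickable`,
    # so `self` is the driver instance.
    return WebDriverWait(self, timeout).until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, css_selector)))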
Example #47
0
    'http://www.greatschools.org/north-carolina/morrisville/3360-Cedar-Fork-Elementary/',
]

indicators = ['EquityRaceEthnicity', 'EquityLowIncome', 'EquityDisabilities']
wd = PhantomJS()

output_cols = [
    'school', 'url', 'students_per_grade', 'teachers_to_student',
    'counselors_to_student', 'reading', 'math', 'science'
]
output_df = DataFrame(columns=output_cols)
output_ind = 0

for url in urls:
    t1 = time()
    wd.get(url)
    school_name = wd.title.split(' -')[0]
    print school_name,

    school_info = wd.find_elements_by_class_name('school-info__item')
    for s in school_info:
        inner_html = sub(r'<.*?>|\n', ' ', s.get_attribute('innerHTML'))
        inner_html = sub(r'\s+', ' ', inner_html).strip()
        if 'grades' in inner_html.lower():
            min_grade, max_grade = inner_html.split(' ')[-1].split('-')
            if min_grade.lower() == 'pk':
                min_grade = -1
            elif min_grade.lower() == 'k':
                min_grade = 0
            n_grades = int(max_grade) - int(min_grade) + 1
        elif 'students' in inner_html.lower():
Example #48
0
class plugin:
    def __init__(self):
        APP_ROOT = os.path.dirname(os.path.abspath(__file__))
        print(APP_ROOT)
        self.req = 0
        self.driver = PhantomJS(APP_ROOT + "/phantomjs",
                                service_log_path=os.path.devnull)
        self.driver.implicitly_wait(3)

    def restart(self):
        self.__init__()

    def frame_search(self, path):
        framedict = {}
        for child_frame in self.driver.find_elements_by_tag_name('frame'):
            child_frame_name = child_frame.get_attribute('name')
            framedict[child_frame_name] = {'framepath': path, 'children': {}}
            xpath = '//frame[@name="{}"]'.format(child_frame_name)
            self.driver.switch_to.frame(
                self.driver.find_element_by_xpath(xpath))
            framedict[child_frame_name]['children'] = self.frame_search(
                framedict[child_frame_name]['framepath'] + [child_frame_name])

            self.driver.switch_to.default_content()
            if len(framedict[child_frame_name]['framepath']) > 0:
                for parent in framedict[child_frame_name]['framepath']:
                    parent_xpath = '//frame[@name="{}"]'.format(parent)
                    self.driver.switch_to.frame(
                        self.driver.find_element_by_xpath(parent_xpath))
        return framedict

    def tmon(self):
        self.driver.get(
            "https://login.ticketmonster.co.kr/user/loginform?return_url=")
        self.driver.find_element_by_name('userid').send_keys(
            config['ACCOUNT']['tmon_id'])
        self.driver.find_element_by_name('password').send_keys(
            config['ACCOUNT']['tmon_pw'])
        self.driver.find_element_by_xpath('//*[@id="loginFrm"]/a[2]').click()
        self.driver.get(
            'http://m.benefit.ticketmonster.co.kr/promotions/page/attendance?view_mode=app'
        )
        self.driver.find_element_by_xpath(
            '//*[@id="attn_wrap"]/div/div/div[3]/div[2]/div[1]/button').click(
            )

        print(self.driver.find_element_by_class_name('content').text)
        self.tmon_ret = self.driver.find_element_by_class_name('content').text

    def ondisk(self):
        try:
            self.driver.get("http://ondisk.co.kr/index.php")
            self.driver.implicitly_wait(3)
            self.driver.find_element_by_xpath('//*[@id="mb_id"]').send_keys(
                config['ACCOUNT']['ondisk_id'])
            self.driver.find_element_by_xpath(
                '//*[@id="page-login"]/form/div[2]/p[2]/input').send_keys(
                    config['ACCOUNT']['ondisk_pw'])
            self.driver.find_element_by_xpath(
                '//*[@id="page-login"]/form/div[2]/p[3]/input').click()
            self.driver.get(
                "http://ondisk.co.kr/index.php?mode=eventMarge&sm=event&action=view&idx=746&event_page=1"
            )
            self.driver.switch_to_frame(1)
            self.driver.execute_script(
                "window.alert = function(msg){ window.msg = msg; };")
            self.driver.find_element_by_class_name('button').click()

            alert_text = self.driver.execute_script("return window.msg;")
            print(alert_text)
        except:
            alert_text = None  # keep ondisk_ret defined even when the flow above fails
            print("ERR")
            print(self.driver.page_source)
        self.ondisk_ret = alert_text

    def ok_cash_bag(self):
        today = datetime.datetime.now().strftime("%Y%m%d")
        sess = requests.session()
        getdata = sess.get(
            "https://www.facebook.com/login.php?login_attempt=1&next=https%3A%2F%2Fwww.facebook.com%2Fv2.6%2Fdialog%2Foauth%3Fredirect_uri%3Dhttps%253A%252F%252Fmember.okcashbag.com%252Focb%252FsocialId%252FfacebookProcessor%26scope%3Dpublic_profile%252Cuser_birthday%252Cemail%26client_id%3D645711852239977%26ret%3Dlogin%26logger_id%3D91698e1d-fe1e-b325-4c13-b62636843a9e&lwv=101"
        )
        param = {
            "lsd": "AVpmy4vJ",
            "api_key": "645711852239977",
            "cancel_url":
            "https://member.okcashbag.com/ocb/socialId/facebookProcessor?error=access_denied&error_code=200&error_description=Permissions+error&error_reason=user_denied#_=_",
            "display": "page",
            "enable_profile_selector": "",
            "isprivate": "",
            "legacy_return": "0",
            "profile_selector_ids": "",
            "return_session": "",
            "skip_api_login": "******",
            "signed_next": "1",
            "trynum": "1",
            "timezone": "-540",
            "lgndim":
            "eyJ3IjoxOTIwLCJoIjoxMDgwLCJhdyI6MTkyMCwiYWgiOjEwNDAsImMiOjI0fQ==",
            "lgnrnd": "173648_UqkK",
            "lgnjs": "1528418208",
            "email": config['ACCOUNT']['fb_id'],
            "pass": config['ACCOUNT']['fb_pw'],
            "prefill_contact_point": config['ACCOUNT']['fb_id'],
            "prefill_source": "last_login",
            "prefill_type": "contact_point",
            "first_prefill_source": "last_login",
            "first_prefill_type": "contact_point",
            "had_cp_prefilled": "true",
            "had_password_prefilled": "false"
        }
        postdata = sess.post(
            "https://www.facebook.com/login.php?login_attempt=1&next=https%3A%2F%2Fwww.facebook.com%2Fv2.6%2Fdialog%2Foauth%3Fredirect_uri%3Dhttps%253A%252F%252Fmember.okcashbag.com%252Focb%252FsocialId%252FfacebookProcessor%26scope%3Dpublic_profile%252Cuser_birthday%252Cemail%26client_id%3D645711852239977%26ret%3Dlogin%26logger_id%3D91698e1d-fe1e-b325-4c13-b62636843a9e&lwv=101",
            data=param)

        # print(postdata.text)
        postdata = sess.post(
            "https://member.okcashbag.com//ocb/socialId/socialIdLoginProcess/42100/687474703A2F2F7777772e6f6b636173686261672e636f6d2F696e6465782e646f3F6c6f67696e3D59"
        )
        samlResponse = postdata.text.split("samlResponse.value = \"")[1].split(
            "\"")[0]
        # print(samlResponse)
        param = {"samlResponse": samlResponse, "sst_cd": "", "return_url": ""}
        postdata = sess.post("http://www.okcashbag.com/index.do?login=Y",
                             data=param)
        print(
            postdata.text.split('<span id="profileNickname" class="name">')
            [1].split("</span>")[0] + "님 로그인")  # "<nickname> logged in"
        print(
            postdata.text.split('<span id="spanUsablePoint">')[1].split(
                '</span>')[0] + "포인트")  # "<n> points available"
        getdata = sess.get(
            "http://www.okcashbag.com/life/event/attend/attendMain.do")
        param = {"method": "", "myUrl": "", "recommUser": "", "today": today}
        postdata = sess.post(
            "http://www.okcashbag.com/life/event/attend/attend.do", data=param)
        print(postdata.text)
        if len(postdata.text.split('<i class="win-point">')) > 1:
            print(postdata.text.split('<i class="win-point">')[1] + "포인트 적립")
        elif len(postdata.text.split("success")) > 1:
            print("출석체크 완료 ")
            self.ok_ret = "출석체크 완료"
        else:
            print('이미 출석체크 완료')
            self.ok_ret = "이미 출석체크 완료"
Example #49
0
def get_url_files(retail, invoice_doc_type, invoice_id, invoice_date,
                  invoice_amount):
    retail_invoice_url = RETAIL_INVOICE_URL[retail]

    driver = PhantomJS()
    driver.get(retail_invoice_url)

    # 1 Set doc_type 'select'
    try:
        select_doc_type = Select(driver.find_element_by_name('txtTipoDte'))
        value = RETAIL_INVOICE_DOC_TYPES[retail][invoice_doc_type]['value']
        select_doc_type.select_by_value(value)
        # name = RETAIL_INVOICE_DOC_TYPES[retail][invoice_doc_type]['name']
        # select_doc_type.select_by_visible_text(name)
    except Exception:
        print 'ERROR: set doc_type select as Boleta'
        driver.save_screenshot('screen.png')
        return '', ''

    time.sleep(5)

    # 2 Get recaptcha img url
    try:
        recaptcha_img = driver.find_element_by_id('recaptcha_challenge_image')
        recaptcha_img_url = recaptcha_img.get_attribute('src')
    except Exception:
        print 'ERROR: get recaptcha image url'
        driver.save_screenshot('screen.png')
        return '', ''

    # 3 Solve recaptcha
    v = VisionApi()
    recaptcha_value = v.detect_text_from_url(recaptcha_img_url)

    if recaptcha_value is None:
        print 'ERROR: solving recaptcha image'
        driver.save_screenshot('screen.png')
        return '', ''

    # 4 Fill form
    script = u"""
        document.getElementsByName('txtFolio')[0].value = '{invoice_id}';
        document.getElementsByName('txtFechaEmision')[0].value = '{invoice_date}';
        document.getElementsByName('txtMontoTotal')[0].value = '{invoice_amount}';
        document.getElementsByName('recaptcha_response_field')[0].value = '{recaptcha_value}';
    """.format(
        invoice_id=invoice_id,
        invoice_date=invoice_date,
        invoice_amount=invoice_amount,
        recaptcha_value=recaptcha_value,
    )
    driver.execute_script(script)

    # 5 Submit form
    try:
        driver.find_element_by_name('frmDatos').submit()
    except Exception:
        print 'ERROR: submitting form'
        driver.save_screenshot('screen.png')
        return '', ''

    # 6 Get url files
    try:
        xml_a_tag = driver.find_element_by_xpath(
            '//*[@id="Tabla_01"]/tbody/tr[1]/td[2]/p/a[2]')
        pdf_a_tag = driver.find_element_by_xpath(
            '//*[@id="Tabla_01"]/tbody/tr[1]/td[2]/p/a[1]')

        xml_url = xml_a_tag.get_attribute('href')
        pdf_url = pdf_a_tag.get_attribute('href')
    except Exception:
        print 'ERROR: getting url files'
        driver.save_screenshot('screen.png')
        return '', ''

    # 8 Delete driver session
    driver.close()
    driver.quit()

    return xml_url, pdf_url
Example #50
0
class WeixinPhantomjs(Base):
    all_uids = {docs['uid'] for docs in in_collection.find({}, {'uid': 1}) if 'uid' in docs}

    def __init__(self):
        self.start_page = START_PAGE
        self.end_page = END_PAGE
        self.weixin_url = REFER_FIRST

        # self.driver = Firefox()
        if hasattr(config, 'PHANTOMJS_PATH'):
            self.driver = PhantomJS(executable_path=getattr(config, 'PHANTOMJS_PATH'))
        else:
            self.driver = PhantomJS()

    def open_weixin_browser(self, word):
        try:
            self.driver.get(self.weixin_url)
            self.driver.set_page_load_timeout(3)

            self.driver.find_element_by_id('upquery').send_keys(word)
            self.driver.find_element_by_class_name('swz').click()
            time.sleep(3)

            urls_uids = self.extract_urls_uids(word=word)
            Article(urls_uids=urls_uids, word=word).extract()
        except Exception as e:
            storage_word.append([word, 0])
            self.logger.info('Open weixin error: type <{}>, msg <{}>'.format(e.__class__, e))
            self.close_browser()
            return True
        return False

    def get_total_pages_to_word(self):
        pages = []
        page_id_css = 'pagebar_container'

        try:
            e = self.driver.find_element_by_id(page_id_css)
            for _p in e.text.split():
                _p = _p.strip()

                if not _p.isdigit():
                    return DEFAULT_PAGES if DEFAULT_PAGES <= pages[-1] else pages[-1]
                else:
                    pages.append(int(_p))
            return 1
        except (NoSuchElementException, NoSuchWindowException, TypeError, IndexError):
            pass
        return 1

    def extract_urls_uids(self, word):
        urls_uids = []
        timestamp = [_t.get_attribute('t') for _t in self.driver.find_elements_by_css_selector('div.s-p')]
        urls_tits = [(t.get_attribute('href'), self.trim(t.text))
                     for t in self.driver.find_elements_by_css_selector('h4 a')]

        if len(urls_tits) != len(timestamp):
            return urls_uids

        for index, url_tit in enumerate(urls_tits):
            try:
                uid = self.md5(timestamp[index] + url_tit[1] + word)

                if uid not in self.__class__.all_uids:
                    self.__class__.all_uids.add(uid)
                    urls_uids.append({'url': url_tit[0], 'uid': uid})
            except (TypeError, IndexError):
                pass
        return urls_uids

    @property
    def is_forbidden(self):
        css_id = 'seccodeForm'

        try:
            if self.driver.find_element_by_id(css_id):
                return True
        except NoSuchElementException:
            pass
        return False

    def appear_element(self, by):
        try:
            # Wait until the element with this id is present, then click it
            tag = WebDriverWait(self.driver, 20).until(lambda driver: driver.find_element_by_id(by))
            tag.click()
            return True
        except (TimeoutException, NoSuchWindowException, NoSuchElementException):
            pass
        return False

    def crawl_single(self, word=None, go=0):
        is_go = True
        go_page = int(go)
        next_page_css = 'sogou_page_%s'

        is_break = self.open_weixin_browser(word)
        pages = self.get_total_pages_to_word()

        for page in range(self.start_page + 1, (pages or self.end_page) + 1):
            if is_go and page < go_page:
                continue
            else:
                is_go = False

            if not self.appear_element(by=next_page_css % page):
                is_break = True
                msg = '\tNot appear next page element, will break'
            elif self.is_forbidden:
                is_break = True
                msg = '\tSpider was forbidden, crawling again after sleeping a moment!'

            if is_break:
                storage_word.append([word, page])
                self.logger.info(msg)
                break

            urls_uids = self.extract_urls_uids(word=word)
            Article(urls_uids=urls_uids, word=word).extract()

            # self.driver.find_element_by_id(next_page_css % page).click()
            # wt = randint(10, 40) if page % 5 == 0 else randint(5, 18)
            wt = randint(1, 5)
            self.logger.info('Word <{}>, Page <{}> Done, sleeping {}s!'.format(word, page, wt))
            # self.driver.implicitly_wait(wt)
            time.sleep(wt)

        self.close_browser()

    @classmethod
    def crawl_with_threads(cls):
        pool = ThreadPool(4)
        total_words = QueryWords().get_query_words()

        for bulk_words in total_words:
            try:
                pool.map(lambda w: cls().crawl_single(w), bulk_words)
            except Exception as e:
                cls.logger.info('Threads crawl error: type <{}>, msg <{}>'.format(e.__class__, e))

        pool.close()
        pool.join()
        in_client.close()

    def close_browser(self):
        try:
            self.driver.close()
        except (NoSuchWindowException,):
            pass
from selenium.webdriver import PhantomJS as Browser
import json
import time
import re

proxy_list_url = "http://spys.one/socks/"
proxies = []
br = Browser()
br.get(proxy_list_url)
sizes = [25, 50, 100, 200, 300, 500]
pattern = re.compile(r"[.\s]+\((\d+)\)")
for country_id in range(1, 171):
    try_counter = 0
    count = 0
    while (elm := br.find_element_by_id('tldc')).find_element_by_xpath(
            f"./option[@selected]").get_attribute("value") != str(country_id):
        elm = elm.find_element_by_xpath(f'./option[@value="{country_id}"]')
        elm.click()
        try_counter += 1
        if try_counter >= 2:
            break
    if try_counter >= 2:
        continue
    count = int(pattern.findall(elm.text)[0])
    key = 0
    for key, size in enumerate(sizes):
        if int(size) > count:
            break
    try_counter = 0
    while (elm := br.find_element_by_id("xpp")).find_element_by_xpath(
            "./option[@selected]").get_attribute("value") != str(key):
Example #52
0
    def get_applications_in_page(self, scroll_script):
        applications = []
        driver = None
        try:
            desired_capabilities = dict(DesiredCapabilities.PHANTOMJS)
            desired_capabilities["phantomjs.page.settings.userAgent"] = useragent.get_random_agent(google_prop.user_agent_list_url)
            service_args = ['--load-images=no', '--proxy=%s' % (proxy.get_random_proxy(google_prop.proxy_list_url))]
            driver = PhantomJS(desired_capabilities=desired_capabilities, service_args=service_args)
            # driver = Firefox(firefox_profile=self.fp, proxy=self.proxy)

            if self.proxy_test:
                driver.get('http://curlmyip.com/')
                ip = driver.find_element_by_xpath('//body//pre').text
                print('ip : [ ' + ip + ' ]')
            else:
                driver.get(self.url)
                driver.execute_script(scroll_script)

                # Keep polling until scraperLoadCompleted stays true for
                # self.acknowledgements consecutive extra checks.
                acknowledge = 0
                done = False
                while not done:
                    scroll_finished = driver.execute_script("return scraperLoadCompleted")
                    if scroll_finished:
                        if acknowledge == self.acknowledgements:
                            done = driver.execute_script("return scraperLoadCompleted")
                        else:
                            acknowledge += 1
                    else:
                        acknowledge = 0
                    time.sleep(5)  # Wait before retry

                product_matrix = driver.find_elements_by_class_name("card")
                for application in product_matrix:
                    extracted_application = self.extract_application_data(application)
                    # if extracted_application['app_price'] != -1:
                    applications.append(extracted_application)
            driver.quit()

        except Exception as e:
            if driver is not None:
                driver.quit()

            if self.attempt < self.retries:
                self.attempt += 1
                time.sleep(10)
                print('retry : url [ ' + self.url + ' ] | attempt [ ' + str(self.attempt) + ' ] | error [ ' + str(e) + ' ]')
                applications = self.get_applications_in_page(scroll_script)
            else:
                print('fail : url [ ' + self.url + ' ] | error [ ' + str(e) + ' ]')
        return applications
Example #53
0
class RequestUtil:
    __browserAgent = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0'

    def __init__(self):
        self.cookies = ''
        self._lock = threading.RLock()

    def http_get_request(self, url, referer, timeout=''):
        self._lock.acquire()
        cookie = cookielib.CookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie),
                                      SmartRedirectHandler())
        urllib2.install_opener(opener)
        headers = {
            'User-Agent': self.__browserAgent,
            'Referer': referer,
            'Cache-Control': 'max-age=0',
            'Accept': '*/*',
            'Connection': 'Keep-Alive',
            'Accept-encoding': 'gzip'
        }
        req = urllib2.Request(url=url, headers=headers)
        if timeout == '':
            open = urllib2.urlopen(req)
        else:
            open = urllib2.urlopen(req, timeout=timeout)
        if self.cookies == '':
            for item in cookie:
                self.cookies = self.cookies + item.name + '=' + item.value + ';'
            self.cookies = self.cookies[:-1]
        if url != open.url:
            req = urllib2.Request(url=open.url, headers=headers)
        self._lock.release()
        return (open, req)

    def http_post_request(self, url, datas, referer, timeout=''):
        self._lock.acquire()
        postdata = urllib.urlencode(datas)
        headers = {
            'User-Agent': self.__browserAgent,
            'Referer': referer,
            'Content-Type': 'application/x-www-form-urlencoded',
            'Cache-Control': 'no-cache',
            'Accept': '*/*',
            'Connection': 'Keep-Alive',
            'Accept-encoding': 'gzip',
            'Cookie': self.cookies
        }
        req = urllib2.Request(url=url, data=postdata, headers=headers)
        req.get_host()
        if timeout == '':
            open = urllib2.urlopen(req)
        else:
            open = urllib2.urlopen(req, timeout=timeout)
        if url != open.url:
            req = urllib2.Request(url=open.url, headers=headers)
        self._lock.release()
        return (open, req)

    def http_get(self, url, refer='https://www.baidu.com'):
        return self.http_get_request(url, refer, 60)

    def http_post(self, url, datas, refer='https://www.baidu.com'):
        return self.http_post_request(url, datas, refer, 60)

    def http_post_request2(self, url, datas, timeout=''):
        if timeout == '':
            open = urllib2.urlopen(url, datas)
        else:
            open = urllib2.urlopen(url, datas, timeout=timeout)
        data = open.read()
        return data

    def http_post2(self, url, datas):
        return self.http_post_request2(url, datas, 300)

    def create_phandomjs(self, service_args, caps, timeout=30):
        self.driver = PhantomJS(desired_capabilities=caps,
                                service_args=service_args)
        self.driver.set_page_load_timeout(timeout)
        self.driver.set_script_timeout(timeout)
        self.driver.implicitly_wait(timeout)

    def close_phandomjs(self):
        try:
            self.driver.quit()
        except Exception:
            pass

    def http_get_phandomjs(self,
                           url,
                           refer='https://www.baidu.com',
                           timeout=1000):
        caps = dict(DesiredCapabilities.PHANTOMJS)
        caps['browserName'] = 'chrome'
        caps["phantomjs.page.settings.resourceTimeout"] = timeout
        caps["phantomjs.page.settings.loadImages"] = False
        caps["phantomjs.page.settings.userAgent"] = (self.__browserAgent)
        caps["phantomjs.page.customHeaders.Referer"] = (refer)

        service_args = []
        service_args.append('--load-images=no')
        service_args.append('--disk-cache=yes')
        service_args.append('--cookies-file=')
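        # PhantomJS command-line switches: skip image downloads, keep the disk
        # cache, and pass an empty --cookies-file so no cookie file is persisted.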

        self.create_phandomjs(timeout=timeout,
                              service_args=service_args,
                              caps=caps)
        self.driver.get(url)
        return self.driver.page_source
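
# A minimal usage sketch for the class above (illustrative only):
#   util = RequestUtil()
#   html = util.http_get_phandomjs('https://example.com/')
#   util.close_phandomjs()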
Example #54
0
class SaveFromScraper(object):
    """
    Web scraper for savefrom.net
    """

    def __init__(self):
        """
        Default constructor

        ARGS:
            None
        RETURNS:
            None
        """
        self.browser = PhantomJS(executable_path='./drivers/phantomjs',
                                 port=free_port())  # Optional argument, if not specified will search path.
        self.timeout = 5 # seconds

    def get_video(self, video_id, url, quality):
        """
        Main function that does heavy lifting.
        Select video quality in this function.
        """
        # Sketch only (the original body was empty): chain the helpers below.
        has_loaded, html = self._get_html(video_id)
        if has_loaded:
            links = self._parse_html(html)
            if quality in links:
                self._download_file(video_id, links[quality])

    def _get_html(self, video_id):
        """
        For a given video, returns the HTML for the download page

        ARGS:
            video_id: unique video identifier
        RETURNS:
            Tuple: whether page has loaded, and HTML for page
        """
        url = 'https://www.ssyoutube.com/watch?v=' + video_id ## TODO: remove - this is only for testing
        self.browser.get(url)

        try:
            class_name = "link-download"
            WebDriverWait(self.browser, self.timeout).until(EC.presence_of_element_located((By.CLASS_NAME, 'link-download')))
            has_loaded = True
            print "Page is ready!"
        except TimeoutException:
            has_loaded = False
            print "Loading took too much time!"

        if has_loaded:
            html = self.browser.page_source
        else:
            html = ""

        return (has_loaded, html)

    def _parse_html(self, html):
        """
        Find the links for downloading the video in the HTML

        ARGS:
            html: web page source
        RETURNS:
            Dictionary containing all links for downloading the video
        """
        link_dict = dict()
        soup = BeautifulSoup(html, 'html.parser')
        for link in soup.findAll("a", { "class" : "link-download" }):
            if 'title' in link.attrs:
                title = link.attrs['title'].split(': ')[1]
                url = link.attrs['href']
                link_dict[title] = url
            else:
                pass
        return link_dict

    def _download_file(self, video_id, download_url):
        """
        Download video file from SaveFromNet

        ARGS:
            video_id: unique video identifier
        RETURNS:
            None
        """
        f = urllib2.urlopen(download_url)
        with open(video_id + ".mp4", "wb") as code:
            code.write(f.read())

    def quit(self):
        """
        Quit browser session

        ARGS:
            None
        RETURNS:
            None
        """
        self.browser.close()
        self.browser.quit()
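
# A minimal usage sketch for the scraper above (illustrative only; 'VIDEO_ID'
# is a placeholder):
#   scraper = SaveFromScraper()
#   has_loaded, html = scraper._get_html('VIDEO_ID')
#   if has_loaded:
#       links = scraper._parse_html(html)
#   scraper.quit()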