    # Number of proxies reported on the page; pick the smallest
    # page-size option whose value covers that count.
    count = int(pattern.findall(elm.text)[0])
    key = 0
    for key, size in enumerate(sizes):
        if int(size) > count:
            break
    # Re-select the desired page-size option until the page reflects it,
    # giving up after five attempts.
    try_counter = 0
    while (elm := br.find_element_by_id("xpp")).find_element_by_xpath(
            "./option[@selected]").get_attribute("value") != str(key):
        elm = elm.find_element_by_xpath(f'./option[@value="{key}"]')
        elm.click()
        try_counter += 1
        if try_counter >= 5:
            break
    if try_counter >= 5:
        continue
    elms = br.find_elements_by_class_name(
        "spy1xx")[1:] + br.find_elements_by_class_name("spy1x")[1:]
    start = time.time()
    for i, elm in enumerate(elms, 1):
        tds = elm.find_elements_by_tag_name("td")
        proxies.append(
            dict(proxy_url=tds[0].text,
                 type=tds[1].text,
                 delay=float(tds[5].text)))
        if i % 50 == 0:
            # Report how long each batch of 50 rows took to scrape.
            stop = time.time()
            print(stop - start)
            start = stop
proxies.sort(key=lambda x: x['delay'])
with open("proxies.json", "w") as f:
Example #2
from re import sub
from time import time

from pandas import DataFrame
from selenium.webdriver import PhantomJS

wd = PhantomJS()

output_cols = [
    'school', 'url', 'students_per_grade', 'teachers_to_student',
    'counselors_to_student', 'reading', 'math', 'science'
]
output_df = DataFrame(columns=output_cols)
output_ind = 0

for url in urls:
    t1 = time()
    wd.get(url)
    school_name = wd.title.split(' -')[0]
    print(school_name, end=' ')

    school_info = wd.find_elements_by_class_name('school-info__item')
    for s in school_info:
        inner_html = sub(r'<.*?>|\n', ' ', s.get_attribute('innerHTML'))
        inner_html = sub(r'\s+', ' ', inner_html).strip()
        if 'grades' in inner_html.lower():
            min_grade, max_grade = inner_html.split(' ')[-1].split('-')
            if min_grade.lower() == 'pk':
                min_grade = -1
            elif min_grade.lower() == 'k':
                min_grade = 0
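            # PK is encoded as -1 and K as 0 so the inclusive count below
            # also covers pre-K and kindergarten.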
            n_grades = int(max_grade) - int(min_grade) + 1
        elif 'students' in inner_html.lower():
            n_students = int(sub(r'[^0-9]', '', inner_html.split(' ')[-1]))
    students_per_grade = float(n_students) / float(n_grades)

    staff_info = wd.find_element_by_id('TeachersStaff').find_elements_by_class_name(
        'rating-container__score-item')
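
The snippet is truncated here; the remaining columns are presumably filled by parsing ratio strings such as "17:1" out of those score items. A hedged sketch of that step (the helper name and text format are assumptions):

def parse_ratio(text):
    # "17:1" -> 17.0 students per staff member (format is an assumption)
    students, staff = text.split(':')
    return float(students) / float(staff)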
Example #3
class CNStock(SentimentCrawler):
    def __init__(self):
        super().__init__(init=False)
        self.driver = PhantomJS()
        self.driver.maximize_window()
        self.wait = WebDriverWait(self.driver, 15)
        self.url = 'http://www.cnstock.com/'
        self.name = '中国证券网'

    def crawl_main_page(self, keyword):
        self.driver.set_page_load_timeout(10)
        try:
            self.driver.get(self.url)
        except TimeoutException:
            self.driver.execute_script('window.stop();')

        try:
            self.wait.until(
                ec.presence_of_element_located((By.ID, 'nav_keywords')))
        except TimeoutException:
            CustomLogging.log_to_file('Failed to open 中国证券网', LogType.ERROR)
            return []

        search_box = self.driver.find_element_by_id('nav_keywords')
        search_box.clear()
        search_box.send_keys(keyword + Keys.ENTER)

        return self.crawl_search_results()

    def crawl_search_results(self):
        search_results = []
        self.driver.switch_to.window(self.driver.window_handles[-1])
        self.driver.maximize_window()

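        # exit_flag is raised once a result older than year_range appears,
        # letting the pagination loop below stop early.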
        exit_flag = 0
        while True:
            try:
                self.wait.until(
                    ec.presence_of_element_located(
                        (By.CLASS_NAME, 'result-cont')))
            except TimeoutException:
                CustomLogging.log_to_file('中国证券网 search results page error', LogType.ERROR)
                break

            try:
                result_articles = self.driver.find_elements_by_class_name(
                    'result-article')

                for each_article in result_articles:
                    item = Entity()

                    publish_date = each_article.find_element_by_class_name(
                        'g').text
                    item.publish_date = re.search(
                        re.compile(
                            r'[1-9]\d{3}-(0[1-9]|1[0-2])-(0[1-9]|[1-2][0-9]|3[0-1])\s+(20|21|22|23|[0-1]\d):[0-5]\d'
                        ), publish_date).group()

                    if not in_date_range(
                            conv_pub_date(item.publish_date, 'cnstock'),
                            self.year_range):
                        exit_flag = 1
                        # break out of the for loop
                        break
                    item.short_description = each_article.find_element_by_class_name(
                        'des').text
                    item.title = each_article.find_element_by_tag_name(
                        'a').text
                    if self.keyword not in item.short_description and self.keyword not in item.title:
                        continue

                    if item.title in self.titles:
                        continue
                    else:
                        self.titles.append(item.title)

                    item.url = each_article.find_element_by_tag_name(
                        'a').get_attribute('href')
                    threading.Thread(target=super().download_and_save_item,
                                     args=(item, )).start()

                if exit_flag == 1:
                    break
            except NoSuchElementException:
                CustomLogging.log_to_file('No search results', LogType.INFO)
                break

            try:
                next_page = self.driver.find_element_by_xpath(
                    '//div[@class="pagination pagination-centered"]//a[contains(text(), "下一页")]'
                )
                self.driver.get(next_page.get_attribute('href'))
                # next_page.click()
            except NoSuchElementException:
                break

        return search_results

    def parse_html(self, url, html):
        bs = BeautifulSoup(html, 'lxml')
        try:
            full_content = bs.find('div', attrs={'id': 'qmt_content_div'}).text
            return full_content
        except Exception:
            CustomLogging.log_to_file('Page parse error: {0}|{1}'.format(self.name, url),
                                      LogType.ERROR)
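
A minimal usage sketch for the class above; the keyword, the year-range format, and the titles list are assumptions about what the SentimentCrawler base class normally provides:

crawler = CNStock()
crawler.keyword = 'example'        # hypothetical search keyword
crawler.year_range = (2018, 2020)  # format assumed from the in_date_range() call
crawler.titles = []                # de-duplication list used by crawl_search_results
try:
    crawler.crawl_main_page(crawler.keyword)
finally:
    crawler.driver.quit()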
Example #4
    def get_applications_in_page(self, scroll_script):
        applications = []
        driver = None
        try:
            # Rotate user agent and proxy before starting PhantomJS.
            desired_capabilities = dict(DesiredCapabilities.PHANTOMJS)
            desired_capabilities["phantomjs.page.settings.userAgent"] = \
                useragent.get_random_agent(google_prop.user_agent_list_url)
            service_args = ['--load-images=no',
                            '--proxy=%s' % proxy.get_random_proxy(google_prop.proxy_list_url)]
            driver = PhantomJS(desired_capabilities=desired_capabilities,
                               service_args=service_args)
            # driver = Firefox(firefox_profile=self.fp, proxy=self.proxy)

            if self.proxy_test:
                # Sanity check the proxy by printing the external IP it reports.
                driver.get('http://curlmyip.com/')
                ip = driver.find_element_by_xpath('//body//pre').text
                print('ip : [ ' + ip + ' ]')
            else:
                driver.get(self.url)
                driver.execute_script(scroll_script)

                # Poll the page-side flag set by scroll_script; the flag must
                # stay true for self.acknowledgements consecutive checks before
                # the page is considered fully loaded.
                acknowledge = 0
                done = False
                while not done:
                    scroll_finished = driver.execute_script("return scraperLoadCompleted")
                    if scroll_finished:
                        if acknowledge == self.acknowledgements:
                            done = driver.execute_script("return scraperLoadCompleted")
                        else:
                            acknowledge += 1
                    else:
                        acknowledge = 0
                    time.sleep(5)  # wait before polling again

                product_matrix = driver.find_elements_by_class_name("card")
                for application in product_matrix:
                    extracted_application = self.extract_application_data(application)
                    # if extracted_application['app_price'] != -1:
                    applications.append(extracted_application)
            driver.quit()

        except Exception as e:
            if driver is not None:
                driver.quit()

            if self.attempt < self.retries:
                self.attempt += 1
                time.sleep(10)
                print('retry : url [ ' + self.url + ' ] | attempt [ ' + str(self.attempt) + ' ] | error [ ' + str(e) + ' ]')
                applications = self.get_applications_in_page(scroll_script)
            else:
                print('fail : url [ ' + self.url + ' ] | error [ ' + str(e) + ' ]')
        return applications
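
The sleep-and-poll loop above can also be written with Selenium's WebDriverWait; a minimal sketch, assuming the same page-side scraperLoadCompleted flag that scroll_script installs:

from selenium.webdriver.support.ui import WebDriverWait

def wait_for_scroll(driver, timeout=120):
    # Re-evaluate the page flag every 5 seconds until it is true, or time out.
    WebDriverWait(driver, timeout, poll_frequency=5).until(
        lambda d: d.execute_script("return scraperLoadCompleted"))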
Example #6
from selenium.webdriver import PhantomJS


driver = PhantomJS()

url = ('https://www.google.com/finance?start=0&num=5000&q=%5B(exchange%20%3D'
       '%3D%20"{}")%20%26%20(last_price%20>%200.1)%20%26%20(last_price%20<'
       '%201500)%5D&restype=company&noIL=1')

driver.get(url.format('NYSE'))
nyse = [elem.text for elem in driver.find_elements_by_class_name('symbol')]
driver.get('https://www.google.com/finance?q=NYSE%3A{}'.format(nyse[0]))
print(driver.find_element_by_class_name('pr').text)


# driver.get(url.format('NASDAQ'))
# nasdaq = (elem.text for elem in driver.find_elements_by_class_name('symbol'))
# print '\n'.join(list(nasdaq))
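
PhantomJS keeps its helper process alive after the script ends, so it is worth quitting the driver explicitly once the quotes have been printed:

driver.quit()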