Example 1
class SeleniumTestCase(LiveServerTestCase):
    def _pre_setup(self):
        super(SeleniumTestCase, self)._pre_setup()
        self.driver = PhantomJS()

    def _post_teardown(self):
        self.driver.quit()
        super(SeleniumTestCase, self)._post_teardown()

    def login(self, username='******', password='******', url='login'):
        """
        Log in to the server and authenticate.
        """
        self.open(reverse(url))
        self.driver.find_element_by_id("id_username").clear()
        self.driver.find_element_by_id("id_username").send_keys(username)
        self.driver.find_element_by_id("id_password").clear()
        self.driver.find_element_by_id("id_password").send_keys(password)
        self.driver.find_element_by_id("submit-id-login").click()

    def open(self, url):
        self.driver.get("%s%s" %(self.live_server_url, url))

    def is_element_present(self, how, what):
        try:
            self.driver.find_element(by=how, value=what)
        except NoSuchElementException:
            return False
        return True
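
A minimal sketch of how this base class might be used in a concrete test. The 'dashboard' URL name and the 'content' element id are hypothetical, and By is assumed to come from selenium.webdriver.common.by:

from selenium.webdriver.common.by import By

class DashboardTests(SeleniumTestCase):
    def test_dashboard_after_login(self):
        self.login()                      # authenticate with the form defined above
        self.open(reverse('dashboard'))   # hypothetical URL name
        self.assertTrue(self.is_element_present(By.ID, 'content'))  # hypothetical element id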
Example 2
    def onegoogolePR(self, url):
        """Return the Google PageRank for a single URL."""
        prUrl = 'http://pr.chinaz.com'  # Google PR lookup service
        driver = PhantomJS()
        driver.get(prUrl)
        driver.find_element_by_id('PRAddress').send_keys(url)
        driver.find_element_by_class_name('search-write-btn').click()
        try:
            imgsrc = driver.find_element_by_css_selector('span#pr>img').get_attribute('src')
            pr = search(r'\d', imgsrc).group()  # search: presumably re.search, imported elsewhere
        except Exception:
            pr = '暂无数据'  # "no data available"
        driver.quit()
        return pr
def check_agree(link, soup):
    # Agree if asked to (click on accept)

    if soup.find('input',
                 {'id': 'ctl00_mainContentArea_disclaimerContent_yesButton'}):
        print("Agreeing to the terms of use - please wait...")
        driver = PhantomJS('.\\phantomjs.exe' if platform.startswith('win32')
                           else './phantomjs')
        driver.get(link)
        driver.find_element_by_id(
            'ctl00_mainContentArea_disclaimerContent_yesButton').click()
        for cookie in driver.get_cookies():
            s.cookies.set(cookie['name'], cookie['value'])
        driver.quit()
        resp_inner = s.get(link)
        soup = Soup(resp_inner.text, features="lxml")
        print("Done, now let's get back to the scraping process.")

    return soup
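
A hedged sketch of how check_agree would be called from the surrounding scraper, assuming s is the requests.Session and Soup the BeautifulSoup class imported elsewhere in the same module:

resp = s.get(link)
soup = Soup(resp.text, features="lxml")
soup = check_agree(link, soup)  # clicks through the disclaimer, if shown, and re-fetches the page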
Example 4
    for s in school_info:
        inner_html = sub(r'<.*?>|\n', ' ', s.get_attribute('innerHTML'))
        inner_html = sub(r'\s+', ' ', inner_html).strip()
        if 'grades' in inner_html.lower():
            min_grade, max_grade = inner_html.split(' ')[-1].split('-')
            if min_grade.lower() == 'pk':
                min_grade = -1
            elif min_grade.lower() == 'k':
                min_grade = 0
            n_grades = int(max_grade) - int(min_grade) + 1
        elif 'students' in inner_html.lower():
            n_students = int(sub(r'[^0-9]', '', inner_html.split(' ')[-1]))
    students_per_grade = float(n_students) / float(n_grades)

    staff_info = wd.find_element_by_id(
        'TeachersStaff').find_elements_by_class_name(
            'rating-container__score-item')
    teacher_info = sub(r'<.*?>|\n', ' ',
                       staff_info[0].get_attribute('innerHTML'))
    teacher_info = sub(r'\s+', ' ', teacher_info).strip()
    counsel_info = sub(r'<.*?>|\n', ' ',
                       staff_info[1].get_attribute('innerHTML'))
    counsel_info = sub(r'\s+', ' ', counsel_info).strip()
    t_to_s_school = int(sub(r'.*?(\d+) :1.*', r'\1', teacher_info))
    c_to_s_school = int(sub(r'.*?(\d+) :1.*', r'\1', counsel_info))

    columns = [
        'indicator', 'subject', 'category', 'school_score', 'state_average'
    ]
    df = DataFrame(columns=columns)
    ind = 0
Example 5
class CNStock(SentimentCrawler):
    def __init__(self):
        super().__init__(init=False)
        self.driver = PhantomJS()
        self.driver.maximize_window()
        self.wait = WebDriverWait(self.driver, 15)
        self.url = 'http://www.cnstock.com/'
        self.name = '中国证券网'

    def crawl_main_page(self, keyword):
        self.driver.set_page_load_timeout(10)
        try:
            self.driver.get(self.url)
        except TimeoutException:
            self.driver.execute_script('window.stop();')

        try:
            self.wait.until(
                ec.presence_of_element_located((By.ID, 'nav_keywords')))
        except TimeoutException:
            CustomLogging.log_to_file('中国证券网打开失败', LogType.ERROR)

        self.driver.find_element_by_id('nav_keywords').clear()
        self.driver.find_element_by_id('nav_keywords').send_keys(keyword +
                                                                 Keys.ENTER)

        return self.crawl_search_results()

    def crawl_search_results(self):
        search_results = []
        self.driver.switch_to.window(self.driver.window_handles[-1])
        self.driver.maximize_window()

        exit_flag = 0
        while True:
            try:
                self.wait.until(
                    ec.presence_of_element_located(
                        (By.CLASS_NAME, 'result-cont')))
            except TimeoutException:
                CustomLogging.log_to_file('中国证券网搜索结果页错误', LogType.ERROR)
                break

            try:
                result_articles = self.driver.find_elements_by_class_name(
                    'result-article')

                for each_article in result_articles:
                    item = Entity()

                    publish_date = each_article.find_element_by_class_name(
                        'g').text
                    item.publish_date = re.search(
                        re.compile(
                            r'[1-9]\d{3}-(0[1-9]|1[0-2])-(0[1-9]|[1-2][0-9]|3[0-1])\s+(20|21|22|23|[0-1]\d):[0-5]\d'
                        ), publish_date).group()

                    if not in_date_range(
                            conv_pub_date(item.publish_date, 'cnstock'),
                            self.year_range):
                        exit_flag = 1
                        # break out of the for loop
                        break
                    item.short_description = each_article.find_element_by_class_name(
                        'des').text
                    item.title = each_article.find_element_by_tag_name(
                        'a').text
                    if self.keyword not in item.short_description and self.keyword not in item.title:
                        continue

                    if item.title in self.titles:
                        continue
                    else:
                        self.titles.append(item.title)

                    item.url = each_article.find_element_by_tag_name(
                        'a').get_attribute('href')
                    threading.Thread(target=super().download_and_save_item,
                                     args=(item, )).start()

                if exit_flag == 1:
                    break
            except NoSuchElementException:
                CustomLogging.log_to_file('没有搜索结果', LogType.INFO)
                break

            try:
                next_page = self.driver.find_element_by_xpath(
                    '//div[@class="pagination pagination-centered"]//a[contains(text(), "下一页")]'
                )
                self.driver.get(next_page.get_attribute('href'))
                # next_page.click()
            except NoSuchElementException:
                break

        return search_results

    def parse_html(self, url, html):
        bs = BeautifulSoup(html, 'lxml')
        try:
            full_content = bs.find('div', attrs={'id': 'qmt_content_div'}).text
            return full_content
        except Exception:
            CustomLogging.log_to_file('页面解析错误: {0}|{1}'.format(self.name, url),
                                      LogType.ERROR)
            pass
Example 6
def get_url_files(retail, invoice_doc_type, invoice_id, invoice_date,
                  invoice_amount):
    retail_invoice_url = RETAIL_INVOICE_URL[retail]

    driver = PhantomJS()
    driver.get(retail_invoice_url)

    # 1 Set doc_type 'select'
    try:
        select_doc_type = Select(driver.find_element_by_name('txtTipoDte'))
        value = RETAIL_INVOICE_DOC_TYPES[retail][invoice_doc_type]['value']
        select_doc_type.select_by_value(value)
        # name = RETAIL_INVOICE_DOC_TYPES[retail][invoice_doc_type]['name']
        # select_doc_type.select_by_visible_text(name)
    except Exception:
        print 'ERROR: set doc_type select as Boleta'
        driver.save_screenshot('screen.png')
        return '', ''

    time.sleep(5)

    # 2 Get recaptcha img url
    try:
        recaptcha_img = driver.find_element_by_id('recaptcha_challenge_image')
        recaptcha_img_url = recaptcha_img.get_attribute('src')
    except Exception:
        print 'ERROR: get recaptcha image url'
        driver.save_screenshot('screen.png')
        return '', ''

    # 3 Solve recaptcha
    v = VisionApi()
    recaptcha_value = v.detect_text_from_url(recaptcha_img_url)

    if recaptcha_value is None:
        print 'ERROR: solving recaptcha image'
        driver.save_screenshot('screen.png')
        return '', ''

    # 4 Fill form
    script = u"""
        document.getElementsByName('txtFolio')[0].value = '{invoice_id}';
        document.getElementsByName('txtFechaEmision')[0].value = '{invoice_date}';
        document.getElementsByName('txtMontoTotal')[0].value = '{invoice_amount}';
        document.getElementsByName('recaptcha_response_field')[0].value = '{recaptcha_value}';
    """.format(
        invoice_id=invoice_id,
        invoice_date=invoice_date,
        invoice_amount=invoice_amount,
        recaptcha_value=recaptcha_value,
    )
    driver.execute_script(script)

    # 5 Submit form
    try:
        driver.find_element_by_name('frmDatos').submit()
    except Exception:
        print 'ERROR: submitting form'
        driver.save_screenshot('screen.png')
        return '', ''

    # 6 Get url files
    try:
        xml_a_tag = driver.find_element_by_xpath(
            '//*[@id="Tabla_01"]/tbody/tr[1]/td[2]/p/a[2]')
        pdf_a_tag = driver.find_element_by_xpath(
            '//*[@id="Tabla_01"]/tbody/tr[1]/td[2]/p/a[1]')

        xml_url = xml_a_tag.get_attribute('href')
        pdf_url = pdf_a_tag.get_attribute('href')
    except Exception:
        print 'ERROR: getting url files'
        driver.save_screenshot('screen.png')
        return '', ''

    # 8 Delete driver session
    driver.close()
    driver.quit()

    return xml_url, pdf_url
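
A hypothetical call, assuming 'jumbo' and 'boleta' are keys present in the RETAIL_INVOICE_URL and RETAIL_INVOICE_DOC_TYPES mappings defined elsewhere in the module:

xml_url, pdf_url = get_url_files('jumbo', 'boleta', '123456', '01-05-2016', '19990')
if xml_url:
    print('XML: %s | PDF: %s' % (xml_url, pdf_url))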
from selenium.webdriver import PhantomJS
from selenium.webdriver.common.keys import Keys
from time import sleep

driver = PhantomJS()
driver.set_window_size(1120, 550)
driver.get("https://duckduckgo.com/")
driver.find_element_by_id('search_form_input_homepage').send_keys("realpython")
driver.find_element_by_id("search_button_homepage").click()
print(driver.current_url)
driver.quit()
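
PhantomJS may still be rendering the results page when current_url is read right after the click; a hedged variant of the tail of the script above that waits for the navigation before printing (WebDriverWait is an addition, not part of the original snippet):

from selenium.webdriver.support.ui import WebDriverWait

driver.find_element_by_id("search_button_homepage").click()
WebDriverWait(driver, 10).until(lambda d: d.current_url != "https://duckduckgo.com/")
print(driver.current_url)
driver.quit()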
Example 8
driver = PhantomJS(
    'D:/vscodeproject/chapter16爬虫/install/phantomjs-2.1.1-windows/bin/phantomjs.exe'
)
driver.set_window_size(1280, 1080)


def savepic():
    basepath = 'D:/vscodeproject/python/search-generates-wordcloud/out/'
    filename = basepath + "{:%Y%m%d%H%M%S}-{:03}.png".format(
        datetime.datetime.now(), random.randint(1, 100))
    driver.save_screenshot(filename)


url = 'http://www.baidu.com'
driver.get(url)
logger.flow(driver.current_url, '1')
savepic()

select = driver.find_element_by_id('kw')
select.send_keys('哥斯拉2  百度云下载')

logger.flow(driver.current_url, '2')

select.send_keys(Keys.ENTER)
time.sleep(4)
logger.flow(driver.current_url, '3')
savepic()

driver.quit()
class CamaraCGCrawler(object):
    """ Camara CG Ementa Crawler """
    def __init__(self, starting_year):
        self.base_url = "http://187.115.174.90:8080/ScanLexWeb"
        self.starting_year = starting_year
        self.browser = None

    @staticmethod
    def get_ementa_id(published_date, ementa_type, ementa_doc_number,
                      ementa_situation):
        """ Return the Ementa Unique Id """
        return "%s#%s#%s#%s" % (datetime.strftime(
            published_date,
            "%Y-%m-%d"), ementa_type, ementa_doc_number, ementa_situation)

    def get_all_ementas_summary(self):
        """ Yield the next ementa information row """

        browser_table = self.browser.find_element_by_id(
            "frmMenu:tabEmentas_data")
        bs_ementa_table = BeautifulSoup(
            browser_table.get_attribute("innerHTML"))

        for row in bs_ementa_table.find_all("tr"):
            cols = row.find_all("td")
            if len(cols) == 6:
                published_date = datetime.strptime(
                    cols[0].span.text.encode("utf-8"), "%d/%m/%Y")
                doc_number = int(cols[1].span.text.encode("utf-8"))
                title = cols[2].span.text.encode("utf-8")
                ementa_type = cols[3].span.text.encode("utf-8")
                ementa_situation = cols[4].span.text.encode("utf-8")
                details_js = cols[5].a['onclick'].encode("utf-8")

                if published_date > datetime.now():
                    continue

                yield published_date, doc_number, title, ementa_type, ementa_situation, details_js

    def get_ementa_details(self, ementa_details_js):
        """ Crawl the second ementa page """

        # Waiting...
        _ = WebDriverWait(self.browser, 30).until(
            EC.visibility_of_element_located(
                (By.ID, "frmfuncao:j_idt13_content")))
        _ = WebDriverWait(self.browser, 30).until(
            EC.visibility_of_element_located(
                (By.ID, "frmfuncao:tabProponentes")))

        # Get Ementail Details
        bs_ementa_details = BeautifulSoup(self.browser \
            .find_element_by_id("frmfuncao:j_idt13_content").get_attribute("innerHTML"))

        rows = bs_ementa_details.find_all("tr")

        source = rows[3].td.text
        main_theme = rows[7].td.text
        sys_enter_date = datetime.strptime(rows[9].td.text, "%d/%m/%Y")
        approval_date = datetime.strptime(rows[11].td.text, "%d/%m/%Y")
        process_number = int(rows[15].td.text or "-1")
        autograph_number = int(rows[19].td.text or "-1")
        process_year = int(rows[21].td.text or "-1")
        has_image = rows[23].td.text == "Sim"

        # Get Proponent names
        bs_proponent = BeautifulSoup(
            self.browser.find_element_by_id(
                "frmfuncao:tabProponentes").get_attribute("innerHTML"))

        proponents = ",".join(
            [col.text for col in bs_proponent.find_all("td")])

        return source, proponents, main_theme, sys_enter_date, approval_date, process_number, \
            autograph_number, process_year, has_image

    def next_ementa(self, select_curs):
        """ Iterate in the years onwards and collect all the ementas """

        try:
            LOGGER.info("Opening Browser")
            self.browser = PhantomJS()

            LOGGER.info("GET [%s]", self.base_url)
            self.browser.maximize_window()

            cur_year = int(datetime.now().year)

            # Define the initial collection year
            select_curs.execute(
                "SELECT EXTRACT (YEAR FROM MAX(published_date)) FROM ementas;")
            last_exec_year = select_curs.fetchone()
            if last_exec_year:
                collection_year = max(self.starting_year, last_exec_year[0])
            else:
                collection_year = self.starting_year

            all_proponents = [
                "ANDERSON MAIA", "Afonso Alexandre Régis",
                "Alcides Cavalcante", "Alcindor Villarim", "Aldo Cabral",
                "Alexandre do Sindicato", "Antonio Pereira",
                "Antônio Alves Pimentel Filho", "Aragão Júnior",
                "Bruno Cunha Lima Branco", "Bruno Gaudêncio", "Buchada",
                "Cassiano Pascoal", "Cozete Babosa",
                "Cássio Murilo Galdino de Araujo", "Daniella Ribeiro",
                "Dr. Nunes", "Executivo", "Fabrinni Brito",
                "Fernando carvalho", "Francisco Dantas Lira",
                "Galego do Leite", "Inacio Falcao", "Ivan Batista",
                "Ivonete Ludgerio", "Joao Dantas", "Josimar Henrique da Silva",
                "José Marcos Raia ", "José Ribamar", "João Dantas",
                "Jóia Germano", "Laelson Patricio", "Lafite",
                "Lindaci Medeiros Nápolis", "Lourdes Costa", "Lula Cabral",
                "Marcos Marinho", "Maria Lopes Barbosa", "Marinaldo Cardoso",
                "Metuselá Agra", "Miguel Rodrigues da Silva",
                "Miguel da Construção", "Napoleão Maracajá",
                "Nelson Gomes Filho", "Olimpio Oliveira", "Orlandino Farias",
                "Paulo Muniz", "Paulo de Tarso", "Peron Ribeiro Japiassú",
                "Renato Feliciano", "Rodolfo Rodrigues",
                "Rodrigo Ramos Victor", "Romero Rodrigues", "Rostand Paraíba",
                "Rômulo Gouveia", "Saulo Germano", "Saulo Noronha", "Tia Mila",
                "Tovar Correia Lima", "Vaninho Aragão",
                "Veneziano Vital do rego", "Walter Brito Neto", "Todos"
            ]

            while collection_year <= cur_year:

                for i_prop in range(len(all_proponents)):

                    ementa_prop = all_proponents[i_prop].decode("utf-8")

                    self.browser.get(self.base_url)

                    # Waiting...
                    WebDriverWait(self.browser, 30).until(
                        EC.element_to_be_clickable((By.ID, "frmMenu:button1")))

                    LOGGER.info("Collecting Ementas from [%d][%s - %d/%d]",
                                collection_year, ementa_prop, i_prop + 1,
                                len(all_proponents))

                    # Set Year
                    year_field = self.browser.find_element_by_id("frmMenu:ano")
                    year_field.send_keys(collection_year)

                    # Set Proponent
                    proponent_field = self.browser.find_element_by_id(
                        "frmMenu:autoridade")
                    proponent_field.send_keys(ementa_prop)

                    # Submit the form
                    self.browser.find_element_by_id("frmMenu:button1").click()

                    # Waiting...
                    # _ = WebDriverWait(self.browser, 60).until(EC.visibility_of_element_located((By.ID, "frmMenu:tabEmentas_data")))
                    time.sleep(3)

                    for published_date, document_number, title, ementa_type, ementa_situation, ementa_details_js in self.get_all_ementas_summary(
                    ):
                        ementa_id = self.get_ementa_id(published_date,
                                                       ementa_type,
                                                       document_number,
                                                       ementa_situation)

                        select_curs.execute("""
                            SELECT ementa_id
                            FROM ementas
                            WHERE ementa_id = '%s';
                            """ % ementa_id)

                        if not select_curs.fetchone():
                            # Run the details script
                            self.browser.execute_script(ementa_details_js)
                            ementa_source, proponents, main_theme, sys_enter_date, approval_date, \
                                process_number, autograph_number, process_year, has_image = self.get_ementa_details(ementa_details_js)

                            # Come back to the table page
                            self.browser.back()

                            # Waiting...
                            _ = WebDriverWait(self.browser, 60).until(
                                EC.visibility_of_element_located(
                                    (By.ID, "frmMenu:tabEmentas_data")))

                            yield ementa_id, published_date, ementa_type, document_number, title, \
                                ementa_source, proponents, ementa_situation, main_theme, sys_enter_date, \
                                approval_date, process_number, autograph_number, process_year, has_image

                LOGGER.info("DONE [%d]", collection_year)

                self.browser.back()

                collection_year += 1

        finally:
            if self.browser:
                self.browser.quit()
    school_info = wd.find_elements_by_class_name('school-info__item')
    for s in school_info:
        inner_html = sub(r'<.*?>|\n', ' ', s.get_attribute('innerHTML'))
        inner_html = sub(r'\s+', ' ', inner_html).strip()
        if 'grades' in inner_html.lower():
            min_grade, max_grade = inner_html.split(' ')[-1].split('-')
            if min_grade.lower() == 'pk':
                min_grade = -1
            elif min_grade.lower() == 'k':
                min_grade = 0
            n_grades = int(max_grade) - int(min_grade) + 1
        elif 'students' in inner_html.lower():
            n_students = int(sub(r'[^0-9]', '', inner_html.split(' ')[-1]))
    students_per_grade = float(n_students) / float(n_grades)

    staff_info = wd.find_element_by_id('TeachersStaff').find_elements_by_class_name('rating-container__score-item')
    teacher_info = sub(r'<.*?>|\n', ' ', staff_info[0].get_attribute('innerHTML'))
    teacher_info = sub(r'\s+', ' ', teacher_info).strip()
    counsel_info = sub(r'<.*?>|\n', ' ', staff_info[1].get_attribute('innerHTML'))
    counsel_info = sub(r'\s+', ' ', counsel_info).strip()
    t_to_s_school = int(sub(r'.*?(\d+) :1.*', r'\1', teacher_info))
    c_to_s_school = int(sub(r'.*?(\d+) :1.*', r'\1', counsel_info))

    columns = ['indicator', 'subject', 'category', 'school_score', 'state_average']
    df = DataFrame(columns=columns)
    ind = 0

    for indicator in indicators:
        elem = wd.find_element_by_id(indicator)
        buttons = elem.find_elements_by_class_name('sub-nav-item')
        for b in buttons:
Example 11
class WeixinPhantomjs(Base):
    all_uids = {docs['uid'] for docs in in_collection.find({}, {'uid': 1}) if 'uid' in docs}

    def __init__(self):
        self.start_page = START_PAGE
        self.end_page = END_PAGE
        self.weixin_url = REFER_FIRST

        # self.driver = Firefox()
        if hasattr(config, 'PHANTOMJS_PATH'):
            self.driver = PhantomJS(executable_path=getattr(config, 'PHANTOMJS_PATH'))
        else:
            self.driver = PhantomJS()

    def open_weixin_browser(self, word):
        try:
            self.driver.get(self.weixin_url)
            self.driver.set_page_load_timeout(3)

            self.driver.find_element_by_id('upquery').send_keys(word)
            self.driver.find_element_by_class_name('swz').click()
            time.sleep(3)

            urls_uids = self.extract_urls_uids(word=word)
            Article(urls_uids=urls_uids, word=word).extract()
        except Exception as e:
            storage_word.append([word, 0])
            self.logger.info('Open weixin error: type <{}>, msg <{}>'.format(e.__class__, e))
            self.close_browser()
            return True
        return False

    def get_total_pages_to_word(self):
        pages = []
        page_id_css = 'pagebar_container'

        try:
            e = self.driver.find_element_by_id(page_id_css)
            for _p in e.text.split():
                _p = _p.strip()

                if not _p.isdigit():
                    return DEFAULT_PAGES if DEFAULT_PAGES <= pages[-1] else pages[-1]
                else:
                    pages.append(int(_p))
            return 1
        except (NoSuchElementException, NoSuchWindowException, TypeError, IndexError):
            pass
        return 1

    def extract_urls_uids(self, word):
        urls_uids = []
        timestamp = [_t.get_attribute('t') for _t in self.driver.find_elements_by_css_selector('div.s-p')]
        urls_tits = [(t.get_attribute('href'), self.trim(t.text))
                     for t in self.driver.find_elements_by_css_selector('h4 a')]

        if len(urls_tits) != len(timestamp):
            return urls_uids

        for index, url_tit in enumerate(urls_tits):
            try:
                uid = self.md5(timestamp[index] + url_tit[1] + word)

                if uid not in self.__class__.all_uids:
                    self.__class__.all_uids.add(uid)
                    urls_uids.append({'url': url_tit[0], 'uid': uid})
            except (TypeError, IndexError):
                pass
        return urls_uids

    @property
    def is_forbidden(self):
        css_id = 'seccodeForm'

        try:
            if self.driver.find_element_by_id(css_id):
                return True
        except NoSuchElementException:
            pass
        return False

    def appear_element(self, by):
        try:
            # Wait until the element with this id is present, then click it
            tag = WebDriverWait(self.driver, 20).until(lambda driver: driver.find_element_by_id(by))
            tag.click()
            return True
        except (TimeoutException, NoSuchWindowException, NoSuchElementException):
            pass
        return False

    def crawl_single(self, word=None, go=0):
        is_go = True
        go_page = int(go)
        next_page_css = 'sogou_page_%s'

        is_break = self.open_weixin_browser(word)
        pages = self.get_total_pages_to_word()

        for page in range(self.start_page + 1, (pages or self.end_page) + 1):
            if is_go and page < go_page:
                continue
            else:
                is_go = False

            if not self.appear_element(by=next_page_css % page):
                is_break = True
                msg = '\tNext page element did not appear, will break'
            elif self.is_forbidden:
                is_break = True
                msg = '\tSpider was forbidden, crawling again after sleeping a moment!'

            if is_break:
                storage_word.append([word, page])
                self.logger.info(msg)
                break

            urls_uids = self.extract_urls_uids(word=word)
            Article(urls_uids=urls_uids, word=word).extract()

            # self.driver.find_element_by_id(next_page_css % page).click()
            # wt = randint(10, 40) if page % 5 == 0 else randint(5, 18)
            wt = randint(1, 5)
            self.logger.info('Word <{}>, Page <{}> Done, sleeping {}s!'.format(word, page, wt))
            # self.driver.implicitly_wait(wt)
            time.sleep(wt)

        self.close_browser()

    @classmethod
    def crawl_with_threads(cls):
        pool = ThreadPool(4)
        total_words = QueryWords().get_query_words()

        for bulk_words in total_words:
            try:
                pool.map(lambda w: cls().crawl_single(w), bulk_words)
            except Exception as e:
                cls.logger.info('Threads crawl error: type <{}>, msg <{}>'.format(e.__class__, e))

        pool.close()
        pool.join()
        in_client.close()

    def close_browser(self):
        try:
            self.driver.close()
        except (NoSuchWindowException,):
            pass
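
A hypothetical entry point, assuming the module-level helpers this class relies on (config, storage_word, QueryWords, Article, in_client) are defined elsewhere in the project:

if __name__ == '__main__':
    WeixinPhantomjs.crawl_with_threads()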
Example 12
class WeixinPhantomjs(Base):
    def __init__(self):
        self.start_page = START_PAGE
        self.end_page = END_PAGE
        self.weixin_url = REFER_FIRST

        # self.driver = Firefox()
        if hasattr(config, 'PHANTOMJS_PATH'):
            self.driver = PhantomJS(executable_path=getattr(config, 'PHANTOMJS_PATH'))
        else:
            self.driver = PhantomJS()

        self.client = MongoClient(HOST, PORT)
        self.collection = self.client[DB][COLLECTION]
        self.all_uids = self.uids

    def open_weixin_browser(self, word):
        try:
            self.driver.get(self.weixin_url)
            self.driver.set_page_load_timeout(3)

            self.driver.find_element_by_id('upquery').send_keys(word)
            self.driver.find_element_by_class_name('swz').click()
            time.sleep(3)

            urls_uids = self.extract_urls_uids(word=word)
            Article(urls_uids=urls_uids, word=word).extract()
        except Exception as e:
            storage_word.append([word, 0])
            self.logger.info('Open weixin error: type <{}>, msg <{}>'.format(e.__class__, e))
            self.close_browser()
            return True
        return False

    def get_total_pages_to_word(self):
        pages = []
        page_id_css = 'pagebar_container'

        try:
            e = self.driver.find_element_by_id(page_id_css)
            for _p in e.text.split():
                _p = _p.strip()

                if not _p.isdigit():
                    return DEFAULT_PAGES if DEFAULT_PAGES <= pages[-1] else pages[-1]
                else:
                    pages.append(int(_p))
            return 1
        except (NoSuchElementException, NoSuchWindowException, TypeError, IndexError):
            pass
        return 1

    def get_query_words(self, word):
        query_words = []

        for docs in self.collection.find({}, {'rel': 1, 'conp': 1}).sort([('_id', 1)]):
            w = docs['conp']

            if w not in query_words:
                query_words.append(w)

            for item in docs['rel']:
                if item not in query_words:
                    query_words.append(item)

        self.client.close()

        return self.query_index(query_words, word)

    @property
    def uids(self):
        return {docs['uid'] for docs in in_collection.find({}, {'uid': 1}) if 'uid' in docs}

    def extract_urls_uids(self, word):
        urls_uids = []
        timestamp = [_t.get_attribute('t') for _t in self.driver.find_elements_by_css_selector('div.s-p')]
        urls_tits = [(t.get_attribute('href'), self.trim(t.text))
                     for t in self.driver.find_elements_by_css_selector('h4 a')]

        if len(urls_tits) != len(timestamp):
            return urls_uids

        for index, url_tit in enumerate(urls_tits):
            try:
                uid = self.md5(timestamp[index] + url_tit[1] + word)

                if uid not in self.all_uids:
                    self.all_uids.add(uid)
                    urls_uids.append({'url': url_tit[0], 'uid': uid})
            except (TypeError, IndexError):
                pass
        return urls_uids

    @staticmethod
    def query_index(words, cut_word):
        temp_words = words[START_INDEX:END_INDEX]

        try:
            index = temp_words.index(cut_word)
            return temp_words[index:], index + START_INDEX
        except ValueError:
            pass
        return temp_words, START_INDEX

    @property
    def is_forbidden(self):
        css_id = 'seccodeForm'

        try:
            if self.driver.find_element_by_id(css_id):
                return True
        except NoSuchElementException:
            pass
        return False

    def appear_element(self, by):
        try:
            # Wait until the element with this id is present, then click it
            tag = WebDriverWait(self.driver, 20).until(lambda driver: driver.find_element_by_id(by))
            tag.click()
            return True
        except (TimeoutException, NoSuchWindowException, NoSuchElementException):
            pass
        return False

    def crawl(self, word=None, go=0):
        is_go = True
        is_break = False
        go_page = int(go)
        next_page_css = 'sogou_page_%s'
        query_words, ind = self.get_query_words(word)

        for index, word in enumerate(query_words, 1):
            next_ind = ind + index
            is_break = self.open_weixin_browser(word)
            pages = self.get_total_pages_to_word()

            for page in range(self.start_page + 1, (pages or self.end_page) + 1):
                if is_go and page < go_page:
                    continue
                else:
                    is_go = False

                if not self.appear_element(by=next_page_css % page):
                    is_break = True
                    msg = '\tNext page element did not appear, will break'
                elif self.is_forbidden:
                    is_break = True
                    msg = '\tSpider was forbidden, crawling again after sleeping a moment!'

                if is_break:
                    storage_word.append([word, page])
                    self.logger.info(msg)
                    break

                urls_uids = self.extract_urls_uids(word=word)
                Article(urls_uids=urls_uids, word=word).extract()

                # self.driver.find_element_by_id(next_page_css % page).click()
                wt = randint(10, 40) if page % 3 == 0 else randint(5, 18)
                self.logger.info('Index <{}>, Word <{}>, Page <{}> Done, sleeping {}s!'.format(next_ind, word, page, wt))
                # self.driver.implicitly_wait(wt)
                time.sleep(wt)

            if is_break:
                break

        in_client.close()
        self.close_browser()

    def close_browser(self):
        try:
            self.driver.close()
        except (NoSuchWindowException,):
            pass
Example 13
class ProviderAdvancedViewTests(LiveServerTestCase):
    def setUp(self):
        self.driver = PhantomJS()

        self.user = User.objects.create_user('admin', '*****@*****.**', 'password')
        self.user.save()

        self.provider = Provider(
            name='provider',
            user=self.user,
        )
        self.provider.save()

        self.provider_adverts = mommy.make(Advertisement, _quantity=20, provider=self.provider)

        self.login()

    def tearDown(self):
        self.driver.quit()

    def open(self, url):
        self.driver.get("%s%s" % (self.live_server_url, url))

    def login(self):
        self.open(settings.LOGIN_URL)
        self.driver.find_element_by_id("id_username").send_keys("admin")
        self.driver.find_element_by_id("id_password").send_keys("password")
        self.driver.find_element_by_css_selector("button.btn.btn-default").click()

        self.assertEqual(
            self.driver.current_url,
            self.live_server_url + reverse('advertisements.views.view_provider_statistics', args=[self.provider.pk]),
        )

    def test_can_login(self):
        """
        Test that the user can login
        """
        pass

    def test_provider_page_has_all_data(self):
        """
        Test that the provider statistics page has all the correct data
        """
        self.open(reverse('advertisements.views.view_provider_statistics', args=[self.provider.pk]))

        self.assertEqual("Open Ads", self.driver.title)
        self.assertIn(
            "{0} advertisements".format(self.provider.name),
            self.driver.find_element_by_css_selector("h1.page-header").text
        )
        self.assertIn(
            "{0} advertisements in rotation".format(20),
            self.driver.find_element_by_css_selector("h1.page-header").text
        )

    def test_advertisement_page_has_all_data(self):
        """
        Test that the advertisement page has all the correct data
        """

        for advert in self.provider_adverts:
            self.open(reverse('advertisements.views.view_advert_statistics', args=[advert.pk]))

            self.assertIn(
                "ID number: {0}".format(advert.pk),
                self.driver.find_element_by_css_selector("h1.page-header").text,
            )
            self.driver.find_element_by_css_selector("img")
            self.assertEqual("Active", self.driver.find_element_by_xpath("//td[2]/span").text)
            self.assertEqual(advert.url, self.driver.find_element_by_link_text(advert.url).text)
            self.driver.find_element_by_link_text("Edit URL").click()
            self.assertEqual(advert.url, self.driver.find_element_by_id("id_url").get_attribute("value"))
Example 14
        # Write the image bytes to a local file; 'wb' opens it for binary writing.
        path = Path('./pic')
        path.mkdir(exist_ok=True)
        path = path / url_info[1]
        with open(path, 'wb') as f:
            f.write(img)


driver = PhantomJS()  # Create the PhantomJS driver.
# Drive the browser through this object.
driver.get('http://zxgk.court.gov.cn/shixin/')  # Open the court enforcement (captcha) page.
i = 1
j = 10000

while j >= 0:
    a = driver.find_element_by_id("captchaImg")
    url = (a.get_attribute('src'))
    pic_name = f"{i}.png"
    try:
        download_img([url, pic_name])
    except Exception as e:
        print(e)
        continue
    print(f"{pic_name}已经下载成功,共成功下载{i}张验证码")
    i += 1
    j -= 1
    ActionChains(driver).move_to_element(a).click().perform()
    time.sleep(2)  # Throttle requests to avoid getting the IP banned.

driver.quit()  # Always quit when done, otherwise the PhantomJS process stays resident in memory.
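
The download_img helper called in the loop is not shown in full (only its file-writing tail appears at the top of this example); a minimal sketch, assuming it fetches the captcha image with requests:

import requests
from pathlib import Path

def download_img(url_info):
    """Hypothetical helper: fetch url_info[0] and save it as ./pic/<url_info[1]>."""
    # Note: a plain GET may receive a different captcha than the browser session sees.
    img = requests.get(url_info[0], timeout=10).content
    path = Path('./pic')
    path.mkdir(exist_ok=True)
    with open(path / url_info[1], 'wb') as f:
        f.write(img)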
Example 15
	def __init__(self, browser: webdriver.PhantomJS, click_to_display_id: str):
		self._browser = browser
		self.click_to_display_id = click_to_display_id
		self.click_to_display_element = browser.find_element_by_id(click_to_display_id)
from selenium.webdriver import PhantomJS as Browser
import json
import time
import re

proxy_list_url = "http://spys.one/socks/"
proxies = []
br = Browser()
br.get(proxy_list_url)
sizes = [25, 50, 100, 200, 300, 500]
pattern = re.compile(r"[.\s]+\((\d+)\)")
for country_id in range(1, 171):
    try_counter = 0
    count = 0
    while (elm := br.find_element_by_id('tldc')).find_element_by_xpath(
            f"./option[@selected]").get_attribute("value") != str(country_id):
        elm = elm.find_element_by_xpath(f'./option[@value="{country_id}"]')
        elm.click()
        try_counter += 1
        if try_counter >= 2:
            break
    if try_counter >= 2:
        continue
    count = int(pattern.findall(elm.text)[0])
    key = 0
    for key, size in enumerate(sizes):
        if int(size) > count:
            break
    try_counter = 0
    while (elm := br.find_element_by_id("xpp")).find_element_by_xpath(
            "./option[@selected]").get_attribute("value") != str(key):