Code example #1
    def parse(self, response):
        driver = driver_manager.initialization()
        try:
            driver.get('about:blank')
            driver.get(response.url)
            # Keep clicking the "check all" control until it disappears so that
            # every row is loaded before the page source is parsed.
            while driver.find_elements_by_class_name('checkall'):
                driver.find_element_by_class_name('checkall').click()
                print u"Waiting for the data to finish loading"
                time.sleep(2)

            bs_obj = bs4.BeautifulSoup(driver.page_source, 'html.parser')
            #log_obj.update_error(bs_obj.prettify(encoding='utf8'))
            e_trs = bs_obj.find('table', id='oTable').tbody.find_all('tr')
            for e_tr in e_trs:
                item = fund_monitor.items.FundMonitorItem()

                item['fund_code'] = e_tr.find('td', class_='bzdm').get_text(strip=True)
                item['fund_name'] = e_tr.find('td', class_='tol').a.get('title')
                item['url'] = 'http://fund.eastmoney.com/' + e_tr.find('td', class_='tol').a.get('href')

                yield scrapy.Request(item['url'], meta={'item': item}, callback=self.parse1, dont_filter=True)
        except:
            log_obj.error("%s中无法解析\n原因:%s" %(self.name, traceback.format_exc()))
        finally:
            driver.quit()
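
This and the following examples all obtain their Selenium driver from driver_manager.initialization(), which is not part of these excerpts. A minimal sketch of what such a helper might look like, assuming it does nothing more than wrap Selenium's webdriver constructors; the module layout, the engine keyword default, and the Chrome options shown are assumptions inferred from how the examples call it:

# driver_manager.py -- hypothetical helper assumed by the examples; not the
# original implementation. Callers are responsible for calling driver.quit().
from selenium import webdriver

def initialization(engine='PhantomJS'):
    if engine == 'Chrome':
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')  # run without a visible window
        return webdriver.Chrome(chrome_options=options)
    # The default used when initialization() is called with no argument is
    # unknown; PhantomJS is assumed here because the code targets Selenium 3.
    return webdriver.PhantomJS()
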
Code example #2
    def get_max_page(self, subject, url):
        driver = driver_manager.initialization(engine='Chrome')
        try:
            driver.get('about:blank')
            driver.get(url)
            bs_obj = bs4.BeautifulSoup(driver.page_source, 'html.parser')

            # The last <a> in the pager is "next"; the one before it holds the
            # highest page number.
            page_row = bs_obj.find('div', class_='pagination')
            max_page = page_row.find_all('a')[-2].get_text(strip=True)
            self.page_count[subject] = int(max_page)
            print "Subject %s has %s ranking pages in total" % (subject, max_page)
            with open('page_count.json', 'w') as f:
                json.dump(self.page_count, f)
        except:
            log_obj.error(url)
            log_obj.error(traceback.format_exc())
        finally:
            driver.quit()
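
get_max_page() caches the per-subject page counts in page_count.json; the next example reads them back through self.page_count. A small, hypothetical sketch of how that cache could be reloaded on start-up (load_page_count is an assumption, not part of the original spider):

# Hypothetical counterpart to get_max_page(): reload the cached page counts so
# the ranking pages do not have to be re-counted on every run.
import json
import os

def load_page_count(path='page_count.json'):
    if os.path.exists(path):
        with open(path) as f:
            return json.load(f)
    return {}
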
Code example #3
    def parse_catalog(self, subject, url0):
        max_page = self.page_count[subject]
        # Page 1 is url0 itself; pages 2..max_page carry an explicit ?page= query.
        urls0 = [url0 + ('?page=%s' % (i + 1)) for i in range(max_page) if i > 0]
        urls = [url0] + urls0
        for url in urls:
            print "正在解析:", url
            if url in self.used_urls:
                print '%s pass page %s' %(subject, url)
                continue

            driver = driver_manager.initialization(engine='Chrome')
            try:
                driver.get('about:blank')
                driver.get(url)
                bs_obj = bs4.BeautifulSoup(driver.page_source, 'html.parser')
                e_div = bs_obj.find('div', id='resultsMain')
                e_rows = e_div.find_all('div', class_='sep')

                for e_row in e_rows:
                    point = e_row.find('div', class_='t-large t-strong t-constricted').get_text(strip=True)
                    rank = e_row.find('span', class_='rankscore-bronze').get_text(strip=True)
                    university = e_row.find('h2', class_='h-taut').get_text()
                    address = e_row.find('div', class_='t-taut').get_text()
                    addition = e_row.find_all('div')[-1].get_text()
                    # Keys become the output column names: score, rank,
                    # university, address, other, url.
                    d = {
                        u"得分":point,
                        u"排名":rank,
                        u"大学":university,
                        u"地址":address,
                        u"其他":addition,
                        u"url":url
                    }

                    d = {key: d[key].strip() for key in d}
                    ser = pd.Series(d)

                    print pd.DataFrame(ser).T
                    yield ser

            except:
                log_obj.error(url)
                log_obj.error(traceback.format_exc())
            finally:
                driver.quit()
Code example #4
    def parse(self, response):
        driver = driver_manager.initialization()
        try:
            driver.get('about:blank')
            driver.get(response.url)

            e_table = driver.find_element_by_id('dbtable')
            e_as = e_table.find_element_by_tag_name(
                'thead').find_elements_by_tag_name('a')
            code_set = set()
            # Click each sortable column header in turn so that differently
            # ranked funds load, and collect every fund code that appears.
            for e_a in e_as[6:15]:
                title = e_a.text
                e_a.click()
                time.sleep(1)
                bs_obj = bs4.BeautifulSoup(driver.page_source, 'html.parser')
                e_table = bs_obj.find('table', id='dbtable')
                df = pd.read_html(e_table.prettify(encoding='utf8'),
                                  encoding='utf8')[0]
                df.iloc[:, 2] = df.iloc[:, 2].apply(
                    lambda i: '{0:0>6}'.format(i))  # zero-pad fund codes to 6 characters
                code_set.update(df.iloc[:, 2].tolist())

            for code in code_set:
                item = fund_monitor.items.FundMonitorItem()
                item['fund_code'] = code
                item['url'] = 'http://fund.eastmoney.com/f10/ccmx_%s.html' % code

                yield scrapy.Request(item['url'],
                                     meta={'item': item},
                                     callback=self.parse1,
                                     dont_filter=True)
        except:
            log_obj.error("%s中无法解析\n原因:%s" %
                          (self.name, traceback.format_exc()))
        finally:
            driver.quit()
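
The spiders above all fill in fund_monitor.items.FundMonitorItem, which is not included in these excerpts. A minimal sketch, assuming the item declares only the fields the examples actually set; any further fields used by parse1 are unknown:

# fund_monitor/items.py -- hypothetical minimal definition, limited to the
# fields referenced in the examples.
import scrapy

class FundMonitorItem(scrapy.Item):
    fund_code = scrapy.Field()
    fund_name = scrapy.Field()
    url = scrapy.Field()
    cookies = scrapy.Field()
    data = scrapy.Field()
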
Code example #5
    def get_cookies(self, url):
        while True:
            driver = driver_manager.initialization(engine='Chrome')
            try:
                driver.get(url)  # e.g. 'http://www.simuwang.com/'

                # Keep clicking the link in the top-right corner until the
                # login box appears, then give the page time to settle.
                while not driver.find_elements_by_id('gr-login-box'):
                    driver.find_element_by_class_name('topRight').find_element_by_tag_name('a').click()
                    time.sleep(2)
                time.sleep(10)
                # driver.save_screenshot('screenshot.png')
                cookies = driver.get_cookies()
                # print {d[u'name']:d[u'value'] for d in cookies}

                # driver.save_screenshot('screenshot.png')
                # Fill in the account field, tab away, then fill in the password field.
                login_box = driver.find_element_by_id('gr-login-box')
                login_box.find_elements_by_tag_name('input')[0].send_keys('13575486859')
                login_box.find_elements_by_tag_name('input')[0].send_keys(Keys.TAB)
                login_box.find_elements_by_tag_name('input')[2].send_keys('137982')
                # passwd_input.click()
                # passwd_input.send_keys('137482')

                login_button = login_box.find_element_by_class_name('gr-big-btn')
                login_button.click()
                time.sleep(3)

                # driver.save_screenshot('screenshot.png')

                cookies = driver.get_cookies()
                # print {d[u'name']:d[u'value'] for d in cookies}
                return cookies  # {d[u'name']: d[u'value'] for d in cookies}

            except:
                log_obj.error("%s中无法解析\n原因:%s" %(self.name, traceback.format_exc()))
            finally:
                driver.quit()
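
get_cookies() returns Selenium's list-of-dicts cookie format; the commented-out lines show the intended conversion to a plain {name: value} mapping. A short, hypothetical usage sketch (the URLs and the parse0 callback are borrowed from the next example; this method is not part of the original spider):

    # Hypothetical caller: log in once, then reuse the session cookies for
    # scrapy.Request, which expects a {name: value} dict.
    def start_requests(self):
        raw = self.get_cookies('http://www.simuwang.com/')
        cookies = {c['name']: c['value'] for c in raw}
        yield scrapy.Request('http://dc.simuwang.com/ranking/get?page=1',
                             cookies=cookies, callback=self.parse0)
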
Code example #6
    def start_requests(self):
        # d1 = self.get_cookies('http://dc.simuwang.com/product/HF00001MTU')
        # d1 = {d['name']: d['value'] for d in d1}
        # for key in d1:
        #     print '"%s":"%s",' %(key,d1[key])

        # Keys are fund strategy names: stock, macro, managed futures,
        # event-driven, relative value, fixed income.
        url_dict = {
            '股票策略':[
                "http://dc.simuwang.com/ranking/get?page=",
                "&condition=fund_type%3A1%2C6%2C4%2C3%2C8%2C2%3Bret%3A4%3Brating_year%3A1%3Bstrategy%3A1%3Bistiered%3A0%3Bcompany_type%3A1%3Bsort_name%3Aprofit_col2%3Bsort_asc%3Adesc%3Bkeyword%3A"
            ],
            '宏观策略':[
                "http://dc.simuwang.com/ranking/get?page=",
               "&condition=fund_type%3A1%2C6%2C4%2C3%2C8%2C2%3Bret%3A4%3Brating_year%3A1%3Bstrategy%3A2%3Bistiered%3A0%3Bcompany_type%3A1%3Bsort_name%3Aprofit_col2%3Bsort_asc%3Adesc%3Bkeyword%3A"
            ],
            '管理期货':[
                "http://dc.simuwang.com/ranking/get?page=",
                "&condition=fund_type%3A1%2C6%2C4%2C3%2C8%2C2%3Bret%3A4%3Brating_year%3A1%3Bstrategy%3A3%3Bistiered%3A0%3Bcompany_type%3A1%3Bsort_name%3Aprofit_col2%3Bsort_asc%3Adesc%3Bkeyword%3A"
            ],
            '事件驱动':[
                "http://dc.simuwang.com/ranking/get?page=",
                "&condition=fund_type%3A1%2C6%2C4%2C3%2C8%2C2%3Bret%3A4%3Brating_year%3A1%3Bstrategy%3A4%3Bistiered%3A0%3Bcompany_type%3A1%3Bsort_name%3Aprofit_col2%3Bsort_asc%3Adesc%3Bkeyword%3A"
            ],
            '相对价值':[
                "http://dc.simuwang.com/ranking/get?page=",
                "&condition=fund_type%3A1%2C6%2C4%2C3%2C8%2C2%3Bret%3A4%3Brating_year%3A1%3Bstrategy%3A5%3Bistiered%3A0%3Bcompany_type%3A1%3Bsort_name%3Aprofit_col2%3Bsort_asc%3Adesc%3Bkeyword%3A"
            ],
            '固定收益':[
                "http://dc.simuwang.com/ranking/get?page=",
                "&condition=fund_type%3A1%2C6%2C4%2C3%2C8%2C2%3Bret%3A4%3Brating_year%3A1%3Bstrategy%3A6%3Bistiered%3A0%3Bcompany_type%3A1%3Bsort_name%3Aprofit_col2%3Bsort_asc%3Adesc%3Bkeyword%3A"
            ]
        }

        for key in url_dict:
            url = "%s1%s" %(url_dict[key][0], url_dict[key][1])
            print url
            # 获取总页数
            driver = driver_manager.initialization()
            driver.get(url)
            # print driver.get_cookies()
            # print {d['name']:d['value'] for d in driver.get_cookies()}
            global catlog_cookies
            for cookie0 in catlog_cookies:
                driver.add_cookie(cookie0)
            driver.get(url)
            # print driver.page_source

            data = json.loads(re.search(r'{.+}', driver.page_source).group())
            # print u"Length of first-page data: %s" % len(data["data"])
            page_num = 2  # hard-coded; the full count would be int(data["pager"]["pagecount"])

            urls = ["%s%s%s" %(url_dict[key][0], i+1, url_dict[key][1]) for i in range(page_num)]
            cookies = {d['name']: d['value'] for d in driver.get_cookies()}
            driver.quit()

            # Crawl each page
            for url in urls:
                item = fund_monitor.items.FundMonitorItem()
                item['cookies'] = cookies
                item['data'] = {'fund_type': key}
                time.sleep(2)  # crude throttle between requests
                yield scrapy.Request(url=url, meta={'item': item}, cookies=item['cookies'], callback=self.parse0)
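
parse0 itself is not included in these excerpts. A minimal, hypothetical sketch of what it might do, assuming the ranking endpoint returns the JSON layout probed in start_requests (a top-level "data" list); response.text requires a reasonably recent Scrapy, and the per-record field names are unknown, so the raw record dict is merged into item['data']:

    # Hypothetical parse0 -- not part of the excerpts. Decodes the ranking JSON
    # and attaches each record to the item prepared in start_requests.
    def parse0(self, response):
        item = response.meta['item']
        data = json.loads(re.search(r'{.+}', response.text).group())
        for record in data.get('data', []):
            new_item = item.copy()
            merged = dict(item['data'])
            merged.update(record)
            new_item['data'] = merged
            yield new_item
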