# -*- coding: utf-8 -*-
# Shared imports for the spider methods below. driver_manager and log_obj are
# project-level helpers (a WebDriver factory and a logger); their exact module
# paths are not shown in this file.
import json
import re
import time
import traceback

import bs4
import pandas as pd
import scrapy
from selenium.webdriver.common.keys import Keys

import fund_monitor.items


def parse(self, response):
    driver = driver_manager.initialization()
    try:
        driver.get('about:blank')
        driver.get(response.url)
        # Keep clicking the "checkall" button until it disappears, so that
        # every fund row is loaded onto the page.
        while driver.find_elements_by_class_name('checkall'):
            driver.find_element_by_class_name('checkall').click()
            print "Waiting for the data to finish loading"
            time.sleep(2)
        bs_obj = bs4.BeautifulSoup(driver.page_source, 'html.parser')
        # log_obj.update_error(bs_obj.prettify(encoding='utf8'))
        e_trs = bs_obj.find('table', id='oTable').tbody.find_all('tr')
        for e_tr in e_trs:
            item = fund_monitor.items.FundMonitorItem()
            item['fund_code'] = e_tr.find('td', class_='bzdm').get_text(strip=True)
            item['fund_name'] = e_tr.find('td', class_='tol').a.get('title')
            item['url'] = ('http://fund.eastmoney.com/'
                           + e_tr.find('td', class_='tol').a.get('href'))
            yield scrapy.Request(item['url'], meta={'item': item},
                                 callback=self.parse1, dont_filter=True)
    except:
        log_obj.error("Parse failed in %s\nReason: %s"
                      % (self.name, traceback.format_exc()))
    finally:
        driver.quit()
def get_max_page(self, subject, url):
    driver = driver_manager.initialization(engine='Chrome')
    try:
        driver.get('about:blank')
        driver.get(url)
        bs_obj = bs4.BeautifulSoup(driver.page_source, 'html.parser')
        # The second-to-last link in the pagination bar holds the last page number.
        page_row = bs_obj.find('div', class_='pagination')
        max_page = page_row.find_all('a')[-2].get_text(strip=True)
        self.page_count[subject] = int(max_page)
        print "Subject %s has %s ranking pages in total" % (subject, max_page)
        # Persist the page counts so later runs can skip this step.
        with open('page_count.json', 'w') as f:
            json.dump(self.page_count, f)
    except:
        log_obj.error(url)
        log_obj.error(traceback.format_exc())
    finally:
        driver.quit()
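# Since get_max_page persists self.page_count to page_count.json, a later run
# can reload that cache instead of re-scraping the pagination bar. The helper
# below is only a minimal sketch of that idea; load_page_count is a
# hypothetical name and is not part of the original project code.
import json
import os

def load_page_count(path='page_count.json'):
    # Return the cached {subject: page_count} mapping, or an empty dict if the
    # cache file has not been written yet.
    if os.path.exists(path):
        with open(path) as f:
            return json.load(f)
    return {}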
def parse_catalog(self, subject, url0):
    max_page = self.page_count[subject]
    # Page 1 is url0 itself; pages 2..max_page take a "?page=N" query string.
    urls0 = [url0 + ('?page=%s' % (i + 1)) for i in range(max_page) if i > 0]
    urls = [url0, ] + urls0
    for url in urls:
        print "Parsing:", url
        if url in self.used_urls:
            print '%s skipping page %s (already crawled)' % (subject, url)
            continue
        driver = driver_manager.initialization(engine='Chrome')
        try:
            driver.get('about:blank')
            driver.get(url)
            bs_obj = bs4.BeautifulSoup(driver.page_source, 'html.parser')
            e_div = bs_obj.find('div', id='resultsMain')
            e_rows = e_div.find_all('div', class_='sep')
            for e_row in e_rows:
                point = e_row.find('div', class_='t-large t-strong t-constricted').get_text(strip=True)
                rank = e_row.find('span', class_='rankscore-bronze').get_text(strip=True)
                university = e_row.find('h2', class_='h-taut').get_text()
                address = e_row.find('div', class_='t-taut').get_text()
                addition = e_row.find_all('div')[-1].get_text()
                # Keys are kept in Chinese as written: 得分=score, 排名=rank,
                # 大学=university, 地址=address, 其他=other.
                d = {
                    u"得分": point,
                    u"排名": rank,
                    u"大学": university,
                    u"地址": address,
                    u"其他": addition,
                    u"url": url,
                }
                d = {key: d[key].strip() for key in d}
                ser = pd.Series(d)
                print pd.DataFrame(ser).T
                yield ser
        except:
            log_obj.error(url)
            log_obj.error(traceback.format_exc())
        finally:
            driver.quit()
def parse(self, response):
    driver = driver_manager.initialization()
    try:
        driver.get('about:blank')
        driver.get(response.url)
        e_table = driver.find_element_by_id('dbtable')
        e_as = e_table.find_element_by_tag_name(
            'thead').find_elements_by_tag_name('a')
        code_set = set()
        # Click each sortable column header so the table is re-rendered,
        # then collect the fund codes seen under every ordering.
        for e_a in e_as[6:15]:
            title = e_a.text
            e_a.click()
            time.sleep(1)
            bs_obj = bs4.BeautifulSoup(driver.page_source, 'html.parser')
            e_table = bs_obj.find('table', id='dbtable')
            df = pd.read_html(e_table.prettify(encoding='utf8'),
                              encoding='utf8')[0]
            df.iloc[:, 2] = df.iloc[:, 2].apply(
                lambda i: '{0:0>6}'.format(i))  # pad the fund code back to a 6-character string
            code_set.update(df.iloc[:, 2].tolist())
        for code in code_set:
            item = fund_monitor.items.FundMonitorItem()
            item['fund_code'] = code
            item['url'] = 'http://fund.eastmoney.com/f10/ccmx_%s.html' % code
            yield scrapy.Request(item['url'], meta={'item': item},
                                 callback=self.parse1, dont_filter=True)
    except:
        log_obj.error("Parse failed in %s\nReason: %s"
                      % (self.name, traceback.format_exc()))
    finally:
        driver.quit()
def get_cookies(self, url):
    # Log in through the browser and return the session cookies; on failure the
    # whole attempt is retried with a fresh driver.
    while True:
        driver = driver_manager.initialization(engine='Chrome')
        try:
            driver.get(url)  # 'http://www.simuwang.com/'
            # Click the top-right login link until the login box appears.
            while not driver.find_elements_by_id('gr-login-box'):
                driver.find_element_by_class_name('topRight').find_element_by_tag_name('a').click()
                time.sleep(2)
            time.sleep(10)
            # driver.save_screenshot('screenshot.png')
            cookies = driver.get_cookies()
            # print {d[u'name']: d[u'value'] for d in cookies}
            # driver.save_screenshot('screenshot.png')
            # Fill in the hard-coded account and password, then submit.
            login_box = driver.find_element_by_id('gr-login-box')
            login_box.find_elements_by_tag_name('input')[0].send_keys('13575486859')
            login_box.find_elements_by_tag_name('input')[0].send_keys(Keys.TAB)
            login_box.find_elements_by_tag_name('input')[2].send_keys('137982')
            # passwd_input.click()
            # passwd_input.send_keys('137482')
            login_button = login_box.find_element_by_class_name('gr-big-btn')
            login_button.click()
            time.sleep(3)
            # driver.save_screenshot('screenshot.png')
            cookies = driver.get_cookies()
            # print {d[u'name']: d[u'value'] for d in cookies}
            return cookies  # {d[u'name']: d[u'value'] for d in cookies}
        except:
            log_obj.error("Parse failed in %s\nReason: %s"
                          % (self.name, traceback.format_exc()))
        finally:
            driver.quit()
def start_requests(self):
    # d1 = self.get_cookies('http://dc.simuwang.com/product/HF00001MTU')
    # d1 = {d['name']: d['value'] for d in d1}
    # for key in d1:
    #     print '"%s":"%s",' % (key, d1[key])

    # Ranking API URLs per strategy (keys: 股票策略=stock, 宏观策略=macro,
    # 管理期货=managed futures, 事件驱动=event driven, 相对价值=relative value,
    # 固定收益=fixed income). Each entry is [prefix, suffix]; the page number
    # goes between them.
    url_dict = {
        '股票策略': [
            "http://dc.simuwang.com/ranking/get?page=",
            "&condition=fund_type%3A1%2C6%2C4%2C3%2C8%2C2%3Bret%3A4%3Brating_year%3A1%3Bstrategy%3A1%3Bistiered%3A0%3Bcompany_type%3A1%3Bsort_name%3Aprofit_col2%3Bsort_asc%3Adesc%3Bkeyword%3A"
        ],
        '宏观策略': [
            "http://dc.simuwang.com/ranking/get?page=",
            "&condition=fund_type%3A1%2C6%2C4%2C3%2C8%2C2%3Bret%3A4%3Brating_year%3A1%3Bstrategy%3A2%3Bistiered%3A0%3Bcompany_type%3A1%3Bsort_name%3Aprofit_col2%3Bsort_asc%3Adesc%3Bkeyword%3A"
        ],
        '管理期货': [
            "http://dc.simuwang.com/ranking/get?page=",
            "&condition=fund_type%3A1%2C6%2C4%2C3%2C8%2C2%3Bret%3A4%3Brating_year%3A1%3Bstrategy%3A3%3Bistiered%3A0%3Bcompany_type%3A1%3Bsort_name%3Aprofit_col2%3Bsort_asc%3Adesc%3Bkeyword%3A"
        ],
        '事件驱动': [
            "http://dc.simuwang.com/ranking/get?page=",
            "&condition=fund_type%3A1%2C6%2C4%2C3%2C8%2C2%3Bret%3A4%3Brating_year%3A1%3Bstrategy%3A4%3Bistiered%3A0%3Bcompany_type%3A1%3Bsort_name%3Aprofit_col2%3Bsort_asc%3Adesc%3Bkeyword%3A"
        ],
        '相对价值': [
            "http://dc.simuwang.com/ranking/get?page=",
            "&condition=fund_type%3A1%2C6%2C4%2C3%2C8%2C2%3Bret%3A4%3Brating_year%3A1%3Bstrategy%3A5%3Bistiered%3A0%3Bcompany_type%3A1%3Bsort_name%3Aprofit_col2%3Bsort_asc%3Adesc%3Bkeyword%3A"
        ],
        '固定收益': [
            "http://dc.simuwang.com/ranking/get?page=",
            "&condition=fund_type%3A1%2C6%2C4%2C3%2C8%2C2%3Bret%3A4%3Brating_year%3A1%3Bstrategy%3A6%3Bistiered%3A0%3Bcompany_type%3A1%3Bsort_name%3Aprofit_col2%3Bsort_asc%3Adesc%3Bkeyword%3A"
        ],
    }
    for key in url_dict:
        url = "%s1%s" % (url_dict[key][0], url_dict[key][1])
        print url
        # Fetch page 1 once to obtain cookies and the total page count.
        driver = driver_manager.initialization()
        driver.get(url)
        # print driver.get_cookies()
        # print {d['name']: d['value'] for d in driver.get_cookies()}
        global catlog_cookies
        for cookie0 in catlog_cookies:
            driver.add_cookie(cookie0)
        driver.get(url)
        # print driver.page_source
        data = json.loads(re.search(r'{.+}', driver.page_source).group())
        # print "Page-1 data length: %s" % len(data["data"])
        page_num = 2  # int(data["pager"]["pagecount"])  # capped at 2 pages here
        urls = ["%s%s%s" % (url_dict[key][0], i + 1, url_dict[key][1])
                for i in range(page_num)]
        cookies = {d['name']: d['value'] for d in driver.get_cookies()}
        driver.quit()
        # Crawl each ranking page with the cookies obtained above.
        for url in urls:
            item = fund_monitor.items.FundMonitorItem()
            item['cookies'] = cookies
            item['data'] = {'fund_type': key}
            time.sleep(2)
            yield scrapy.Request(url=url, meta={'item': item},
                                 cookies=item['cookies'], callback=self.parse0)
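# The long "condition" query string above is just a URL-encoded filter
# specification; the six url_dict entries differ only in the strategy field
# (strategy:1 through strategy:6). A quick sketch of decoding it, using only
# percent-decoding and nothing project-specific:
import urllib

condition = ("fund_type%3A1%2C6%2C4%2C3%2C8%2C2%3Bret%3A4%3Brating_year%3A1"
             "%3Bstrategy%3A1%3Bistiered%3A0%3Bcompany_type%3A1"
             "%3Bsort_name%3Aprofit_col2%3Bsort_asc%3Adesc%3Bkeyword%3A")
print urllib.unquote(condition)
# fund_type:1,6,4,3,8,2;ret:4;rating_year:1;strategy:1;istiered:0;
# company_type:1;sort_name:profit_col2;sort_asc:desc;keyword: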