import re
import time

from sql import Sql


class Fund:
    def __init__(self):
        self.Sql = Sql()
        self.db_conn = self.Sql.conn_db(db='fund')

    def main(self, driver):
        # Total number of result pages, parsed from the pager text.
        total_page = driver.find_element_by_class_name('nv').text
        total_page = int(re.search(r'\d+', total_page).group())
        print('total page:', total_page)

        # Resume from the last page recorded in crawl_info.
        sql = 'select crawledPage from crawl_info where spiderName = "fund"'
        crawledPage = int(self.Sql.exec_sql(self.db_conn, sql).fetchall()[0][0])
        current_page = crawledPage
        if current_page == total_page:
            return

        for i in range(total_page):
            # Pages already crawled are skipped by clicking "next" until we catch up.
            if i < current_page:
                next_page = driver.find_elements_by_xpath(
                    "//div[@id='pager']/span[@class='nu page']")[-1]
                next_page.click()
                time.sleep(10)
                continue
            try:
                fund_ids = driver.find_elements_by_class_name('bzdm')
                for fund_id in fund_ids:
                    fund_id = fund_id.text
                    # Insert the fund code only if it is not already in the table.
                    sql = ('insert into fund(fund) select "{0}" from dual '
                           'where "{0}" not in (select fund from fund)').format(fund_id)
                    self.Sql.exec_sql(self.db_conn, sql)
                current_page += 1
                print('Crawled Page {}'.format(current_page))
                sql = 'update crawl_info set crawledPage = {} where spiderName = "fund"'.format(
                    current_page)
                self.Sql.exec_sql(self.db_conn, sql)
                next_page = driver.find_elements_by_xpath(
                    "//div[@id='pager']/span[@class='nu page']")[-1]
                next_page.click()
                time.sleep(10)
            except Exception as reason:
                print('Spider Crawl Failed in Page {0}, {1}'.format(
                    current_page, str(reason)))
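# These scripts all rely on a small `sql` helper module that is not shown here.
# A minimal sketch of what it might look like, inferred from the calls
# conn_db(db=...) and exec_sql(conn, sql, data) used throughout; the pymysql
# backend and the connection parameters are assumptions, not the original code.
import pymysql


class Sql:
    def conn_db(self, db):
        # Open a connection to the given database (credentials are placeholders).
        return pymysql.connect(host='localhost', user='root',
                               password='password', db=db, charset='utf8mb4')

    def exec_sql(self, conn, sql, data=None):
        # Execute a single statement, or executemany() when a list of rows is given.
        cursor = conn.cursor()
        if data is None:
            cursor.execute(sql)
        else:
            cursor.executemany(sql, data)
        conn.commit()
        return cursor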
import requests
from lxml import etree

from sql import Sql

Sql = Sql()
db_conn = Sql.conn_db('fund')

# Full fund list page; the page is served in gb2312.
url = 'http://fund.eastmoney.com/allfund.html'
r = requests.get(url)
r.encoding = 'gb2312'
html = etree.HTML(r.text)

# Every fund link text has the form "(code)name"; this XPath matches
# every fund entry on the page in one pass.
funds = html.xpath(
    '//div[@id="code_content"]//div[@class="num_box"]/ul/li/div/a[1]/text()')
for fund in funds:
    print(fund)
    code = fund.split(')')[0][1:]
    name = fund.split(')')[1]
    sql = 'insert into fund(code, name) values ("{}", "{}")'.format(code, name)
    Sql.exec_sql(db_conn, sql)
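# The insert above interpolates values directly into the SQL string. The other
# scripts in this repo pass rows to exec_sql with %s placeholders instead, which
# also avoids quoting problems. A parameterized variant of the same insert,
# assuming exec_sql(conn, sql, rows) runs executemany as it does elsewhere:
rows = []
for fund in funds:
    code = fund.split(')')[0][1:]
    name = fund.split(')')[1]
    rows.append([code, name])

sql = 'insert into fund(code, name) values (%s, %s)'
Sql.exec_sql(db_conn, sql, rows)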
pages, page)
response = requests.get(url)
html = response.text
# The response is a JS snippet like: var returnjson= {data:[["..."],["..."],...]}.
# Strip the leading "var returnjson= {data:[[" prefix and the trailing "]]...}" wrapper.
data = html[len('var returnjson= {data:[['):(len(html.split(']')[-1]) + 2) * -1]
managers = data.split('],[')
for manager in managers:
    manager = manager[1:-1].split('","')
    code = manager[0]
    name = manager[1]
    companycode = manager[2]
    companyname = manager[3]
    currentfundcodes = manager[4]
    currentfundnames = manager[5]
    managertotalday = manager[6]
    bestincomerate = manager[7]
    bestfundcode = manager[8]
    bestfundname = manager[9]
    bestfundmoney = manager[10]
    bestfundincomerate = manager[11]
    crawldate = str(date.today())
    sql = ("insert into manager(code,name,companycode,companyname,currentfundcodes,"
           "currentfundnames,managertotalday,bestincomerate,bestfundcode,bestfundname,"
           "bestfundmoney,bestfundincomerate,crawldate) "
           "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
    Sql.exec_sql(db_conn, sql, [[
        code, name, companycode, companyname, currentfundcodes,
        currentfundnames, managertotalday, bestincomerate, bestfundcode,
        bestfundname, bestfundmoney, bestfundincomerate, crawldate
    ]])
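# The slicing above depends on the exact length of the "var returnjson= {data:[["
# prefix. A regex that captures everything between the outer [[ and ]] is a
# slightly more tolerant alternative; this is a sketch, not part of the original
# script, and assumes the response contains exactly one data array.
import re

match = re.search(r'\[\[(.*)\]\]', html, re.S)
if match:
    data = match.group(1)
    managers = data.split('],[')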
import requests
from datetime import date

from bs4 import BeautifulSoup

from sql import Sql

Sql = Sql()
db_conn = Sql.conn_db('fund')

url = 'http://fund.eastmoney.com/Company/default.html#scomname;dasc'
response = requests.get(url)
response.encoding = 'utf8'
html = response.text
soup = BeautifulSoup(html, 'html.parser')

# One <tr> per fund company in the ranking table.
trs = soup.find('table', id='gspmTbl').find('tbody').find_all('tr')
for tr in trs:
    tds = tr.find_all('td')
    code = tds[1].find('a')['href']
    name = tds[1].get_text()
    establishdate = tds[3].get_text()
    # The rating column renders one star <label> per point.
    labels = tds[4].find_all('label', 'sprite sprite-star1')
    txscore = len(labels)
    scalenumber = tds[5]['data-sortvalue']
    allfundnumber = tds[6].get_text()
    allmanagernumber = tds[7].get_text()
    crawldate = str(date.today())
    sql = ("insert into company (code,name,establishdate,txscore,scalenumber,"
           "allfundnumber,allmanagernumber,crawldate) "
           "values(%s,%s,%s,%s,%s,%s,%s,%s)")
    Sql.exec_sql(db_conn, sql, [[
        code, name, establishdate, txscore, scalenumber, allfundnumber,
        allmanagernumber, crawldate
    ]])
import requests
from bs4 import BeautifulSoup

from sql import Sql

Sql = Sql()
db_conn = Sql.conn_db('wzmatch')

url = 'https://www.fnscore.com/detail/league/kog-2/league-kog-647.html'
response = requests.get(url)
html = response.text

data = []
soup = BeautifulSoup(html, 'html.parser')
# Take the second match-panel-container on the page, then each match row inside it.
matches = soup.find_all('div', 'match-panel-container')[1].find_all(
    'div', 'match-panel-item match-table-item')
for match in matches:
    match_info = match.find_all('p')
    matchTime = match_info[0].get_text()
    teams = match.find_all('div', 'team')
    teamA = teams[0].find('p').get_text()
    teamB = teams[1].find('p').get_text()
    score = match_info[2].get_text()
    BO = match_info[4].get_text()
    data.append([matchTime, teamA, teamB, score, BO])

sql = 'insert into kpl2020_autumn(matchTime,teamA,teamB,score,BO) values(%s,%s,%s,%s,%s)'
Sql.exec_sql(db_conn, sql, data)
import time

from driver import Driver  # module path assumed; provides the shared webdriver helper
from sql import Sql


class JJJZ:
    def __init__(self):
        self.Sql = Sql()
        self.Driver = Driver()
        self.db_conn = self.Sql.conn_db(db='fund')

    def main(self):
        driver = self.Driver.main()
        root = 'http://fundf10.eastmoney.com/jjjz_'
        # Resume from the last fund id recorded in crawl_info.
        sql = ('select id,fund from fund where id > '
               '(select ifnull(crawledFundId,0) from crawl_info where spiderName = "jjjz")')
        fund_id_li = self.Sql.exec_sql(self.db_conn, sql)
        driver.implicitly_wait(2)
        for fund in fund_id_li:
            id = fund[0]
            fund = fund[1]
            jjjz_page = 0
            url = root + fund + '.html'
            driver.get(url)
            # The second-to-last pager button holds the total page count.
            jjjz_total_page = driver.find_elements_by_xpath(
                '//div[@class="pagebtns"]/label')[-2].text
            # sql = 'select crawledPage from crawl_info where spiderName = "jjjz"'
            # jjjz_page = int(self.Sql.exec(self.db_conn, sql))
            data = []
            for _ in range(int(jjjz_total_page)):
                trs = driver.find_elements_by_xpath(
                    '//div[@id="jztable"]/table/tbody/tr')
                for row in range(len(trs)):
                    date = driver.find_element_by_xpath(
                        '//div[@id="jztable"]/table/tbody/tr[{0}]/td[1]'.format(row + 1)).text.strip()
                    unit_jz = driver.find_element_by_xpath(
                        '//div[@id="jztable"]/table/tbody/tr[{0}]/td[2]'.format(row + 1)).text.strip()
                    total_jz = driver.find_element_by_xpath(
                        '//div[@id="jztable"]/table/tbody/tr[{0}]/td[3]'.format(row + 1)).text.strip()
                    date_rate = driver.find_element_by_xpath(
                        '//div[@id="jztable"]/table/tbody/tr[{0}]/td[4]'.format(row + 1)).text.strip()
                    buy_status = driver.find_element_by_xpath(
                        '//div[@id="jztable"]/table/tbody/tr[{0}]/td[5]'.format(row + 1)).text.strip()
                    sale_status = driver.find_element_by_xpath(
                        '//div[@id="jztable"]/table/tbody/tr[{0}]/td[6]'.format(row + 1)).text.strip()
                    red = driver.find_element_by_xpath(
                        '//div[@id="jztable"]/table/tbody/tr[{0}]/td[7]'.format(row + 1)).text.strip()
                    data.append([
                        id, date, unit_jz, total_jz, date_rate, buy_status,
                        sale_status, red
                    ])
                print('Crawling Fund {0}, Crawled Page {1}'.format(fund, jjjz_page))
                jjjz_page += 1
                # The last pager button is "next page".
                jjjz_next_page = driver.find_elements_by_xpath(
                    '//div[@class="pagebtns"]/label')[-1]
                jjjz_next_page.click()
                time.sleep(1)
            sql = ("insert into jjjz(fundId,JZDate,unitJZ,totalJZ,dateRate,buyStatus,"
                   "saleStatus,red) values (%s,%s,%s,%s,%s,%s,%s,%s)")
            self.Sql.exec_sql(self.db_conn, sql, data)
            sql = 'update crawl_info set crawledFundId = {} where spiderName = "jjjz"'.format(id)
            self.Sql.exec_sql(self.db_conn, sql)
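# The `Driver` helper used above is also not shown. A minimal sketch of what
# Driver.main() might do, assuming it simply builds and returns a Selenium
# Chrome webdriver; the Chrome/headless setup here is an assumption.
from selenium import webdriver


class Driver:
    def main(self):
        # Return a headless Chrome webdriver for the crawlers to share.
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        return webdriver.Chrome(options=options)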