class JJGKPipeline:
    """Item pipeline that writes each scraped item into the ``jjgk`` table.

    The ``Sql`` helper and the handle returned by ``conn_db('fund')`` are
    created once in ``__init__`` and reused for every item.
    """

    # Item keys, in the same order as the column list and the placeholders
    # of ``_INSERT_SQL``.  Reading values through this tuple replaces 32
    # one-off locals, two of which shadowed the builtins ``type`` and
    # ``range``.
    _FIELDS = (
        'code', 'fullname', 'shortname', 'type', 'releasetime',
        'establishtime', 'establishcount', 'money', 'count', 'company',
        'companycode', 'bank', 'bankcode', 'manager', 'managercode', 'red',
        'managerfee', 'bankfee', 'servicefee', 'applybuyfee', 'buyfee',
        'salefee', 'comparestandard', 'tacking', 'target', 'idea', 'range',
        'strangy', 'redpolicy', 'risk', 'leastbuy', 'crawldate',
    )

    # The ``range`` column is back-quoted in the statement; every other
    # column name is written bare.
    _INSERT_SQL = (
        "insert into jjgk(code,fullname,shortname,type,releasetime,establishtime,establishcount,money,count,company,companycode,"
        "bank,bankcode,manager,managercode,red,managerfee,bankfee,servicefee,applybuyfee,buyfee,salefee,comparestandard,tacking,"
        "target,idea,`range`,strangy,redpolicy,risk,leastbuy,crawldate) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,"
        "%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
    )

    def __init__(self):
        self.Sql = Sql()
        self.db_conn = self.Sql.conn_db('fund')

    def process_item(self, item, spider):
        """Insert ``item`` into ``jjgk`` and hand it back unchanged.

        Args:
            item: Mapping-like item providing every key in ``_FIELDS``.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item`` object, so later pipelines can keep using it.

        Raises:
            KeyError: If ``item`` lacks one of the expected keys.
        """
        row = [item[field] for field in self._FIELDS]
        # The values are wrapped in an outer list, exactly as the helper was
        # called before (one inner list per row).
        self.Sql.exec_sql(self.db_conn, self._INSERT_SQL, [row])
        return item
def __init__(self):
    """Create the ``Sql`` helper and keep the handle from ``conn_db('fund')``."""
    helper = Sql()
    self.Sql = helper
    self.db_conn = helper.conn_db('fund')
class JJJLPipeline:
    """Item pipeline that routes each item to the table matching its class.

    ``JJJLItem`` goes to ``jjjl``, ``JJJLFundHistoryItem`` goes to
    ``jjjl_fund_history`` and ``JJJLCurrentFundItem`` goes to
    ``jjjl_current_fund``.  Items of any other class are returned without
    touching the database.
    """

    def __init__(self):
        helper = Sql()
        self.Sql = helper
        self.db_conn = helper.conn_db('fund')

    def process_item(self, item, spider):
        """Insert ``item`` into its table and return it unchanged.

        Args:
            item: One of the three item classes listed on the class.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item`` object.
        """
        # Choose the statement and the item keys (in column order) first;
        # the insert itself is then shared by all three item classes.
        if isinstance(item, JJJLItem):
            statement = "insert into jjjl(code,name,info,totalday,startdate,currentcompany,currentfundmoney,bestincomerate,crawldate) values(%s,%s,%s,%s,%s,%s,%s,%s,%s)"
            keys = (
                'code', 'name', 'info', 'totalday', 'startdate',
                'currentcompany', 'currentfundmoney', 'bestincomerate',
                'crawldate',
            )
        elif isinstance(item, JJJLFundHistoryItem):
            statement = "insert into jjjl_fund_history(code,fundcode,fundname,fundtype,fundmoney,fundmanagerdate,fundmanagerday,fundmanagerincomerate,crawldate) values(%s,%s,%s,%s,%s,%s,%s,%s,%s)"
            keys = (
                'code', 'fundcode', 'fundname', 'fundtype', 'fundmoney',
                'fundmanagerdate', 'fundmanagerday', 'fundmanagerincomerate',
                'crawldate',
            )
        elif isinstance(item, JJJLCurrentFundItem):
            statement = (
                "insert into jjjl_current_fund(code,fundcode,fundname,fundtype,last3mrate,last3mrank,last6mrate,last6mrank,last1yrate,last1yrank,last2yrate,last2yrank,currentyearrate,currentyearrank,crawldate) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,"
                "%s,%s,%s)"
            )
            keys = (
                'code', 'fundcode', 'fundname', 'fundtype', 'last3mrate',
                'last3mrank', 'last6mrate', 'last6mrank', 'last1yrate',
                'last1yrank', 'last2yrate', 'last2yrank', 'currentyearrate',
                'currentyearrank', 'crawldate',
            )
        else:
            # Unknown item class: nothing to store.
            return item

        values = [item[key] for key in keys]
        self.Sql.exec_sql(self.db_conn, statement, [values])
        return item
class JJJL(scrapy.Spider):
    """Spider that scrapes one ``fund.eastmoney.com/manager/<code>.html`` page per code.

    Codes are read from the ``manager`` table, skipping those already present
    in ``jjjl``.  Each page yields one ``JJJLItem``, then one
    ``JJJLFundHistoryItem`` per row of the first ``ftrs`` table, then one
    ``JJJLCurrentFundItem`` per row of the ``ftrs`` table inside the second
    ``content_in`` div.
    """

    name = "jjjl"
    custom_settings = {
        'ITEM_PIPELINES': {
            'fund.jjjl_pipelines.JJJLPipeline': 300
        },
    }

    # (item key, <td> index) pairs for a fund-history row.  Index 2 is
    # skipped, as in the original extraction.
    _HISTORY_COLUMNS = (
        ('fundcode', 0),
        ('fundname', 1),
        ('fundtype', 3),
        ('fundmoney', 4),
        ('fundmanagerdate', 5),
        ('fundmanagerday', 6),
        ('fundmanagerincomerate', 7),
    )

    # Item keys for a current-fund row; the position in this tuple is the
    # <td> index (0..12).
    _CURRENT_FUND_COLUMNS = (
        'fundcode', 'fundname', 'fundtype',
        'last3mrate', 'last3mrank',
        'last6mrate', 'last6mrank',
        'last1yrate', 'last1yrank',
        'last2yrate', 'last2yrank',
        'currentyearrate', 'currentyearrank',
    )

    def __init__(self, *args, **kwargs):
        # Forward whatever Scrapy passes (e.g. ``-a`` spider arguments) to
        # the base class so its own initialisation is not skipped.
        super(JJJL, self).__init__(*args, **kwargs)
        self.Sql = Sql()
        self.db_conn = self.Sql.conn_db('fund')

    def start_requests(self):
        """Yield one request per manager code that is not yet in ``jjjl``."""
        sql = 'select code from manager where code not in(select code from jjjl)'
        rows = self.Sql.exec_sql(self.db_conn, sql).fetchall()
        for row in rows:
            code = row[0]
            url = 'http://fund.eastmoney.com/manager/{}.html'.format(code)
            yield scrapy.Request(url=url, callback=self.parse, meta={'code': code})

    def parse(self, response):
        """Extract the manager summary and both fund tables from one page.

        Args:
            response: Page response; ``response.meta['code']`` carries the
                manager code the request was built from.

        Yields:
            ``JJJLItem``, ``JJJLFundHistoryItem`` and ``JJJLCurrentFundItem``
            instances, a fresh object for every yield.
        """
        code = response.meta['code']
        crawldate = str(date.today())
        soup = BeautifulSoup(response.text, 'lxml')

        # These containers are looked up once and reused below.
        content = soup.find('div', 'content_out')
        summary = content.find('div', 'left clearfix w438').find('div', 'right jd')
        scale_box = summary.find('div', 'gmContainer')

        manager = JJJLItem()
        manager['code'] = code
        manager['name'] = content.find('div', 'content_in').find('span').get_text()
        manager['info'] = content.find('div', 'jlinfo clearfix').find('p').get_text()
        manager['totalday'] = summary.get_text()
        manager['startdate'] = ''
        manager['currentcompany'] = summary.find('a').get_text()
        manager['currentfundmoney'] = scale_box.find(
            'div', 'gmleft gmlefts').find('span', 'numtext').get_text()
        # NOTE(review): a bare 'gmleft' class filter may also match the
        # 'gmleft gmlefts' div used just above; if that div comes first in
        # the page this reads the same value again - confirm against a live
        # page before relying on bestincomerate.
        manager['bestincomerate'] = scale_box.find(
            'div', 'gmleft').find('span', 'numtext').get_text()
        manager['crawldate'] = crawldate
        yield manager

        history_rows = content.find('table', 'ftrs').find('tbody').find_all('tr')
        for tr in history_rows:
            tds = tr.find_all('td')
            # A new item per row: re-yielding one shared, mutated instance
            # would make every already-yielded reference show the last row.
            history = JJJLFundHistoryItem()
            history['code'] = code
            for field, index in self._HISTORY_COLUMNS:
                history[field] = tds[index].get_text()
            history['crawldate'] = crawldate
            yield history

        current_rows = soup.find_all('div', 'content_in')[1].find(
            'table', 'ftrs').find('tbody').find_all('tr')
        for tr in current_rows:
            tds = tr.find_all('td')
            current = JJJLCurrentFundItem()
            current['code'] = code
            for index, field in enumerate(self._CURRENT_FUND_COLUMNS):
                current[field] = tds[index].get_text()
            current['crawldate'] = crawldate
            yield current
class JJGK(scrapy.Spider):
    """Spider that scrapes one ``fundf10.eastmoney.com/jbgk_<code>.html`` page per code.

    Codes are read from the ``fund`` table, skipping those already present in
    ``jjgk``.  Each page yields a single ``JJGKItem``.
    """

    name = "jjgk"
    custom_settings = {
        'ITEM_PIPELINES': {'fund.jjgk_pipelines.JJGKPipeline': 300},
    }

    # (item key, <td> index) pairs for the plain-text cells of the
    # ``info w790`` table.  Index 2 is not read, and index 5 is split into
    # two fields separately in ``parse``.
    _TABLE_TEXT_COLUMNS = (
        ('fullname', 0), ('shortname', 1), ('type', 3), ('releasetime', 4),
        ('money', 6), ('count', 7), ('company', 8), ('bank', 9),
        ('manager', 10), ('red', 11), ('managerfee', 12), ('bankfee', 13),
        ('servicefee', 14), ('applybuyfee', 15), ('buyfee', 16),
        ('salefee', 17), ('comparestandard', 18), ('tacking', 19),
    )

    # Item keys filled from the first <p> of boxes[1]..boxes[6], in order.
    _TEXT_BOX_FIELDS = (
        'target', 'idea', 'range', 'strangy', 'redpolicy', 'risk',
    )

    def __init__(self, *args, **kwargs):
        # Forward whatever Scrapy passes (e.g. ``-a`` spider arguments) to
        # the base class so its own initialisation is not skipped.
        super(JJGK, self).__init__(*args, **kwargs)
        self.Sql = Sql()
        self.db_conn = self.Sql.conn_db('fund')

    def start_requests(self):
        """Yield one request per fund code that is not yet in ``jjgk``."""
        sql = 'select code from fund where code not in(select code from jjgk)'
        rows = self.Sql.exec_sql(self.db_conn, sql).fetchall()
        for row in rows:
            code = row[0]
            url = 'http://fundf10.eastmoney.com/jbgk_{}.html'.format(code)
            yield scrapy.Request(url=url, callback=self.parse, meta={'code': code})

    def parse(self, response):
        """Build one ``JJGKItem`` from a fund-overview page.

        Args:
            response: Page response; ``response.meta['code']`` carries the
                fund code the request was built from.

        Yields:
            A single populated ``JJGKItem``.

        Raises:
            IndexError: If the page has fewer table cells or boxes than the
                fixed indexes used here.
        """
        code = response.meta['code']
        soup = BeautifulSoup(response.text, 'lxml')
        boxes = soup.find('div', 'detail').find('div', 'txt_cont').find(
            'div', 'txt_in').find_all('div', 'box')
        tds = boxes[0].find('table', 'info w790').find_all('td')

        item = JJGKItem()
        item['code'] = code
        for field, index in self._TABLE_TEXT_COLUMNS:
            item[field] = tds[index].get_text()

        # Cell 5 holds two '/'-separated values.
        established = tds[5].get_text().split('/')
        item['establishtime'] = established[0]
        item['establishcount'] = established[1]

        item['companycode'] = tds[8].find('a')['href']
        item['bankcode'] = tds[9].find('a')['href']
        # The manager cell may have no link; in that case the lookup result
        # itself is stored instead of an href.
        managercode = tds[10].find('a')
        if managercode:
            managercode = managercode['href']
        item['managercode'] = managercode

        # Pages with eight boxes carry one extra box at index 1; dropping it
        # lines the remaining boxes up with the seven-box layout.
        if len(boxes) == 8:
            del (boxes[1])
        # Each text field comes from its own box (1..6).  Previously every
        # field after ``target`` was read from boxes[2], so ``range``,
        # ``strangy``, ``redpolicy`` and ``risk`` all duplicated ``idea``.
        for index, field in enumerate(self._TEXT_BOX_FIELDS, 1):
            item[field] = boxes[index].find('p').get_text().strip('\n\r ')

        item['leastbuy'] = soup.find('div', 'bs_jz').find(
            'div', 'col-left').find('div').find('a').find_all('span')[-1].get_text()
        item['crawldate'] = str(date.today())
        yield item