def _listparse(self, element, cfg):
    """Parse the nodes selected by a list-type config node into a list.

    Steps, in order:
      1. Select nodes from ``element`` via ``self.tran.select(element, cfg)``.
      2. Parse each selected node with ``self._parse`` using the first
         ``dict``/``list``/``val`` child of ``cfg`` (or ``None`` if absent).
      3. For every ``incl`` child of ``cfg``, run ``self._inclparse`` on each
         selected node and append the items of every list it returns.
      4. For every ``spec`` child of ``cfg``, hand the accumulated results to
         ``self.tran.spec`` for post-processing.

    Args:
        element: node passed to the translator's ``select``.
        cfg: config node; must provide ``xpath``, ``findall`` and
            ``getroottree`` (presumably an lxml element -- confirm).

    Returns:
        The list of parsed results. If ``self.result`` was ``None`` on entry
        it is bound to this same list object.
    """
    rlist = []
    if self.result is None:
        self.result = rlist

    # Deliberately NOT called ``list``: the original shadowed the builtin,
    # which made the ``type(rlt) == list`` check below always false, so
    # every ``incl`` result was dropped and reported as an error.
    nodes = self.tran.select(element, cfg)

    children = cfg.xpath("dict | list | val")
    ncfg = children[0] if len(children) else None

    # Mutate rlist in place (no rebinding) because self.result may alias it.
    for node in nodes:
        rlist.append(self._parse(node, ncfg))

    for item in cfg.findall("incl"):
        for node in nodes:
            included = self._inclparse(item, self.tran.select(node, item))
            if isinstance(included, list):
                rlist.extend(included)
            else:
                log.error("%s 's result is not a list (%s)" % (cfg.getroottree().getpath(cfg), type(included)))

    for item in cfg.findall("spec"):
        self.tran.spec(rlist, item, nodes)

    return rlist
def update_all_nav(fundcode=[], start=None, end=None):
    """Crawl historical NAV data from Sina for funds and upsert it into fund_data.

    For each fund code the Sina "lsjz" page is fetched through ``crawler``
    (configured by ``crawler_allfund_of_sina.xml``) and every returned row is
    written with an ``insert ... on duplicate key update`` statement. The
    transaction is committed once per fund.

    Args:
        fundcode: iterable of fund codes to update. ``None`` means every code
            found in the ``fund_info`` table. NOTE(review): the default is an
            empty list, so calling with no arguments updates nothing --
            confirm whether ``None`` was the intended default. The list is
            never mutated here.
        start: optional object with ``strftime``; when given it overrides the
            start date for every fund. Otherwise each fund's ``birthday``
            column is used.
        end: object with ``strftime`` giving the end date. Defaults to
            ``datetime.now()`` evaluated at call time (the previous default
            was evaluated once, when the function was defined).

    Raises:
        KeyError: if a requested code is not present in ``fund_info``.
    """
    log.info('update_all_nav start')
    if end is None:
        end = datetime.now()
    req = 'http://biz.finance.sina.com.cn/fundinfo/open/lsjz.php?fund_code='
    endtime = end.strftime('%Y-%m-%d')
    sql4data = '''insert into fund_data ( code , date , nav , tnav) values (%s,%s,%s,%s) on duplicate key update nav=%s, tnav=%s '''

    conn = pydb.connect(**SQL_CONN)
    try:
        # Build the per-fund default date range from the fund's birthday.
        cur = conn.cursor()
        cur.execute("select code,birthday from fund_info")
        fundstart = {}
        for row in cur.fetchall():
            fundstart[row[0]] = {'startdate1': row[1].strftime('%Y-%m-%d'), 'enddate1': endtime}

        if fundcode is None:
            fundcode = fundstart.keys()
        count = len(fundcode)

        cur = conn.cursor()
        for ind, fc in enumerate(fundcode, 1):
            log.info("start %s %s/%s" % (fc, ind, count))
            crawl = crawler()
            crawl.settranslator("xml")
            # Copy so the override below does not leak into the lookup table.
            postdata = dict(fundstart[fc])
            if start is not None:
                postdata['startdate1'] = start.strftime('%Y-%m-%d')
            # NOTE(review): 'Refer' is not a standard HTTP header name;
            # presumably 'Referer' was intended -- confirm before changing.
            header = {'Refer': req + fc}
            crawl.seturi(req + fc, postdata, header)
            crawl.setcfgfile("crawler_allfund_of_sina.xml")
            result = crawl.parse()
            log.debug("data %s %s" % (fc, result))

            # The emptiness check lives outside the row loop; inside it the
            # "no information" branch could never be reached.
            if result:
                for data in result:
                    # nav/tnav are repeated: once for insert, once for update.
                    cur.execute(sql4data, (fc, data['date']) + (data['nav'], data['tnav']) * 2)
                    warnings = cur.fetchwarnings()
                    if warnings:
                        # %-format instead of '+': fetchwarnings() may return
                        # a non-string (e.g. a list of tuples).
                        log.warn("db:%s" % (warnings,))
            else:
                log.error("no information about %s" % fc)

            conn.commit()
            log.info("done %s %s/%s" % (fc, ind, count))
    finally:
        # Release the connection even when crawling or SQL raises.
        conn.close()
    log.info('update_all_nav done')
def seturi(self, uri, params=None, headers=None):
    """Fetch ``uri`` and store the response body in ``self.content``.

    Up to ``self.MAX_URL_OPEN`` attempts are made, sleeping 5 seconds
    between failed attempts. If every attempt fails, ``self.content`` is left
    as ``None`` so a later ``parse()`` does not run on a previous response.

    Args:
        uri: URL to open; also stored on ``self.uri``.
        params: optional request data. A dict is url-encoded; any other
            non-None value is passed to the opener unchanged.
        headers: optional mapping of extra header names to values.
    """
    log.info("loading %s %s" % (uri, params))
    self.uri = uri
    # Drop any earlier response so a failed load cannot be mistaken for data.
    self.content = None
    if isinstance(params, dict):
        params = urllib.urlencode(params)

    user_agent = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; Embedded Web Browser from: http://bsalsa.com/; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET CLR 1.1.4322; Tablet PC 2.0)"

    # Inclusive upper bound: the original range(1, MAX) made one attempt too
    # few and rendered the "i < MAX" guard below always true.
    for i in range(1, self.MAX_URL_OPEN + 1):
        try:
            opener = urllib.URLopener()
            opener.addheader("User-Agent", user_agent)
            opener.addheader("Cache-Control", "no-cache")
            for k, v in (headers or {}).items():
                opener.addheader(k, v)
            response = opener.open(uri, params)
            try:
                self.content = response.read()
            finally:
                response.close()
            break
        except Exception as e:
            # Exception rather than BaseException, so KeyboardInterrupt and
            # SystemExit still abort the retry loop.
            log.error("%s got error %s" % (i, e))
            if i < self.MAX_URL_OPEN:
                time.sleep(5)
def parse(self):
    """Run the configured parse over the loaded content and return the data.

    ``self.result`` is cleared first. If ``self.content``, ``self.cfg`` or
    ``self.tran`` is ``None`` (checked in that order), an error is logged and
    ``None`` is returned. Otherwise the translator root is parsed against the
    config, the result is dumped as indented JSON through
    ``_writetologfile`` under ``self.uuid``, and the parsed data is returned.
    """
    self.result = None

    # Attributes are read lazily, one at a time, in the listed order.
    required = (
        ("content", "no content for parsing"),
        ("cfg", "no config file"),
        ("tran", "no translator"),
    )
    for attr_name, complaint in required:
        if getattr(self, attr_name) is None:
            log.error(complaint)
            return None

    log.info("parsing")
    parsed = self._parse(self.tran.getroot(), self.cfg)
    log.info("done!")

    run_id = self.uuid
    dumped = json.dumps(parsed, ensure_ascii=False, indent=4)
    _writetologfile(run_id, dumped)
    return parsed
def showerrorpath(self, text, cfg):
    """Log an error message followed by the tree path of ``cfg``.

    Args:
        text: message to log; a falsy value is replaced by
            "parse cfg error".
        cfg: config node whose path is obtained with
            ``cfg.getroottree().getpath(cfg)``.
    """
    label = text if text else "parse cfg error"
    prefix = label + " "
    location = cfg.getroottree().getpath(cfg)
    log.error(prefix + location)