def _first_group(pattern, text, default):
    """Return group 1 of the first match of *pattern* in *text*, else *default*."""
    found = re.search(pattern, text)
    if found is None:
        return default
    return found.group(1)


def data_insert(product, licailist):
    """Insert one jrj product record into ``sohulist`` unless already known.

    A de-duplication key is built from the bank name, currency, sale start
    date, sale end date, minimum purchase amount and duration.  When that
    key is not in ``licailist`` the product detail page is downloaded, a few
    extra fields are scraped from it, and one row is inserted through the
    module-level ``db`` object.  A line is written to the 'jrj' log on
    success.

    Args:
        product: mapping describing one product; read with ``.get`` for the
            keys ``sell_Org_Date``, ``sell_End_Date``, ``entr_Curncy_Name``,
            ``days``, ``prd_Max_Yld_De``, ``prd_Sname``, ``bank_Name``,
            ``entr_Min_Curncy``, ``end_Date`` and ``inner_Code``.
        licailist: container of de-duplication keys that are already stored.

    Returns:
        None.  Database errors are printed, not raised.
    """
    startdate = product.get('sell_Org_Date')
    enddate = product.get('sell_End_Date')
    currency = product.get('entr_Curncy_Name')
    productdays = product.get('days')
    if productdays == '':
        productdays = 0
    rate = product.get('prd_Max_Yld_De')
    name = product.get('prd_Sname')
    bank_name = product.get('bank_Name')
    money = product.get('entr_Min_Curncy')
    if money == '':
        money = 0
    endday = product.get('end_Date')
    id1 = str(product.get('inner_Code'))

    # The key depends only on `product`, so decide before touching the
    # network: known products need no detail-page download.
    key = (bank_name + currency + startdate + enddate
           + str(int(money)) + str(int(productdays)))
    if key in licailist:
        print(key + " already in!")
        return
    print("not in")

    url = 'http://bankpro.jrj.com.cn/product/' + id1 + '/'
    response = urllib.urlopen(url)
    try:
        html = response.read()
    finally:
        response.close()
    page = str(BeautifulSoup(html))

    # Optional fields scraped from the detail page; each falls back to a
    # default when the corresponding table cell is missing.
    start = _first_group(
        '<td class="cur">收益起始日期</td><td>(.*?)</td>', page, '')
    atype = _first_group(
        '<td class="cur">收益类型</td><td>(.*?)</td>', page, '')
    amoney = _first_group(
        '<td class="cur">起购金额递增单位</td><td>(.*?)</td>', page, 0)
    area = _first_group(
        '<td class="cur">销售地区</td><td colspan="5"><span.*?>(.*?)</span></td>',
        page, '')

    # Values are passed as parameters instead of being concatenated into the
    # statement: the numeric fallbacks above (0) used to break the string
    # concatenation, and scraped text containing quotes broke the SQL.
    # NOTE(review): assumes db.execute(query, *parameters) forwards
    # parameters to a DB-API driver with '%s' placeholders (torndb-style)
    # -- confirm against the db wrapper used by this script.
    sql = ("insert into sohulist(ID,ProductName,BankName,Currency,Duration,"
           "Product_StartDate,Sell_StartDate,Sell_EndDate,PayDuration,"
           "Return_Rate,Start_Money,Type,Product_EndDate,create_time,"
           "Increasing_Unit,Area,source) "
           "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
    params = (
        id1, name, bank_name, currency, productdays, start, startdate,
        enddate, productdays, rate, money, atype, endday,
        str(datetime.now())[:19], amoney, area, 'jrj',
    )
    print('sql= %s' % sql)

    try:
        testlog = log.getLogging('jrj')
        db.execute(sql, *params)
        print("insert ok!")
        testlog.critical(str(datetime.now())[:19] + '\tjrj\t' + key + '\n')
    except Exception as exc:
        # Best effort: one failed product must not stop the crawl, but the
        # reason is reported instead of being swallowed.
        print("insert error: %r" % (exc,))
import log g_log=log.getLogging(__name__) from concurrent import futures import time import threading import grpc import bank_pb2 import bank_pb2_grpc import bank_sql # from bank import bank_pb2 # from bank import bank_pb2_grpc # from bank import bank_sql _ONE_DAY_IN_SECONDS = 60 * 60 * 24 class bank_server(bank_pb2_grpc.bankServicer): def __init__(self): super(bank_server, self).__init__() self.bankSql=bank_sql.bank() self.bankSql.run() self.threadLock = threading.Lock() def deposit(self, request, context): # g_log.info(request) g_log.info("deposit: "+ request.account + " " + str(request.value)) self.threadLock.acquire()
tmpstr = tmpstr+str(int(id1.Duration)) idlist.append(id1.values()) licailist.append(tmpstr) jrjsql = 'select ID from jrjlist;' jrjIDs = [] for jrj in db.query(jrjsql): jrjIDs.append(jrj.ID) dic =json.loads(html.replace('var bps=','')) products = dic.get('bankProductList') for a in products: id1 = str(a.get('inner_Code')) if id1 in jrjIDs: print id1,"already crawlered" continue if id1 in idlist: print id1+' is already in it' continue print u"insert into jrjlist(ID) values('"+id1+"');" db.execute(u"insert into jrjlist(ID) values('"+id1+"');") data_insert(a,licailist) #''' if __name__ == "__main__": testlog = log.getLogging('jrj') testlog.critical(str(datetime.now())[:19]+'\tjrj\tstarted\n' ) print str(datetime.now())[:19]+'\tjrj\tstarted\n' main()
# -*- coding: utf8 -*- __author__ = 'Tan Ying<*****@*****.**>' import os import sys import log import json import config import commands logger = log.getLogging('utils.py') def is_argument_in_dataset(argument, dataset, ignore_case = False): """ 参数是否在数据集中 """ for item in dataset: if ignore_case: if argument.lower() == item.lower(): return True else: if argument == item: return True return False def change_list_to_json(list, file): string = '' if len(list) > 0: string = '{\n'
url = re.findall('<a href="/(.*?)"',hlist[0],re.S)[0] durl = u'http://www.hui800.com/' + url dhtml = urllib.urlopen(durl).read().decode('utf-8') dlist = re.findall('<body.*?>(.*?)</body>',dhtml,re.S)[0] address = re.findall('<a href="(.*?)"',dlist,re.S)[1] ''' hlist = re.findall('<div class="dealsug b615">(.*?)</div>',html,re.S) url = re.findall('<a href="/(.*?)"',hlist[0],re.S)[0] durl = u'http://www.hui800.com/' + url dhtml = urllib.urlopen(durl).read().decode('utf-8') dlist = re.findall('<body.*?>(.*?)</body>',dhtml,re.S)[0] address = re.findall('<a href="(.*?)"',dlist,re.S)[1] except: address='' return address if __name__ == "__main__": testlog = log.getLogging('qq') testlog.critical(str(datetime.now())[:19]+'\tqq\tstarted\n' ) main()
def run(html, blist, productid, productname):
    """Store one product in ``qqlist`` and, when new, also in ``sohulist``.

    Fields are taken from ``blist`` by position and from the ``<li>`` items
    of ``html``.  The product is skipped when its minimum purchase amount is
    ``'0'`` or empty, or when the bank name is empty after stripping the
    company suffix.  Rows are written through the module-level ``db``
    object; a successful ``sohulist`` insert is recorded in the 'qq' log.

    Args:
        html: product detail markup; the 5th, 7th, 9th and 13th ``<li>``
            items are read.
        blist: sequence of field strings; indexes 0-4, 7, 8 and 10 are read.
        productid: product identifier used as the ``ID`` column.
        productname: product name.

    Returns:
        0 when ``productid`` is already in ``qqlist``; otherwise None.
    """
    # Long official bank names -> short names used in the database.
    bankdic = {
        u'上海浦东发展银行': u'浦发银行',
        u'中国光大银行': u'光大银行',
        u'中国农业银行': u'农业银行',
        u'中国工商银行': u'工商银行',
        u'中国建设银行': u'建设银行',
        u'中国民生银行': u'民生银行',
        u'中国邮政储蓄银行': u'邮政储蓄银行',
        u'中国银行': u'中国银行',
        u'交通银行': u'交通银行',
        u'兴业银行': u'兴业银行',
        u'北京银行': u'北京银行',
        u'华夏银行': u'华夏银行',
        u'平安银行': u'平安银行',
        u'广发银行': u'广发银行',
        u'招商银行': u'招商银行',
        u'上海银行': u'上海银行',
        u'中信银行': u'中信银行',
    }

    bankname = blist[0].replace(u'股份有限公司', '')
    sell_startDate = blist[1]
    sell_endDate = blist[2][:10].replace(' ', '')
    product_endDate = blist[3]
    currency = blist[4]
    start_money = blist[7]
    print('start_money=  %s' % start_money)
    if start_money in ('0', ''):
        print('money error!!!')
        return
    return_rate = blist[8]
    return_type = blist[10]

    hlist = re.findall("<li>(.*?)</li>", html, re.S)
    product_startDate = hlist[4].replace(u'收益起计:', '')
    area = re.findall('<a title="(.*?)" class=', hlist[6])[0]
    product_type = hlist[8].replace(u'对象:', '')

    # Normalise the duration text to a number of days (as a string):
    # day suffixes are dropped, months count 30 days, years 360.
    duration = hlist[12].replace(u'付息周期:', '')
    duration = duration.replace(u'天', '').replace(u'日', '')
    if u"月" in duration:
        duration = str(int(re.findall(r'\d+', duration)[0]) * 30)
    elif u"年" in duration:
        duration = str(int(re.findall(r'\d+', duration)[0]) * 360)
    if len(duration) == 0:
        duration = str(0)

    if not bankname:
        # Nothing is stored for products without a bank name.
        return
    bankname = bankdic.get(bankname, bankname)

    # De-duplication key; must match the key built from sohulist rows below.
    key = (bankname + currency + sell_startDate + sell_endDate
           + str(int(start_money)) + str(int(duration)))
    print('key = %s' % key)

    qq_ids = set(row.ID for row in db.query(u'select ID from qqlist;'))
    if productid in qq_ids:
        print('qqlist已经存在')
        return 0

    # Column values shared by both INSERT statements, in column order.
    row_values = [
        productid, productname, bankname, currency, duration,
        product_startDate, sell_startDate, sell_endDate, duration,
        return_rate, return_type, start_money, area, product_endDate,
        product_type,
    ]
    columns = (u"ID,ProductName,BankName,Currency,Duration,"
               u"Product_StartDate,Sell_StartDate,Sell_EndDate,PayDuration,"
               u"Return_Rate,Type,Start_Money,Area,Product_EndDate,"
               u"Product_Type")

    # Values are bound as parameters so quotes in scraped text can neither
    # break nor inject into the statement.
    # NOTE(review): assumes db.execute(query, *parameters) forwards
    # parameters to a DB-API driver with '%s' placeholders (torndb-style)
    # -- confirm against the db wrapper used by this script.
    qq_sql = (u"insert into qqlist(" + columns + u") values ("
              + u",".join([u"%s"] * len(row_values)) + u");")
    try:
        db.execute(qq_sql, *row_values)
        print('qqlist不存在,插入qqlist')
    except Exception as exc:
        print("error sql@@@@@@@: %r" % (exc,))

    # One pass over sohulist yields both the de-duplication keys and the IDs.
    licailist = set()
    sohu_ids = set()
    existing_sql = (u'select ID,BankName,Currency,Duration,Sell_StartDate,'
                    u'Sell_EndDate,Start_Money from sohulist ;')
    for row in db.query(existing_sql):
        sohu_ids.add(row.ID)
        licailist.add(
            row.BankName + row.Currency
            + row.Sell_StartDate.replace(' ', '')
            + row.Sell_EndDate.replace(' ', '')
            + str(int(row.Start_Money)) + str(int(row.Duration)))

    if key in licailist:
        print('sohu里已经有了')
        return
    if productid in sohu_ids:
        print('sohulist idlist里已经有了')
        return

    sohu_values = row_values + [str(datetime.now())[:19], 'qq']
    sohu_sql = (u"insert into sohulist(" + columns
                + u",create_time,source) values("
                + u",".join([u"%s"] * len(sohu_values)) + u");")
    try:
        print('sohu里没有')
        db.execute(sohu_sql, *sohu_values)
        testlog = log.getLogging('qq')
        testlog.critical(
            str(datetime.now())[:19] + '\tqq\t' + productname + '\n')
    except Exception as exc:
        print("error sql: %r" % (exc,))
def run(url, licailist):
    """Scrape one db.money.sohu.com product page and insert it into ``sohulist``.

    The product id is taken from ``url``.  If that id is already stored
    (or is the hard-coded id below) nothing is downloaded.  Otherwise the
    page is fetched, the second cell of every two-cell table row is
    collected positionally, an end date and a de-duplication key are
    derived, and the row is inserted through the module-level ``db``
    object unless the key is in ``licailist``.  Successful inserts are
    recorded in the 'sohu' log.

    Args:
        url: site-relative product URL containing ``view/<n>/<id>.html``.
        licailist: container of de-duplication keys already stored.

    Returns:
        None.  Products whose minimum amount has more than 7 characters are
        skipped silently; database errors are printed, not raised.
    """
    print("len of list is  %d" % len(licailist))

    product_id = re.findall(r"view/\d+/(\d+).html", url)[0]
    print(product_id)

    # '00051509' is always treated as already stored (reason not visible
    # in this file).
    known_ids = set(['00051509'])
    for row in db.query(u'select ID from sohulist;'):
        known_ids.add(row.ID)
    if product_id in known_ids:
        # The id comes from the URL alone, so skip the download entirely.
        print("already in")
        return

    response = urllib.urlopen("http://db.money.sohu.com" + url)
    try:
        raw = response.read()
    finally:
        response.close()
    # Unicode literals keep these replacements independent of the
    # interpreter's default encoding.
    html = (raw.replace('%', '').decode('cp936')
            .replace(u'平安银行(原)', u'平安银行')
            .replace(u'中国邮政储蓄银行', u'邮政储蓄银行'))

    soup = BeautifulSoup(html)
    strs = (str(soup.table).replace('\t', '').replace('\r\n', '')
            .replace(' ', '').replace('</p>', '').replace('<p>', ''))
    hlist = re.findall(r"<td.*?>(.*?)</td>\s+<td.*?>(.*?)</td>", strs, re.S)

    # p[0] is the product id, p[i + 1] the value cell of table row i.
    p = [product_id]
    for h in hlist:
        p.append(h[1])

    t = hlist[4][1]
    start = datetime(int(t[:4]), int(t[5:7]), int(t[8:10]))
    term_days = int(re.findall(r'\d+', hlist[3][1])[0])
    # These banks count the end day as start + duration; every other bank
    # is stored as start + duration - 1.
    if p[2] in ['上海银行', '农业银行', '中国银行', '招商银行', '浦发银行',
                '广发银行', '光大银行', '北京银行', '建设银行', '兴业银行',
                '平安银行', '交通银行', '华夏银行', '中信银行', '江苏银行',
                '包商银行', '杭州银行', '湖北银行', '富滇银行', '恒丰银行',
                '青岛银行', '兰州银行']:
        endday = str(start + timedelta(term_days))[:10]
    else:
        endday = str(start + timedelta(term_days - 1))[:10]
    print("%s %s %s" % (start, str(term_days), endday))
    p.append(endday)

    # Convert the duration text to days: months count 30 days, years 365.
    # Text without a recognised unit is kept unchanged.
    # NOTE(review): the three tests are independent, as in the original;
    # text naming more than one unit raises TypeError here.
    temp = p[4]
    if '天' in temp:
        temp = int(filter(str.isdigit, temp))
    if '月' in p[4]:
        temp = int(filter(str.isdigit, temp)) * 30
    if '年' in p[4]:
        temp = int(filter(str.isdigit, temp)) * 365
    duration = str(temp)

    # Minimum amount: '万元' (ten thousand yuan) becomes four zeros;
    # otherwise the first run of digits is used.  Values longer than seven
    # characters are skipped.
    if '万元' in p[11]:
        moneynu = p[11].replace('万元', '0000')
    else:
        moneynu = re.findall(r'\d+', p[11])[0]
    if len(moneynu) > len('9999999'):
        return

    key = (p[2] + p[3] + p[6][:10] + p[7][:10]
           + str(int(moneynu)) + str(int(duration)))
    print("key %s" % key)
    if key in licailist:
        print("already in")
        return

    # Positional column values: p[4] and p[8] are replaced by the
    # normalised duration, p[11] by the normalised amount.
    values = (p[:4] + [duration] + p[5:8]
              + [duration, p[9], p[10], moneynu] + p[12:])
    values = [v.replace('\n', '') for v in values]
    values.extend([str(datetime.now())[:19], 'sohu'])

    # Values are bound as parameters so scraped text needs no manual
    # quoting or '%' escaping.
    # NOTE(review): assumes db.execute(query, *parameters) forwards
    # parameters to a DB-API driver with '%s' placeholders (torndb-style)
    # -- confirm against the db wrapper used by this script.
    sql = (u"insert into sohulist values("
           + u",".join([u"%s"] * len(values)) + u")")
    print('%s %r' % (sql, values))
    try:
        testlog = log.getLogging('sohu')
        db.execute(sql, *values)
        testlog.critical(str(datetime.now())[:19] + '\tsohu\t' + key + '\n')
    except Exception as exc:
        # Best effort: report the failure and let the crawl continue.
        print("error: %r" % (exc,))
return tmpstr = tmpstr+""+moneynu+"," #工商银行人民币2012-12-122013-06-252012-12-062012-12-1150000 #工商银行人民币2012-12-072013-04-232012-12-042012-12-061000000 #key = p[2]+p[3]+p[5]+endday+p[6]+p[7]+moneynu key = p[2]+p[3]+p[6][:10]+p[7][:10]+str(int(moneynu))+str(int(duration)) print "key",key if key in licailist: insert = 0 print "already in" for m in p[12:]: tmpstr = tmpstr+"'"+m+"'," print type(tmpstr) sql = u"insert into sohulist values("+tmpstr[:-1].replace('\n','').replace('%','%%')+",'"+str(datetime.now())[:19]+"','sohu')" if insert ==1: print sql.encode('utf-8') try: testlog = log.getLogging('sohu') db.execute(sql) testlog.critical(str(datetime.now())[:19]+'\tsohu\t'+key+'\n' ) except: print "error" if __name__ == "__main__": testlog = log.getLogging('sohu') testlog.critical(str(datetime.now())[:19]+'\tsohu\tstarted\n' ) print str(datetime.now())[:19]+'\tsohu\tstarted\n' main()