def insert_to_mongo():
    """Load the company classification CSV into MongoDB, one document per row.

    Rows are read from ``c:/企业分类数据.csv`` with ``csv.DictReader``.  For
    each row the ``股票代码`` value is split on ``"."``: the first part
    replaces ``股票代码``, the second part is stored as ``交易所编号``, and
    ``uptime`` is set to the current timestamp.  The row is then written with
    ``mongoutil.updatev3`` using the original, unsplit ``股票代码`` as the key.

    Raises:
        IndexError: if a ``股票代码`` value contains no ``"."``.
        KeyError: if the CSV has no ``股票代码`` column.
    """
    mongodb = mongoutil.getmondbv2(
        "localhost", 27017, "bigdata_higgs", "qyxx_shangshihangyefenxi")
    # ``with`` closes the CSV handle even when a row fails half-way through.
    with open(u'c:/企业分类数据.csv', 'r') as csv_file:
        for row in csv.DictReader(csv_file):
            # Drop the unnamed column when the CSV has one; its absence is
            # not an error.
            row.pop("", None)
            stock_id = row['股票代码']
            parts = stock_id.split(".")
            row['股票代码'] = parts[0]
            row["交易所编号"] = parts[1]
            row["uptime"] = time.time()
            mongoutil.updatev3(mongodb, stock_id, row)
def Get_message(self,url):
    """Download one page and upsert its title, city, time and place.

    The page is requested with ``webutil.request`` (gbk encoding, 10 s
    timeout).  A failed request is retried after a one-second pause; the
    request is attempted at most 11 times in total before giving up.  The
    first three ``<th width="90">…<td>…</td>`` rows of the page supply, in
    order, the city (text of the first link in the cell), the date and the
    address.  The record is written with ``mongoutil.updatev3`` under the key
    ``url + <today as YYYY-MM-DD>``.

    Args:
        url: address of the page to fetch; also stored as ``页面链接``.

    Raises:
        Exception: when every request attempt fails, or when the page does
            not contain the three expected rows / the city link.
    """
    db_yjs = mongoutil.getmondbv2(
        db.mongo_host, db.mongo_port, db.yjs_db_name, db.yjs_table_name,
        username=db.mongo_user, password=db.mongo_pwd)
    now = timeutil.format("%Y-%m-%d", time.time())

    proxy = None
    count = 10
    while True:
        try:
            myPage = webutil.request(url, timeout=10, proxy=proxy, encoding="gbk")
            break
        except Exception as e3:
            print(e3)
            if count <= 0:
                raise Exception(u"连续10次失败,放弃")
            count -= 1
            time.sleep(1)

    tree = etree.HTML(myPage)
    title = xpathutil.get_all_text(
        tree, ".//*[@id='mainNav']/div[2]/table/caption/h1", num=0, split=u" ")
    print(title)

    myPage = myPage.encode('utf-8')
    rows = re.findall('<th width="90">(.*?)</th>(.*?)<td>(.*?)</td>', myPage, re.S)
    # Fail with a clear message instead of an unbound-local error further
    # down when the page layout is not the expected one.
    if len(rows) < 3:
        raise Exception(
            "page has fewer than three detail rows (city/date/address): %s" % url)
    city_links = re.findall('">(.*?)</a>', rows[0][-1])
    if not city_links:
        raise Exception("no city link found in the first detail row: %s" % url)
    city = city_links[0]
    date = rows[1][-1]
    address = rows[2][-1]

    print(len(address))
    print(city)
    print(date)
    print(address)

    key = url + now
    mongoutil.updatev3(db_yjs, key, {
        "标题": title,
        "城市": city,
        "招聘会时间": date,
        '招聘会地点': address,
        "页面链接": url,
        "dotime": now,
        "uptime": time.time(),
        "source": "yingjiesheng",
        "type": "2",
    })
def __init__(self, queue_name):
    """Prepare logging and connect to MongoDB and the SSDB queue.

    A directory named after ``queue_name`` is created and a logger is opened
    at ``<queue_name>/ssdb_save``.  The Mongo database name is
    ``'bigdata_higgs_' + queue_name``.  Each connection is attempted in a
    loop: on any exception the error is logged, the call sleeps 60 seconds
    and tries again, so this constructor only returns once both connections
    have been obtained.

    Args:
        queue_name: name of the SSDB queue; also used for the log directory
            and as the suffix of the Mongo database name.
    """
    fileutil.mkdirs(queue_name)
    self.logging = get_logger(queue_name + '/' + 'ssdb_save')
    self.queue_name = queue_name
    self.db_name = 'bigdata_higgs_' + queue_name
    self.logging.info(self.db_name)

    mongo_ready = False
    while not mongo_ready:
        try:
            self.logging.info(u'连接mongo')
            self.mongo = mongoutil.getmondbv2(
                config.mongo_host,
                config.mongo_port,
                self.db_name,
                config.table_name,
                username=config.mongo_username,
                password=config.mongo_passwd)
        except Exception as err:
            self.logging.error(u'连接mongo异常 %s' % exceputil.traceinfo(err))
            time.sleep(60)
        else:
            mongo_ready = True

    ssdb_ready = False
    while not ssdb_ready:
        try:
            self.logging.info(u'连接ssdb')
            self.ssdb = ssdbutil2.getSSDBQueuev2(
                self.queue_name,
                host=config.ssdb_host,
                port=config.ssdb_port)
        except Exception as err:
            self.logging.error(u'连接ssdb异常 %s' % exceputil.traceinfo(err))
            time.sleep(60)
        else:
            ssdb_ready = True
def Clean(self,key):
    """Re-parse the stored ``文本2`` text of every ``type == '1'`` document.

    For each document the text is passed through
    ``functions.remove_all_space_char`` and then:

    * the first e-mail address and the first phone number found are stored
      under ``邮箱`` / ``电话`` (the string ``"None"`` when absent);
    * for every group in ``self.key_word`` the first member found in the
      text marks a field start; the text between two consecutive starts is
      split on ``:`` into a field name and value (text after the last start
      is not used);
    * ``文本3`` is set to ``'None'``.

    The result is written back with ``mongoutil.updatev3`` under the
    document's ``_id``.

    On any exception the error is printed and logged and the method calls
    itself again with the same ``key``, restarting from the first document.

    Args:
        key: not used by the processing itself; only handed on to the retry
            call.
    """
    # Compiled once per call instead of once per document.
    email_re = re.compile(r'[\w\.-]+@(?:[A-Za-z0-9]+\.)+[A-Za-z]+')
    # The number is searched for inside running text, so digit-boundary
    # look-arounds are used; ``^``/``$`` anchors would only ever match a
    # document consisting of nothing but a phone number.
    phone_re = re.compile(
        r'(?<!\d)(?:0\d{2,3}\d{7,8}|1[358]\d{9}|147\d{8})(?!\d)')
    # Bound before the ``try`` so the error handler can always report it.
    number = 0
    try:
        db_yjs = mongoutil.getmondbv2(
            db.mongo_host, db.mongo_port, db.yjs_db_name, db.yjs_table_name,
            username=db.mongo_user, password=db.mongo_pwd, timeout=30)
        for doc in db_yjs.find({'type': '1'}):
            if not doc:
                continue
            doc_id = doc.get("_id")
            text = functions.remove_all_space_char(doc.get(u"文本2"))
            emails = email_re.findall(text)
            phones = phone_re.findall(text)

            # Position of the first matching member of every keyword group.
            positions = dict()
            for group in self.key_word:
                found = False
                for word in group:
                    index = text.find(word)
                    if index >= 0:
                        if found:
                            print("error")
                        else:
                            positions[word] = index
                            found = True
            offsets = sorted(positions.values())

            save_data = dict()
            for start, end in zip(offsets, offsets[1:]):
                # NOTE(review): as written these three calls replace ':'
                # with itself; presumably one side was meant to be a
                # different colon character - confirm against the
                # repository copy.
                segment = text[start:end].replace(':',':',1).replace(":",":",1).replace(":",":",1)
                parts = segment.split(':')
                if len(parts) == 1:
                    save_data[parts[0]] = 'None'
                elif len(parts) == 2:
                    save_data[parts[0]] = parts[1]
                else:
                    # Only the first two pieces after the name are kept.
                    save_data[parts[0]] = parts[1] + parts[2]

            save_data["邮箱"] = emails[0] if emails else "None"
            save_data['电话'] = phones[0] if phones else "None"
            save_data['文本3'] = 'None'

            # Debug dump; single-argument prints behave the same as a
            # statement and as a function call.
            for field in save_data:
                print(field)
                print(save_data[field])

            number = number + 1
            mongoutil.updatev3(db_yjs, doc_id, save_data)
            print("更新成功!%s" % number)
        logging.error("完毕!")
        logging.error("%s" % number)
    except Exception as e2:
        print(e2)
        logging.error("错误:%s" % e2)
        logging.error("数量%s" % number)
        # Retry from scratch with the caller's key (the loops above no
        # longer overwrite it).
        self.Clean(key)
def Get_message(self,url,date):
    """Download one posting page, extract its fields and upsert the record.

    The page is requested with ``webutil.request`` (gbk encoding, 10 s
    timeout).  After a failure a proxy is chosen through
    ``proxyutils.choice_proxy`` for the next attempt; the request is
    attempted at most 11 times in total.

    Stored fields:

    * labelled fields cut out of the body text: for every group in
      ``self.key_word`` the first member found marks a field start and the
      text between two consecutive starts is split on ``:``;
    * ``邮箱`` / ``电话``: first e-mail address / phone number found in the
      body text, ``"无"`` when absent;
    * ``公司名称``: content of ``<title>``, or the heading link text when
      the page has no title;
    * ``文本1``: summary lines taken from the info list, ``文本2``: the
      body text, ``页面源码``: the page source, ``发布时间``: ``date``;
    * ``页面链接``, ``dotime``, ``uptime``, ``source``, ``type``.

    The record is written with ``mongoutil.updatev3`` under the key
    ``url + <today as YYYY-MM-DD>``.

    Args:
        url: address of the page to fetch.
        date: value stored unchanged as ``发布时间``.

    Raises:
        Exception: when every request attempt fails.
    """
    db_yjs = mongoutil.getmondbv2(
        db.mongo_host, db.mongo_port, db.yjs_db_name, db.yjs_table_name,
        username=db.mongo_user, password=db.mongo_pwd, timeout=30)
    now = timeutil.format("%Y-%m-%d", time.time())

    proxy = None
    count = 10
    while True:
        try:
            myPage = webutil.request(url, timeout=10, proxy=proxy, encoding="gbk")
            break
        except Exception as e3:
            print(e3)
            # The first attempt is direct; later attempts go through a proxy.
            proxy = proxyutils.choice_proxy(is_debug=False, host="master1", port=8880)
            if count <= 0:
                raise Exception(u"连续10次失败,放弃")
            count -= 1
            time.sleep(1)

    tree = etree.HTML(myPage)
    jiben = xpathutil.get_all_text(
        tree, ".//*[@id='container']/div[3]/div[2]/div/ul", num=0, split=u" ")
    text = xpathutil.get_all_text(
        tree, ".//*[@id='wordDiv']/div/div", num=0, split=u" ")
    print(len(text))
    if len(text) <= 10:
        # Too little text in the primary node: fall back to the wider one.
        text = xpathutil.get_all_text(
            tree, ".//*[@id='container']/div[3]", num=0, split=u" ")

    # NOTE(review): the cleaning helper is applied twice, as before;
    # presumably a single pass is enough - confirm.
    cleaned = functions.remove_all_space_char(text)
    cleaned = functions.remove_all_space_char(cleaned)

    emails = re.findall(r'[\w\.-]+@(?:[A-Za-z0-9]+\.)+[A-Za-z]+', cleaned)
    # Digit-boundary look-arounds instead of ``^``/``$`` so a number can be
    # found inside running text, and no capture groups so ``findall``
    # yields the whole number rather than a tuple of prefixes.
    phones = re.findall(
        r'(?<!\d)(?:\+86)?(?:\d{11}|0\d{2,3}\d{7,8})(?!\d)', cleaned)

    # Position of the first matching member of every keyword group.
    positions = dict()
    for group in self.key_word:
        found = False
        for word in group:
            index = cleaned.find(word)
            if index >= 0:
                if found:
                    print("error")
                else:
                    positions[word] = index
                    found = True
    offsets = sorted(positions.values())

    save_data = dict()
    for start, end in zip(offsets, offsets[1:]):
        # NOTE(review): as written these three calls replace ':' with
        # itself; presumably one side was meant to be a different colon
        # character - confirm against the repository copy.
        segment = cleaned[start:end].replace(':',':',1).replace(":",":",1).replace(":",":",1)
        parts = segment.split(':')
        if len(parts) == 1:
            save_data[parts[0]] = 'None'
        elif len(parts) == 2:
            save_data[parts[0]] = parts[1]
        else:
            # Only the first two pieces after the name are kept.
            save_data[parts[0]] = parts[1] + parts[2]

    save_data["邮箱"] = emails[0] if emails else "无"
    save_data['电话'] = phones[0] if phones else "无"

    myPage = myPage.encode('utf-8')
    title = re.findall('<title>(.*?)</title>', myPage)
    if not title:
        name = xpathutil.get_all_text(
            tree, ".//*[@id='container']/div[3]/div[1]/h1/a", num=0, split=u" ")
    else:
        name = title[0]
    print(name)

    if not jiben:
        # No info list found by xpath: pull the fields from the raw markup
        # and skip captured pieces that contain '('.
        pp = re.findall('<div class="info clearfix"><ol><li>\xe5\x8f\x91\xe5\xb8\x83\xe6\x97\xb6\xe9\x97\xb4:<u>(.*?)</u></li><li>\xe5\xb7\xa5\xe4\xbd\x9c\xe5\x9c\xb0\xe7\x82\xb9:<u>(.*?) </u></li><li>\xe8\x81\x8c\xe4\xbd\x8d\xe7\xb1\xbb\xe5\x9e\x8b:<u>(.*?)</u></li><li>\xe6\x9d\xa5\xe6\xba\x90:<a href="#" onclick="window.open(.*?)">(.*?)</a></li>', myPage)
        summary = ''.join(
            piece + '\r\n'
            for match in pp
            for piece in match
            if piece.find('(') == -1)
    else:
        # Keep only the value after the last ':' of every info line.
        info_lines = jiben.replace('\t', '').split('\r\n')
        summary = ''.join(
            line.split(':')[-1].replace('\n', '') + '\r\n'
            for line in info_lines)
    print(summary)

    myPage = myPage.decode('utf-8')
    keys = url + now
    save_data["公司名称"] = name
    save_data["发布时间"] = date
    save_data["文本1"] = summary
    save_data['文本2'] = text
    save_data["页面链接"] = url
    save_data["页面源码"] = myPage
    save_data['dotime'] = now
    save_data['uptime'] = time.time()
    save_data['source'] = "yingjiesheng"
    save_data["type"] = "1"
    mongoutil.updatev3(db_yjs, keys, save_data)
    print("数据入库成功!")
def save_data(self, last_failure_file='ssdb_mongo.data', wait_time=300):
    """Replay the documents of a previous failure file into MongoDB.

    When ``last_failure_file`` exists, each of its lines is decoded with
    ``self.json_to_dict``.  Every decoded dict that carries an ``_id`` is
    upserted into ``self.mongo.table``; a failing upsert is logged and does
    not stop the replay.  Lines that did not decode to a dict, or that have
    no ``_id``, are skipped.  After the pass the file is removed.

    If the pass itself raises, the error is logged, the call sleeps five
    seconds, reconnects ``self.mongo`` and starts the pass again.

    Args:
        last_failure_file: path of the file holding one JSON document per
            line.
        wait_time: not used by the code in this block.
    """
    if os.path.exists(last_failure_file):
        failed_list = []
        with open(last_failure_file, 'rb') as f:
            for line in f:
                failed_list.append(self.json_to_dict(line.strip()))
        while True:
            try:
                self.logging.info('Last Failed File :%d' % len(failed_list))
                if failed_list:
                    for data in failed_list:
                        # Check the type before touching dict methods: a
                        # line holding a JSON list or scalar would
                        # otherwise raise here and be retried forever.
                        if not isinstance(data, dict):
                            continue
                        _id = data.get('_id')
                        if _id is None:
                            continue
                        try:
                            self.mongo.table.update({'_id': _id}, data, True)
                            self.logging.info(u'成功update一条数据:%s' % _id)
                        except Exception:
                            self.logging.info(u'fail-update一条数据:%s' % _id)
                    os.remove(last_failure_file)
                # NOTE(review): the loop ends after one clean pass even
                # when nothing was replayable; confirm this matches the
                # intended nesting of the original ``break``.
                break
            except pymongo.errors.OperationFailure as e:
                self.logging.info(e)
                _id = re.findall(
                    r'.*?dup key:.*?\{.*?:.*?\"(.*?)\".*?\}', str(e))
                # Look at the match list before indexing it.
                if not _id:
                    break
                self.logging.info('_id:%s' % _id[0])
                update_data, other_list = self.get_index_and_other_list(
                    _id[0], failed_list)
                if update_data is not None:
                    self.mongo.table.update(
                        {'_id': update_data['_id']}, update_data, True)
                    self.logging.info(u'update data:%s 成功' % _id[0])
                if other_list is not None and len(other_list) > 0:
                    # Go on with the documents that are still pending.
                    failed_list = other_list
                    continue
                break
            except Exception as e:
                self.logging.error(u'存mongo数据异常 %s' % exceputil.traceinfo(e))
                time.sleep(5)
                self.mongo = mongoutil.getmondbv2(
                    config.mongo_host,
                    config.mongo_port,
                    self.db_name,
                    config.table_name,
                    username=config.mongo_username,
                    password=config.mongo_passwd)
self.logging.info(u'成功update一条数据') if other_list != None and len(other_list) > 0: data_list = other_list continue else: break else: break except Exception as e: self.logging.error(u'存mongo数据异常 %s' % exceputil.traceinfo(e)) time.sleep(5) self.mongo = mongoutil.getmondbv2( config.mongo_host, config.mongo_port, self.db_name, config.table_name, username=config.mongo_username, password=config.mongo_passwd) def json_to_dict(self, data): try: if data != None: data_dict = json.loads(data) if data_dict != None: return data_dict else: return None except Exception as e: self.logging.error(u'转换dict异常 %s' % exceputil.traceinfo(e))
def __init__(self, name,queue_name,process_number=1,redis_host=None,redis_port=None):
    """Initialise the ``Customer`` base and open the Mongo connection.

    All arguments are handed to ``Customer.__init__`` unchanged.  The Mongo
    handle built from ``config.mongo_host`` / ``mongo_port`` / ``mongo_db`` /
    ``mongo_table`` is stored on ``self.mongo``.
    """
    base_options = dict(
        process_number=process_number,
        redis_host=redis_host,
        redis_port=redis_port,
    )
    Customer.__init__(self, name, queue_name, **base_options)
    self.mongo = mongoutil.getmondbv2(
        config.mongo_host,
        config.mongo_port,
        config.mongo_db,
        config.mongo_table,
    )