Example #1
0
def insert_to_mongo():
    """Load the company classification CSV into MongoDB, one upsert per row.

    Every row is stored under its full stock code as found in the
    ``股票代码`` column.  Before storing, that column is cut down to the
    text before the first ``.``, the second dot-separated segment is written
    to ``交易所编号`` and the current time is written to ``uptime``.

    Raises:
        KeyError: if a row has no ``股票代码`` column.
        IndexError: if a stock code contains no ``.`` separator.
    """
    mongodb = mongoutil.getmondbv2("localhost", 27017, "bigdata_higgs",
                                   "qyxx_shangshihangyefenxi")

    # Use a context manager so the CSV handle is closed even if a row fails.
    with open(u'c:/企业分类数据.csv', 'r') as csv_file:
        for row in csv.DictReader(csv_file):
            # Drop the unnamed column when present; a file without one is
            # not an error.
            row.pop("", None)

            stock_id = row['股票代码']
            parts = stock_id.split(".")

            row['股票代码'] = parts[0]
            row["交易所编号"] = parts[1]
            row["uptime"] = time.time()
            mongoutil.updatev3(mongodb, stock_id, row)
 def Get_message(self, url):
     """Scrape one job-fair page and upsert its details into MongoDB.

     The page is requested as GBK, with at most 10 attempts one second
     apart.  The title is read from the ``mainNav`` table caption; city,
     date and address are taken, in that order, from the last group of the
     first three ``<th width="90">..</th>..<td>..</td>`` matches in the page
     source.  The record is stored under the key ``url + today's date``.

     Args:
         url: Address of the job-fair page.

     Raises:
         Exception: if all 10 download attempts fail.
         ValueError: if fewer than three detail rows are found, or the city
             cell contains no link text.
     """
     db_yjs = mongoutil.getmondbv2(db.mongo_host, db.mongo_port,
                                   db.yjs_db_name, db.yjs_table_name,
                                   username=db.mongo_user,
                                   password=db.mongo_pwd)
     now = timeutil.format("%Y-%m-%d", time.time())
     proxy = None

     # Exactly max_attempts tries, matching the "10 failures" in the message.
     max_attempts = 10
     for attempt in range(1, max_attempts + 1):
         try:
             myPage = webutil.request(url, timeout=10, proxy=proxy,
                                      encoding="gbk")
             break
         except Exception as e3:
             print(e3)
             if attempt == max_attempts:
                 raise Exception(u"连续10次失败,放弃")
             time.sleep(1)

     tree = etree.HTML(myPage)
     title = xpathutil.get_all_text(
         tree, ".//*[@id='mainNav']/div[2]/table/caption/h1",
         num=0, split=u" ")
     print(title)

     myPage = myPage.encode('utf-8')
     rows = re.findall('<th width="90">(.*?)</th>(.*?)<td>(.*?)</td>',
                       myPage, re.S)
     # Only the <td> content (last group) of each row is used.
     cells = [row[-1] for row in rows]

     # Fail with a clear message instead of an unbound-variable NameError.
     if len(cells) < 3:
         raise ValueError(
             "job-fair page %s has %d detail rows; "
             "city, date and address are required" % (url, len(cells)))
     city_links = re.findall('">(.*?)</a>', cells[0])
     if not city_links:
         raise ValueError(
             "job-fair page %s has no link text in its city row" % url)

     city = city_links[0]
     date = cells[1]
     address = cells[2]

     print(len(address))
     print(city)
     print(date)
     print(address)

     key = url + now
     mongoutil.updatev3(db_yjs, key, {
         "标题": title,
         "城市": city,
         "招聘会时间": date,
         '招聘会地点': address,
         "页面链接": url,
         "dotime": now,
         "uptime": time.time(),
         "source": "yingjiesheng",
         "type": "2",
     })
Example #3
0
    def __init__(self, queue_name):
        """Set up the logger and open the MongoDB and SSDB connections.

        ``fileutil.mkdirs(queue_name)`` is called first and the logger is
        obtained for ``<queue_name>/ssdb_save``.  The Mongo database name is
        ``bigdata_higgs_<queue_name>``.  Each connection is attempted in an
        endless loop with a 60 second pause after every failure, so this
        constructor blocks until both backends have been reached.

        Args:
            queue_name: Name of the SSDB queue; also used for the log path
                and as the suffix of the Mongo database name.
        """
        def _connect_until_ok(start_msg, failure_fmt, opener):
            # Keep trying forever; log the traceback and wait a minute
            # after each failed attempt.
            while True:
                try:
                    self.logging.info(start_msg)
                    return opener()
                except Exception as err:
                    self.logging.error(failure_fmt % exceputil.traceinfo(err))
                    time.sleep(60)

        fileutil.mkdirs(queue_name)
        self.logging = get_logger(queue_name + '/' + 'ssdb_save')
        self.queue_name = queue_name
        self.db_name = 'bigdata_higgs_' + queue_name
        self.logging.info(self.db_name)

        self.mongo = _connect_until_ok(
            u'连接mongo',
            u'连接mongo异常 %s',
            lambda: mongoutil.getmondbv2(
                config.mongo_host,
                config.mongo_port,
                self.db_name,
                config.table_name,
                username=config.mongo_username,
                password=config.mongo_passwd))

        self.ssdb = _connect_until_ok(
            u'连接ssdb',
            u'连接ssdb异常 %s',
            lambda: ssdbutil2.getSSDBQueuev2(
                self.queue_name,
                host=config.ssdb_host,
                port=config.ssdb_port))
    def Clean(self,key):
        try:
            db_yjs=mongoutil.getmondbv2(db.mongo_host,db.mongo_port,db.yjs_db_name,db.yjs_table_name,username=db.mongo_user,password=db.mongo_pwd,timeout=30)
            hh=db_yjs.find({'type':'1'})
            number=0
            for i in hh:
                if i:
                    p=i.get(u"文本2")
		    
		   
                    id=i.get("_id")
                    p=functions.remove_all_space_char(p)
                    emeail=re.compile('[\w\.-]+@(?:[A-Za-z0-9]+\.)+[A-Za-z]+')
                    #p=p.decode('utf-8','ignore') '
                    #print p 
                    e=emeail.findall(p)
                    a=[]
		    phone_number=re.compile(r'^0\d{2,3}\d{7,8}$|^1[358]\d{9}$|^147\d{8}$')
		    pn=phone_number.findall(p)
                    posdict=dict()
                    #print p[53:102]
                    for  key in self.key_word:
                        found=False
                        for j in key:
                            index=p.find(j)
                            if index>=0:
                                if found:
                                    print "error"
                                else:
                                    posdict[j]=index
                                    found=True
                    for key in  posdict:
                        a.append(posdict[key])
		
                    a.sort()
                    
		    save_data=dict()

                    for i in range(0,len(a)):
                        if i+1<len(a):
                            text3=''
                            text3=p[int(a[i]):int(a[i+1])].replace(':',':',1).replace(":",":",1).replace(":",":",1)
			   	
			    text3=text3.split(':')
			   
			    if len(text3)>1:
				if len(text3)==2:
			    		save_data[text3[0]]=text3[1]

				else:
					save_data[text3[0]]=text3[1]+text3[2]
			    elif len(text3)==1:
				save_data[text3[0]]='None'
			    else:
				pass


                        else:
                            pass
                
                    if e:
                        save_data["邮箱"]=e[0]
                    else:
                        save_data["邮箱"]="None"
		    if pn:
			save_data['电话']=pn[0]
		    else:
			save_data["电话"]="None"
		    save_data['文本3']='None'
                    for i in save_data:
			print i,save_data[i]
                    number=number+1
                    mongoutil.updatev3(db_yjs,id,save_data)
		    print "更新成功!%s"%number
		    
		   
                else:
                    pass
	    logging.error("完毕!")
	    logging.error("%s"%number)
               
        except Exception as e2:
            print e2
	    logging.error("错误:%s" %e2)
	    logging.error("数量%s" %number)
            self.Clean(key)
    def Get_message(self, url, date):
        """Scrape one job posting page and upsert it into MongoDB.

        The page is requested as GBK with at most 10 attempts one second
        apart; after each failure a proxy is picked through
        ``proxyutils.choice_proxy`` for the next attempt.  The posting text
        is read from ``wordDiv`` and, when that yields 10 characters or
        fewer, from the third ``container`` div instead.  The text is passed
        through ``functions.remove_all_space_char`` and cut into
        ``label: value`` fields at the positions of the labels in
        ``self.key_word``; the text after the last label is not stored.
        E-mail, phone, company name, a summary of the basic-info list and
        the page source are added, and the record is stored under the key
        ``url + today's date``.

        Args:
            url: Address of the posting page.
            date: Value stored as ``发布时间``.

        Raises:
            Exception: if all 10 download attempts fail.
        """
        db_yjs = mongoutil.getmondbv2(db.mongo_host, db.mongo_port,
                                      db.yjs_db_name, db.yjs_table_name,
                                      username=db.mongo_user,
                                      password=db.mongo_pwd, timeout=30)
        now = timeutil.format("%Y-%m-%d", time.time())
        proxy = None

        # Exactly max_attempts tries, matching the "10 failures" in the message.
        max_attempts = 10
        for attempt in range(1, max_attempts + 1):
            try:
                myPage = webutil.request(url, timeout=10, proxy=proxy,
                                         encoding="gbk")
                break
            except Exception as e3:
                print(e3)
                proxy = proxyutils.choice_proxy(is_debug=False,
                                                host="master1", port=8880)
                if attempt == max_attempts:
                    raise Exception(u"连续10次失败,放弃")
                time.sleep(1)

        tree = etree.HTML(myPage)
        jiben = xpathutil.get_all_text(
            tree, ".//*[@id='container']/div[3]/div[2]/div/ul",
            num=0, split=u" ")
        text = xpathutil.get_all_text(
            tree, ".//*[@id='wordDiv']/div/div", num=0, split=u" ")
        print(len(text))
        if len(text) <= 10:
            text = xpathutil.get_all_text(
                tree, ".//*[@id='container']/div[3]", num=0, split=u" ")

        # Normalise on both paths; previously this was only assigned on the
        # long-text path, so the fallback always failed with NameError.
        p = functions.remove_all_space_char(text)

        emails = re.findall(r'[\w\.-]+@(?:[A-Za-z0-9]+\.)+[A-Za-z]+', p)
        # NOTE(review): the pattern is anchored with ^...$ (it only matches
        # when the whole text is a phone number) and has capture groups, so
        # findall returns tuples of prefixes.  Left unchanged; confirm the
        # intended pattern.
        phones = re.findall(
            r'^(?:\+86)?(\d{3})\d{8}$|^(?:\+86)?(0\d{2,3})\d{7,8}$', p)

        # Position of the first label found for each keyword group.
        posdict = {}
        for synonyms in self.key_word:
            found = False
            for word in synonyms:
                index = p.find(word)
                if index >= 0:
                    if found:
                        # A second label of the same group matched.
                        print("error")
                    else:
                        posdict[word] = index
                        found = True
        positions = sorted(posdict.values())

        save_data = {}
        for start, end in zip(positions, positions[1:]):
            # NOTE(review): as written these replace ':' with ':' (a no-op);
            # presumably a different colon character was meant - confirm.
            text3 = p[start:end].replace(':', ':', 1).replace(":", ":", 1).replace(":", ":", 1)
            text3 = text3.split(':')
            if len(text3) == 1:
                save_data[text3[0]] = 'None'
            elif len(text3) == 2:
                save_data[text3[0]] = text3[1]
            else:
                # Only the first two value segments are kept.
                save_data[text3[0]] = text3[1] + text3[2]

        save_data["邮箱"] = emails[0] if emails else "无"
        save_data['电话'] = phones[0] if phones else "无"

        page_bytes = myPage.encode('utf-8')
        titles = re.findall('<title>(.*?)</title>', page_bytes)
        if titles:
            name = titles[0]
        else:
            name = xpathutil.get_all_text(
                tree, ".//*[@id='container']/div[3]/div[1]/h1/a",
                num=0, split=u" ")
        print(name)

        if not jiben:
            # No basic-info list: pull the same details from the raw markup.
            info_matches = re.findall(
                '<div class="info clearfix"><ol>'
                '<li>\xe5\x8f\x91\xe5\xb8\x83\xe6\x97\xb6\xe9\x97\xb4:<u>(.*?)</u></li>'
                '<li>\xe5\xb7\xa5\xe4\xbd\x9c\xe5\x9c\xb0\xe7\x82\xb9:<u>(.*?) </u></li>'
                '<li>\xe8\x81\x8c\xe4\xbd\x8d\xe7\xb1\xbb\xe5\x9e\x8b:<u>(.*?)</u></li>'
                '<li>\xe6\x9d\xa5\xe6\xba\x90:<a href="#" onclick="window.open(.*?)">(.*?)</a></li>',
                page_bytes)
            # Groups containing '(' (the captured window.open argument) are
            # left out of the summary.
            summary = ''.join(
                group + '\r\n'
                for match in info_matches
                for group in match
                if '(' not in group)
        else:
            # Keep only what follows the last ':' of every line.
            summary = ''.join(
                line.split(':')[-1].replace('\n', '') + '\r\n'
                for line in jiben.replace('\t', '').split('\r\n'))
        print(summary)

        keys = url + now
        # The fixed fields are written last so they override extracted
        # labels that happen to share a name.
        save_data["公司名称"] = name
        save_data["发布时间"] = date
        save_data["文本1"] = summary
        save_data['文本2'] = text
        save_data["页面链接"] = url
        save_data["页面源码"] = page_bytes.decode('utf-8')
        save_data['dotime'] = now
        save_data['uptime'] = time.time()
        save_data['source'] = "yingjiesheng"
        save_data["type"] = "1"
        mongoutil.updatev3(db_yjs, keys, save_data)
        print("数据入库成功!")
Example #6
0
 def save_data(self, last_failure_file='ssdb_mongo.data', wait_time=300):
     """Replay the documents kept in the last-failure file into MongoDB.

     Does nothing when ``last_failure_file`` does not exist.  Otherwise each
     line is decoded with ``self.json_to_dict`` and every decoded dict that
     carries a non-None ``_id`` is upserted by ``_id``; a failed upsert is
     logged and skipped.  Lines that do not decode to such a dict are
     ignored.  The file is removed once a non-empty list has been processed.

     If the pass raises ``pymongo.errors.OperationFailure``, the duplicate
     key is parsed from the error text and the matching document is
     upserted on its own via ``self.get_index_and_other_list``.  Any other
     exception is logged, followed by a 5 second pause, a fresh Mongo
     connection and another pass.

     Args:
         last_failure_file: Path of the file holding one JSON document per
             line.
         wait_time: Accepted for interface compatibility; not used here.
     """
     if not os.path.exists(last_failure_file):
         return

     with open(last_failure_file, 'rb') as f:
         failed_list = [self.json_to_dict(line.strip()) for line in f]

     while True:
         try:
             self.logging.info('Last Failed File :%d' % len(failed_list))
             if failed_list:
                 for data in failed_list:
                     # Check the type before touching dict methods: a JSON
                     # list or scalar used to raise here and make the outer
                     # handler retry the whole file forever.
                     if not isinstance(data, dict):
                         continue
                     _id = data.get('_id')
                     if _id is None:
                         continue
                     try:
                         self.mongo.table.update({'_id': _id}, data, True)
                         self.logging.info(u'成功update一条数据:%s' % _id)
                     except Exception:
                         self.logging.info(u'fail-update一条数据:%s' % _id)
                 os.remove(last_failure_file)
             break
         except pymongo.errors.OperationFailure as e:
             self.logging.info(e)
             _id = re.findall(
                 r'.*?dup key:.*?\{.*?:.*?\"(.*?)\".*?\}', str(e))
             # Check for a match before indexing; the id used to be logged
             # first, which raised IndexError on any other failure text.
             if not _id:
                 break
             self.logging.info('_id:%s' % _id[0])
             update_data, other_list = self.get_index_and_other_list(
                 _id[0], failed_list)
             if update_data is None:
                 break
             self.mongo.table.update(
                 {'_id': update_data['_id']}, update_data, True)
             self.logging.info(u'update data:%s 成功' % _id[0])
             if not other_list:
                 break
             # Go round again with the remaining documents.
             failed_list = other_list
         except Exception as e:
             self.logging.error(u'存mongo数据异常 %s' %
                                exceputil.traceinfo(e))
             time.sleep(5)
             self.mongo = mongoutil.getmondbv2(
                 config.mongo_host,
                 config.mongo_port,
                 self.db_name,
                 config.table_name,
                 username=config.mongo_username,
                 password=config.mongo_passwd)
Example #7
0
                                self.logging.info(u'成功update一条数据')
                            if other_list != None and len(other_list) > 0:
                                data_list = other_list
                                continue
                            else:
                                break
                        else:
                            break
                except Exception as e:
                    self.logging.error(u'存mongo数据异常 %s' %
                                       exceputil.traceinfo(e))
                    time.sleep(5)
                    self.mongo = mongoutil.getmondbv2(
                        config.mongo_host,
                        config.mongo_port,
                        self.db_name,
                        config.table_name,
                        username=config.mongo_username,
                        password=config.mongo_passwd)

    def json_to_dict(self, data):
        """Decode a JSON string, returning ``None`` when that is not possible.

        Args:
            data: JSON text, or ``None``.

        Returns:
            The decoded value (whatever ``json.loads`` yields, so not
            necessarily a dict), or ``None`` when ``data`` is ``None``, when
            it decodes to JSON ``null``, or when decoding fails.  A decoding
            failure is logged through ``self.logging``.
        """
        if data is None:
            return None
        try:
            return json.loads(data)
        except Exception as e:
            # Best-effort: log the failure and report it as None rather
            # than raising.
            self.logging.error(u'转换dict异常 %s' % exceputil.traceinfo(e))
            return None
 def __init__(self, name, queue_name, process_number=1,
              redis_host=None, redis_port=None):
     """Run ``Customer.__init__`` on this instance and open the Mongo handle.

     All arguments are passed on unchanged to ``Customer.__init__``.  The
     Mongo handle is created from ``config.mongo_host``,
     ``config.mongo_port``, ``config.mongo_db`` and ``config.mongo_table``
     and kept on ``self.mongo``.
     """
     redis_options = dict(redis_host=redis_host, redis_port=redis_port)
     Customer.__init__(self, name, queue_name,
                       process_number=process_number, **redis_options)

     self.mongo = mongoutil.getmondbv2(
         config.mongo_host,
         config.mongo_port,
         config.mongo_db,
         config.mongo_table,
     )