Exemple #1
0
    def back_money(self,recChar,code_id,yzm,img_path):
        """
        Request a refund for a wrongly recognized captcha and archive the sample.

        The captcha text and a copy of the captcha image are stored under
        ``<cwd>/<self.pinyin>/<YYYY-MM-DD>/`` as ``<prefix>_<code_id>.txt`` and
        ``<prefix>_<code_id>.png``. The prefix is 1 when the refund call
        succeeded and 0 when it raised.

        :param recChar: captcha-service client exposing ``reportErrorID``; when it
                        is None an error is logged and nothing else happens
        :param code_id: id assigned by the captcha service; the string "0" marks a
                        manually solved captcha, for which no refund is requested
        :param yzm: captcha text (must support ``.encode``)
        :param img_path: path of the captcha image to archive
        :return: (None)
        """
        if code_id=="0":
            self.logging.warning(u"手工打码,无需退钱")
            return

        if recChar is None:
            # Every other call site uses ``.error``; ``.err`` looked like a typo.
            self.logging.error(u"退钱发生异常。recChar==None")
            return
        # Count this recognition failure.
        self.yzm_error+=1
        today=timeutil.format("%Y-%m-%d",time.time())
        yzm_dir=os.path.join(os.path.abspath('.'),self.pinyin,today)
        if not fileutil.isdir(yzm_dir):
            fileutil.mkdirs(yzm_dir)

        # Only the refund call itself decides the prefix, so a failure while
        # archiving the files can no longer be reported as a refund failure.
        refund_trace=None
        try:
            recChar.reportErrorID(code_id)
            refunded=True
        except Exception as ee:
            refunded=False
            # Format the traceback while the exception is still being handled.
            refund_trace=exceputil.traceinfo(ee)

        # Same "<prefix>_<code_id>" naming for both outcomes, built with
        # os.path.join instead of a hard-coded Windows separator.
        prefix=1 if refunded else 0
        img_name=os.path.join(yzm_dir,"%d_%s.png"%(prefix,code_id))
        text_file_name=os.path.join(yzm_dir,"%d_%s.txt"%(prefix,code_id))
        # Store the captcha text next to a copy of the image.
        fileutil.write(text_file_name,yzm.encode("UTF-8","ignore"))
        fileutil.copyfile(img_path,img_name)

        if refunded:
            self.logging.error(u"验证码没识别出来,退钱正常")
        else:
            self.logging.error(u"验证码没识别出来,errorType=5 。退钱发生异常.error:%s" % refund_trace)
Exemple #2
0
    def set_black_keyword(self, company_dic):
        """
        Add a company to the "<self.pinyin>_noncompany" blacklist queue.

        The zset key is the value of the first of 'name', 'zch', 'xydm' found in
        company_dic, or the JSON dump of the whole dict when none is present; it
        is cut to 100 characters. The record (company_dic plus a 'do_time' date
        stamp) is saved and logged only when ``ssdb_put_zset`` returns a truthy
        value.

        :param company_dic: (dict) identifying fields of the company
        :return: None
        """
        record = {'do_time': timeutil.format('%Y-%m-%d', time.time())}
        record.update(company_dic)
        self.queue.select_queue(self.pinyin + '_noncompany')

        present = [field for field in ('name', 'zch', 'xydm') if field in company_dic]
        if present:
            key = company_dic[present[0]]
        else:
            key = json.dumps(company_dic)
        if len(key) >= 100:
            key = key[:100]

        if not self.queue.ssdb_put_zset(key):
            return
        self.queue.save(record)
        self.logging.info(u'成功写入%s_nonCompany队列一条数据:%s' % (self.pinyin,key))
Exemple #3
0
 def send_mail(self, mail_list, sub, content):
     """
     Send an HTML mail, either by queueing it or directly over SMTP.

     When ``self.to_queue`` is truthy the mail is saved to ``self.queue_mail``
     (recipient list JSON-encoded, plus a timestamp) and nothing is sent here.
     Otherwise the mail is sent through ``self.mail_host`` using
     ``self.mail_username`` / ``self.mail_password``.

     :param mail_list: list of recipient addresses
     :param sub: subject line
     :param content: HTML body
     :return: None
     """
     if self.to_queue:
         mail_data = {
             'mail_list': json.dumps(mail_list),
             'sub': sub,
             'content': content,
             'date': timeutil.format('%Y-%m-%d %H:%M:%S', time.time())
         }
         self.queue_mail.save(mail_data)
         return

     me = 'server' + '<' + self.mail_username + '>'
     msg = MIMEText(content, _subtype='html', _charset='utf-8')
     msg['Subject'] = sub
     msg['From'] = me
     msg['To'] = ';'.join(mail_list)
     server = smtplib.SMTP()
     try:
         server.connect(self.mail_host)
         server.login(self.mail_username, self.mail_password)
         server.sendmail(me, mail_list, msg.as_string())
     finally:
         # Release the socket even when connect/login/sendmail raises.
         server.close()
 def Get_message(self,url):
     """
     Download one job-fair page, extract title/city/date/address and upsert it.

     The page is requested as GBK with a 10 s timeout; after a failure the
     request is retried one second later until the retry budget is used up,
     then an Exception is raised. City, date and address are taken from the
     last capture group of the first three ``<th width="90">…<td>…</td>``
     matches. The record is written with ``mongoutil.updatev3`` under the key
     ``url + <today>``.

     NOTE(review): if the page yields fewer than three matches, or the first
     cell contains no link, city/date/address stay unbound and the function
     fails before writing - confirm whether that is intended.

     :param url: address of the page to fetch
     :return: None
     """
     db_yjs=mongoutil.getmondbv2(db.mongo_host,db.mongo_port,db.yjs_db_name,db.yjs_table_name,username=db.mongo_user,password=db.mongo_pwd)
     now=timeutil.format("%Y-%m-%d",time.time())
     proxy=None
     retries_left=10
     while True:
         try:
             myPage=webutil.request(url,timeout=10,proxy=proxy,encoding="gbk")
         except Exception as e3:
             print(e3)
             if retries_left<=0:
                 raise  Exception(u"连续10次失败,放弃")
             retries_left-=1
             time.sleep(1)
         else:
             break

     tree=etree.HTML(myPage)
     title=xpathutil.get_all_text(tree,".//*[@id='mainNav']/div[2]/table/caption/h1",num=0,split=u" ")
     print(title)

     # The regexes below run on the UTF-8 encoded page.
     myPage=myPage.encode('utf-8')
     rows=re.findall('<th width="90">(.*?)</th>(.*?)<td>(.*?)</td>',myPage,re.S)
     for position,row in enumerate(rows):
         cell=row[-1]
         if position==0:
             # The city cell wraps its text in a link.
             city=re.findall('">(.*?)</a>',cell)[0]
         elif position==1:
             date=cell
         elif position==2:
             address=cell

     print(len(address))
     print(city)
     print(date)
     print(address)

     record={
         "标题":title,
         "城市":city,
         "招聘会时间":date,
         '招聘会地点':address,
         "页面链接":url,
         "dotime":now,
         "uptime":time.time(),
         "source":"yingjiesheng",
         "type":"2",
     }
     mongoutil.updatev3(db_yjs,url+now,record)
    def Get_message(self,url,date):
        """
        Download one job-posting page, extract its fields and upsert it into MongoDB.

        Steps:
          1. Request the page as GBK (10 s timeout). After each failure a proxy
             is picked for the next attempt; once the retry budget is used up an
             Exception is raised.
          2. Take the posting text from ``#wordDiv``; if it is 10 characters or
             shorter, fall back to the third div of ``#container``.
          3. Locate the keywords from ``self.key_word`` (an iterable of synonym
             groups) in the whitespace-stripped text and store each slice
             between two consecutive keyword positions as ``label -> value``,
             split on ':'. The slice after the last keyword is not stored.
          4. Add the first e-mail / phone match, the page title, the summary
             block, the page source and bookkeeping fields, then write the
             record with ``mongoutil.updatev3`` under the key ``url + <today>``.

        :param url: address of the posting page
        :param date: publication date, stored as-is under "发布时间"
        :return: None
        """
        db_yjs=mongoutil.getmondbv2(db.mongo_host,db.mongo_port,db.yjs_db_name,db.yjs_table_name,username=db.mongo_user,password=db.mongo_pwd,timeout=30)
        now=timeutil.format("%Y-%m-%d",time.time())
        proxy=None
        count=10
        while True:
            try:
                myPage=webutil.request(url,timeout=10,proxy=proxy,encoding="gbk")
                break
            except Exception as e3:
                print(e3)
                # The first attempt is direct; retries go through a proxy.
                proxy = proxyutils.choice_proxy(is_debug=False,host="master1",port=8880)
                if count<=0:
                    raise  Exception(u"连续10次失败,放弃")
                count-=1
                time.sleep(1)
        tree=etree.HTML(myPage)

        jiben=xpathutil.get_all_text(tree,".//*[@id='container']/div[3]/div[2]/div/ul",num=0,split=u" ")
        text=xpathutil.get_all_text(tree,".//*[@id='wordDiv']/div/div",num=0,split=u" ")
        print(len(text))
        if len(text)<=10:
            # Too short to be the posting body: fall back to the wider container.
            text=xpathutil.get_all_text(tree,".//*[@id='container']/div[3]",num=0,split=u" ")

        # Fix: this normalisation used to sit inside the ``else`` branch, so the
        # fallback path above reached the next line with the variable undefined
        # and raised NameError.
        body=functions.remove_all_space_char(text)
        # The original applied the helper twice; kept in case one pass is not
        # enough for it - TODO confirm.
        body=functions.remove_all_space_char(body)

        emails=re.compile(r'[\w\.-]+@(?:[A-Za-z0-9]+\.)+[A-Za-z]+').findall(body)
        # NOTE(review): the pattern is anchored with ^/$ and has capture groups,
        # so findall only matches when the whole text is a phone number and then
        # yields the captured prefixes rather than the full number - confirm.
        phones=re.compile(r'^(?:\+86)?(\d{3})\d{8}$|^(?:\+86)?(0\d{2,3})\d{7,8}$').findall(body)

        # Position of the first synonym found for every keyword group.
        posdict=dict()
        for key in self.key_word:
            found=False
            for j in key:
                index=body.find(j)
                if index<0:
                    continue
                if found:
                    # A second synonym of the same group also occurs in the text.
                    print("error")
                else:
                    posdict[j]=index
                    found=True

        positions=sorted(posdict.values())
        save_data=dict()
        # Each slice between two consecutive keyword positions is "label:value".
        for start,end in zip(positions,positions[1:]):
            # NOTE(review): these replaces are no-ops as written; they look like
            # a colon normalisation whose source characters were lost - confirm.
            text3=body[start:end].replace(':',':',1).replace(":",":",1).replace(":",":",1)
            text3=text3.split(':')
            if len(text3)==1:
                save_data[text3[0]]='None'
            elif len(text3)==2:
                save_data[text3[0]]=text3[1]
            else:
                save_data[text3[0]]=text3[1]+text3[2]

        save_data["邮箱"]=emails[0] if emails else "无"
        save_data['电话']=phones[0] if phones else "无"

        # The remaining regexes run on the UTF-8 encoded page.
        myPage=myPage.encode('utf-8')
        title=re.findall('<title>(.*?)</title>',myPage)
        if not title:
            title=xpathutil.get_all_text(tree,".//*[@id='container']/div[3]/div[1]/h1/a",num=0,split=u" ")
            name=title
        else:
            name=title[0]
        print(name)

        # Build the summary block ("文本1"), one value per line.
        summary=''
        if not jiben:
            pp=re.findall('<div class="info clearfix"><ol><li>\xe5\x8f\x91\xe5\xb8\x83\xe6\x97\xb6\xe9\x97\xb4:<u>(.*?)</u></li><li>\xe5\xb7\xa5\xe4\xbd\x9c\xe5\x9c\xb0\xe7\x82\xb9:<u>(.*?) </u></li><li>\xe8\x81\x8c\xe4\xbd\x8d\xe7\xb1\xbb\xe5\x9e\x8b:<u>(.*?)</u></li><li>\xe6\x9d\xa5\xe6\xba\x90:<a href="#" onclick="window.open(.*?)">(.*?)</a></li>',myPage)
            for groups in pp:
                for piece in groups:
                    # Skip the captured window.open(...) argument.
                    if piece.find('(')==-1:
                        summary=summary+piece+'\r\n'
        else:
            jiben=jiben.replace('\t','')
            for line in jiben.split('\r\n'):
                summary=summary+line.split(':')[-1].replace('\n','')+'\r\n'
        print(summary)

        myPage=myPage.decode('utf-8')
        keys=url+now
        save_data["公司名称"]=name
        save_data["发布时间"]=date
        save_data["文本1"]=summary
        save_data['文本2']=text
        save_data["页面链接"]=url
        save_data["页面源码"]=myPage
        save_data['dotime']=now
        save_data['uptime']=time.time()
        save_data['source']="yingjiesheng"
        save_data["type"]="1"
        mongoutil.updatev3(db_yjs,keys,save_data)
        print("数据入库成功!")
Exemple #6
0
    def save(self,company_name,save_data1):
        """
        Clean one company's record and store it through ``self.db.save``.

        The document id is built by ``mongoutil.get_id_key`` from the company
        name and today's date. Keys are stripped of whitespace, person and text
        fields are normalised, bookkeeping fields are added, and the list
        sections gdxx/baxx/bgxx/fzjg/xzcf are stored JSON-encoded (an empty list
        when missing or not a list).

        :param company_name:  (unicode) company name
        :param save_data1:  (dict) scraped company fields
        :return:  (bool) True once the record has been stored
        :raises Exception: when fewer than 15 fields remain after cleaning
        """
        # Keep only entries whose key is non-empty after whitespace removal.
        save_data=dict()
        for raw_key in save_data1:
            raw_value=save_data1[raw_key]
            cleaned_key=remove_all_space_char(raw_key)
            if len(cleaned_key)>0:
                save_data[cleaned_key]=raw_value
        company_name = remove_all_space_char(company_name)
        fields=[u"成员出资总额",u'名称',u'注册号',u'登记机关',u'类型',u'经营状态',u'登记状态',u'营业场所',u'住所',u'营业期限自',u'营业期限至',u'成立日期',u'核准日期',u'吊销日期',u'注册资本',u"经营期限至",u"经营期限自"]
        people=[u"名称",u"经营者",u"法定代表",u"法定代表人",u"经营者姓名",u'负责人',u"法人",u"首席代表",u"投资人",u"执行事务合伙人",u"执行事务合伙人(委派代表)",u"股东"]

        # Person-like fields go through parse_people first ...
        for person_field in people:
            if person_field not in save_data:
                continue
            parsed=self.parse_people(save_data[person_field])
            save_data[person_field]=parsed
            if len(parsed)<1:
                self.logging.error(u"字段内容长度为0,公司名:%s,字段名:%s"%(company_name,person_field))
        # ... then plain text fields are stripped of whitespace.
        for text_field in fields:
            if text_field not in save_data:
                continue
            stripped=remove_all_space_char(save_data[text_field])
            save_data[text_field]=stripped
            if len(stripped)<1:
                self.logging.error(u"字段内容长度为0,公司名:%s,字段名:%s"%(company_name,text_field))

        if len(save_data)<15:
            raise Exception(u"字段缺失:%s"%company_name)
        self.logging.info(u"存储数据,公司名:%s"%company_name)
        now=timeutil.format("%Y-%m-%d",time.time())
        record_id=mongoutil.get_id_key(company_name,now)

        # Defaults: version 3 and the instance's ``chinese`` value as type.
        if "version" not in save_data:
            save_data["version"]=3
        if "type" not in save_data:
            save_data["type"]=self.chinese

        save_data["company_name"] = company_name
        save_data["do_time"]=now
        save_data["uptime"]=time.time()
        save_data["down_type"]=0
        # A None key must not reach the database.
        save_data.pop(None,None)

        # Shareholders, filings, changes, branches and penalties are stored as
        # JSON text; anything that is not a list becomes an empty list.
        for section in ("gdxx","baxx","bgxx","fzjg","xzcf"):
            entries=save_data.get(section)
            if not isinstance(entries,list):
                entries=list()
            save_data[section]=JSONEncoder().encode(entries)

        # Registration number: when no key mentions it, fall back to the search
        # keyword if that is a 15-character number.
        save_data['keyword'] = self.keyword
        reg_no_keys=[k for k in save_data.keys() if u"注册号" in k]
        if not reg_no_keys:
            self.logging.error(u'没有注册号!')
            if self.is_num(self.keyword) and len(self.keyword) == 15:
                self.logging.info(u'写入注册号:%s' % self.keyword)
                save_data[u'注册号'] = self.keyword

        save_data['_id'] = record_id
        save_data['has_company'] = 1
        self.db.save(save_data)

        self.logging.info(u'成功写入%s一条数据:%s' % (config.type1, record_id))
        # A successful save resets the consecutive proxy-error counter.
        self.proxy_series_error=0
        if self.proxy and self.proxy.split(":")[-1] not in ['42271','42272']:
            self.logging.info(u"优质非自建代理插入队列尾部,当前非自建代理列表长度为:%s"% self.put_proxy_into_queue_or_set(type='queue'))
        if not config.debug:
            self.monitor.add()
        return True