def back_money(self, recChar, code_id, yzm, img_path):
    """Request a refund for a failed captcha recognition and archive the sample.

    The captcha text and a copy of the captcha image are stored under
    ``<cwd>/<self.pinyin>/<YYYY-MM-DD>/``, named after ``code_id``:

    * refund accepted: ``1/<code_id>.png`` and ``1/<code_id>.txt``
    * anything in the refund/archive step raised: ``0_<code_id>.png`` and
      ``0_<code_id>.txt``

    :param recChar: captcha-service client; must provide ``reportErrorID``.
    :param code_id: id assigned by the captcha service; the string ``"0"``
        marks a manually typed captcha, for which nothing is refunded.
    :param yzm: captcha text (must support ``encode``).
    :param img_path: path of the captcha image to archive.
    :return: None
    """
    if code_id == "0":
        self.logging.warning(u"手工打码,无需退钱")
        return
    if recChar is None:
        # Was ``self.logging.err``; every other call site uses ``error``.
        self.logging.error(u"退钱发生异常。recChar==None")
        return

    # Count this recognition failure.
    self.yzm_error += 1

    today = timeutil.format("%Y-%m-%d", time.time())
    yzm_dir = os.path.join(os.path.abspath('.'), self.pinyin, today)
    if not fileutil.isdir(yzm_dir):
        fileutil.mkdirs(yzm_dir)

    try:
        # Ask the captcha service to refund this code_id.
        recChar.reportErrorID(code_id)
        # NOTE(review): the original docstring spoke of a "1" file-name
        # prefix, but the code joins "1" as a path component (a
        # sub-directory). Kept as-is; confirm which layout is intended.
        img_name = os.path.join(yzm_dir, str(1), "%s.png" % code_id)
        text_file_name = os.path.join(yzm_dir, str(1), "%s.txt" % code_id)
        fileutil.write(text_file_name, yzm.encode("UTF-8", "ignore"))
        fileutil.copyfile(img_path, img_name)
        self.logging.error(u"验证码没识别出来,退钱正常")
    except Exception as ee:
        # Any failure above (the refund call or the archive writes) lands
        # here; the sample is then stored with a "0_" file-name prefix.
        # os.path.join replaces the hard-coded "\\" separator so the files
        # end up inside yzm_dir on every platform.
        img_name = os.path.join(yzm_dir, "%d_%s.png" % (0, code_id))
        text_file_name = os.path.join(yzm_dir, "%d_%s.txt" % (0, code_id))
        fileutil.write(text_file_name, yzm.encode("UTF-8", "ignore"))
        fileutil.copyfile(img_path, img_name)
        self.logging.error(u"验证码没识别出来,errorType=5 。退钱发生异常.error:%s" % exceputil.traceinfo(ee))
def set_black_keyword(self, company_dic):
    """Record a company in the ``<pinyin>_noncompany`` blacklist queue.

    The de-duplication key is the value of the first of ``name``, ``zch``,
    ``xydm`` present in ``company_dic``; when none is present the JSON dump
    of the whole dict is used. Keys of 100 or more items are cut to their
    first 100. The record is saved only when ``ssdb_put_zset`` returns a
    truthy value for that key.

    :param company_dic: (dict) company attributes to store.
    :return: None
    """
    record = {'do_time': timeutil.format('%Y-%m-%d', time.time())}
    record.update(company_dic)

    self.queue.select_queue(self.pinyin + '_noncompany')

    # First identifying field wins; fall back to the serialised dict.
    for field in ('name', 'zch', 'xydm'):
        if field in company_dic:
            dedup_key = company_dic[field]
            break
    else:
        dedup_key = json.dumps(company_dic)

    if len(dedup_key) >= 100:
        dedup_key = dedup_key[:100]

    if not self.queue.ssdb_put_zset(dedup_key):
        return
    self.queue.save(record)
    self.logging.info(u'成功写入%s_nonCompany队列一条数据:%s' % (self.pinyin, dedup_key))
def send_mail(self, mail_list, sub, content):
    """Send an HTML e-mail, or enqueue it when ``self.to_queue`` is set.

    :param mail_list: list of recipient addresses.
    :param sub: subject line.
    :param content: HTML body.
    :return: None
    """
    if self.to_queue:
        # Deferred delivery: persist the message for another worker.
        self.queue_mail.save({
            'mail_list': json.dumps(mail_list),
            'sub': sub,
            'content': content,
            'date': timeutil.format('%Y-%m-%d %H:%M:%S', time.time()),
        })
        return

    me = 'server' + '<' + self.mail_username + '>'
    msg = MIMEText(content, _subtype='html', _charset='utf-8')
    msg['Subject'] = sub
    msg['From'] = me
    # NOTE(review): RFC 5322 address lists are comma-separated; ';' is kept
    # so the emitted header does not change.
    msg['To'] = ';'.join(mail_list)

    server = smtplib.SMTP()
    try:
        server.connect(self.mail_host)
        server.login(self.mail_username, self.mail_password)
        server.sendmail(me, mail_list, msg.as_string())
    finally:
        # Release the socket even when connect/login/sendmail raises;
        # previously the connection leaked on any failure.
        server.close()
def Get_message(self, url):
    """Fetch one job-fair page and upsert its details into MongoDB.

    The page is requested with a 10 s timeout; after a failed request the
    loop sleeps one second and retries, raising once the retry budget is
    used up. Title comes from an XPath lookup; city, date and address come
    from the first three ``<th width="90">…<td>…</td>`` rows matched by a
    regex on the UTF-8 encoded page.

    :param url: page URL; ``url`` plus today's date forms the record key.
    :return: None
    :raises Exception: when the retry budget is exhausted.
    """
    db_yjs = mongoutil.getmondbv2(
        db.mongo_host, db.mongo_port, db.yjs_db_name, db.yjs_table_name,
        username=db.mongo_user, password=db.mongo_pwd)
    now = timeutil.format("%Y-%m-%d", time.time())

    proxy = None
    retries_left = 10
    while True:
        try:
            myPage = webutil.request(url, timeout=10, proxy=proxy, encoding="gbk")
        except Exception as e3:
            print(e3)
            if retries_left <= 0:
                raise Exception(u"连续10次失败,放弃")
            retries_left -= 1
            time.sleep(1)
        else:
            break

    tree = etree.HTML(myPage)
    title = xpathutil.get_all_text(
        tree, ".//*[@id='mainNav']/div[2]/table/caption/h1", num=0, split=u" ")
    print(title)

    myPage = myPage.encode('utf-8')
    rows = re.findall('<th width="90">(.*?)</th>(.*?)<td>(.*?)</td>', myPage, re.S)
    # Row 0 holds the city link, row 1 the date, row 2 the address; the
    # names below stay unbound if the page has fewer rows than that.
    for position, row in enumerate(rows):
        if position == 0:
            city = re.findall('">(.*?)</a>', row[-1])[0]
        elif position == 1:
            date = row[-1]
        elif position == 2:
            address = row[-1]

    print(len(address))
    print(city)
    print(date)
    print(address)

    record = {
        "标题": title,
        "城市": city,
        "招聘会时间": date,
        '招聘会地点': address,
        "页面链接": url,
        "dotime": now,
        "uptime": time.time(),
        "source": "yingjiesheng",
        "type": "2",
    }
    mongoutil.updatev3(db_yjs, url + now, record)
def Get_message(self, url, date):
    """Fetch one job-posting page, extract its fields and upsert to MongoDB.

    Steps, in order:

    1. Request the page (10 s timeout). After each failure a proxy is
       picked via ``proxyutils.choice_proxy`` and the request is retried
       after one second, until the retry budget is used up.
    2. Take the posting text from ``#wordDiv``; if it is 10 characters or
       shorter, fall back to ``#container/div[3]``.
    3. Locate each keyword group from ``self.key_word`` in the
       whitespace-stripped text and cut the text between consecutive hits
       into ``label:value`` pairs.
    4. Add e-mail, phone, title, a summary block and bookkeeping fields,
       then store the record under the key ``url`` plus today's date.

    :param url: page URL.
    :param date: publication date, stored as-is.
    :return: None
    :raises Exception: when the retry budget is exhausted.
    """
    db_yjs = mongoutil.getmondbv2(
        db.mongo_host, db.mongo_port, db.yjs_db_name, db.yjs_table_name,
        username=db.mongo_user, password=db.mongo_pwd, timeout=30)
    now = timeutil.format("%Y-%m-%d", time.time())

    proxy = None
    retries_left = 10
    while True:
        try:
            myPage = webutil.request(url, timeout=10, proxy=proxy, encoding="gbk")
        except Exception as e3:
            print(e3)
            # Switch to a proxy for the next attempt.
            proxy = proxyutils.choice_proxy(is_debug=False, host="master1", port=8880)
            if retries_left <= 0:
                raise Exception(u"连续10次失败,放弃")
            retries_left -= 1
            time.sleep(1)
        else:
            break

    tree = etree.HTML(myPage)
    jiben = xpathutil.get_all_text(
        tree, ".//*[@id='container']/div[3]/div[2]/div/ul", num=0, split=u" ")
    text = xpathutil.get_all_text(tree, ".//*[@id='wordDiv']/div/div", num=0, split=u" ")
    print(len(text))
    if len(text) <= 10:
        text = xpathutil.get_all_text(tree, ".//*[@id='container']/div[3]", num=0, split=u" ")

    # The stripping helper is applied twice, exactly as before.
    flat = functions.remove_all_space_char(text)
    flat = functions.remove_all_space_char(flat)

    email_re = re.compile(r'[\w\.-]+@(?:[A-Za-z0-9]+\.)+[A-Za-z]+')
    emails = email_re.findall(flat)
    # NOTE(review): this pattern is anchored with ^...$ and has capture
    # groups, so findall on the whole text presumably rarely matches and
    # yields group tuples when it does - confirm intent.
    phone_re = re.compile(r'^(?:\+86)?(\d{3})\d{8}$|^(?:\+86)?(0\d{2,3})\d{7,8}$')
    phones = phone_re.findall(flat)

    # For each keyword group remember the offset of the first alias found;
    # further aliases of the same group are only reported.
    first_hit = dict()
    for aliases in self.key_word:
        matched = False
        for alias in aliases:
            offset = flat.find(alias)
            if offset < 0:
                continue
            if matched:
                print("error")
            else:
                first_hit[alias] = offset
                matched = True

    boundaries = sorted(first_hit.values())

    save_data = dict()
    # Each slice between two consecutive hits is one "label:value" chunk;
    # the text after the last hit is not used.
    for start, stop in zip(boundaries, boundaries[1:]):
        # NOTE(review): as written each replace swaps a colon for itself;
        # presumably these once normalised other colon characters - confirm.
        chunk = flat[start:stop].replace(':', ':', 1).replace(":", ":", 1).replace(":", ":", 1)
        parts = chunk.split(':')
        if len(parts) == 1:
            save_data[parts[0]] = 'None'
        elif len(parts) == 2:
            save_data[parts[0]] = parts[1]
        else:
            save_data[parts[0]] = parts[1] + parts[2]

    save_data["邮箱"] = emails[0] if emails else "无"
    if phones:
        save_data['电话'] = phones[0]
    else:
        save_data["电话"] = "无"

    page_bytes = myPage.encode('utf-8')
    title = re.findall('<title>(.*?)</title>', page_bytes)
    if title:
        name = title[0]
    else:
        name = xpathutil.get_all_text(
            tree, ".//*[@id='container']/div[3]/div[1]/h1/a", num=0, split=u" ")
    print(name)

    summary = ''
    if jiben:
        for line in jiben.replace('\t', '').split('\r\n'):
            summary = summary + line.split(':')[-1].replace('\n', '') + '\r\n'
    else:
        # No basic-info list on the page: scrape the info bar from the raw
        # bytes, skipping the captured group that contains a "(".
        info_rows = re.findall('<div class="info clearfix"><ol><li>\xe5\x8f\x91\xe5\xb8\x83\xe6\x97\xb6\xe9\x97\xb4:<u>(.*?)</u></li><li>\xe5\xb7\xa5\xe4\xbd\x9c\xe5\x9c\xb0\xe7\x82\xb9:<u>(.*?) </u></li><li>\xe8\x81\x8c\xe4\xbd\x8d\xe7\xb1\xbb\xe5\x9e\x8b:<u>(.*?)</u></li><li>\xe6\x9d\xa5\xe6\xba\x90:<a href="#" onclick="window.open(.*?)">(.*?)</a></li>', page_bytes)
        for groups in info_rows:
            for piece in groups:
                if '(' not in piece:
                    summary = summary + piece + '\r\n'
    print(summary)

    page_source = page_bytes.decode('utf-8')
    keys = url + now
    save_data.update({
        "公司名称": name,
        "发布时间": date,
        "文本1": summary,
        '文本2': text,
        "页面链接": url,
        "页面源码": page_source,
        'dotime': now,
        'uptime': time.time(),
        'source': "yingjiesheng",
        "type": "1",
    })
    mongoutil.updatev3(db_yjs, keys, save_data)
    print("数据入库成功!")
def save(self, company_name, save_data1):
    """Normalise one company record and store it through ``self.db.save``.

    The record id is built by ``mongoutil.get_id_key`` from the cleaned
    company name and today's date. Person-like fields go through
    ``self.parse_people``, selected text fields are whitespace-stripped,
    and the ``gdxx``/``baxx``/``bgxx``/``fzjg``/``xzcf`` entries are stored
    as JSON-encoded lists (an empty list when missing or not a list).

    :param company_name: (unicode) company name.
    :param save_data1: (dict) raw company attributes.
    :return: (bool) True once the record has been saved.
    :raises Exception: when fewer than 15 fields remain after cleaning.
    """
    # Strip whitespace from keys and drop entries whose key ends up empty.
    save_data = dict()
    for raw_key, raw_value in save_data1.items():
        cleaned_key = remove_all_space_char(raw_key)
        if len(cleaned_key) > 0:
            save_data[cleaned_key] = raw_value
    company_name = remove_all_space_char(company_name)

    fields = [u"成员出资总额", u'名称', u'注册号', u'登记机关', u'类型', u'经营状态', u'登记状态', u'营业场所', u'住所', u'营业期限自', u'营业期限至', u'成立日期', u'核准日期', u'吊销日期', u'注册资本', u"经营期限至", u"经营期限自"]
    people = [u"名称", u"经营者", u"法定代表", u"法定代表人", u"经营者姓名", u'负责人', u"法人", u"首席代表", u"投资人", u"执行事务合伙人", u"执行事务合伙人(委派代表)", u"股东"]

    # Person fields are normalised first, then the plain text fields.
    for names, normalise in ((people, self.parse_people), (fields, remove_all_space_char)):
        for name in names:
            if name not in save_data:
                continue
            cleaned = normalise(save_data[name])
            save_data[name] = cleaned
            if len(cleaned) < 1:
                self.logging.error(u"字段内容长度为0,公司名:%s,字段名:%s" % (company_name, name))

    if len(save_data) < 15:
        raise Exception(u"字段缺失:%s" % company_name)

    self.logging.info(u"存储数据,公司名:%s" % company_name)
    now = timeutil.format("%Y-%m-%d", time.time())
    record_id = mongoutil.get_id_key(company_name, now)

    # Defaults applied only when the caller did not supply a value.
    save_data.setdefault("version", 3)
    if "type" not in save_data:
        save_data["type"] = self.chinese

    save_data["company_name"] = company_name
    save_data["do_time"] = now
    save_data["uptime"] = time.time()
    save_data["down_type"] = 0

    # A None key is discarded before the record is stored.
    save_data.pop(None, None)

    # Shareholder, filing, change, branch and penalty sections are each
    # stored as a JSON string; anything that is not a list becomes "[]".
    for section in ("gdxx", "baxx", "bgxx", "fzjg", "xzcf"):
        entries = save_data.get(section)
        if not isinstance(entries, list):
            entries = list()
        save_data[section] = JSONEncoder().encode(entries)

    # Registration-number fallback: when no key mentions it, reuse the
    # search keyword if it is a 15-character number.
    save_data['keyword'] = self.keyword
    reg_keys = [k for k in save_data.keys() if u"注册号" in k]
    if not reg_keys:
        self.logging.error(u'没有注册号!')
        if self.is_num(self.keyword) and len(self.keyword) == 15:
            self.logging.info(u'写入注册号:%s' % self.keyword)
            save_data[u'注册号'] = self.keyword

    save_data['_id'] = record_id
    save_data['has_company'] = 1
    self.db.save(save_data)
    self.logging.info(u'成功写入%s一条数据:%s' % (config.type1, record_id))

    self.proxy_series_error = 0
    # Proxies on ports other than 42271/42272 go back into the queue.
    if self.proxy and self.proxy.split(":")[-1] not in ('42271', '42272'):
        self.logging.info(u"优质非自建代理插入队列尾部,当前非自建代理列表长度为:%s" % self.put_proxy_into_queue_or_set(type='queue'))
    if not config.debug:
        self.monitor.add()
    return True