コード例 #1
0
 def put(self,
         data,
         sleep_time=10,
         retry=9,
         email_list=functions.mailto_list_ourselves):
     """Pickle ``data`` and push it onto the queue, retrying until it works.

     The first attempt goes through ``self.put_data``.  After any failure a
     queue object is obtained from ``getSSDBQueuev2`` and later attempts go
     through that object's ``put_data`` instead.

     :param data: payload to enqueue; only ``dict`` payloads are written.
     :param sleep_time: seconds to wait between ordinary retries.
     :param retry: after ``retry + 1`` consecutive failures an alert mail is
         sent, the loop sleeps 600 seconds and the failure counter restarts.
     :param email_list: recipients of the alert mail.
     :return: whatever ``put_data`` returns, or ``None`` when ``data`` is
         not a ``dict`` (nothing is written in that case).
     """
     # Non-dict payloads (including None) are never written.  Returning here
     # replaces the previous behaviour, where such input made the
     # ``while True`` loop spin forever without sleeping or raising.
     if not isinstance(data, dict):
         return None

     failures = 0
     alert_threshold = retry + 1
     ssdb_queue = None
     while True:
         try:
             # Pickling stays inside the try so that serialisation errors
             # follow the same retry/alert path as before.
             payload = pickle.dumps(data)
             if ssdb_queue is not None:
                 return ssdb_queue.put_data(payload)
             return self.put_data(payload)
         except Exception as e:
             print(u'插入队列异常 %s' % exceputil.traceinfo(e))
             failures += 1
             if failures >= alert_threshold:
                 # Send an alert mail, then back off for ten minutes.
                 functions.send_mail_old(
                     email_list, u"ssdb队列更新异常", u"错误信息%s \nqueue_name:%s" %
                     (exceputil.traceinfo(e), self.queue_name))
                 time.sleep(600)
                 failures = 0
             else:
                 time.sleep(sleep_time)
             # Use a freshly obtained queue object for the next attempt.
             ssdb_queue = getSSDBQueuev2(self.queue_name, self.host,
                                         self.port, self.max_connections,
                                         self.timeout)
コード例 #2
0
 def get(self,
         sleep_time=10,
         retry=9,
         email_list=functions.mailto_list_ourselves):
     """Read one entry from the queue, retrying until a read succeeds.

     The first attempt uses ``self.get_data``; after a failure a queue
     object from ``getSSDBQueuev2`` is used for subsequent attempts.

     :param sleep_time: seconds to wait between ordinary retries.
     :param retry: after ``retry + 1`` consecutive failures an alert mail is
         sent, the loop sleeps 600 seconds and the counter restarts.
     :param email_list: recipients of the alert mail.
     :return: whatever ``get_data`` returns.
     """
     failures = 0
     alert_after = retry + 1
     fallback_queue = None
     while True:
         try:
             reader = self if fallback_queue is None else fallback_queue
             return reader.get_data()
         except Exception as e:
             print(u'查询数据异常 %s' % exceputil.traceinfo(e))
             failures += 1
             if failures < alert_after:
                 time.sleep(sleep_time)
             else:
                 # Too many consecutive failures: mail the operators and
                 # back off for ten minutes before starting over.
                 functions.send_mail_old(email_list, u"ssdb队列更新异常",
                                         u"错误信息%s" % exceputil.traceinfo(e))
                 time.sleep(600)
                 failures = 0
             fallback_queue = getSSDBQueuev2(self.queue_name, self.host,
                                             self.port, self.max_connections,
                                             self.timeout)
コード例 #3
0
ファイル: producer_customer.py プロジェクト: chybot/crawler
 def start(self):
     """
     Run the consumer loop; this method never returns.

     1. Call ``process_last_error()`` to deal with whatever the previous
        run left behind.
     2. Fetch the next key with ``self.get()``.
     3. Record the key in ``self.last_file``, pass it to ``self.run()`` and
        clear the file afterwards.  If ``run`` raises, the key is handed
        back through ``customer_queue_conn.putv2`` and the file is cleared.
     4. Go back to step 2.
     :return: does not return
     """
     self.process_last_error()
     while True:
         try:
             item = self.get()
             nothing_fetched = item is None or len(item) < 1
             if nothing_fetched:
                 self.customer_logging.error(u"队列内容为空")
                 continue
         except Exception as fetch_error:
             self.customer_logging.error(
                 u"读取redis集合错误,错误信息:%s" % traceinfo(fetch_error))
             time.sleep(10)
             continue

         # Remember the in-flight key on disk before working on it.
         fileutil.write(self.last_file, item.encode("UTF-8"))
         try:
             self.run(item)
             fileutil.clear(self.last_file)
         except Exception as run_error:
             self.customer_logging.error(
                 u"抓取异常,错误信息:%s" % traceinfo(run_error))
             # Give the key back to the queue, then forget the on-disk copy.
             self.customer_queue_conn.putv2(item, self.customer_queue_name)
             fileutil.clear(self.last_file)
             time.sleep(10)
         time.sleep(1)
コード例 #4
0
ファイル: producer_customer.py プロジェクト: chybot/crawler
    def product(self, item, unique=None):
        """
        Publish ``item`` through ``product_real``, optionally de-duplicated.

        When ``self.bloomfilter_mode`` is off the item is published
        directly.  Otherwise it is published only if
        ``bloomFilterclient.insert_if_not_exists`` reports the key as new.
        :param item: (str) url or seed to publish
        :param unique: key used for the uniqueness check; defaults to ``item``
        :return: (None)
        """
        if not self.bloomfilter_mode:
            self.product_real(item)
            return

        if unique is None:
            unique = item

        while True:
            try:
                is_new = self.bloomFilterclient.insert_if_not_exists(unique)
                if is_new:
                    self.product_logging.info(u"此元素未处理,item:%s" % unique)
                    self.product_real(item)
                else:
                    self.product_logging.info(u"此元素已处理,item:%s" % unique)
                break
            except Exception as e:
                # NOTE(review): a failure inside product_real() also lands
                # here, after the key was already inserted -- confirm the
                # retry below is meant to cover that case too.
                self.product_logging.error(u"访问布隆过滤器发生错误!,错误信息:%s" %
                                           traceinfo(e))
                # Rebuild the client, retrying every 5 seconds, then try
                # the insert again.
                reconnected = False
                while not reconnected:
                    try:
                        self.bloomFilterclient = BloomFilterClient(
                            self.bf_host, self.bf_port)
                        reconnected = True
                    except Exception as e1:
                        self.product_logging.error(
                            u"布隆过滤器连接发生错误!,错误信息:%s" % traceinfo(e1))
                        time.sleep(5)
コード例 #5
0
def getSSDBQueuev2(queue_name,
                   host='127.0.0.1',
                   port=8888,
                   max_connections=1,
                   timeout=30,
                   retry=9,
                   sleep_time=30,
                   email_list=functions.mailto_list_ourselves):
    """Return ``getSSDBQueue(...)``, retrying forever until it succeeds.

    :param queue_name: forwarded to ``getSSDBQueue``.
    :param host: forwarded to ``getSSDBQueue``.
    :param port: forwarded to ``getSSDBQueue``.
    :param max_connections: forwarded to ``getSSDBQueue``.
    :param timeout: forwarded to ``getSSDBQueue``.
    :param retry: once the consecutive failure count exceeds ``retry + 1``
        an alert mail is sent, the loop sleeps 600 seconds and the counter
        restarts.
    :param sleep_time: seconds to wait between ordinary retries.
    :param email_list: recipients of the alert mail.
    :return: the object returned by ``getSSDBQueue``.
    """
    failures = 0
    threshold = retry + 1
    while True:
        try:
            return getSSDBQueue(queue_name,
                                host=host,
                                port=port,
                                max_connections=max_connections,
                                timeout=timeout)
        except Exception as e:
            print(exceputil.traceinfo(e))
            failures += 1
            if failures <= threshold:
                time.sleep(sleep_time)
                continue
            # Threshold exceeded: mail the operators, back off ten minutes.
            functions.send_mail_old(email_list, u"redis队列连接异常",
                                    u"错误信息%s" % exceputil.traceinfo(e))
            time.sleep(600)
            failures = 0
コード例 #6
0
ファイル: dict_change_json.py プロジェクト: chybot/crawler
def main_():
    """Dump a per-region sample of ``qyxx`` documents to a GBK text file.

    For every region in the table below, up to the configured number of
    documents with a matching ``type`` is read from MongoDB, selected
    ``*_html`` fields are dropped, and the document is written as one JSON
    line.  Rows that fail to convert are reported and skipped.
    """
    # region -> maximum number of documents to export
    dict_ = {u"北京": 30000, u"上海": 30000, u"江苏": 20000, u"浙江": 20000}
    # region -> fields removed from each document before serialising
    html_fields = {
        u"上海": ["other_html", "company_html"],
        u"浙江": ["baxx_html"],
    }
    # ``with`` guarantees the output file is closed even when the Mongo
    # query or the loop itself raises.
    with open("D:/hehongjing/xxxx.json", "w") as f:
        for dd in dict_:
            print(dd)
            table = mongoutil.getmondbbyhost("bigdata_higgs", "qyxx").table
            tables = table.find({"type": dd}).limit(dict_[dd])
            for tt in tables:
                try:
                    for _html in html_fields.get(dd, ()):
                        if _html in tt:
                            del tt[_html]
                    jj = json.dumps(tt)
                    jj = jj.replace(u"xa0", "").replace("ue001", "").replace(
                        "ue00b", "").replace("\\\\", "\\")
                    js = jj.decode('raw_unicode_escape').decode("UTF-8").encode(
                        "GBK")
                    f.write(js + "\n")
                except Exception as e:
                    # Previously the trace was computed and thrown away, so
                    # bad rows vanished silently; report them instead.
                    print(exceputil.traceinfo(e))
    print("end")
コード例 #7
0
 def put_proxy_into_queue_or_set(self,type='queue'):
     """Store ``self.proxy`` in the white-list queue or the black-list zset.

     :param type: ``"queue"`` calls ``proxy_white_list_db.put_data_back``;
         any other value calls ``proxy_black_list_db.ssdb_put_zset`` with
         the current Unix time (whole seconds) as score.
     :return: the result of the storage call, or ``None`` if it raised.
     """
     try:
         if type != "queue":
             return self.proxy_black_list_db.ssdb_put_zset(
                 self.proxy, score=int(time.time()))
         return self.proxy_white_list_db.put_data_back(self.proxy)
     except Exception as e:
         exceputil.traceinfo(e)
コード例 #8
0
def write_file(open_file,write_str,pattern):
    """Write ``str(write_str)`` plus a newline to a file, retrying on error.

    :param open_file: path of the file to open.
    :param write_str: value to write; converted with ``str()``.
    :param pattern: mode string passed to ``open`` (e.g. ``"w"``, ``"a+"``).
    :return: None.  The call only returns after a write has succeeded; on
        failure it waits one second and tries again.
    """
    while True:
        try:
            # ``with`` closes the handle even when write() raises, so a
            # failed attempt no longer leaks an open file per retry.
            with open(open_file, pattern) as f:
                f.write(str(write_str) + "\n")
            break
        except Exception as e:
            exceputil.traceinfo(e)
            time.sleep(1)
コード例 #9
0
 def get_proxy_qyxx(self,need_check=False, is_debug=False, area=u"电信"):
     """Fetch a proxy address for this site.

     :param need_check: accepted for interface compatibility; not used here.
     :param is_debug: forwarded to ``proxyutils.choice_proxy`` on fallback.
     :param area: forwarded to ``proxyutils.choice_proxy`` on fallback.
     :return: ``None`` when ``config.debug`` is set; otherwise the stripped
         response text of the proxy service, or the result of
         ``proxyutils.choice_proxy`` if that request raised.
     """
     try:
         if config.debug:
             return None
         url = "http://spider7:9876/qyxx?area=%s&type=%s" % (
             self.pinyin, self.proxy_typess)
         return get(url).text.strip()
     except Exception as e:
         exceputil.traceinfo(e)
         # The fallback used to hard-code is_debug=False / area=u"电信",
         # silently ignoring what the caller passed.  Those literals equal
         # the parameter defaults, so default callers see no change.
         return proxyutils.choice_proxy(is_debug=is_debug,
                                        area=area,
                                        host=config.proxy_host,
                                        port=config.proxy_port)
コード例 #10
0
def get_exec_files(style,dir_path):
    """Recursively collect files under ``dir_path`` ending in ``style``.

    Matches are appended to the module-level ``fil_tuples`` list.  Paths
    ending in ``"pyc"`` and entries whose bare name is in ``not_tuples``
    are left out.  Errors while descending into a sub-entry are passed to
    ``exceputil.traceinfo`` and that entry is skipped.

    :param style: required path suffix.
    :param dir_path: directory to scan.
    :return: the module-level ``fil_tuples`` list.
    """
    global fil_tuples
    for entry in os.listdir(dir_path):
        full_path = os.path.join(dir_path, entry)
        if not os.path.isfile(full_path):
            # Anything that is not a regular file is scanned recursively.
            try:
                get_exec_files(style, full_path)
            except Exception as e:
                exceputil.traceinfo(e)
            continue
        has_wanted_suffix = (str(full_path).endswith(style)
                             and not str(full_path).endswith("pyc"))
        if has_wanted_suffix and entry not in not_tuples:
            fil_tuples.append(full_path)
    return fil_tuples
コード例 #11
0
def clear_log(root):
    """Print stale ``qyxx`` log files found under ``root``.

    Only entries whose name starts with ``qyxx_weixin`` are considered.
    Files whose name also matches ``qyxx.*?log`` and whose modification
    time is more than three days before the module-level ``nowtime`` are
    printed (the actual removal is disabled).  Matching non-file entries
    are scanned recursively; errors there go to ``exceputil.traceinfo``.

    :param root: directory to scan.
    """
    three_days = 3 * 24 * 3600
    for name in os.listdir(root):
        if not re.match(r"qyxx_weixin", name):
            continue
        path = os.path.join(root, name)
        if not os.path.isfile(path):
            try:
                clear_log(path)
            except Exception as e:
                exceputil.traceinfo(e)
            continue
        if re.search(r"qyxx.*?log", name):
            modified_at = os.path.getmtime(path)
            if nowtime - modified_at > three_days:
                # os.remove(path) is intentionally left disabled.
                print(path)
コード例 #12
0
 def record_success(self,yzm,img_path,count=10000):
     """
     Archive a successfully solved captcha image together with its answer.

     The image is copied under ``../yzm_success/<self.pinyin>/`` with a
     UUID-based file name, and a ``"<file name> <answer>"`` line is
     appended to ``ans.txt`` in the same directory.  Any error is logged
     and swallowed.
     :param yzm:  captcha answer text
     :param img_path:  path of the captcha image to copy
     :param count: stop archiving once the directory tree holds more than
         this many files (default 10000)
     :return: (None)
     """
     try:
         dir_path=os.path.abspath('../')
         yzm_dir=os.path.join(dir_path,"yzm_success",self.pinyin)
         if not fileutil.isdir(yzm_dir):
             # Create the archive directory on first use.
             fileutil.mkdirs(yzm_dir)
         # Counts every file below yzm_dir (ans.txt included).
         pics = sum([len(files) for root,dirs,files in os.walk(yzm_dir)])
         self.logging.info(u"已存放%d张验证码图片"%(pics-1))
         if pics > count:
             self.logging.warn(u"已存放超%d张验证码图片"%count)
             return
         # Unique file name for the archived image.
         img = "%s.jpg"%str(uuid.uuid1())
         # Record the image-name -> answer mapping.  ``with`` closes the
         # handle even if the write fails (it previously leaked).
         text_file_name=os.path.join(yzm_dir,"ans.txt")
         with open(text_file_name,"a") as answers:
             answers.write(img + ' ' + yzm + '\n')
         # Store the image itself.
         img_name=os.path.join(yzm_dir,img)
         fileutil.copyfile(img_path,img_name)
     except Exception as e:
         self.logging.error(u"记录发生异常.错误信息:%s" % exceputil.traceinfo(e))
コード例 #13
0
ファイル: excelutil.py プロジェクト: chybot/crawler
def getsheet(filename, sheetname):
    """Open an Excel workbook and wrap one of its sheets.

    :param filename: path of the workbook, passed to ``xlrd.open_workbook``.
    :param sheetname: name of the sheet to load.
    :return: ``sheet(excel_sheet)`` for the requested sheet, or ``None``
        when opening or looking up the sheet fails (the error is printed).
    """
    try:
        device_workbook = xlrd.open_workbook(filename)
        excel_sheet = device_workbook.sheet_by_name(sheetname)
        return sheet(excel_sheet)
    except Exception as e:
        # The original used ``,`` instead of ``%`` here, which printed the
        # raw format string followed by a tuple instead of the message.
        print(u'文件名%s,错误信息:%s' % (filename, traceinfo(e)))
コード例 #14
0
ファイル: from_ssdb_to_mongo.py プロジェクト: chybot/crawler
 def get_index_and_other_list(self, key, data_list):
     """Find the record whose ``'_id'`` equals ``key`` and the records after it.

     :param key: ``_id`` value to look for.
     :param data_list: sequence of mappings that each have an ``'_id'`` key.
     :return: ``(record, remaining)`` where ``remaining`` is the slice of
         ``data_list`` following the match; ``(None, None)`` when nothing
         matches or an error occurs.  Always returning a pair keeps
         tuple-unpacking callers from failing on a bare ``None``.
     """
     try:
         for position, data in enumerate(data_list):
             if key == data['_id']:
                 return data, data_list[position + 1:]
     except Exception as e:
         self.logging.error(u'获取列表剩余异常 %s' % exceputil.traceinfo(e))
     return None, None
コード例 #15
0
ファイル: from_ssdb_to_mongo.py プロジェクト: chybot/crawler
 def json_to_dict(self, data):
     """Decode a JSON document.

     :param data: JSON text, or ``None``.
     :return: the decoded value; ``None`` when ``data`` is ``None``, when
         the document decodes to ``null``, or when decoding fails (the
         failure is logged through ``self.logging``).
     """
     if data is None:
         return None
     try:
         parsed = json.loads(data)
     except Exception as e:
         self.logging.error(u'转换dict异常 %s' % exceputil.traceinfo(e))
         return None
     return parsed
コード例 #16
0
def exce_file_subprocess(exce_file):
    """Run a ``.py`` or ``.bat`` file as a child process and wait for it.

    The child's pid is appended to ``./pid/<dir_last_file>_pid.txt``.  If
    anything fails, the child (when one was started) is killed and a line
    is appended to ``./pid/<dir_last_file>_error.txt``.

    :param exce_file: path of the file to execute; files that end in
        neither ``py`` nor ``bat`` are logged and ignored.
    :return: None
    """
    # Bound up front so the error path can run even when Popen itself
    # fails; previously that case raised NameError inside the handler.
    child_sub = None
    pid_text = ""
    try:
        # NOTE(review): both branches go through the shell with a command
        # built from exce_file -- confirm the path is always trusted.
        if exce_file.endswith("py"):
            # Python script: run it with the configured interpreter.
            child_sub=subprocess.Popen("%s %s"%(python_path,exce_file),shell=True)
        elif exce_file.endswith("bat"):
            # Windows batch file.
            child_sub=subprocess.Popen(exce_file,shell=True)
        else:
            # Message corrected: it used to say "neither py nor py".
            logging.info(u"启动的文件既不是py,也不是bat,启动文件错误:%s"%exce_file)
            return
        pid_text=str(child_sub.pid)
        write_file(u"./pid/%s_pid.txt"%dir_last_file,pid_text+"|"+exce_file,"a+")
        child_sub.wait()
        time.sleep(1)
    except Exception as e:
        exceputil.traceinfo(e)
        print(u"执行%s时候出错"%exce_file)
        # Kill the child only if it was actually started.
        if child_sub is not None:
            child_sub.kill()
        write_file(u"./pid/%s_error.txt"%dir_last_file,pid_text+"|"+exce_file,"a+")
        time.sleep(3)
コード例 #17
0
ファイル: from_ssdb_to_mongo.py プロジェクト: chybot/crawler
    def __init__(self, queue_name):
        """Set up logging and connect to MongoDB and the SSDB queue.

        Each connection is retried every 60 seconds until it succeeds, so
        construction blocks while either backend is unavailable.

        :param queue_name: queue name; also used as the log directory name
            and as the suffix of the Mongo database name.
        """
        fileutil.mkdirs(queue_name)
        self.logging = get_logger(queue_name + '/' + 'ssdb_save')
        self.queue_name = queue_name
        self.db_name = 'bigdata_higgs_' + queue_name

        self.logging.info(self.db_name)

        # MongoDB connection, retried until established.
        while True:
            try:
                self.logging.info(u'连接mongo')
                self.mongo = mongoutil.getmondbv2(
                    config.mongo_host,
                    config.mongo_port,
                    self.db_name,
                    config.table_name,
                    username=config.mongo_username,
                    password=config.mongo_passwd)
            except Exception as e:
                self.logging.error(u'连接mongo异常 %s' % exceputil.traceinfo(e))
                time.sleep(60)
            else:
                break

        # SSDB queue connection, retried until established.
        while True:
            try:
                self.logging.info(u'连接ssdb')
                self.ssdb = ssdbutil2.getSSDBQueuev2(self.queue_name,
                                                     host=config.ssdb_host,
                                                     port=config.ssdb_port)
            except Exception as e:
                self.logging.error(u'连接ssdb异常 %s' % exceputil.traceinfo(e))
                time.sleep(60)
            else:
                break
コード例 #18
0
ファイル: from_ssdb_to_mongo.py プロジェクト: chybot/crawler
    def get_all_from_ssdb(self):
        """Drain the SSDB queue and return the unpickled entries.

        Entries are read one at a time until ``self.ssdb.size()`` reports
        0.  On any error the method waits 60 seconds, replaces
        ``self.ssdb`` with a new queue object and keeps going.

        :return: list of unpickled entries, in the order they were read.
        """
        drained = []
        while True:
            try:
                data = self.ssdb.get()
                has_payload = data is not None and len(data) > 0
                if has_payload:
                    # NOTE(review): pickle.loads on queue contents is only
                    # safe if every producer is trusted.
                    drained.append(pickle.loads(data[0]))

                if self.ssdb.size() == 0:
                    break
            except Exception as e:
                self.logging.error(u'获取剩余全部队列数据异常 %s' % exceputil.traceinfo(e))
                time.sleep(60)
                self.ssdb = ssdbutil2.getSSDBQueuev2(self.queue_name,
                                                     host=config.ssdb_host,
                                                     port=config.ssdb_port)
        return drained
コード例 #19
0
ファイル: bloomfilterserver.py プロジェクト: chybot/crawler
 def run(self):
     """Thread main loop: persist the bloom filter about once a day.

     When at least 24 hours have passed since ``self.store_last_time`` the
     filter is written to ``self.storefile``, the timestamp is refreshed
     and the loop sleeps 24 hours; otherwise it sleeps one hour and checks
     again.  Any error is logged and the loop continues.
     :return:(None)
     """
     one_hour = 3600
     one_day = one_hour * 24
     while True:
         try:
             elapsed = time.time() - self.store_last_time
             if elapsed < one_day:
                 time.sleep(one_hour)
                 continue
             self.bloomfilter.write_file(self.storefile)
             self.store_last_time = time.time()
             time.sleep(one_day)
         except BaseException as e:
             logger.error(u"错误信息:%s" % traceinfo(e))
コード例 #20
0
    def back_money(self,recChar,code_id,yzm,img_path):
        """
        Request a refund for a wrongly solved captcha and archive the evidence.

        The captcha text and image are stored under
        ``<cwd>/<self.pinyin>/<YYYY-MM-DD>/``: inside a ``1`` sub-directory
        when the refund call succeeded, or with a ``0_`` file-name prefix
        when it (or the archiving that follows it) raised.
        :param recChar:  captcha client; its ``reportErrorID`` is called
        :param code_id:  id issued by the captcha service; ``"0"`` means
            the captcha was solved manually and nothing is refunded
        :param yzm:  captcha text
        :param img_path:  path of the captcha image
        :return: (None)
        """
        if code_id=="0":
            self.logging.warning(u"手工打码,无需退钱")
            return

        if recChar is None:
            # Was ``self.logging.err(...)``; standard loggers have no
            # ``err`` method, so this guard itself raised AttributeError.
            self.logging.error(u"退钱发生异常。recChar==None")
            return
        # Count this captcha as a failure.
        self.yzm_error+=1
        today=timeutil.format("%Y-%m-%d",time.time())
        dir_path=os.path.abspath('.')
        yzm_dir=os.path.join(dir_path,self.pinyin,today)
        if not fileutil.isdir(yzm_dir):
            # Create the per-day archive directory.
            fileutil.mkdirs(yzm_dir)
        try:
            # Ask the service for a refund using its id.
            recChar.reportErrorID(code_id)
            # Refund succeeded: archive under the "1" sub-directory.
            # NOTE(review): only yzm_dir is created above -- confirm that
            # fileutil.write / copyfile create the "1" directory.
            img_name=os.path.join(yzm_dir,str(1),"%s.png"%code_id)
            text_file_name=os.path.join(yzm_dir,str(1),"%s.txt"%code_id)
            # Store the captcha text next to the image.
            fileutil.write(text_file_name,yzm.encode("UTF-8","ignore"))
            # Copy the captcha image into the archive.
            fileutil.copyfile(img_path,img_name)
            self.logging.error(u"验证码没识别出来,退钱正常")
        except Exception as ee:
            # Refund (or archiving) failed: archive with a "0_" prefix.
            # os.path.join replaces the hard-coded "\\" separator; the
            # resulting path is the same on Windows and valid elsewhere.
            img_name=os.path.join(yzm_dir,"%d_%s.png"%(0,code_id))
            text_file_name=os.path.join(yzm_dir,"%d_%s.txt"%(0,code_id))
            # Store the captcha text next to the image.
            fileutil.write(text_file_name,yzm.encode("UTF-8","ignore"))
            # Copy the captcha image into the archive.
            fileutil.copyfile(img_path,img_name)
            self.logging.error(u"验证码没识别出来,errorType=5 。退钱发生异常.error:%s" % exceputil.traceinfo(ee))
コード例 #21
0
ファイル: loggingutil.py プロジェクト: chybot/crawler
def get_logger(app, **kwargs):
    """Build and return the logger named ``app``.

    Settings come from ``DEFAULT_CONF`` overridden by ``kwargs``.
    Depending on the ``is_file`` / ``is_console`` / ``is_queue`` flags a
    timed rotating file handler, a console handler and a queue handler are
    attached.

    :param app: logger name; also stored under the ``'app'`` config key.
    :param kwargs: per-call overrides of ``DEFAULT_CONF`` entries.
    :return: the configured ``logging.Logger``.
    """
    # Work on a copy: the original bound DEFAULT_CONF itself and then
    # mutated it, so one call's overrides leaked into every later call.
    dict_config = dict(DEFAULT_CONF)
    dict_config['app'] = app
    dict_config.update(kwargs)
    logger = logging.getLogger(app)
    logger.setLevel(dict_config['level'])
    # NOTE(review): handlers are added on every call, so asking for the
    # same ``app`` twice attaches duplicates -- confirm callers call once.

    if dict_config['is_file']:
        # Log file that rolls over on a time schedule.
        filehandler = TimedRotatingFileHandler(
            dict_config['filepath'] + dict_config['filename_prefix'],
            dict_config['when'], dict_config['interval'],
            dict_config['backup_count'])
        # Suffix used for rotated files.
        filehandler.suffix = dict_config['filename_suffix']
        # Line format of the file log.
        formatter = logging.Formatter(fmt=dict_config['format_file'],
                                      datefmt=dict_config['datefmt_file'])
        filehandler.setFormatter(formatter)
        filehandler.setLevel(dict_config['level_file'])
        logger.addHandler(filehandler)

    # Console output.
    if dict_config['is_console']:
        console = logging.StreamHandler()
        console.setLevel(dict_config['level_console'])
        formatter_console = logging.Formatter(dict_config['format_console'])
        console.setFormatter(formatter_console)
        logger.addHandler(console)

    # Queue output; a failure here is reported but does not stop the
    # file/console handlers from working.
    if dict_config['is_queue']:
        try:
            qh = QueueHandler(**dict_config)
            formatter_queue = logging.Formatter(dict_config['format_queue'])
            qh.setFormatter(formatter_queue)
            qh.setLevel(dict_config['level_queue'])
            logger.addHandler(qh)
        except Exception as e:
            print(u'启动kafka 出错:{}'.format(traceinfo(e)))
    return logger
コード例 #22
0
ファイル: from_ssdb_to_mongo.py プロジェクト: chybot/crawler
    def get_data_from_ssdb(self):
        """Read up to 1000 entries from the SSDB queue.

        ``self.ssdb.get()`` is called at most 1000 times; every non-empty
        result is unpickled and collected.  After a failure the method
        waits 60 seconds and replaces ``self.ssdb`` with a new queue
        object.  On the 10th failure it stops and returns what it has.

        :return: list of unpickled entries (possibly empty).
        """
        ssdb_list = []
        remaining = 1000
        retry_count = 10
        while remaining > 0:
            # Failed attempts consume the budget too, as before.
            remaining -= 1
            try:
                data = self.ssdb.get()
                if data is not None and len(data) > 0:
                    # NOTE(review): pickle.loads on queue contents is only
                    # safe if every producer is trusted.
                    ssdb_list.append(pickle.loads(data[0]))
            except Exception as e:
                self.logging.error(u'从ssdb中弹出数据异常 %s' % exceputil.traceinfo(e))
                retry_count -= 1
                if retry_count <= 0:
                    # Retries exhausted.  The old code kept calling the
                    # failing connection for every remaining iteration,
                    # without sleeping or reconnecting; stop instead.
                    break
                time.sleep(60)
                self.ssdb = ssdbutil2.getSSDBQueuev2(self.queue_name,
                                                     host=config.ssdb_host,
                                                     port=config.ssdb_port)

        return ssdb_list
コード例 #23
0
ファイル: from_ssdb_to_mongo.py プロジェクト: chybot/crawler
 def save_data(self, last_failure_file='ssdb_mongo.data', wait_time=300):
     """Replay the records of a previous failed run into MongoDB.

     If ``last_failure_file`` exists, every line is decoded with
     ``self.json_to_dict`` and each resulting dict that has an ``_id`` is
     written with ``self.mongo.table.update`` keyed on that ``_id``.  The
     file is removed after the pass.  Does nothing when the file is absent.

     :param last_failure_file: path of the file holding one JSON record
         per line.
     :param wait_time: not used in this method.
     :return: None
     """
     if os.path.exists(last_failure_file) == True:
         failed_list = []
         # Number of lines read; only referenced by the disabled code below.
         count = 0
         with open(last_failure_file, 'rb') as f:
             for line in f:
                 # Lines that fail to decode become None entries.
                 failed_list.append(
                     self.json_to_dict(line.strip().strip('\n')))
                 count += 1
         while True:
             try:
                 self.logging.info('Last Failed File :%d' %
                                   len(failed_list))
                 if failed_list != None and len(failed_list) > 0:
                     for data in failed_list:
                         if data == None:
                             continue
                         if '_id' in data.keys():
                             _id = data['_id']
                         else:
                             _id = None
                         if isinstance(data, dict) and _id != None:
                             try:
                                 # Third positional argument True --
                                 # presumably the upsert flag; confirm
                                 # against the pymongo version in use.
                                 self.mongo.table.update({'_id': _id}, data,
                                                         True)
                                 self.logging.info(u'成功update一条数据:%s' % _id)
                             except Exception, e:
                                 # A failed row is only logged; the pass
                                 # continues with the next record.
                                 self.logging.info(u'fail-update一条数据:%s' %
                                                   _id)
                     # Removed even if some rows failed above.
                     os.remove(last_failure_file)
                     # Disabled bulk-insert variant, kept for reference:
                     # insert_ret = self.mongo.table.insert(failed_list,safe = True)
                     # if count - len(insert_ret) < 10 and count - len(insert_ret) >= 0:
                     #     os.remove(last_failure_file)
                     #     break
                     # else:
                     #     time.sleep(5)
                     #     continue
                 break
             except pymongo.errors.OperationFailure as e:
                 # NOTE(review): the per-row update above already catches
                 # Exception, so it is unclear what can still raise
                 # OperationFailure here -- this looks like a leftover
                 # from the disabled bulk insert.
                 #                    self.logging.error(exceputil.traceinfo(e))
                 if e != None and e != '':
                     self.logging.info(e)
                     # Pull the duplicated _id out of the error text.
                     _id = re.findall(
                         r'.*?dup key:.*?\{.*?:.*?\"(.*?)\".*?\}', str(e))
                     # NOTE(review): _id[0] is read before the length
                     # check below; an empty match list raises IndexError.
                     self.logging.info('_id:%s' % _id[0])
                     if len(_id) > 0:
                         # NOTE(review): this unpacking needs a 2-tuple;
                         # confirm the helper never returns a bare None.
                         update_data, other_list = self.get_index_and_other_list(
                             _id[0], failed_list)
                         if update_data != None:
                             self.mongo.table.update(
                                 {'_id': update_data['_id']}, update_data,
                                 True)
                             self.logging.info(u'update data:%s 成功' %
                                               _id[0])
                             # Continue with the records after the
                             # duplicate, or stop when none are left.
                             if other_list != None and len(other_list) > 0:
                                 failed_list = other_list
                             else:
                                 break
                             continue
                         else:
                             break
                     else:
                         break
             except Exception as e:
                 # Any other error: log, wait 5 seconds, reconnect to
                 # MongoDB and run the whole pass again.
                 self.logging.error(u'存mongo数据异常 %s' %
                                    exceputil.traceinfo(e))
                 time.sleep(5)
                 self.mongo = mongoutil.getmondbv2(
                     config.mongo_host,
                     config.mongo_port,
                     self.db_name,
                     config.table_name,
                     username=config.mongo_username,
                     password=config.mongo_passwd)
コード例 #24
0
                sheet = book.sheet_by_name(sheet_)
                if len(sheet.row_values(0)) > 2 and len(
                        sheet.col_values(m)) > 2:
                    col_data = sheet.col_values(m)
                else:
                    continue
                if re.search(u"(企业)?(更名后名称)?", col_data[0]):
                    begin_num = 1
                else:
                    begin_num = 0
                for col in range(begin_num, len(col_data)):
                    f1.write(
                        col_data[col].strip().replace("•", "").encode("GBK") +
                        "\n")
            except Exception as e:
                logging.error(exceputil.traceinfo(e))
                continue
                time.sleep(2)
        f1.close()

# # encoding : utf-8       #设置编码方式
#
# import xlrd                    #导入xlrd模块
#
# #打开指定文件路径的excel文件
#
# xlsfile = r'D:\AutoPlan\apisnew.xls'
# book = xlrd.open_workbook(xlsfile)     #获得excel的book对象
#
# #获取sheet对象,方法有2种:
# sheet_name=book.sheet_names()[0]          #获得指定索引的sheet名字
コード例 #25
0
    def parse_yzm(self,img_url,img_src,typecode,yzm_max_len=4,type=None):
        """
        Save a captcha image to disk and get it solved.

        When ``type`` is a non-empty value the image is solved locally via
        ``RecChar(type).rec``; otherwise it is sent through
        ``self.bbd_yzm`` and ``self.yzm_count`` is incremented.
        :param img_url:  captcha image URL (its host name is used in the
            local file name)
        :param img_src:  captcha image content
        :param typecode:  only referenced by disabled code in this method
        :param yzm_max_len:  maximum captcha length; only referenced by
            disabled code in this method
        :param type:  recogniser type passed to ``RecChar``; ``None`` or
            empty selects the ``bbd_yzm`` path
        :return: 5-tuple (captcha text, captcha-service id,
            is_report_error flag, RecChar object, image path).  On the
            local path the id is ``"0"`` and the flag is ``False``.
        :raises: any exception from either path is logged and re-raised
        """
        try:
            dir_path=os.path.abspath('.')
            urlpret = urlparse.urlparse(img_url)
            # One file per (host, site); each call overwrites the last.
            img_path = os.path.join(dir_path,"%s_%s.png"%(urlpret.hostname,self.pinyin))
            print "img_path:", img_path, "type:", type
            fileutil.write(img_path,img_src)
            self.logging.info(u"请求验证码")
            # Solve locally when a recogniser type is given, otherwise
            # hand the image to bbd_yzm.
            if type!=None and len(type)>0:
                # The RecChar instance is created once and reused.
                if self.recChar == None:
                    self.recChar=RecChar(type)
                ret=self.recChar.rec(img_path)
                if ret!=None and len(ret)>0:
                    yzm= str(ret[0])
                    print "yzm:",yzm
                    if chardet.detect(yzm)['encoding'] == "utf-8":
                        yzm = yzm.decode("utf-8")
                    # A literal "none" answer is treated as no answer.
                    if yzm!=None and yzm.lower()=="none":
                        yzm=None
                    return yzm,"0",False,self.recChar,img_path
                else:
                    raise Exception(u"机器打码返回值为None或长度为0.")

            else:
                if self.recChar == None:
                    self.recChar = RecChar()
                self.yzm_count+=1
                (yzm, code_id, is_report_error,img_path)=self.bbd_yzm(img_src)
                #(yzm, code_id, is_report_error) = self.recChar.rec(img_path, typecode=typecode);
                # Manual entry, for testing only (disabled):
                # recChar=""
                # yzm=raw_input()
                # yzm= yzm.decode("UTF-8",'ignore')
                # code_id="asdfasdfasdf"
                # is_report_error=False
                # print "yzm:",yzm
                self.logging.info(u"验证码返回结果,yzm:%s,code_id:%s,is_report_error:%s"%(yzm, str(code_id), str(is_report_error)))
                # Disabled validation: a refund needs code_id, and an empty
                # code_id means nothing was charged, so no refund is due.
                # if len(str(code_id))<4:
                #     self.logging.error(u"验证码识别错误。errorType=1,coid_id为空")
                #     self.yzm_error+=1
                #     raise self.ValidYzmException(u"验证码识别错误")
                # # empty captcha text
                # if len(yzm)<1 or len(yzm)>yzm_max_len:
                #     self.logging.error(u"验证码识别错误。errorType=2,验证码长度不在正确范围")
                #     if len(code_id)>=4:
                #         self.back_money(self.recChar,code_id,yzm,img_path)
                #     else:
                #         self.yzm_error+=1
                #     time.sleep(0.1)
                #     raise self.ValidYzmException(u"验证码识别错误")

                return (yzm, code_id, is_report_error,self.recChar,img_path)




        except self.ValidYzmException as e1:
            # Log and re-raise unchanged.
            self.logging.error(u"验证码处理异常,error:%s"%exceputil.traceinfo(e1))
            raise
        except Exception as  yzmerror:
            # Everything else is logged as errorType=4 and re-raised.
            self.logging.error(u"验证码处理异常,errorType=4,error:%s"%exceputil.traceinfo(yzmerror))
            # self.back_money(self.recChar,code_id,yzm,img_path)
            raise
コード例 #26
0
    def process(self):
        """
        Main crawl loop.

        1. Call ``process_last_error()`` for the company left over from
           the previous run.
        2. Pop a company name from the queue and write it to
           ``self.last_company_key_file``.
        3. Choose or refresh the proxy, then call ``crawler_scheduler``
           with the company name.
        4. On failure, put the company name back with ``append_bottom``
           and clear the local file.
        5. Go back to step 2.
        The loop has no ``break``, so this method does not return normally.
        :return: (None)
        """
        self.logging.info(u"开始%s站内容抓取" % self.chinese)
        # Deal with the company the previous run was working on.
        self.process_last_error()
        # Remaining budget of consecutive ValidException failures before
        # the loop pauses.
        fail_count = 10
        while True:
            try:
                # Read the next company name from the queue.
                company_name = self.pop_company().decode("UTF-8", "ignore")
                if len(company_name) < 1 or len(company_name) > 2000:
                    raise Exception(u"公司长度不合理")
                self.keyword = company_name
            except Exception as e:
                self.logging.error(u"队列取值错误.error:%s" % exceputil.traceinfo(e))
                time.sleep(1)
                continue
            # Persist the in-flight company name to the local file.
            fileutil.write(self.last_company_key_file, company_name.encode("UTF-8", "ignore"))
            # current time (disabled)
            #this_time = time.strftime(u"%Y-%m-%d %H:%M:%S",time.localtime())
            try:
                # Decide whether to keep the current proxy or fetch a new
                # one, based on its port and on how often it was used.
                self.logging.info(u"代理《%s》使用次数:%s"%(self.proxy,self.proxy_num))
                try:
                    if self.proxy:
                        if self.pinyin not in proxy_series_configure :
                            # Proxies on ports 42271/42272 are always
                            # replaced.
                            if str(self.proxy.split(":")[-1]) in  ["42271","42272"]:
                                self.proxy = self.get_useful_proxy()
                                self.logging.error(u"使用优化代理:%s开始抓取,公司名:%s" % (self.proxy, company_name))
                            # Per-site usage limit reached (default 50).
                            elif self.proxy_num>=proxy_none_series_configure.get(self.pinyin,50):
                                self.proxy = self.get_useful_proxy()
                                self.logging.error(u"使用优化代理:%s开始抓取,公司名:%s" % (self.proxy, company_name))
                            else:
                                self.proxy_series_error=0
                                self.proxy_num+=1
                        # Site listed in proxy_series_configure: replace
                        # once its usage limit (default 2000) is reached.
                        elif self.proxy_num>=proxy_series_configure.get(self.pinyin,2000):
                            self.proxy = self.get_useful_proxy()
                        else:
                            self.proxy_series_error=0
                            self.proxy_num+=1
                    else:
                        self.proxy = self.get_useful_proxy()
                except Exception as e:
                    # Proxy selection problems are logged; the crawl still
                    # runs with whatever self.proxy currently holds.
                    self.logging.error(e)
                # Run the crawl scheduler for this company (the return
                # value is not used here).
                ret = self.crawler_scheduler(company_name)

                fileutil.clear(self.last_company_key_file)
                # Success: reset the failure budget.
                fail_count = 10
            except self.ValidException as e1:
                # One more consecutive failure.
                fail_count -= 1
                # Put the failed company back on the queue.
                self.append_bottom(company_name)
                # Clear the file that recorded the in-flight company.
                fileutil.clear(self.last_company_key_file)
                # Once the budget drops below zero, pause for 30 minutes.
                if (fail_count < 0):
                    #TODO
                    #if hasattr(self,"kafka_mail"):
                    #    self.kafka_mail.send_mail(self.mail_list, u'%s站验证码识别异常报告' % self.chinese, u'公司爬取失败数超过10个')
                    self.logging.error(u"公司爬取失败数超过10个")
                    time.sleep(60 * 30) # shortened pause: machine captcha solving does not need a long sleep
                    # Pause over: reset the failure budget.
                    fail_count = 10
                continue
            except Exception as e:
                self.logging.error(u"公司抓取异常。公司名:%s error:%s" % (company_name, exceputil.traceinfo(e)))
                # Put the failed company back on the queue.
                self.append_bottom(company_name)
                # Clear the file that recorded the in-flight company.
                fileutil.clear(self.last_company_key_file)

        # Not reachable: the loop above never breaks.
        self.logging.info(u"%s站内容抓取完成" % self.chinese)
コード例 #27
0
ファイル: from_ssdb_to_mongo.py プロジェクト: chybot/crawler
                                    JSONEncoder().encode(line) + '\n'
                                    for line in data_list
                                ]
                                f.writelines(other_list)
                        self.logging.info(u'获取一千个数据')
                        end_time = time.time()
                        self.logging.info(u'获取一千条数据消耗时间为:%d' %
                                          (end_time - start_time))
                        break
                    else:
                        self.logging.info(u'休眠5s,等待数据')
                        count_time += 1
                        time.sleep(5)
                        continue
                except Exception as e:
                    self.logging.error(u'取数据异常 %s' % exceputil.traceinfo(e))
                    time.sleep(30)
                    self.ssdb = ssdbutil2.getSSDBQueuev2(self.queue_name,
                                                         host=config.ssdb_host,
                                                         port=config.ssdb_port)

            start_time = None
            end_time_1 = None
            end_time_2 = None
            while True:
                try:
                    if data_list != None and len(data_list) > 0:
                        count = len(data_list)
                        start_time = time.time()
                        for data in data_list:
                            if data == None:
コード例 #28
0
    def crawler_scheduler(self, company_name):
        """
        Crawl scheduler: query the site with each identifier carried by a seed.

        The seed is first normalised into a dict of search keys, which are then
        tried in priority order; a later step only runs while the earlier ones
        have produced no result:

        1. registration number (``zch``)
        2. company URL (``url``, requires ``name`` as well)
        3. company name, when the earlier steps returned data but not the company
        4. company name (``name``)
        5. credit code (``xydm``)

        Errors raised by the individual crawl attempts are collected so that the
        remaining identifiers still get their chance; they are raised together
        once every attempt has been made.

        :param company_name:  (str/unicode) seed from the queue. One of:
                              a string consisting only of digits (used as ``zch``),
                              a JSON object string such as {"name": ..., "zch": ...},
                              or a plain company name -> 北京钢铁公司
        :return: True                    : data was fetched and it contains the company
                 u"没有指定公司"          : data was fetched, but not the requested company
                 u"没有查询到企业信息"    : nothing was fetched
                 (0, False)              : legacy result for a byte-string seed whose
                                           detected encoding is ``ascii``
        :raises self.ValidException: logged and re-raised when it escapes the body.
        :raises Exception: when at least one crawl attempt failed; the message
                           holds the repr of every collected error, one per line.
        """
        has_data = False
        try:
            self.logging.info(u"分词:%s"%company_name)
            company_dic = {}
            is_dic = False
            try:
                # Normalise the seed. A seed made only of digits is a registration number.
                pattern_str = r"^\d{%d}" % len(company_name)
                if re.match(pattern_str, company_name):
                    company_dic[u"zch"] = company_name
                else:
                    if company_name.startswith('{') and company_name.endswith('}'):
                        # JSON object seed: its keys become the search keys.
                        company_name = json.loads(company_name)
                        company_dic.update(company_name)
                    else:
                        if not isinstance(company_name, unicode):
                            encoding = chardet.detect(company_name).get("encoding")
                            if encoding:
                                if encoding == 'ascii':
                                    self.logging.error(u"关键字编码错误")
                                    # NOTE(review): tuple result differs from every other
                                    # return of this method; kept for existing callers.
                                    return 0, False
                                else:
                                    company_name = company_name.decode(encoding, 'ignore')
                        company_dic['name'] = company_name
                    # A nested "keyword" entry that carries zch/name is merged into the top level.
                    if 'keyword' in company_dic and ('zch' in company_dic.get('keyword') or 'name' in company_dic.get('keyword')):
                        company_dic.update(company_dic.get('keyword'))
                    is_dic = True
            except Exception as e:
                # Conversion failed: fall back to treating the seed as a company name.
                self.logging.error(u"种子队列转换出错:%s" % e)
                company_dic[u"name"] = company_name

            # Crawl priority: registration number -> company name -> credit code.
            company_count = 0
            inner_company = False
            errors = []

            # (1) registration number
            if company_dic.get(u"zch"):
                try:
                    temp_key = company_dic.get(u"zch")
                    self.logging.info(u"抓取(1):%s" % temp_key)
                    company_count, inner_company = self.crawler(temp_key, temp_key)
                except Exception as e:
                    errors.append(e)

            # (2) company URL
            if company_count < 1 and company_dic.get(u"name") and company_dic.get(u"url"):
                try:
                    company_name = company_dic.get(u"name")
                    self.logging.info(u"抓取(2):%s" % company_name)
                    company_url = company_dic.get(u"url")
                    company_count, inner_company = self.crawler_url(company_url, company_name)
                except Exception as e:
                    errors.append(e)

            # (3) earlier steps returned companies, but not the requested one: retry by name
            if company_count >= 1 and inner_company == False and company_dic.get(u"name"):
                try:
                    temp_key = company_dic.get(u"name")
                    self.logging.info(u"抓取(3):%s" % temp_key)
                    company_count, inner_company = self.crawler(temp_key, temp_key)
                except Exception as e:
                    errors.append(e)

            # (4) nothing found so far: company name
            if company_count < 1 and company_dic.get(u"name"):
                try:
                    temp_key = company_dic.get(u"name")
                    self.logging.info(u"抓取(4):%s" % temp_key)
                    company_count, inner_company = self.crawler(temp_key, temp_key)
                except Exception as e:
                    errors.append(e)

            # (5) still nothing: credit code
            if company_count < 1 and company_dic.get(u"xydm"):
                try:
                    temp_key = company_dic.get(u"xydm")
                    self.logging.info(u"抓取(5):%s" % temp_key)
                    company_count, inner_company = self.crawler(temp_key, temp_key)
                except Exception as e:
                    errors.append(e)

            # Dict-style seed that produced nothing: keep a record for troubleshooting.
            # Best effort only - a failure to save must not abort the crawl.
            if company_count < 1 and is_dic:
                try:
                    # "yes" means at least one crawl attempt raised.
                    company_dic[u"exception"] = u"yes" if errors else u"no"
                    self.queue.select_queue(self.pinyin + '_noncompany_dic')
                    self.queue.save(company_dic)
                except Exception:
                    self.logging.error(u"保存为抓取的元祖队列失败:%s" % company_name)

            # Surface the collected failures as one real exception. repr() keeps
            # the message ASCII-safe regardless of what the errors contain.
            if errors:
                raise Exception(u"\n".join(u"%r" % (err,) for err in errors))

            if company_count < 1:
                # Nothing was fetched.
                self.set_black_keyword(company_dic)
            else:
                # Something was fetched.
                self.set_white_key(company_name)
                if inner_company == True:
                    return True
                else:
                    has_data = True
        except self.ValidException as e1:
            self.logging.error(u"验证码异常,关键字:%s,错误信息:%s" % (company_name, exceputil.traceinfo(e1)))
            raise
        except Exception as e:
            self.logging.error(u"关键字:%s,错误信息:%s" % (company_name, exceputil.traceinfo(e)))
            raise


        if has_data:
            return u"没有指定公司"
        else:
            return u"没有查询到企业信息"