Example #1
0
 def close_spider(self, spider):
     """Write the collected member records to the spider's member workbook.

     Reads every value of the redis hash ``spider.member_xls_key``, writes
     ``spider.member_headers`` as row 0 and then one row per record that
     contains every header. The card number (``u'卡号'``) is cut at the first
     occurrence of ``self.card_no_sep``. After the workbook is saved the
     redis hash is deleted.

     The account is removed from the ``running_auths_key`` set in a
     ``finally`` clause: that set is consulted before a crawl is launched
     for an account, so leaving a stale entry behind after a failure would
     block every later crawl of the same account.
     """
     try:
         conn = RedisHelper.get_redis_conn()
         s_members = conn.hvals(spider.member_xls_key)
         hs = spider.member_headers
         sheet = spider.member_result_xsl_sheet
         for i, h in enumerate(hs):
             sheet.write(0, i, h)
         # Row 0 holds the headers; data starts at row 1.
         r = 1
         log.msg('%s get members from redis, size: %d' %(spider.name, 0 if not s_members else len(s_members)))
         for s_member in s_members or ():
             member = json.loads(s=s_member, encoding="utf-8")
             # Skip records that lack any of the expected columns.
             if not all(h in member for h in hs):
                 continue
             card_no = member[u'卡号']
             if self.card_no_sep in card_no:
                 # Keep only the part before the separator.
                 member[u'卡号'] = card_no[:card_no.index(self.card_no_sep)]
             for i, h in enumerate(hs):
                 sheet.write(r, i, member[h])
             r += 1
         spider.member_result_xsl_book.save(spider.member_result_xls)
         log.msg('%s complete write member xsl for %s.' % (spider.name, spider.login_username))
         # Only drop the raw data once the workbook has been saved.
         conn.delete(spider.member_xls_key)
     finally:
         # Always release the "running" marker, even when export failed.
         RedisHelper.get_redis_conn().srem(running_auths_key, spider.login_username)
Example #2
0
 def item_completed(self, results, item, info):
     """Recognise the downloaded verify-code image and publish the code to redis.

     ``results`` is iterated as ``(ok, value)`` pairs; the values of the
     successful pairs are stored in ``item[self.IMAGES_RESULT_FIELD]``.
     The first stored image is passed to ``dama.dama`` until it yields a
     truthy code, which is then written to redis under ``item['key']``.

     Returns:
         ``item`` unchanged when it does not declare
         ``self.IMAGES_RESULT_FIELD`` (so later pipeline stages still
         receive it instead of ``None``).

     Raises:
         DropItem: after the code has been published (the item is not
             needed any further), or when no image was downloaded
             successfully.
     """
     if self.IMAGES_RESULT_FIELD not in item.fields:
         return item
     images = [x for ok, x in results if ok]
     item[self.IMAGES_RESULT_FIELD] = images
     log.msg('got verify code result %s' % json.dumps(obj=images, ensure_ascii=False, indent=4))
     if not images:
         # Nothing was downloaded: fail explicitly rather than with an
         # IndexError on images[0].
         raise DropItem('verify code image download failed.')
     path = settings.IMAGES_STORE + '/' + images[0]['path']
     code = None
     # NOTE(review): retries without limit or delay until a code is
     # returned; callers presumably wait on the redis key - confirm whether
     # a retry cap is wanted.
     while not code:
         code = dama.dama(path)
     code = str(code)
     log.msg('got code %s' % code)
     RedisHelper.get_redis_conn().set(item['key'], code)
     raise DropItem('drop it.')
Example #3
0
    def __info_props(self):
        """Set up the per-run state of the spider.

        Reads the credentials from ``self.kwargs``, starts logging to the
        account's log file, adds the account to the ``running_auths_key``
        redis set, recreates the account's result directory below
        ``result_dir`` and creates one xlwt workbook/sheet per exported
        data set (members, employees, services, member cards, treatments).
        """
        username = self.kwargs['username']
        self.login_username = username
        log.start(logfile=settings.get_log_file(username), loglevel=settings.LOG_LEVEL, crawler=self.crawler)
        RedisHelper.get_redis_conn().sadd(running_auths_key, username)
        self.login_password = self.kwargs['password']
        started = time.time()
        self.start_time = started
        account_uid = uuid.uuid3(uuid.NAMESPACE_X500, username)
        self.member_xls_key = '__origin_xls_members_%s_%d' % (account_uid, started)

        # Result directory: <result_dir>/<username>, wiped on every run.
        base_dir = result_dir if result_dir.endswith('/') else '%s/' % (result_dir, )
        out_dir = '%s%s' % (base_dir, username)
        self.res_dir = out_dir
        shutil.rmtree(out_dir, ignore_errors=True)
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

        # Members.
        member_book = xlwt.Workbook()
        self.member_result_xls = '%s/members_%d.xls' % (out_dir, started)
        self.member_result_xsl_book = member_book
        self.member_result_xsl_sheet = member_book.add_sheet(u'会员资料')
        self.member_result_rows = 0
        self.member_result_semaphore = multiprocessing.Semaphore(1)
        self.member_origin_result_ready = False
        self.member_headers = [
            u'手机号', u'姓名', u'性别', u'会员分类', u'注册日期', u'卡号',
            u'卡名称', u'卡类型', u'折扣', u'储值总额', u'消费总额', u'卡内总余额',
            u'赠送总余额', u'失效日期', u'消费次数', u'当前积分', u'最后消费日', u'欠款',
        ]

        # Employees.
        self.employee_result_semaphore = multiprocessing.Semaphore(1)
        self.employee_result_xls = '%s/employees_%d.xls' % (out_dir, started)
        employee_book = xlwt.Workbook()
        self.employee_result_xsl_book = employee_book
        self.employee_result_xsl_sheet = employee_book.add_sheet(u'员工资料')
        self.employee_result_rows = 0

        # Services.
        self.service_result_semaphore = multiprocessing.Semaphore(1)
        self.service_result_xls = '%s/services_%d.xls' % (out_dir, started)
        service_book = xlwt.Workbook()
        self.service_result_xsl_book = service_book
        self.service_result_xsl_sheet = service_book.add_sheet(u'服务项目')
        self.service_result_rows = 0

        # Member cards.
        self.membercard_result_semaphore = multiprocessing.Semaphore(1)
        self.membercard_result_xls = '%s/membercards_%d.xls' % (out_dir, started)
        membercard_book = xlwt.Workbook()
        self.membercard_result_xsl_book = membercard_book
        self.membercard_result_xsl_sheet = membercard_book.add_sheet(u'会员卡')
        self.membercard_result_rows = 0

        # Member treatments (raw rows are kept under their own redis key).
        self.member_treat_key = '__member_treats_%s_%d' % (account_uid, started)
        self.member_treat_result_xsl = '%s/membertreats_%d.xls' % (out_dir, started)
        treat_book = xlwt.Workbook()
        self.member_treat_result_xsl_book = treat_book
        self.member_treat_result_xsl_sheet = treat_book.add_sheet(u'疗程项目')
        self.member_treat_rows = 0
Example #4
0
    def close_spider(self, spider):
        """Export the member-treatment rows stored in redis to the spider's workbook.

        Each entry of the redis list ``spider.member_treat_key`` is a JSON
        object with ``hs`` (header titles) and ``vals`` (cell values). The
        headers of the first entry become row 0 while
        ``spider.member_treat_rows`` is still 0; every entry then adds one
        row of values. Finally the workbook is saved and the redis list is
        deleted.
        """
        raw_entries = RedisHelper.get_redis_conn().lrange(spider.member_treat_key, 0, -1)
        for raw_entry in raw_entries or ():
            entry = json.loads(s=raw_entry, encoding="utf-8")
            if spider.member_treat_rows == 0:
                # First record seen: emit the header row once.
                for col, title in enumerate(entry['hs']):
                    spider.member_treat_result_xsl_sheet.write(0, col, title)
                spider.member_treat_rows += 1
            cells = entry['vals']
            row = spider.member_treat_rows
            for col, cell in enumerate(cells):
                spider.member_treat_result_xsl_sheet.write(row, col, cell)
            spider.member_treat_rows += 1

        spider.member_treat_result_xsl_book.save(spider.member_treat_result_xsl)
        log.msg('%s complete write member treat xsl for %s.' % (spider.name, spider.login_username))
        RedisHelper.get_redis_conn().delete(spider.member_treat_key)
def start(options, tp='file'):
    """Launch one crawl thread per account, at most ``options.threads`` at a time.

    Args:
        options: parsed command-line options; reads ``threads``,
            ``workpath``, ``delimiter`` and, depending on ``tp``,
            ``authfile`` or ``auth_key``.
        tp: where the accounts come from. ``'file'`` reads
            ``username<delimiter>password`` lines from ``options.authfile``,
            runs them all and returns. ``'redis'`` pops entries from the
            redis list ``options.auth_key`` forever, skipping accounts that
            are already in the ``running_auths_key`` set.

    Each worker thread is handed the semaphore; ``_start_crawl`` is expected
    to release it when the crawl is finished (not visible here - confirm).
    """
    threads = options.threads
    if threads < 1:
        print(u'threads必须是大于0的整数')
        parser.print_help()
        sys.exit(0)
    workpath = options.workpath
    delimiter = options.delimiter
    semaphore = multiprocessing.Semaphore(threads)
    if tp == 'file':
        authfile = options.authfile
        auths = []
        # Context manager so the file handle is closed once it is read.
        with open(authfile, 'r') as f:
            for line in f:
                line = line.strip()
                auth = line.split(delimiter)
                if len(auth) < 2:
                    print(u'%s 的分隔符与配置不符' % line)
                    continue
                auths.append((auth[0], auth[1]))
        print(u'%s中账号和密码的个数为%d' % (authfile, len(auths)))
        if auths:
            for auth in auths:
                semaphore.acquire()
                threading.Thread(target=_start_crawl,args=(workpath, auth, semaphore)).start()
            # Wait for all workers by taking every permit: this blocks until
            # each running crawl has released its own, without spinning a
            # CPU core and without relying on Semaphore.get_value().
            for _ in range(threads):
                semaphore.acquire()
            print('all done...')
    elif tp == 'redis':
        auth_key = options.auth_key
        rconn=RedisHelper.get_redis_conn()
        while 1:
            info = rconn.lpop(auth_key)
            if not info:
                # Queue is empty: poll again in a second.
                time.sleep(1)
                continue
            auth = info.split(delimiter)
            if len(auth) < 2:
                print(u'%s 的分隔符与配置不符' % info)
                continue
            if rconn.sismember(running_auths_key, auth[0]):
                print(u'%s 该用户名的数据正在抓取中, 请等待本次抓取完成后再试' % info)
                continue
            semaphore.acquire()
            threading.Thread(target=_start_crawl,args=(workpath, (auth[0], auth[1]), semaphore)).start()
    else:
        print(u'不支持的账号信息保存类型')