def close_spider(self, spider):
    """Dump the members collected in Redis into the spider's member workbook.

    Reads every value of the Redis hash ``spider.member_xls_key``, writes
    ``spider.member_headers`` as the first sheet row followed by one row per
    member, saves the workbook to ``spider.member_result_xls`` and deletes the
    Redis hash.

    The username is removed from the ``running_auths_key`` set in a
    ``finally`` clause, so a failure while parsing or writing does not leave
    the account marked as "running".

    :param spider: the spider being closed; must expose the member_* workbook
        attributes, ``name`` and ``login_username``.
    """
    conn = RedisHelper.get_redis_conn()
    try:
        raw_members = conn.hvals(spider.member_xls_key)
        headers = spider.member_headers
        sheet = spider.member_result_xsl_sheet

        # Row 0 carries the column titles; data rows start at 1.
        for col, header in enumerate(headers):
            sheet.write(0, col, header)
        row = 1

        log.msg('%s get members from redis, size: %d' % (spider.name, 0 if not raw_members else len(raw_members)))

        for raw in raw_members or ():
            # UTF-8 is json.loads' default for byte strings, so no explicit
            # ``encoding`` argument is needed.
            member = json.loads(raw)

            # Skip records that lack any of the expected columns.
            if any(header not in member for header in headers):
                continue

            # Keep only the part of the card number before the separator.
            card_no = member[u'卡号']
            if self.card_no_sep in card_no:
                card_no = card_no[:card_no.index(self.card_no_sep)]
            member[u'卡号'] = card_no

            for col, header in enumerate(headers):
                sheet.write(row, col, member[header])
            row += 1

        spider.member_result_xsl_book.save(spider.member_result_xls)
        log.msg('%s complete write member xsl for %s.' % (spider.name, spider.login_username))
        # Only drop the source data once the workbook has been saved.
        conn.delete(spider.member_xls_key)
    finally:
        # Always release the "crawl in progress" marker for this account.
        conn.srem(running_auths_key, spider.login_username)
def item_completed(self, results, item, info):
    """Decode the downloaded verify-code image and publish the code to Redis.

    Stores the successful download results on the item (when the item
    declares ``IMAGES_RESULT_FIELD``), passes the first downloaded image's
    path to ``dama.dama`` until it returns a truthy value, and writes that
    value (as ``str``) to Redis under ``item['key']``.

    :param results: iterable of ``(ok, value)`` pairs for the item's downloads.
    :param item: the item being processed; ``item['key']`` is the Redis key
        that receives the code.
    :param info: unused here.
    :raises DropItem: always; the item is never passed further down the
        pipeline. Raised early when no image was downloaded successfully.
    """
    downloaded = [value for ok, value in results if ok]
    if self.IMAGES_RESULT_FIELD in item.fields:
        item[self.IMAGES_RESULT_FIELD] = downloaded

    log.msg('got verify code result %s' % json.dumps(obj=downloaded, ensure_ascii=False, indent=4))

    # Without a downloaded image there is nothing to decode; fail with a
    # clear message instead of an IndexError on the empty list.
    if not downloaded:
        raise DropItem('verify code image download failed, nothing to decode.')

    path = settings.IMAGES_STORE + '/' + downloaded[0]['path']

    # NOTE(review): this retries without bound or back-off until dama.dama
    # returns a truthy value -- confirm that is intended.
    code = None
    while not code:
        code = dama.dama(path)
    code = str(code)

    log.msg('got code %s' % code)
    RedisHelper.get_redis_conn().set(item['key'], code)
    raise DropItem('drop it.')
def __info_props(self):
    """Set up the per-run state of the spider.

    Reads the credentials from ``self.kwargs``, starts logging to the
    account's log file, registers the username in the ``running_auths_key``
    Redis set, recreates the account's result directory and creates one xlwt
    workbook/sheet (plus row counter and, for most of them, a semaphore) per
    exported data set.
    """
    username = self.kwargs['username']
    self.login_username = username
    log.start(
        logfile=settings.get_log_file(username),
        loglevel=settings.LOG_LEVEL,
        crawler=self.crawler,
    )
    RedisHelper.get_redis_conn().sadd(running_auths_key, username)
    self.login_password = self.kwargs['password']

    started = time.time()
    self.start_time = started
    uid = uuid.uuid3(uuid.NAMESPACE_X500, username)
    self.member_xls_key = '__origin_xls_members_%s_%d' % (uid, started)

    # Result directory: <result_dir>/<username>, wiped and recreated.
    if result_dir.endswith('/'):
        base_dir = result_dir
    else:
        base_dir = '%s/' % (result_dir, )
    out_dir = '%s%s' % (base_dir, username)
    self.res_dir = out_dir
    shutil.rmtree(out_dir, ignore_errors=True)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    def new_sheet(title):
        # One fresh workbook holding a single sheet with the given title.
        book = xlwt.Workbook()
        return book, book.add_sheet(title)

    # Members.
    self.member_result_xls = '%s/members_%d.xls' % (out_dir, started)
    self.member_result_xsl_book, self.member_result_xsl_sheet = new_sheet(u'会员资料')
    self.member_result_rows = 0
    self.member_result_semaphore = multiprocessing.Semaphore(1)
    self.member_origin_result_ready = False
    self.member_headers = [
        u'手机号', u'姓名', u'性别', u'会员分类', u'注册日期', u'卡号',
        u'卡名称', u'卡类型', u'折扣', u'储值总额', u'消费总额', u'卡内总余额',
        u'赠送总余额', u'失效日期', u'消费次数', u'当前积分', u'最后消费日', u'欠款',
    ]

    # Employees.
    self.employee_result_semaphore = multiprocessing.Semaphore(1)
    self.employee_result_xls = '%s/employees_%d.xls' % (out_dir, started)
    self.employee_result_xsl_book, self.employee_result_xsl_sheet = new_sheet(u'员工资料')
    self.employee_result_rows = 0

    # Services.
    self.service_result_semaphore = multiprocessing.Semaphore(1)
    self.service_result_xls = '%s/services_%d.xls' % (out_dir, started)
    self.service_result_xsl_book, self.service_result_xsl_sheet = new_sheet(u'服务项目')
    self.service_result_rows = 0

    # Member cards.
    self.membercard_result_semaphore = multiprocessing.Semaphore(1)
    self.membercard_result_xls = '%s/membercards_%d.xls' % (out_dir, started)
    self.membercard_result_xsl_book, self.membercard_result_xsl_sheet = new_sheet(u'会员卡')
    self.membercard_result_rows = 0

    # Member treatments (buffered in a Redis list keyed by member_treat_key).
    self.member_treat_key = '__member_treats_%s_%d' % (uid, started)
    self.member_treat_result_xsl = '%s/membertreats_%d.xls' % (out_dir, started)
    self.member_treat_result_xsl_book, self.member_treat_result_xsl_sheet = new_sheet(u'疗程项目')
    self.member_treat_rows = 0
def close_spider(self, spider):
    """Write the member-treatment records buffered in Redis to their workbook.

    Each element of the Redis list ``spider.member_treat_key`` is a JSON
    object with ``hs`` (header cells) and ``vals`` (row cells). The header is
    written once, when ``spider.member_treat_rows`` is still 0; every record
    then adds one data row. Afterwards the workbook is saved to
    ``spider.member_treat_result_xsl`` and the Redis list is deleted.

    :param spider: the spider being closed.
    """
    def put_row(row_index, cells):
        # Write ``cells`` left to right into the given sheet row.
        for col, cell in enumerate(cells):
            spider.member_treat_result_xsl_sheet.write(row_index, col, cell)

    entries = RedisHelper.get_redis_conn().lrange(spider.member_treat_key, 0, -1)
    for raw in entries or ():
        record = json.loads(s=raw, encoding="utf-8")

        # The first record seen also supplies the header row.
        if spider.member_treat_rows == 0:
            put_row(0, record['hs'])
            spider.member_treat_rows += 1

        put_row(spider.member_treat_rows, record['vals'])
        spider.member_treat_rows += 1

    spider.member_treat_result_xsl_book.save(spider.member_treat_result_xsl)
    log.msg('%s complete write member treat xsl for %s.' % (spider.name, spider.login_username))
    RedisHelper.get_redis_conn().delete(spider.member_treat_key)
def _parse_auth(raw, delimiter):
    """Split one ``username<delimiter>password`` record.

    Returns a ``(username, password)`` tuple, or ``None`` (after printing a
    warning) when the record has fewer than two fields.
    """
    parts = raw.split(delimiter)
    if len(parts) < 2:
        print(u'%s 的分隔符与配置不符' % raw)
        return None
    return parts[0], parts[1]


def _spawn_crawl(workpath, auth, semaphore):
    """Take one semaphore slot, then run ``_start_crawl`` for *auth* in a new thread."""
    semaphore.acquire()
    threading.Thread(target=_start_crawl, args=(workpath, auth, semaphore)).start()


def start(options, tp='file'):
    """Start one crawl per account, at most ``options.threads`` at a time.

    :param options: parsed command-line options; reads ``threads``,
        ``workpath``, ``delimiter`` and, depending on *tp*, ``authfile`` or
        ``auth_key``.
    :param tp: where the accounts come from. ``'file'`` reads every line of
        ``options.authfile``, starts all crawls and returns once the
        semaphore is fully released again. ``'redis'`` pops records from the
        Redis list ``options.auth_key`` forever, skipping accounts that are
        already in the ``running_auths_key`` set. Any other value only
        prints an error message.

    Exits the process via ``sys.exit(0)`` when ``options.threads`` is less
    than 1.
    """
    threads = options.threads
    if threads < 1:
        print(u'threads必须是大于0的整数')
        parser.print_help()
        sys.exit(0)

    workpath = options.workpath
    delimiter = options.delimiter
    # Each crawl holds one slot; presumably _start_crawl releases it when
    # the crawl finishes -- verify against its definition.
    semaphore = multiprocessing.Semaphore(threads)

    if tp == 'file':
        authfile = options.authfile
        auths = []
        # ``with`` guarantees the auth file is closed, even on errors.
        with open(authfile, 'r') as auth_fh:
            for line in auth_fh:
                line = line.strip()
                if not line:
                    # Blank lines are not malformed records; skip quietly.
                    continue
                auth = _parse_auth(line, delimiter)
                if auth is not None:
                    auths.append(auth)
        print(u'%s中账号和密码的个数为%d' % (authfile, len(auths)))

        for auth in auths:
            _spawn_crawl(workpath, auth, semaphore)

        # Wait until every slot has been handed back. Sleep between polls
        # instead of spinning so the wait does not burn a CPU core.
        while semaphore.get_value() != threads:
            time.sleep(0.1)
        print('all done...')
    elif tp == 'redis':
        auth_key = options.auth_key
        rconn = RedisHelper.get_redis_conn()
        while 1:
            info = rconn.lpop(auth_key)
            if not info:
                # Queue is empty; poll again in a second.
                time.sleep(1)
                continue
            auth = _parse_auth(info, delimiter)
            if auth is None:
                continue
            if rconn.sismember(running_auths_key, auth[0]):
                print(u'%s 该用户名的数据正在抓取中, 请等待本次抓取完成后再试' % info)
                continue
            _spawn_crawl(workpath, auth, semaphore)
    else:
        print(u'不支持的账号信息保存类型')