import json
import math
import multiprocessing
import platform
import shutil
import time
import traceback
from multiprocessing import Pool
from multiprocessing.pool import ApplyResult
from typing import List, MutableMapping, Optional, Set

import selfusepy
from sqlalchemy.engine import ResultProxy
from urllib3.response import HTTPResponse

# Project-local names (log, DBSession, UserProfileDO, AV, AVInfoDO, AVStatDO,
# _file, _s3, _email, email_to_addr, local_processing, read_file, analyze,
# cpu_use_number) are assumed to come from this repository's own modules.


def update_user_fans():
    log.info("--------update fans running--------")
    last_timestamp: int = 0
    update_delta: int = 24 * 60 * 60  # refresh fan counts once a day
    try:
        while True:
            timestamp: int = int(time.time())
            if timestamp - last_timestamp >= update_delta:
                log.info("----------update fans----------")
                # reset per cycle so each JSON dump only contains this run's records
                file: List[dict] = []
                session = DBSession()
                mids: Set[int] = set()
                sql: str = 'select mid from "user"'
                res: ResultProxy = session.execute(sql)
                for item in res.fetchall():
                    mids.add(int(item[0]))
                log.info("mids: %s" % len(mids))
                for i, v in enumerate(mids):
                    # defined up front so the except-block log cannot NameError
                    user: Optional[UserProfileDO] = None
                    try:
                        resp: HTTPResponse = selfusepy.get(
                            'http://api.bilibili.com/x/web-interface/card', mid = v)
                        j: dict = json.loads(resp.data)
                        follower = j["data"]["follower"]
                        # validate before int(): the original checked after the
                        # conversion, where the value can never be None
                        if follower is None:
                            raise Exception("mid: %s, fans cannot be None" % v)
                        fans: int = int(follower)
                        user = session.query(UserProfileDO).filter(
                            UserProfileDO.mid == v).first()
                        if user is None:
                            raise Exception("mid: %s, no matching user in db" % v)
                        # capture before overwriting so the dump keeps the real previous value
                        former_fans = user.fans
                        log.info("i: %s, mid: %s, former fans: %s, fans: %s, delta: %s"
                                 % (i, v, former_fans, fans,
                                    fans - former_fans if former_fans is not None else fans))
                        user.fans = fans
                        session.commit()
                        file.append({
                            "mid": v,
                            "former_fans": former_fans,
                            "fans": fans
                        })
                        time.sleep(2)  # throttle requests against the bilibili API
                    except BaseException as e:
                        log.info("mid: %s, user: %s" % (v, user))
                        raise e
                session.close()
                last_timestamp = timestamp
                file_name = "fans-%s.json" % timestamp
                file_path = "data-temp/%s" % file_name
                _file.save(json.dumps(file), file_path)
                _s3.put({file_name: file_path})
                log.info("----------update fans end----------")
            else:
                time.sleep(10)
    except BaseException as e:
        log.exception(e)
        if platform.system() != "Windows":
            _email.send(email_to_addr, traceback.format_exc())
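# Usage sketch (an assumption, not original wiring): update_user_fans() blocks in
# an endless loop, so callers would typically run it on a daemon thread:
#
#   import threading
#   threading.Thread(target = update_user_fans, daemon = True).start()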
def main():
    """
    Testing requires adjusting the database, deleting from S3 and clearing the archive directory.
    :return:
    """
    temp_file_dir = 'data-temp/'

    # download data
    log.info("Getting objects' keys")
    keys: Set[str] = _s3.get_all_objects_key()
    if len(keys) < 1:
        log.info("No file in COS!")
        exit(0)
    else:
        local_processing.multi_download(temp_file_dir, keys)
    if not _s3.archive_object(keys):
        log.error("Archive objects failed")
        exit(0)
    log.info("Download files, DONE.")

    # reading data
    all_data: MutableMapping[str, AV] = read_file(temp_file_dir)
    log.info("Analyze")

    # multi-process analyze: split all_data into roughly cpu_use_number chunks
    pool = Pool(processes = cpu_use_number)
    q = multiprocessing.Manager().Queue()
    size = int(math.ceil(len(all_data) / float(cpu_use_number)))
    map_temp: MutableMapping[str, AV] = {}
    res: List[ApplyResult] = []
    for key, value in all_data.items():
        map_temp[key] = value
        if len(map_temp) % size == 0:
            res.append(pool.apply_async(func = analyze, args = (q, map_temp,)))
            map_temp = {}
    if map_temp:  # submit the remainder; skip when the last chunk is empty
        res.append(pool.apply_async(func = analyze, args = (q, map_temp,)))
    pool.close()
    pool.join()
    if q.qsize() > 0:
        # a non-empty queue means a worker process hit an error: raise and abort the job
        log.error('analyze encountered an error')
        raise Exception(q)

    # saving
    all_avinfos: List[AVInfoDO] = []
    all_avstats: List[AVStatDO] = []
    for item in res:
        v = item.get()
        all_avinfos.extend(v[0])
        all_avstats.extend(v[1])

    # remove avinfos that already exist in the db, plus duplicates within this batch
    log.info("Remove duplicated avinfo")
    temp: Set[int] = set()
    # db
    for item in all_avinfos:
        temp.add(item.aid)
    session = DBSession()
    sql: str = "select aid from av_info where aid in (%s)" % ",".join("%s" % item for item in temp)
    aids: ResultProxy = session.execute(sql)
    temp.clear()
    for item in aids.fetchall():
        temp.add(int(item[0]))
    temp2: List[AVInfoDO] = []
    # program
    for item in all_avinfos:
        if item.aid not in temp:
            temp2.append(item)
            temp.add(item.aid)
    all_avinfos = temp2

    # db
    log.info("Save infos(%s) and stats(%s)" % (len(all_avinfos), len(all_avstats)))
    session.bulk_save_objects(all_avinfos)
    session.bulk_save_objects(all_avstats)
    session.commit()

    # archive: move each data directory (the path prefix before "/online") away
    log.info("Archive")
    for item in all_data.keys():
        index: int = item.find("/online")
        shutil.move(item[:index], "D:/spider archive")
    log.info('[Done]')
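# Entry-point sketch, assuming this module is meant to be executed directly;
# the original file may wire main() up elsewhere.
if __name__ == '__main__':
    main()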