# Example #1
# 0
def update_user_fans():
    """Daemon loop: once per 24h, refresh the follower count of every user.

    For each mid found in the "user" table, fetch the current follower
    count from the bilibili card API, persist it on the UserProfileDO row,
    dump the per-user before/after snapshot to a local JSON file, and
    upload that file to S3. On any unhandled error, log it and (off
    Windows) send the traceback by e-mail.
    """
    log.info("--------update fans running--------")
    last_timestamp: int = 0
    update_delta: int = 24 * 60 * 60  # one day, in seconds
    try:
        while True:
            timestamp: int = int(time.time())
            if timestamp - last_timestamp >= update_delta:
                log.info("----------update fans----------")
                session = DBSession()
                # Fresh list every cycle — the original accumulated entries
                # across days, so each dump contained all previous days too.
                # Renamed from `file` to avoid shadowing the builtin.
                records: List[dict] = []

                mids: Set[int] = set()
                sql: str = 'select mid from "user"'
                res: ResultProxy = session.execute(sql)
                for item in res.fetchall():
                    mids.add(int(item[0]))
                log.info("mids: %s" % len(mids))

                # Pre-bind so the except-logger below can never NameError
                # when the failure happens before `user` is assigned.
                user = None
                try:
                    for i, v in enumerate(mids):
                        try:
                            mid = {'mid': v}
                            # Separate name: the original reused `res` with a
                            # second, unrelated type annotation.
                            http_res: HTTPResponse = selfusepy.get(
                                'http://api.bilibili.com/x/web-interface/card',
                                **mid)
                            j: dict = json.loads(http_res.data)
                            follower = j["data"]["follower"]
                            # Check BEFORE int(): the original checked after
                            # the conversion, where int(None) had already
                            # raised, making the check dead code.
                            if follower is None:
                                raise Exception("mid: %s, fans can not be none" %
                                                v)
                            fans: int = int(follower)
                            user: UserProfileDO = session.query(
                                UserProfileDO).filter(
                                    UserProfileDO.mid == v).first()
                            # Capture BEFORE overwriting — the original read
                            # user.fans again after `user.fans = fans`, so the
                            # dumped "former_fans" always equaled "fans".
                            former_fans = user.fans
                            log.info(
                                "i: %s, mid: %s, former fans: %s, fans: %s, delta: %s"
                                % (i, v, former_fans, fans, fans -
                                   former_fans if former_fans is not None else fans))
                            user.fans = fans
                            session.commit()
                            records.append({
                                "mid": v,
                                "former_fans": former_fans,
                                "fans": fans
                            })
                            time.sleep(2)  # throttle the public API
                        except BaseException as e:
                            log.info("mid: %s, user: %s" % (v, user))
                            raise e
                finally:
                    # Close even when a per-item exception propagates — the
                    # original leaked the session in that case.
                    session.close()

                last_timestamp = timestamp
                file_name = "%s.json" % ("%s-%s" % ("fans", timestamp))
                file_path = "data-temp/%s" % file_name
                _file.save(json.dumps(records), file_path)
                _s3.put({file_name: file_path})
                log.info("----------update fans end----------")
            else:
                time.sleep(10)  # poll the clock until the next daily window
    except BaseException as e:
        log.exception(e)
        import traceback
        if platform.system() != "Windows":
            # presumably the Windows box is a dev machine — TODO confirm
            _email.send(email_to_addr, traceback.format_exc())
# Example #2
# 0
def main():
  """
  Pipeline: download raw spider files from COS/S3, analyze them in
  parallel worker processes, de-duplicate the resulting AV infos against
  the database and within this batch, bulk-save everything, then archive
  the local data directories.

  Testing requires adjusting the database, S3 deletion and the archive
  directory.
  :return:
  """
  temp_file_dir = 'data-temp/'

  # download data
  log.info("Getting objects' keys")
  keys: Set[str] = _s3.get_all_objects_key()

  if len(keys) < 1:
    log.info("No file in COS!")
    exit(0)
  else:
    local_processing.multi_download(temp_file_dir, keys)
    if not _s3.archive_object(keys):
      log.error("Archive objects failed")
      # Non-zero exit — the original exited 0 here, reporting success
      # to the caller on a failure path.
      exit(1)
    log.info("Download files, DONE.")

  # reading data
  all_data: MutableMapping[str, AV] = read_file(temp_file_dir)

  log.info("Analyze")
  # multi analyze: split all_data into cpu_use_number roughly-equal chunks
  pool = Pool(processes = cpu_use_number)
  q = multiprocessing.Manager().Queue()

  # max(..., 1) guards the modulo below against size == 0 (ZeroDivisionError)
  # when all_data is empty.
  size = max(1, int(math.ceil(len(all_data) / float(cpu_use_number))))
  map_temp: MutableMapping[str, AV] = {}

  res: List[ApplyResult] = list()
  for key, value in all_data.items():
    map_temp[key] = value
    if len(map_temp) % size == 0:
      res.append(pool.apply_async(func = analyze, args = (q, map_temp,)))
      map_temp = {}
  # Flush the remainder chunk only when non-empty — the original always
  # spawned one final task, even on an empty dict.
  if map_temp:
    res.append(pool.apply_async(func = analyze, args = (q, map_temp,)))
  pool.close()
  pool.join()
  if q.qsize() > 0:  # any queued item means a worker process raised
    log.error('analyze occurs error')
    # Drain the queue so the exception carries the actual worker errors —
    # the original raised with the Queue proxy itself, whose repr says nothing.
    errors = []
    while not q.empty():
      errors.append(q.get())
    raise Exception(errors)

  # saving
  all_avinfos: List[AVInfoDO] = []
  all_avstats: List[AVStatDO] = []
  for item in res:
    v = item.get()
    all_avinfos.extend(v[0])
    all_avstats.extend(v[1])

  # remove avinfos which exist in db already and same in program
  log.info("Remove duplicated avinfo")
  temp: Set[int] = {item.aid for item in all_avinfos}  # candidate aids
  session = DBSession()
  if temp:
    sql: str = "select aid from av_info where aid in (%s)" % ",".join("%s" % item for item in temp)
    aids: ResultProxy = session.execute(sql)
    # Rebind temp to the aids already present in the db.
    temp = {int(item[0]) for item in aids.fetchall()}
  # else: temp stays empty — the original would have issued invalid
  # SQL ("in ()") for an empty batch.

  temp2: List[AVInfoDO] = []  # program-side dedupe, first occurrence wins
  for item in all_avinfos:
    if item.aid not in temp:
      temp2.append(item)
      temp.add(item.aid)
  all_avinfos = temp2

  # db
  log.info("Save infos(%s) and stats(%s)" % (len(all_avinfos), len(all_avstats)))
  session.bulk_save_objects(all_avinfos)
  session.bulk_save_objects(all_avstats)
  session.commit()
  session.close()  # the original never closed this session

  # archive
  log.info("Archive")
  for item in all_data.keys():
    # keys look like "<dir>/online..."; move the directory prefix.
    # NOTE(review): item.find returning -1 would truncate the path — assumes
    # every key contains "/online"; verify against read_file's key format.
    index: int = item.find("/online")
    shutil.move(item[:index], "D:/spider archive")

  log.info('[Done]')