Exemple #1
0
def do_insert(flag_queue, task_queue, res_queue, db_info):
    """Worker loop: take records off task_queue, insert each into the done
    DB, delete it from the todo DB, and report per-record success/failure
    on res_queue.

    :param flag_queue: unused here; kept for a uniform worker signature
    :param task_queue: queue of record tuples to process (record[0] is id)
    :param res_queue: receives True on success, False on failure
    :param db_info: dict with 'sqls', 'done-db' and 'todo-db' settings
    """
    logger.info('Starting: do_insert()')
    insert_done_sql = db_info['sqls']['insert-done']
    delete_todo_sql = db_info['sqls']['delete-todo']

    done_conn = gen_db_conn(db_info['done-db'])
    todo_conn = gen_db_conn(db_info['todo-db'])

    statsd_cfg = cfg['statsd']
    statsd_client = gen_statsd_client(statsd_cfg)
    STATSD_KEY = statsd_cfg['key-insert']

    while True:
        try:
            record = task_queue.get()
            _do_insert(done_conn, insert_done_sql, record)
            execute_sql(todo_conn, delete_todo_sql, (record[0],), commit=True)
            res_queue.put(True)
            statsd_client.incr(STATSD_KEY)
        except KeyboardInterrupt:
            # BUG FIX: without an exit path the cleanup below was
            # unreachable; break on Ctrl-C like loop_insert() does.
            break
        except Exception:
            res_queue.put(False)
            # Recreate both connections: a failed statement may have left
            # them in an unusable state.
            done_conn.close()
            done_conn = gen_db_conn(db_info['done-db'])
            todo_conn.close()
            todo_conn = gen_db_conn(db_info['todo-db'])
            logger.error('loop_insert requests.Exception: %r' % traceback.format_exc())
            time.sleep(0.5)

    done_conn.close()
    todo_conn.close()
    logger.warning('@@@ Exit: do_insert()')
Exemple #2
0
    def insert(cls, initial=False):
        """Insert data into the table.

        :param initial: when True, truncate the table before inserting
        :return: None
        """
        try:
            logger.info(f"start update {cls.table_name}'s data.")
            if initial:
                execute_sql(cls.splice_truncate_sql(cls.table_name))

            df = cls.make_data()
            if df is None:
                return

            logger.info(f"find {cls.table_name}'s {df.shape[0]} records")
            if not df.shape[0]:
                return

            execute_sql(cls.splice_insert_sql(cls.table_name, df))
        except OperationalError as e:
            logger.error(f"Due to exception: {e}, this task failed!")
        else:
            logger.info(f'{cls.table_name} insert {df.shape[0]} records')
Exemple #3
0
    def process_msg(self, body, conn, sql):
        """Extract fields from one task message and insert a record row.

        :param body: decoded message dict with a 'params' sub-dict
        :param conn: DB connection used for the insert
        :param sql: parameterized INSERT statement
        """
        params = body.get('params')

        # task_uuid    = params.get('external_id')
        client_id = params.get('additional_info').get('client_id')
        thunder_hash = params.get('thunder_hash')
        # URL
        url = params.get('url')
        url_loc = url.get('location')
        url_hash = url.get('hash')
        # Seed
        seed_file = params.get('seed_file', {})
        seed_hash = seed_file.get('hash', '')
        swift_path = seed_file.get('path', '')

        algorithm = params.get('digest_algorithm')
        mime_type = params.get('mime_type')
        file_name = params.get('file_name')
        file_size = params.get('file_size')

        # BUG FIX: removed dead `digest = params.get('digest')` — it was
        # unconditionally overwritten here before any use.
        digest = url_hash if url_hash else seed_hash
        record = (cfg['custom-type'], client_id, thunder_hash, url_loc, digest,
                  algorithm, mime_type, file_name, file_size, swift_path)
        try:
            execute_sql(conn, sql, record, commit=True)
        except mdb.IntegrityError:
            # Duplicate record — already stored; ignore.
            pass
Exemple #4
0
def do_insert(flag_queue, task_queue, res_queue, db_info):
    """Worker loop: insert each queued record into the done DB, delete it
    from the todo DB, and report success/failure on res_queue.

    :param flag_queue: unused here; kept for a uniform worker signature
    :param task_queue: queue of record tuples (record[0] is the row id)
    :param res_queue: receives True on success, False on failure
    :param db_info: dict with 'sqls', 'done-db' and 'todo-db' settings
    """
    logger.info('Starting: do_insert()')
    insert_done_sql = db_info['sqls']['insert-done']
    delete_todo_sql = db_info['sqls']['delete-todo']

    done_conn = gen_db_conn(db_info['done-db'])
    todo_conn = gen_db_conn(db_info['todo-db'])

    statsd_cfg = cfg['statsd']
    statsd_client = gen_statsd_client(statsd_cfg)
    STATSD_KEY = statsd_cfg['key-insert']

    while True:
        try:
            record = task_queue.get()
            _do_insert(done_conn, insert_done_sql, record)
            execute_sql(todo_conn, delete_todo_sql, (record[0], ), commit=True)
            res_queue.put(True)
            statsd_client.incr(STATSD_KEY)
        except KeyboardInterrupt:
            # BUG FIX: the loop had no exit, making the cleanup below
            # unreachable; break on Ctrl-C like loop_insert() does.
            break
        except Exception:
            res_queue.put(False)
            # Recreate both connections: a failed statement may have left
            # them in an unusable state.
            done_conn.close()
            done_conn = gen_db_conn(db_info['done-db'])
            todo_conn.close()
            todo_conn = gen_db_conn(db_info['todo-db'])
            logger.error('loop_insert requests.Exception: %r' %
                         traceback.format_exc())
            time.sleep(0.5)

    done_conn.close()
    todo_conn.close()
    logger.warning('@@@ Exit: do_insert()')
Exemple #5
0
    def process_msg(self, body, conn, sql):
        """Extract fields from one task message and insert a record row.

        :param body: decoded message dict with a 'params' sub-dict
        :param conn: DB connection used for the insert
        :param sql: parameterized INSERT statement
        """
        params = body.get('params')

        # task_uuid    = params.get('external_id')
        client_id    = params.get('additional_info').get('client_id')
        thunder_hash = params.get('thunder_hash')
        # URL
        url = params.get('url')
        url_loc  = url.get('location')
        url_hash = url.get('hash')
        # Seed
        seed_file = params.get('seed_file', {})
        seed_hash  = seed_file.get('hash', '')
        swift_path = seed_file.get('path', '')

        algorithm = params.get('digest_algorithm')
        mime_type = params.get('mime_type')
        file_name = params.get('file_name')
        file_size = params.get('file_size')

        # BUG FIX: removed dead `digest = params.get('digest')` — it was
        # unconditionally overwritten here before any use.
        digest = url_hash if url_hash else seed_hash
        record = (cfg['custom-type'], client_id, thunder_hash, url_loc,
                  digest, algorithm,
                  mime_type, file_name, file_size, swift_path)
        try:
            execute_sql(conn, sql, record, commit=True)
        except mdb.IntegrityError:
            # Duplicate record — already stored; ignore.
            pass
Exemple #6
0
def _do_insert(done_conn, insert_done_sql, record):
    """Insert one record into the done DB, then register its hashes with vddb.

    :param done_conn: open connection to the 'done' database
    :param insert_done_sql: parameterized INSERT statement
    :param record: row tuple; record[0] is the row id, record[3] the thunder
        hash, record[5] the digest; record[1:-1] are the INSERT parameters
    """
    rid, thunder_hash, digest = record[0], record[3], record[5]
    try:
        logger.info('inserting mysql %d' % rid)
        execute_sql(done_conn, insert_done_sql, record[1:-1], commit=True)
        logger.info('inserting vddb %d' % rid)
        resp, logs = insert_vddb_tmp(cfg['vddb-async-url'], [thunder_hash, digest])
        if not resp or resp.status_code != 200:
            raise ValueError('Insert result management Failed! <%r>' % (logs,))
        logger.info('inserted mysql %d' % rid)
    except mdb.IntegrityError:
        # Duplicate row: already inserted, treated as success.
        pass
    except Exception:
        # NOTE(review): all other failures are swallowed after logging, so
        # the caller cannot tell them from success — confirm this is intended.
        logger.error('_do_insert requests.Exception: %r' % traceback.format_exc())
        time.sleep(0.5)
Exemple #7
0
    def _id_df(cls):
        """Load common_id_attribution into a DataFrame, seeding the table
        via CommonId.insert() when it is (nearly) empty.

        :return: DataFrame without id/created_time/updated_time columns and
            with 'full_city' normalized for municipality names
        """
        sql = "select * from common_id_attribution"
        data = execute_sql(sql, dict_cursor=True)
        if len(data) <= 1:
            logger.info(
                f'common_id_attribution table is empty, initial data of it.')
            CommonId.insert(initial=True)
            logger.info(f'common_id_attribution table initial data success.')

        df = pd.DataFrame(data)
        df.drop(['id', 'created_time', 'updated_time'], axis=1, inplace=True)

        municipalities = ('北京', '上海', '天津', '重庆')

        def _normalize_city(name):
            # Append '市' for the four municipalities directly under the
            # central government; pass everything else through unchanged.
            if name is not None and name[:2] in municipalities:
                return name[:2] + '市'
            return name

        df['full_city'] = df['full_city'].apply(_normalize_city)
        return df
Exemple #8
0
def _do_insert(done_conn, insert_done_sql, record):
    """Write one record to the done DB and push its hashes to vddb.

    record[0] is the row id, record[3] the thunder hash, record[5] the
    digest; record[1:-1] are the INSERT parameters.
    """
    row_id = record[0]
    hashes = [record[3], record[5]]  # thunder hash, digest
    try:
        logger.info('inserting mysql %d' % row_id)
        execute_sql(done_conn, insert_done_sql, record[1:-1], commit=True)
        logger.info('inserting vddb %d' % row_id)
        resp, logs = insert_vddb_tmp(cfg['vddb-async-url'], hashes)
        if resp and resp.status_code == 200:
            logger.info('inserted mysql %d' % row_id)
        else:
            raise ValueError('Insert result management Failed! <%r>' %
                             (logs, ))
    except mdb.IntegrityError:
        # Duplicate row: already inserted, treated as success.
        pass
    except Exception:
        logger.error('_do_insert requests.Exception: %r' %
                     traceback.format_exc())
        time.sleep(0.5)
Exemple #9
0
 def _exist_numbers(cls):
     """Return the 7-digit phone prefixes already present in
     common_mobile_attribution, for deduplicating new inserts.

     :return: set of existing number prefixes
     """
     rows = execute_sql("select number from common_mobile_attribution",
                        dict_cursor=True)
     return set(pd.DataFrame(rows)['number'].to_list())
Exemple #10
0
 def _last_date(cls):
     """Fetch the most recent date recorded in common_date.

     :return: the latest date, or None when the table is empty
     """
     sql = "select date from common_date order by date desc limit 1"
     rows = execute_sql(sql, dict_cursor=True)
     if rows:
         return rows[0].get('date')
     return None
Exemple #11
0
 def _last_time(cls):
     """Return the date of the newest row in common_mobile_attribution.

     :return: a date, or None when there are no rows or no timestamp
     """
     sql = "select max(created_time) as last_time from common_mobile_attribution"
     rows = execute_sql(sql, dict_cursor=True)
     if not rows:
         return None
     latest = rows[0].get('last_time')
     return latest.date() if latest is not None else None
Exemple #12
0
    def _init_data(cls):
        """Pull historical common_date rows from my_user_profile.DateClass.

        :return: DataFrame with date_str, class, date and weekday columns
        """
        sql = "select distinct date as date_str, class from DateClass"
        rows = execute_sql(sql, db='my_user_profile', dict_cursor=True)

        df = pd.DataFrame(rows)
        df.drop_duplicates(subset=['date_str'], inplace=True)

        def _parse_date(ymd):
            # date_str is formatted as 'YYYYMMDD'.
            return datetime.date(int(ymd[:4]), int(ymd[4:6]), int(ymd[6:]))

        df['date'] = df['date_str'].apply(_parse_date)
        df['weekday'] = df['date'].apply(lambda d: d.weekday())
        return df
Exemple #13
0
    def process_msg(self, body, db_conn, sql):
        """Parse one task message and persist it; when configured, push the
        hashes to vddb first.

        :param body: message payload (JSON string or already-decoded dict)
        :param db_conn: DB connection used for the insert
        :param sql: parameterized INSERT statement
        :raises ValueError: when the vddb insert does not return HTTP 200
        """
        if isinstance(body, (str, unicode)):
            body = json.loads(body)

        params = body.get('params')

        task_uuid = params.get('external_id')
        client_id = params.get('additional_info').get('client_id')
        thunder_hash = params.get('thunder_hash')
        digest = params.get('digest')

        # URL fields
        url = params.get('url')
        url_loc, url_hash = url.get('location'), url.get('hash')

        # Seed-file fields (optional)
        seed_file = params.get('seed_file', {})
        seed_hash = seed_file.get('hash', '')
        swift_path = seed_file.get('path', '')

        if cfg['vddb-async']['should-insert']:
            logger.info('Insert to vddb: %s' % task_uuid)
            resp, logs = insert_vddb_tmp(cfg['vddb-async']['url'],
                                         [thunder_hash, url_hash, seed_hash])
            if not resp or resp.status_code != 200:
                raise ValueError('Insert result management Failed! <%r>' %
                                 (logs, ))

        record = (cfg['custom-type'], client_id, thunder_hash, url_loc, digest,
                  params.get('digest_algorithm'), params.get('mime_type'),
                  params.get('file_name'), params.get('file_size'), swift_path)
        try:
            execute_sql(db_conn, sql, record)
        except mdb.IntegrityError:
            # Duplicate record — already stored; ignore.
            pass
Exemple #14
0
    def process_msg(self, body, db_conn, sql):
        """Decode a task message, optionally register its hashes with vddb,
        then insert the record (duplicate inserts are ignored).

        :param body: message payload (JSON string or already-decoded dict)
        :param db_conn: DB connection used for the insert
        :param sql: parameterized INSERT statement
        :raises ValueError: when the vddb insert does not return HTTP 200
        """
        if isinstance(body, (str, unicode)):
            body = json.loads(body)

        params = body.get('params')
        additional = params.get('additional_info')
        url = params.get('url')
        seed_file = params.get('seed_file', {})

        task_uuid = params.get('external_id')
        thunder_hash = params.get('thunder_hash')
        url_hash = url.get('hash')
        seed_hash = seed_file.get('hash', '')

        if cfg['vddb-async']['should-insert']:
            logger.info('Insert to vddb: %s' % task_uuid)
            resp, logs = insert_vddb_tmp(cfg['vddb-async']['url'], [thunder_hash, url_hash, seed_hash])
            if not resp or resp.status_code != 200:
                raise ValueError('Insert result management Failed! <%r>' % (logs,))

        record = (cfg['custom-type'],
                  additional.get('client_id'),
                  thunder_hash,
                  url.get('location'),
                  params.get('digest'),
                  params.get('digest_algorithm'),
                  params.get('mime_type'),
                  params.get('file_name'),
                  params.get('file_size'),
                  seed_file.get('path', ''))
        try:
            execute_sql(db_conn, sql, record)
        except mdb.IntegrityError:
            # Duplicate record — already stored; ignore.
            pass
Exemple #15
0
def loop_insert(flag_queue, task_queue, res_queue, db_info):
    """Producer loop: fetch pending rows from the todo DB, hand them to
    workers via task_queue and tally per-row results from res_queue.

    :param flag_queue: unused here; kept for a uniform worker signature
    :param task_queue: queue the fetched rows are pushed onto
    :param res_queue: queue of per-row booleans reported by workers
    :param db_info: dict with 'sqls', 'todo-db', 'select-limit' and
        'select-interval' settings
    """
    logger.info('Starting: loop_insert() => %s' % (cfg['vddb-async-url']))

    select_todo_sql = db_info['sqls']['select-todo']
    select_limit = db_info['select-limit']

    todo_conn = gen_db_conn(db_info['todo-db'])

    while True:
        try:
            # Drain stale results left over from a previous round.
            while not res_queue.empty():
                ok = res_queue.get()

            rows = execute_sql(todo_conn,
                               select_todo_sql, (select_limit, ),
                               fetch=True)
            for record in rows:
                logger.info('Deleted mysql %d' % record[0])
                task_queue.put(record)

            time.sleep(1)
            results = []
            results_ok = 0
            rows_len = len(rows)
            for i in range(rows_len):
                try:
                    ok = res_queue.get(timeout=5)
                # BUG FIX: was `except Queue.Empty():` — instantiating the
                # exception class makes the except clause itself raise
                # TypeError the moment a timeout actually occurs.
                except Queue.Empty:
                    ok = False
                if ok:
                    results_ok += 1
                results.append(ok)

            if rows_len == 0:
                # Nothing pending; back off before polling again.
                time.sleep(db_info['select-interval'])
            logger.info('One turn: (%d/%d), %r' %
                        (results_ok, rows_len, results))
        except KeyboardInterrupt:
            break
        except Exception:
            # Recreate the connection: a failed statement may have left it
            # in an unusable state.
            todo_conn.close()
            todo_conn = gen_db_conn(db_info['todo-db'])
            logger.error('loop_insert Exception: %r' % traceback.format_exc())
            time.sleep(0.5)

    todo_conn.close()
    logger.warning('@@@ Exit: loop_insert()')
Exemple #16
0
def loop_insert(flag_queue, task_queue, res_queue, db_info):
    """Producer loop: fetch pending rows from the todo DB, hand them to
    workers via task_queue and tally per-row results from res_queue.

    :param flag_queue: unused here; kept for a uniform worker signature
    :param task_queue: queue the fetched rows are pushed onto
    :param res_queue: queue of per-row booleans reported by workers
    :param db_info: dict with 'sqls', 'todo-db', 'select-limit' and
        'select-interval' settings
    """
    logger.info('Starting: loop_insert() => %s' % (cfg['vddb-async-url']))

    select_todo_sql = db_info['sqls']['select-todo']
    select_limit = db_info['select-limit']

    todo_conn = gen_db_conn(db_info['todo-db'])

    while True:
        try:
            # Drain stale results left over from a previous round.
            while not res_queue.empty():
                ok = res_queue.get()

            rows = execute_sql(todo_conn, select_todo_sql,
                               (select_limit, ), fetch=True)
            for record in rows:
                logger.info('Deleted mysql %d' % record[0])
                task_queue.put(record)

            time.sleep(1)
            results = []
            results_ok = 0
            rows_len = len(rows)
            for i in range(rows_len):
                try:
                    ok = res_queue.get(timeout=5)
                # BUG FIX: was `except Queue.Empty():` — instantiating the
                # exception class makes the except clause itself raise
                # TypeError the moment a timeout actually occurs.
                except Queue.Empty:
                    ok = False
                if ok:
                    results_ok += 1
                results.append(ok)

            if rows_len == 0:
                # Nothing pending; back off before polling again.
                time.sleep(db_info['select-interval'])
            logger.info('One turn: (%d/%d), %r' % (results_ok, rows_len, results))
        except KeyboardInterrupt:
            break
        except Exception:
            # Recreate the connection: a failed statement may have left it
            # in an unusable state.
            todo_conn.close()
            todo_conn = gen_db_conn(db_info['todo-db'])
            logger.error('loop_insert Exception: %r' % traceback.format_exc())
            time.sleep(0.5)

    todo_conn.close()
    logger.warning('@@@ Exit: loop_insert()')
Exemple #17
0
    def _update_user_mobile(cls):
        """
        Read the last day's user mobile numbers (first 7 digits) from
        my_v2.credit_user, dedupe and filter them, then attach phone
        attribution info.
        :return: DataFrame of new prefixes with attribution columns, or
            None when there is nothing to insert
        """
        end = datetime.datetime.now().date()
        start = end - datetime.timedelta(days=1)
        # start/end are generated locally (not user input), so the f-string
        # interpolation here is not an injection vector.
        sql = f"select distinct mobile as number from credit_user where created_time between '{start}' and '{end}'"
        data = execute_sql(sql, db='my_v2', dict_cursor=True)

        if len(data) == 0:
            logger.info(
                f'Not add any new user between {start} and {end} in credit_user table'
            )
            return

        df = pd.DataFrame(data)
        # Keep valid phone numbers only, then reduce each to its 7-digit prefix.
        df = df[df['number'].apply(is_valid_phone)]
        df['number'] = df['number'].apply(lambda x: x[:7])
        df.drop_duplicates(inplace=True)
        df.dropna(inplace=True)

        # Drop prefixes already stored in common_mobile_attribution.
        exist_numbers = cls._exist_numbers()
        df = df[df['number'].apply(lambda x: x not in exist_numbers)]
        logger.info(
            f"New unique numbers: {df.values.tolist()} between {start} and {end}"
        )

        if df.shape[0] == 0:
            logger.info(f"No valid update data for {cls.table_name} table!")
            return

        # _phone_query appears to return a '|'-separated string:
        # city_code|zip_code|short_province|short_city|phone_type
        # (inferred from the split/column order below — confirm against
        # the _phone_query implementation).
        df['phone_info'] = df['number'].apply(cls._phone_query)
        pdf = df['phone_info'].str.split('|', expand=True)
        df['city_code'], df['zip_code'], df['short_province'] = pdf[0], pdf[
            1], pdf[2]
        df['short_city'], df['phone_type'] = pdf[3], pdf[4]
        df.drop(columns=['phone_info'], axis=1, inplace=True)
        return df
Exemple #18
0
    def _init_data(cls):
        """Load historical rows from my_user_profile.PhoneCity as a
        DataFrame with columns renamed to this table's schema.

        :return: DataFrame of valid 7-digit prefixes; empty strings are
            replaced with NaN
        """
        sql = "select distinct code, province, city, corporation, area_code, zip_code from PhoneCity"
        rows = execute_sql(sql, db='my_user_profile', dict_cursor=True)

        df = pd.DataFrame(rows)
        # Map the legacy PhoneCity column names onto our schema.
        df.rename(columns={'code': 'number',
                           'corporation': 'phone_type',
                           'area_code': 'city_code',
                           'province': 'short_province',
                           'city': 'short_city'},
                  inplace=True)
        df = df[df['number'].apply(is_valid_phone_7)]
        df.replace('', np.nan, inplace=True)
        return df
Exemple #19
0
 def existed_id(cls):
     """Return every row of common_id_attribution as a list of dicts."""
     return execute_sql("select * from common_id_attribution",
                        dict_cursor=True)