def wrap(result_dict):
    # 先处理从Excel读取的记录,然后导入数据库
    insert_list = []
    for app_name, value_list in result_dict.items():
        # 单位千
        mau = value_list[0] * 1000
        dau = value_list[1] * 1000
        # 单位次,分钟
        per_capita_daily_startup_counts, per_capita_daily_use_time, app_name_en, target_date, sector = value_list[2:]

        per_startup_use_time = per_capita_daily_use_time / per_capita_daily_startup_counts
        total_startup_counts = dau * per_capita_daily_startup_counts * days_num
        total_use_time = (per_startup_use_time * total_startup_counts) / 60

        insert_list.append([
            app_name, app_name_en, sector, target_date, dau, per_capita_daily_startup_counts, per_startup_use_time,
            mau, per_capita_daily_use_time, total_startup_counts, total_use_time, 'TrustData', str(TODAY),
            '30', 'Mobile', 'APP', 'iOS + Android'])

    sql = (
        'INSERT INTO app(app_name, app_name_en, sector, starting_date, daily_active_user, per_capita_daily_startup_counts, '
        ' per_startup_use_time, active_users, per_capita_daily_use_time, total_startup_counts , total_use_time,'
        ' source, createddate, date_type, app_type, data_type, os) '
        ' VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)')

    upsert_batch(sql, insert_list)
def main(filename):
    # 读取文件
    file_dict = {}
    with open(filename, 'r', encoding='utf8') as a_file:
        for line in a_file:
            fields = line.strip().split('\t')
            app_id, sector, sub_sector, starting_date = fields[0], fields[4], fields[5], fields[7]
            if app_id in file_dict:
                if file_dict[app_id][0] < starting_date:
                    file_dict[app_id] = [starting_date, sector, sub_sector]
            else:
                file_dict[app_id] = [starting_date, sector, sub_sector]
    # 查询数据库
    sql = (
        'SELECT app_id, app_name, COUNT(DISTINCT sector) AS num '
        ' FROM app WHERE source="QuestMobile" GROUP BY app_id')
    data = query(sql)

    update_list = []
    for row in data:
        app_id = row['app_id']
        if row['num'] > 1 and app_id in file_dict:
            update_list.append([file_dict[app_id][1], file_dict[app_id][2], app_id])

    update_sql = ('UPDATE app SET sector=%s, sub_sector=%s WHERE app_id=%s')
    print(len(update_list))
    upsert_batch(update_sql, update_list)
Esempio n. 3
0
def file_to_db(path, pattern=None):
    # 查询已经存在的司机
    driver_data = query(DRIVER_SELECT_SQL)
    driver_set = set(map(lambda row: row["driver_id"], driver_data))
    # 查询本月已经存在的司机
    driver_month_data = query(DRIVER_MONTH_SELECT_SQL % (CURRENT_MONTH))
    driver_month_set = set(map(lambda row: row["driver_id"], driver_month_data))

    for file_name in list_files(path, pattern):
        print(file_name)
        with open(file_name, "r", encoding="utf8") as a_file:
            detail_dict, driver_dict, retention_dict = {}, {}, {}
            for line in a_file:
                fields = line.strip().split("\t")
                length = len(fields)

                if 9 < len(fields) < 13:
                    # 读取字段值,格式转换
                    driver_id, driver_name, license, photo_url, level, longitude, latitude, \
                        order_num_str = fields[:8]
                    capture_dtm, city = fields[-2:]
                    order_num = int(order_num_str)
                    flag_dt = capture_dtm.split(" ")[0]
                    city = city[:2] + re.sub(CITY_RE, "", city[2:])

                    # 确定新司机
                    if driver_id not in driver_set:
                        driver_dict[driver_id] = [driver_id, driver_name, license, photo_url, level, flag_dt]
                        driver_set.add(driver_id)
                    # 确定本月出现司机
                    # if driver_id not in retention_dict:
                    #     retention_dict[driver_id] = [driver_id, city, CURRENT_MONTH]
                    if driver_id not in driver_month_set:
                        retention_dict[driver_id] = [driver_id, city, CURRENT_MONTH]
                        driver_month_set.add(driver_id)
                    # 详细
                    if driver_id in detail_dict:
                        detail_dict[driver_id][2] = min(detail_dict[driver_id][2], order_num)
                        detail_dict[driver_id][3] = max(detail_dict[driver_id][3], order_num)
                        detail_dict[driver_id][4] = min(detail_dict[driver_id][4], capture_dtm)
                        detail_dict[driver_id][5] = max(detail_dict[driver_id][5], capture_dtm)
                    else:
                        detail_dict[driver_id] = [driver_id, city, order_num, order_num,
                            capture_dtm, capture_dtm, flag_dt]

            upsert_batch(NEW_DRIVER_SQL, list(driver_dict.values()))
            print("导入: " + str(len(driver_dict)) + " 个新司机")
            upsert_batch(NEW_RETENTION_SQL, list(retention_dict.values()))
            print("导入: " + str(len(retention_dict)) + " 个本月出现司机")
            upsert_batch(NEW_DETAIL_SQL, list(detail_dict.values()))
            print("导入: " + str(len(detail_dict)) + " details信息")
def update_history(filename):
    # 先读取文件
    file_dict = {}
    min_starting_date = None
    with open(filename, "r", encoding="utf8") as a_file:
        date_set = set()
        for line in a_file:
            fields = line.strip().split("\t")
            app_id, os, starting_date = fields[0], OS_MAPPING[fields[6]], fields[7]
            date_set.add(starting_date)
            key = app_id + "|-|" + os + "|-|" + starting_date

            # 单位转换
            # os
            fields[6] = OS_MAPPING[fields[6]]
            # down_user
            fields[8] = float(fields[8]) * 10
            # active_users
            fields[10] = float(fields[10]) * 10
            # install_user
            fields[12] = float(fields[12]) * 10
            # remove_user
            fields[14] = float(fields[14]) * 10
            # daily_use_time
            fields[16] = float(fields[16]) / 6
            # daily_startup_counts
            fields[17] = float(fields[17]) * 10
            # daily_active_user
            fields[18] = float(fields[18]) * 10
            # total_use_time
            fields[20] = float(fields[20]) / 6
            # total_startup_counts
            fields[24] = float(fields[24]) * 10

            file_dict[key] = fields

        min_starting_date = min(date_set)

    # 读取数据库,找出已存在的应用
    sql = (
        "SELECT DISTINCT app_id, os, starting_date, active_users, app_name FROM app WHERE source='QuestMobile'"
        " AND starting_date >= '%s'") % (min_starting_date)
    data = query(sql)

    db_dict = {}
    for row in data:
        app_id, os, starting_date, active_users, app_name = row['app_id'], row['os'], \
            row['starting_date'], row['active_users'], row['app_name']
        key = app_id + "|-|" + os + "|-|" + str(starting_date)
        db_dict[key] = [active_users, app_name]

    insert_list, update_list = [], []
    for key in file_dict.keys():
        if key in db_dict:
            db_mau = db_dict[key][0]
            file_mau = file_dict[key][10]
            diff = file_mau - db_mau
            # 找出有变化的行
            if diff >= 1:
                # print(key + '\t' + db_dict[key][1] + '\t' + str(db_dict[key][0]) + '\t' + str(file_dict[key][10]))
                temp_list = list(file_dict[key])
                del temp_list[0], temp_list[5], temp_list[5]
                temp_list.extend(key.split("|-|"))
                update_list.append(temp_list)
        else:
            file_dict[key].extend(["QuestMobile", "Mobile", "APP", str(TODAY)])
            insert_list.append(file_dict[key])

    # 插入/更新数据库
    insert_sql = (
        "INSERT INTO app(app_id, app_name, company, app_name_en, sector,"
        " sub_sector, os, starting_date, down_user, down_user_permeability, active_users, "
        " active_users_penetration, install_user, install_user_permeability, remove_user, "
        " remove_user_permeability, daily_use_time, daily_startup_counts, daily_active_user,"
        " daily_active_user_penetration, total_use_time, total_use_time_share, "
        " per_capita_use_time, per_startup_use_time, total_startup_counts, "
        " total_startup_counts_share, per_capita_startup_counts, source, app_type, data_type, createddate)"
        " VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,"
        " %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
    update_sql = (
        "UPDATE app SET"
        " app_name=%s, company=%s, app_name_en=%s, sector=%s, sub_sector=%s,"
        " down_user=%s, down_user_permeability=%s, active_users=%s,"
        " active_users_penetration=%s, install_user=%s, install_user_permeability=%s,"
        " remove_user=%s, remove_user_permeability=%s, daily_use_time=%s, "
        " daily_startup_counts=%s, daily_active_user=%s, daily_active_user_penetration=%s,"
        " total_use_time=%s, total_use_time_share=%s, per_capita_use_time=%s,"
        " per_startup_use_time=%s, total_startup_counts=%s, total_startup_counts_share=%s,"
        " per_capita_startup_counts=%s "
        " WHERE app_id=%s AND os=%s AND source='QuestMobile' AND app_type='Mobile' AND starting_date=%s")
    print("插入数据条数: " + str(len(insert_list)))
    print("更新数据条数: " + str(len(update_list)))
    upsert_batch(insert_sql, insert_list)
    upsert_batch(update_sql, update_list)
def update_history(filename):
    # 先读取文件
    file_dict = {}
    min_starting_date = None
    with open(filename, "r", encoding="utf8") as a_file:
        date_set = set()
        for line in a_file:
            fields = line.strip().split("\t")
            app_id, os, starting_date = fields[0], fields[6], fields[7]
            date_set.add(starting_date)
            key = app_id + "|-|" + OS_MAPPING[os] + "|-|" + starting_date

            # 单位转换
            fields[6] = OS_MAPPING[fields[6]]
            # down_user
            fields[8] = float(fields[8]) * 10
            # active_users
            fields[10] = float(fields[10]) * 10
            # install_user
            fields[12] = float(fields[12]) * 10
            # remove_user
            fields[14] = float(fields[14]) * 10
            # daily_use_time
            fields[16] = float(fields[16]) / 6
            # daily_startup_counts
            fields[17] = float(fields[17]) * 10
            # daily_active_user
            fields[18] = float(fields[18]) * 10
            # total_use_time
            fields[20] = float(fields[20]) / 6
            # total_startup_counts
            fields[24] = float(fields[24]) * 10

            file_dict[key] = fields

        min_starting_date = min(date_set)

    # 读取数据库,找出已存在的应用
    sql = (
        "SELECT app_id, os, starting_date, active_users FROM app WHERE source='QuestMobile'"
        " AND starting_date >= '%s' GROUP BY app_id, os, starting_date") % (min_starting_date)
    data = query(sql)
    # key_set = set([
    #     row["app_id"] + "|-|" + row["os"] + "|-|" + str(row["starting_date"]) for row in data])
    db_dict = {row["app_id"] + "|-|" + row["os"] + "|-|" + str(row["starting_date"]): row['active_users'] for row in data}

    insert_list, update_list = [], []
    for key in file_dict.keys():
        if key in db_dict.keys():
            if math.fabs(file_dict[key][10] - db_dict[key]) >= 1:
                temp_list = list(file_dict[key])
                # 注意删除之后,后续的会排上来
                del temp_list[0], temp_list[5], temp_list[5]
                temp_list.extend(key.split("|-|"))
                update_list.append(temp_list)
        else:
            file_dict[key].extend(["QuestMobile", "Mobile", "APP", str(TODAY)])
            insert_list.append(file_dict[key])

    # 插入/更新数据库
    insert_sql = (
        "INSERT INTO app(app_id, app_name, company, app_name_en, sector,"
        " sub_sector, os, starting_date, down_user, down_user_permeability, active_users, "
        " active_users_penetration, install_user, install_user_permeability, remove_user, "
        " remove_user_permeability, daily_use_time, daily_startup_counts, daily_active_user,"
        " daily_active_user_penetration, total_use_time, total_use_time_share, "
        " per_capita_use_time, per_startup_use_time, total_startup_counts, "
        " total_startup_counts_share, per_capita_startup_counts, source, app_type, data_type, createddate)"
        " VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,"
        " %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
    update_sql = (
        "UPDATE app SET"
        " app_name=%s, company=%s, app_name_en=%s, sector=%s, sub_sector=%s,"
        " down_user=%s, down_user_permeability=%s, active_users=%s,"
        " active_users_penetration=%s, install_user=%s, install_user_permeability=%s,"
        " remove_user=%s, remove_user_permeability=%s, daily_use_time=%s, "
        " daily_startup_counts=%s, daily_active_user=%s, daily_active_user_penetration=%s,"
        " total_use_time=%s, total_use_time_share=%s, per_capita_use_time=%s,"
        " per_startup_use_time=%s, total_startup_counts=%s, total_startup_counts_share=%s,"
        " per_capita_startup_counts=%s "
        " WHERE app_id=%s AND os=%s AND source='QuestMobile' AND app_type='Mobile' AND starting_date=%s")
    print("插入数据条数: " + str(len(insert_list)))
    print("更新数据条数: " + str(len(update_list)))
    upsert_batch(insert_sql, insert_list)
    upsert_batch(update_sql, update_list)