def merge(start, end):
    sql = (
        "SELECT app_name, sector, sub_sector, os, active_users, daily_active_user,"
        " total_use_time, starting_date FROM app WHERE source = 'QuestMobile' AND data_type='APP'"
        " AND starting_date >= '%s' AND starting_date <= '%s'") % (start, end)
    data = query(sql)

    # value layout: [app_name, sector, sub_sector, starting_date, mau, dau, time]
    data_dict = {}
    for row in data:
        app_name, sector, sub_sector, active_users, daily_active_user, total_use_time, \
            starting_date = row["app_name"], row["sector"], row["sub_sector"], \
            row["active_users"], row["daily_active_user"], row["total_use_time"], \
            row["starting_date"]
        key = app_name + str(starting_date)
        if key in data_dict:
            data_dict[key][4] += active_users
            data_dict[key][5] += daily_active_user
            data_dict[key][6] += total_use_time
        else:
            data_dict[key] = [
                app_name, sector, sub_sector, starting_date, active_users,
                daily_active_user, total_use_time]

    insert_sql = (
        "INSERT INTO questmobile_merge(app_name, sector, sub_sector, starting_date,"
        " mau, dau, time) VALUES(%s, %s, %s, %s, %s, %s, %s)")
    insert_batch(insert_sql, list(data_dict.values()))
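# All of the examples on this page share a few helpers -- query(),
# insert_batch() and list_files() -- whose definitions are not shown here.
# Below is a minimal sketch of the two database helpers, assuming pymysql;
# the connection settings are placeholders, not taken from the original code.
import pymysql

def _connect():
    # Hypothetical connection settings; adjust for the real environment
    return pymysql.connect(host="localhost", user="root", password="",
                           database="crawler", charset="utf8mb4",
                           cursorclass=pymysql.cursors.DictCursor)

def query(sql, args=None):
    # Run a SELECT and return all rows as dicts (matching row["col"] access above)
    conn = _connect()
    try:
        with conn.cursor() as cursor:
            cursor.execute(sql, args)
            return cursor.fetchall()
    finally:
        conn.close()

def insert_batch(sql, rows):
    # Send the whole batch with executemany() and commit once
    if not rows:
        return
    conn = _connect()
    try:
        with conn.cursor() as cursor:
            cursor.executemany(sql, rows)
        conn.commit()
    finally:
        conn.close()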
# Example 2
def main(fpath, createddate, pattern=None):
    sql = (
        "INSERT INTO soufang(title, serial_num, AREA, release_date, is_individual, city, "
        " url, picture_num, Tag, capture_datetime, createddate) "
        " VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")

    for filename in list_files(fpath, pattern):
        with open(filename, "r", encoding="utf8") as a_file:
            # Skip the file header
            next(a_file)
            insert_list, deduplicate_set = [], set()
            for line in a_file:
                fields = line.strip().split(",")
                # Filter
                if filters(fields):
                    fields.append(createddate)
                    url = fields[6]
                    # Deduplicate on URL
                    if url in deduplicate_set:
                        continue
                    insert_list.append(fields)
                    deduplicate_set.add(url)
            # Load into the database
            print("Total: " + str(len(insert_list)) + " records")
            insert_batch(sql, insert_list)
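# list_files() is the other assumed helper; a plausible sketch using os.walk
# and fnmatch is shown below (an illustration, not the original implementation):
import os
from fnmatch import fnmatch

def list_files(path, pattern=None):
    # Yield full paths of the files under `path`, optionally filtered by a glob pattern
    for root, _dirs, names in os.walk(path):
        for name in names:
            if pattern is None or fnmatch(name, pattern):
                yield os.path.join(root, name)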
# Example 3
def gj_haoche(path, pattern=None):
    sql = ("INSERT INTO Haoche(city,title,carddate,mileage,price,newcarprice,code,checker,"
        " checkdate,url,createdAt,updatedAt) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        " ON DUPLICATE KEY UPDATE updatedAt=VALUES(updatedAt), price=VALUES(price)")

    for file_name in list_files(path, pattern):
        with open(file_name, "r", encoding="utf8") as a_file:
            print(file_name + " starts to insert...")
            insert_list, row_count = [], 0
            for line in a_file:
                fields = line.strip().split(",")
                if len(fields) == 12:
                    # Increment the counter
                    row_count += 1
                    # Extract fields (updatedAt is the last of the 12 columns)
                    (city, title, raw_carddate, mileage, price, newcarprice, code,
                        checker, checkdate, url, createdAt, updatedAt) = fields
                    carddate = datetime.strptime(raw_carddate, "%Y-%m") if raw_carddate else None
                    insert_list.append([city, title, carddate, mileage, price, newcarprice, code,
                        checker, checkdate, url, createdAt, updatedAt])
                    # Flush every 100,000 rows and reset the counter
                    if row_count >= 100000:
                        insert_batch(sql, insert_list)
                        print("100000 inserted...")
                        insert_list, row_count = [], 0
            # Insert the final partial batch
            insert_batch(sql, insert_list)
            print("Last ", len(insert_list), "inserted...")
# Example 4
def main(path, pattern=None):
    sql = (
        "INSERT INTO Haoche(city,title,carddate,mileage,price,newcarprice,code,checker,"
        " checkdate,url,createdAt,updatedAt) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        " ON DUPLICATE KEY UPDATE updatedAt=VALUES(updatedAt), price=VALUES(price)")

    for file_name in list_files(path, pattern):
        with open(file_name, "r", encoding="utf8") as a_file:
            # Extract the date from the file name
            match = re.search(r"(\d{4}-\d{2}-\d{2})", file_name)
            if not match:
                # Skip files without a date; updatedAt would otherwise be
                # unbound or carried over from the previous file
                continue
            updatedAt = match.group(0)
            # Skip the header row
            next(a_file)
            insert_list = []
            for line in a_file:
                fields = line.strip().split(",")
                if len(fields) == 12:
                    # Extract fields
                    carddate = datetime.strptime(fields[2], "%Y-%m") if fields[2] else None
                    city, title, mileage, price, newcarprice, code, checker, \
                        checkdate, url = fields[0], fields[1], \
                        fields[3], fields[4], fields[5], fields[6], fields[7], fields[8], \
                        fields[9]
                    # Filter out invalid rows
                    if filters(url, code, checkdate):
                        insert_list.append([
                            city, title, carddate, mileage, price, newcarprice, code,
                            checker, checkdate, url, updatedAt, updatedAt])
            # Filtering done; load into the database
            insert_batch(sql, insert_list)
# Example 5
def kuaidi(path, pattern=None):
    sql = (
        "INSERT INTO kuaidi(capture_dtm, city, longitude_request, latitude_request, "
        " longitude_car, latitude_car, driver_id, driver_type, car_type, create_dt, flag_dt)"
        " values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")

    for file_name in list_files(path, pattern):
        with open(file_name, "r", encoding="utf8") as a_file:
            print(file_name + " starts to insert....")
            insert_list, row_count = [], 0
            for line in a_file:
                fields = line.strip().split(",")
                if len(fields) == 10:
                    # Increment the counter
                    row_count += 1
                    # Extract fields
                    capture_dtm, city, longitude_request, latitude_request, longitude_car, \
                        latitude_car, driver_id, driver_type, car_type = fields[0], fields[2], \
                        fields[3], fields[4], fields[5], fields[6], fields[7], fields[8], fields[9]
                    city = fields[2][:2] + re.sub(r"\d|,|市|南|北|上|下|\r|\n", "", fields[2][2:])
                    insert_list.append([
                        capture_dtm, city, longitude_request, latitude_request, longitude_car,
                        latitude_car, driver_id, driver_type, car_type, TODAY,
                        capture_dtm.split(" ")[0]])
                    # Flush every 100,000 rows and reset the counter
                    if row_count > 100000:
                        insert_batch(sql, insert_list)
                        insert_list, row_count = [], 0
                        print("100000 inserted...")
            # Insert the final partial batch
            insert_batch(sql, insert_list)
            print("Last ", len(insert_list), "inserted...")
# Example 6
def local_ganji(fpath, pattern=None):
    for filename in list_files(fpath, pattern):
        sql = (
            "INSERT INTO {table} VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,"
            " %s, %s, %s, %s)")
        # Row-matching regex
        regex = re.compile(r"\('([^\)]+)\)[,;]+")
        # Dict used for deduplication
        a_dict = {}
        with open(filename, "r", encoding="utf8") as a_file:
            print(filename + " starts to insert...", flush=True)
            for line in a_file:
                # Match the table name
                if line.startswith("LOCK TABLES"):
                    table = re.search(r"`(.+)`", line).group(0).strip("`")
                    sql = sql.format(table=table)
                # Match data rows
                if line.startswith("INSERT INTO"):
                    match = regex.findall(line)
                    for row in match:
                        fields = row.replace("'", "").split(",")
                        # Discard records the regex failed to parse (very few)
                        if len(fields) not in (14, 15):
                            continue
                        # Discard records with tt == 1 (greatly reduces the record count)
                        if fields[9] == "1":
                            continue
                        if len(fields) == 14:
                            # A field was added on Nov 4; pad older rows at index 11
                            fields.insert(11, None)
                            # mid - fields[0], createdDate - fields[14]
                            key = fields[0] + "-" + fields[14] + "-" + fields[9]
                        else:
                            # puid - fields[11], createdDate - fields[14]
                            key = fields[11] + "-" + fields[14] + "-" + fields[9]
                        if key not in a_dict:
                            a_dict[key] = fields
            insert_batch(sql, list(a_dict.values()))
            print("Last ", len(a_dict), " inserted...", flush=True)
# Example 7
def ershoufang(fpath, pattern=None):
    for filename in list_files(fpath, pattern):
        sql = (
            "INSERT INTO {table} VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,"
            " %s, %s, %s, %s)")
        # Row-matching regex
        regex = re.compile(r"\(([^\)]+)\)[,;]+")
        # Dict used for deduplication
        a_dict = {}
        with open(filename, "r", encoding="utf8") as a_file:
            print(filename + " starts to insert...", flush=True)
            for line in a_file:
                # Match the table name
                if line.startswith("LOCK TABLES"):
                    table = re.search(r"`(.+)`", line).group(0).strip("`")
                    sql = sql.format(table=table)
                    print(sql)
                # Match data rows
                if line.startswith("INSERT INTO"):
                    match = regex.findall(line)
                    for row in match:
                        fields = row.replace("'", "").split(",")
                        # Discard records the regex failed to parse (very few)
                        if len(fields) != 15:
                            print(fields, flush=True)
                            continue
                        # Discard records with tt == 1 (greatly reduces the record count)
                        if fields[9] == "1":
                            continue
                        # If puid is NULL, recover it from the URL
                        if fields[11] == "NULL":
                            puid_match = re.search(r"&entinfo=(\d+)", fields[8])
                            if puid_match:
                                puid = puid_match.group(1)
                            else:
                                print("Both puid and the URL are empty", flush=True)
                                continue
                        else:
                            puid = fields[11]
                        # puid - fields[11], createdDate - fields[14]
                        key = puid + "-" + fields[14] + "-" + fields[9]
                        if key not in a_dict:
                            a_dict[key] = fields
            insert_batch(sql, list(a_dict.values()))
            print("Last ", len(a_dict), " inserted...", flush=True)
# Example 8
def wuba_job(fpath, pattern=None):
    for filename in list_files(fpath, pattern):
        sql = (
            "INSERT INTO {table} VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,"
            " %s, %s, %s, %s)")
        # Row-matching regex
        regex = re.compile(r"\(([^\)]+)\)[,;]+")
        merchants_ids_set = get_merchant_ids()
        print(len(merchants_ids_set))
        # Dict used for deduplication
        a_dict = {}
        with open(filename, "r", encoding="utf8") as a_file:
            print(filename + " starts to insert...", flush=True)
            # Match data rows
            insert_list, line_count = [], 0
            for line in a_file:
                # Match the table name
                if line.startswith("LOCK TABLES"):
                    table = re.search(r"`(.+)`", line).group(0).strip("`")
                    sql = sql.format(table=table)
                    print(sql)
                if line.startswith("INSERT INTO"):
                    # Increment the counter
                    line_count += 1
                    match = regex.findall(line)
                    for row in match:
                        fields = row.replace("'", "").split(",")
                        # Discard records the regex failed to parse (very few)
                        if len(fields) != 15:
                            print(fields, flush=True)
                            continue
                        # Filter out known merchant ids with tt == 1
                        if fields[0] in merchants_ids_set and fields[9] == "1":
                            continue
                        # puid - fields[11], createdDate - fields[14]
                        key = fields[11] + "-" + fields[14] + "-" + fields[9]
                        if key in a_dict:
                            continue
                        a_dict[key] = fields
                        insert_list.append(fields)
                    if line_count >= 3000:
                        print("To insert " + str(len(insert_list)) + " records")
                        insert_batch(sql, insert_list)
                        insert_list, line_count = [], 0

            insert_batch(sql, insert_list)
            print("Last ", len(insert_list), " inserted...", flush=True)
# Example 9
def main(fpath, pattern=None):
    for filename in list_files(fpath, pattern):
        sql = (
            "INSERT INTO cheyipai_b2b(auc_Id, car_Id, brand, manufacturer, model, final_offer,"
            " winner_id, winner_id_nonlocal, mileage, reg_dt, reg_area,reg_area_total, "
            " car_source_id, root_id, root_name, capture_dtm)"
            " values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
        with open(filename, "r", encoding="utf8") as a_file:
            print(filename)
            next(a_file)
            insert_list = []
            for line in a_file:
                fields = line.strip().split(",")
                insert_list.append(fields)
            insert_batch(sql, insert_list)
# Example 10
def main(fpath, capture_dt, pattern=None):
    sql = "INSERT INTO iwjw(url, city, NAME, pics, video, capture_dt) " " VALUES(%s, %s, %s, %s, %s, %s)"

    for filename in list_files(fpath, pattern):
        with open(filename, "r", encoding="utf8") as a_file:
            # Skip the file header
            next(a_file)
            insert_list, key_set = [], set()
            for line in a_file:
                fields = line.split(",")[:5]
                if fields[0] not in key_set:
                    fields[1] = CITY_DICT.get(fields[1])
                    fields.append(capture_dt)
                    insert_list.append(fields)
                    key_set.add(fields[0])
            insert_batch(sql, insert_list)
# Example 11
def new_version(path, table_name, pattern=None):
    new_sql = (
        "INSERT INTO " + table_name +
        "(driver_id, city, status, driver_type, flag_dt) values(%s, %s, %s, %s, %s)")
    for file_name in list_files(path, pattern):
        # Extract the date from the file name
        match = re.search(r"(\d{4}-\d{2}-\d{2})", file_name)
        if not match:
            # Skip files without a date; starting_date would otherwise be
            # unbound or carried over from the previous file
            continue
        starting_date = match.group(0)
        with open(file_name, "r", encoding="utf8") as a_file:
            print(file_name + " starts to insert....", flush=True)
            insert_list = []
            for line in a_file:
                fields = line.strip().split(",")
                if len(fields) == 4:
                    fields[1] = fields[1][:2] + re.sub(r"\d|,|市|南|北|上|下|\r|\n", "", fields[1][2:])
                    fields.append(starting_date)
                    insert_list.append(fields)
            insert_batch(new_sql, insert_list)
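# The city clean-up used in several loaders keeps the first two characters
# untouched, so that names like 上海 or 北京 are not mangled by the 上/北 filter,
# and then strips digits, commas, a trailing 市, and stray line breaks from the
# rest. A self-contained illustration (clean_city is a name introduced here):
import re

def clean_city(raw):
    return raw[:2] + re.sub(r"\d|,|市|南|北|上|下|\r|\n", "", raw[2:])

print(clean_city("上海市\n"))  # -> 上海
print(clean_city("石家庄市"))   # -> 石家庄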
# Example 12
def main(fpath, capture_dt, pattern=None):
    sql = (
        "INSERT INTO lianjia(id, url, title, pics, is_exclusive, community, house_type, AREA, "
        " orientation, price, down_payment, monthly_payment, deals_done, agent_comments, "
        " customer_visits, route, city, capture_dt) "
        " VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")

    for filename in list_files(fpath, pattern):
        with open(filename, "r", encoding="utf8") as a_file:
            # Skip the file header
            next(a_file)
            insert_list = []
            for line in a_file:
                fields = line.strip().split(",")
                if len(fields) == FIELDS_LEN:
                    # Read the city from the URL
                    city = CITYS.get(fields[1][7:9])
                    fields.extend([city, capture_dt])
                    insert_list.append(fields)
            print("Total: " + str(len(insert_list)) + " records")
            insert_batch(sql, insert_list)
# Example 13
def insert_ayi(a_path, pattern=None):
    # Load the existing ayis (housekeepers)
    existing_ayi = query(AYI_SELECT_SQL)

    ayi_dict = {}
    for row in existing_ayi:
        ayi_dict[row["uid"]] = row["entrytime"]

    for file_name in list_files(a_path, pattern):
        logging.debug("读取文件" + file_name)
        with open(file_name, "r", encoding="utf8") as a_file:
            # Skip the header
            next(a_file)
            new_ayis, new_details = [], []
            for line in a_file:
                fields = line.strip().split(",")
                if len(fields) == FIELDS_LEN:
                    address, age, comment, commentcountofthismonth, distance, distancewithunit, \
                        entrytime, goodrate, idcard, mobile, name, pic, province, servicecount, \
                        servicecountofthismonth, sex, star, uid, valid, workage, worktime, city, \
                        capturedate, appointmentdate = fields
                    # String -> Date
                    capturedate = datetime.strptime(capturedate, "%Y-%m-%d").date()
                    # For a new ayi, estimate entrytime from the service count
                    if uid not in ayi_dict:
                        entrytime = capturedate - timedelta(days=int(servicecount))
                        new_ayis.append([
                            uid, province, sex, city, name, mobile, str(entrytime), idcard, age, pic])
                        ayi_dict[uid] = entrytime
                    # Use the recorded entrytime
                    entrytime = ayi_dict[uid]
                    new_details.append([
                        address, age, comment, commentcountofthismonth, distance,
                        distancewithunit, entrytime, goodrate, idcard, mobile, name, pic,
                        province, servicecount, servicecountofthismonth, sex, star, uid, valid,
                        workage, worktime, city, capturedate, appointmentdate])

            logging.debug(" 今日新阿姨数量: " + str(len(new_ayis)))
            insert_batch(AYI_INSERT_SQL, new_ayis)
            insert_batch(DETAILS_INSERT_SQL, new_details)
# Example 14
def read_sql_file(path):
    for filename in list_files(path, "*.sql"):
        sql = ("INSERT INTO {table}(cate1, cate2, cate3, cate4, cate5, cate6, cate7, time, plat,"
            " name, sales, amount, deals) VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
            "ON DUPLICATE KEY UPDATE sales=VALUES(sales), amount=VALUES(amount), deals=VALUES(deals)")
        regex = re.compile("\(((\'[^\']+\',){10}\d+,\d+,\d+)\)")
        # 正则匹配
        with open(filename, "r", encoding="utf8") as a_file:
            for line in a_file:
                insert_list = []
                if line.startswith("LOCK TABLES"):
                    table = re.search(r"`(.+)`", line).group(0).strip("`")
                    sql = sql.format(table=table)
                    print(sql)
                if line.startswith("INSERT INTO"):
                    match = regex.findall(line)
                    for row in match:
                        # findall returns (row, last_quoted_field) tuples because
                        # of the nested group; row[0] is the full field list
                        fields = row[0].replace("'", "").split(",")
                        if len(fields) == 13:
                            insert_list.append(fields)
                    insert_batch(sql, insert_list)
            print(filename + " 更新完毕。。。")
# Example 15
def main(fpath, pattern=None):
    for filename in list_files(fpath, pattern):
        sql = (
            "INSERT INTO {table} VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,"
            " %s, %s, %s, %s)")
        # Row-matching regex
        regex = re.compile(r"\(([^\)]+)\)[,;]+")
        with open(filename, "r", encoding="utf8") as a_file:
            print(filename + " starts to insert...", flush=True)
            # Match data rows
            insert_list, line_count = [], 0
            for line in a_file:
                # Match the table name
                if line.startswith("LOCK TABLES"):
                    table = re.search(r"`(.+)`", line).group(0).strip("`")
                    sql = sql.format(table=table)
                    print(sql)
                if line.startswith("INSERT INTO"):
                    # Increment the counter
                    line_count += 1
                    match = regex.findall(line)
                    for row in match:
                        fields = row.replace("'", "").split(",")
                        # Discard records the regex failed to parse (very few)
                        if len(fields) != 15:
                            print(fields, flush=True)
                            continue
                        if fields[9] == "1":
                            continue
                        insert_list.append(fields)
                    if line_count >= 2000:
                        print("To insert " + str(len(insert_list)) + " records")
                        insert_batch(sql, insert_list)
                        insert_list, line_count = [], 0

            insert_batch(sql, insert_list)
            print("Last ", len(insert_list), " inserted...", flush=True)
# Example 16
def wuba_ayi(path):
    sql = ("INSERT INTO 58_ayi(address, age, comment, commentcountofthismonth, distance, "
        " distancewithunit, entrytime, goodrate, idcard, mobile, NAME, pic, province, "
        " servicecount, servicecountofthismonth, sex, star, uid, valid, workage, worktime, "
        " city, capturedate, appointmentdate) VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, "
        " %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
    for file_name in list_files(path):
        with open(file_name, "r", encoding="utf8") as a_file:
            print(file_name + " starts to insert...")
            insert_list, row_count = [], 0
            # Skip the header
            next(a_file)
            for line in a_file:
                fields = line.strip().split(",")
                if len(fields) == 24:
                    # Increment the counter
                    row_count += 1
                    # The 24 fields are already in table-column order
                    insert_list.append(fields)
                    # Flush every 100,000 rows and reset the counter
                    if row_count >= 100000:
                        print("100000 inserted...")
                        insert_batch(sql, insert_list)
                        insert_list, row_count = [], 0
            # Insert the final partial batch
            print("Last ", len(insert_list), "inserted...")
            insert_batch(sql, insert_list)
# Example 17
def didizhuanche(path, table_name, pattern=None):
    sql = (
        "INSERT INTO " + table_name + "(driver_name, longitude, latitude, "
        " cnt_order, license, car_type, driver_id, capture_dtm, city, created_dt, flag_dt) values("
        " %s,%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")

    for file_name in list_files(path, pattern):
        with open(file_name, "r", encoding="utf8") as a_file:
            print(file_name + " starts to insert....", flush=True)
            insert_list, row_count = [], 0
            for line in a_file:
                fields = line.strip().split("\t")
                if len(fields) == 14:
                    # Increment the counter
                    row_count += 1
                    # Extract fields
                    driver_name, longitude, latitude, cnt_order, license, car_type, driver_id, \
                        capture_dtm = fields[1], fields[3], fields[4], fields[8], fields[9], \
                        fields[10], fields[11], fields[12]
                    city = fields[13][:2] + re.sub(r"\d|,|市|南|北|上|下|\r|\n", "", fields[13][2:])
                    insert_list.append([
                        driver_name, longitude, latitude, cnt_order, license,
                        car_type, driver_id, capture_dtm, city, TODAY, capture_dtm.split(" ")[0]])
                    # Flush every 100,000 rows and reset the counter
                    if row_count > 100000:
                        print("100000 inserted...", flush=True)
                        insert_batch(sql, insert_list)
                        insert_list, row_count = [], 0
                elif len(fields) == 27:
                    # Increment the counter
                    row_count += 1
                    # Extract the first record on the line
                    driver_name, longitude, latitude, cnt_order, license, car_type, driver_id, \
                        capture_dtm = fields[1], fields[3], fields[4], fields[8], fields[9], \
                        fields[10], fields[11], fields[12]
                    city = fields[13][:2] + re.sub(r"\d|,|市|南|北|上|下|\r|\n", "", fields[13][2:])
                    insert_list.append([
                        driver_name, longitude, latitude, cnt_order, license, car_type, driver_id,
                        capture_dtm, city, TODAY, capture_dtm.split(" ")[0]])

                    driver_name, longitude, latitude, cnt_order, license, car_type, driver_id, \
                        capture_dtm = fields[14], fields[16], fields[17], fields[21], fields[22], \
                        fields[23], fields[24], fields[25]
                    city = fields[26][:2] + re.sub(r"\d|,|市|南|北|上|下|\r|\n", "", fields[26][2:])
                    insert_list.append([
                        driver_name, longitude, latitude, cnt_order, license, car_type, driver_id,
                        capture_dtm, city, TODAY, capture_dtm.split(" ")[0]])
                    # Flush every 100,000 rows and reset the counter
                    if row_count > 100000:
                        print("100000 inserted...", flush=True)
                        insert_batch(sql, insert_list)
                        insert_list, row_count = [], 0
            # Insert the final partial batch
            print("Last ", len(insert_list), "inserted...", flush=True)
            insert_batch(sql, insert_list)
# Example 18
def main(path, pattern=None):
    for filename in list_files(path, pattern):
        match = re.search(r"(\d{4}-\d{2})", filename)
        if not match:
            # Skip files without a YYYY-MM in the name; starting_date would
            # otherwise be unbound or carried over from the previous file
            continue
        starting_date = match.group(0) + "-01"

        all_lines = []
        with open(filename, encoding="utf8") as a_csv:
            spam_reader = csv.reader(a_csv, delimiter=",")
            # Skip the title row
            next(spam_reader)
            for row in spam_reader:
                line = [
                    row[8],         # app_id
                    row[9],         # app_name
                    row[13],        # app_url
                    row[14],        # app_iap
                    row[5],         # region
                    "iOS",          # os
                    row[3],         # device
                    "Appannie",     # source
                    "Apple",        # store
                    row[1],         # sector
                    row[15],        # sub_sector
                    row[4],         # type
                    row[0],         # rank
                    starting_date,  # starting_date
                    "30",           # date_type
                    row[10],        # download_or_revenue
                    row[11],        # unit
                    row[19],        # app_average_price_usd
                    row[27],        # avg_rating_all
                    row[7],         # app_version
                    row[20],        # app_release_date
                    row[21],        # publisher_id
                    row[22],        # publisher_name
                    row[23],        # company_name
                    row[24],        # parent_company_name
                ]
                all_lines.append(line)

        insert_sql = (
            "INSERT INTO app_download(app_id, app_name, app_url, app_iap, region, os,"
            " device, source, store, sector, sub_sector, type, rank, starting_date, date_type,"
            " download_or_revenue, unit, app_average_price_usd, avg_rating_all, app_version, "
            " app_release_date, publisher_id, publisher_name, company_name, parent_company_name)"
            " values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,"
            " %s, %s, %s, %s, %s, %s)")
        insert_batch(insert_sql, all_lines)
        print(filename + "*************************Done.")
# Example 19
def main(new_date, old_date):
    old = fetch_data(old_date)
    new = fetch_data(new_date)

    print("*******************************MAU*************************************")
    mau_lines = []
    for key, value in new.mau_dict.items():
        old_mau_dict = old.mau_dict
        sector = new.sector_dict.get(key).split("-")[0]
        sub_sector = new.sector_dict.get(key).split("-")[1]

        if key in old_mau_dict:
            old_value = old_mau_dict.get(key)
            margin = value - old_value
            if margin > 0 and old_value > 0:
                percent = (value - old_value) / old_value
                print("%s\t%s\t%s\t%s\t%s\t%s" % (
                    key, sector, sub_sector, percent, value, old_value))
                line_set = [
                    key, sector, sub_sector, "MAU", percent, value, old_value, new_date,
                    "QuestMobile"]
                mau_lines.append(line_set)
    # Sort the 2-D array by percent (index 4), descending; the old top-300
    # truncation is left disabled
    # mau_lines = sorted(mau_lines, key=lambda l: l[4], reverse=True)[0:300]
    mau_lines = sorted(mau_lines, key=lambda l: l[4], reverse=True)
    print("*******************************DAU*************************************")
    dau_lines = []
    for key, value in new.dau_dict.items():
        old_dau_dict = old.dau_dict
        sector = new.sector_dict.get(key).split("-")[0]
        sub_sector = new.sector_dict.get(key).split("-")[1]

        if key in old_dau_dict:
            old_value = old_dau_dict.get(key)
            margin = value - old_value
            if margin > 0 and old_value > 0:
                percent = (value - old_value) / old_value
                print("%s\t%s\t%s\t%s\t%s\t%s" % (
                    key, sector, sub_sector, percent, value, old_value))
                line_set = [
                    key, sector, sub_sector, "DAU", percent, value, old_value, new_date,
                    "QuestMobile"]
                dau_lines.append(line_set)
    # dau_lines = sorted(dau_lines, key=lambda l: l[4], reverse=True)[0:300]
    dau_lines = sorted(dau_lines, key=lambda l: l[4], reverse=True)
    print("*******************************TIME*************************************")
    time_lines = []
    for key, value in new.time_dict.items():
        old_time_dict = old.time_dict
        sector = new.sector_dict.get(key).split("-")[0]
        sub_sector = new.sector_dict.get(key).split("-")[1]

        if key in old_time_dict:
            old_value = old_time_dict.get(key)
            margin = value - old_value
            if margin > 0 and old_value > 0:
                percent = (value - old_value) / old_value
                print("%s\t%s\t%s\t%s\t%s\t%s" % (
                    key, sector, sub_sector, percent, value, old_value))
                line_set = [
                    key, sector, sub_sector, "Time", percent, value, old_value, new_date,
                    "QuestMobile"]
                time_lines.append(line_set)
    # time_lines = sorted(time_lines, key=lambda l: l[4], reverse=True)[0:300]
    time_lines = sorted(time_lines, key=lambda l: l[4], reverse=True)

    # insert calculation into database
    insert_sql = (
        "INSERT INTO top_growth_apps(app_name, sector, sub_sector, type, growth,"
        " new_data, old_data, starting_date, source) values(%s, %s, %s, %s, %s, %s, %s, %s, %s)")
    insert_batch(insert_sql, mau_lines + time_lines + dau_lines)
# Example 20
def didi(path):
    sql = ("INSERT INTO didi(driver_id, driver_name, license, longitude, latitude, "
        " order_num, company, capture_dtm, city, flag_dt, createddate)"
        " values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")

    for file_name in list_files(path):
        print(file_name)
        with open(file_name, "r", encoding="utf8") as a_file:
            insert_list, row_count = [], 0
            for line in a_file:
                fields = line.rstrip("\n").split("\t")
                length = len(fields)
                if length == 10:
                    # Increment the counter
                    row_count += 1
                    # Extract fields
                    driver_id, driver_name, license, longitude, latitude, order_num, \
                    capture_dtm, city = fields[0], fields[1], fields[2], fields[5], fields[6], \
                        fields[7], fields[8], fields[9]
                    insert_list.append([driver_id, driver_name, license, longitude, latitude, \
                        order_num, None, capture_dtm, city, capture_dtm.split(" ")[0], TODAY])
                    # Flush every 100,000 rows and reset the counter
                    if row_count > 100000:
                        print("100000 inserted...")
                        insert_batch(sql, insert_list)
                        insert_list, row_count = [], 0
                elif length == 11:
                    # Increment the counter
                    row_count += 1
                    # Extract fields
                    driver_id, driver_name, license, longitude, latitude, order_num, company,\
                    capture_dtm, city = fields[0], fields[1], fields[2], fields[5], fields[6], \
                        fields[7], fields[8], fields[9], fields[10]
                    insert_list.append([driver_id, driver_name, license, longitude, latitude, \
                        order_num, company, capture_dtm, city, capture_dtm.split(" ")[0], TODAY])
                    # Flush every 100,000 rows and reset the counter
                    if row_count > 100000:
                        print("100000 inserted...")
                        insert_batch(sql, insert_list)
                        insert_list, row_count = [], 0
                elif length == 12:
                    # Increment the counter
                    row_count += 1
                    # Extract fields
                    driver_id, driver_name, license, longitude, latitude, order_num, company,\
                    capture_dtm, city = fields[0], fields[1], fields[2], fields[5], fields[6], \
                        fields[7], fields[8], fields[10], fields[11]
                    insert_list.append([driver_id, driver_name, license, longitude, latitude, \
                        order_num, company, capture_dtm, city, capture_dtm.split(" ")[0], TODAY])
                    # Flush every 100,000 rows and reset the counter
                    if row_count > 100000:
                        print("100000 inserted...")
                        insert_batch(sql, insert_list)
                        insert_list, row_count = [], 0
                elif length == 19:
                    # Increment the counter
                    row_count += 1
                    # Extract the first record on the line
                    driver_id, driver_name, license, longitude, latitude, order_num, \
                    capture_dtm, city = fields[0], fields[1], fields[2], fields[5], fields[6], \
                        fields[7], fields[8], fields[9]
                    insert_list.append([driver_id, driver_name, license, longitude, latitude, \
                        order_num, None, capture_dtm, city, capture_dtm.split(" ")[0], TODAY])

                    driver_id, driver_name, license, longitude, latitude, order_num, \
                    capture_dtm, city = re.findall(r"\d+", fields[9])[0], fields[10], fields[11], \
                    fields[14], fields[15], fields[16], fields[17], fields[18]
                    insert_list.append([driver_id, driver_name, license, longitude, latitude, \
                        order_num, None, capture_dtm, city, capture_dtm.split(" ")[0], TODAY])
                    # Flush every 100,000 rows and reset the counter
                    if row_count > 100000:
                        print("100000 inserted...")
                        insert_batch(sql, insert_list)
                        insert_list, row_count = [], 0
                elif length == 20:
                    # Increment the counter
                    row_count += 1
                    # Extract the first record on the line
                    driver_id, driver_name, license, longitude, latitude, order_num, \
                    capture_dtm, city = fields[0], fields[1], fields[2], fields[5], fields[6], \
                        fields[7], fields[8], fields[9]
                    insert_list.append([driver_id, driver_name, license, longitude, latitude, \
                        order_num, None, capture_dtm, city, capture_dtm.split(" ")[0], TODAY])

                    driver_id, driver_name, license, longitude, latitude, order_num, company, \
                    capture_dtm, city = re.findall(r"\d+", fields[9])[0], fields[10], fields[11], \
                    fields[14], fields[15], fields[16], fields[17], fields[18], fields[19]
                    insert_list.append([driver_id, driver_name, license, longitude, latitude, \
                        order_num, company, capture_dtm, city, capture_dtm.split(" ")[0], TODAY])
                    # Flush every 100,000 rows and reset the counter
                    if row_count > 100000:
                        print("100000 inserted...")
                        insert_batch(sql, insert_list)
                        insert_list, row_count = [], 0
                elif length == 21:
                    # Increment the counter
                    row_count += 1
                    # Extract the first record on the line
                    driver_id, driver_name, license, longitude, latitude, order_num, company,\
                    capture_dtm, city = fields[0], fields[1], fields[2], fields[5], fields[6], \
                        fields[7], fields[8], fields[9], fields[10]
                    insert_list.append([driver_id, driver_name, license, longitude, latitude, \
                        order_num, company, capture_dtm, city, capture_dtm.split(" ")[0], TODAY])

                    driver_id, driver_name, license, longitude, latitude, order_num, company, \
                    capture_dtm, city = re.findall(r"\d+", fields[10])[0], fields[11], fields[12], \
                    fields[15], fields[16], fields[17], fields[18], fields[19], fields[20]
                    insert_list.append([driver_id, driver_name, license, longitude, latitude, \
                        order_num, company, capture_dtm, city, capture_dtm.split(" ")[0], TODAY])
                    # Flush every 100,000 rows and reset the counter
                    if row_count > 100000:
                        print("100000 inserted...")
                        insert_batch(sql, insert_list)
                        insert_list, row_count = [], 0
            # Insert the final partial batch
            print("Last ", len(insert_list), "inserted...")
            insert_batch(sql, insert_list)