Esempio n. 1
0
def show_mysql_geolife_track(traId, file_name):
    db = db_mysql.db_connection(host_name='localhost', user_name='root', password='******',
                                database='stdatamining', charset='utf8')
    cur = db.cursor()
    select_sql = 'SELECT longitude, latitude, time_date FROM geolife_point  WHERE traid=' + \
                 str(traId) + ' ORDER BY time_date'

    lon = []
    lat = []
    try:
        cur.execute(select_sql)
        result = cur.fetchall()
        for record in result:
            lon.append(record[0])
            lat.append(record[1])
        if file_name != 'default':
            file_name += '\\' + str(traId) + '.txt'
            f = open(file_name, 'w')
            for result in cur:
                line = ''
                for ele in result:
                    line += str(ele)
                    line += ','
                f.write(line + '\n')
            f.close()
        cur.close()
    except:
        print 'errot to execute the sql'
    db.close()
    pl.plot(lon, lat, 'g')
    pl.xlabel('longitude')
    pl.ylabel('latitude')
    pl.title('traId_' + str(traId) + '_' + str(len(lon)))
    pl.show()
Esempio n. 2
0
def show_region_ratio():  # 显示不同类型的场所在不同poi区域中的比值规律
    connection = db.db_connection()
    sql = "SELECT * FROM poi_feature ORDER BY poi_id"
    cursor = connection.cursor()
    result = {}
    try:
        cursor.execute(sql)
        records = cursor.fetchall()
        for record in records:
            for j in range(1, 26):
                if j in result.keys():
                    result[j].append(float(record[j]))
                else:
                    result[j] = [float(record[j])]
    except Exception:
        print 'wrong'
    for key in result.keys():
        value = result[key]
        pl.plot(
            value,
            'g',
            label='region ' + str(key),
        )
        pl.legend()
        pl.savefig('data_file\\' + str(key) + '.png')
        pl.close()
Esempio n. 3
0
def extract_data_by_day():
    connection = db.db_connection()
    # 起始日期
    start_time = '2016-01-01 00:00:00'
    # 结束时间
    end_time = '2016-01-01 23:59:59'
    while start_time < '2016-01-31 00:00:00':
        sql_head = "CREATE TABLE "+start_time.split(' ')[0].replace('-', '_') + "_order_data" + \
                   " AS SELECT * FROM order_data "
        # sql_head = "CREATE TABLE " + start_time.split(' ')[0].replace('-', '_') + \
        #            "_pass_orderNum AS SELECT COUNT(*) orderNum,passenger_id FROM " + \
        #            start_time.split(' ')[0].replace('-', '_') + "_order_data GROUP BY passenger_id"
        sql_condition = "where  time<'"
        sql_condition += end_time
        sql_condition += "' and time>'"
        sql_condition += start_time + "'"
        sql = sql_head + sql_condition
        print sql
        print sql_head
        try:
            cursor = connection.cursor()
            cursor.execute(sql)
            # cursor.execute(sql_head)
            cursor.close()
        except Exception:
            print 'Wrong!'
        # 日期按天自增
        t1 = datetime.datetime.strptime(start_time, "%Y-%m-%d %H:%M:%S")
        t2 = datetime.datetime.strptime(end_time, "%Y-%m-%d %H:%M:%S")
        start_time = str(t1 + datetime.timedelta(days=1))
        end_time = str(t2 + datetime.timedelta(days=1))
    connection.close()
Esempio n. 4
0
def calculate_poi_feature(feature_file):
    connection = db.db_connection()
    cursor = connection.cursor()
    f = open(feature_file, 'w')
    # poi_sql = 'select * from poi_region_order'
    poi_sql = 'select * from poi_region'
    cursor.execute(poi_sql)
    region_num_list = []
    records = cursor.fetchall()
    for record in records:
        # num = 0
        # for i in range(1, len(record)-1):
        #     num += int(record[i])
        num = int(record[len(record) - 1])
        region_num_list.append(num)
    max_region_num = max(region_num_list)
    print max_region_num
    for i, record in enumerate(records):
        line = [record[0]]
        region_num = int(region_num_list[i])
        for j in range(1, len(record) - 1):
            if record[j] is not None:
                line.append(str(float(record[j]) /
                                region_num))  # the ratio in district_id
            else:
                line.append('0')
        line.append(
            str(region_num /
                float(max_region_num)))  # the ratio of district_id in all
        line.append(str(region_num))
        line_content = ','.join(line)
        f.write(line_content + '\n')
    f.close()
    connection.close()
Esempio n. 5
0
def execute_in_sql(file_name, save_file):
    print file_name
    connection = db.db_connection()
    cursor = connection.cursor()
    r_f = open(file_name, 'r')
    w_f = open(save_file, 'w')
    line = r_f.readline()
    delete_sql_head = "DELETE FROM " + file_name.split('E:\\data_clean\\')[1].split('.')[0] + \
                      "_order_data_copy WHERE order_id IN "

    print delete_sql_head
    order_id_list = []
    while line:
        if line != '':
            order_id_list.append(line.split('=')[1].split(';')[0])
        line = r_f.readline()
    delete_sql_condi = '(' + ','.join(order_id_list) + ');\n'
    sql = delete_sql_head + delete_sql_condi
    try:
        cursor.execute(sql)
        connection.commit()
    except Exception:
        print 'Wrong'
    print '*' * 10
    w_f.write(sql)
    connection.close()
    r_f.close()
    w_f.close()
Esempio n. 6
0
def feature_analysis(feature, district, days, save_dir):
    connection = db.db_connection()
    cursor = connection.cursor()
    sql_base = "SELECT %s FROM gap_table WHERE day='%s' AND district_id='%s' ORDER BY time_slice "
    colors = ['b', 'g', 'r', 'y', 'k', 'w', 'm']
    count = 0
    is_first = True
    for day in days:
        sql = sql_base % (feature, day, district)
        print sql
        result = []
        try:
            cursor.execute(sql)
            records = cursor.fetchall()
            for record in records:
                if record[0] is not None:
                    result.append(float(record[0]))
                else:
                    result.append(-1)
        except Exception:
            print 'wrong'
        result = data_supplement(result)
        # if is_first:
        #     pl.plot(result, color=colors[count % len(colors)], label=feature)
        #     is_first = False
        # else:
        #     pl.plot(result, color=colors[count % len(colors)])
        pl.plot(result, color=colors[count % len(colors)], label=day)
        count += 1
    pl.legend()
    pl.title(feature)
    fig_name = district + "_" + feature + ".png"
    pl.savefig(save_dir + fig_name)
    pl.close()
Esempio n. 7
0
def extract_batch_data():
    db = db_mysql.db_connection(host_name='localhost', user_name='root', password='******',
                                                database='stdatamining', charset='utf8')
    cur = db.cursor()
    traId_list_sql = "SELECT objid,traid FROM continuous_8h_track"
    print traId_list_sql
    cur.execute(traId_list_sql)
    result = cur.fetchall()
    traId_list = [[ele for ele in record] for record in result]  #选出所有满足条件的objId和traId
    cur.close()
    # 先创建目录
    if not os.path.exists('data\\geolife_data\\geolife_data_8h'):
        os.makedirs('data\\geolife_data\\geolife_data_8h')
    for objId_traId in traId_list:
        file_name = 'data\\geolife_data\\geolife_data_8h\\'
        objId = objId_traId[0]
        traId = objId_traId[1]
        txt_name = str(objId) + '_' + str(traId) + '.txt'
        file_name += txt_name
        select_sql = 'SELECT longitude, latitude, time_date FROM geolife_point  WHERE traid=' + \
                     str(traId) + ' ORDER BY time_date'
        print select_sql
        cur = db.cursor()
        cur.execute(select_sql)
        result = cur.fetchall()
        f = open(file_name, 'w')
        for record in result:
            line = ''
            for ele in record:
                line += str(ele)
                line += ','
            f.write(line + '\n')
        f.close()
        cur.close()
    db.close()
Esempio n. 8
0
def extract_data(traid_file, data_dir):
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)
    db = db_mysql.db_connection(host_name='localhost', user_name='root', password='******',
                                database='stdatamining', charset='utf8')
    cur = db.cursor()
    f_r = open(traid_file, 'r')
    lines = f_r.readlines()
    for line in lines:
        traid = line.split(',')[0]
        select_sql = 'SELECT longitude, latitude, time_date FROM geolife_point  WHERE traid=' + \
                     str(traid) + ' ORDER BY time_date'
        cur.execute(select_sql)
        result = cur.fetchall()
        data_file = data_dir + '\\' + str(traid) + '.txt'
        f_w = open(data_file, 'w')
        for record in result:
            line = ''
            for ele in record:
                line += str(ele)
                line += ','
            f_w.write(line + '\n')
        f_w.close()
    cur.close()
    db.close
Esempio n. 9
0
def create_gap_table(sql_file):
    w_f = open(sql_file, 'w')
    create_table_sql = "create table `gap_table` (\n" + "`restrict_id` varchar(10),\n" + \
                       "`time_slice` varchar(10),\n" + \
                       "`gap` varchar(10)" + "\n);\n"
    print create_table_sql
    w_f.write(create_table_sql)
    select_sql = "SELECT start_district_id,time_slice FROM order_data WHERE driver_id='null'"
    connection = db.db_connection()
    cursor = connection.cursor()
    result_dic = {}
    try:
        cursor.execute(select_sql)
        record_results = cursor.fetchall()
        count = 0
        num = 0
        for record in record_results:
            if count > 10000:
                num += 1
                print '*'*10 + str(num)
                count = 0
            else:
                count += 1
            dic_key = record[0] + "#" + record[1]
            if dic_key in result_dic.keys():
                result_dic[dic_key] += 1
            else:
                result_dic[dic_key] = 1
    except Exception:
        print 'Wrong'
    if len(result_dic) > 0:
        count = 0
        num = 0
        for record in result_dic.iteritems():
            if count > 10000:
                num += 1
                print '-'*10 + str(num)
                count = 0
            else:
                count += 1
            dic_key = record[0]
            district_id = dic_key.split('#')[0]
            time_slice = dic_key.split('#')[1]
            dic_value = record[1]
            insert_sql = "insert into `gap_table` (`restrict_id`,`time_slice`,`gap`) values (" + \
                         "'" + district_id + "','" + time_slice + "','" + str(dic_value) + "');\n"
            w_f.write(insert_sql)
        w_f.write('commit;')
    cursor.close()
    connection.close()
    w_f.close()
Esempio n. 10
0
def get_poi_map():
    connection = db.db_connection()
    sql = 'SELECT * FROM cluster_map'
    cursor = connection.cursor()
    dic = {}
    try:
        cursor.execute(sql)
        results = cursor.fetchall()
        for row in results:
            key = row[1]
            value = row[0]
            dic[key] = value
    except Exception:
        print 'error'
    connection.close()
    return dic
Esempio n. 11
0
def model_evaluate(csv_file):
    csv_file = file(csv_file, 'rb')
    reader = csv.reader(csv_file)
    connection = db.db_connection()
    cursor = connection.cursor()
    gap_dic = {}
    select_all_gap = 'SELECT * FROM gap_table'
    try:
        cursor.execute(select_all_gap)
        record_results = cursor.fetchall()
        for record in record_results:
            dic_key = record[0] + '#' + record[1]
            dic_value = int(record[2])
            gap_dic[dic_key] = dic_value
    except Exception:
        print 'Wrong'
    cursor.close()
    connection.close()
    test_record_dic = {}
    for line in reader:
        district_id = line[0]
        if district_id in test_record_dic.keys():
            test_record_dic[district_id].append(line[1] + '#' + line[2])
        else:
            test_record_dic[district_id] = [line[1] + '#' + line[2]]
    test_keys = test_record_dic.keys()
    evaluation_score = 0
    evaluation_score_district = 0
    for district in test_keys:
        district_gap_list = test_record_dic[district]
        for ele in district_gap_list:
            gap_key = district + '#' + ele.split('#')[0]
            evaluate_gap = float(ele.split('#')[1])
            if gap_key in gap_dic.keys():
                real_gap = gap_dic[gap_key]
            else:
                real_gap = 0
            if real_gap != 0:
                evaluation_score_district += abs((real_gap - evaluate_gap) / real_gap)
            else:
                continue
        evaluation_score_district /= len(district_gap_list)  # q的选择把所有的(包括为0的都考虑在内)
        evaluation_score += evaluation_score_district
    evaluation_score /= len(test_record_dic)
    return evaluation_score
Esempio n. 12
0
def extract_data_mysql():
    # 起始日期
    start_time = '2008-10-01 00:00:00'
    # 结束时间
    end_time = '2008-10-01 23:59:59'
    while start_time < '2010-01-01 00:00:00':
        file_name = 'data\\geolife_data\\geolife_153_'
        file_name += start_time.split(' ')[0]+'.txt'
        sql_head = "select longitude, latitude, time_date from GEOLIFE_153 t "
        sql_condition = "where  t.time_date < '"
        sql_condition += end_time
        sql_condition += "' and t.time_date > '"
        sql_condition += start_time
        sql_condition += "' order by time_date"
        record_count = db_mysql.get_record_count('select count(*) from GEOLIFE_153 t '+sql_condition)
        print record_count
        if record_count > 0:
            sql = sql_head+sql_condition
            print sql
            connection = db_mysql.db_connection(host_name='localhost', user_name='root', password='******',
                                                database='stdatamining', charset='utf8')
            cursor = connection.cursor()
            try:
                cursor.execute(sql)
                results = cursor.fetchall()
                f = open(file_name, 'w')
                for record in results:
                    line = ''
                    for ele in record:
                        line += str(ele)
                        line += ','
                    f.write(line+'\n')
            except Exception:
                print 'Wrong'
            f.close()
            cursor.close()
            connection.close()

        # 日期按天自增
        t1 = datetime.datetime.strptime(start_time, "%Y-%m-%d %H:%M:%S")
        t2 = datetime.datetime.strptime(end_time, "%Y-%m-%d %H:%M:%S")
        start_time = str(t1+datetime.timedelta(days=1))
        end_time = str(t2+datetime.timedelta(days=1))
Esempio n. 13
0
def process_dataobjrelation():
    db = db_mysql.db_connection(host_name='localhost', user_name='root', password='******',
                                database='stdatamining', charset='utf8')
    cur = db.cursor()
    select_sql1 = 'SELECT DISTINCT objid FROM geolife_trajectory WHERE objid>400076'
    cur.execute(select_sql1)
    result = cur.fetchall()
    begin = 10501
    for record in result:
        objid = record[0]
        objname = "'" + str(objid)[3:] + "'"
        insert_sql = "insert into dataobjrelation values(%d,400,'geolife',%d,%s);" % (begin, objid, objname)
        print insert_sql
        begin += 1
        cur2 = db.cursor()
        cur2.execute(insert_sql)
        db.commit()
        cur2.close()
    cur.close()
    db.close()
Esempio n. 14
0
def show_mean_gap():
    start_time = '2016-01-02 00:00:00'
    end_time = '2016-01-20 00:00:00'
    date_list = []
    while start_time <= end_time:
        date_list.append(start_time.split(' ')[0])
        t1 = datetime.datetime.strptime(start_time, "%Y-%m-%d %H:%M:%S")
        start_time = str(t1 + datetime.timedelta(days=1))
    connection = db.db_connection()
    cursor = connection.cursor()
    sql = "SELECT * FROM gap_table WHERE district_id='01' ORDER BY time_slice"
    gap_dic = {}
    try:
        cursor.execute(sql)
        records = cursor.fetchall()
        for record in records:
            key = record[1]
            value = record[2]
            if key not in gap_dic.keys():
                gap_dic[key] = value
    except Exception:
        print 'wrong'
    result = []
    for i in range(1, 145):
        total = 0
        for ele in date_list:
            if i < 10:
                key = ele + '-00' + str(i)
            else:
                if i < 100:
                    key = ele + '-0' + str(i)
                else:
                    key = ele + '-' + str(i)
            if key in gap_dic.keys():
                total += float(gap_dic[key])
        result.append(total / float(len(date_list)))
    index = range(1, 145)
    pl.plot(index, result, 'g', label='mean_gap')
    pl.legend()
    pl.show()
Esempio n. 15
0
def show_district_gap(district_id):
    connection = db.db_connection()
    cursor = connection.cursor()
    sql = "SELECT * FROM gap_table WHERE district_id='" + district_id + \
          "' AND time_slice> '2016-01-15-001' AND time_slice<'2016-01-21-144'ORDER BY time_slice"
    print sql
    gap_list = []
    try:
        cursor.execute(sql)
        records = cursor.fetchall()
        for record in records:
            if float(record[2]) > 40:
                gap_list.append(40)
            else:
                gap_list.append(float(record[2]))
    except Exception:
        print 'wrong'
    cursor.close()
    connection.close()
    if len(gap_list) > 0:
        pl.plot(gap_list, 'g', label='gap')
        pl.legend()
        pl.show()
Esempio n. 16
0
def statics_by_poi(output_file):
    poi_map = get_poi_map()
    connection = db.db_connection()
    cursor = connection.cursor()
    f = open(output_file, 'w')
    for poi_id in poi_map.keys():
        poi_hash = poi_map[poi_id]
        select_sql = "SELECT poi_class FROM poi_data where district_hash='%s'" % poi_hash
        cursor.execute(select_sql)
        result = cursor.fetchone()[0]
        detail_eles = result.split(' ')
        region_count_map = {}
        for ele in detail_eles:
            index = ele.find('#')
            if index == -1:  # if doesn't exits symbol #, then split by :
                key = int(ele.split(':')[0])
                value = int(ele.split(':')[1])
                region_count_map[key] = value
            else:
                key = int(ele.split('#')[0])
                leng = len(ele.split('#'))
                value = int(ele.split('#')[leng - 1].split(
                    ':')[1])  # if exits #, take the last element as the number
                if key in region_count_map.keys():
                    region_count_map[key] += value
                else:
                    region_count_map[key] = value
        max_region_index = max(region_count_map.items(), key=lambda x: x[0])[0]
        content1 = str(poi_id) + '#' + str(max_region_index) + '#'
        content2 = ''
        for key in region_count_map.keys():
            content2 += str(key) + ':' + str(region_count_map[key]) + ' '
        content = content1 + content2
        f.write(content)
        f.write('\n')
    connection.close()
    f.close()
Esempio n. 17
0
def feature_judge(sql_file):
    w_f = open(sql_file, 'w')
    w_f.write("DROP TABLE IF EXISTS `feature_judge`;\n")
    w_f.write("CREATE TABLE `feature_judge` (\n" +
              "`district_id` varchar(10) DEFAULT NULL,\n" +
              "`time_slice` varchar(20) DEFAULT NULL,\n " +
              "`district_code` varchar(10) DEFAULT NULL,\n" +
              "`time_slice_code` varchar(10)\n " + ");\n")
    code_1000 = [
        '1', '12', '16', '19', '20', '24', '25', '28', '29', '38', '42', '46',
        '53'
    ]
    code_1100 = ['4', '6', '37']
    code_0100 = ['41']
    code_1010 = ['26']
    code_1110 = ['14', '23']
    code_1111 = ['7', '8', '48', '51']
    connection = db.db_connection()
    sql = "SELECT district_id,time_slice FROM gap_table ORDER BY district_id,time_slice"
    print sql
    cursor = connection.cursor()
    try:
        cursor.execute(sql)
        records = cursor.fetchall()
        insert_base = "insert  into `feature_judge`(`district_id`,`time_slice`,`district_code`,`time_slice_code`)"
        for record in records:
            district_id = record[0]
            time_slice = record[1]
            district_code = '0000'
            if district_id in code_0100:
                district_code = '0100'
            if district_id in code_1000:
                district_code = '1000'
            if district_id in code_1010:
                district_code = '1010'
            if district_id in code_1100:
                district_code = '1100'
            if district_id in code_1110:
                district_code = '1110'
            if district_id in code_1111:
                district_code = '1111'
            segment = time_slice.split('-')[3]
            time_slice_code = '0'  # 默认为0
            if segment >= '100':
                if segment <= '110':
                    time_slice_code = '4'
                else:
                    if '125' <= segment <= '135':
                        time_slice_code = '5'
            else:
                if '000' <= segment <= '030':
                    time_slice_code = '1'
                if '040' <= segment <= '060':
                    time_slice_code = '2'
                if '075' <= segment <= '095':
                    time_slice_code = '3'
            inser_value = "values ('" + district_id + "','" + time_slice + "','" +\
                          district_code + "','" + time_slice_code + "');\n"
            w_f.write(insert_base + inser_value)

    except Exception:
        print 'wrong!'
    w_f.write('commit;')
    cursor.close()
    connection.close()
Esempio n. 18
0
def data_clean():
    connection = db.db_connection()
    # 起始日期
    start_time = '2016-01-01 00:00:00'
    while start_time < '2016-01-31 00:30:00':
        print start_time
        day_order_table = start_time.split(' ')[0].replace(
            '-', '_') + "_order_data_copy"  # 每天的记录表
        cursor = connection.cursor()
        f = open(start_time.split(' ')[0].replace('-', '_') + ".txt", 'w')
        day_start_time = start_time.split(' ')[0] + ' 00:00:00'
        day_end_time = start_time.split(' ')[0] + ' 0:59:59'
        day_deadline = start_time.split(' ')[0] + ' 23:59:59'
        while day_end_time <= day_deadline:
            sql = "SELECT  order_id,driver_id,passenger_id,dest_district_hash,time_slice FROM " + \
                  day_order_table + " WHERE TIME>'" +\
                  day_start_time + "' AND TIME<='" + day_end_time + "' ORDER BY passenger_id,TIME"
            # sql = "SELECT order_id,driver_id,passenger_id,dest_district_hash,time_slice " \
            #       "FROM 2016_01_01_order_data_copy WHERE passenger_id='00156d54838c42117ddc25cf5bf330bc' ORDER BY time"
            print sql
            try:
                cursor.execute(sql)
                order_results = cursor.fetchall()
                is_first_record = True
                for record in order_results:
                    if is_first_record:
                        is_first_record = False
                        last_order_id = record[0]
                        last_drive_id = record[1]
                        last_pass_id = record[2]
                        last_dest_hash = record[3]
                        last_time_slice = record[4]
                        continue
                    else:
                        next_order_id = record[0]
                        next_drive_id = record[1]
                        next_pass_id = record[2]
                        next_dest_hash = record[3]
                        next_time_slice = record[4]
                        if next_pass_id == last_pass_id and next_time_slice == last_time_slice:
                            # 应答情况都为空,目的地一样
                            if next_drive_id == 'NULL' and last_drive_id == 'NULL' and next_dest_hash == last_dest_hash:
                                delete_sql = "DELETE FROM " + day_order_table + \
                                             " WHERE order_id='" + last_order_id + "';\n"
                                f.write(delete_sql)
                            # 去的地方一样,且应答情况不一样
                            if next_drive_id != last_drive_id and next_dest_hash == last_dest_hash:
                                if next_drive_id == 'NULL':  # 哪种情况为空就删除哪条记录
                                    delete_sql = "DELETE FROM " + day_order_table + \
                                                 " WHERE order_id='" + next_order_id + "';\n"
                                    f.write(delete_sql)
                                if last_drive_id == 'NULL':
                                    delete_sql = "DELETE FROM " + day_order_table + \
                                                 " WHERE order_id='" + last_order_id + "';\n"
                                    f.write(delete_sql)
                            # 去的地方不一样,应答情况也不一样,
                            if next_drive_id != last_drive_id and next_dest_hash != last_dest_hash:
                                continue
                        last_order_id = next_order_id
                        last_drive_id = next_drive_id
                        last_pass_id = next_pass_id
                        last_dest_hash = next_dest_hash
                        last_time_slice = next_time_slice
                        # last_time = next_time
                        # last_time_second = next_time_second
            except Exception:
                print 'Wrong!!!'
            day_t1 = datetime.datetime.strptime(day_start_time,
                                                "%Y-%m-%d %H:%M:%S")
            day_t2 = datetime.datetime.strptime(day_end_time,
                                                "%Y-%m-%d %H:%M:%S")
            day_start_time = str(day_t1 + datetime.timedelta(hours=1))
            day_end_time = str(day_t2 + datetime.timedelta(hours=1))

        f.close()
        cursor.close()
        # 日期按天自增
        t1 = datetime.datetime.strptime(start_time, "%Y-%m-%d %H:%M:%S")
        start_time = str(t1 + datetime.timedelta(days=1))
    connection.close()
Esempio n. 19
0
def data_clean2():
    connection = db.db_connection()
    # 起始日期
    start_time = '2016-01-01 00:00:00'
    while start_time < '2016-01-31 00:30:00':
        print start_time
        day_order_table = start_time.split(' ')[0].replace(
            '-', '_') + "_order_data"  # 每天的记录表
        cursor = connection.cursor()
        f = open(start_time.split(' ')[0].replace('-', '_') + ".txt", 'w')
        day_start_time = start_time.split(' ')[0] + ' 00:00:00'
        day_end_time = start_time.split(' ')[0] + ' 0:59:59'
        day_deadline = start_time.split(' ')[0] + ' 23:59:59'
        while day_end_time <= day_deadline:
            sql = "SELECT  order_id,driver_id,passenger_id,dest_district_hash,time_slice FROM " + \
                  day_order_table + " WHERE TIME>'" + \
                  day_start_time + "' AND TIME<='" + day_end_time + "' ORDER BY passenger_id,TIME"
            # sql = "SELECT order_id,driver_id,passenger_id,dest_district_hash,time_slice " \
            #       "FROM 2016_01_01_order_data WHERE passenger_id='00156d54838c42117ddc25cf5bf330bc' ORDER BY time"
            print sql
            try:
                cursor.execute(sql)
                order_results = cursor.fetchall()
                is_first_record = True
                null_order_id_list = []
                for record in order_results:
                    if is_first_record:
                        is_first_record = False
                        last_order_id = record[0]
                        last_drive_id = record[1]
                        last_pass_id = record[2]
                        last_dest_hash = record[3]
                        last_time_slice = record[4]
                        if last_drive_id == 'NULL':
                            null_order_id_list.append(last_drive_id)
                        continue
                    else:
                        next_order_id = record[0]
                        next_drive_id = record[1]
                        next_pass_id = record[2]
                        next_dest_hash = record[3]
                        next_time_slice = record[4]
                        if next_pass_id == last_pass_id and next_time_slice == last_time_slice:
                            if next_drive_id == 'NULL':
                                null_order_id_list.append(next_drive_id)
                            else:
                                continue
                        else:  # 如果用户跳变,或者是时间片跳变
                            if len(null_order_id_list) > 1:
                                for i in range(1, len(null_order_id_list)):
                                    delete_sql = "DELETE FROM " + day_order_table + \
                                                 " WHERE order_id='" + null_order_id_list[i] + "';\n"
                                    f.write(delete_sql)
                            null_order_id_list = []
                            if next_drive_id == 'NULL':
                                null_order_id_list.append(next_drive_id)
                            else:
                                continue
                        last_order_id = next_order_id
                        last_drive_id = next_drive_id
                        last_pass_id = next_pass_id
                        last_dest_hash = next_dest_hash
                        last_time_slice = next_time_slice
                        # last_time = next_time
                        # last_time_second = next_time_second
                if len(null_order_id_list) > 1:
                    for i in range(1, len(null_order_id_list)):
                        delete_sql = "DELETE FROM " + day_order_table + \
                                     " WHERE order_id='" + null_order_id_list[i] + "';\n"
                        f.write(delete_sql)
                    null_order_id_list = []
            except Exception:
                print 'Wrong!!!'
            day_t1 = datetime.datetime.strptime(day_start_time,
                                                "%Y-%m-%d %H:%M:%S")
            day_t2 = datetime.datetime.strptime(day_end_time,
                                                "%Y-%m-%d %H:%M:%S")
            day_start_time = str(day_t1 + datetime.timedelta(hours=1))
            day_end_time = str(day_t2 + datetime.timedelta(hours=1))

        f.close()
        cursor.close()
        # 日期按天自增
        t1 = datetime.datetime.strptime(start_time, "%Y-%m-%d %H:%M:%S")
        start_time = str(t1 + datetime.timedelta(days=1))
    connection.close()
Esempio n. 20
0
def process(data_dir, temp_file_name):
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)
    traid_begin = 20095
    pointid_begin = 119443809
    dataid = 500
    objid = 500001
    UTC_FORMAT = '%Y-%m-%dT%H:%M:%S.%fz'
    now_stamp = time.time()
    local_time = datetime.datetime.fromtimestamp(now_stamp)
    utc_time = datetime.datetime.utcfromtimestamp(now_stamp)
    offset = local_time - utc_time
    for parent, dirName, file_names in os.walk(data_dir):
        for file_name in file_names:
            temp_file = open(temp_file_name, 'w')
            csvfile = file(data_dir + '\\' + file_name, 'r')
            reader = csv.reader(csvfile)
            line_number = 1
            traname = '\'' + file_name.split('.')[0] + '\''
            is_first = True
            count = 0
            for line in reader:
                if line_number <= 4:
                    line_number += 1
                    continue
                else:
                    count += 1
                    longitude = float(line[3])
                    latitude = float(line[2])
                    height = round(float(line[4]))
                    utc_time = line[8]
                    local_time = datetime.datetime.strptime(
                        utc_time, UTC_FORMAT) + offset
                    t = '\'' + local_time.strftime('%Y-%m-%d %H:%M:%S') + '\''
                    if is_first:
                        from_time = t
                        is_first = False
                    to_time = t
                    insert_sql = "INSERT INTO owndata_point(pointid, traid, longitude, latitude,time_date,height) VALUES" \
                                 " (%d,%d,%f,%f,%s,%d);" % (pointid_begin, traid_begin, longitude, latitude, t, height)
                    temp_file.write(insert_sql + '\n')
                    pointid_begin += 1
            temp_file.close()
            insert_trajectory_sql = "INSERT INTO owndata_trajectory(traid,traname,dataid,objid,starttime,endtime,totalpoint)" \
                                    " VALUES (%d,%s,%d,%d,%s,%s,%d);" % \
                                    (traid_begin, traname, dataid, objid, from_time, to_time, count)
            traid_begin += 1
            csvfile.close()
            print insert_trajectory_sql
            db = db_mysql.db_connection(host_name='localhost',
                                        user_name='root',
                                        password='******',
                                        database='stdatamining',
                                        charset='utf8')
            cur = db.cursor()
            cur.execute(insert_trajectory_sql)
            db.commit()
            f_r = open(temp_file_name, 'r')
            for line_sql in f_r.readlines():
                sql = line_sql.split('\n')[0]
                cur.execute(sql)
                db.commit()
            cur.close()
            f_r.close()
Esempio n. 21
0
def data_clean():
    connection = db.db_connection()
    # 起始日期
    start_time = '2016-01-01 00:00:00'
    # 结束时间
    end_time = '2016-01-01 23:59:59'
    while start_time < '2016-01-31 00:00:00':
        print start_time
        pass_ordernum_table = start_time.split(' ')[0].replace(
            '-', '_') + "_pass_orderNum"  # 用户记录数表
        day_order_table = start_time.split(' ')[0].replace(
            '-', '_') + "_order_data"  # 每天的记录表
        pass_id_list = []
        select_pass_id_sql = "SELECT passenger_id FROM " + pass_ordernum_table + " WHERE orderNum>1"
        cursor = connection.cursor()
        try:
            cursor.execute(select_pass_id_sql)
            results = cursor.fetchall()
            for pass_id in results:
                pass_id_list.append(pass_id[0])
        except Exception:
            print 'Error!'
        if len(pass_id_list) > 0:
            f = open("start_time.split(' ')[0].replace('-', '_')" + ".txt",
                     'w')
            for id_ele in pass_id_list:  # 对每个用户进行处理
                select_pass_order_sql = "SELECT order_id,TIME FROM " + day_order_table + \
                                        " WHERE passenger_id='" + id_ele + "' ORDER BY TIME "
                order_id_time_dic = {}
                try:
                    cursor.execute(select_pass_order_sql)
                    order_id_time_results = cursor.fetchall()
                    for ele in order_id_time_results:
                        order_id_time_dic[ele[0]] = ele[1]  #订单号和时间组成字典
                except Exception:
                    print 'WrongWrong!!'
                if len(order_id_time_dic) > 0:
                    sorted_dic = sorted(order_id_time_dic.iteritems(),
                                        key=lambda x: x[1])
                    time_list = []
                    order_id_list = []
                    for i in range(len(sorted_dic)):
                        time_list.append(sorted_dic[i][1])
                        order_id_list.append(sorted_dic[i][0])
                    # last_time = time.strptime(time_list[0], "%Y-%m-%d %H:%M:%S")
                    last_time = time_list[0]

                    last_time_second = int(time.mktime(last_time.timetuple()))
                    for i in range(1, len(time_list)):
                        # next_time = time.strptime(time_list[i], "%Y-%m-%d %H:%M:%S")
                        next_time = time_list[i]
                        next_time_second = int(
                            time.mktime(next_time.timetuple()))
                        time_gap = next_time_second - last_time_second
                        last_time_second = next_time_second
                        if time_gap < 10 * 60:
                            order_id = order_id_list[i]
                            delete_sql = "DELETE FROM " + day_order_table + " WHERE order_id='" + order_id + "';\n"
                            f.write(delete_sql)
        f.close()
        cursor.close()
        # 日期按天自增
        t1 = datetime.datetime.strptime(start_time, "%Y-%m-%d %H:%M:%S")
        t2 = datetime.datetime.strptime(end_time, "%Y-%m-%d %H:%M:%S")
        start_time = str(t1 + datetime.timedelta(days=1))
        end_time = str(t2 + datetime.timedelta(days=1))
    connection.close()