def show_mysql_geolife_track(traId, file_name): db = db_mysql.db_connection(host_name='localhost', user_name='root', password='******', database='stdatamining', charset='utf8') cur = db.cursor() select_sql = 'SELECT longitude, latitude, time_date FROM geolife_point WHERE traid=' + \ str(traId) + ' ORDER BY time_date' lon = [] lat = [] try: cur.execute(select_sql) result = cur.fetchall() for record in result: lon.append(record[0]) lat.append(record[1]) if file_name != 'default': file_name += '\\' + str(traId) + '.txt' f = open(file_name, 'w') for result in cur: line = '' for ele in result: line += str(ele) line += ',' f.write(line + '\n') f.close() cur.close() except: print 'errot to execute the sql' db.close() pl.plot(lon, lat, 'g') pl.xlabel('longitude') pl.ylabel('latitude') pl.title('traId_' + str(traId) + '_' + str(len(lon))) pl.show()
def show_region_ratio(): # 显示不同类型的场所在不同poi区域中的比值规律 connection = db.db_connection() sql = "SELECT * FROM poi_feature ORDER BY poi_id" cursor = connection.cursor() result = {} try: cursor.execute(sql) records = cursor.fetchall() for record in records: for j in range(1, 26): if j in result.keys(): result[j].append(float(record[j])) else: result[j] = [float(record[j])] except Exception: print 'wrong' for key in result.keys(): value = result[key] pl.plot( value, 'g', label='region ' + str(key), ) pl.legend() pl.savefig('data_file\\' + str(key) + '.png') pl.close()
def extract_data_by_day(): connection = db.db_connection() # 起始日期 start_time = '2016-01-01 00:00:00' # 结束时间 end_time = '2016-01-01 23:59:59' while start_time < '2016-01-31 00:00:00': sql_head = "CREATE TABLE "+start_time.split(' ')[0].replace('-', '_') + "_order_data" + \ " AS SELECT * FROM order_data " # sql_head = "CREATE TABLE " + start_time.split(' ')[0].replace('-', '_') + \ # "_pass_orderNum AS SELECT COUNT(*) orderNum,passenger_id FROM " + \ # start_time.split(' ')[0].replace('-', '_') + "_order_data GROUP BY passenger_id" sql_condition = "where time<'" sql_condition += end_time sql_condition += "' and time>'" sql_condition += start_time + "'" sql = sql_head + sql_condition print sql print sql_head try: cursor = connection.cursor() cursor.execute(sql) # cursor.execute(sql_head) cursor.close() except Exception: print 'Wrong!' # 日期按天自增 t1 = datetime.datetime.strptime(start_time, "%Y-%m-%d %H:%M:%S") t2 = datetime.datetime.strptime(end_time, "%Y-%m-%d %H:%M:%S") start_time = str(t1 + datetime.timedelta(days=1)) end_time = str(t2 + datetime.timedelta(days=1)) connection.close()
def calculate_poi_feature(feature_file): connection = db.db_connection() cursor = connection.cursor() f = open(feature_file, 'w') # poi_sql = 'select * from poi_region_order' poi_sql = 'select * from poi_region' cursor.execute(poi_sql) region_num_list = [] records = cursor.fetchall() for record in records: # num = 0 # for i in range(1, len(record)-1): # num += int(record[i]) num = int(record[len(record) - 1]) region_num_list.append(num) max_region_num = max(region_num_list) print max_region_num for i, record in enumerate(records): line = [record[0]] region_num = int(region_num_list[i]) for j in range(1, len(record) - 1): if record[j] is not None: line.append(str(float(record[j]) / region_num)) # the ratio in district_id else: line.append('0') line.append( str(region_num / float(max_region_num))) # the ratio of district_id in all line.append(str(region_num)) line_content = ','.join(line) f.write(line_content + '\n') f.close() connection.close()
def execute_in_sql(file_name, save_file): print file_name connection = db.db_connection() cursor = connection.cursor() r_f = open(file_name, 'r') w_f = open(save_file, 'w') line = r_f.readline() delete_sql_head = "DELETE FROM " + file_name.split('E:\\data_clean\\')[1].split('.')[0] + \ "_order_data_copy WHERE order_id IN " print delete_sql_head order_id_list = [] while line: if line != '': order_id_list.append(line.split('=')[1].split(';')[0]) line = r_f.readline() delete_sql_condi = '(' + ','.join(order_id_list) + ');\n' sql = delete_sql_head + delete_sql_condi try: cursor.execute(sql) connection.commit() except Exception: print 'Wrong' print '*' * 10 w_f.write(sql) connection.close() r_f.close() w_f.close()
def feature_analysis(feature, district, days, save_dir): connection = db.db_connection() cursor = connection.cursor() sql_base = "SELECT %s FROM gap_table WHERE day='%s' AND district_id='%s' ORDER BY time_slice " colors = ['b', 'g', 'r', 'y', 'k', 'w', 'm'] count = 0 is_first = True for day in days: sql = sql_base % (feature, day, district) print sql result = [] try: cursor.execute(sql) records = cursor.fetchall() for record in records: if record[0] is not None: result.append(float(record[0])) else: result.append(-1) except Exception: print 'wrong' result = data_supplement(result) # if is_first: # pl.plot(result, color=colors[count % len(colors)], label=feature) # is_first = False # else: # pl.plot(result, color=colors[count % len(colors)]) pl.plot(result, color=colors[count % len(colors)], label=day) count += 1 pl.legend() pl.title(feature) fig_name = district + "_" + feature + ".png" pl.savefig(save_dir + fig_name) pl.close()
def extract_batch_data(): db = db_mysql.db_connection(host_name='localhost', user_name='root', password='******', database='stdatamining', charset='utf8') cur = db.cursor() traId_list_sql = "SELECT objid,traid FROM continuous_8h_track" print traId_list_sql cur.execute(traId_list_sql) result = cur.fetchall() traId_list = [[ele for ele in record] for record in result] #选出所有满足条件的objId和traId cur.close() # 先创建目录 if not os.path.exists('data\\geolife_data\\geolife_data_8h'): os.makedirs('data\\geolife_data\\geolife_data_8h') for objId_traId in traId_list: file_name = 'data\\geolife_data\\geolife_data_8h\\' objId = objId_traId[0] traId = objId_traId[1] txt_name = str(objId) + '_' + str(traId) + '.txt' file_name += txt_name select_sql = 'SELECT longitude, latitude, time_date FROM geolife_point WHERE traid=' + \ str(traId) + ' ORDER BY time_date' print select_sql cur = db.cursor() cur.execute(select_sql) result = cur.fetchall() f = open(file_name, 'w') for record in result: line = '' for ele in record: line += str(ele) line += ',' f.write(line + '\n') f.close() cur.close() db.close()
def extract_data(traid_file, data_dir):
    """For each trajectory id listed in traid_file (first comma-separated
    field per line), dump its points to data_dir\\<traid>.txt."""
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)
    db = db_mysql.db_connection(host_name='localhost', user_name='root',
                                password='******', database='stdatamining',
                                charset='utf8')
    cur = db.cursor()
    f_r = open(traid_file, 'r')
    lines = f_r.readlines()
    f_r.close()  # BUG FIX: the input file was never closed
    for line in lines:
        traid = line.split(',')[0]
        select_sql = 'SELECT longitude, latitude, time_date FROM geolife_point WHERE traid=' + \
            str(traid) + ' ORDER BY time_date'
        cur.execute(select_sql)
        result = cur.fetchall()
        data_file = data_dir + '\\' + str(traid) + '.txt'
        f_w = open(data_file, 'w')
        for record in result:
            # renamed from `line`, which shadowed the outer loop variable
            row_text = ''
            for ele in record:
                row_text += str(ele)
                row_text += ','
            f_w.write(row_text + '\n')
        f_w.close()
    cur.close()
    db.close()  # BUG FIX: original had `db.close` (no parentheses), a no-op
def create_gap_table(sql_file): w_f = open(sql_file, 'w') create_table_sql = "create table `gap_table` (\n" + "`restrict_id` varchar(10),\n" + \ "`time_slice` varchar(10),\n" + \ "`gap` varchar(10)" + "\n);\n" print create_table_sql w_f.write(create_table_sql) select_sql = "SELECT start_district_id,time_slice FROM order_data WHERE driver_id='null'" connection = db.db_connection() cursor = connection.cursor() result_dic = {} try: cursor.execute(select_sql) record_results = cursor.fetchall() count = 0 num = 0 for record in record_results: if count > 10000: num += 1 print '*'*10 + str(num) count = 0 else: count += 1 dic_key = record[0] + "#" + record[1] if dic_key in result_dic.keys(): result_dic[dic_key] += 1 else: result_dic[dic_key] = 1 except Exception: print 'Wrong' if len(result_dic) > 0: count = 0 num = 0 for record in result_dic.iteritems(): if count > 10000: num += 1 print '-'*10 + str(num) count = 0 else: count += 1 dic_key = record[0] district_id = dic_key.split('#')[0] time_slice = dic_key.split('#')[1] dic_value = record[1] insert_sql = "insert into `gap_table` (`restrict_id`,`time_slice`,`gap`) values (" + \ "'" + district_id + "','" + time_slice + "','" + str(dic_value) + "');\n" w_f.write(insert_sql) w_f.write('commit;') cursor.close() connection.close() w_f.close()
def get_poi_map(): connection = db.db_connection() sql = 'SELECT * FROM cluster_map' cursor = connection.cursor() dic = {} try: cursor.execute(sql) results = cursor.fetchall() for row in results: key = row[1] value = row[0] dic[key] = value except Exception: print 'error' connection.close() return dic
def model_evaluate(csv_file): csv_file = file(csv_file, 'rb') reader = csv.reader(csv_file) connection = db.db_connection() cursor = connection.cursor() gap_dic = {} select_all_gap = 'SELECT * FROM gap_table' try: cursor.execute(select_all_gap) record_results = cursor.fetchall() for record in record_results: dic_key = record[0] + '#' + record[1] dic_value = int(record[2]) gap_dic[dic_key] = dic_value except Exception: print 'Wrong' cursor.close() connection.close() test_record_dic = {} for line in reader: district_id = line[0] if district_id in test_record_dic.keys(): test_record_dic[district_id].append(line[1] + '#' + line[2]) else: test_record_dic[district_id] = [line[1] + '#' + line[2]] test_keys = test_record_dic.keys() evaluation_score = 0 evaluation_score_district = 0 for district in test_keys: district_gap_list = test_record_dic[district] for ele in district_gap_list: gap_key = district + '#' + ele.split('#')[0] evaluate_gap = float(ele.split('#')[1]) if gap_key in gap_dic.keys(): real_gap = gap_dic[gap_key] else: real_gap = 0 if real_gap != 0: evaluation_score_district += abs((real_gap - evaluate_gap) / real_gap) else: continue evaluation_score_district /= len(district_gap_list) # q的选择把所有的(包括为0的都考虑在内) evaluation_score += evaluation_score_district evaluation_score /= len(test_record_dic) return evaluation_score
def extract_data_mysql(): # 起始日期 start_time = '2008-10-01 00:00:00' # 结束时间 end_time = '2008-10-01 23:59:59' while start_time < '2010-01-01 00:00:00': file_name = 'data\\geolife_data\\geolife_153_' file_name += start_time.split(' ')[0]+'.txt' sql_head = "select longitude, latitude, time_date from GEOLIFE_153 t " sql_condition = "where t.time_date < '" sql_condition += end_time sql_condition += "' and t.time_date > '" sql_condition += start_time sql_condition += "' order by time_date" record_count = db_mysql.get_record_count('select count(*) from GEOLIFE_153 t '+sql_condition) print record_count if record_count > 0: sql = sql_head+sql_condition print sql connection = db_mysql.db_connection(host_name='localhost', user_name='root', password='******', database='stdatamining', charset='utf8') cursor = connection.cursor() try: cursor.execute(sql) results = cursor.fetchall() f = open(file_name, 'w') for record in results: line = '' for ele in record: line += str(ele) line += ',' f.write(line+'\n') except Exception: print 'Wrong' f.close() cursor.close() connection.close() # 日期按天自增 t1 = datetime.datetime.strptime(start_time, "%Y-%m-%d %H:%M:%S") t2 = datetime.datetime.strptime(end_time, "%Y-%m-%d %H:%M:%S") start_time = str(t1+datetime.timedelta(days=1)) end_time = str(t2+datetime.timedelta(days=1))
def process_dataobjrelation(): db = db_mysql.db_connection(host_name='localhost', user_name='root', password='******', database='stdatamining', charset='utf8') cur = db.cursor() select_sql1 = 'SELECT DISTINCT objid FROM geolife_trajectory WHERE objid>400076' cur.execute(select_sql1) result = cur.fetchall() begin = 10501 for record in result: objid = record[0] objname = "'" + str(objid)[3:] + "'" insert_sql = "insert into dataobjrelation values(%d,400,'geolife',%d,%s);" % (begin, objid, objname) print insert_sql begin += 1 cur2 = db.cursor() cur2.execute(insert_sql) db.commit() cur2.close() cur.close() db.close()
def show_mean_gap(): start_time = '2016-01-02 00:00:00' end_time = '2016-01-20 00:00:00' date_list = [] while start_time <= end_time: date_list.append(start_time.split(' ')[0]) t1 = datetime.datetime.strptime(start_time, "%Y-%m-%d %H:%M:%S") start_time = str(t1 + datetime.timedelta(days=1)) connection = db.db_connection() cursor = connection.cursor() sql = "SELECT * FROM gap_table WHERE district_id='01' ORDER BY time_slice" gap_dic = {} try: cursor.execute(sql) records = cursor.fetchall() for record in records: key = record[1] value = record[2] if key not in gap_dic.keys(): gap_dic[key] = value except Exception: print 'wrong' result = [] for i in range(1, 145): total = 0 for ele in date_list: if i < 10: key = ele + '-00' + str(i) else: if i < 100: key = ele + '-0' + str(i) else: key = ele + '-' + str(i) if key in gap_dic.keys(): total += float(gap_dic[key]) result.append(total / float(len(date_list))) index = range(1, 145) pl.plot(index, result, 'g', label='mean_gap') pl.legend() pl.show()
def show_district_gap(district_id): connection = db.db_connection() cursor = connection.cursor() sql = "SELECT * FROM gap_table WHERE district_id='" + district_id + \ "' AND time_slice> '2016-01-15-001' AND time_slice<'2016-01-21-144'ORDER BY time_slice" print sql gap_list = [] try: cursor.execute(sql) records = cursor.fetchall() for record in records: if float(record[2]) > 40: gap_list.append(40) else: gap_list.append(float(record[2])) except Exception: print 'wrong' cursor.close() connection.close() if len(gap_list) > 0: pl.plot(gap_list, 'g', label='gap') pl.legend() pl.show()
def statics_by_poi(output_file):
    """Aggregate poi_class counts for every POI region and write one summary
    line per region to output_file.

    Line format: <poi_id>#<max_region_index>#<class>:<count> <class>:<count> ...
    """
    poi_map = get_poi_map()
    connection = db.db_connection()
    cursor = connection.cursor()
    f = open(output_file, 'w')
    for poi_id in poi_map.keys():
        poi_hash = poi_map[poi_id]
        select_sql = "SELECT poi_class FROM poi_data where district_hash='%s'" % poi_hash
        cursor.execute(select_sql)
        detail_eles = cursor.fetchone()[0].split(' ')
        region_count_map = {}
        for ele in detail_eles:
            if ele.find('#') == -1:
                # no '#': the entry is simply '<class>:<count>'
                key = int(ele.split(':')[0])
                region_count_map[key] = int(ele.split(':')[1])
            else:
                # with '#': class id before the first '#', count taken from
                # the ':' of the last '#'-separated segment
                parts = ele.split('#')
                key = int(parts[0])
                value = int(parts[len(parts) - 1].split(':')[1])
                if key in region_count_map:
                    region_count_map[key] += value
                else:
                    region_count_map[key] = value
        # NOTE(review): this selects the LARGEST CLASS ID, not the class with
        # the largest count (that would need key=lambda x: x[1]) — behavior
        # kept as-is, but confirm the intent.
        max_region_index = max(region_count_map.items(), key=lambda x: x[0])[0]
        counts_text = ''
        for key in region_count_map.keys():
            counts_text += str(key) + ':' + str(region_count_map[key]) + ' '
        f.write(str(poi_id) + '#' + str(max_region_index) + '#' + counts_text)
        f.write('\n')
    connection.close()
    f.close()
def feature_judge(sql_file): w_f = open(sql_file, 'w') w_f.write("DROP TABLE IF EXISTS `feature_judge`;\n") w_f.write("CREATE TABLE `feature_judge` (\n" + "`district_id` varchar(10) DEFAULT NULL,\n" + "`time_slice` varchar(20) DEFAULT NULL,\n " + "`district_code` varchar(10) DEFAULT NULL,\n" + "`time_slice_code` varchar(10)\n " + ");\n") code_1000 = [ '1', '12', '16', '19', '20', '24', '25', '28', '29', '38', '42', '46', '53' ] code_1100 = ['4', '6', '37'] code_0100 = ['41'] code_1010 = ['26'] code_1110 = ['14', '23'] code_1111 = ['7', '8', '48', '51'] connection = db.db_connection() sql = "SELECT district_id,time_slice FROM gap_table ORDER BY district_id,time_slice" print sql cursor = connection.cursor() try: cursor.execute(sql) records = cursor.fetchall() insert_base = "insert into `feature_judge`(`district_id`,`time_slice`,`district_code`,`time_slice_code`)" for record in records: district_id = record[0] time_slice = record[1] district_code = '0000' if district_id in code_0100: district_code = '0100' if district_id in code_1000: district_code = '1000' if district_id in code_1010: district_code = '1010' if district_id in code_1100: district_code = '1100' if district_id in code_1110: district_code = '1110' if district_id in code_1111: district_code = '1111' segment = time_slice.split('-')[3] time_slice_code = '0' # 默认为0 if segment >= '100': if segment <= '110': time_slice_code = '4' else: if '125' <= segment <= '135': time_slice_code = '5' else: if '000' <= segment <= '030': time_slice_code = '1' if '040' <= segment <= '060': time_slice_code = '2' if '075' <= segment <= '095': time_slice_code = '3' inser_value = "values ('" + district_id + "','" + time_slice + "','" +\ district_code + "','" + time_slice_code + "');\n" w_f.write(insert_base + inser_value) except Exception: print 'wrong!' w_f.write('commit;') cursor.close() connection.close()
def data_clean():
    # Walk every day of 2016-01, scanning that day's *_order_data_copy table
    # in one-hour windows, and write DELETE statements (to <date>.txt) that
    # de-duplicate orders from the same passenger in the same time slice.
    # NOTE(review): drive_id comparisons use the string 'NULL' — presumably
    # the table stores the literal text; confirm against the data.
    connection = db.db_connection()
    # start date
    start_time = '2016-01-01 00:00:00'
    while start_time < '2016-01-31 00:30:00':
        print start_time
        day_order_table = start_time.split(' ')[0].replace(
            '-', '_') + "_order_data_copy"  # the day's record table
        cursor = connection.cursor()
        f = open(start_time.split(' ')[0].replace('-', '_') + ".txt", 'w')
        day_start_time = start_time.split(' ')[0] + ' 00:00:00'
        day_end_time = start_time.split(' ')[0] + ' 0:59:59'
        day_deadline = start_time.split(' ')[0] + ' 23:59:59'
        while day_end_time <= day_deadline:
            sql = "SELECT order_id,driver_id,passenger_id,dest_district_hash,time_slice FROM " + \
                day_order_table + " WHERE TIME>'" +\
                day_start_time + "' AND TIME<='" + day_end_time + "' ORDER BY passenger_id,TIME"
            print sql
            try:
                cursor.execute(sql)
                order_results = cursor.fetchall()
                is_first_record = True
                for record in order_results:
                    if is_first_record:
                        # remember the first row as the comparison baseline
                        is_first_record = False
                        last_order_id = record[0]
                        last_drive_id = record[1]
                        last_pass_id = record[2]
                        last_dest_hash = record[3]
                        last_time_slice = record[4]
                        continue
                    else:
                        next_order_id = record[0]
                        next_drive_id = record[1]
                        next_pass_id = record[2]
                        next_dest_hash = record[3]
                        next_time_slice = record[4]
                        if next_pass_id == last_pass_id and next_time_slice == last_time_slice:
                            # both unanswered, same destination: drop the earlier one
                            if next_drive_id == 'NULL' and last_drive_id == 'NULL' and next_dest_hash == last_dest_hash:
                                delete_sql = "DELETE FROM " + day_order_table + \
                                    " WHERE order_id='" + last_order_id + "';\n"
                                f.write(delete_sql)
                            # same destination but different answer state:
                            # delete whichever record is unanswered
                            if next_drive_id != last_drive_id and next_dest_hash == last_dest_hash:
                                if next_drive_id == 'NULL':
                                    delete_sql = "DELETE FROM " + day_order_table + \
                                        " WHERE order_id='" + next_order_id + "';\n"
                                    f.write(delete_sql)
                                if last_drive_id == 'NULL':
                                    delete_sql = "DELETE FROM " + day_order_table + \
                                        " WHERE order_id='" + last_order_id + "';\n"
                                    f.write(delete_sql)
                            # different destination and different answer state: keep both
                            if next_drive_id != last_drive_id and next_dest_hash != last_dest_hash:
                                continue
                        # current row becomes the new baseline
                        last_order_id = next_order_id
                        last_drive_id = next_drive_id
                        last_pass_id = next_pass_id
                        last_dest_hash = next_dest_hash
                        last_time_slice = next_time_slice
            except Exception:
                print 'Wrong!!!'
            # slide the one-hour window forward
            day_t1 = datetime.datetime.strptime(day_start_time, "%Y-%m-%d %H:%M:%S")
            day_t2 = datetime.datetime.strptime(day_end_time, "%Y-%m-%d %H:%M:%S")
            day_start_time = str(day_t1 + datetime.timedelta(hours=1))
            day_end_time = str(day_t2 + datetime.timedelta(hours=1))
        f.close()
        cursor.close()
        # advance to the next day
        t1 = datetime.datetime.strptime(start_time, "%Y-%m-%d %H:%M:%S")
        start_time = str(t1 + datetime.timedelta(days=1))
    connection.close()
def data_clean2(): connection = db.db_connection() # 起始日期 start_time = '2016-01-01 00:00:00' while start_time < '2016-01-31 00:30:00': print start_time day_order_table = start_time.split(' ')[0].replace( '-', '_') + "_order_data" # 每天的记录表 cursor = connection.cursor() f = open(start_time.split(' ')[0].replace('-', '_') + ".txt", 'w') day_start_time = start_time.split(' ')[0] + ' 00:00:00' day_end_time = start_time.split(' ')[0] + ' 0:59:59' day_deadline = start_time.split(' ')[0] + ' 23:59:59' while day_end_time <= day_deadline: sql = "SELECT order_id,driver_id,passenger_id,dest_district_hash,time_slice FROM " + \ day_order_table + " WHERE TIME>'" + \ day_start_time + "' AND TIME<='" + day_end_time + "' ORDER BY passenger_id,TIME" # sql = "SELECT order_id,driver_id,passenger_id,dest_district_hash,time_slice " \ # "FROM 2016_01_01_order_data WHERE passenger_id='00156d54838c42117ddc25cf5bf330bc' ORDER BY time" print sql try: cursor.execute(sql) order_results = cursor.fetchall() is_first_record = True null_order_id_list = [] for record in order_results: if is_first_record: is_first_record = False last_order_id = record[0] last_drive_id = record[1] last_pass_id = record[2] last_dest_hash = record[3] last_time_slice = record[4] if last_drive_id == 'NULL': null_order_id_list.append(last_drive_id) continue else: next_order_id = record[0] next_drive_id = record[1] next_pass_id = record[2] next_dest_hash = record[3] next_time_slice = record[4] if next_pass_id == last_pass_id and next_time_slice == last_time_slice: if next_drive_id == 'NULL': null_order_id_list.append(next_drive_id) else: continue else: # 如果用户跳变,或者是时间片跳变 if len(null_order_id_list) > 1: for i in range(1, len(null_order_id_list)): delete_sql = "DELETE FROM " + day_order_table + \ " WHERE order_id='" + null_order_id_list[i] + "';\n" f.write(delete_sql) null_order_id_list = [] if next_drive_id == 'NULL': null_order_id_list.append(next_drive_id) else: continue last_order_id = next_order_id last_drive_id = 
next_drive_id last_pass_id = next_pass_id last_dest_hash = next_dest_hash last_time_slice = next_time_slice # last_time = next_time # last_time_second = next_time_second if len(null_order_id_list) > 1: for i in range(1, len(null_order_id_list)): delete_sql = "DELETE FROM " + day_order_table + \ " WHERE order_id='" + null_order_id_list[i] + "';\n" f.write(delete_sql) null_order_id_list = [] except Exception: print 'Wrong!!!' day_t1 = datetime.datetime.strptime(day_start_time, "%Y-%m-%d %H:%M:%S") day_t2 = datetime.datetime.strptime(day_end_time, "%Y-%m-%d %H:%M:%S") day_start_time = str(day_t1 + datetime.timedelta(hours=1)) day_end_time = str(day_t2 + datetime.timedelta(hours=1)) f.close() cursor.close() # 日期按天自增 t1 = datetime.datetime.strptime(start_time, "%Y-%m-%d %H:%M:%S") start_time = str(t1 + datetime.timedelta(days=1)) connection.close()
def process(data_dir, temp_file_name): if not os.path.exists(data_dir): os.makedirs(data_dir) traid_begin = 20095 pointid_begin = 119443809 dataid = 500 objid = 500001 UTC_FORMAT = '%Y-%m-%dT%H:%M:%S.%fz' now_stamp = time.time() local_time = datetime.datetime.fromtimestamp(now_stamp) utc_time = datetime.datetime.utcfromtimestamp(now_stamp) offset = local_time - utc_time for parent, dirName, file_names in os.walk(data_dir): for file_name in file_names: temp_file = open(temp_file_name, 'w') csvfile = file(data_dir + '\\' + file_name, 'r') reader = csv.reader(csvfile) line_number = 1 traname = '\'' + file_name.split('.')[0] + '\'' is_first = True count = 0 for line in reader: if line_number <= 4: line_number += 1 continue else: count += 1 longitude = float(line[3]) latitude = float(line[2]) height = round(float(line[4])) utc_time = line[8] local_time = datetime.datetime.strptime( utc_time, UTC_FORMAT) + offset t = '\'' + local_time.strftime('%Y-%m-%d %H:%M:%S') + '\'' if is_first: from_time = t is_first = False to_time = t insert_sql = "INSERT INTO owndata_point(pointid, traid, longitude, latitude,time_date,height) VALUES" \ " (%d,%d,%f,%f,%s,%d);" % (pointid_begin, traid_begin, longitude, latitude, t, height) temp_file.write(insert_sql + '\n') pointid_begin += 1 temp_file.close() insert_trajectory_sql = "INSERT INTO owndata_trajectory(traid,traname,dataid,objid,starttime,endtime,totalpoint)" \ " VALUES (%d,%s,%d,%d,%s,%s,%d);" % \ (traid_begin, traname, dataid, objid, from_time, to_time, count) traid_begin += 1 csvfile.close() print insert_trajectory_sql db = db_mysql.db_connection(host_name='localhost', user_name='root', password='******', database='stdatamining', charset='utf8') cur = db.cursor() cur.execute(insert_trajectory_sql) db.commit() f_r = open(temp_file_name, 'r') for line_sql in f_r.readlines(): sql = line_sql.split('\n')[0] cur.execute(sql) db.commit() cur.close() f_r.close()
def data_clean(): connection = db.db_connection() # 起始日期 start_time = '2016-01-01 00:00:00' # 结束时间 end_time = '2016-01-01 23:59:59' while start_time < '2016-01-31 00:00:00': print start_time pass_ordernum_table = start_time.split(' ')[0].replace( '-', '_') + "_pass_orderNum" # 用户记录数表 day_order_table = start_time.split(' ')[0].replace( '-', '_') + "_order_data" # 每天的记录表 pass_id_list = [] select_pass_id_sql = "SELECT passenger_id FROM " + pass_ordernum_table + " WHERE orderNum>1" cursor = connection.cursor() try: cursor.execute(select_pass_id_sql) results = cursor.fetchall() for pass_id in results: pass_id_list.append(pass_id[0]) except Exception: print 'Error!' if len(pass_id_list) > 0: f = open("start_time.split(' ')[0].replace('-', '_')" + ".txt", 'w') for id_ele in pass_id_list: # 对每个用户进行处理 select_pass_order_sql = "SELECT order_id,TIME FROM " + day_order_table + \ " WHERE passenger_id='" + id_ele + "' ORDER BY TIME " order_id_time_dic = {} try: cursor.execute(select_pass_order_sql) order_id_time_results = cursor.fetchall() for ele in order_id_time_results: order_id_time_dic[ele[0]] = ele[1] #订单号和时间组成字典 except Exception: print 'WrongWrong!!' 
if len(order_id_time_dic) > 0: sorted_dic = sorted(order_id_time_dic.iteritems(), key=lambda x: x[1]) time_list = [] order_id_list = [] for i in range(len(sorted_dic)): time_list.append(sorted_dic[i][1]) order_id_list.append(sorted_dic[i][0]) # last_time = time.strptime(time_list[0], "%Y-%m-%d %H:%M:%S") last_time = time_list[0] last_time_second = int(time.mktime(last_time.timetuple())) for i in range(1, len(time_list)): # next_time = time.strptime(time_list[i], "%Y-%m-%d %H:%M:%S") next_time = time_list[i] next_time_second = int( time.mktime(next_time.timetuple())) time_gap = next_time_second - last_time_second last_time_second = next_time_second if time_gap < 10 * 60: order_id = order_id_list[i] delete_sql = "DELETE FROM " + day_order_table + " WHERE order_id='" + order_id + "';\n" f.write(delete_sql) f.close() cursor.close() # 日期按天自增 t1 = datetime.datetime.strptime(start_time, "%Y-%m-%d %H:%M:%S") t2 = datetime.datetime.strptime(end_time, "%Y-%m-%d %H:%M:%S") start_time = str(t1 + datetime.timedelta(days=1)) end_time = str(t2 + datetime.timedelta(days=1)) connection.close()