def main():
    country = 'uk'
    traj_table = 'tak.%s_traj' % country
    evnt_table = 'tak.%s_evnt' % country
    crash_table = 'tak.%s_crash' % country
    drop_table = True
    max_speed = 0.07
    space_treshold = 0.05
    time_treshold = 1200  # split traj if time gap between consecutive points > max_time_gap

    users = pd.read_csv('../../dataset/users_%s.csv' % country,
                        header=None)[0].values.tolist()

    client = pymongo.MongoClient('mongodb://username@ipaddress:port/')
    db = client['dataset2']
    con = database_io.get_connection()

    print(datetime.now(), 'Building trajectories')
    for i, uid in enumerate(users):
        traj_data = db.POSITIONS.find({'T&K_VOUCHER_ID': uid}).sort(
            'TIMESTAMP_LOCAL', pymongo.ASCENDING)
        evnt_data = db.EVENTS.find({'T&K_VOUCHER_ID': uid}).sort(
            'TIMESTAMP_LOCAL', pymongo.ASCENDING)
        crash_data = db.CRASH.find({'T&K_VOUCHER_ID': uid}).sort(
            'TIMESTAMP_LOCAL', pymongo.ASCENDING)

        print(datetime.now(), 'Processing user %s, %s of %s (%.2f)' % (
            uid, i, len(users), 100.0 * i / len(users)))

        gps2trajevntcrash(traj_data, evnt_data, crash_data,
                          traj_table, evnt_table, crash_table, con,
                          drop_table=drop_table, max_speed=max_speed,
                          space_treshold=space_treshold,
                          time_treshold=time_treshold)
        drop_table = False

    print(datetime.now(), 'Process ended.')
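
# ---------------------------------------------------------------------------
# Illustrative sketch (assumption): gps2trajevntcrash is imported from
# elsewhere in the project and is not shown here. A minimal split predicate
# consistent with the parameters above -- cut when the temporal gap exceeds
# time_treshold, or when a large spatial jump is covered faster than
# max_speed -- might look like this. The point layout (lon, lat, timestamp)
# and the threshold units are hypothetical.
def should_split_sketch(prev_pt, next_pt, max_speed=0.07,
                        space_threshold=0.05, time_threshold=1200):
    from math import hypot
    time_gap = next_pt[2] - prev_pt[2]                 # seconds
    space_gap = hypot(next_pt[0] - prev_pt[0],
                      next_pt[1] - prev_pt[1])         # same unit as space_threshold
    if time_gap > time_threshold:
        return True
    # a big jump at an implausible speed is treated as a break in the stream
    return (space_gap > space_threshold and time_gap > 0
            and space_gap / time_gap > max_speed)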
def imn_extract(filename, path, type_user, traj_table, evnt_table,
                min_traj_nbr, min_length, min_duration, area, overwrite=False):
    output_filename = path + filename
    con = database_io.get_connection()
    cur = con.cursor()

    # users_list = [100225, 101127, 100742, 100747, 100690, 100578, 1003,
    #               100191, 100192, 100193, 321463]
    users_list = [100619, 100554]
    users_list = sorted(users_list)
    nbr_users = len(users_list)
    print('user ids before checking:')
    print(nbr_users, len(users_list))

    # skip users already present in the output file unless overwriting
    if os.path.isfile(output_filename) and not overwrite:
        processed_users = list()
        fout = gzip.GzipFile(output_filename, 'r')
        for row in fout:
            customer_obj = json.loads(row)
            processed_users.append(customer_obj['uid'])
        fout.close()
        users_list = [uid for uid in users_list if uid not in processed_users]
    print('user ids after checking:')
    print(nbr_users, len(users_list))

    for i, uid in enumerate(users_list):
        print(datetime.datetime.now(), '%s %s %s [%s/%s] - %.2f' % (
            traj_table, area, type_user, i, nbr_users, i / nbr_users * 100.0))

        imh = database_io.load_individual_mobility_history(
            cur, uid, traj_table, min_length, min_duration)
        events = database_io.load_individual_event_history(
            cur, uid, evnt_table) if evnt_table is not None else None
        # imh['trajectories'] = dict(list(islice(imh['trajectories'].items(), 200)))  # debug cap

        if len(imh['trajectories']) < min_traj_nbr:
            print('len(trajectories) < min_traj_nbr',
                  len(imh['trajectories']), min_traj_nbr)
            continue

        # split the trajectories into the two bimonthly periods
        main_imh = imh['trajectories']
        jan_feb_tid = []
        march_april_tid = []
        for tid, t in imh['trajectories'].items():
            start_time = str(t.start_time())
            if ('2017-01' in start_time) or ('2017-02' in start_time):
                jan_feb_tid.append(tid)
            if ('2017-03' in start_time) or ('2017-04' in start_time):
                march_april_tid.append(tid)

        imh['trajectories'] = {x: main_imh[x] for x in jan_feb_tid}
        imn1 = individual_mobility_network.build_imn(
            imh, reg_loc=True, events=events, verbose=False)
        period_imn1 = {'01-02': imn1}

        imh['trajectories'] = {x: main_imh[x] for x in march_april_tid}
        imn2 = individual_mobility_network.build_imn(
            imh, reg_loc=True, events=events, verbose=False)
        period_imn2 = {'03-04': imn2}

        customer_obj = {'uid': uid}
        period_imn1.update(period_imn2)
        customer_obj.update(period_imn1)

        json_str = '%s\n' % json.dumps(clear_tuples4json(customer_obj),
                                       default=agenda_converter)
        json_bytes = json_str.encode('utf-8')
        with gzip.GzipFile(output_filename, 'a') as fout:
            fout.write(json_bytes)

    print('done')
    cur.close()
    con.close()
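
# ---------------------------------------------------------------------------
# Illustrative sketches (assumption): clear_tuples4json and agenda_converter
# are imported from elsewhere in the project and their definitions are not
# shown here. Minimal versions consistent with their use above -- a
# pre-serialization cleaner and a json.dumps `default` hook -- could be:
def clear_tuples4json_sketch(obj):
    """Recursively turn tuple keys into strings and tuples into lists."""
    if isinstance(obj, dict):
        return {str(k): clear_tuples4json_sketch(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [clear_tuples4json_sketch(v) for v in obj]
    return obj


def agenda_converter_sketch(obj):
    """json.dumps `default` hook: serialize datetimes as ISO strings."""
    import datetime
    if isinstance(obj, (datetime.datetime, datetime.date)):
        return obj.isoformat()
    raise TypeError('%r is not JSON serializable' % obj)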
def imn_extract_for_one_month(filename, path, type_user, traj_table, evnt_table,
                              min_traj_nbr, min_length, min_duration, area,
                              overwrite=False):
    output_filename = path + filename
    con = database_io.get_connection()
    cur = con.cursor()

    users_list = find_user_list(cur, traj_table)
    nbr_users = len(users_list)
    print('user ids before checking:')
    print(nbr_users, len(users_list))

    # skip users already present in the output file unless overwriting
    if os.path.isfile(output_filename) and not overwrite:
        processed_users = list()
        fout = gzip.GzipFile(output_filename, 'r')
        for row in fout:
            customer_obj = json.loads(row)
            processed_users.append(customer_obj['uid'])
        fout.close()
        users_list = [uid for uid in users_list if uid not in processed_users]
    print('user ids after checking:')
    print(nbr_users, len(users_list))

    for i, uid in enumerate(users_list):
        try:
            print(datetime.datetime.now(), '%s %s %s [%s/%s] - %.2f' % (
                traj_table, area, type_user, i, nbr_users, i / nbr_users * 100.0))

            imh = database_io.load_individual_mobility_history(
                cur, uid, traj_table, min_length, min_duration)
            events = database_io.load_individual_event_history(
                cur, uid, evnt_table) if evnt_table is not None else None

            if len(imh['trajectories']) < min_traj_nbr:
                print('len(trajectories) < min_traj_nbr',
                      len(imh['trajectories']), min_traj_nbr)
                continue

            # keep only the trajectories that start in January 2017
            jan_tid = []
            for tid, t in imh['trajectories'].items():
                start_time = str(t.start_time())
                if '2017-01' in start_time:
                    jan_tid.append(tid)
            imh['trajectories'] = {x: imh['trajectories'][x] for x in jan_tid}

            imn1 = individual_mobility_network.build_imn(
                imh, reg_loc=True, events=events, verbose=False)
            period_imn1 = {'01': imn1}

            customer_obj = {'uid': uid}
            customer_obj.update(period_imn1)

            json_str = '%s\n' % json.dumps(clear_tuples4json(customer_obj),
                                           default=agenda_converter)
            json_bytes = json_str.encode('utf-8')
            with gzip.GzipFile(output_filename, 'a') as fout:
                fout.write(json_bytes)
        except TypeError:
            print('type error')
            continue

    print('done')
    cur.close()
    con.close()
def imn_extract_all_year(filename, path, type_user, traj_table, evnt_table,
                         min_traj_nbr, min_length, min_duration, area,
                         overwrite=False):
    output_filename = path + filename
    con = database_io.get_connection()
    cur = con.cursor()

    # users_list = find_user_list(cur, traj_table)
    # users_list = [100225, 101127, 100742, 100747, 100690, 100578, 1003,
    #               100191, 100192, 100193, 318819, 100619, 100554, 100498]
    # users_list = [100843, 100836, 100827, 100795, 100747, 100717, 100681,
    #               100669, 101293, 101194, 101091]
    users_list = [7925]
    users_list = sorted(users_list)
    nbr_users = len(users_list)
    print('user ids before checking:')
    print(nbr_users, len(users_list))

    # skip users already present in the output file unless overwriting
    if os.path.isfile(output_filename) and not overwrite:
        processed_users = list()
        fout = gzip.GzipFile(output_filename, 'r')
        for row in fout:
            customer_obj = json.loads(row)
            processed_users.append(customer_obj['uid'])
        fout.close()
        users_list = [uid for uid in users_list if uid not in processed_users]
    print('user ids after checking:')
    print(nbr_users, len(users_list))

    for i, uid in enumerate(users_list):
        try:
            print(datetime.datetime.now(), '%s %s %s [%s/%s] - %.2f' % (
                traj_table, area, type_user, i, nbr_users, i / nbr_users * 100.0))

            imh = database_io.load_individual_mobility_history(
                cur, uid, traj_table, min_length, min_duration)
            events = database_io.load_individual_event_history(
                cur, uid, evnt_table) if evnt_table is not None else None
            # imh['trajectories'] = dict(list(islice(imh['trajectories'].items(), 200)))  # debug cap

            if len(imh['trajectories']) < min_traj_nbr:
                print('len(trajectories) < min_traj_nbr',
                      len(imh['trajectories']), min_traj_nbr)
                continue

            # build a single IMN over the whole year
            imn = individual_mobility_network.build_imn(
                imh, reg_loc=True, events=events, verbose=False)
            customer_obj = {'uid': uid}
            customer_obj.update(imn)

            json_str = '%s\n' % json.dumps(clear_tuples4json(customer_obj),
                                           default=agenda_converter)
            json_bytes = json_str.encode('utf-8')
            with gzip.GzipFile(output_filename, 'a') as fout:
                fout.write(json_bytes)
        except TypeError:
            print('type error')
            continue

    print('done')
    cur.close()
    con.close()
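
# ---------------------------------------------------------------------------
# Usage sketch: the IMN files written by the extractors above are gzipped
# JSON-lines, one customer object per line with a 'uid' key plus one key per
# period. Reading one back (mirroring the resume logic used in the
# extractors):
def load_imns_sketch(output_filename):
    import gzip
    import json
    customers = []
    with gzip.GzipFile(output_filename, 'r') as fin:
        for row in fin:
            customers.append(json.loads(row))
    return customers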
def main():
    area = sys.argv[1]  # 'rome' 'tuscany' 'london'
    type_user = sys.argv[2]  # 'crash' 'nocrash'
    overwrite = int(sys.argv[3])

    country = 'uk' if area == 'london' else 'italy'
    min_length = 1.0
    min_duration = 60.0

    print(datetime.datetime.now(), 'Crash Prediction - Train Test Partitioner')
    if not overwrite:
        print(datetime.datetime.now(), '(restart)')

    path = './'
    path_imn = path + 'imn_new/'
    path_dataset = path + 'dataset/'
    path_traintest = path + 'traintest/'
    path_quadtree = path + 'quadtree/'

    traj_table = 'tak.%s_traj' % country
    evnt_table = 'tak.%s_evnt' % country
    crash_table = 'tak.%s_crash' % country

    if area == 'london' and type_user == 'nocrash':
        users_filename = path_dataset + '%s_%s_users_list.csv' % (area, 'all')
        users_filename_crash = path_dataset + '%s_%s_users_list.csv' % (area, 'crash')
    else:
        users_filename = path_dataset + '%s_%s_users_list.csv' % (area, type_user)
        users_filename_crash = None

    users_list = pd.read_csv(users_filename).values[:, 0].tolist()
    users_list = sorted(users_list)

    if users_filename_crash is not None:
        users_list_crash = pd.read_csv(users_filename_crash).values[:, 0].tolist()
        users_list_crash = sorted(users_list_crash)
        users_list = [uid for uid in users_list if uid not in users_list_crash]

    nbr_users = len(users_list)

    print(datetime.datetime.now(), 'Reading quadtree')
    quadtree_poi_filename = path_quadtree + '%s_personal_osm_poi_lv17.json.gz' % area
    fout = gzip.GzipFile(quadtree_poi_filename, 'r')
    quadtree = json.loads(fout.readline())
    fout.close()

    print(datetime.datetime.now(), 'Reading quadtree features')
    quadtree_features_filename = path_quadtree + '%s_quadtree_features.json.gz' % area
    fout = gzip.GzipFile(quadtree_features_filename, 'r')
    quadtrees_features_str = json.loads(fout.readline())
    quadtrees_features = {int(k): v for k, v in quadtrees_features_str.items()}
    fout.close()

    processed_users = set()
    if overwrite:
        for index in range(0, 7):
            output_filename = path_traintest + '%s_%s_traintest_%s.json.gz' % (
                area, type_user, index)
            if os.path.exists(output_filename):
                os.remove(output_filename)
    else:
        for index in range(0, 7):
            output_filename = path_traintest + '%s_%s_traintest_%s.json.gz' % (
                area, type_user, index)
            if os.path.isfile(output_filename):
                fout = gzip.GzipFile(output_filename, 'r')
                for row in fout:
                    customer_obj = json.loads(row)
                    processed_users.add(customer_obj['uid'])
                fout.close()

    window = 4
    datetime_from = datetime.datetime.strptime('2017-01-01 00:00:00', '%Y-%m-%d %H:%M:%S')
    datetime_to = datetime.datetime.strptime('2018-01-01 00:00:00', '%Y-%m-%d %H:%M:%S')

    print(datetime.datetime.now(), 'Generating month boundaries')
    months = pd.date_range(start=datetime_from, end=datetime_to, freq='MS')
    boundaries = [[lm, um] for lm, um in zip(months[:-window], months[window:])]

    training_months = list()
    test_months = list()
    for i in range(len(boundaries) - 1):
        training_months.append(boundaries[i])
        test_months.append(boundaries[i + 1])

    index = 0
    tr_data_map = dict()
    ts_data_map = dict()
    for tr_months, ts_months in zip(training_months, test_months):
        tr_data_map[tuple(tr_months)] = index
        ts_data_map[tuple(ts_months)] = index
        index += 1

    def empty_cell_features():
        # all per-cell counters start at zero
        return {
            'nbr_traj_start': 0, 'nbr_traj_stop': 0, 'nbr_traj_move': 0,
            'traj_speed_sum': 0, 'traj_speed_count': 0,
            'nbr_evnt_A': 0, 'nbr_evnt_B': 0, 'nbr_evnt_C': 0, 'nbr_evnt_Q': 0,
            'nbr_evnt_start': 0, 'nbr_evnt_stop': 0,
            'speed_A_sum': 0, 'max_acc_A_sum': 0, 'avg_acc_A_sum': 0,
            'speed_B_sum': 0, 'max_acc_B_sum': 0, 'avg_acc_B_sum': 0,
            'speed_C_sum': 0, 'max_acc_C_sum': 0, 'avg_acc_C_sum': 0,
            'speed_Q_sum': 0, 'max_acc_Q_sum': 0, 'avg_acc_Q_sum': 0,
            'nbr_crash': 0,
        }

    print(datetime.datetime.now(), 'Initializing quadtree features')
    tr_quadtree_features = dict()
    for m in quadtrees_features:
        for lu, index in tr_data_map.items():
            if lu[0].month <= m < lu[1].month:
                if index not in tr_quadtree_features:
                    tr_quadtree_features[index] = dict()
                for path in quadtrees_features[m]:
                    if path not in tr_quadtree_features[index]:
                        tr_quadtree_features[index][path] = empty_cell_features()
                    for k, v in quadtrees_features[m][path].items():
                        tr_quadtree_features[index][path][k] += v

    ts_quadtree_features = dict()
    for m in quadtrees_features:
        for lu, index in ts_data_map.items():
            if lu[0].month <= m < lu[1].month:
                if index not in ts_quadtree_features:
                    ts_quadtree_features[index] = dict()
                for path in quadtrees_features[m]:
                    if path not in ts_quadtree_features[index]:
                        ts_quadtree_features[index][path] = empty_cell_features()
                    for k, v in quadtrees_features[m][path].items():
                        ts_quadtree_features[index][path][k] += v

    print(datetime.datetime.now(), 'Connecting to database')
    con = database_io.get_connection()
    cur = con.cursor()

    count = 0
    imn_filedata = gzip.GzipFile(path_imn + '%s_imn_%s.json.gz' % (area, type_user), 'r')

    print(datetime.datetime.now(), 'Calculating features and partitioning dataset')
    for row in imn_filedata:
        if len(row) <= 1:
            print('new file started ;-)')
            continue
        user_obj = json.loads(row)
        uid = user_obj['uid']
        count += 1
        if uid in processed_users:
            continue
        if count % 10 == 0:
            print(datetime.datetime.now(), 'train test partition %s %s [%s/%s] - %.2f' % (
                area, type_user, count, nbr_users, 100 * count / nbr_users))

        imh = database_io.load_individual_mobility_history(
            cur, uid, traj_table, min_length, min_duration)
        events = database_io.load_individual_event_history(
            cur, uid, evnt_table) if evnt_table is not None else None
        trajectories = imh['trajectories']

        tr_data = dict()
        ts_data = dict()

        # partitioning imn for train and test
        for imn_months in user_obj:
            if imn_months == 'uid':
                continue
            m0 = int(imn_months.split('-')[0])
            m1 = int(imn_months.split('-')[1])
            for lu, index in tr_data_map.items():
                if lu[0].month <= m0 < m1 < lu[1].month:
                    if index not in tr_data:
                        tr_data[index] = {
                            'uid': uid, 'crash': False, 'trajectories': dict(),
                            'imns': dict(), 'events': dict(),
                        }
                    tr_data[index]['imns'][imn_months] = user_obj[imn_months]
            for lu, index in ts_data_map.items():
                if lu[0].month <= m0 < lu[1].month:
                    if index not in ts_data:
                        ts_data[index] = {
                            'uid': uid, 'crash': False, 'trajectories': dict(),
                            'imns': dict(), 'events': dict(),
                        }
                    ts_data[index]['imns'][imn_months] = user_obj[imn_months]

        # partitioning trajectories for train and test
        for tid, traj in trajectories.items():
            for lu, index in tr_data_map.items():
                if lu[0] <= traj.start_time() < lu[1] and index in tr_data:
                    tr_data[index]['trajectories'][tid] = traj
            for lu, index in ts_data_map.items():
                if lu[0] <= traj.start_time() < lu[1] and index in ts_data:
                    ts_data[index]['trajectories'][tid] = traj

        # partitioning events for train and test
        for eid, evnt in events.items():
            for lu, index in tr_data_map.items():
                if lu[0] <= evnt[0]['date'] < lu[1] and index in tr_data:
                    tr_data[index]['events'][eid] = evnt[0]
            for lu, index in ts_data_map.items():
                if lu[0] <= evnt[0]['date'] < lu[1] and index in ts_data:
                    ts_data[index]['events'][eid] = evnt[0]

        # has the user crashed in the month after each window?
        for lu, index in tr_data_map.items():
            if index not in tr_data:
                continue
            query = """SELECT * FROM %s WHERE uid = '%s'
                AND date >= TO_TIMESTAMP('%s','YYYY-MM-DD HH24:MI:SS')
                AND date < TO_TIMESTAMP('%s','YYYY-MM-DD HH24:MI:SS')""" % (
                crash_table, uid, str(lu[1]), str(lu[1] + relativedelta(months=1)))
            cur.execute(query)
            rows = cur.fetchall()
            has_crash_next_month = len(rows) > 0
            tr_data[index]['crash'] = has_crash_next_month

        for lu, index in ts_data_map.items():
            if index not in ts_data:
                continue
            query = """SELECT * FROM %s WHERE uid = '%s'
                AND date >= TO_TIMESTAMP('%s','YYYY-MM-DD HH24:MI:SS')
                AND date < TO_TIMESTAMP('%s','YYYY-MM-DD HH24:MI:SS')""" % (
                crash_table, uid, str(lu[1]), str(lu[1] + relativedelta(months=1)))
            cur.execute(query)
            rows = cur.fetchall()
            has_crash_next_month = len(rows) > 0
            ts_data[index]['crash'] = has_crash_next_month

        tr_features, ts_features = feature_extractor.extract_features(
            uid, tr_data, ts_data, quadtree, tr_quadtree_features, ts_quadtree_features)

        for index in tr_features:
            if index in ts_features:
                output_filename = path_traintest + '%s_%s_traintest_%s.json.gz' % (
                    area, type_user, index)
                store_obj = {'uid': uid,
                             'train': tr_features[index],
                             'test': ts_features[index]}
                feature_extractor.store_features(output_filename, store_obj)

    imn_filedata.close()
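
# ---------------------------------------------------------------------------
# Sketch of an alternative (assumption): the crash queries above interpolate
# uid and dates directly into the SQL string. A parameter-bound version,
# assuming a psycopg2-style driver with '%s' placeholders (database_io's
# actual driver is not shown here); the table name still comes from trusted
# configuration since it cannot be bound as a parameter:
def count_crashes_sketch(cur, crash_table, uid, date_from, date_to):
    query = ('SELECT COUNT(*) FROM ' + crash_table +
             ' WHERE uid = %s AND date >= %s AND date < %s')
    cur.execute(query, (str(uid), date_from, date_to))
    return cur.fetchone()[0]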
def main():
    filename = 'exp.csv'
    if os.path.isfile(filename):
        os.remove(filename)

    with open(filename, 'a', newline='\n') as file:
        writer = csv.writer(file)
        writer.writerow([
            'initial_threshold', 'uid', 'len(alltraj)',
            'M1 len(traj_list)', 'M1 user_temporal_thresholds', 'M1 avg_nbr_points',
            'M1 time_precision', 'M1 dist_coverage', 'M1 mobility_f1',
            'M2 len(traj_list)', 'M2 user_temporal_thresholds', 'M2 avg_nbr_points',
            'M2 time_precision', 'M2 dist_coverage', 'M2 mobility_f1',
            'len(traj_list_random)', 'avg_nbr_points_random', 'time_precision_random',
            'dist_coverage_random', 'mobility_random_f1',
            'len(traj_list_random4)', 'avg_nbr_points_random4', 'time_precision_random4',
            'dist_coverage_random4', 'mobility_random4_f1'
        ])

    input_table = 'tak.italy_traj'

    # users_list = database_io.extract_users_list('tak.italy_traj', cur)
    users_list = [
        '100006', '100022', '100026', '10008', '100086',
        '100087', '100088', '100090', '100100', '100117'
    ]

    con = database_io.get_connection()
    cur = con.cursor()

    eval_adaptive = list()
    eval_fix1200 = list()
    eval_random = list()
    eval_random2 = list()

    thresholds = [60, 120, 180, 240]
    for t in thresholds:
        for uid in users_list:
            print(uid, input_table)
            imh = database_io.load_individual_mobility_history(cur, uid, input_table)
            trajectories = imh['trajectories']
            alltraj = merge_trajectories(trajectories)

            # method 1: adaptive segmentation using the moving median
            traj_list1, user_temporal_thr1 = segment_trajectories_user_adaptive(
                alltraj, uid, temporal_thr=t, spatial_thr=50, max_speed=0.07,
                gap=60, max_lim=3600 * 48, window=15, smooth_fun=moving_median,
                min_size=10, return_cut=True)
            avg_nbr_points1 = np.mean([len(tr) for tr in traj_list1])
            print('user_temporal_thr', user_temporal_thr1)
            print('NT %d - ANP %.2f' % (len(traj_list1), avg_nbr_points1))
            time_precision1, dist_coverage1, mobility1_f1 = evalaute_segmentation(
                alltraj, traj_list1, print_report=True)
            eval_adaptive.append((time_precision1, dist_coverage1, mobility1_f1))

            # method 2: fixed temporal threshold (1200 s)
            traj_list2 = segment_trajectories(alltraj, uid, temporal_thr=1200,
                                              spatial_thr=50, max_speed=0.07)
            avg_nbr_points2 = np.mean([len(tr) for tr in traj_list2])
            user_temporal_thr2 = 1200
            print('NT %d - ANP %.2f' % (len(traj_list2), avg_nbr_points2))
            time_precision2, dist_coverage2, mobility2_f1 = evalaute_segmentation(
                alltraj, traj_list2, print_report=True)
            eval_fix1200.append((time_precision2, dist_coverage2, mobility2_f1))

            # method 3: random segmentation
            traj_list_random = segment_trajectories_random(alltraj, uid, nbr_traj=2000)
            avg_nbr_points_random = np.mean([len(tr) for tr in traj_list_random])
            print('NT %d - ANP %.2f' % (len(traj_list_random), avg_nbr_points_random))
            time_precision_random, dist_coverage_random, mobility_random_f1 = \
                evalaute_segmentation(alltraj, traj_list_random, print_report=True)
            eval_random.append((time_precision_random, dist_coverage_random,
                                mobility_random_f1))

            # method 4: random segmentation with nbr_traj_max taken from method 2
            traj_list_random4 = segment_trajectories_random(
                alltraj, uid, nbr_traj_min=2, nbr_traj_max=len(traj_list2))
            avg_nbr_points_random4 = np.mean([len(tr) for tr in traj_list_random4])
            print('NT %d - ANP %.2f' % (len(traj_list_random4), avg_nbr_points_random4))
            time_precision_random4, dist_coverage_random4, mobility_random4_f1 = \
                evalaute_segmentation(alltraj, traj_list_random4, print_report=True)
            eval_random2.append((time_precision_random4, dist_coverage_random4,
                                 mobility_random4_f1))

            # append this run's results to the experiment file
            with open(filename, 'a', newline='\n') as file:
                writer = csv.writer(file)
                writer.writerow([
                    t, uid, len(alltraj),
                    len(traj_list1), user_temporal_thr1, avg_nbr_points1,
                    time_precision1, dist_coverage1, mobility1_f1,
                    len(traj_list2), user_temporal_thr2, avg_nbr_points2,
                    time_precision2, dist_coverage2, mobility2_f1,
                    len(traj_list_random), avg_nbr_points_random,
                    time_precision_random, dist_coverage_random, mobility_random_f1,
                    len(traj_list_random4), avg_nbr_points_random4,
                    time_precision_random4, dist_coverage_random4, mobility_random4_f1
                ])
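
# ---------------------------------------------------------------------------
# Usage sketch: summarizing the experiment CSV written above, grouping the
# three segmentation quality metrics of method 1 by initial threshold. Column
# names match the header written in main(); pandas is assumed available (it
# is already used throughout this repo).
def summarize_exp_sketch(filename='exp.csv'):
    import pandas as pd
    df = pd.read_csv(filename)
    cols = ['M1 time_precision', 'M1 dist_coverage', 'M1 mobility_f1']
    return df.groupby('initial_threshold')[cols].agg(['mean', 'median', 'std'])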
def main():
    area = sys.argv[1]
    country = 'uk' if area == 'london' else 'italy'
    overwrite = True
    depth = 16
    store_every = 100

    path = './'
    path_dataset = path + 'dataset/'
    path_quadtree = path + 'quadtree/'

    traj_table = 'tak.%s_traj' % country
    evnt_table = 'tak.%s_evnt' % country
    crash_table = 'tak.%s_crash' % country

    users_filename = path_dataset + '%s_all_users_list.csv' % area
    quadtree_output_filename = path_quadtree + '%s_quadtree_features.json.gz' % area

    quadtrees_features = dict()
    datetime_from = datetime.datetime.strptime('2017-01-01 00:00:00', '%Y-%m-%d %H:%M:%S')
    datetime_to = datetime.datetime.strptime('2018-01-01 00:00:00', '%Y-%m-%d %H:%M:%S')
    months = pd.date_range(start=datetime_from, end=datetime_to, freq='MS')
    boundaries = [[lm, um] for lm, um in zip(months[:-1], months[1:])]

    index = 0
    data_map = dict()
    for bounds in boundaries:
        data_map[tuple(bounds)] = index
        quadtrees_features[index] = dict()
        index += 1

    users_list = sorted(pd.read_csv(users_filename).values[:, 0].tolist())

    # resume from the last processed user if the output file already exists
    last_processed_user = None
    if os.path.isfile(quadtree_output_filename) and not overwrite:
        fout = gzip.GzipFile(quadtree_output_filename, 'r')
        quadtrees_features_str = json.loads(fout.readline())
        quadtrees_features = {int(k): v for k, v in quadtrees_features_str.items()}
        last_processed_user = json.loads(fout.readline())
        fout.close()

    con = database_io.get_connection()
    cur = con.cursor()

    for i, uid in enumerate(users_list):
        if last_processed_user is not None and uid <= last_processed_user:
            continue
        if i % store_every == 0:
            print(datetime.datetime.now(), '%s %s %.2f' % (
                traj_table, area, i / len(users_list) * 100.0))

        trajectories = database_io.load_individual_mobility_history(
            cur, uid, traj_table, min_length=1.0, min_duration=60.0)['trajectories']
        events = database_io.load_individual_event_history(cur, uid, evnt_table)

        quadtree_data = dict()

        # partitioning trajectories by month
        for tid, traj in trajectories.items():
            for lu, index in data_map.items():
                if lu[0] <= traj.start_time() < lu[1]:
                    if index not in quadtree_data:
                        quadtree_data[index] = {
                            'uid': uid, 'crash': None,
                            'trajectories': dict(), 'events': dict(),
                        }
                    quadtree_data[index]['trajectories'][tid] = traj

        # partitioning events by month
        for eid, evnt in events.items():
            for lu, index in data_map.items():
                if lu[0] <= evnt[0]['date'] < lu[1] and index in quadtree_data:
                    quadtree_data[index]['events'][eid] = evnt[0]

        # has the user crashed this month?
        for lu, index in data_map.items():
            if index not in quadtree_data:
                continue
            query = """SELECT lat, lon FROM %s WHERE uid = '%s'
                AND date >= TO_TIMESTAMP('%s','YYYY-MM-DD HH24:MI:SS')
                AND date < TO_TIMESTAMP('%s','YYYY-MM-DD HH24:MI:SS')""" % (
                crash_table, uid, str(lu[0]), str(lu[1]))
            cur.execute(query)
            rows = cur.fetchall()
            if len(rows) > 0:
                quadtree_data[index]['crash'] = {'lat': float(rows[0][0]),
                                                 'lon': float(rows[0][1])}

        quadtrees_features = quadtrees_features_extract(
            quadtrees_features, quadtree_data, depth)

        # periodically checkpoint the features and the resume marker
        if i % store_every == 0:
            json_str_quadtree = '%s\n' % json.dumps(quadtrees_features)
            json_bytes_quadtree = json_str_quadtree.encode('utf-8')
            json_str_lpu = '%s\n' % json.dumps(last_processed_user)
            json_bytes_lpu = json_str_lpu.encode('utf-8')
            with gzip.GzipFile(quadtree_output_filename, 'w') as fout:
                fout.write(json_bytes_quadtree)
                fout.write(json_bytes_lpu)

        last_processed_user = uid
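
# ---------------------------------------------------------------------------
# Usage sketch: the quadtree feature file holds two JSON lines -- the
# per-month feature dict (keys re-cast to int) and the last processed user id
# -- mirroring the resume logic in main() above.
def load_quadtree_features_sketch(quadtree_output_filename):
    import gzip
    import json
    with gzip.GzipFile(quadtree_output_filename, 'r') as fin:
        features = {int(k): v for k, v in json.loads(fin.readline()).items()}
        last_processed_user = json.loads(fin.readline())
    return features, last_processed_user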
def imn_extract(filename, path, type_user, traj_table, evnt_table, min_traj_nbr,
                min_length, min_duration, area, overwrite=False,
                users_filename_crash=None):
    output_filename = path + '%s_imn_%s.json.gz' % (area, type_user)
    con = database_io.get_connection()
    cur = con.cursor()

    users_list = pd.read_csv(filename).values[:, 0].tolist()
    users_list = sorted(users_list)

    if users_filename_crash is not None:
        users_list_crash = pd.read_csv(users_filename_crash).values[:, 0].tolist()
        users_list_crash = sorted(users_list_crash)
        users_list = [uid for uid in users_list if uid not in users_list_crash]

    nbr_users = len(users_list)
    print(nbr_users, len(users_list))

    # skip users already present in the output file unless overwriting
    if os.path.isfile(output_filename) and not overwrite:
        processed_users = list()
        fout = gzip.GzipFile(output_filename, 'r')
        for row in fout:
            customer_obj = json.loads(row)
            processed_users.append(customer_obj['uid'])
        fout.close()
        users_list = [uid for uid in users_list if uid not in processed_users]
    print(nbr_users, len(users_list))

    for i, uid in enumerate(users_list):
        print(datetime.datetime.now(), '%s %s %s [%s/%s] - %.2f' % (
            traj_table, area, type_user, i, nbr_users, i / nbr_users * 100.0))

        imh = database_io.load_individual_mobility_history(
            cur, uid, traj_table, min_length, min_duration)
        events = database_io.load_individual_event_history(
            cur, uid, evnt_table) if evnt_table is not None else None

        if len(imh['trajectories']) < min_traj_nbr:
            continue

        # group trajectories (and their events) by the time window(s) they start in
        wimh_dict = dict()
        wevents_dict = dict()
        for tid, traj in imh['trajectories'].items():
            st = traj.start_time()
            stk_list = start_time_map(st)
            for stk in stk_list:
                if stk is None:
                    continue
                if stk not in wimh_dict:
                    wimh_dict[stk] = {'uid': uid, 'trajectories': dict()}
                    wevents_dict[stk] = dict()
                wimh_dict[stk]['trajectories'][tid] = traj
                if tid in events:
                    wevents_dict[stk][tid] = events[tid]

        customer_obj = {'uid': uid}
        for stk in wimh_dict:
            wimh = wimh_dict[stk]
            wevents = wevents_dict[stk]
            # require a proportional minimum number of trajectories per window
            if len(wimh['trajectories']) < min_traj_nbr // 12:
                continue
            imn = individual_mobility_network.build_imn(
                wimh, reg_loc=True, events=wevents, verbose=False)
            customer_obj[stk] = imn

        json_str = '%s\n' % json.dumps(clear_tuples4json(customer_obj),
                                       default=agenda_converter)
        json_bytes = json_str.encode('utf-8')
        with gzip.GzipFile(output_filename, 'a') as fout:
            fout.write(json_bytes)

    cur.close()
    con.close()
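
# ---------------------------------------------------------------------------
# Illustrative sketch (assumption): start_time_map is imported from elsewhere
# in the project. Its use above suggests it maps a trajectory start time to a
# list of window keys, with None for timestamps outside the period of
# interest; the bimonthly keys ('01-02', '03-04', ...) match those consumed
# by the train/test partitioner. A minimal version could be:
def start_time_map_sketch(start_time):
    if start_time.year != 2017:  # hypothetical: only 2017 is analysed
        return [None]
    m = start_time.month
    lo = m if m % 2 == 1 else m - 1         # first month of the two-month window
    return ['%02d-%02d' % (lo, lo + 1)]     # e.g. March -> '03-04'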
def main():
    input_table = 'tak.uk_traj'
    path = '/home/agnese/PycharmProjects/TrajectorySegmentation/Risultati/'
    filename = 'LONDON_traj_seg_exp2000.csv'

    header = ['input_table', 'uid', 'nbr_points', 'avg_sampling_rate',
              'std_sampling_rate', 'med_sampling_rate', 'method', 'nbr_traj',
              'avg_nbr_points', 'avg_length', 'avg_duration',
              'avg_sampling_rate_traj', 'std_sampling_rate_traj',
              'med_sampling_rate_traj', 'time_precision', 'dist_coverage',
              'mobility_f1', 'temporal_thr']

    # resume: skip users already present in the output file
    processed_users = list()
    if os.path.isfile(path + filename):
        df = pd.read_csv(path + filename)
        processed_users = list(df['uid'])
        fileout = open(path + filename, 'a')
    else:
        fileout = open(path + filename, 'w')
        fileout.write('%s\n' % (','.join(header)))
        fileout.flush()

    con = database_io.get_connection()
    cur = con.cursor()

    # users_list = database_io.extract_users_list('tak.uk_traj', cur)
    users_list = pd.read_csv(path + 'london_all_users_list.csv')
    print(users_list.head())
    users_list = users_list['uid'].tolist()
    print(len(users_list))

    count = 0
    nbr_exp = 2000
    for i, uid in enumerate(users_list):
        print(datetime.datetime.now(), uid, input_table, '[%s/%s]' % (i, len(users_list)))
        if uid in processed_users:
            count += 1
            if count >= nbr_exp:
                break
            continue
        try:
            results = run(cur, uid, input_table)
            for j, res in enumerate(results):
                fileout.write('%s\n' % (','.join([str(r) for r in res])))
            fileout.flush()
        except Exception:
            print(datetime.datetime.now(), uid, input_table, 'Error')
            continue
        count += 1
        if count >= nbr_exp:
            break

    fileout.flush()
    cur.close()
    con.close()
    fileout.close()
def main():
    input_table = 'tak.italy_traj'

    # users_list = database_io.extract_users_list('tak.italy_traj', cur)
    users_list = [
        '100006', '100022', '100026', '10008', '100086',
        '100087', '100088', '100090', '100100', '100117'
    ]

    con = database_io.get_connection()
    cur = con.cursor()

    eval_adaptive = list()
    eval_fix1200 = list()
    eval_random = list()

    for uid in users_list:
        print(uid, input_table)
        imh = database_io.load_individual_mobility_history(cur, uid, input_table)
        trajectories = imh['trajectories']
        alltraj = merge_trajectories(trajectories)

        # adaptive segmentation with a per-user temporal threshold
        traj_list, user_temporal_thr = segment_trajectories_user_adaptive(
            alltraj, uid, temporal_thr=60, spatial_thr=50, max_speed=0.07,
            gap=60, max_lim=3600 * 48, window=15, smooth_fun=moving_median,
            min_size=10, return_cut=True)
        avg_nbr_points = np.mean([len(t) for t in traj_list])
        print('user_temporal_thr', user_temporal_thr)
        print('NT %d - ANP %.2f' % (len(traj_list), avg_nbr_points))
        time_precision, dist_coverage, mobility_f1 = evalaute_segmentation(
            alltraj, traj_list, print_report=True)
        eval_adaptive.append((time_precision, dist_coverage, mobility_f1))
        print('------')

        # fixed temporal threshold of 1200 s
        traj_list = segment_trajectories(alltraj, uid, temporal_thr=1200,
                                         spatial_thr=50, max_speed=0.07)
        avg_nbr_points = np.mean([len(t) for t in traj_list])
        print('NT %d - ANP %.2f' % (len(traj_list), avg_nbr_points))
        time_precision, dist_coverage, mobility_f1 = evalaute_segmentation(
            alltraj, traj_list, print_report=True)
        eval_fix1200.append((time_precision, dist_coverage, mobility_f1))
        print('------')

        # fixed temporal threshold of 120 s (printed only, not aggregated)
        traj_list = segment_trajectories(alltraj, uid, temporal_thr=120,
                                         spatial_thr=50, max_speed=0.07)
        avg_nbr_points = np.mean([len(t) for t in traj_list])
        print('NT %d - ANP %.2f' % (len(traj_list), avg_nbr_points))
        time_precision, dist_coverage, mobility_f1 = evalaute_segmentation(
            alltraj, traj_list, print_report=True)
        print('------')

        # random segmentation baseline
        traj_list = segment_trajectories_random(alltraj, uid, nbr_traj=2000)
        avg_nbr_points = np.mean([len(t) for t in traj_list])
        print('NT %d - ANP %.2f' % (len(traj_list), avg_nbr_points))
        time_precision, dist_coverage, mobility_f1 = evalaute_segmentation(
            alltraj, traj_list, print_report=True)
        eval_random.append((time_precision, dist_coverage, mobility_f1))

    cur.close()
    con.close()

    print('')
    print('ADP (median) - TP: %.3f - DC: %.3f - F1: %.3f' % (
        np.median([v[0] for v in eval_adaptive]),
        np.median([v[1] for v in eval_adaptive]),
        np.median([v[2] for v in eval_adaptive])))
    print('FIX (median) - TP: %.3f - DC: %.3f - F1: %.3f' % (
        np.median([v[0] for v in eval_fix1200]),
        np.median([v[1] for v in eval_fix1200]),
        np.median([v[2] for v in eval_fix1200])))
    print('ADP (mean)   - TP: %.3f - DC: %.3f - F1: %.3f' % (
        np.mean([v[0] for v in eval_adaptive]),
        np.mean([v[1] for v in eval_adaptive]),
        np.mean([v[2] for v in eval_adaptive])))
    print('FIX (mean)   - TP: %.3f - DC: %.3f - F1: %.3f' % (
        np.mean([v[0] for v in eval_fix1200]),
        np.mean([v[1] for v in eval_fix1200]),
        np.mean([v[2] for v in eval_fix1200])))
    print('ADP (std)    - TP: %.3f - DC: %.3f - F1: %.3f' % (
        np.std([v[0] for v in eval_adaptive]),
        np.std([v[1] for v in eval_adaptive]),
        np.std([v[2] for v in eval_adaptive])))
    print('FIX (std)    - TP: %.3f - DC: %.3f - F1: %.3f' % (
        np.std([v[0] for v in eval_fix1200]),
        np.std([v[1] for v in eval_fix1200]),
        np.std([v[2] for v in eval_fix1200])))
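
# ---------------------------------------------------------------------------
# Illustrative sketch (assumption): merge_trajectories is imported from the
# segmentation module; its role above is to rebuild the user's raw point
# stream from the stored trajectories before re-segmenting it. A minimal
# version, assuming a hypothetical `.object` attribute holding each
# trajectory's raw (lon, lat, timestamp) points:
def merge_trajectories_sketch(trajectories):
    allpoints = []
    for traj in trajectories.values():
        allpoints.extend(traj.object)       # hypothetical raw-points attribute
    allpoints.sort(key=lambda p: p[2])      # restore chronological order
    return allpoints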