def smooth_is_vehicle(df_trip, vehicle_or_not, non_vehi_seg_min_dura=1 * 60, vehi_seg_min_dura=2.5 * 60):
    """Smooth the per-point vehicle (1) / non-vehicle (0) labels of one trip.

    Two passes are made over the label runs returned by ``chunks``:

    1. an interior run (neither first nor last) labelled 0 whose summed
       ``TIME_DELTA`` is below ``non_vehi_seg_min_dura`` is relabelled 1;
    2. after that merge, a run labelled 1 whose summed ``TIME_DELTA`` is
       below ``vehi_seg_min_dura`` is relabelled 0.

    NOTE: ``vehicle_or_not`` is modified in place; the returned object is the
    very same sequence that was passed in.

    :param df_trip: data frame of the trip, must provide a 'TIME_DELTA' column
    :param vehicle_or_not: mutable sequence of 0/1 labels, one per row
    :param non_vehi_seg_min_dura: minimum duration of a kept non-vehicle run
    :param vehi_seg_min_dura: minimum duration of a kept vehicle run
    :return: the smoothed label sequence (same object as ``vehicle_or_not``)
    """
    time_deltas = df_trip['TIME_DELTA']
    # deliberately an alias, not a copy: the second pass reads the labels
    # written by the first pass through the original name
    smoothed = vehicle_or_not

    # pass 1: fill short non-vehicle gaps that lie between two other runs
    initial_runs = list(chunks(vehicle_or_not))
    last_pos = len(initial_runs) - 1
    for pos, run in enumerate(initial_runs):
        if pos == 0 or pos == last_pos:
            continue
        begin, end = run[0], run[1]
        if vehicle_or_not[begin] != 0:
            continue
        if sum(time_deltas[begin:end]) < non_vehi_seg_min_dura:
            smoothed[begin:end] = [1] * (end - begin)

    # pass 2: drop vehicle runs that are still too short after merging
    for run in list(chunks(smoothed)):
        begin, end = run[0], run[1]
        if vehicle_or_not[begin] != 1:
            continue
        if sum(time_deltas[begin:end]) < vehi_seg_min_dura:
            smoothed[begin:end] = [0] * (end - begin)

    return smoothed
def get_affiliation_data(self):
    """Load the affiliation ids and their related article/author ids from Redis.

    Reads the JSON list stored under ``AFFILIATION_ALL_ID`` into
    ``self.affiliation_ids`` and then, in batches of 500 ids, fetches the
    related-article and related-author JSON values with ``mget``. A missing
    value (``None``) is stored as an empty list. Results are merged into
    ``self.related_article_dict`` and ``self.related_author_dict``.

    Any exception is caught and its traceback is printed; the method always
    returns ``None``, and batches merged before the failure are kept.
    """
    try:
        self.affiliation_ids = json.loads(RedisTemplate.get(AFFILIATION_ALL_ID))
        for ids in chunks(self.affiliation_ids, 500):
            article_keys = [
                AFFILIATION_RELATED_ARTICLE_ID_KEY_TEMPLATE.format(i) for i in ids
            ]
            article_values = RedisTemplate.mget(keys=article_keys)
            author_keys = [
                AFFILIATION_RELATED_AUTHOR_ID_KEY_TEMPLATE.format(i) for i in ids
            ]
            author_values = RedisTemplate.mget(keys=author_keys)

            article_id_dict = {}
            author_id_dict = {}
            for _id, article_value, author_value in zip(ids, article_values, author_values):
                article_id_dict[_id] = (
                    [] if article_value is None else json.loads(article_value))
                author_id_dict[_id] = (
                    [] if author_value is None else json.loads(author_value))

            self.related_article_dict.update(article_id_dict)
            self.related_author_dict.update(author_id_dict)
    except Exception:
        # format_exc() only builds a string; it was being discarded, which hid
        # every failure. print_exc() actually reports the error.
        traceback.print_exc()
    return
def smooth_vehicle_type(df_trip, original_result_label, vehi_seg_min_dura=1 * 60):
    """Absorb short label runs that are sandwiched between two equal runs.

    The label sequence is split into runs with ``chunks(..., True)``; each
    run is indexed as (start, end, label). An interior run whose label is not
    5 and whose two neighbours (as computed before any relabelling) carry the
    same label other than 5 is overwritten with the neighbours' label when
    its summed ``TIME_DELTA`` is below ``vehi_seg_min_dura``.

    NOTE: ``original_result_label`` is modified in place and returned.

    :param df_trip: data frame of the trip, must provide a 'TIME_DELTA' column
    :param original_result_label: mutable sequence of per-point labels
    :param vehi_seg_min_dura: minimum duration of a run that is kept as is
    :return: the smoothed labels (same object as ``original_result_label``)
    """
    runs = list(chunks(original_result_label, True))
    time_deltas = df_trip['TIME_DELTA']
    smoothed = original_result_label  # alias: edits happen in place
    last_pos = len(runs) - 1

    for pos, run in enumerate(runs):
        # the first/last run has only one neighbour; label 5 is never replaced
        if pos == 0 or pos == last_pos or run[2] == 5:
            continue
        neighbour_label = runs[pos - 1][2]
        if neighbour_label != runs[pos + 1][2]:
            continue
        if neighbour_label == 5:
            continue
        begin, end = run[0], run[1]
        if sum(time_deltas[begin:end]) < vehi_seg_min_dura:
            smoothed[begin:end] = [neighbour_label] * (end - begin)

    return smoothed
def update_affiliation_keyword_job():
    """Recompute the keyword statistics of every affiliation and cache them in Redis.

    For each affiliation that has related articles, the keywords of those
    articles are counted in ``keyword_article`` (most frequent first),
    converted with ``parseKeyword`` and, when non-empty, stored as JSON under
    ``AFFILIATION_RELATED_KEYWORD_KEY_TEMPLATE``. Both the SQL phase and the
    Redis phase walk the affiliations in batches of 500; the Redis pipeline
    is executed once per batch followed by a one second pause.
    """
    loader = AffiliationLoader()
    loader.get_affiliation_data()
    related_article_list = sorted(loader.related_article_dict.items(),
                                  key=lambda pair: pair[0])

    sql = ('SELECT keyword_id,keyword_desc,COUNT(article_id)AS num '
           'FROM keyword_article WHERE article_id IN %s '
           'GROUP BY keyword_id,keyword_desc ORDER BY num DESC')

    related_keyword_dict = {}
    for batch in chunks(related_article_list, 500):
        batch_keywords = {}
        for affiliation_id, articles in batch:
            # the affiliation has no related articles
            if not articles or len(articles) == 0:
                continue
            Cursor.execute(sql, (articles, ))
            rows = list(Cursor.fetchall())
            batch_keywords[affiliation_id] = [parseKeyword(row) for row in rows]
        related_keyword_dict.update(batch_keywords)
    print("{} related_keyword_dict_len: {}".format(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
        len(related_keyword_dict)))

    pipeline = RedisTemplate.pipeline()
    for batch in chunks(related_article_list, 500):
        for entry in batch:
            affiliation_id = entry[0]
            keywords = related_keyword_dict.get(affiliation_id)
            if not keywords:
                continue
            pipeline.set(
                AFFILIATION_RELATED_KEYWORD_KEY_TEMPLATE.format(affiliation_id),
                json.dumps(keywords))
        pipeline.execute()
        time.sleep(1)
    print("{} update_affiliation_keyword_job".format(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
def update_author_collaboration_job():
    """Compute and store the collaboration distance between each author and their co-authors.

    All author ids are read from the ``author`` table and processed in
    batches of 500. For every author the co-authors are looked up in Neo4j
    (``searchCoAuthor``); for every (author, co-author) pair
    ``computeJaccrdDistance`` is evaluated and a row
    ``(start_id, end_id, distance rounded to 2 places, JSON of the second
    result element)`` is queued. The rows of one batch are inserted with a
    single ``executemany`` and committed; on failure the error is printed and
    the transaction is rolled back. The job sleeps one second between batches.
    """
    Cursor.execute('SELECT id FROM author')
    author_list = [row[0] for row in Cursor.fetchall()]
    update_sql = '''
        INSERT INTO author_collaboration(start_id,end_id,distance,predict_collaboration)
        VALUES (%s,%s,%s,%s)
    '''
    for authors in chunks(author_list, 500):
        start_time = time.time()
        # Start from an empty batch every time: the list used to be created
        # once outside the loop, so each chunk re-inserted all earlier rows.
        author_collaboration_list = []
        for author in authors:
            with Neo4jDriver.session() as session:
                res = session.read_transaction(searchCoAuthor, author)
                # consumed while the session is still open, so this also
                # works if the result is evaluated lazily
                for coAuthor in res:
                    jaccrdDistance = computeJaccrdDistance(
                        author, coAuthor["authorId"])
                    row = (author, coAuthor["authorId"],
                           round(jaccrdDistance[0], 2),
                           json.dumps(jaccrdDistance[1]))
                    print(row)
                    author_collaboration_list.append(row)
        if author_collaboration_list:
            try:
                Cursor.executemany(update_sql, author_collaboration_list)
                Connection.commit()
            except Exception as e:
                print(e)
                Connection.rollback()
        duration = time.time() - start_time
        print('update_author_collaboration_job 500 runtime is:{0:.3f}s'.format(
            duration))
        time.sleep(1)
def update_affiliation_new_article_job():
    """Cache the id of the most recent article of every affiliation in Redis.

    For each affiliation with related articles the newest article (by
    ``date``) is selected from the ``article`` table. Affiliations without
    related articles, or whose article ids match no row, are skipped. The
    resulting ids are written as JSON under
    ``AFFILIATION_RELATED_NEW_ARTICLE_ID_KEY_TEMPLATE``; the Redis pipeline is
    executed once per batch of 500 affiliations followed by a one second
    pause.
    """
    affiliation = AffiliationLoader()
    affiliation.get_affiliation_data()
    related_article_list = sorted(affiliation.related_article_dict.items(),
                                  key=lambda x: x[0])
    related_new_article_dict = {}
    sql = '''SELECT id FROM article WHERE id IN %s ORDER BY date DESC LIMIT 1'''

    for affiliations_articles in chunks(related_article_list, 500):
        related_dict = {}
        for affiliation_id, articles in affiliations_articles:
            # the affiliation has no related articles
            if not articles:
                continue
            Cursor.execute(sql, (articles,))
            # fetchone() returns None when nothing matched; test it BEFORE
            # touching the row (list(None) used to raise TypeError here)
            row = Cursor.fetchone()
            if row is None:
                continue
            related_dict[affiliation_id] = row[0]
        related_new_article_dict.update(related_dict)
    print("{} related_new_article_dict_len: {}".format(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
        len(related_new_article_dict)))

    pipeline = RedisTemplate.pipeline()
    for articles in chunks(related_article_list, 500):
        for article in articles:
            article_key = AFFILIATION_RELATED_NEW_ARTICLE_ID_KEY_TEMPLATE.format(
                article[0])
            new_article_id = related_new_article_dict.get(article[0])
            if new_article_id:
                pipeline.set(article_key, json.dumps(new_article_id))
        pipeline.execute()
        time.sleep(1)
    print("{} update_affiliation_new_article_job finished".format(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
def _load_related_article_dict(self):
    """Fill ``self.related_article_dict`` with the article ids of every affiliation.

    ``self.affiliation_ids`` is processed in batches of 500. For each batch
    the ``affiliation_article`` table is queried for the comma-joined article
    ids per affiliation; the joined string is split into a list, and a
    missing or empty value becomes an empty list. Affiliations that have no
    row in the result are left untouched. The method sleeps one second after
    each batch and finally prints the size of the dictionary.
    """
    sql = '''
        SELECT affiliation_id, group_concat(article_id) as article_ids
        FROM affiliation_article
        WHERE affiliation_id IN %s
        GROUP BY affiliation_id
    '''
    for ids in chunks(self.affiliation_ids, 500):
        Cursor.execute(sql, (ids, ))
        related_dict = {}
        for info in Cursor.fetchall():
            if not info:
                continue
            article_ids = info[1]
            # The old check tested len(info) (the row) instead of the joined
            # string, so an empty string was turned into [''].
            related_dict[info[0]] = article_ids.split(',') if article_ids else []
        self.related_article_dict.update(related_dict)
        time.sleep(1)
    print("{} related_article_dict_len: {}".format(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
        len(self.related_article_dict)))
def _save_to_redis(self):
    """Write the affiliation ids and their related article/author ids to Redis.

    The full id list is stored as JSON under ``AFFILIATION_ALL_ID`` when it
    is not empty. The related article and author id lists are then queued on
    a pipeline, one key per affiliation, skipping empty or missing lists. The
    pipeline is executed after each batch of 500 affiliations, followed by a
    one second pause.
    """
    if len(self.affiliation_ids) > 0:
        RedisTemplate.set(AFFILIATION_ALL_ID,
                          json.dumps(self.affiliation_ids))

    pipeline = RedisTemplate.pipeline()
    for batch in chunks(self.affiliation_ids, 500):
        for affiliation_id in batch:
            article_ids = self.related_article_dict.get(affiliation_id)
            if article_ids:
                pipeline.set(
                    AFFILIATION_RELATED_ARTICLE_ID_KEY_TEMPLATE.format(affiliation_id),
                    json.dumps(article_ids))
            author_ids = self.related_author_dict.get(affiliation_id)
            if author_ids:
                pipeline.set(
                    AFFILIATION_RELATED_AUTHOR_ID_KEY_TEMPLATE.format(affiliation_id),
                    json.dumps(author_ids))
        pipeline.execute()
        time.sleep(1)

    timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    print("{} save_to_redis finished".format(timestamp))
def _collapse_point_labels(point_labels):
    """Concatenate the value of each run of equal consecutive labels, skipping ``None`` runs."""
    return ''.join(
        str(run[2])
        for run in chunks(point_labels, include_values=True)
        if run[2] is not None)


def _fetch_day_point_labels(cur, tablename_extra, label_column, nid, date_str):
    """Return ``(triplabels, labels)`` arrays for one device and day, or ``None`` if there are no rows.

    ``label_column`` and ``tablename_extra`` are identifiers and therefore
    have to be concatenated into the statement; the values are bound as
    query parameters.
    """
    next_day = (datetime.strptime(date_str, "%Y%m%d") +
                timedelta(days=1)).strftime("%Y%m%d")
    query = ("SELECT triplabel," + label_column + " from " + tablename_extra +
             " WHERE nid=%s AND (sgt>=%s AND sgt<%s) ORDER BY sgt")
    cur.execute(query, (nid, date_str + " 00:00:00", next_day + " 00:00:00"))
    rows = cur.fetchall()
    if len(rows) == 0:
        return None
    columns = list(zip(*rows))
    return np.array(columns[0]), np.array(columns[1])


def save_tripsummary_PSQL_2016(conn, cur, tablename_trip, tablename_extra,
                               trips_dict):
    """
    Save the trip summaries of one device and day into the DB.

    Existing rows of the same nid/date are replaced. If the day already
    carried manual labels, or trusted automatic labels, the trip-level mode
    strings are recreated for the new trips from the point-level labels
    stored in tablename_extra.

    Input:
        conn: DB connection
        cur: DB cursor
        tablename_trip: DB table to save the trip dictionary data
        tablename_extra: DB table holding the point-level labels
            (columns triplabel, gt_mode_manual, gt_mode_google, sgt, nid)
        trips_dict: dictionary of trip summaries
    Return True if successful and False otherwise.
    """
    trip_num = trips_dict['trip_num']
    num_trips = len(trip_num)
    nids = [trips_dict['nid']] * num_trips
    dates = [trips_dict['analyzed_date']] * num_trips
    tot_num_trips = [trips_dict['tot_num_trips']] * num_trips
    if not trips_dict['home_loc'] == [None, None]:
        home_loc = [trips_dict['home_loc']] * num_trips
    else:
        home_loc = [[]] * num_trips
    if not trips_dict['school_loc'] == [None, None]:
        school_loc = [trips_dict['school_loc']] * num_trips
    else:
        school_loc = [[]] * num_trips

    manual_label_modes = []
    manual_label_strs = []
    manual_label_finishs = []
    users_ids = []
    time_modified_list = []
    # default: automatic labelling has not been done yet
    google_label_modes = [None] * num_trips
    google_label_finishs = ['f'] * num_trips
    google_failed_reasons = [None] * num_trips

    # check existence by id/nid first
    existence, manually_labeled, app_labeled, google_labeled_failed, google_labeled_trusted = checkNidDateExistence(
        cur, tablename_trip, nids[0], dates[0])
    if existence is None:
        logging.error("Existence checking failed!")
        return False

    try:
        if manually_labeled:
            # need to recreate the manually labeled trip-level modes for the new trips
            logging.warning(
                "There are manual labels existing. Start recreating.")
            day_labels = _fetch_day_point_labels(
                cur, tablename_extra, "gt_mode_manual", nids[0], dates[0])
            if day_labels is not None:
                triplabels, gt_mode_manual_list = day_labels
                # go through each trip to get the trip-level modes
                for item in trip_num:
                    gt_mode_manual_cur_trip = gt_mode_manual_list[
                        triplabels == item].tolist()
                    manual_label_modes.append(
                        _collapse_point_labels(gt_mode_manual_cur_trip))
                    manual_label_finishs.append('t')
                    manual_label_strs.append('recreated')
                    users_ids.append(1)
                    time_modified_list.append(
                        datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
            else:
                logging.error("Failed to get the pt-level manual labels")

        if google_labeled_trusted:
            # need to recreate the automatically labeled trip-level modes for
            # the new trips, if the labels are trusted
            logging.warning(
                "There are trusted automatic labels existing. Start recreating."
            )
            day_labels = _fetch_day_point_labels(
                cur, tablename_extra, "gt_mode_google", nids[0], dates[0])
            if day_labels is not None:
                triplabels, gt_mode_google_list = day_labels
                google_label_modes = []
                google_label_finishs = []
                google_failed_reasons = []
                for item in trip_num:
                    gt_mode_google_cur_trip = gt_mode_google_list[
                        triplabels == item].tolist()
                    # check the percentage of the labeled samples
                    none_mode_cnt = sum(
                        1 for mode_item in gt_mode_google_cur_trip
                        if mode_item is None)
                    if none_mode_cnt > 0.3 * len(gt_mode_google_cur_trip):
                        # too many samples don't have automatic labels
                        google_label_modes.append(None)
                        google_label_finishs.append('t')
                        google_failed_reasons.append(
                            'Too few labels while recreating')
                    else:
                        google_label_modes.append(
                            _collapse_point_labels(gt_mode_google_cur_trip))
                        google_label_finishs.append('t')
                        google_failed_reasons.append(None)
            else:
                # Keep the "not labelled yet" defaults. Leaving the lists
                # empty made zip() below produce no rows at all, i.e. the
                # day's trips were silently dropped.
                logging.error("Failed to get the pt-level automatic labels")
        elif google_labeled_failed:
            logging.warning(
                "There are failed automatic labels existing. Save as failed auto-labeling."
            )
            google_label_finishs = ['t'] * num_trips
            google_failed_reasons = ['Failed before recreating'] * num_trips

        app_labeled_list = [app_labeled] * num_trips

        if existence:
            # delete the old data if already exists because the new one might
            # have diff number of trips; committed together with the insert so
            # that a failing insert does not lose the old rows
            logging.warning(
                "Trip exists! DB Connected to delete existing trip summaries")
            deleteQuery = ("DELETE FROM " + tablename_trip +
                           " WHERE nid=%s and analyzed_date=%s")
            cur.execute(deleteQuery, (nids[0], dates[0]))

        # (column name, per-trip values) in insertion order
        column_values = [
            ('nid', nids),
            ('analyzed_date', dates),
            ('tot_num_trips', tot_num_trips),
            ('trip_num', trip_num),
            ('home_loc', home_loc),
            ('school_loc', school_loc),
            ('start_poi_loc', trips_dict['start_poi_loc']),
            ('end_poi_loc', trips_dict['end_poi_loc']),
            ('tot_dist_km', trips_dict['tot_dist(km)']),
            ('tot_dura_s', trips_dict['tot_dura(s)']),
            ('start_sgt', trips_dict['start_sgt']),
            ('end_sgt', trips_dict['end_sgt']),
            ('valid_loc_perc', trips_dict['valid_loc_perc']),
            ('num_pt', trips_dict['num_pt']),
        ]
        if manual_label_modes:
            # insert the new data with existing manual labels
            logging.warning(
                "DB Connected to insert new trip summaries with existing manual labels"
            )
            column_values += [
                ('manual_label_finish', manual_label_finishs),
                ('manual_label_str', manual_label_strs),
                ('manual_label_mode', manual_label_modes),
                ('users_id', users_ids),
                ('time_modified', time_modified_list),
            ]
        else:
            # insert the new data w/o any existing manual labels
            logging.warning(
                "DB Connected to insert new trip summaries w/o any existing manual labels"
            )
        column_values += [
            ('app_label_finish', app_labeled_list),
            ('google_label_mode', google_label_modes),
            ('google_label_finish', google_label_finishs),
            ('google_failed_reason', google_failed_reasons),
        ]

        column_names = [name for name, _ in column_values]
        insertQuery = ("INSERT INTO " + tablename_trip + " (" +
                       ",".join(column_names) + ") VALUES (" +
                       ",".join(["%s"] * len(column_names)) + ")")
        zip2save = list(zip(*[values for _, values in column_values]))
        cur.executemany(insertQuery, zip2save)
        conn.commit()
        return True
    except psycopg2.DatabaseError as e:
        logging.error(e)
        # leave the connection usable and undo a pending delete
        try:
            conn.rollback()
        except psycopg2.Error as rollback_error:
            logging.error(rollback_error)
        return False
def predict(self, data, modes):
    """Decide for every CAR/BUS/TRAIN segment whether it follows a train route.

    ``modes`` is split into segments of equal consecutive mode. Each CAR, BUS
    or TRAIN segment with at least one valid longitude is relabelled TRAIN
    when ``predict_mode_by_location`` says so, or when both the point right
    before and the point right after the segment are near a train station;
    otherwise it is relabelled CAR.

    :param data: pandas data frame of measurements with 'WLATITUDE' and
        'WLONGITUDE' columns
    :param modes: array of current mode predictions, one per row of ``data``;
        it is modified in place
    :return: ``modes`` with the updated predictions
    """
    lat = data['WLATITUDE'].values
    lon = data['WLONGITUDE'].values

    def _station_flag(pt_idx):
        # 1: a station is nearby, 0: none nearby, -1: the point has no location
        if np.isnan(lat[pt_idx]):
            return -1
        nearest_station = find_nearest_station(lat[pt_idx], lon[pt_idx],
                                               self.train_location_tree,
                                               self.dist_thres_entry_exit)
        return 1 if len(nearest_station) != 0 else 0

    # each chunk is a tuple (start_idx, end_idx, mode)
    for start_idx, end_idx, seg_mode in chunks(modes, include_values=True):
        if seg_mode not in [MODE_CAR, MODE_BUS, MODE_TRAIN]:
            continue

        # test for distance first, using only the points with a location
        lat_seg = lat[start_idx:end_idx]
        lon_seg = lon[start_idx:end_idx]
        valid_lat_seg = lat_seg[~np.isnan(lat_seg)]
        valid_lon_seg = lon_seg[~np.isnan(lon_seg)]
        if len(valid_lon_seg) == 0:
            continue

        # TODO: parameters have to be tuned carefully
        is_train = predict_mode_by_location(valid_lat_seg,
                                            valid_lon_seg,
                                            self.train_location_tree,
                                            self.train_location_dict,
                                            self.train_route_dict,
                                            dist_thre=400,
                                            dist_pass_thres=7,
                                            num_stops_thre=3,
                                            dist_pass_thres_perc=0.7)

        # entry point = last point before the segment, exit point = first
        # point after it; -1 when the point does not exist or has no location
        entry_pt_near = _station_flag(start_idx - 1) if start_idx - 1 >= 0 else -1
        exit_pt_near = _station_flag(end_idx) if end_idx < len(modes) else -1

        if is_train or entry_pt_near + exit_pt_near == 2:
            modes[start_idx:end_idx] = MODE_TRAIN
        else:
            modes[start_idx:end_idx] = MODE_CAR

    return modes
def update_affiliation_database_job():
    """Recompute the per-affiliation statistics and upsert them into ``affiliation_info``.

    For every affiliation known to ``AffiliationLoader`` the citation and
    publication statistics of its related articles are aggregated in SQL.
    Affiliations without related articles get a row of zeros (years -1) with
    just their name. Affiliations whose name cannot be found, or whose
    article ids match no article, are skipped. The collected rows are
    upserted in batches of 500; a failing batch is printed and rolled back,
    and the job pauses one second between batches.
    """
    affiliation = AffiliationLoader()
    affiliation.get_affiliation_data()
    related_article_list = sorted(affiliation.related_article_dict.items(),
                                  key=lambda x: x[0])
    affiliation_info_list = []
    sql = '''
        SELECT aff.name,AVG(art.citation_count),SUM(art.citation_count),
               COUNT(art.id),MIN(YEAR(art.date)),MAX(YEAR(art.date)),
               COUNT(art.pdf_link),AVG(art.total_usage-art.citation_count)
        FROM article art,affiliation aff
        WHERE art.id IN %s AND aff.id = %s
    '''
    back_up_sql = '''
        SELECT aff.name FROM affiliation aff WHERE aff.id = %s
    '''
    update_sql = '''
        INSERT INTO affiliation_info
            (affiliation_id,affiliation_name,average_citation_per_article,
             citation_count,publication_count,start_year,end_year,
             available_download,average_download_per_article,
             create_time,update_time)
        VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
        ON DUPLICATE KEY UPDATE
            affiliation_name = VALUES (affiliation_name),
            average_citation_per_article = VALUES (average_citation_per_article),
            citation_count = VALUES (citation_count),
            publication_count = VALUES(publication_count),
            start_year = VALUES (start_year),
            end_year = VALUES (end_year),
            available_download = VALUES (available_download),
            average_download_per_article = VALUES (average_download_per_article),
            update_time = VALUES (update_time)
    '''
    two_places = Decimal('0.00')

    def _rounded(value):
        # SQL aggregates are NULL when every input is NULL; store those as 0.0
        if value is None:
            return 0.0
        return float(value.quantize(two_places))

    for affiliations_articles in chunks(related_article_list, 500):
        for affiliation_id, articles in affiliations_articles:
            update_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            if not articles:
                # no related articles: only the name is needed
                Cursor.execute(back_up_sql, (affiliation_id, ))
                # The row was never fetched before, so raw_result was either
                # undefined or left over from the previous affiliation.
                name_row = Cursor.fetchone()
                if name_row is None:
                    continue
                affiliation_info_list.append(
                    (affiliation_id, name_row[0], 0.0, 0, 0, -1, -1, 0, 0.0,
                     update_time, update_time))
                continue

            Cursor.execute(sql, (articles, affiliation_id, ))
            raw_result = Cursor.fetchone()
            # An aggregate query without GROUP BY still yields one row (all
            # NULL, count 0) when nothing matched, so check the count as well.
            if raw_result is None or not raw_result[3]:
                continue
            affiliation_info_list.append(
                (affiliation_id,
                 raw_result[0],                # affiliation_name
                 _rounded(raw_result[1]),      # average_citation_per_article
                 int(raw_result[2] or 0),      # citation_count
                 raw_result[3],                # publication_count
                 raw_result[4],                # start_year
                 raw_result[5],                # end_year
                 raw_result[6],                # available_download
                 _rounded(raw_result[7]),      # average_download_per_article
                 update_time, update_time))
    print("{} affiliation_info_list_len: {}".format(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
        len(affiliation_info_list)))

    for affiliation_infos in chunks(affiliation_info_list, 500):
        try:
            Cursor.executemany(update_sql, affiliation_infos)
            Connection.commit()
        except Exception as e:
            print(e)
            Connection.rollback()
        time.sleep(1)
    print("{} update_affiliation_database_job finished".format(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
def segFind(data_frame, daily_summary, mode_thresh, isAM, dist_lim, max_mode, max_walk):
    """Extract the mode segments of a trip and append them to ``daily_summary``.

    STOP_IN, STOP_OUT and WALK_OUT predictions are first folded into WALK_IN
    (on a working array taken from a single-column selection of the frame),
    then the resulting runs of equal mode are visited in order. A run is
    skipped when its duration is below ``mode_thresh`` or when ``max_mode``
    runs have already been counted. The distance of a walking run is limited
    with ``checkLim_round``; a run with zero or NaN distance is dropped
    (although it has already been counted). A run with the same mode as the
    previously stored one is merged into the last summary entry, otherwise
    mode, distance (km) and duration are appended to the am_* or pm_* lists.

    :param data_frame: trip data with CCMODE, TIME_DELTA, DISTANCE_DELTA,
        WLATITUDE and WLONGITUDE columns
    :param daily_summary: dict of lists that is updated in place
    :param mode_thresh: minimum duration of a run
    :param isAM: True to fill the am_* lists, False for the pm_* lists
    :param dist_lim: upper limit passed to ``checkLim_round`` for stored distances
    :param max_mode: maximum number of counted runs
    :param max_walk: limit for long walking runs, multiplied by 1000 before use
    :return: number of runs that passed the duration/count filter
    """
    # definition of mode code
    MODE_WALK_IN = 3
    MODE_WALK_OUT = 2
    MODE_STOP_IN = 1
    MODE_STOP_OUT = 0

    # below this duration the distance of a walking run uses all its points
    ALL_WALK_TIME = 15 * 60
    # a short walking run is limited to this multiple of the jump distance
    real_to_jump_dist = 2

    if isAM:
        mode_key, dist_key, dura_key = 'am_mode', 'am_distance', 'am_duration'
    else:
        mode_key, dist_key, dura_key = 'pm_mode', 'pm_distance', 'pm_duration'

    # working array of the predicted modes; everything that is not a vehicle
    # mode is treated as WALK_IN
    pred_modes = data_frame[['CCMODE']].values[:, 0]
    to_walk = ((pred_modes == MODE_STOP_IN) | (pred_modes == MODE_STOP_OUT) |
               (pred_modes == MODE_WALK_OUT))
    pred_modes[to_walk] = MODE_WALK_IN

    num_valid_mode_seg = 0
    prev_mode = 0
    for seg in list(chunks(pred_modes, True)):
        first, stop, seg_mode = seg[0], seg[1], seg[2]
        span = slice(first, stop)

        time_span = np.sum(data_frame['TIME_DELTA'].values[span])
        # too short, or enough runs have been counted already
        if time_span < mode_thresh or num_valid_mode_seg > max_mode - 1:
            continue

        latlon_start = [data_frame['WLATITUDE'].values[first],
                        data_frame['WLONGITUDE'].values[first]]
        latlon_end = [data_frame['WLATITUDE'].values[stop - 1],
                      data_frame['WLONGITUDE'].values[stop - 1]]
        jump_dist = great_circle_dist(latlon_start, latlon_end, 'meters')
        num_valid_mode_seg += 1

        # distance of this run
        if int(seg_mode) == MODE_WALK_IN:
            modes_cur_seg = data_frame['CCMODE'].values[span]
            dist_cur_seg = data_frame['DISTANCE_DELTA'].values[span]
            if time_span < ALL_WALK_TIME:
                # short run: every point (modes 0-3) counts as walking
                dist_seg = checkLim_round(np.nansum(dist_cur_seg),
                                          jump_dist * real_to_jump_dist)
            else:
                # longer run: only the points originally labelled 2 or 3
                walking = ((modes_cur_seg == MODE_WALK_IN) |
                           (modes_cur_seg == MODE_WALK_OUT))
                dist_seg = checkLim_round(np.nansum(dist_cur_seg[walking]),
                                          max_walk * 1000)
        else:
            dist_seg = np.nansum(data_frame['DISTANCE_DELTA'].values[span])

        # filter out zero or nan distances
        if dist_seg == 0 or np.isnan(dist_seg):
            continue

        if seg_mode == prev_mode:
            # same mode as the previous stored run: merge into the last entry
            stored_dists = daily_summary[dist_key]
            merged_dist = checkLim_round(
                (stored_dists[-1] * 1000 + dist_seg) / 1000, dist_lim)
            stored_dists[-1] = float(np.round(merged_dist, 4))
            stored_duras = daily_summary[dura_key]
            stored_duras[-1] = int(stored_duras[-1] + time_span)
            continue

        daily_summary[mode_key].append(int(seg_mode))
        daily_summary[dist_key].append(checkLim_round(dist_seg / 1000, dist_lim))
        daily_summary[dura_key].append(int(time_span))
        prev_mode = seg_mode

    return num_valid_mode_seg
def label_pts_by_pois(pois_latlon_comb,pois_label_temp,data_frame,home_cover_radius,sch_cover_radius,poi_cover_radius,poi_min_dwell_time):
    """
    label all points to detected POIs

    Input:
        pois_latlon_comb: combined pois, but not chronological
        pois_label_temp: temporary labels for each POI in the given list,
            -2 for home, -3 for school, -4 to -n for others
        data_frame: pandas data frame of all one day data of the device
            (columns WLATITUDE, WLONGITUDE, TIMESTAMP, TIME_SGT, DISTANCE_DELTA)
        home_cover_radius, sch_cover_radius, poi_cover_radius: distance in
            meters within which a point is assigned to home / school / other POI
        poi_min_dwell_time: POI visits shorter than this (in the unit of
            TIMESTAMP) are discarded
    Output:
        a dict with the keys
            pois_latlon_chro: chronological pois, can be duplicated, like [home, school, home]
            pois_label_chro: chronological poi labels, -2 for home, -3 for school,
                1 to n for other POIs
            pois_start_idx / pois_end_idx: first / last row position of each visit
            pois_start_sgt / pois_end_sgt: TIME_SGT of the first / last row of each visit
        Side effect: data_frame['POI_LABEL'] is set to the label of each point
        indicating which POI it belongs to, -1 for none, others as above.
    """
    # list of lat/lon and timestamp of all points across the day
    latlon_all = data_frame[['WLATITUDE','WLONGITUDE']].values.tolist()
    ts_all = data_frame['TIMESTAMP'].values
    sgt_all = data_frame['TIME_SGT'].values
    lat_all = data_frame['WLATITUDE'].values
    delta_dist_all = data_frame['DISTANCE_DELTA'].values

    # pt-level label indicating which poi this point belongs to, -1 = none
    poi_label_pt = np.array([-1]*len(latlon_all))

    # go through each POI, label all points within its cover radius
    for idx, poi in enumerate(pois_latlon_comb):
        # a list comprehension instead of map(): on Python 3 np.array(map(...))
        # is a 0-d object array and the comparison below fails
        dist2poi_array = np.array(
            [great_circle_dist(pt, [poi[0], poi[1]], unit="meters") for pt in latlon_all])
        label = pois_label_temp[idx]
        if label == -2:    # home
            radius = home_cover_radius
        elif label == -3:  # school
            radius = sch_cover_radius
        else:              # other pois
            radius = poi_cover_radius
        poi_label_pt[dist2poi_array <= radius] = label

    # wipe out short noise between two adjacent same pois
    poi_label_chunks = chunks_real(poi_label_pt, include_values=True)
    num_chunks = len(poi_label_chunks)
    if num_chunks > 1:
        for idx, label_chunk in enumerate(poi_label_chunks):
            if label_chunk[2] != -1:
                continue  # only noise chunks are candidates
            start, end = label_chunk[0], label_chunk[1]
            is_interior = 0 < idx < num_chunks-1
            if idx == 0:
                # noise at the beginning of the day: merge into the next chunk
                fill_label, max_duration = poi_label_chunks[idx+1][2], 60*5
            elif idx == num_chunks-1:
                # noise at the end of the day: merge into the previous chunk
                fill_label, max_duration = poi_label_chunks[idx-1][2], 60*5
            elif poi_label_chunks[idx-1][2] == poi_label_chunks[idx+1][2]:
                # noise between two visits of the same poi
                fill_label, max_duration = poi_label_chunks[idx-1][2], 60*20
            else:
                continue

            lat_cur = lat_all[start:end]
            duration = ts_all[end-1]-ts_all[start]
            # The noise is absorbed if the chunk is short, or the average
            # delta time is big, or most points have an invalid location, or
            # (between two visits only) the average velocity is small. The
            # duration test stays first so that a zero duration never reaches
            # the velocity division.
            if (duration < max_duration
                    or float(duration)/(end-start) > 1000
                    or len(lat_cur[np.isnan(lat_cur)])*1.0/len(lat_cur) > 0.9
                    or (is_interior
                        and np.nansum(delta_dist_all[start:end])/duration < 1.0)):
                poi_label_pt[start:end] = fill_label

    # obtain the pois_latlon_chro and pois_label_chro
    pois_latlon_comb = np.array(pois_latlon_comb)
    pois_label_temp = np.array(pois_label_temp)
    pois_latlon_chro = []
    pois_label_chro = []
    pois_start_idx = []
    pois_start_sgt = []
    pois_end_idx = []
    pois_end_sgt = []
    num_normal_poi = 1
    # NOTE: poi_label_pt is modified while the chunks are consumed, exactly as
    # before; the iteration is intentionally not materialized up front.
    for label_chunk in chunks(poi_label_pt, include_values=True):
        start, end, label = label_chunk[0], label_chunk[1], label_chunk[2]
        if label == -1:
            continue  # non poi chunk
        if ts_all[end-1]-ts_all[start] < poi_min_dwell_time:
            # the poi chunk is too short, remove it
            poi_label_pt[start:end] = -1
            continue

        pois_latlon_chro.append(pois_latlon_comb[pois_label_temp==label,:].tolist()[0])
        if label == -2 or label == -3:
            # home or school keep their label
            pois_label_chro.append(label)
        else:
            # normal pois are simply numbered 1..n in order of appearance
            pois_label_chro.append(num_normal_poi)
            poi_label_pt[start:end] = num_normal_poi
            num_normal_poi += 1
        pois_start_idx.append(start)
        pois_start_sgt.append(sgt_all[start])
        pois_end_idx.append(end-1)
        pois_end_sgt.append(sgt_all[end-1])

        # if there are more than two points with valid location for this poi,
        # then remove the first and last point with valid location out
        lat_cur_poi = lat_all[start:end]
        idx_valid_loc = np.where(~np.isnan(lat_cur_poi))[0]
        if len(idx_valid_loc) > 2:
            if start != 0:
                # to avoid creating invalid trips in the beginning of the day
                poi_label_pt[start:start+idx_valid_loc[0]+1] = -1
            if end != len(lat_all):
                # to avoid creating invalid trips in the end of the day
                poi_label_pt[start+idx_valid_loc[-1]:end] = -1

    # Assign the array itself: it is positional like every index used above.
    # Wrapping it in pd.Series would align on the frame's index and misplace
    # labels whenever that index is not 0..n-1.
    data_frame['POI_LABEL'] = poi_label_pt
    pois_dict = {'pois_latlon_chro': pois_latlon_chro, 'pois_label_chro': pois_label_chro,
                 'pois_start_idx': pois_start_idx, 'pois_end_idx': pois_end_idx,
                 'pois_start_sgt': pois_start_sgt, 'pois_end_sgt': pois_end_sgt}
    return pois_dict
def evaluate_overall_bibibinary(vehicle_or_not_model,
                                bus_or_not_model,
                                mrt_or_car_model,
                                features_test,
                                labels_test,
                                vehicle_or_not_idx,
                                bus_or_not_idx,
                                mrt_or_car_idx,
                                if_smooth=True):
    """
    Evaluate a cascade of three binary classifiers and report the overall result.

    Every sample is first classified as vehicle / not vehicle. Vehicle samples are
    then classified as bus / not bus, and the "not bus" ones as mrt / car.
    The combined label codes appended to the result are:
    5 (stationary or stop), 2 (mrt), 3 (bus), 4 (car).

    The textual report (label counts, confusion matrix, classification report) and
    the accuracy are pushed to the module-level ``evaluation_report``.

    :param vehicle_or_not_model: Model exposing ``predict``; 0 = not vehicle, 1 = vehicle
    :param bus_or_not_model: Model exposing ``predict``; 0 = not bus, 1 = bus
    :param mrt_or_car_model: Model exposing ``predict``; 0 = mrt, 1 = car
    :param features_test: DataFrame of test features; must contain a 'trip_id' column
    :param labels_test: Ground-truth labels compared against the combined prediction
    :param vehicle_or_not_idx: Positional column selection fed to vehicle_or_not_model
    :param bus_or_not_idx: Positional column selection fed to bus_or_not_model
    :param mrt_or_car_idx: Positional column selection fed to mrt_or_car_model
    :param if_smooth: Smooth the vehicle/non-vehicle prediction per trip; only applied
        when the argument is exactly ``True``
    :return: List of combined label codes
    """

    def _predict_labels(model, column_idx):
        # Predict on the selected feature columns; a 2-D output (one score per
        # class) is collapsed to the index of the highest-scoring class.
        predicted = model.predict(np.array(features_test.iloc[:, column_idx]))
        if len(np.shape(predicted)) > 1:
            predicted = np.argmax(predicted, 1)
        return predicted

    write = "**********Evaluate Overall Result**********\n"
    write += "with 4 labels\n"

    vehicle_or_not_result = _predict_labels(vehicle_or_not_model,
                                            vehicle_or_not_idx)
    bus_or_not_result = _predict_labels(bus_or_not_model, bus_or_not_idx)
    mrt_or_car_result = _predict_labels(mrt_or_car_model, mrt_or_car_idx)

    # consecutive rows sharing a trip_id form one chunk (start, end)
    trip_chunks = list(chunks(features_test['trip_id'].tolist()))
    if if_smooth is True:
        for trip_chunk in trip_chunks:
            start, end = trip_chunk[0], trip_chunk[1]
            vehicle_or_not_result[start:end] = smooth_is_vehicle(
                features_test.iloc[start:end],
                vehicle_or_not_result[start:end])

    # NOTE: a sample with an unexpected label is only reported, not appended, so
    # result_label ends up shorter than labels_test in that case and the metric
    # calls below will presumably reject the length mismatch.
    result_label = []
    for idx, t in enumerate(vehicle_or_not_result):
        if t == 0:  # stationary or stop
            result_label.append(5)
        elif t == 1:  # vehicle
            if bus_or_not_result[idx] == 0:
                if mrt_or_car_result[idx] == 0:
                    result_label.append(2)  # mrt
                elif mrt_or_car_result[idx] == 1:
                    result_label.append(4)  # car
                else:
                    # previously this case was dropped without any message
                    print("Error in overall evaluation: wrong mrt_or_car label! at idx: %d" % idx)
            elif bus_or_not_result[idx] == 1:
                result_label.append(3)  # bus
            else:
                print("Error in overall evaluation: wrong label! at idx: %d" % idx)
        else:  # vehicle_or_not prediction is neither 0 nor 1
            print("Error in overall evaluation: wrong label! at idx %d, %d" % (idx, t))

    write += str(Counter(labels_test)) + '\n'
    con_matrix = confusion_matrix(labels_test, result_label)
    acc = accuracy_score(labels_test, result_label)
    write += str(con_matrix) + '\n'
    write += "Classification report:\n"
    write += str(classification_report(labels_test, result_label)) + '\n'
    evaluation_report.add_content(write)
    evaluation_report.add_accuracy(acc)
    return result_label
def evaluate_overall_lstm(vehicle_or_not_model,
                          vehicle_type_model,
                          features_test,
                          labels_test,
                          vehicle_or_not_idx,
                          vehicle_type_idx,
                          if_smooth=True):
    """
    Evaluate the two-stage sequence-model pipeline and report the overall result.

    Stage one classifies every sample as vehicle / not vehicle; stage two assigns
    a vehicle type. The combined label codes appended to the result are:
    5 (stationary or stop), 2 (mrt), 3 (bus), 4 (car).

    The textual report (label counts, confusion matrix, classification report) and
    the accuracy are pushed to the module-level ``evaluation_report``.

    :param vehicle_or_not_model: Model exposing ``predict``; 0 = not vehicle, 1 = vehicle
    :param vehicle_type_model: Model exposing ``predict``; 0 = mrt, 1 = bus, 2 = car
    :param features_test: DataFrame of test features; must contain a 'trip_id' column
    :param labels_test: Ground-truth labels compared against the combined prediction
    :param vehicle_or_not_idx: Positional column selection fed to vehicle_or_not_model;
        the selected columns are reshaped to (samples, 6, len(idx) / 6)
    :param vehicle_type_idx: Positional column selection fed to vehicle_type_model;
        reshaped the same way
    :param if_smooth: Smooth the vehicle/non-vehicle prediction and then the combined
        labels per trip; only applied when the argument is exactly ``True``
    :return: List of combined label codes
    """
    num_time_steps = 6  # every sample is fed to the models as 6 time steps

    def _predict_labels(model, column_idx):
        # Reshape the flat feature columns into (samples, steps, features per step),
        # predict, and collapse a 2-D output to the highest-scoring class index.
        sequences = np.reshape(
            np.array(features_test.iloc[:, column_idx]),
            (len(features_test), num_time_steps,
             int(len(column_idx) / num_time_steps)))
        predicted = model.predict(sequences)
        if len(np.shape(predicted)) > 1:
            predicted = np.argmax(predicted, 1)
        return predicted

    write = "**********Evaluate Overall Result**********\n"
    write += "Using manual labelled data, with 4 labels\n"

    vehicle_or_not_result = _predict_labels(vehicle_or_not_model,
                                            vehicle_or_not_idx)
    vehicle_type_result = _predict_labels(vehicle_type_model, vehicle_type_idx)

    # consecutive rows sharing a trip_id form one chunk (start, end)
    trip_chunks = list(chunks(features_test['trip_id'].tolist()))
    if if_smooth is True:
        write += "Smoooooooothing vehicle_or_not_result~~~~~~~~\n"
        for trip_chunk in trip_chunks:
            start, end = trip_chunk[0], trip_chunk[1]
            vehicle_or_not_result[start:end] = smooth_is_vehicle(
                features_test.iloc[start:end],
                vehicle_or_not_result[start:end])

    # NOTE: a sample with an unexpected label is only reported, not appended, so
    # result_label ends up shorter than labels_test in that case and the metric
    # calls below will presumably reject the length mismatch.
    result_label = []
    for idx, t in enumerate(vehicle_or_not_result):
        if t == 0:  # stationary or stop
            result_label.append(5)
        elif t == 1:  # vehicle
            if vehicle_type_result[idx] == 0:
                result_label.append(2)  # mrt
            elif vehicle_type_result[idx] == 1:
                result_label.append(3)  # bus
            elif vehicle_type_result[idx] == 2:
                result_label.append(4)  # car
            else:
                # Report the offending prediction. The original indexed the model
                # object and concatenated it to a str, which raised instead of
                # printing.
                print("Error in overall evaluation: wrong label! at idx %d, %s" %
                      (idx, vehicle_type_result[idx]))
        else:  # vehicle_or_not prediction is neither 0 nor 1
            print("Error in overall evaluation: wrong label! at idx %d, %d" % (idx, t))

    if if_smooth is True:
        write += "Smoooooooothing smooth_vehicle_type~~~~~~~~\n"
        for trip_chunk in trip_chunks:
            start, end = trip_chunk[0], trip_chunk[1]
            result_label[start:end] = smooth_vehicle_type(
                features_test.iloc[start:end], result_label[start:end])

    write += str(Counter(labels_test)) + '\n'
    con_matrix = confusion_matrix(labels_test, result_label)
    acc = accuracy_score(labels_test, result_label)
    write += str(con_matrix) + '\n'
    write += "Classification report:\n"
    write += str(classification_report(labels_test, result_label)) + '\n'
    evaluation_report.add_content(write)
    evaluation_report.add_accuracy(acc)
    return result_label
def calc_geo_time_features(data_frame,
                           queried_date_str,
                           window_size,
                           high_velocity_thresh=40):
    """
    Derive time, step, distance and velocity features from the raw hardware data.

    The new attributes are written into ``data_frame`` in place as new columns.

    Columns read: TIMESTAMP, STEPS, WLATITUDE, WLONGITUDE.
    Columns added: ANALYZED_DATE, TIME_SGT, TIME_DELTA, STEPS_DELTA, DISTANCE_DELTA,
    VELOCITY, ACCELERATION, MOV_AVE_VELOCITY, MOV_AVE_ACCELERATION.
    Columns modified: WLATITUDE and WLONGITUDE are set to NaN on rows whose velocity
    exceeds ``high_velocity_thresh`` and on the row directly before each of them.

    Note that ACCELERATION is computed as VELOCITY / TIME_DELTA, before the velocity
    of rows without location is filled in and before too-high velocities are removed.

    :param data_frame: The original dataframe, including those raw features
    :param queried_date_str: Analyzed date, stored as-is in every row of ANALYZED_DATE
    :param window_size: Window size of sliding window, forwarded to moving_ave_velocity
    :param high_velocity_thresh: A threshold to determine whether the velocity is too
        high, unit : m/s
    :return: False if any time delta is negative (ANALYZED_DATE, TIME_SGT and
        TIME_DELTA have already been written at that point), True otherwise
    """
    # add analyzed date into the data frame
    # NOTE(review): the Series built here and below carry a default 0..n-1 index and
    # column assignment aligns on index, so this presumably expects data_frame to
    # have a 0-based integer index - confirm against the callers
    data_frame['ANALYZED_DATE'] = pd.Series([queried_date_str] *
                                            len(data_frame))

    # calculate the SGT time of the day, in hours
    time_SGT = map(lambda x: get_hour_SGT(x), data_frame['TIMESTAMP'].values)
    data_frame['TIME_SGT'] = pd.Series(time_SGT)

    # calculate time delta since the last measurement, in seconds
    # (assumes TIMESTAMP is expressed in seconds - TODO confirm)
    a = np.array(data_frame.iloc[:-1]['TIMESTAMP'])
    b = np.array(data_frame.iloc[1:]['TIMESTAMP'])
    delta_timestamps = list(b - a)
    # NOTE(review): label-based lookup of row 0; fails on an empty frame or one
    # without a 0 label
    if data_frame['TIME_SGT'][0] < 1.5:
        # first point lies between midnight and 1.5am: use the seconds elapsed since
        # midnight as the delta of the first measurement
        delta_timestamps = [int(data_frame['TIME_SGT'][0] * 3600)
                            ] + delta_timestamps
    else:
        # add a zero value for the first measurement when first point is not within
        # midnight to 1.5am
        delta_timestamps = [0] + delta_timestamps
    data_frame['TIME_DELTA'] = pd.Series(delta_timestamps)

    # check if there's negative delta_t; abort and report how many there are
    ts_array = np.array(delta_timestamps)
    if any(ts_array < 0):
        logging.error("There's negative delta_t from DB!!! Length is: " +
                      str(sum(ts_array < 0)))
        return False

    # calculate steps delta since the last measurement
    consec_steps = zip(data_frame[['STEPS']].values[:-1],
                       data_frame[['STEPS']].values[1:])
    # each element is a one-item row because STEPS is selected with a column list
    delta_steps = map(lambda x: x[1][0] - x[0][0], consec_steps)
    # filter out negative delta_steps
    delta_steps = [dstep if dstep >= 0 else 0 for dstep in delta_steps]
    # add a zero value for the first measurement where no delta is available
    data_frame['STEPS_DELTA'] = pd.Series([0] + delta_steps)

    # select rows in data frame that have valid locations
    df_validloc = data_frame.loc[~np.isnan(data_frame['WLATITUDE']) &
                                 ~np.isnan(data_frame['WLONGITUDE'])]
    # calculate distance delta from pairs of valid lat/lon locations that follow each other
    valid_latlon = df_validloc[['WLATITUDE', 'WLONGITUDE']].values
    dist_delta = list(
        map(
            lambda loc_pair: great_circle_dist(
                loc_pair[0], loc_pair[1], unit="meters"),
            zip(valid_latlon[:-1], valid_latlon[1:])))
    # calculate time delta from pairs of valid timestamps
    valid_times = df_validloc['TIMESTAMP'].values
    time_delta = valid_times[1:] - valid_times[:-1]
    # calculate velocity, m/s
    # NOTE(review): two valid-location rows with the same timestamp give a zero
    # time delta here - confirm this cannot happen or is acceptable
    velocity = dist_delta / time_delta

    # create new columns for delta distance and velocity; they are indexed by the
    # second row of each valid pair, so all other rows are left as NaN
    data_frame['DISTANCE_DELTA'] = pd.Series(dist_delta, df_validloc.index[1:])
    data_frame['VELOCITY'] = pd.Series(
        velocity, df_validloc.index[1:])  # velocity in m/s
    data_frame['ACCELERATION'] = data_frame['VELOCITY'] / data_frame[
        'TIME_DELTA']  # acceleration in m/s^2

    # assign the velocity of those nan-loc points with the latter first valid velocity
    validloc_label = np.isnan(
        data_frame['WLATITUDE'].values)  # True for points with nan loc
    validloc_label_chunks = chunks(validloc_label, include_values=True)
    for label_chunk in validloc_label_chunks:
        # find True chunks (no loc) and assign the velocity; a chunk that runs to
        # the end of the frame has no following row to copy from and is skipped
        if label_chunk[2] and label_chunk[1] != len(data_frame):
            # NOTE(review): the target slice is offset by data_frame.index[0] while
            # the source row is looked up by the bare label label_chunk[1]; the two
            # only agree when the index starts at 0 - confirm
            data_frame.loc[data_frame.index[0] + label_chunk[0]:data_frame.index[0] + label_chunk[1] - 1, 'VELOCITY'] = \
                data_frame['VELOCITY'][label_chunk[1]]

    # replace very high velocity values which are due to wifi
    # localizations errors with NaN in VELOCITY column
    idx_too_high = np.where(
        data_frame['VELOCITY'].values > high_velocity_thresh)[0].tolist()
    # turn positions into index labels by adding the first index label
    idx_too_high = [item + data_frame.index[0] for item in idx_too_high]
    # the row before each too-high row is invalidated as well
    idx_bef_too_high = (np.array(idx_too_high) - 1).tolist()
    data_frame.loc[
        idx_too_high,
        ['WLATITUDE', 'WLONGITUDE', 'DISTANCE_DELTA', 'VELOCITY']] = np.nan
    data_frame.loc[
        idx_bef_too_high,
        ['WLATITUDE', 'WLONGITUDE', 'DISTANCE_DELTA', 'VELOCITY']] = np.nan

    # calculate the moving average of velocity, m/s
    LARGE_TIME_JUMP = 60  # seconds
    velocity_all = data_frame['VELOCITY'].values
    moving_ave_velocity_all = moving_ave_velocity(velocity_all,
                                                  np.array(delta_timestamps),
                                                  LARGE_TIME_JUMP, window_size)
    # the same moving-average helper is reused for the acceleration column
    moving_ave_acc_all = moving_ave_velocity(data_frame['ACCELERATION'].values,
                                             np.array(delta_timestamps),
                                             LARGE_TIME_JUMP, window_size)
    data_frame['MOV_AVE_VELOCITY'] = pd.Series(
        moving_ave_velocity_all)  # velocity in m/s
    data_frame['MOV_AVE_ACCELERATION'] = pd.Series(
        moving_ave_acc_all)  # acceleration in m/s^2

    return True