import argparse
import pickle
from glob import glob

# Project-local dependencies. The module paths below are assumptions; only the
# attributes used further down (pr.getAllData, gps.check_polygon_memberships,
# bD.write_variable, ...) are taken from the original code.
import pr                  # survey parsing helpers
import gps                 # GPS geometry helpers
import bD                  # result serialisation
from survey_constants import SurveyConstants, LocationContext
from gps_service import gps_service


def main():
    print('now entered main')
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '-I', help='input CSV file', required=True)
    parser.add_argument('-o', '-O', help='output path', required=True)
    parser.add_argument('-c', '-C', help='cluster data path', required=False)
    args = parser.parse_args()
    input_file = args.i
    output_path = args.o
    cluster_data_path = args.c
    # -c is declared optional, but every later step depends on it; fail early
    # instead of crashing on `None + '*.data'` below.
    if cluster_data_path is None:
        parser.error('cluster data path (-c) is required')
    cluster_service = gps_service()
    print('arguments assigned to variables')

    data = pr.getAllData(input_file)
    print('keeping only app-initiated and live-listening samples')
    data_to_use = pr.filtersurveydata(data, SurveyConstants.CONDITION_ID, ['1', '2', '3', '4'])
    # cluster_data_path is expected to end with a path separator.
    cluster_data_files = glob(cluster_data_path + '*.data')
    app_init_data = pr.filtersurveydata(data_to_use, SurveyConstants.USER_INITIATED, ['false'])
    listening_data = pr.filtersurveydata(data_to_use, SurveyConstants.LISTENING, ['true'])
    data = app_init_data + listening_data
    print('done')

    per_participant_data = pr.getPerParticipantData(data)
    print('per-participant data extracted')
    participant_list = list(per_participant_data.keys())
    print(participant_list)

    min_data_sample_no = 5
    final_result = {}
    cluster_results = {}
    conf_mat_results = {}
    for pid in participant_list:
        print('\n\npid: ' + pid)
        if len(per_participant_data[pid]) < min_data_sample_no:
            print('# of samples < min_data_sample_no (' + str(min_data_sample_no) + '), skipping pid')
            continue
        if cluster_data_path + pid + '_all_data.data' not in cluster_data_files:
            print('could not find data file for pid:', pid, ', skipping')
            continue
        final_result[pid] = {}
        cluster_results[pid] = {}
        cluster_service.clean_house()
        cluster_service.set_pid(pid)
        cluster_service.set_participant_data(per_participant_data[pid])
        # Load the precomputed cluster boundaries and labels for this participant.
        with open(cluster_data_path + pid + '_all_data.data', 'rb') as f:
            data_dict = pickle.load(f)
        cluster_boundaries = data_dict['boundary']
        cluster_labels = data_dict['label']
        missing_gps = 0
        for data_sample in per_participant_data[pid]:
            n_pid = data_sample[SurveyConstants.PATIENT_ID]
            cid = data_sample[SurveyConstants.CONDITION_ID]
            sid = data_sample[SurveyConstants.SESSION_ID]
            if '' == data_sample[SurveyConstants.GPS_PATH]:
                # print('empty gps file path, skipping\n', data_sample)
                missing_gps += 1
                continue
            gps_coords_clean = pr.getcleangpsdata(data_sample[SurveyConstants.GPS_PATH],
                                                  remove_duplicates=True,
                                                  pid=n_pid, cid=cid, sid=sid)
            if gps_coords_clean is None:
                print('no GPS data for', n_pid, cid, sid, ', skipping')
                continue
            travel_result = cluster_service.find_travelling(gps_coords_clean)
            # Seed each session's result list with the self-reported location label.
            context_label = LocationContext.LOCATION_CONTEXT_VALUES[data_sample[SurveyConstants.LOCATION_CONTEXT]]
            final_result[pid][(n_pid, cid, sid)] = [(context_label, len(gps_coords_clean))]
            cluster_results[pid][(n_pid, cid, sid)] = [(context_label, gps_coords_clean)]
            if travel_result[0]:
                if len(travel_result[1]) != 0:
                    final_result[pid][(n_pid, cid, sid)].append(('Travel', len(travel_result[1])))
                    cluster_results[pid][(n_pid, cid, sid)].append(('Travel', travel_result[1]))
                if len(travel_result[2]) != 0:
                    # Count how many stationary points fall inside each known cluster boundary.
                    cluster_decisions = gps.check_polygon_memberships(cluster_boundaries, travel_result[2])
                    cluster_vals = [sum(x) for x in cluster_decisions]
                    for idx in range(len(cluster_vals)):
                        if cluster_vals[idx] != 0:
                            final_result[pid][(n_pid, cid, sid)].append((cluster_labels[idx], cluster_vals[idx]))
                            cluster_results[pid][(n_pid, cid, sid)].append(
                                (cluster_labels[idx], travel_result[2], cluster_decisions[idx]))

        # Tally (actual, predicted) label pairs for this participant.
        print('confusion matrix:')
        conf_mat = {}
        for main_tuple in final_result[pid]:
            count_data = final_result[pid][main_tuple]
            actual_label = count_data[0][0]
            for count_idx in range(1, len(count_data)):
                predicted_label = count_data[count_idx][0]
                if (actual_label, predicted_label) not in conf_mat:
                    conf_mat[(actual_label, predicted_label)] = 0
                conf_mat[(actual_label, predicted_label)] += count_data[count_idx][1]
        print(conf_mat)
        conf_mat_results[pid] = conf_mat
        print('Missing GPS:', missing_gps)

    print('Writing results to file')
    bD.write_variable(final_result, 'count_result.res', output_path)
    bD.write_variable(cluster_results, 'cluster_results.res', output_path)
    bD.write_variable(conf_mat_results, 'conf_mat_results.res', output_path)
    print('done')
    print('TADAA!!')
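
# Entry-point guard so the module can be both imported and run directly. A minimal
# invocation sketch (the file and directory names are illustrative, not from the
# original script; the flags mirror the parser above):
#
#   python travel_cluster_eval.py -i surveys.csv -o results/ -c cluster_data/
#
if __name__ == '__main__':
    main()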
def get_travelling_and_stationary_clusters(self, eps_list=range(20, 51, 10),
                                           min_sample_list=[3, 5, 7]):
    '''
    For each data point within the participant data, distinguish between the
    travelling and non-travelling data. Once all travelling clusters and
    non-travelling points have been extracted, run DBSCAN on the non-travelling
    points to obtain the stationary clusters and noise markers.
    :return: travel clusters, stationary clusters, stationary cluster boundaries,
             stationary cluster labels, noise markers, error-file count and the
             raw stationary points.
    '''
    for data_sample in self.__participant_data:
        try:
            self.__internal_location_info[(data_sample[SurveyConstants.PATIENT_ID],
                                           data_sample[SurveyConstants.CONDITION_ID],
                                           data_sample[SurveyConstants.SESSION_ID])] = \
                LocationContext.LOCATION_CONTEXT_VALUES[data_sample[SurveyConstants.LOCATION_CONTEXT]]
            gps_coords_clean = pr.getcleangpsdata(data_sample[SurveyConstants.GPS_PATH],
                                                  remove_duplicates=True,
                                                  pid=data_sample[SurveyConstants.PATIENT_ID],
                                                  cid=data_sample[SurveyConstants.CONDITION_ID],
                                                  sid=data_sample[SurveyConstants.SESSION_ID])
            if not gps_coords_clean:
                continue
        except IOError:
            self.__error_files += 1
            continue
        # TODO: the speed limit has to be decided; are people walking also considered travelling?
        travel_result = self.find_travelling(gps_coords_clean)
        if travel_result[0]:
            if len(travel_result[1]) != 0:
                self.__travel_clusters.append(travel_result[1])
            if len(travel_result[2]) != 0:
                self.__stationary_points += travel_result[2]
        else:
            self.__stationary_points += gps_coords_clean

    # Since all stationary points are collected for a given participant,
    # the hull-intersection functions never get called.
    print('collected all points, clustering, eps_list:', eps_list, ', min_sample_list:', min_sample_list)
    sc_nz = clusters.getdbscanclusters(self.__stationary_points, eps_list, min_sample_list)
    print('done')
    if sc_nz is not None:
        if sc_nz['sc'] != []:
            self.__stationary_clusters = sc_nz['sc']
        if sc_nz['nz'] != []:
            self.__noise_markers = sc_nz['nz']
    print('stationary clusters: ' + str(len(self.__stationary_clusters)) +
          ', travel clusters: ' + str(len(self.__travel_clusters)))

    for x, cluster_points in enumerate(self.__stationary_clusters, start=1):
        try:
            boundary_points = gps.getconvexhull(cluster_points)
        except Exception:
            # NOTE: on failure a label is appended without a matching boundary, so
            # the boundary and label lists can drift out of alignment; downstream
            # code that indexes them in parallel should be aware of this.
            print('Error getting the convex hull of the cluster. cluster #', x)
            self.__stationary_cluster_label.append('Error,C-' + str(x))
            continue
        self.__stationary_cluster_boundaries.append(boundary_points)
        # Label the cluster with the most common self-reported location context among
        # its member points; (pid, cid, sid) are the trailing three fields of a point.
        cluster_point_types = [self.__internal_location_info[(p[-3], p[-2], p[-1])]
                               for p in cluster_points]
        most_common_label = collections.Counter(cluster_point_types).most_common(1)
        self.__stationary_cluster_label.append(most_common_label[0][0])

    return self.__travel_clusters, self.__stationary_clusters, self.__stationary_cluster_boundaries, \
        self.__stationary_cluster_label, self.__noise_markers, self.__error_files, self.__stationary_points
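
# Usage sketch for the method above, assuming it lives on the same gps_service class
# that main() drives (clean_house / set_pid / set_participant_data are the calls main()
# actually makes; the variable names here are illustrative):
#
#   service = gps_service()
#   service.clean_house()
#   service.set_pid(pid)
#   service.set_participant_data(per_participant_data[pid])
#   (travel_clusters, stationary_clusters, boundaries, labels,
#    noise_markers, error_files, stationary_points) = \
#       service.get_travelling_and_stationary_clusters(eps_list=range(20, 51, 10),
#                                                      min_sample_list=[3, 5, 7])
#
# Presumably the returned boundaries and labels are what a separate step pickles into
# the per-participant '<pid>_all_data.data' files that main() reads back as
# data_dict['boundary'] and data_dict['label'] (an inference, not stated in the code).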