Example #1
import argparse
import pickle
from glob import glob

# Assumed project-local imports, named from the calls below:
# pr (survey parsing helpers), SurveyConstants, LocationContext,
# gps (geometry helpers), bD (result writer), and the gps_service class.


def main():
    print('now entered main')

    parser = argparse.ArgumentParser()

    parser.add_argument('-i', '-I', help='input CSV file', required=True)
    parser.add_argument('-o', '-O', help='output path', required=True)
    parser.add_argument('-c', '-C', help='cluster data path', required=True)  # the code below depends on this path

    args = parser.parse_args()

    input_file = args.i
    output_path = args.o
    cluster_data_path = args.c

    cluster_service = gps_service()

    print('arguments assigned to variables')
    data = pr.getAllData(input_file)
    print('keeping only app init and live listening')
    data_to_use = pr.filtersurveydata(data, SurveyConstants.CONDITION_ID, ['1', '2', '3', '4'])
    cluster_data_files = glob(cluster_data_path + '*.data')
    app_init_data = pr.filtersurveydata(data_to_use, SurveyConstants.USER_INITIATED, ['false'])
    listening_data = pr.filtersurveydata(data_to_use, SurveyConstants.LISTENING, ['true'])
    data = app_init_data + listening_data
    print('done')
    per_participant_data = pr.getPerParticipantData(data)
    print('per participant data extracted')
    participant_list = list(per_participant_data.keys())
    print(participant_list)
    min_data_sample_no = 5
    final_result = {}
    cluster_results = {}
    conf_mat_results = {}
    for pid in participant_list:
        print('\n\npid: ' + pid)
        if len(per_participant_data[pid]) < min_data_sample_no:
            print('# of samples < min_data_sample_no (' + str(min_data_sample_no) + '), skipping pid')
            continue
        if cluster_data_path + pid + '_all_data.data' not in cluster_data_files:
            print('could not find data file for pid:', pid, ', skipping')
            continue
        final_result[pid] = {}
        cluster_results[pid] = {}
        cluster_service.clean_house()
        cluster_service.set_pid(pid)
        cluster_service.set_participant_data(per_participant_data[pid])
        missing_gps = 0
        with open(cluster_data_path + pid + '_all_data.data', 'rb') as f:
            data_dict = pickle.load(f)
        cluster_boundaries = data_dict['boundary']
        cluster_labels = data_dict['label']
        for data_sample in per_participant_data[pid]:
            n_pid = data_sample[SurveyConstants.PATIENT_ID]
            cid = data_sample[SurveyConstants.CONDITION_ID]
            sid = data_sample[SurveyConstants.SESSION_ID]
            if not data_sample[SurveyConstants.GPS_PATH]:
                # empty GPS file path; count it and skip this sample
                missing_gps += 1
                continue
            gps_coords_clean = pr.getcleangpsdata(data_sample[SurveyConstants.GPS_PATH], remove_duplicates=True,
                                                  pid=n_pid, cid=cid, sid=sid)
            if gps_coords_clean is None:
                print('no GPS data for', n_pid, cid, sid, ', skipping')
                continue
            travel_result = cluster_service.find_travelling(gps_coords_clean)
            final_result[pid][(n_pid, cid, sid)] = \
                [(LocationContext.LOCATION_CONTEXT_VALUES[data_sample[SurveyConstants.LOCATION_CONTEXT]],
                  len(gps_coords_clean))]
            cluster_results[pid][(n_pid, cid, sid)] = [
                (LocationContext.LOCATION_CONTEXT_VALUES[data_sample[SurveyConstants.LOCATION_CONTEXT]],
                 gps_coords_clean)]
            if travel_result[0]:
                if travel_result[1]:
                    final_result[pid][(n_pid, cid, sid)].append(('Travel', len(travel_result[1])))
                    cluster_results[pid][(n_pid, cid, sid)].append(('Travel', travel_result[1]))
                if travel_result[2]:
                    cluster_decisions = gps.check_polygon_memberships(cluster_boundaries, travel_result[2])
                    cluster_vals = [sum(x) for x in cluster_decisions]
                    for idx, val in enumerate(cluster_vals):
                        if val:
                            final_result[pid][(n_pid, cid, sid)].append((cluster_labels[idx], val))
                            cluster_results[pid][(n_pid, cid, sid)].append(
                                (cluster_labels[idx], travel_result[2], cluster_decisions[idx]))
        print('confusion matrix:')
        conf_mat = {}
        for main_tuple in final_result[pid]:
            count_data = final_result[pid][main_tuple]
            actual_label = count_data[0][0]
            # entries after the first are (predicted_label, count) pairs
            for predicted_label, count in count_data[1:]:
                if (actual_label, predicted_label) not in conf_mat:
                    conf_mat[(actual_label, predicted_label)] = 0
                conf_mat[(actual_label, predicted_label)] += count
        print(conf_mat)
        conf_mat_results[pid] = conf_mat
        print('Missing GPS:', missing_gps)

    print('Writing results to file')
    bD.write_variable(final_result, 'count_result.res', output_path)
    bD.write_variable(cluster_results, 'cluster_results.res', output_path)
    bD.write_variable(conf_mat_results, 'conf_mat_results.res', output_path)
    print('done')
    print('TADAA!!')
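
The example defines main() but never invokes it; the usual entry-point guard makes the script runnable:

if __name__ == '__main__':
    main()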

Example #2

# A method of the gps_service class from Example #1 (class definition omitted here).
# Assumes the same module-level imports, plus collections and the clusters module;
# a hedged sketch of the DBSCAN helper it calls follows the method.
def get_travelling_and_stationary_clusters(self, eps_list=range(20, 51, 10),
                                           min_sample_list=(3, 5, 7)):
    '''
    For each data point in the participant data, distinguish between travelling and
    non-travelling data. Once the travelling clusters and non-travelling points have
    been extracted, run DBSCAN on the non-travelling points to obtain the stationary
    clusters and the noise markers.
    :return: the travel clusters, stationary clusters, stationary cluster boundaries
             and labels, noise markers, error-file count, and raw stationary points.
    '''
    for data_sample in self.__participant_data:
        try:
            self.__internal_location_info[(data_sample[SurveyConstants.PATIENT_ID],
                                           data_sample[SurveyConstants.CONDITION_ID],
                                           data_sample[SurveyConstants.SESSION_ID])] = \
                LocationContext.LOCATION_CONTEXT_VALUES[data_sample[SurveyConstants.LOCATION_CONTEXT]]
            gps_coords_clean = pr.getcleangpsdata(data_sample[SurveyConstants.GPS_PATH],
                                                  remove_duplicates=True,
                                                  pid=data_sample[SurveyConstants.PATIENT_ID],
                                                  cid=data_sample[SurveyConstants.CONDITION_ID],
                                                  sid=data_sample[SurveyConstants.SESSION_ID])
            if not gps_coords_clean:
                continue
        except IOError:
            self.__error_files += 1
            continue
        # TODO: the speed limit has to be decided; are people who are walking
        # also considered to be travelling?
        travel_result = self.find_travelling(gps_coords_clean)
        if travel_result[0]:
            if travel_result[1]:
                self.__travel_clusters.append(travel_result[1])
            if travel_result[2]:
                self.__stationary_points += travel_result[2]
        else:
            self.__stationary_points += gps_coords_clean
    # Since all the stationary points are collected for a given participant,
    # the hull intersection functions never get called.
    print('collected all points, clustering, eps_list:', list(eps_list),
          ', min_sample_list:', list(min_sample_list))
    sc_nz = clusters.getdbscanclusters(self.__stationary_points, eps_list, min_sample_list)
    print('done')
    if sc_nz is not None:
        if sc_nz['sc']:
            self.__stationary_clusters = sc_nz['sc']
        if sc_nz['nz']:
            self.__noise_markers = sc_nz['nz']
    print('stationary clusters: ' + str(len(self.__stationary_clusters)) +
          ', travel clusters: ' + str(len(self.__travel_clusters)))
    for x, cluster_points in enumerate(self.__stationary_clusters, start=1):
        try:
            boundary_points = gps.getconvexhull(cluster_points)
        except Exception:
            print('Error getting the convex hull of the cluster. cluster #', x)
            self.__stationary_cluster_label.append('Error,C-' + str(x))
            continue
        self.__stationary_cluster_boundaries.append(boundary_points)
        # Label the cluster with the most common location context among its
        # points; each point carries (pid, cid, sid) as its last three fields.
        cluster_point_types = [self.__internal_location_info[(cluster_point[-3],
                                                              cluster_point[-2],
                                                              cluster_point[-1])]
                               for cluster_point in cluster_points]
        label_counts = collections.Counter(cluster_point_types)
        self.__stationary_cluster_label.append(label_counts.most_common(1)[0][0])
    return (self.__travel_clusters, self.__stationary_clusters,
            self.__stationary_cluster_boundaries, self.__stationary_cluster_label,
            self.__noise_markers, self.__error_files, self.__stationary_points)
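
For reference, here is a minimal sketch of what a clusters.getdbscanclusters helper compatible with the call above could look like, built on scikit-learn's DBSCAN with a haversine metric. Only the function name, the 'sc'/'nz' result keys, and the metre-scale eps values come from the example itself; the point layout, the parameter search, and the selection criterion are assumptions.

import numpy as np
from sklearn.cluster import DBSCAN

EARTH_RADIUS_M = 6371000.0  # mean Earth radius; converts eps in metres to radians


def getdbscanclusters(points, eps_list, min_sample_list):
    # Assumption: each sample's first two fields are (lat, lon) in degrees.
    if not points:
        return None
    coords = np.radians([(p[0], p[1]) for p in points])
    best_labels, best_count = None, -1
    # Grid-search the two parameter lists; keeping the run that yields the most
    # clusters is an assumed selection criterion.
    for eps in eps_list:
        for min_samples in min_sample_list:
            labels = DBSCAN(eps=eps / EARTH_RADIUS_M, min_samples=min_samples,
                            metric='haversine').fit(coords).labels_
            n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
            if n_clusters > best_count:
                best_labels, best_count = labels, n_clusters
    # 'sc': one list of points per cluster id; 'nz': the noise points (label -1).
    sc = [[p for p, lbl in zip(points, best_labels) if lbl == k]
          for k in range(best_count)]
    nz = [p for p, lbl in zip(points, best_labels) if lbl == -1]
    return {'sc': sc, 'nz': nz}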