def main(file):
    # Load the ECG beat file
    data_matrix, label_matrix = utilities.load_train_file(file)
    # Get the wavelet features for every beat across all the ailments
    ecg_features = utilities.get_features(data_matrix)
    # Instantiate the CombinedNeuralNet algorithm to train the features
    algorithm = manager.CombinedNeuralNet(ecg_features, label_matrix)
    weights = algorithm.train()

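# Hedged usage sketch (not part of the original module): how the entry
# point might be invoked from the command line. The training-file name is
# a placeholder; utilities.load_train_file defines the real expected
# format.
def _main_usage_sketch():
    import sys
    train_file = sys.argv[1] if len(sys.argv) > 1 else 'train_beats.csv'
    main(train_file)
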
def _run_attribute_query(self, layer, id_attribute, attribute, encoding,
                         use_selected, analysis_step, output_path,
                         file_save_progress_step=0):
    """
    Process the attribute data query.

    Inputs:

        layer - A QgsVectorLayer object

        id_attribute - The name of the attribute that uniquely identifies
            each feature in the layer

        attribute - The name of the attribute to use in the processing
            query

        encoding - The encoding to use when processing the attributes and
            saving the results to disk

        use_selected - A boolean indicating if the processing is to be
            performed only on the selected features or on all features

        analysis_step - A number indicating the amount of overall progress
            to be added after this processing is done

        output_path - The full path to the text file where the results
            are to be saved

        file_save_progress_step - A number indicating the amount of
            overall progress to be added after saving the text file with
            the results
    """

    self.update_info.emit('Running attribute query', 1)
    data = []
    features = utilities.get_features(layer, use_selected)
    for feat in features:
        id_attr = self._get_numeric_attribute(feat, id_attribute)
        attr = self._get_numeric_attribute(feat, attribute, float)
        if attr is not None and id_attr is not None:
            if attr < 0:
                raise ValueError('Attribute must be non-negative')
            data.append((id_attr, attr))
    self.global_progress += analysis_step
    self.progress_changed.emit()
    output_dir, output_name = os.path.split(output_path)
    output_file = self._save_text_file(data, 'Writing attribute file...',
                                       output_dir, output_name, encoding,
                                       file_save_progress_step)
    return output_file

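# Hedged sketch of the _get_numeric_attribute contract assumed above: it
# coerces a feature's attribute value to a numeric type and returns None
# when the value is missing or not numeric. This is an illustration of the
# expected behavior, not the plugin's actual implementation.
def _get_numeric_attribute_sketch(feature, attribute_name, type_=int):
    value = feature.attribute(attribute_name)  # QgsFeature.attribute()
    try:
        return type_(value)
    except (TypeError, ValueError):
        return None
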
def find_best(method_key, feature_set, training_subjects):
    # Load up all the data
    data_dict = utilities.build_data_dictionary(feature_set)

    # Initialize holders
    training_set_features = np.array([])
    training_set_labels = np.array([])

    # Build vectors for training subjects
    for subject in training_subjects:
        score_features, full_features = utilities.get_features(subject,
                                                               data_dict)
        if np.shape(training_set_features)[0] == 0:
            training_set_features = full_features
            training_set_labels = score_features
        else:
            training_set_features = np.vstack((training_set_features,
                                               full_features))
            training_set_labels = np.vstack((training_set_labels,
                                             score_features))

    # Convert raw scores from 0-5 to binary, or 0-2
    training_set_labels = utilities.process_raw_scores(
        training_set_labels, classify_sleep.run_flag)

    if method_key == 'Logistic Regression':
        parameters = {
            'C': [0.001, 0.01, 0.1, 1, 10, 100],
            'penalty': ['l1', 'l2']
        }
        classifier = LogisticRegression()
    elif method_key == 'KNeighbors':
        parameters = {'n_neighbors': [500, 1000, 2000]}
        classifier = KNeighborsClassifier()
    elif method_key == 'MLP':
        parameters = {
            'solver': ['lbfgs'],
            'max_iter': [1000],
            'alpha': 10.0 ** -np.arange(1, 4),
            'hidden_layer_sizes': [(30, 30, 30)]
        }
        classifier = MLPClassifier()
    elif method_key == 'Random Forest':
        parameters = {
            'n_estimators': [50],
            'max_features': [None],
            'max_depth': [10, 50, 100],
            'min_samples_split': [10],
            'min_samples_leaf': [32],
            'bootstrap': [True]
        }
        classifier = RandomForestClassifier()

    # Weight classes inversely to their frequency in the training set
    class_weights = class_weight.compute_class_weight(
        'balanced', classes=np.unique(training_set_labels),
        y=training_set_labels)
    class_weight_dict = {0: class_weights[0], 1: class_weights[1]}
    if len(class_weights) > 2:  # Handles the wake/NREM/REM case
        class_weight_dict = {
            0: class_weights[0],
            1: class_weights[1],
            2: class_weights[2]
        }
    classifier.class_weight = class_weight_dict

    if classify_sleep.run_flag == utilities.RUN_REM:
        scoring = 'neg_log_loss'
    else:
        scoring = 'roc_auc'

    clf = GridSearchCV(classifier, parameters, scoring=scoring)
    clf.fit(training_set_features, training_set_labels)

    if verbose:
        print('Best parameters for set:')
        print(clf.best_params_)
        print('Score on training data: ' +
              str(clf.score(training_set_features, training_set_labels)))

    save_name = ('parameters/' + method_key +
                 utilities.string_from_features(feature_set) + '.npy')
    np.save(save_name, clf.best_params_)
    return clf.best_params_

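# Hedged usage sketch (not part of the original module): feed the best
# parameters found above into a fresh classifier, mirroring how
# train_and_test_model() reloads them with set_params(). The feature set
# and subject IDs here are placeholders.
def _find_best_usage_sketch():
    feature_set = {'motion': True, 'heart_rate': True}  # hypothetical
    best = find_best('Random Forest', feature_set,
                     training_subjects=[1, 2, 3])
    clf = RandomForestClassifier(**best)  # apply the tuned parameters
    return clf
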
def train_and_test_model(training_subjects, testing_subjects, method_key,
                         classifier, feature_set, data_dict,
                         save_to_file=False):
    """ Trains and tests model for given feature set and classifier.

    Args:
        training_subjects ([int]): Subject IDs in training set
        testing_subjects ([int]): Subject IDs in testing set
        method_key (str): Key for classifier
        classifier : Classifier object
        feature_set (dict): Feature set to test
        data_dict (dict): Dictionary to look up subject training and
            testing data
        save_to_file (bool): Flag indicating whether to save class
            probabilities to file

    Returns:
        [int]: ground truth labels
        np.array: predicted labels
        np.array: class prediction probabilities
    """
    classifier_abbrev = str(classifier)[0:4]
    save_name = ('parameters/' + classifier_abbrev +
                 utilities.string_from_features(feature_set) +
                 '_params.npy')

    if LOAD_PARAMS or method_key == 'MLP':
        # TODO: Faster parameter searching with MLP
        params = np.load(save_name).item()
    else:
        params = get_parameters.find_best(method_key, feature_set,
                                          training_subjects)
        np.save(save_name, params)

    classifier.set_params(**params)

    training_set_features = np.array([])
    training_set_true_labels = np.array([])
    testing_set_features = np.array([])
    testing_set_true_labels = np.array([])

    # Get labels and features for training and testing sets
    for subject in training_subjects:
        scores_by_epoch, features_by_epoch = utilities.get_features(
            subject, data_dict)
        if np.shape(training_set_features)[0] == 0:
            training_set_features = features_by_epoch
            training_set_true_labels = scores_by_epoch
        else:
            training_set_features = np.vstack((training_set_features,
                                               features_by_epoch))
            training_set_true_labels = np.vstack(
                (training_set_true_labels, scores_by_epoch))

    for subject in testing_subjects:
        scores_by_epoch, features_by_epoch = utilities.get_features(
            subject, data_dict)
        if np.shape(testing_set_features)[0] == 0:
            testing_set_features = features_by_epoch
            testing_set_true_labels = scores_by_epoch
        else:
            testing_set_features = np.vstack((testing_set_features,
                                              features_by_epoch))
            testing_set_true_labels = np.vstack(
                (testing_set_true_labels, scores_by_epoch))

    # Convert raw labels to 0/1 or 0-2
    training_set_true_labels = utilities.process_raw_scores(
        training_set_true_labels, run_flag)
    testing_set_true_labels = utilities.process_raw_scores(
        testing_set_true_labels, run_flag)

    # Set class weights for those methods that allow them
    class_weights = class_weight.compute_class_weight(
        'balanced', classes=np.unique(training_set_true_labels),
        y=training_set_true_labels)
    class_weight_dict = {0: class_weights[0], 1: class_weights[1]}
    if len(class_weights) > 2:  # Handles wake/NREM/REM case
        class_weight_dict = {
            0: class_weights[0],
            1: class_weights[1],
            2: class_weights[2]
        }
    classifier.class_weight = class_weight_dict

    # Debug-only: Uncomment to reverse the training/testing order, and
    # test Apple Watch data on MESA-trained models
    # classifier = np.load('trained_models/' + classifier_abbrev +
    #     utilities.string_from_features(feature_set) +
    #     '_trained_modelMESA.npy').item()

    # Fit model to training data, get class predictions and class
    # probabilities
    classifier.fit(training_set_features, training_set_true_labels)
    predicted_labels = classifier.predict(testing_set_features)
    class_probabilities = classifier.predict_proba(testing_set_features)

    # Save trained model to use for testing MESA cohort
    save_name = ('trained_models/' + classifier_abbrev +
                 utilities.string_from_features(feature_set) +
                 '_trained_model.npy')
    np.save(save_name, classifier)

    # Optional: save to file for Kalman filter and print performance
    # metrics
    if save_to_file:
        np.savetxt('sleep_modeling/' + str(testing_subjects[0]) + '.csv',
                   class_probabilities, delimiter=',')
        np.savetxt('sleep_modeling/' + str(testing_subjects[0]) +
                   '_classes.csv', testing_set_true_labels, delimiter=',')
        np.savetxt('sleep_modeling/' + str(testing_subjects[0]) +
                   '_predicted_classes.csv', predicted_labels,
                   delimiter=',')

        true_positive_rate_for_interpolation = 0.85
        false_positive_rates, true_positive_rates, thresholds = roc_curve(
            testing_set_true_labels, class_probabilities[:, 1],
            pos_label=1, drop_intermediate=False)

        print('Subject ID: ' + str(testing_subjects[0]))
        print('False positive rate: ' + str(
            np.interp(true_positive_rate_for_interpolation,
                      true_positive_rates, false_positive_rates)))
        print('True positive rate: ' +
              str(true_positive_rate_for_interpolation))
        print('\n\n')

    return testing_set_true_labels, predicted_labels, class_probabilities

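# Hedged illustration of the np.interp step above: given a sampled ROC
# curve, estimate the false positive rate at a fixed true positive rate
# (0.85, matching the code). The curve values here are made up.
def _interp_fpr_sketch():
    true_positive_rates = np.array([0.0, 0.5, 0.8, 0.9, 1.0])
    false_positive_rates = np.array([0.0, 0.1, 0.2, 0.4, 1.0])
    # 0.85 lies halfway between TPR 0.8 and 0.9, so this returns 0.3
    return np.interp(0.85, true_positive_rates, false_positive_rates)
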
def _run_edge_query(self, layer, id_attribute, encoding, use_selected,
                    analysis_step, output_path=None,
                    file_save_progress_step=0, shape_file_path=None):
    # For each pair of current and next features:
    #   * get the closest edge from current to next -> L1
    #   * get the closest edge from next to current -> L2
    #   * project L1's vertices on L2 and get their distance from L1
    #   * project L2's vertices on L1 and get their distance from L2
    #   * the pair with the smallest distance wins!
    self.update_info.emit('Running edge query', 1)
    data = []
    if layer.crs().geographicFlag():
        measurer = self._get_measurer(self.project_crs)
        transformer = self._get_transformer(layer)
    else:
        measurer = self._get_measurer(layer.crs())
        transformer = None
    feature_ids = [f.id() for f in utilities.get_features(layer,
                                                          use_selected)]
    feature_step = analysis_step / float(len(feature_ids))
    i = 0
    while i < len(feature_ids):
        self.update_info.emit(
            "Processing feature {}/{}".format(i + 1, len(feature_ids)), 2)
        features = utilities.get_features(layer, use_selected,
                                          feature_ids[i])
        current = next(iter(features))
        c_id_at = self._get_numeric_attribute(current, id_attribute)
        if c_id_at is not None:
            current_geom = current.geometry()
            geometry_errors = current_geom.validateGeometry()
            if any(geometry_errors):
                raise InvalidFeatureError('Layer: %s - Feature %s has '
                                          'geometry errors. Aborting...'
                                          % (layer.name(), c_id_at))
            elif current_geom.isMultipart():
                raise InvalidFeatureError('Feature %s is multipart. '
                                          'Aborting...' % c_id_at)
            current_poly = self._get_polygon(current_geom, transformer)
            j = i + 1
            while j < len(feature_ids):
                features = utilities.get_features(layer, use_selected,
                                                  feature_ids[j])
                next_ = next(iter(features))
                n_id_at = self._get_numeric_attribute(next_, id_attribute)
                if n_id_at is not None:
                    next_geom = next_.geometry()
                    next_poly = self._get_polygon(next_geom, transformer)
                    segments = self.get_closest_segments(current_poly,
                                                         next_poly)
                    current_segment, next_segment = segments
                    candidates = []
                    for current_vertex in current_segment:
                        candidate = self.find_candidate_points(
                            current_vertex, next_segment, measurer)
                        candidates.append(candidate)
                    for next_vertex in next_segment:
                        candidate = self.find_candidate_points(
                            next_vertex, current_segment, measurer)
                        candidates.append(candidate)
                    ordered_candidates = sorted(candidates,
                                                key=lambda c: c[2])
                    winner = ordered_candidates[0]
                    # Transform the winner's coordinates back to layer CRS
                    from_restored = self._transform_point(winner[0],
                                                          transformer,
                                                          reverse=True)
                    to_restored = self._transform_point(winner[1],
                                                        transformer,
                                                        reverse=True)
                    feat_result = {
                        'distance': winner[2],
                        'from': from_restored,
                        'to': to_restored,
                        'from_attribute': c_id_at,
                        'to_attribute': n_id_at,
                    }
                    data.append(feat_result)
                j += 1
        i += 1
        self.global_progress += feature_step
        self.progress_changed.emit()
    output_files = []
    if any(data):
        if output_path is not None:
            output_dir, output_name = os.path.split(output_path)
            data_to_write = []
            for e_dict in data:
                from_id = e_dict['from_attribute']
                to_id = e_dict['to_attribute']
                distance = e_dict['distance']
                data_to_write.append((from_id, to_id, distance))
            output_file = self._save_text_file(data_to_write,
                                               'Writing edges file...',
                                               output_dir, output_name,
                                               encoding,
                                               file_save_progress_step)
            output_files.append(output_file)
        if shape_file_path is not None:
            output_dir, output_name = os.path.split(shape_file_path)
            self.update_info.emit('Creating edge distance file', 1)
            if not os.path.isdir(output_dir):
                os.mkdir(output_dir)
            self.update_info.emit("edges ...", 2)
            output_shape = self._write_distance_file(data, output_dir,
                                                     output_name,
                                                     encoding,
                                                     layer.crs())
            output_files.append(output_shape)
            self.global_progress += file_save_progress_step
            self.progress_changed.emit()
    return output_files

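# Hedged sketch of the vertex-projection idea used by
# find_candidate_points above: project a point onto a segment, clamp the
# projection to the segment's extent, and return (point, projected_point,
# distance). Plain planar math, standing in for the plugin's
# measurer-based implementation.
import math

def project_point_on_segment(point, seg_start, seg_end):
    px, py = point
    ax, ay = seg_start
    bx, by = seg_end
    abx, aby = bx - ax, by - ay
    length_sq = abx ** 2 + aby ** 2
    if length_sq == 0:  # degenerate segment: both endpoints coincide
        t = 0.0
    else:
        # Parameter of the orthogonal projection, clamped to [0, 1]
        t = max(0.0, min(1.0,
                         ((px - ax) * abx + (py - ay) * aby) / length_sq))
    proj = (ax + t * abx, ay + t * aby)
    distance = math.hypot(px - proj[0], py - proj[1])
    return point, proj, distance
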
def _run_centroid_query(self, layer, id_attribute, encoding, use_selected,
                        analysis_step, output_path=None,
                        file_save_progress_step=0, shape_file_path=None):
    self.update_info.emit('Running centroid query...', 1)
    data = []
    if layer.crs().geographicFlag():
        measurer = self._get_measurer(self.project_crs)
        transformer = self._get_transformer(layer)
    else:
        measurer = self._get_measurer(layer.crs())
        transformer = None
    feature_ids = [f.id() for f in utilities.get_features(layer,
                                                          use_selected)]
    i = 0
    while i < len(feature_ids):
        features = utilities.get_features(layer, use_selected,
                                          feature_ids[i])
        current = next(iter(features))
        c_id_attr = self._get_numeric_attribute(current, id_attribute)
        if c_id_attr is not None:
            current_geom = current.geometry()
            orig_curr_centroid = current_geom.centroid().asPoint()
            trans_curr_centroid = self._transform_point(
                orig_curr_centroid, transformer)
            j = i + 1
            while j < len(feature_ids):
                features = utilities.get_features(layer, use_selected,
                                                  feature_ids[j])
                next_ = next(iter(features))
                n_id_attr = self._get_numeric_attribute(next_,
                                                        id_attribute)
                if n_id_attr is not None:
                    next_geom = next_.geometry()
                    orig_n_centroid = next_geom.centroid().asPoint()
                    trans_n_centroid = self._transform_point(
                        orig_n_centroid, transformer)
                    distance = measurer.measureLine(trans_curr_centroid,
                                                    trans_n_centroid)
                    feat_result = {
                        'current': {
                            'attribute': c_id_attr,
                            'centroid': orig_curr_centroid,
                            'feature_geometry': current_geom,
                        },
                        'next': {
                            'attribute': n_id_attr,
                            'centroid': orig_n_centroid,
                            'feature_geometry': next_geom,
                        },
                        'distance': distance,
                    }
                    data.append(feat_result)
                j += 1
        i += 1
    self.global_progress += analysis_step
    self.progress_changed.emit()
    output_files = []
    if any(data):
        if output_path is not None:
            output_dir, output_name = os.path.split(output_path)
            data_to_write = []
            for c_dict in data:
                current_id = c_dict['current']['attribute']
                next_id = c_dict['next']['attribute']
                distance = c_dict['distance']
                data_to_write.append((current_id, next_id, distance))
            output_file = self._save_text_file(
                data_to_write, 'Writing centroids file...', output_dir,
                output_name, encoding, file_save_progress_step)
            output_files.append(output_file)
        if shape_file_path is not None:
            output_dir, output_name = os.path.split(shape_file_path)
            self.update_info.emit('Creating centroid distance file', 1)
            if not os.path.isdir(output_dir):
                os.mkdir(output_dir)
            data_to_write = []
            for c_dict in data:
                the_data = {
                    'from': c_dict['current']['centroid'],
                    'to': c_dict['next']['centroid'],
                    'distance': c_dict['distance'],
                    'from_attribute': c_dict['current']['attribute'],
                    'to_attribute': c_dict['next']['attribute'],
                }
                data_to_write.append(the_data)
            output_shape = self._write_distance_file(
                data_to_write, output_dir, output_name, encoding,
                layer.crs())
            output_files.append(output_shape)
            self.global_progress += file_save_progress_step
            self.progress_changed.emit()
    return output_files

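# Hedged sketch of the pairwise pattern above: every unordered pair of
# features is visited exactly once (j always starts at i + 1), so n
# features yield n * (n - 1) / 2 centroid distances. Plain Euclidean
# distance stands in for the QGIS measurer.
import itertools

def pairwise_centroid_distances(centroids):
    # centroids: list of (feature_id, (x, y)) tuples
    results = []
    for (id_a, a), (id_b, b) in itertools.combinations(centroids, 2):
        distance = math.hypot(b[0] - a[0], b[1] - a[1])
        results.append((id_a, id_b, distance))
    return results
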
def _run_area_query(self, layer, id_attribute, encoding, use_selected,
                    analysis_step, output_path,
                    file_save_progress_step=0):
    """
    Process the area data query.

    Inputs:

        layer - A QgsVectorLayer object

        id_attribute - The name of the attribute that uniquely identifies
            each feature in the layer

        encoding - The encoding to use when processing the attributes and
            saving the results to disk

        use_selected - A boolean indicating if the processing is to be
            performed only on the selected features or on all features

        analysis_step - A number indicating the amount of overall progress
            to be added after this processing is done

        output_path - The full path to the text file where the results
            are to be saved

        file_save_progress_step - A number indicating the amount of
            overall progress to be added after saving the text file with
            the results
    """

    self.update_info.emit('Running area query...', 1)
    data = []
    if layer.crs().geographicFlag():
        if self.project_crs.geographicFlag():
            print('Neither the layer nor the project\'s coordinate '
                  'system is projected. The area calculation will not '
                  'be accurate.')
        measurer = self._get_measurer(self.project_crs)
        transformer = self._get_transformer(layer)
    else:
        measurer = self._get_measurer(layer.crs())
        transformer = None
    features = utilities.get_features(layer, use_selected)
    for feat in features:
        polygon = feat.geometry().asPolygon()
        new_polygon = []
        for ring in polygon:
            new_ring = []
            for point in ring:
                if transformer is None:
                    new_ring.append(point)
                else:
                    new_ring.append(transformer.transform(point))
            new_polygon.append(new_ring)
        if any(new_polygon):
            # The first ring is the outer boundary; any remaining rings
            # are holes whose area must be subtracted
            outer_area = measurer.measurePolygon(new_polygon[0])
            hole_areas = 0
            if len(new_polygon) > 1:
                holes = new_polygon[1:]
                for hole in holes:
                    hole_areas += measurer.measurePolygon(hole)
            total_feat_area = outer_area - hole_areas
            id_attr = self._get_numeric_attribute(feat, id_attribute)
            if id_attr is not None:
                data.append((id_attr, total_feat_area))
    self.global_progress += analysis_step
    self.progress_changed.emit()
    output_file = None
    if any(data):
        output_dir, output_name = os.path.split(output_path)
        output_file = self._save_text_file(data, 'Writing area file...',
                                           output_dir, output_name,
                                           encoding,
                                           file_save_progress_step)
    return output_file

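# Hedged illustration of the outer-minus-holes area logic above, using the
# shoelace formula in place of the QGIS measurer. Rings are lists of
# (x, y) tuples; the first ring is the outer boundary, the rest are holes.
def polygon_area_with_holes(rings):
    def shoelace(ring):
        area = 0.0
        n = len(ring)
        for k in range(n):
            x1, y1 = ring[k]
            x2, y2 = ring[(k + 1) % n]
            area += x1 * y2 - x2 * y1
        return abs(area) / 2.0

    return shoelace(rings[0]) - sum(shoelace(hole) for hole in rings[1:])

# e.g. a 10x10 square with a 2x2 hole -> 96.0:
# polygon_area_with_holes([
#     [(0, 0), (10, 0), (10, 10), (0, 10)],
#     [(4, 4), (6, 4), (6, 6), (4, 6)],
# ])
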
def _run_edge_query_fast(self, layer, id_attribute, encoding,
                         use_selected, analysis_step, output_path=None,
                         file_save_progress_step=0):
    """
    This method performs a faster edge query.

    This method is only suitable for creating the output text file with
    edge distances and cannot be used to also write the vector shape with
    a visual representation of the distances. Distances are calculated
    with blazing speed using the underlying QGIS distance function (which
    uses GEOS). Unfortunately these calculations provide only the
    distance and not the actual closest point coordinates.

    Inputs:

        layer - A QgsVectorLayer object

        id_attribute - The name of the attribute that uniquely identifies
            each feature in the layer

        encoding - The encoding to use when processing the attributes and
            saving the results to disk

        use_selected - A boolean indicating if the processing is to be
            performed only on the selected features or on all features

        analysis_step - A number indicating the amount of overall progress
            to be added after this processing is done

        output_path - The full path to the text file where the results
            are to be saved

        file_save_progress_step - A number indicating the amount of
            overall progress to be added after saving the text file with
            the results

    Returns:

        A list with the path to the written text file, or an empty list
        if there was no data to save.
    """

    self.update_info.emit('Running fast edge query', 1)
    data = []
    if layer.crs().geographicFlag():
        measurer = self._get_measurer(self.project_crs)
        transformer = self._get_transformer(layer)
    else:
        measurer = self._get_measurer(layer.crs())
        transformer = None
    feature_ids = [f.id() for f in utilities.get_features(layer,
                                                          use_selected)]
    feature_step = analysis_step / float(len(feature_ids))
    i = 0
    while i < len(feature_ids):
        self.update_info.emit(
            "Processing feature {}/{}".format(i + 1, len(feature_ids)), 2)
        features = utilities.get_features(layer, use_selected,
                                          feature_ids[i])
        current = next(iter(features))
        c_id_at = self._get_numeric_attribute(current, id_attribute)
        if c_id_at is not None:
            current_geom = current.geometry()
            if transformer is not None:
                current_geom.transform(transformer)
            j = i + 1
            while j < len(feature_ids):
                features = utilities.get_features(layer, use_selected,
                                                  feature_ids[j])
                next_ = next(iter(features))
                n_id_at = self._get_numeric_attribute(next_, id_attribute)
                if n_id_at is not None:
                    next_geom = next_.geometry()
                    if transformer is not None:
                        next_geom.transform(transformer)
                    dist = current_geom.distance(next_geom)
                    feat_result = {
                        'distance': dist,
                        'from': None,
                        'to': None,
                        'from_attribute': c_id_at,
                        'to_attribute': n_id_at,
                    }
                    data.append(feat_result)
                j += 1
        i += 1
        self.global_progress += feature_step
        self.progress_changed.emit()
    output_files = []
    if any(data):
        if output_path is not None:
            output_dir, output_name = os.path.split(output_path)
            data_to_write = []
            for e_dict in data:
                from_id = e_dict['from_attribute']
                to_id = e_dict['to_attribute']
                distance = e_dict['distance']
                data_to_write.append((from_id, to_id, distance))
            output_file = self._save_text_file(data_to_write,
                                               'Writing edges file...',
                                               output_dir, output_name,
                                               encoding,
                                               file_save_progress_step)
            output_files.append(output_file)
        self.progress_changed.emit()
    return output_files

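# Hedged aside on the limitation noted in the docstring: a GEOS-style
# distance() call yields only the scalar minimum distance. Libraries that
# expose GEOS more fully, e.g. Shapely, need a separate operation to
# recover the closest point pair. This sketch uses Shapely purely for
# illustration, not the QGIS API.
def _nearest_points_sketch():
    from shapely.geometry import Polygon
    from shapely.ops import nearest_points

    a = Polygon([(0, 0), (2, 0), (2, 2), (0, 2)])
    b = Polygon([(5, 0), (7, 0), (7, 2), (5, 2)])
    min_distance = a.distance(b)  # 3.0 -- distance only, no coordinates
    p, q = nearest_points(a, b)   # one point on each geometry, e.g. a
                                  # point at x=2 on a and x=5 on b
    return min_distance, p, q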