Esempio n. 1
0
def main(file):
    """Train the combined neural net on the ECG beat file ``file``."""
    # Read the beat data together with its labels.
    beats, labels = utilities.load_train_file(file)

    # Extract the features of every beat and hand them, with the labels,
    # to the CombinedNeuralNet wrapper.
    net = manager.CombinedNeuralNet(utilities.get_features(beats), labels)

    # Run the training; the result is only kept in a local variable.
    weights = net.train()
    def _run_attribute_query(self, layer, id_attribute, attribute, encoding,
                             use_selected, analysis_step, output_path,
                             file_save_progress_step=0):
        """
        Collect (id, attribute value) pairs from a layer and save them.

        Inputs:

            layer - The layer whose features are read (a QgsVectorLayer
                object)

            id_attribute - Name of the attribute that uniquely identifies
                each feature in the layer

            attribute - Name of the attribute whose value is queried

            encoding - Encoding handed to the text file writer

            use_selected - Boolean telling whether only the selected
                features or all features are processed

            analysis_step - Amount added to the overall progress once all
                features have been read

            output_path - Full path of the text file where the results
                are saved

            file_save_progress_step - Amount of overall progress handed
                to the text file writer

        Returns whatever self._save_text_file returns.

        Raises ValueError when a feature has a negative attribute value.
        """

        self.update_info.emit('Running attribute query', 1)
        rows = []
        for feature in utilities.get_features(layer, use_selected):
            feature_id = self._get_numeric_attribute(feature, id_attribute)
            value = self._get_numeric_attribute(feature, attribute, float)
            # Skip features for which either lookup returned None.
            if value is None or feature_id is None:
                continue
            if value < 0:
                raise ValueError('Attribute must be non negative')
            rows.append((feature_id, value))
        self.global_progress += analysis_step
        self.progress_changed.emit()
        target_dir, target_name = os.path.split(output_path)
        return self._save_text_file(rows, 'Writing attribute file...',
                                    target_dir, target_name, encoding,
                                    file_save_progress_step)
def find_best(method_key, feature_set, training_subjects):
    """
    Grid-search the hyper-parameters of one classifier type.

    Args:
        method_key (str): 'Logistic Regression', 'KNeighbors', 'MLP' or
            'Random Forest'
        feature_set: Feature set, forwarded to
            utilities.build_data_dictionary and used to name the saved file
        training_subjects: Iterable of subjects whose data is searched over

    Returns:
        dict: The best parameters found (``GridSearchCV.best_params_``).
            They are also saved with ``np.save`` under 'parameters/'.

    Raises:
        ValueError: If ``method_key`` is not one of the supported keys.
    """
    # Pick the estimator and its grid first, so that an unknown key fails
    # immediately instead of after the data has been loaded.
    if method_key == 'Logistic Regression':
        parameters = {
            'C': [0.001, 0.01, 0.1, 1, 10, 100],
            'penalty': ['l1', 'l2']
        }
        classifier = LogisticRegression()
    elif method_key == 'KNeighbors':
        parameters = {'n_neighbors': [500, 1000, 2000]}
        classifier = KNeighborsClassifier()
    elif method_key == 'MLP':
        parameters = {
            'solver': ['lbfgs'],
            'max_iter': [1000],
            'alpha': 10.0**-np.arange(1, 4),
            'hidden_layer_sizes': [(30, 30, 30)]
        }
        classifier = MLPClassifier()
    elif method_key == 'Random Forest':
        parameters = {
            'n_estimators': [50],
            'max_features': [None],
            'max_depth': [10, 50, 100],
            'min_samples_split': [10],
            'min_samples_leaf': [32],
            'bootstrap': [True]
        }
        classifier = RandomForestClassifier()
    else:
        raise ValueError('Unknown method_key: {!r}'.format(method_key))

    # Load up all the data
    data_dict = utilities.build_data_dictionary(feature_set)

    # Initialize holders
    training_set_features = np.array([])
    training_set_labels = np.array([])

    # Build vectors for training subjects
    for subject in training_subjects:
        score_features, full_features = utilities.get_features(
            subject, data_dict)
        if np.shape(training_set_features)[0] == 0:
            training_set_features = full_features
            training_set_labels = score_features
        else:
            training_set_features = np.vstack(
                (training_set_features, full_features))
            training_set_labels = np.vstack(
                (training_set_labels, score_features))

    # Convert raw scores from 0-5 to binary, or 0-2
    training_set_labels = utilities.process_raw_scores(training_set_labels,
                                                       classify_sleep.run_flag)

    # Keyword arguments: recent scikit-learn releases no longer accept
    # `classes` and `y` positionally.
    class_weights = class_weight.compute_class_weight(
        'balanced',
        classes=np.unique(training_set_labels),
        y=training_set_labels)
    # Weight i is stored under label i (labels are 0/1 or 0-2 here).
    classifier.class_weight = dict(enumerate(class_weights))

    if classify_sleep.run_flag == utilities.RUN_REM:
        scoring = 'neg_log_loss'
    else:
        scoring = 'roc_auc'

    clf = GridSearchCV(classifier, parameters, scoring=scoring)
    clf.fit(training_set_features, training_set_labels)

    # `verbose` is not defined in this function; it is expected to be a
    # module-level flag.
    if verbose:
        print('Best parameters for set:')
        print(clf.best_params_)
        print('Score on training data: ' +
              str(clf.score(training_set_features, training_set_labels)))

    save_name = 'parameters/' + method_key + utilities.string_from_features(
        feature_set) + '.npy'
    np.save(save_name, clf.best_params_)

    return clf.best_params_
def train_and_test_model(training_subjects,
                         testing_subjects,
                         method_key,
                         classifier,
                         feature_set,
                         data_dict,
                         save_to_file=False):
    """
    Trains and tests model for given feature set and classifier.

    Args:
        training_subjects ([int]): Subject IDs in training set
        testing_subjects ([int]): Subject IDs in testing set
        method_key (str): Key for classifier
        classifier : Classifier object
        feature_set (dict): Feature set to test
        data_dict (dict): Dictionary to look up subject training and testing data
        save_to_file (bool) : Flag if want to save probabilities to file

    Returns:
        [int]: ground truth labels
        np.array : predicted labels
        np.array : class prediction probabilities
    """

    classifier_abbrev = str(classifier)[0:4]
    save_name = 'parameters/' + classifier_abbrev + utilities.string_from_features(
        feature_set) + '_params.npy'

    if LOAD_PARAMS or method_key == 'MLP':  # TODO: Faster parameter searching with MLP
        # The parameters are a dict stored as a pickled object array, so
        # NumPy >= 1.16.3 needs allow_pickle=True to read them back.
        # NOTE(review): unpickling runs arbitrary code; only load parameter
        # files written by this program.
        params = np.load(save_name, allow_pickle=True).item()
    else:
        params = get_parameters.find_best(method_key, feature_set,
                                          training_subjects)
        np.save(save_name, params)

    classifier.set_params(**params)

    # Get labels and features for training and testing sets
    training_set_true_labels, training_set_features = _stack_subject_data(
        training_subjects, data_dict)
    testing_set_true_labels, testing_set_features = _stack_subject_data(
        testing_subjects, data_dict)

    # Convert raw labels to 0/1 or 0-2
    training_set_true_labels = utilities.process_raw_scores(
        training_set_true_labels, run_flag)
    testing_set_true_labels = utilities.process_raw_scores(
        testing_set_true_labels, run_flag)

    # Set class weights for those methods that allow them
    class_weights = class_weight.compute_class_weight(
        'balanced',
        classes=np.unique(training_set_true_labels),
        y=training_set_true_labels)
    # Weight i is stored under label i; covers both the binary case and the
    # wake/NREM/REM case.
    classifier.class_weight = dict(enumerate(class_weights))

    # # Debug-only: Uncomment to reverse the training/testing order, and test Apple Watch data on MESA-trained models
    # classifier = np.load('trained_models/' + classifier_abbrev +
    # utilities.string_from_features(feature_set) + '_trained_modelMESA.npy').item()

    # Fit model to training data, get class predictions and class probabilities
    classifier.fit(training_set_features, training_set_true_labels)
    predicted_labels = classifier.predict(testing_set_features)
    class_probabilities = classifier.predict_proba(testing_set_features)

    # Save trained model to use for testing MESA cohort
    save_name = 'trained_models/' + classifier_abbrev + \
                utilities.string_from_features(feature_set) + '_trained_model.npy'
    np.save(save_name, classifier)

    # Optional; save to file for Kalman filter and print performance metrics
    if save_to_file:
        output_prefix = 'sleep_modeling/' + str(testing_subjects[0])
        # Reuse the probabilities computed above instead of predicting again.
        np.savetxt(output_prefix + '.csv', class_probabilities, delimiter=',')
        np.savetxt(output_prefix + '_classes.csv',
                   testing_set_true_labels,
                   delimiter=',')
        np.savetxt(output_prefix + '_predicted_classes.csv',
                   predicted_labels,
                   delimiter=',')

        true_positive_rate_for_interpolation = 0.85
        false_positive_rates, true_positive_rates, thresholds = roc_curve(
            testing_set_true_labels,
            class_probabilities[:, 1],
            pos_label=1,
            drop_intermediate=False)

        print('Subject ID: ' + str(testing_subjects[0]))
        print('False positive rate: ' + str(
            np.interp(true_positive_rate_for_interpolation,
                      true_positive_rates, false_positive_rates)))
        print('True positive rate: ' +
              str(true_positive_rate_for_interpolation))
        print('\n\n')

    return testing_set_true_labels, predicted_labels, class_probabilities


def _stack_subject_data(subjects, data_dict):
    """Return (labels, features) stacked row-wise over all ``subjects``."""
    stacked_features = np.array([])
    stacked_labels = np.array([])
    for subject in subjects:
        scores_by_epoch, features_by_epoch = utilities.get_features(
            subject, data_dict)
        if np.shape(stacked_features)[0] == 0:
            # Nothing accumulated yet: take this subject's arrays as they are.
            stacked_features = features_by_epoch
            stacked_labels = scores_by_epoch
        else:
            stacked_features = np.vstack(
                (stacked_features, features_by_epoch))
            stacked_labels = np.vstack((stacked_labels, scores_by_epoch))
    return stacked_labels, stacked_features
 def _run_edge_query(self, layer, id_attribute, encoding, use_selected,
                     analysis_step, output_path=None,
                     file_save_progress_step=0,
                     shape_file_path=None):
     """
     Find the closest edge-to-edge distance for every pair of features.

     For each pair (current, next) of features:
       - get the closest edge from current to next -> L1
       - get the closest edge from next to current -> L2
       - project L1's vertices on L2 and L2's vertices on L1
       - the candidate with the smallest distance wins

     Inputs:

         layer - The layer whose features are compared

         id_attribute - Name of the attribute used to identify each
             feature in the results

         encoding - Encoding handed to the file writing helpers

         use_selected - Passed to utilities.get_features to choose
             between the selected features and all features

         analysis_step - Amount of overall progress, spread evenly over
             the processed features

         output_path - Path of the text file with the distances, or None
             to skip it

         file_save_progress_step - Amount of overall progress associated
             with each file saving step

         shape_file_path - Path of the distance shape file, or None to
             skip it

     Returns a list with the outputs of the file writing helpers that
     were run (empty when no pair of features produced a result).

     Raises InvalidFeatureError if a feature has geometry errors or is
     multipart.
     """
     self.update_info.emit('Running edge query', 1)
     data = []
     if layer.crs().geographicFlag():
         measurer = self._get_measurer(self.project_crs)
         transformer = self._get_transformer(layer)
     else:
         measurer = self._get_measurer(layer.crs())
         transformer = None
     feature_ids = [f.id() for f in utilities.get_features(layer, use_selected)]
     if not feature_ids:
         # No features: avoid dividing by zero below, but still account
         # for this step in the overall progress.
         self.global_progress += analysis_step
         self.progress_changed.emit()
         return []
     total = len(feature_ids)
     feature_step = analysis_step / float(total)
     for i, current_fid in enumerate(feature_ids):
         self.update_info.emit(
             "Processing feature {}/{}".format(i+1, total), 2)
         features = utilities.get_features(layer, use_selected,
                                           current_fid)
         # next() works on both Python 2.6+ and Python 3 iterators
         current = next(iter(features))
         c_id_at = self._get_numeric_attribute(current, id_attribute)
         if c_id_at is not None:
             current_geom = current.geometry()
             geometry_errors = current_geom.validateGeometry()
             if any(geometry_errors):
                 raise InvalidFeatureError('Layer: %s - Feature %s has '
                                           'geometry errors. Aborting...'
                                           % (layer.name(), c_id_at))
             elif current_geom.isMultipart():
                 raise InvalidFeatureError('Feature %s is multipart. '
                                           'Aborting...' % c_id_at)
             current_poly = self._get_polygon(current_geom, transformer)
             # only pair the current feature with the ones after it, so
             # that every pair is processed once
             for next_fid in feature_ids[i + 1:]:
                 features = utilities.get_features(layer, use_selected,
                                                   next_fid)
                 next_ = next(iter(features))
                 n_id_at = self._get_numeric_attribute(next_, id_attribute)
                 if n_id_at is None:
                     continue
                 next_geom = next_.geometry()
                 next_poly = self._get_polygon(next_geom, transformer)
                 segments = self.get_closest_segments(current_poly,
                                                      next_poly)
                 current_segment, next_segment = segments
                 candidates = []
                 for current_vertex in current_segment:
                     candidate = self.find_candidate_points(
                         current_vertex,
                         next_segment,
                         measurer
                     )
                     candidates.append(candidate)
                 for next_vertex in next_segment:
                     candidate = self.find_candidate_points(
                         next_vertex,
                         current_segment,
                         measurer
                     )
                     candidates.append(candidate)
                 # candidates are indexed as (from, to, distance) below
                 ordered_candidates = sorted(candidates,
                                             key=lambda c: c[2])
                 winner = ordered_candidates[0]
                 # transform the winner's coordinates back to layer crs
                 from_restored = self._transform_point(winner[0],
                                                       transformer,
                                                       reverse=True)
                 to_restored = self._transform_point(winner[1],
                                                     transformer,
                                                     reverse=True)
                 data.append({
                     'distance': winner[2],
                     'from': from_restored,
                     'to': to_restored,
                     'from_attribute': c_id_at,
                     'to_attribute': n_id_at,
                 })
         self.global_progress += feature_step
         self.progress_changed.emit()
     output_files = []
     if data:
         if output_path is not None:
             output_dir, output_name = os.path.split(output_path)
             data_to_write = [
                 (e_dict['from_attribute'], e_dict['to_attribute'],
                  e_dict['distance'])
                 for e_dict in data
             ]
             output_file = self._save_text_file(data_to_write, 'Writing '
                                                'edges file...',
                                                output_dir, output_name,
                                                encoding,
                                                file_save_progress_step)
             output_files.append(output_file)
         if shape_file_path is not None:
             output_dir, output_name = os.path.split(shape_file_path)
             self.update_info.emit('Creating edge distance file', 1)
             # output_dir is '' when the path has no directory part;
             # os.mkdir('') would fail, so only create named directories
             if output_dir and not os.path.isdir(output_dir):
                 os.mkdir(output_dir)
             self.update_info.emit("edges ...", 2)
             output_shape = self._write_distance_file(data, output_dir,
                                                      output_name,
                                                      encoding,
                                                      layer.crs())
             output_files.append(output_shape)
             self.global_progress += file_save_progress_step
             self.progress_changed.emit()
     return output_files
 def _run_centroid_query(self, layer, id_attribute, encoding, use_selected,
                         analysis_step, output_path=None,
                         file_save_progress_step=0,
                         shape_file_path=None):
     """
     Measure the distance between the centroids of every pair of features.

     Inputs:

         layer - The layer whose features are compared

         id_attribute - Name of the attribute used to identify each
             feature in the results

         encoding - Encoding handed to the file writing helpers

         use_selected - Passed to utilities.get_features to choose
             between the selected features and all features

         analysis_step - Amount added to the overall progress once all
             pairs have been measured

         output_path - Path of the text file with the distances, or None
             to skip it

         file_save_progress_step - Amount of overall progress associated
             with each file saving step

         shape_file_path - Path of the distance shape file, or None to
             skip it

     Returns a list with the outputs of the file writing helpers that
     were run (empty when no pair of features produced a result).
     """
     self.update_info.emit('Running centroid query...', 1)
     data = []
     if layer.crs().geographicFlag():
         measurer = self._get_measurer(self.project_crs)
         transformer = self._get_transformer(layer)
     else:
         measurer = self._get_measurer(layer.crs())
         transformer = None
     feature_ids = [f.id() for f in utilities.get_features(layer,
                                                           use_selected)]
     for i, current_fid in enumerate(feature_ids):
         features = utilities.get_features(layer, use_selected,
                                           current_fid)
         # next() works on both Python 2.6+ and Python 3 iterators
         current = next(iter(features))
         c_id_attr = self._get_numeric_attribute(current, id_attribute)
         if c_id_attr is None:
             continue
         current_geom = current.geometry()
         orig_curr_centroid = current_geom.centroid().asPoint()
         trans_curr_centroid = self._transform_point(
             orig_curr_centroid, transformer)
         # only pair the current feature with the ones after it, so that
         # every pair is measured once
         for next_fid in feature_ids[i + 1:]:
             features = utilities.get_features(layer, use_selected,
                                               next_fid)
             next_ = next(iter(features))
             n_id_attr = self._get_numeric_attribute(next_,
                                                     id_attribute)
             if n_id_attr is None:
                 continue
             next_geom = next_.geometry()
             orig_n_centroid = next_geom.centroid().asPoint()
             trans_n_centroid = self._transform_point(
                 orig_n_centroid, transformer)
             # measure on the transformed points, but keep the original
             # centroids in the results
             distance = measurer.measureLine(trans_curr_centroid,
                                             trans_n_centroid)
             data.append({
                 'current': {
                     'attribute': c_id_attr,
                     'centroid': orig_curr_centroid,
                     'feature_geometry': current_geom,
                 },
                 'next': {
                     'attribute': n_id_attr,
                     'centroid': orig_n_centroid,
                     'feature_geometry': next_geom,
                 },
                 'distance': distance,
             })
     self.global_progress += analysis_step
     self.progress_changed.emit()
     output_files = []
     if data:
         if output_path is not None:
             output_dir, output_name = os.path.split(output_path)
             data_to_write = [
                 (c_dict['current']['attribute'],
                  c_dict['next']['attribute'],
                  c_dict['distance'])
                 for c_dict in data
             ]
             output_file = self._save_text_file(data_to_write,
                                                'Writing centroids file...',
                                                output_dir, output_name,
                                                encoding,
                                                file_save_progress_step)
             output_files.append(output_file)
         if shape_file_path is not None:
             output_dir, output_name = os.path.split(shape_file_path)
             self.update_info.emit('Creating centroid distance file', 1)
             # output_dir is '' when the path has no directory part;
             # os.mkdir('') would fail, so only create named directories
             if output_dir and not os.path.isdir(output_dir):
                 os.mkdir(output_dir)
             data_to_write = [
                 {
                     'from': c_dict['current']['centroid'],
                     'to': c_dict['next']['centroid'],
                     'distance': c_dict['distance'],
                     'from_attribute': c_dict['current']['attribute'],
                     'to_attribute': c_dict['next']['attribute'],
                 }
                 for c_dict in data
             ]
             output_shape = self._write_distance_file(
                 data_to_write, output_dir, output_name, encoding,
                 layer.crs()
             )
             output_files.append(output_shape)
             self.global_progress += file_save_progress_step
             self.progress_changed.emit()
     return output_files
    def _run_area_query(self, layer, id_attribute, encoding, use_selected,
                        analysis_step, output_path, file_save_progress_step=0):
        """
        Compute the area of each feature and save (id, area) pairs.

        The area of a feature is the area of its first ring minus the
        areas of the remaining rings (the holes).

        Inputs:

            layer - The layer whose features are measured (a
                QgsVectorLayer object)

            id_attribute - Name of the attribute that uniquely identifies
                each feature in the layer

            encoding - Encoding handed to the text file writer

            use_selected - Boolean telling whether only the selected
                features or all features are processed

            analysis_step - Amount added to the overall progress once all
                features have been measured

            output_path - Full path of the text file where the results
                are saved

            file_save_progress_step - Amount of overall progress handed
                to the text file writer

        Returns whatever self._save_text_file returns, or None when there
        is nothing to save.
        """

        self.update_info.emit('Running area query...', 1)
        rows = []
        if not layer.crs().geographicFlag():
            transformer = None
            measurer = self._get_measurer(layer.crs())
        else:
            if self.project_crs.geographicFlag():
                warning = ('Neither the layer nor the project\'s coordinate '
                           'system is projected. The area calculation will not '
                           'be acurate.')
                print(warning)
            measurer = self._get_measurer(self.project_crs)
            transformer = self._get_transformer(layer)
        for feature in utilities.get_features(layer, use_selected):
            source_rings = feature.geometry().asPolygon()
            if transformer is None:
                rings = [list(ring) for ring in source_rings]
            else:
                rings = [[transformer.transform(vertex) for vertex in ring]
                         for ring in source_rings]
            # Nothing to measure unless at least one ring has points.
            if not any(rings):
                continue
            outer_area = measurer.measurePolygon(rings[0])
            holes_area = sum(measurer.measurePolygon(hole)
                             for hole in rings[1:])
            feature_area = outer_area - holes_area
            feature_id = self._get_numeric_attribute(feature, id_attribute)
            if feature_id is not None:
                rows.append((feature_id, feature_area))
        self.global_progress += analysis_step
        self.progress_changed.emit()
        if not rows:
            return None
        target_dir, target_name = os.path.split(output_path)
        return self._save_text_file(rows, 'Writing area file...',
                                    target_dir, target_name,
                                    encoding,
                                    file_save_progress_step)
    def _run_edge_query_fast(self, layer, id_attribute, encoding, use_selected,
                             analysis_step, output_path=None,
                             file_save_progress_step=0):
        """
        This method performs a faster edge query.

        This method is only suitable for creating the output text file with
        edge distances and cannot be used to also write the vector shape
        with visual representation of the distances. Distances are
        calculated using the underlying QGIS (which uses GEOS) distance
        function. These calculations provide only the distance and not the
        actual closest point coordinates.

        Inputs:

            layer - The layer whose features are compared

            id_attribute - Name of the attribute used to identify each
                feature in the results

            encoding - Encoding handed to the text file writer

            use_selected - Passed to utilities.get_features to choose
                between the selected features and all features

            analysis_step - Amount of overall progress, spread evenly over
                the processed features

            output_path - Path of the text file with the distances, or
                None to skip it

            file_save_progress_step - Amount of overall progress handed
                to the text file writer

        Returns:

            A list holding the output of the text file writer, or an empty
            list when nothing was written.
        """

        self.update_info.emit('Running fast edge query', 1)
        data = []
        # No measurer is needed here: distances come straight from the
        # geometries' own distance() method.
        if layer.crs().geographicFlag():
            transformer = self._get_transformer(layer)
        else:
            transformer = None
        feature_ids = [f.id() for f in utilities.get_features(
                       layer, use_selected)]
        if not feature_ids:
            # No features: avoid dividing by zero below, but still account
            # for this step in the overall progress.
            self.global_progress += analysis_step
            self.progress_changed.emit()
            return []
        total = len(feature_ids)
        feature_step = analysis_step / float(total)
        for i, current_fid in enumerate(feature_ids):
            self.update_info.emit(
                "Processing feature {}/{}".format(i+1, total), 2)
            features = utilities.get_features(layer, use_selected,
                                              current_fid)
            # next() works on both Python 2.6+ and Python 3 iterators
            current = next(iter(features))
            c_id_at = self._get_numeric_attribute(current, id_attribute)
            if c_id_at is not None:
                current_geom = current.geometry()
                if transformer is not None:
                    current_geom.transform(transformer)
                # only pair the current feature with the ones after it, so
                # that every pair is processed once
                for next_fid in feature_ids[i + 1:]:
                    features = utilities.get_features(layer, use_selected,
                                                      next_fid)
                    next_ = next(iter(features))
                    n_id_at = self._get_numeric_attribute(next_, id_attribute)
                    if n_id_at is None:
                        continue
                    next_geom = next_.geometry()
                    if transformer is not None:
                        next_geom.transform(transformer)
                    data.append({
                        'distance': current_geom.distance(next_geom),
                        'from': None,
                        'to': None,
                        'from_attribute': c_id_at,
                        'to_attribute': n_id_at,
                    })
            self.global_progress += feature_step
            self.progress_changed.emit()
        output_files = []
        if data and output_path is not None:
            output_dir, output_name = os.path.split(output_path)
            data_to_write = [
                (e_dict['from_attribute'], e_dict['to_attribute'],
                 e_dict['distance'])
                for e_dict in data
            ]
            output_file = self._save_text_file(data_to_write, 'Writing '
                                               'edges file...',
                                               output_dir, output_name,
                                               encoding,
                                               file_save_progress_step)
            output_files.append(output_file)
            self.progress_changed.emit()
        return output_files