Ejemplo n.º 1
0
    def __train_test_val_split(self):
        """Split the predictors and the target into train/validation/test sets.

        The predictor matrix has one column per binary or continuous field that
        is listed in ``Config.DATA_FIELDS_IN_ANALYSIS`` (binary fields first).
        10% of the rows are held out as the test set, then 20% of the remaining
        rows become the validation set.  Both splits use ``random_state=42``.

        The six resulting arrays and the ordered predictor names are stored on
        the instance (``X_train``, ``X_val``, ``X_test``, ``y_train``,
        ``y_val``, ``y_test``, ``predictors_names``).
        """
        def in_analysis(fields):
            # Keep only the fields selected for the analysis, preserving order.
            return [
                name for name in fields
                if name in Config.DATA_FIELDS_IN_ANALYSIS
            ]

        y = self.__same_length_vectors[Data_Fields.get_target()]
        predictors_names = (in_analysis(Data_Fields.get_binary_vars())
                            + in_analysis(Data_Fields.get_continuous_vars()))

        # One column per predictor, in the order of predictors_names.
        X = np.stack(
            [self.__same_length_vectors[name] for name in predictors_names],
            axis=1)

        # First carve out the test set, then split the rest into train/val.
        X_rest, X_test, y_rest, y_test = train_test_split(
            X, y, test_size=0.1, random_state=42)
        X_train, X_val, y_train, y_val = train_test_split(
            X_rest, y_rest, test_size=0.2, random_state=42)

        self.X_train, self.X_val, self.X_test = X_train, X_val, X_test
        self.y_train, self.y_val, self.y_test = y_train, y_val, y_test
        self.predictors_names = predictors_names
Ejemplo n.º 2
0
    def __init__(self):
        """Populate the class-level configuration values.

        Every value is assigned on the class (not on the instance), so it is
        shared by all users of the class once an instance has been created.
        The base directories are hard-coded absolute paths; the remaining
        paths are derived from them or from this module's location.  Numeric
        settings are read from YAML files through ``self.load_yaml``.
        """
        cls = self.__class__

        # Hard-coded, machine-specific base locations.
        cls.WORK_DIR = 'C:/Users/normy/corona_classifier_files/classification'
        cls.COMMON_FILES_DIR = 'C:/Users/normy/PycharmProjects/corona_classifier/common_files'
        cls.RAW_DATA_PATH = 'C:/Users/normy/PycharmProjects/covidclinicaldata/data'

        # Locations derived from the base directories / this module's folder.
        cls.PICKLE_PATH = os.path.join(self.WORK_DIR, 'pickle_files')
        module_dir = os.path.dirname(__file__)
        cls.YAML_FILE_DIR = os.path.join(module_dir, 'yaml_files')
        cls.COMMON_YAML_FILE_DIR = os.path.join(self.COMMON_FILES_DIR,
                                                'yaml_files')
        cls.OUTPUT_PATH = os.path.join(self.WORK_DIR, 'outputs')

        # Target first, then the binary fields, then the continuous fields.
        fields_in_analysis = [Data_Fields.get_target()]
        fields_in_analysis.extend(Data_Fields.get_binary_vars())
        fields_in_analysis.extend(Data_Fields.get_continuous_vars())
        cls.DATA_FIELDS_IN_ANALYSIS = fields_in_analysis

        # Settings stored next to this module.
        static_values = self.load_yaml(self.YAML_FILE_DIR, 'static_values')
        cls.DATA_FIELD_MISSING_VALUES_THRESHOLD = static_values[
            'data_field_missing_values_threshold']
        cls.BOOTSTRAP_PATIENT_ENLARGEMENT_SIZE = static_values[
            'bootstrap_patient_enlargement_size']
        cls.MODEL_THRESHOLDS = static_values['model_thresholds']

        # Settings stored in the common-files YAML directory.
        cls.CONTINUOUS_FIELDS_THRESHOLDS = self.load_yaml(
            self.COMMON_YAML_FILE_DIR, 'continuous_fields_thresholds')
Ejemplo n.º 3
0
    def __binary_one_hot_encoding(self):
        """Recode the binary fields and the target of every patient as 0/1.

        For each patient in ``self.__patients`` and each binary field (plus
        the target field), the attribute is rewritten in place:

        * ``None`` is left untouched (missing value),
        * ``'TRUE'`` and ``'Positive'`` become ``1``,
        * every other value becomes ``0``.

        NOTE(review): values that are already 0/1 are not in the positive set,
        so running this twice on the same patients turns every 1 into 0.
        """
        # Build a new list instead of appending to the list returned by
        # Data_Fields.get_binary_vars(): an in-place append would add the
        # target to the binary-variable list itself if that getter hands out
        # a shared object, and it would grow on every call.
        data_fields = [
            *Data_Fields.get_binary_vars(),
            Data_Fields.get_target(),
        ]
        positive_values = ('TRUE', 'Positive')

        for patient in self.__patients:
            for field in data_fields:
                value = getattr(patient, field)
                if value is None:
                    continue
                setattr(patient, field, 1 if value in positive_values else 0)
    def __get_vectors_for_analysis(self):
        """Restrict ``self.__vectors`` to the target and continuous fields.

        A vector is kept when its ``field_name`` is the target or one of the
        continuous fields and is also listed in
        ``Config.DATA_FIELDS_IN_ANALYSIS``.  The filtered list replaces
        ``self.__vectors`` and the same list object is published under
        ``self.__graph_vectors['continuous_vectors']``.
        """
        candidate_fields = [
            Data_Fields.get_target(),
            *Data_Fields.get_continuous_vars(),
        ]

        self.__vectors = [
            candidate for candidate in self.__vectors
            if candidate.field_name in candidate_fields
            and candidate.field_name in Config.DATA_FIELDS_IN_ANALYSIS
        ]
        self.__graph_vectors['continuous_vectors'] = self.__vectors
    def __get_vector_dict(self, remove_missing_values: bool):
        """Map each non-target field name to its data vector.

        Args:
            remove_missing_values: when truthy, use each analysis vector's
                ``vector_without_missing_values``; otherwise use its
                ``vector``.

        Returns:
            A dict keyed by ``field_name`` for every entry of
            ``self.__vectors`` except the target field.
        """
        return {
            entry.field_name: (entry.vector_without_missing_values
                               if remove_missing_values else entry.vector)
            for entry in self.__vectors
            if entry.field_name != Data_Fields.get_target()
        }
    def __average_by_target_for_age_groups(self):
        """Build the per-feature mean report, split by age group and target.

        For every continuous field other than ``'age'`` that is listed in
        ``Config.DATA_FIELDS_IN_ANALYSIS``, the target, age and feature
        vectors are aligned through ``__get_same_length_vectors``, split into
        adults (age >= 18) and children, and each age group is split again by
        target value (1 vs. 0).  For each of the four resulting groups the
        plain mean and the bootstrap mean are recorded, together with the
        bootstrap significance of the positive/negative difference and the
        row count per age group.

        The report (one list per column, one entry per feature) is stored
        under ``self.__report_tables['average_by_target']``.
        """
        def split_by_target(responses, targets):
            # Responses where target == 1 first, then where target == 0.
            return responses[targets == 1], responses[targets == 0]

        report_table = defaultdict(list)
        vector_dict = self.__get_analysis_vector_dict()

        for continuous_var in Data_Fields.get_continuous_vars():
            # Age defines the grouping, so it is not reported as a feature.
            skip = (continuous_var == 'age'
                    or continuous_var not in Config.DATA_FIELDS_IN_ANALYSIS)
            if skip:
                continue

            target_name = Data_Fields.get_target()
            age_name = Data_Fields.AGE.field_name
            aligned = self.__get_same_length_vectors(vector_list=[
                vector_dict[target_name],
                vector_dict[age_name],
                vector_dict[continuous_var],
            ])

            is_adult = aligned[age_name] >= 18
            is_child = ~is_adult
            targets = aligned[target_name]
            responses = aligned[continuous_var]

            adult_pos, adult_neg = split_by_target(responses[is_adult],
                                                   targets[is_adult])
            child_pos, child_neg = split_by_target(responses[is_child],
                                                   targets[is_child])

            # The bootstrap helpers are called in a fixed order (both
            # significance tests, then the four means) in case they draw from
            # shared random state.
            adult_significance = self.bootstrap_difference_in_mean_of_two_groups(
                adult_pos, adult_neg)
            child_significance = self.bootstrap_difference_in_mean_of_two_groups(
                child_pos, child_neg)
            (adult_pos_boot, adult_neg_boot,
             child_pos_boot, child_neg_boot) = [
                self.__calc_bootstrap_mean(
                    group, iterations=Config.BOOTSTRAP_ITERATIONS)
                for group in (adult_pos, adult_neg, child_pos, child_neg)
            ]

            # Column order of the report follows the order of this mapping.
            row = {
                'feature': continuous_var,
                'adult corona positive regular AVG': np.mean(adult_pos),
                'adult corona negative regular AVG': np.mean(adult_neg),
                'adult corona positive bootstrap AVG': adult_pos_boot,
                'adult corona negative bootstrap AVG': adult_neg_boot,
                'adult bootstrap significance': adult_significance,
                'adult count': len(adult_pos) + len(adult_neg),
                'child corona positive AVG': np.mean(child_pos),
                'child corona negative AVG': np.mean(child_neg),
                'child corona positive bootstrap AVG': child_pos_boot,
                'child corona negative bootstrap AVG': child_neg_boot,
                'child bootstrap significance': child_significance,
                'child count': len(child_pos) + len(child_neg),
            }
            for column, value in row.items():
                report_table[column].append(value)

        self.__report_tables['average_by_target'] = report_table