Esempio n. 1
0
    def __train_test_val_split(self,
                               *,
                               test_size=0.1,
                               val_size=0.2,
                               random_state=42):
        """Split the aligned vectors into train, validation and test sets.

        The target vector is looked up in ``self.__same_length_vectors``
        under ``Data_Fields.get_target()``. The predictors are the binary
        fields followed by the continuous fields, each kept only if it is
        listed in ``Config.DATA_FIELDS_IN_ANALYSIS``; they are stacked
        column-wise, so column ``i`` of ``X`` belongs to
        ``predictors_names[i]``.

        Two consecutive ``train_test_split`` calls are made: the first holds
        out ``test_size`` of all samples as the test set, the second holds
        out ``val_size`` of the *remaining* samples as the validation set
        (with the defaults that is 0.2 * 0.9 = 18% of all samples).

        Args:
            test_size: Forwarded as ``test_size`` to the first split.
            val_size: Forwarded as ``test_size`` to the second split.
            random_state: Seed forwarded to both splits.

        Raises:
            ValueError: If no predictor field is left in the analysis.

        Sets ``X_train``, ``X_val``, ``X_test``, ``y_train``, ``y_val``,
        ``y_test`` and ``predictors_names`` on the instance.
        """
        y = self.__same_length_vectors[Data_Fields.get_target()]

        fields_in_analysis = Config.DATA_FIELDS_IN_ANALYSIS
        # Binary fields first, then continuous ones: this fixes the column
        # order of X.
        predictors_names = [
            field for field in (*Data_Fields.get_binary_vars(),
                                *Data_Fields.get_continuous_vars())
            if field in fields_in_analysis
        ]
        if not predictors_names:
            # np.stack would fail with a far less helpful ValueError.
            raise ValueError(
                'No predictor field is left in Config.DATA_FIELDS_IN_ANALYSIS')

        # Presumably every vector is 1-D and of equal length (np.stack
        # requires identical shapes) -- TODO confirm against the caller.
        X = np.stack(
            [self.__same_length_vectors[name] for name in predictors_names],
            axis=1)

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state)
        X_train, X_val, y_train, y_val = train_test_split(
            X_train, y_train, test_size=val_size, random_state=random_state)

        self.X_train = X_train
        self.X_val = X_val
        self.X_test = X_test
        self.y_train = y_train
        self.y_val = y_val
        self.y_test = y_test
        self.predictors_names = predictors_names
Esempio n. 2
0
    def __init__(self):
        """Fill in the class-level configuration values.

        Every value is stored on the class (``self.__class__``), not on the
        instance, so it is reachable as a class attribute once any instance
        has been created. Paths are assigned first, then the list of data
        fields used in the analysis, then the values read from the
        'static_values' and 'continuous_fields_thresholds' YAML files via
        ``self.load_yaml``.
        """
        cls = self.__class__

        # Hard-coded, machine-specific base locations.
        cls.WORK_DIR = 'C:/Users/normy/corona_classifier_files/classification'
        cls.COMMON_FILES_DIR = 'C:/Users/normy/PycharmProjects/corona_classifier/common_files'
        cls.RAW_DATA_PATH = 'C:/Users/normy/PycharmProjects/covidclinicaldata/data'

        # Locations derived from the base directories / this module's folder.
        cls.PICKLE_PATH = os.path.join(cls.WORK_DIR, 'pickle_files')
        module_dir = os.path.dirname(__file__)
        cls.YAML_FILE_DIR = os.path.join(module_dir, 'yaml_files')
        cls.COMMON_YAML_FILE_DIR = os.path.join(cls.COMMON_FILES_DIR,
                                                'yaml_files')
        cls.OUTPUT_PATH = os.path.join(cls.WORK_DIR, 'outputs')

        # Target first, then binary fields, then continuous fields.
        fields_in_analysis = [Data_Fields.get_target()]
        fields_in_analysis.extend(Data_Fields.get_binary_vars())
        fields_in_analysis.extend(Data_Fields.get_continuous_vars())
        cls.DATA_FIELDS_IN_ANALYSIS = fields_in_analysis

        # Copy selected entries of the static-values file onto the class.
        loaded = self.load_yaml(cls.YAML_FILE_DIR, 'static_values')
        for attribute, yaml_key in (
            ('DATA_FIELD_MISSING_VALUES_THRESHOLD',
             'data_field_missing_values_threshold'),
            ('BOOTSTRAP_PATIENT_ENLARGEMENT_SIZE',
             'bootstrap_patient_enlargement_size'),
            ('MODEL_THRESHOLDS', 'model_thresholds'),
        ):
            setattr(cls, attribute, loaded[yaml_key])

        cls.CONTINUOUS_FIELDS_THRESHOLDS = self.load_yaml(
            cls.COMMON_YAML_FILE_DIR, 'continuous_fields_thresholds')
Esempio n. 3
0
    def __binary_one_hot_encoding(self):
        """Recode the binary fields and the target of every patient as 0/1.

        For each patient in ``self.__patients`` and each field returned by
        ``Data_Fields.get_binary_vars()`` plus ``Data_Fields.get_target()``:

        * ``None`` is left untouched,
        * ``'TRUE'`` and ``'Positive'`` become ``1``,
        * every other value becomes ``0``.

        Note: the method is not idempotent -- a value that is already ``1``
        is not one of the accepted strings and would be rewritten to ``0``
        by a second call.
        """
        # Build a fresh list instead of appending to the list returned by
        # get_binary_vars(): if that list is shared, appending would leak
        # the target into the binary fields for every later caller.
        data_fields = [
            *Data_Fields.get_binary_vars(),
            Data_Fields.get_target(),
        ]

        for patient in self.__patients:
            for field in data_fields:
                value = getattr(patient, field)
                if value is None:
                    continue
                encoded = 1 if value in ('TRUE', 'Positive') else 0
                setattr(patient, field, encoded)
    def __get_vectors_for_analysis(self):
        """Keep only the target and continuous vectors still under analysis.

        A vector survives when its ``field_name`` is the target or one of
        the continuous fields and is also listed in
        ``Config.DATA_FIELDS_IN_ANALYSIS``. The filtered list replaces
        ``self.__vectors`` and is also stored in ``self.__graph_vectors``
        under the key 'continuous_vectors'.
        """
        candidate_fields = [
            Data_Fields.get_target(),
            *Data_Fields.get_continuous_vars(),
        ]

        self.__vectors = [
            candidate for candidate in self.__vectors
            if candidate.field_name in candidate_fields
            and candidate.field_name in Config.DATA_FIELDS_IN_ANALYSIS
        ]
        self.__graph_vectors['continuous_vectors'] = self.__vectors
Esempio n. 5
0
    def __remove_data_fields_with_to_much_missing_data(self):
        """Drop data fields whose share of missing values is too high.

        For every field in ``Data_Fields.get_all_data_fields()`` the share
        of patients whose value is ``None`` is computed. A field whose share
        is greater than or equal to
        ``Config.DATA_FIELD_MISSING_VALUES_THRESHOLD`` is removed from
        ``Config.DATA_FIELDS_IN_ANALYSIS`` (mutated in place).

        With no patients the ratio is undefined, so nothing is removed. A
        field that is already absent from ``Config.DATA_FIELDS_IN_ANALYSIS``
        is skipped, which makes repeated calls safe.
        """
        patients = self.patients
        patient_count = len(patients)
        if patient_count == 0:
            # Avoid ZeroDivisionError; there is no evidence to drop a field.
            return

        fields_in_analysis = Config.DATA_FIELDS_IN_ANALYSIS
        threshold = Config.DATA_FIELD_MISSING_VALUES_THRESHOLD

        for data_field in Data_Fields.get_all_data_fields():
            missing_data_count = sum(
                1 for patient in patients
                if getattr(patient, data_field) is None)
            missing_data_ratio = missing_data_count / patient_count

            # The membership check works for both a list and a set and
            # prevents ValueError/KeyError when the field is already gone.
            if (missing_data_ratio >= threshold
                    and data_field in fields_in_analysis):
                fields_in_analysis.remove(data_field)
    def __get_vector_dict(self, remove_missing_values: bool):
        """Return a ``{field_name: vector}`` mapping without the target.

        Args:
            remove_missing_values: When true, each entry holds the analysis
                vector's ``vector_without_missing_values`` attribute;
                otherwise its ``vector`` attribute.

        Returns:
            A dict keyed by field name, covering every vector in
            ``self.__vectors`` except the one whose field name equals
            ``Data_Fields.get_target()``.
        """
        if remove_missing_values:
            source_attribute = 'vector_without_missing_values'
        else:
            source_attribute = 'vector'

        return {
            entry.field_name: getattr(entry, source_attribute)
            for entry in self.__vectors
            if entry.field_name != Data_Fields.get_target()
        }
Esempio n. 7
0
    def __standardize_continuous_vectors(self):
        """Standardize every continuous vector in the analysis (z-score).

        Each continuous field listed in ``Config.DATA_FIELDS_IN_ANALYSIS``
        has its vector in ``self.__same_length_vectors`` replaced by
        ``(vector - mean) / std``, using the vector's own ``mean()`` and
        ``std()``.

        A constant vector has a standard deviation of zero; dividing by it
        would fill the vector with NaN. In that case the divisor is replaced
        by 1, so the result is a vector of zeros.
        """
        for field_name in Data_Fields.get_continuous_vars():
            if field_name not in Config.DATA_FIELDS_IN_ANALYSIS:
                continue

            vector = self.__same_length_vectors[field_name]
            mean_value = vector.mean()
            std_value = vector.std()
            if std_value == 0:
                # Constant feature: keep the centred zeros instead of 0/0.
                std_value = 1

            # One vectorized expression replaces the per-element loop.
            self.__same_length_vectors[field_name] = np.asarray(
                (vector - mean_value) / std_value)
Esempio n. 8
0
    def __normalize_continuous_vectors(self):
        """Min-max scale every continuous vector in the analysis.

        Each continuous field listed in ``Config.DATA_FIELDS_IN_ANALYSIS``
        has its vector in ``self.__same_length_vectors`` replaced by
        ``(vector - min) / (max - min)``, using the vector's own ``min()``
        and ``max()``.

        A constant vector has a range of zero; dividing by it would fill the
        vector with NaN. In that case the divisor is replaced by 1, so the
        result is a vector of zeros.
        """
        for field_name in Data_Fields.get_continuous_vars():
            if field_name not in Config.DATA_FIELDS_IN_ANALYSIS:
                continue

            vector = self.__same_length_vectors[field_name]
            min_value = vector.min()
            value_range = vector.max() - min_value
            if value_range == 0:
                # Constant feature: keep the shifted zeros instead of 0/0.
                value_range = 1

            # One vectorized expression replaces the per-element loop.
            self.__same_length_vectors[field_name] = np.asarray(
                (vector - min_value) / value_range)
    def __init__(self):
        """Fill in the class-level configuration values.

        Every value is stored on the class (``self.__class__``), not on the
        instance. Paths are assigned first, then the set of data fields in
        the analysis, then the values read from the 'static_values' file
        (``self.load_local_yaml``) and the 'continuous_fields_thresholds'
        file (``self.load_common_yaml``).

        Derived paths are built from the attributes of the same class they
        are written to, so a subclass instance does not depend on the base
        class having been initialised beforehand.
        """
        cls = self.__class__

        # Hard-coded, machine-specific base locations.
        cls.WORK_DIR = 'C:/Users/normy/corona_classifier_files'
        cls.COMMON_FILE_PATH = 'C:/Users/normy/PycharmProjects/corona_classifier/common_files'
        cls.RAW_DATA_PATH = 'C:/Users/normy/PycharmProjects/covidclinicaldata/data'

        # Locations derived from the base directories / this module's folder.
        cls.LOCAL_YAML_FILE_PATH = os.path.join(os.path.dirname(__file__),
                                                'yaml_files')
        cls.COMMON_YAML_FILE_PATH = os.path.join(cls.COMMON_FILE_PATH,
                                                 'yaml_files')
        cls.PICKLE_PATH = os.path.join(cls.WORK_DIR, 'pickle_files')
        cls.DATA_ANALYSIS_OUTPUTS_PATH = os.path.join(cls.WORK_DIR,
                                                      'data_analysis_outputs')

        # A set: fields are removed from it as the analysis discards them.
        cls.DATA_FIELDS_IN_ANALYSIS = set(Data_Fields.get_all_data_fields())

        static_values = self.load_local_yaml(file_name='static_values')
        cls.PATIENT_MISSING_VALUES_THRESHOLD = static_values[
            'patient_missing_values_threshold']
        cls.DATA_FIELD_MISSING_VALUES_THRESHOLD = static_values[
            'data_field_missing_values_threshold']
        cls.BOOTSTRAP_ITERATIONS = static_values['bootstrap_iterations']

        cls.CONTINUOUS_FIELDS_THRESHOLDS = self.load_common_yaml(
            file_name='continuous_fields_thresholds')
Esempio n. 10
0
    def __replace_un_accepted_vital_values_with_none(self):
        """Blank out continuous values that fall outside their thresholds.

        For every patient in ``self.__patients`` and every continuous field,
        the lower and upper thresholds are fetched from
        ``Config.get_continuous_field_threshold`` for the patient's
        ``age_group``. A value above the upper or below the lower threshold
        is replaced by ``None``; values that are already ``None`` are
        skipped.
        """
        for patient in self.__patients:
            patient_age_group = patient.age_group

            for vital_name in Data_Fields.get_continuous_vars():
                reading = getattr(patient, vital_name)
                if reading is None:
                    continue

                # Lower bound is fetched first, then the upper bound.
                lower_bound, upper_bound = (
                    Config.get_continuous_field_threshold(
                        age_group=patient_age_group,
                        threshold=bound_name,
                        vital_value_name=vital_name)
                    for bound_name in ('lower_threshold', 'upper_threshold'))

                out_of_range = reading > upper_bound or reading < lower_bound
                if out_of_range:
                    setattr(patient, vital_name, None)
Esempio n. 11
0
    def __build_analysis_vectors(self):
        """Create the analysis vectors through ``Analysis_Vector_Builder``.

        The builder receives ``self.__patients`` as its data objects and all
        data fields from ``Data_Fields``; its ``analysis_vectors`` attribute
        is stored in ``self.__vectors``.
        """
        builder_arguments = {
            'data_object_list': self.__patients,
            'data_fields': Data_Fields.get_all_data_fields(),
        }
        self.__vectors = Analysis_Vector_Builder(
            **builder_arguments).analysis_vectors
Esempio n. 12
0
 def __replace_empty_strings_with_none(self):
     """Replace every empty-string field value of every patient by None.

     All fields from ``Data_Fields.get_all_data_fields()`` are checked on
     each patient in ``self.patients``; other values are left unchanged.
     """
     field_names = Data_Fields.get_all_data_fields()
     for record in self.patients:
         for name in field_names:
             current_value = getattr(record, name)
             if current_value == '':
                 setattr(record, name, None)
    def __average_by_target_for_age_groups(self):
        """Build the 'average_by_target' report table for adults and children.

        For each continuous field other than 'age' that is listed in
        ``Config.DATA_FIELDS_IN_ANALYSIS``, the target, age and field
        vectors are aligned with ``__get_same_length_vectors`` and split
        into adults (age >= 18) and children, then into target == 1
        ("corona positive") and target == 0 ("corona negative") samples.

        One row per field is appended to the table with, for each age
        group: the plain mean and the bootstrap mean of both samples, the
        bootstrap significance of the difference in means, and the number
        of samples whose target is 0 or 1.

        The finished table is stored in ``self.__report_tables`` under the
        key 'average_by_target'.
        """
        table = defaultdict(list)
        vectors_by_field = self.__get_analysis_vector_dict()

        for feature in Data_Fields.get_continuous_vars():
            # NOTE(review): the literal 'age' presumably equals
            # Data_Fields.AGE.field_name -- confirm.
            if feature == 'age' or feature not in Config.DATA_FIELDS_IN_ANALYSIS:
                continue

            target_name = Data_Fields.get_target()
            age_name = Data_Fields.AGE.field_name

            aligned = self.__get_same_length_vectors(vector_list=[
                vectors_by_field[target_name],
                vectors_by_field[age_name],
                vectors_by_field[feature],
            ])

            adult_mask = aligned[age_name] >= 18
            child_mask = ~adult_mask
            targets = aligned[target_name]
            responses = aligned[feature]

            adult_targets = targets[adult_mask]
            adult_responses = responses[adult_mask]
            child_targets = targets[child_mask]
            child_responses = responses[child_mask]

            adult_positive = adult_responses[adult_targets == 1]
            adult_negative = adult_responses[adult_targets == 0]
            child_positive = child_responses[child_targets == 1]
            child_negative = child_responses[child_targets == 0]

            # The bootstrap helpers are called in a fixed order (both
            # significances first, then the four means); presumably they
            # draw random samples, so the order must stay stable.
            adult_significance = self.bootstrap_difference_in_mean_of_two_groups(
                adult_positive, adult_negative)
            child_significance = self.bootstrap_difference_in_mean_of_two_groups(
                child_positive, child_negative)

            (adult_positive_boot, adult_negative_boot,
             child_positive_boot, child_negative_boot) = [
                self.__calc_bootstrap_mean(
                    sample, iterations=Config.BOOTSTRAP_ITERATIONS)
                for sample in (adult_positive, adult_negative,
                               child_positive, child_negative)
            ]

            # (column, value) pairs in the order the columns are created.
            row = [
                ('feature', feature),
                ('adult corona positive regular AVG', np.mean(adult_positive)),
                ('adult corona negative regular AVG', np.mean(adult_negative)),
                ('adult corona positive bootstrap AVG', adult_positive_boot),
                ('adult corona negative bootstrap AVG', adult_negative_boot),
                ('adult bootstrap significance', adult_significance),
                ('adult count', len(adult_positive) + len(adult_negative)),
                ('child corona positive AVG', np.mean(child_positive)),
                ('child corona negative AVG', np.mean(child_negative)),
                ('child corona positive bootstrap AVG', child_positive_boot),
                ('child corona negative bootstrap AVG', child_negative_boot),
                ('child bootstrap significance', child_significance),
                ('child count', len(child_positive) + len(child_negative)),
            ]
            for column, value in row:
                table[column].append(value)

        self.__report_tables['average_by_target'] = table