def process_associations(self, features_to_process: List[FeatureToProcess], source_target_series,
                         compare_target_series):
    """Fill the pairwise association tables for the given features.

    For every ordered pair ``(feature, other)`` drawn from
    ``features_to_process`` a value may be written to
    ``self._associations[feature][other]`` and, when both features carry a
    ``compare`` series, to ``self._associations_compare[feature][other]``:

    * same name: ``0.0``;
    * CAT/BOOL with CAT/BOOL: ``associations.theils_u``, stored in that
      direction only;
    * CAT/BOOL with NUM: ``associations.correlation_ratio``, also mirrored
      to ``[other][feature]`` unless that entry already exists;
    * NUM with NUM: ``left.corr(right, method='pearson')``, mirrored the
      same way.

    A NUM feature paired with a CAT/BOOL one is not computed directly; it
    only receives a value through the mirror written by the CAT/BOOL-NUM
    case. Any other type combination leaves the tables untouched. Feature
    types are read from ``self[name]["type"]``.

    Args:
        features_to_process: Features to relate to each other. Each item
            exposes a ``source`` series (with a ``name``) and a ``compare``
            series that is ``None`` when there is no comparison data.
        source_target_series: Not used by this method.
        compare_target_series: Not used by this method.
    """

    def row_for(table, name):
        # Fetch the per-feature row of an association table, creating it on first use.
        if name not in table:
            table[name] = {}
        return table[name]

    def mirror_association(association_dict, feature_name, other_name, value):
        # Write the reverse entry [other][feature]; an existing value is never overwritten.
        other_dict = row_for(association_dict, other_name)
        if feature_name not in other_dict:
            other_dict[feature_name] = value

    def is_categorical(name):
        kind = self[name]["type"]
        return kind == FeatureType.TYPE_CAT or kind == FeatureType.TYPE_BOOL

    def is_numeric(name):
        return self[name]["type"] == FeatureType.TYPE_NUM

    def pearson(left, right):
        # NOTE(review): whatever corr() returns (NaN included) is stored unchanged.
        return left.corr(right, method='pearson')

    def pick_measure(feature_name, other_name):
        # Return (measure, mirrored) for the pair of feature types, or None when no measure applies.
        if is_categorical(feature_name):
            if is_categorical(other_name):
                # CAT-CAT: stored one way only, the reverse pair is computed on its own
                return associations.theils_u, False
            if is_numeric(other_name):
                # CAT-NUM: mirrored, so NUM-CAT never needs its own computation
                return associations.correlation_ratio, True
        elif is_numeric(feature_name) and is_numeric(other_name):
            # NUM-NUM
            return pearson, True
        return None

    for feature in features_to_process:
        feature_name = feature.source.name
        source_row = row_for(self._associations, feature_name)
        if feature.compare is not None:
            compare_row = row_for(self._associations_compare, feature_name)
        else:
            compare_row = None

        for other in features_to_process:
            other_name = other.source.name

            # One (table, row, left series, right series) entry per table to fill;
            # the source table always comes first, the compare table only when
            # both features have comparison data.
            targets = [(self._associations, source_row, feature.source, other.source)]
            if compare_row is not None and other.compare is not None:
                targets.append((self._associations_compare, compare_row,
                                feature.compare, other.compare))

            if other_name == feature_name:
                # Diagonal: the row is its own mirror, so a plain assignment is enough.
                for _, row, _, _ in targets:
                    row[other_name] = 0.0
                continue

            choice = pick_measure(feature_name, other_name)
            if choice is None:
                continue
            measure, mirrored = choice
            for table, row, left, right in targets:
                value = measure(left, right)
                row[other_name] = value
                if mirrored:
                    mirror_association(table, feature_name, other_name, value)

        # NOTE(review): the flattened original lost its indentation; one tick per
        # outer feature is assumed here -- confirm against the progress bar total.
        self.progress_bar.update(1)
def process_associations(self, features_to_process: List[FeatureToProcess], source_target_series,
                         compare_target_series):
    """Fill the pairwise association tables for the given features.

    For every ordered pair ``(feature, other)`` drawn from
    ``features_to_process`` a value may be written to
    ``self._associations[feature][other]`` and, when both features carry a
    ``compare`` series, to ``self._associations_compare[feature][other]``:

    * same name: ``0.0``;
    * CAT/BOOL with CAT/BOOL: ``associations.theils_u``, stored in that
      direction only;
    * CAT/BOOL with NUM: ``associations.correlation_ratio``, also mirrored
      to ``[other][feature]`` unless that entry already exists;
    * NUM with NUM: ``left.corr(right, method='pearson')``, mirrored the
      same way. A NaN result is replaced by ``CORRELATION_IDENTICAL`` when
      ``left.equals(right)`` and by ``CORRELATION_ERROR`` otherwise.

    A NUM feature paired with a CAT/BOOL one is not computed directly; it
    only receives a value through the mirror written by the CAT/BOOL-NUM
    case. Any other type combination leaves the tables untouched. Feature
    types are read from ``self[name]["type"]``.

    Args:
        features_to_process: Features to relate to each other. Each item
            exposes a ``source`` series (with a ``name``) and a ``compare``
            series that is ``None`` when there is no comparison data.
        source_target_series: Not used by this method.
        compare_target_series: Not used by this method.
    """

    def row_for(table, name):
        # Fetch the per-feature row of an association table, creating it on first use.
        if name not in table:
            table[name] = {}
        return table[name]

    def mirror_association(association_dict, feature_name, other_name, value):
        # Write the reverse entry [other][feature]; an existing value is never overwritten.
        other_dict = row_for(association_dict, other_name)
        if feature_name not in other_dict:
            other_dict[feature_name] = value

    def is_categorical(name):
        kind = self[name]["type"]
        return kind == FeatureType.TYPE_CAT or kind == FeatureType.TYPE_BOOL

    def is_numeric(name):
        return self[name]["type"] == FeatureType.TYPE_NUM

    def pearson(left, right):
        # Pearson correlation with NaN results mapped to the module's sentinel values.
        value = left.corr(right, method='pearson')
        # TODO: display correlation error better in graph!
        if isnan(value):
            if left.equals(right):
                return CORRELATION_IDENTICAL
            # ERROR may occur if NaNs in one series match values in the other, and vice-versa
            return CORRELATION_ERROR
        return value

    def pick_measure(feature_name, other_name):
        # Return (measure, mirrored) for the pair of feature types, or None when no measure applies.
        if is_categorical(feature_name):
            if is_categorical(other_name):
                # CAT-CAT: stored one way only, the reverse pair is computed on its own
                return associations.theils_u, False
            if is_numeric(other_name):
                # CAT-NUM: this handles cat-num, then mirrors, so num-cat needs no
                # separate processing (symmetrical relationship)
                return associations.correlation_ratio, True
        elif is_numeric(feature_name) and is_numeric(other_name):
            # NUM-NUM
            return pearson, True
        return None

    for feature in features_to_process:
        feature_name = feature.source.name
        source_row = row_for(self._associations, feature_name)
        if feature.compare is not None:
            compare_row = row_for(self._associations_compare, feature_name)
        else:
            compare_row = None

        for other in features_to_process:
            other_name = other.source.name

            # One (table, row, left series, right series) entry per table to fill;
            # the source table always comes first, the compare table only when
            # both features have comparison data.
            targets = [(self._associations, source_row, feature.source, other.source)]
            if compare_row is not None and other.compare is not None:
                targets.append((self._associations_compare, compare_row,
                                feature.compare, other.compare))

            if other_name == feature_name:
                # Diagonal: the row is its own mirror, so a plain assignment is enough.
                for _, row, _, _ in targets:
                    row[other_name] = 0.0
                continue

            choice = pick_measure(feature_name, other_name)
            if choice is None:
                continue
            measure, mirrored = choice
            for table, row, left, right in targets:
                value = measure(left, right)
                row[other_name] = value
                if mirrored:
                    mirror_association(table, feature_name, other_name, value)

        # NOTE(review): the flattened original lost its indentation; one tick per
        # outer feature is assumed here -- confirm against the progress bar total.
        self.progress_bar.update(1)
def process_associations(self, features_to_process: List[FeatureToProcess], source_target_series,
                         compare_target_series):
    """Fill the pairwise association tables for the given features.

    For every ordered pair ``(feature, other)`` drawn from
    ``features_to_process`` a value may be written to
    ``self._associations[feature][other]`` and, when both features carry a
    ``compare`` series, to ``self._associations_compare[feature][other]``:

    * same name: ``0.0``;
    * CAT/BOOL with CAT/BOOL: ``associations.theils_u``, stored in that
      direction only;
    * CAT/BOOL with NUM: ``associations.correlation_ratio``, also mirrored
      to ``[other][feature]`` unless that entry already exists;
    * NUM with NUM: ``left.corr(right, method='pearson')``, mirrored the
      same way. If the call raises ``FloatingPointError`` the value ``1.0``
      is used and ``"<feature>/<other>"`` is recorded in
      ``self.corr_warning``. A NaN result is replaced by
      ``CORRELATION_IDENTICAL`` when ``left.equals(right)`` and by
      ``CORRELATION_ERROR`` otherwise.

    The ``FloatingPointError`` guard applies to the source and the compare
    series alike, so degenerate comparison data no longer aborts the run
    while the same data in the source frame is tolerated.

    A NUM feature paired with a CAT/BOOL one is not computed directly; it
    only receives a value through the mirror written by the CAT/BOOL-NUM
    case. Any other type combination leaves the tables untouched. Feature
    types are read from ``self[name]["type"]``.

    Args:
        features_to_process: Features to relate to each other. Each item
            exposes a ``source`` series (with a ``name``) and a ``compare``
            series that is ``None`` when there is no comparison data.
        source_target_series: Not used by this method.
        compare_target_series: Not used by this method.
    """

    def row_for(table, name):
        # Fetch the per-feature row of an association table, creating it on first use.
        if name not in table:
            table[name] = {}
        return table[name]

    def mirror_association(association_dict, feature_name, other_name, value):
        # Write the reverse entry [other][feature]; an existing value is never overwritten.
        other_dict = row_for(association_dict, other_name)
        if feature_name not in other_dict:
            other_dict[feature_name] = value

    def is_categorical(name):
        kind = self[name]["type"]
        return kind == FeatureType.TYPE_CAT or kind == FeatureType.TYPE_BOOL

    def is_numeric(name):
        return self[name]["type"] == FeatureType.TYPE_NUM

    def pearson(left, right, pair_label):
        # Pearson correlation with failures and NaN results mapped to fallback values.
        try:
            value = left.corr(right, method='pearson')
        except FloatingPointError:
            # This usually happens when there is only 1 non-NaN value in each data series.
            # Assigning the value 1.0 as per
            # https://stats.stackexchange.com/questions/94150/why-is-the-pearson-correlation-1-when-only-two-data-values-are-available
            # -> Also showing a warning
            value = 1.0
            # The label names the pair, so it is recorded once even when both the
            # source and the compare computation fail.
            if pair_label not in self.corr_warning:
                self.corr_warning.append(pair_label)
        # TODO: display correlation error better in graph!
        if isnan(value):
            if left.equals(right):
                return CORRELATION_IDENTICAL
            # ERROR may occur if NaNs in one series match values in the other, and vice-versa
            return CORRELATION_ERROR
        return value

    def pick_measure(feature_name, other_name):
        # Return (measure, mirrored) for the pair of feature types, or None when no measure applies.
        if is_categorical(feature_name):
            if is_categorical(other_name):
                # CAT-CAT: stored one way only, the reverse pair is computed on its own
                return associations.theils_u, False
            if is_numeric(other_name):
                # CAT-NUM: this handles cat-num, then mirrors, so num-cat needs no
                # separate processing (symmetrical relationship)
                return associations.correlation_ratio, True
        elif is_numeric(feature_name) and is_numeric(other_name):
            # NUM-NUM
            pair_label = feature_name + "/" + other_name
            return (lambda left, right: pearson(left, right, pair_label)), True
        return None

    for feature in features_to_process:
        feature_name = feature.source.name
        source_row = row_for(self._associations, feature_name)
        if feature.compare is not None:
            compare_row = row_for(self._associations_compare, feature_name)
        else:
            compare_row = None

        for other in features_to_process:
            other_name = other.source.name

            # One (table, row, left series, right series) entry per table to fill;
            # the source table always comes first, the compare table only when
            # both features have comparison data.
            targets = [(self._associations, source_row, feature.source, other.source)]
            if compare_row is not None and other.compare is not None:
                targets.append((self._associations_compare, compare_row,
                                feature.compare, other.compare))

            if other_name == feature_name:
                # Diagonal: the row is its own mirror, so a plain assignment is enough.
                for _, row, _, _ in targets:
                    row[other_name] = 0.0
                continue

            choice = pick_measure(feature_name, other_name)
            if choice is None:
                continue
            measure, mirrored = choice
            for table, row, left, right in targets:
                value = measure(left, right)
                row[other_name] = value
                if mirrored:
                    mirror_association(table, feature_name, other_name, value)

        # NOTE(review): the flattened original lost its indentation; one tick per
        # outer feature is assumed here -- confirm against the progress bar total.
        self.progress_bar.update(1)