def categorical_correlations(spn, dictionary):
    """Print (and optionally plot) a sentence for each selected feature pair.

    Candidate pairs are all ``(i, j)`` with ``i > j`` whose absolute entry in
    the matrix returned by ``f.get_full_correlation(spn)`` exceeds
    ``correlation_threshold``.  Which candidates are shown is controlled by
    ``feature_combinations``:

    * an ``int``   -- a random sample of at most that many candidates,
    * ``'all'``    -- every candidate,
    * anything else is used directly as the collection of pairs.

    ``feature_combinations``, ``correlation_threshold`` and
    ``show_conditional`` are not parameters; they are read from the
    surrounding scope.

    Args:
        spn: Model object providing ``numFeatures`` and ``featureNames``.
        dictionary: Forwarded to ``p.plot_related_features`` as ``dictionary``.

    Returns:
        None.  Output is produced through ``iplot`` and ``printmd``.
    """
    # NOTE(review): the result is never used below; the call is kept so the
    # sequence of helper calls is unchanged.  Confirm whether the pairs were
    # meant to be restricted to categorical features.
    categoricals = f.get_categoricals(spn)
    corr = f.get_full_correlation(spn)

    feature_count = spn.numFeatures
    all_combinations = [
        (i, j)
        for i in range(feature_count)
        for j in range(i)
        if np.abs(corr[i, j]) > correlation_threshold
    ]

    if isinstance(feature_combinations, int):
        sample_size = min(feature_combinations, len(all_combinations))
        shown_combinations = random.sample(all_combinations, k=sample_size)
    elif feature_combinations == 'all':
        shown_combinations = all_combinations
    else:
        shown_combinations = feature_combinations

    labels = ['weak', 'moderate', 'strong', 'very strong', 'perfect']
    cut_points = [0.3, 0.6, 0.8, 0.99]
    rejected_fragments = ('{z}', 'As', 'linear')

    # Group the output by the first element of each pair.
    for cat in {pair[0] for pair in shown_combinations}:
        for pair in shown_combinations:
            if pair[0] != cat:
                continue
            i = pair[1]

            # Keep drawing templates until one without a rejected fragment
            # comes up.
            while True:
                phrase = get_nlg_phrase(*CORRELATION_NLG)
                if not any(part in phrase for part in rejected_fragments):
                    break

            # ``threshold`` supplies the index into the verbal labels.
            strength_descr = labels[threshold(cut_points, np.abs(corr[cat, i]))]
            strength_adv = strength_descr + 'ly'

            if show_conditional:
                iplot(p.plot_related_features(spn, i, cat, dictionary=dictionary))

            printmd(phrase.format(
                x=spn.featureNames[cat],
                y=spn.featureNames[i],
                strength=strength_descr,
                strength_adv=strength_adv,
                direction='',
                neg_pos='',
            ))
def compute_strength(self, gradients):
    """Store verbal strength and direction labels derived from ``gradients``.

    Marks the instance as ready, then sets ``self.strength`` to the label
    selected by ``threshold`` for the absolute mean of ``gradients`` and
    ``self.direction`` to ``'positive'`` when that mean is strictly greater
    than zero, otherwise ``'negative'``.

    Args:
        gradients: Values whose mean (via ``np.mean``) is classified.
    """
    self.ready = True
    mean_gradient = np.mean(gradients)

    labels = ['very weak', 'weak', 'moderate', 'strong', 'very strong']
    cut_points = [0.05, 0.15, 0.3, 0.5]
    direction_descriptor = ['negative', 'positive']

    # ``threshold`` supplies the index into the verbal labels.
    self.strength = labels[threshold(cut_points, np.abs(mean_gradient))]

    if mean_gradient > 0:
        self.direction = direction_descriptor[1]
    else:
        # A mean of exactly zero is reported as 'negative'.
        self.direction = direction_descriptor[0]
def get_correlation_modifier(corr):
    """Build a ``Modifier`` holding the verbal description of ``corr``.

    Args:
        corr: Correlation value to describe.

    Returns:
        ``Modifier(strength, strength_adv, direction, neg_pos)`` where
        ``strength`` is the label selected by ``threshold`` for ``|corr|``,
        ``strength_adv`` is that label with ``'ly'`` appended, and
        ``direction`` / ``neg_pos`` are ``'decrease'`` / ``'negative'`` when
        ``corr < 0`` and ``'increase'`` / ``'positive'`` otherwise.
    """
    labels = ['weak', 'moderate', 'strong', 'very strong', 'perfect']
    cut_points = [0.3, 0.6, 0.8, 0.99]

    # ``threshold`` supplies the index into the verbal labels.
    level = labels[threshold(cut_points, np.abs(corr))]

    if corr < 0:
        trend, sign = 'decrease', 'negative'
    else:
        # Zero counts as increasing / positive.
        trend, sign = 'increase', 'positive'

    return Modifier(level, level + 'ly', trend, sign)
def show_node_separation(spn, nodes):
    """Plot and describe how the given nodes separate each shown feature.

    For every shown, non-categorical feature an error-bar plot is built and a
    description is assembled from the nodes whose variance or mean distance
    exceeds ``variance_threshold`` / ``mean_threshold``.  The plot and text
    are emitted only when at least one such outlier exists or the feature's
    separation score exceeds ``separation_threshold``.

    ``features_shown``, ``variance_threshold``, ``mean_threshold`` and
    ``separation_threshold`` are not parameters; they are read from the
    surrounding scope.  ``features_shown`` may be ``'all'``, an ``int`` (a
    random sample of at most that many features) or an explicit collection
    of feature indices.

    Args:
        spn: Model object providing ``numFeatures`` and ``featureNames``.
        nodes: Nodes exposing ``moment(order, numFeatures)``.

    Returns:
        dict mapping each shown feature index to its entry in the result of
        ``f.cluster_variance_separation(spn)``.
    """

    def _outlier_sentence(outliers, singular, plural):
        """Return the sentence naming ``outliers``, or '' when there are none."""
        if len(outliers) == 0:
            return ''
        template = singular if len(outliers) == 1 else plural
        return template.format(', '.join(str(v) for v in outliers))

    categoricals = f.get_categoricals(spn)
    all_features = list(range(spn.numFeatures))
    if features_shown == 'all':
        shown_features = all_features
    elif isinstance(features_shown, int):
        num_choices = min(features_shown, len(all_features))
        shown_features = random.sample(all_features, k=num_choices)
    else:
        shown_features = features_shown

    # Variance per node and feature: second moment minus squared first moment.
    node_means = np.array([node.moment(1, spn.numFeatures) for node in nodes])
    second_moments = np.array([node.moment(2, spn.numFeatures) for node in nodes])
    node_vars = second_moments - node_means ** 2

    # Plot labels are 1-based.
    # NOTE(review): the node numbers written into the text below come straight
    # from np.where and are therefore 0-based -- confirm which is intended.
    names = np.arange(1, len(nodes) + 1, 1)

    strength_separation = f.cluster_variance_separation(spn)
    var_distance, mean_distance = f.cluster_mean_var_distance(nodes, spn)

    # Look the score up by feature index.  Pairing shown_features with
    # strength_separation positionally (zip) mislabels the scores whenever
    # shown_features is a sample or a user-supplied subset.
    all_seps = {i: strength_separation[i] for i in shown_features}

    strength = ['weak', 'moderate', 'strong', 'very strong', 'perfect']
    strength_values = [0.3, 0.6, 0.8, 0.99]

    for i in shown_features:
        if i in categoricals:
            continue

        # NOTE(review): the error bars receive the variances, not the standard
        # deviations -- confirm what p.plot_error_bar expects.
        plot = p.plot_error_bar(names, node_means[:, i], node_vars[:, i],
                                spn.featureNames[i])
        strength_adv = strength[threshold(strength_values, strength_separation[i])] + 'ly'

        var_outliers = np.where(var_distance[:, i] > variance_threshold)[0]
        mean_high_outliers = np.where(mean_distance[:, i] > mean_threshold)[0]
        mean_low_outliers = np.where(mean_distance[:, i] < -mean_threshold)[0]

        # NOTE(review): "then" in the messages below should probably read
        # "than"; left unchanged because the text is user-visible output.
        description_string = ''.join([
            _outlier_sentence(
                var_outliers,
                'The variance of node {} is significantly larger then the average node. ',
                'The variances of the nodes {} are significantly larger then the average node. ',
            ),
            _outlier_sentence(
                mean_high_outliers,
                'The mean of node {} is significantly larger then the average node. ',
                'The means of the nodes {} are significantly larger then the average node. ',
            ),
            _outlier_sentence(
                mean_low_outliers,
                'The mean of node {} is significantly smaller then the average node.',
                'The means of the nodes {} are significantly smaller then the average node.',
            ),
        ])

        if description_string or strength_separation[i] > separation_threshold:
            description_string = (
                'The feature "{}" is {} separated by the clustering. '.format(
                    spn.featureNames[i], strength_adv)
                + description_string
            )
            iplot(plot)
            f.printmd(description_string)

    return all_seps
def correlation_statement(corr, feature1, feature2):
    """Return a randomly chosen sentence describing a correlation.

    Args:
        corr: Correlation value between the two features.
        feature1: Name inserted for ``{fx}`` in the sentence.
        feature2: Name inserted for ``{fy}`` in the sentence.

    Returns:
        One of the sentence templates, filled in and followed by a single
        space, when ``|corr| > 0.25``; otherwise the empty string.
    """
    labels = ['weak', 'moderate', 'strong', 'very strong', 'perfect']
    cut_points = [0.3, 0.6, 0.8, 0.99]

    # ``threshold`` supplies the index into the verbal labels.
    level = labels[threshold(cut_points, np.abs(corr))]

    if corr < 0:
        trend, sign = 'decrease', 'negative'
    else:
        trend, sign = 'increase', 'positive'

    # NOTE(review): the last template reads '"{fx}" an "{fy}"' -- probably a
    # typo for "and"; left unchanged because it is user-visible output.
    templates = [
        '"{fx}" and "{fy}" influence each other {strength_adv}. As one increases, the other {direction}s.',
        'There is a {strength} {neg_pos} dependency between "{fx}" and "{fy}".',
        'There is a {strength} linear relation between "{fx}" and "{fy}".',
        'The model shows a {strength} linear relation between "{fx}" and "{fy}".',
        'The features "{fx}" an "{fy}" have a {strength} dependency between them.'
    ]

    if np.abs(corr) > 0.25:
        chosen = templates[random.randrange(len(templates))]
        filled = chosen.format(
            strength=level,
            strength_adv=level + 'ly',
            direction=trend,
            neg_pos=sign,
            fx=feature1,
            fy=feature2,
        )
        return filled + ' '

    # Correlations this weak are not worth a sentence.
    return ''