def MeasureAttribute_info(self, attr, data):
    # Pearson correlation between the attribute and the class variable,
    # computed on rows with no missing values
    table = data.select([attr, data.domain.classVar])
    table = orange.Preprocessor_dropMissing(table)
    a1 = [table[k][0].value for k in range(len(table))]
    a2 = [table[k][1].value for k in range(len(table))]
    val, prob = statc.pearsonr(a1, a2)
    return val
def distPearson(x, y):
    """Distance corresponding to 1 - Pearson's correlation coefficient
    for arrays x, y.

    Returns distance: 1 - pearson_r.
    """
    x = MA.asarray(x)
    y = MA.asarray(y)
    assert MA.rank(x) == MA.rank(y) == 1
    # keep only the positions where neither x nor y is masked
    cond = MA.logical_not(MA.logical_or(MA.getmaskarray(x), MA.getmaskarray(y)))
    return 1 - statc.pearsonr(MA.compress(cond, x).tolist(),
                              MA.compress(cond, y).tolist())[0]
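# A minimal self-contained sketch of the same masked-array Pearson distance,
# assuming numpy and scipy stand in for the MA and statc modules used above;
# dist_pearson_sketch and the sample values are illustrative, not the library's code.
import numpy as np
from scipy.stats import pearsonr

def dist_pearson_sketch(x, y):
    """Return 1 - Pearson's r, ignoring positions masked in either array."""
    x = np.ma.asarray(x, dtype=float)
    y = np.ma.asarray(y, dtype=float)
    keep = ~(np.ma.getmaskarray(x) | np.ma.getmaskarray(y))
    r, _ = pearsonr(np.asarray(x[keep]), np.asarray(y[keep]))
    return 1 - r

# usage: the NaN in x is masked and dropped from both arrays before correlating
x = np.ma.masked_invalid([1.0, 2.0, float("nan"), 4.0])
y = np.ma.asarray([1.5, 2.5, 3.0, 4.5])
print(dist_pearson_sketch(x, y))  # ~0.0: the remaining pairs are perfectly correlated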
def __call__(self, e1, e2):
    # Pearson-based distance between two examples over the attributes in
    # self.indxs, skipping attributes where either value is missing;
    # (1 - r) / 2 maps the result into [0, 1]
    X1 = []
    X2 = []
    for i in self.indxs:
        if not (e1[i].isSpecial() or e2[i].isSpecial()):
            X1.append(float(e1[i]))
            X2.append(float(e2[i]))
    if not X1:
        return 1.0
    try:
        return (1.0 - statc.pearsonr(X1, X2)[0]) / 2.
    except:
        return 1.0
def computeCorrelation(data, attr1, attr2):
    if data.domain[attr1].varType != orange.VarTypes.Continuous:
        return None
    if data.domain[attr2].varType != orange.VarTypes.Continuous:
        return None
    table = data.select([attr1, attr2])
    table = orange.Preprocessor_dropMissing(table)
    a1 = [table[k][attr1].value for k in range(len(table))]
    a2 = [table[k][attr2].value for k in range(len(table))]
    try:
        val, prob = statc.pearsonr(a1, a2)
    except:
        val = 0.0  # possibly invalid a1 or a2 (e.g. too few values)
    return val
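# Hypothetical call site for computeCorrelation, assuming the legacy Orange 2.x
# API (orange.ExampleTable) and the bundled iris data with the attribute names
# shown; both the dataset and the names are illustrative assumptions.
# data = orange.ExampleTable("iris")
# r = computeCorrelation(data, "sepal length", "petal length")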
def computeCorrelationInsideClasses(data, attr1, attr2):
    if data.domain[attr1].varType != orange.VarTypes.Continuous:
        return None
    if data.domain[attr2].varType != orange.VarTypes.Continuous:
        return None
    table = data.select([attr1, attr2, data.domain.classVar])
    table = orange.Preprocessor_dropMissing(table)
    lengths = []
    corrs = []
    for val in table.domain.classVar.values:
        tab = table.filter({table.domain.classVar: val})
        a1 = [tab[k][attr1].value for k in range(len(tab))]
        a2 = [tab[k][attr2].value for k in range(len(tab))]
        if len(a1) == 0:
            continue
        val, prob = statc.pearsonr(a1, a2)
        lengths.append(len(a1))
        corrs.append(val)
    # average of the absolute within-class correlations, weighted by class size
    corr = 0
    for ind in range(len(corrs)):
        corr += abs(corrs[ind]) * lengths[ind]
    corr /= sum(lengths)
    return corr, corrs, lengths
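# Stand-alone illustration of the weighting step in computeCorrelationInsideClasses:
# per-class correlations are combined as an average of their absolute values,
# weighted by the number of examples in each class; the numbers are made up.
corrs = [0.9, -0.4, 0.1]   # per-class Pearson r
lengths = [50, 30, 20]     # examples per class
weighted = sum(abs(r) * n for r, n in zip(corrs, lengths)) / sum(lengths)
print(weighted)            # (0.9*50 + 0.4*30 + 0.1*20) / 100 = 0.59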
def pearson(ex1, ex2):
    # check_same and no_unknowns are expected to be defined in the enclosing scope
    vals1 = ex1.native(0)[:-1]
    vals2 = ex2.native(0)[:-1]
    if check_same and vals1 == vals2:
        return 10  # they are the same
    # leave undefined elements out
    if not no_unknowns:
        common = [v1 != "?" and v2 != "?" for v1, v2 in zip(vals1, vals2)]
        vals1 = [v for v, c in zip(vals1, common) if c]
        vals2 = [v for v, c in zip(vals2, common) if c]
    # statc's correlation is 5-10 times faster than numpy's
    try:
        return statc.pearsonr(vals1, vals2)[0]
    except:
        return numpy.corrcoef([vals1, vals2])[0, 1]
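# Hedged check of the fallback path in pearson() above: numpy.corrcoef yields the
# same coefficient as a dedicated pearsonr call (scipy.stats.pearsonr is used here
# as a stand-in for statc.pearsonr); the sample values are arbitrary.
import numpy
from scipy.stats import pearsonr

vals1 = [1.0, 2.0, 3.0, 5.0]
vals2 = [2.0, 3.9, 6.1, 10.0]
r_dedicated = pearsonr(vals1, vals2)[0]
r_fallback = numpy.corrcoef([vals1, vals2])[0, 1]
assert abs(r_dedicated - r_fallback) < 1e-9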