def getBestModel(filePath, dataPoints):
    dot_pos = filePath.rfind('.')
    basePath = filePath[:dot_pos]
    ext = filePath[dot_pos + 1:]
    modelDict = {}
    for i in range(NUMBER_OF_MODELS_TO_CONSIDER):
        if ext == 'py':
            #print('Generating data')
            synthDataGen.run(filePath, samples=dataPoints)
        dataFile = basePath + '.csv'
        ic = CaLingam.IC(dataFile, dataPoints)
        model = ic.getModel()
        dagStr = model.getEquivalencyKey()
        if dagStr not in modelDict:
            modelDict[dagStr] = []
        modelDict[dagStr].append(model)
    # Find model based on most common structure
    mostCommon = None
    mostCommonCount = 0
    for modelList in modelDict.values():
        if len(modelList) > mostCommonCount:
            mostCommonCount = len(modelList)
            mostCommon = modelList[0]
    model = mostCommon
    print('Most common model = ', mostCommonCount, '/', NUMBER_OF_MODELS_TO_CONSIDER)
    return model
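
# A minimal usage sketch for getBestModel, assuming NUMBER_OF_MODELS_TO_CONSIDER is set at
# module level. The data-definition path and sample count below are hypothetical,
# for illustration only.
def exampleGetBestModel():
    # Fit several models on freshly generated data and keep the structure that recurs
    # most often across the runs.
    bestModel = getBestModel('models/example_sem.py', 1000)
    print('Selected structure key = ', bestModel.getEquivalencyKey())
    return bestModel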
def run(s1, s2):
    global firstTime, CUM_DEPENDENCE
    global SNR, SNR_DB, CUM_TIME
    reset = True
    if firstTime:
        #print('std ratio = ', s1D.std() / s2D.std())
        firstTime = False
    else:
        reset = False
    try:
        synthDataGen.run(FILE + '.py', samples=DATA_POINTS, maxDifficulty=MAX_DIFFICULTY, reset=reset)
    except:
        pass
    dr = getData.DataReader(input=FILE + '.csv', limit=DATA_POINTS)
    s1D = np.array(dr.getSeries(s1))
    s2D = np.array(dr.getSeries(s2))
    dependence = IndHSIC.scoreDependence(s1D, s2D)
    #dependence = IndLn.scoreDependence(s1D, s2D)
    CUM_DEPENDENCE = CUM_DEPENDENCE + dependence
    #print('s1 = ', s1D.mean(), s1D.std())
    #print('s2 = ', s2D.mean(), s2D.std())
    if firstTime:
        #print('std ratio = ', s1D.std() / s2D.std())
        firstTime = False
    coefs = np.polyfit(s1D, s2D.T, 1)
    x1coef, x0coef = coefs
    Dprint('coefs = ', coefs)
    cs2D = s1D * x1coef + x0coef
    # coef = lstsq(np.array([list(s1D)]).T, s2D)[0][0]
    # Dprint('coef = ', coef)
    # cs2D = s1D * coef
    res = s2D - cs2D
    signal = cs2D.var(ddof=1)
    noise = res.var(ddof=1)
    snr = signal / noise
    if snr < SNR:
        SNR = snr
        SNR_DB = 10 * math.log(snr, 10)
    start = time.time()
    if IND_TYPE == 'PL' or IND_TYPE == 'CCM':
        dep = scoreDependence(s1D, s2D)
    else:
        dep = scoreDependence(s1D, res)
    end = time.time()
    duration = end - start
    CUM_TIME += duration
    Dprint('Residual Dependence for ', s1, '-->', s2, ' = ', dep)
    return dep
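
# A self-contained sketch of the residual-dependence idea used above: regress s2 on s1,
# then score the dependence between s1 and the regression residual. The dependence score
# here is a crude stand-in of my own (absolute-deviation correlation); the codebase itself
# uses IndHSIC.scoreDependence.
import numpy as np

def residualDependenceSketch(s1D, s2D):
    # Degree-1 polynomial fit of s2 on s1, exactly as run() does.
    x1coef, x0coef = np.polyfit(s1D, s2D, 1)
    res = s2D - (s1D * x1coef + x0coef)
    # OLS forces the *linear* correlation between s1 and the residual to ~0, so a
    # nonlinear statistic is needed: correlate the absolute deviations instead.
    a = np.abs(s1D - s1D.mean())
    b = np.abs(res - res.mean())
    return abs(np.corrcoef(a, b)[0, 1])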
def analyzeOneLingam(self, dataCount, gen=True, valData=None):
    if gen:
        synthDataGen.run(self.genFile, dataCount)
    dr = getData.DataReader(self.testFile, dataCount)
    vars = dr.getSeriesNames()
    if valData is None:
        valData = synthDataGen.getValidation()
    lowestSNR = 10**100
    cumSNR = 0
    pairCount = 0
    lowestPair = None
    stats = {}
    nodeDiffs = {}
    for item in valData:
        successor, predecessors = item
        if len(predecessors) == 0:
            continue
        for predecessor in predecessors:
            # Preserve any entries already recorded for this predecessor under earlier successors.
            if predecessor not in nodeDiffs:
                nodeDiffs[predecessor] = {}
            pD = np.array(dr.getSeries(predecessor))
            sD = np.array(dr.getSeries(successor))
            coefs = np.polyfit(pD, sD.T, 1)
            x1coef, x0coef = coefs
            #Dprint('coefs = ', coefs)
            cs2D = pD * x1coef + x0coef
            res = sD - cs2D
            # Note that signal and noise are reversed from what you might expect.
            # In this case, the variance of the error term is the signal we are looking for,
            # while the linear component is actually the noise.
            noise = cs2D.var(ddof=1)
            signal = res.var(ddof=1)
            #print('pair = ', predecessor, '--->', successor)
            #print('noise = ', noise)
            #print('signal = ', signal)
            snr = signal / noise
            #print('snr = ', snr)
            nodeDiffs[predecessor][successor] = 10 * math.log(1.0 / snr, 10)
            if snr < lowestSNR:
                lowestSNR = snr
                lowestPair = (predecessor, successor)
            cumSNR += snr
            pairCount += 1
    stats['minSnr'] = 10 * math.log(lowestSNR, 10)
    avgSNR = cumSNR / float(pairCount)
    stats['avgSnr'] = 10 * math.log(avgSNR, 10)
    stats['weakestPair'] = lowestPair
    difficulty = max([10 * math.log(1.0 / lowestSNR, 10), 0.0])
    stats['difficulty'] = difficulty
    stats['weakestPairing'] = lowestPair
    stats['variableDifficulty'] = nodeDiffs
    stats['normDifficulty'] = 100.0 * difficulty / (dataCount**.5)
    return stats
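
# A minimal, self-contained sketch of the per-pair difficulty score computed above: fit the
# successor on the predecessor, treat the residual variance as "signal" and the fitted linear
# component as "noise", and report the difficulty in dB, floored at 0. Names are illustrative;
# the real method also aggregates min/avg SNR across all predecessor/successor pairs.
import math
import numpy as np

def pairDifficultySketch(pD, sD):
    x1coef, x0coef = np.polyfit(pD, sD, 1)
    fitted = pD * x1coef + x0coef
    res = sD - fitted
    snr = res.var(ddof=1) / fitted.var(ddof=1)     # signal (error term) / noise (linear part)
    return max(10 * math.log(1.0 / snr, 10), 0.0)  # difficulty in dB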
def run(dataDefFile, var1, var2, datapoints):
    tau = TAU
    print('Generating data using: ', dataDefFile)
    fileExt = dataDefFile.split('.')[-1]
    if fileExt == 'py':
        # Data Definition File
        outFile = synthDataGen.run(dataDefFile, datapoints)
    elif fileExt == 'csv':
        # Data file
        outFile = dataDefFile
    else:
        print('*** Invalid file type = ', fileExt)
        return
    d = getData.DataReader(outFile, limit=datapoints)
    fig = plt.figure()
    ax = fig.add_axes([.1, .1, .8, .8], projection='3d')
    # If var1 and var2 are not specified, then build a master manifold out of three vars
    # at a time. Otherwise, build shadow manifolds from the two specified vars.
    if var1 is None:
        vars = d.getSeriesNames()
        vars.sort()
        print('Vars = ', vars)
        colors = ['b', 'g', 'r', 'o']
        for i in range(4):
            if len(vars) < i * 3 + 3:
                break
            X = d.getSeries(vars[i * 3])
            Y = d.getSeries(vars[i * 3 + 1])
            Z = d.getSeries(vars[i * 3 + 2])
            if standard:
                X = standardize.standardize(X)
                Y = standardize.standardize(Y)
                Z = standardize.standardize(Z)
            color = colors[i]
            ax.plot(X, Y, Z)
    else:
        var1D = d.getSeries(var1)
        if standard:
            var1D = standardize.standardize(var1D)
        X1 = var1D[:-2 * tau]
        Y1 = var1D[tau:-tau]
        Z1 = var1D[2 * tau:]
        var2D = d.getSeries(var2)
        if standard:
            var2D = standardize.standardize(var2D)
        X2 = var2D[:-2 * tau]
        Y2 = var2D[tau:-tau]
        Z2 = var2D[2 * tau:]
        #plotData(d, datapoints)
        ax.plot(X1, Y1, Z1)
        ax.plot(X2, Y2, Z2, 'r')
        #ax.plot(X1, Y1, Z2, 'g')
    # `ax` is a 3D-aware axis instance because of the projection='3d' keyword argument to add_subplot
    #ax = fig.add_subplot(1, 2, 1, projection='3d')
    plt.show()
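
# A minimal sketch of the delay embedding used above to build a shadow manifold from a single
# series: the three plotted coordinates are the series lagged by 0, tau and 2*tau samples.
# The default tau and the sine series in the usage comment are illustrative only.
import numpy as np

def shadowCoordsSketch(series, tau=5):
    series = np.asarray(series)
    X = series[:-2 * tau]   # x(t)
    Y = series[tau:-tau]    # x(t + tau)
    Z = series[2 * tau:]    # x(t + 2*tau)
    return X, Y, Z

# Example (illustrative): X, Y, Z = shadowCoordsSketch(np.sin(np.linspace(0, 20, 500)))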
def calibrateOneCITest(self, testType, filePath):
    synthDataGen.run(filePath + '.py', samples=self.datacount)
    # Dynamically import the conditional-independence test module named by testType
    exec('import ' + testType)
    module = eval(testType)
    SA = analyzeSEM.SemAnalyzer(filePath + '.py', self.datacount)
    reader = getData.DataReader(filePath + '.csv', self.datacount)
    dependencies, independencies = SA.getCondDependencies()
    # print('dependencies = ', dependencies)
    # print('independencies = ', independencies)
    errors = 0
    errorTerms = {}
    items = 0
    for item in dependencies:
        x, y, z = item
        X = reader.getSeries(x)
        Y = reader.getSeries(y)
        Z = reader.getSeries(z)
        ind = module.isIndependent(X, Y, Z)
        if ind:
            print('Error -- ', x, 'and', y, 'should be dependent given', z)
            self.err1Count += 1
            errors += 1
            errorTerms[item] = 1
        self.testCount += 1
    for item in independencies:
        x, y, z = item
        X = reader.getSeries(x)
        Y = reader.getSeries(y)
        Z = reader.getSeries(z)
        ind = module.isIndependent(X, Y, Z)
        if not ind:
            print('Error -- ', x, 'and', y, 'should be independent given', z)
            self.err2Count += 1
            errors += 1
            errorTerms[item] = 1
        self.testCount += 1
    #print('Rating = ', (1 - (errors / items)) * 100, '%')
    print('Errors for file: ', filePath, '=', errors, list(errorTerms.keys()))
    return
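
# The dynamically imported test module above only needs to expose isIndependent(X, Y, Z).
# Below is a hypothetical partial-correlation-based implementation of that interface,
# for illustration only; the threshold and the linear-Gaussian assumption are mine, not
# the codebase's.
import numpy as np

def isIndependent(X, Y, Z, threshold=0.05):
    X, Y, Z = (np.asarray(v, dtype=float) for v in (X, Y, Z))
    # Residualize X and Y on Z (with an intercept), then check the remaining correlation.
    zDesign = np.column_stack([Z, np.ones(len(Z))])
    resX = X - zDesign @ np.linalg.lstsq(zDesign, X, rcond=None)[0]
    resY = Y - zDesign @ np.linalg.lstsq(zDesign, Y, rcond=None)[0]
    partialCorr = np.corrcoef(resX, resY)[0, 1]
    return abs(partialCorr) < threshold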
def run(dataDefFile, datapoints):
    print('Generating data using: ', dataDefFile)
    fileExt = dataDefFile.split('.')[-1]
    if fileExt == 'py':
        # Data Definition File
        outFile = synthDataGen.run(dataDefFile, datapoints)
    elif fileExt == 'csv':
        # Data file
        outFile = dataDefFile
    else:
        print('*** Invalid file type = ', fileExt)
        return
    d = getData.DataReader(outFile, limit=datapoints)
    print('Vars = ', d.getSeriesNames())
    plotData(d, datapoints)
def CaLingamTest(testType, paramSet):
    global VALIDATION_ERRORS, CAUSAL_ORDERS
    VALIDATION_ERRORS = {}
    CAUSAL_ORDERS = {}
    maxDifficulty = MAX_DIFFICULTY
    fails = 0.0
    datafile, runs, dataPoints, validation = paramSet[:4]
    valData = None
    totalDuration = 0
    datafileRootName = datafile.split('.')[-2].split('\\')[-1]
    #print('dfrn = ', datafileRootName)
    sa = analyzeSEM.SemAnalyzer(datafileRootName)
    difficulty = 0
    diffNorm = 0
    for i in range(runs):
        if i == 0:
            reset = True
        if RESET_COUNT > 0 and i > 0 and i / RESET_COUNT == int(i / RESET_COUNT):
            reset = True
            print()
            print('Previous SEM:')
            print(synthDataGen.getSEM())
            #print('Resetting')
        else:
            reset = False
        if i == 0:
            reset = True
        prune = True
        if validation == 'SynthOrderVal':
            # Suppress pruning in order to increase perf when only testing order.
            prune = False
        if testType == 'synth':
            outFile = synthDataGen.run(datafile, samples=dataPoints, reset=reset, maxDifficulty=maxDifficulty)
            valData = synthDataGen.getValidation()
            if i == 0 or reset:
                saStats = sa.analyzeOne(dataPoints, gen=False, valData=valData)
                difficulty = round(saStats['difficulty'], 2)
                diffNorm = round(saStats['normDifficulty'], 2)
                print('difficulty = ', difficulty, ', norm difficulty = ', diffNorm)
        elif testType == 'live':
            outFile = datafile
        else:
            print('*** Invalid Test Type = ', testType)
            return
        startTime = time.time()
        c = CaLingam.IC(outFile, limit=dataPoints, prune=prune)
        dag = c.getDAG()
        endTime = time.time()
        duration = endTime - startTime
        totalDuration += duration
        if len(validation) > 0:
            result = eval(validation + '(paramSet, dag, valData)')
        else:
            corder = str.join('||', dag.getVarOrder()[:5])
            if corder not in CAUSAL_ORDERS:
                CAUSAL_ORDERS[corder] = 1
                #print('Causal Order = ', dag.getVarOrder())
                # If we get a new causal order after the first time, it is considered an error
                if len(CAUSAL_ORDERS) > 1:
                    result = 0
                else:
                    result = 1
            else:
                # Got an existing causal order. Consider that success
                count = CAUSAL_ORDERS[corder]
                count += 1
                CAUSAL_ORDERS[corder] = count
                result = 1
        if result:
            #print('Success')
            print('.', end='', flush=True)
        else:
            print('x', end='', flush=True)
            fails += 1
    print()
    reliability = round(1.0 - (fails / float(runs)), 2)
    #stats = 'Errors: ' + str(int(fails)) + ' / ' + str(int(i+1)) + ' -- Reliability: ' + str((1.0 - (fails / (i+1))) * 100) + '%'
    stats = ('Errors: ' + str(int(fails)) + ' / ' + str(int(runs)) +
             ' -- Reliability: ' + str(reliability * 100) +
             '% avg duration: ' + str(round(totalDuration / runs, 2)) + ' sec' +
             ' difficulty: ' + str(difficulty) +
             ' diffNorm: ' + str(diffNorm) +
             ' strength: ' + str(round(reliability * diffNorm, 2)))
    print('Stats = ', stats)
    if fails > 0:
        if len(validation) > 0:
            # Sort validation_errors to show the most common first
            counts = []
            keys = []
            for key in VALIDATION_ERRORS.keys():
                count = VALIDATION_ERRORS[key]
                keys.append(key)
                counts.append(count)
            tuples = list(zip(counts, keys))
            tuples.sort()
            tuples.reverse()
            keys = [key for (count, key) in tuples]
            errStrs = []
            for key in keys:
                errStrs.append(key + ':' + str(VALIDATION_ERRORS[key]))
            errStr = str.join(', ', errStrs)
            print('ValidationErrors = ', errStr)
        else:
            maxKey = None
            maxKeyCount = 0
            totalKeyCount = 0
            keys = CAUSAL_ORDERS.keys()
            for key in keys:
                keyCount = CAUSAL_ORDERS[key]
                if keyCount > maxKeyCount:
                    maxKeyCount = keyCount
                    maxKey = key
                totalKeyCount += keyCount
            print('Most Common Order (', maxKeyCount / totalKeyCount * 100, '%) = ', maxKey)
    print()
    print('CausalOrders = ', str(CAUSAL_ORDERS))
    return
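
# A hedged usage sketch for CaLingamTest. The data-definition path, run count and sample
# count below are hypothetical. Passing an empty validation string exercises the
# causal-order consistency check; 'SynthOrderVal' (referenced above) would instead be
# dispatched as a validation function via eval().
def exampleCaLingamTest():
    paramSet = ('models\\example_sem.py', 10, 2000, '')
    CaLingamTest('synth', paramSet)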