Example #1
def getBestModel(filePath, dataPoints):
    # Build NUMBER_OF_MODELS_TO_CONSIDER candidate models and return the one
    # whose DAG equivalence class occurs most often.
    dot_pos = filePath.rfind('.')
    basePath = filePath[:dot_pos]
    ext = filePath[dot_pos + 1:]
    modelDict = {}
    for i in range(NUMBER_OF_MODELS_TO_CONSIDER):
        if ext == 'py':
            #print('Generating data')
            synthDataGen.run(filePath, samples=dataPoints)
        dataFile = basePath + '.csv'

        ic = CaLingam.IC(dataFile, dataPoints)
        model = ic.getModel()
        dagStr = model.getEquivalencyKey()
        if dagStr not in modelDict:
            modelDict[dagStr] = []
        modelDict[dagStr].append(model)
    # Find model based on most common structure
    mostCommon = None
    mostCommonCount = 0
    for modelList in modelDict.values():
        if len(modelList) > mostCommonCount:
            mostCommonCount = len(modelList)
            mostCommon = modelList[0]
    model = mostCommon
    print('Most common model = ', mostCommonCount, '/',
          NUMBER_OF_MODELS_TO_CONSIDER)
    return model
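For reference, the voting step above (keep the model whose equivalency key occurs most often) can be written more compactly. A minimal sketch using collections.Counter; mostCommonModel and keyFunc are hypothetical names, not part of the project:

from collections import Counter

def mostCommonModel(models, keyFunc):
    # Count how often each equivalence key occurs across the candidate models.
    counts = Counter(keyFunc(m) for m in models)
    bestKey, bestCount = counts.most_common(1)[0]
    # Return the first model carrying the winning key, plus its vote count.
    return next(m for m in models if keyFunc(m) == bestKey), bestCount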
Example #2
def run(s1, s2):
    # Score the dependence between series s1 and s2, and between s1 and the
    # residual of the s2-on-s1 regression; track SNR and timing globals.
    global firstTime, CUM_DEPENDENCE
    global SNR, SNR_DB, CUM_TIME
    reset = True
    if firstTime:
        #print('std ratio = ', s1D.std() / s2D.std())
        firstTime = False
    else:
        reset = False
    try:
        synthDataGen.run(FILE + '.py',
                         samples=DATA_POINTS,
                         maxDifficulty=MAX_DIFFICULTY,
                         reset=reset)
    except Exception:
        # Fall back to the existing .csv file if data generation fails.
        pass
    dr = getData.DataReader(input=FILE + '.csv', limit=DATA_POINTS)
    s1D = np.array(dr.getSeries(s1))
    s2D = np.array(dr.getSeries(s2))
    dependence = IndHSIC.scoreDependence(s1D, s2D)
    #dependence = IndLn.scoreDependence(s1D, s2D)
    CUM_DEPENDENCE = CUM_DEPENDENCE + dependence
    #print('s1 = ', s1D.mean(), s1D.std())
    #print('s2 = ', s2D.mean(), s2D.std())
    coefs = np.polyfit(s1D, s2D.T, 1)
    x1coef, x0coef = coefs
    Dprint('coefs = ', coefs)
    cs2D = s1D * x1coef + x0coef

    # coef = lstsq(np.array([list(s1D)]).T, s2D)[0][0]
    # Dprint('coef = ', coef)
    # cs2D = s1D * coef

    res = s2D - cs2D
    signal = cs2D.var(ddof=1)
    noise = res.var(ddof=1)
    snr = signal / noise
    if snr < SNR:
        SNR = snr
        SNR_DB = 10 * math.log(snr, 10)

    start = time.time()
    if IND_TYPE in ('PL', 'CCM'):
        dep = scoreDependence(s1D, s2D)
    else:
        dep = scoreDependence(s1D, res)
    end = time.time()
    duration = end - start
    CUM_TIME += duration
    Dprint('Residual Dependence for ', s1, '-->', s2, ' = ', dep)
    return dep
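The core of the SNR bookkeeping in run() can be reproduced in isolation. A minimal sketch on toy data; signalToNoiseDb is a hypothetical helper, not part of the project:

import math
import numpy as np

def signalToNoiseDb(s1D, s2D):
    # Fit s2 ~ s1 with a degree-1 polynomial, as run() does.
    x1coef, x0coef = np.polyfit(s1D, s2D, 1)
    cs2D = s1D * x1coef + x0coef       # linear (explained) component
    res = s2D - cs2D                   # residual component
    snr = cs2D.var(ddof=1) / res.var(ddof=1)
    return 10 * math.log(snr, 10)      # same dB conversion as SNR_DB

rng = np.random.default_rng(0)
s1 = rng.normal(size=1000)
s2 = 2.0 * s1 + rng.normal(scale=0.5, size=1000)
print(signalToNoiseDb(s1, s2))         # roughly 10 * log10(16), about 12 dB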
Example #3
	def analyzeOneLingam(self, dataCount, gen=True, valData=None):
		# Measure the SNR of every predecessor -> successor edge in the
		# validation data and summarize it as dB-scaled difficulty statistics.
		if gen:
			synthDataGen.run(self.genFile, dataCount)
		dr = getData.DataReader(self.testFile, dataCount)
		vars = dr.getSeriesNames()
		if valData is None:
			valData = synthDataGen.getValidation()
		lowestSNR = 10**100
		cumSNR = 0
		pairCount = 0
		lowestPair = None
		stats = {}
		nodeDiffs = {}
		for successor, predecessors in valData:
			if len(predecessors) == 0:
				continue
			for predecessor in predecessors:
				if predecessor not in nodeDiffs:
					nodeDiffs[predecessor] = {}
				pD = np.array(dr.getSeries(predecessor))
				sD = np.array(dr.getSeries(successor))
				coefs = np.polyfit(pD, sD.T, 1)
				x1coef, x0coef = coefs
				#Dprint('coefs = ', coefs)
				cs2D = pD * x1coef + x0coef
				res = sD - cs2D
				# Note that signal and noise are reversed from what you might expect.
				# In this case, the variance of the error term is the signal we are looking for,
				# while the linear component is actually the noise
				noise = cs2D.var(ddof=1)
				signal = res.var(ddof=1)
				#print('pair = ', predecessor, '--->', successor)
				#print('noise = ', noise)
				#print('signal = ', signal)
				snr = signal / noise
				#print('snr = ', snr)
				nodeDiffs[predecessor][successor] = 10 * math.log(1.0 / snr, 10)
				if snr < lowestSNR:
					lowestSNR = snr
					lowestPair = (predecessor, successor)
				cumSNR += snr
				pairCount += 1
		stats['minSnr'] = 10 * math.log(lowestSNR, 10)
		avgSNR = cumSNR / float(pairCount)
		stats['avgSnr'] = 10 * math.log(avgSNR, 10)
		stats['weakestPair'] = lowestPair
		difficulty = max(10 * math.log(1.0 / lowestSNR, 10), 0.0)
		stats['difficulty'] = difficulty
		stats['weakestPairing'] = lowestPair
		stats['variableDifficulty'] = nodeDiffs
		stats['normDifficulty'] = 100.0 * difficulty / (dataCount ** 0.5)
		return stats
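The difficulty figures returned above are dB-scaled inverse SNRs, floored at zero and normalized by the square root of the sample count. A minimal standalone sketch; difficultyFromSnr is a hypothetical helper, not part of the project:

import math

def difficultyFromSnr(snr, dataCount):
    difficulty = max(10 * math.log(1.0 / snr, 10), 0.0)    # dB of 1/SNR, floored at 0
    normDifficulty = 100.0 * difficulty / (dataCount ** 0.5)
    return difficulty, normDifficulty

print(difficultyFromSnr(0.1, 10000))    # (10.0, 10.0): a weak edge at 10 dB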
Example #4
def run(dataDefFile, var1, var2, datapoints):
	# Plot either a master manifold (three vars at a time) or the shadow
	# (delay-embedded) manifolds of var1 and var2.
	tau = TAU
	print('Generating data using: ', dataDefFile)
	fileExt = dataDefFile.split('.')[-1]
	if fileExt == 'py':
		# Data Definition File
		outFile = synthDataGen.run(dataDefFile, datapoints)
	elif fileExt == 'csv':
		# Data file
		outFile = dataDefFile
	else:
		print('*** Invalid file type = ', fileExt)
		return
	d = getData.DataReader(outFile, limit=datapoints)
	fig = plt.figure()
	ax = fig.add_axes([.1, .1, .8, .8], projection='3d')

	# If var1 and var2 are not specified, build a master manifold out of three
	# vars at a time.  Otherwise, build shadow manifolds from the two specified vars.
	if var1 is None:
		vars = d.getSeriesNames()
		vars.sort()
		print('Vars = ', vars)
		colors = ['b', 'g', 'r', 'y']
		for i in range(4):
			if len(vars) < i * 3 + 3:
				break
			X = d.getSeries(vars[i * 3])
			Y = d.getSeries(vars[i * 3 + 1])
			Z = d.getSeries(vars[i * 3 + 2])
			if standard:
				X = standardize.standardize(X)
				Y = standardize.standardize(Y)
				Z = standardize.standardize(Z)
			color = colors[i]
			ax.plot(X, Y, Z, color)
			
	else:
		var1D = d.getSeries(var1)
		if standard:
			var1D = standardize.standardize(var1D)
		X1 = var1D[:-2*tau]
		Y1 = var1D[tau:-tau]
		Z1 = var1D[2*tau:]

		var2D = d.getSeries(var2)
		if standard:
			var2D = standardize.standardize(var2D)
		X2 = var2D[:-2*tau]
		Y2 = var2D[tau:-tau]
		Z2 = var2D[2*tau:]
		
		#plotData(d, datapoints)
		ax.plot(X1,Y1,Z1)
		ax.plot(X2,Y2,Z2, 'r')
		#ax.plot(X1, Y1, Z2, 'g')
	
	# `ax` is a 3D-aware axis instance because of the projection='3d' keyword
	# argument passed to add_axes above
	#ax = fig.add_subplot(1, 2, 1, projection='3d')
	plt.show()
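The X1/Y1/Z1 and X2/Y2/Z2 coordinates above form a 3-D delay (shadow-manifold) embedding of each series. A minimal sketch of that construction; delayEmbed3 is a hypothetical helper, not part of the project:

import numpy as np

def delayEmbed3(series, tau):
    # Coordinates (x_t, x_{t+tau}, x_{t+2*tau}), matching X1/Y1/Z1 above.
    s = np.asarray(series)
    return s[:-2 * tau], s[tau:-tau], s[2 * tau:]

x = np.sin(np.linspace(0, 20 * np.pi, 2000))
X, Y, Z = delayEmbed3(x, tau=5)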
Example #5
 def calibrateOneCITest(self, testType, filePath):
     # Check one conditional-independence test module against the known
     # dependencies and independencies of a synthetic SEM.
     import importlib
     synthDataGen.run(filePath + '.py', samples=self.datacount)
     module = importlib.import_module(testType)
     SA = analyzeSEM.SemAnalyzer(filePath + '.py', self.datacount)
     reader = getData.DataReader(filePath + '.csv', self.datacount)
     dependencies, independencies = SA.getCondDependencies()
     # print('dependencies = ', dependencies)
     # print('independencies = ', independencies)
     errors = 0
     errorTerms = {}
     items = 0
     for item in dependencies:
         x, y, z = item
         X = reader.getSeries(x)
         Y = reader.getSeries(y)
         Z = reader.getSeries(z)
         ind = module.isIndependent(X, Y, Z)
         if ind:
             print('Error -- ', x, 'and', y, 'Should be dependent given', z)
             self.err1Count += 1
             errors += 1
             errorTerms[item] = 1
         self.testCount += 1
     for item in independencies:
         x, y, z = item
         X = reader.getSeries(x)
         Y = reader.getSeries(y)
         Z = reader.getSeries(z)
         ind = module.isIndependent(X, Y, Z)
         if not ind:
             print('Error -- ', x, 'and', y, 'Should be independent given',
                   z)
             self.err2Count += 1
             errors += 1
             errorTerms[item] = 1
         self.testCount += 1
     #print('Rating = ', (1 - (errors / items))*100, '%')
     print('Errors for file: ', filePath, '=', errors,
           list(errorTerms.keys()))
     return
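The dynamic import at the top of calibrateOneCITest can be isolated into a small helper. A minimal sketch using importlib; loadCITest is a hypothetical name, and the loaded module is only assumed to expose isIndependent(X, Y, Z) as the code above expects:

import importlib

def loadCITest(testType):
    # Import the named conditional-independence test module and verify its entry point.
    module = importlib.import_module(testType)
    if not hasattr(module, 'isIndependent'):
        raise AttributeError(testType + ' does not define isIndependent(X, Y, Z)')
    return module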
Example #6
def run(dataDefFile, datapoints):
    print('Generating data using: ', dataDefFile)
    fileExt = dataDefFile.split('.')[-1]
    if fileExt == 'py':
        # Data Definition File
        outFile = synthDataGen.run(dataDefFile, datapoints)
    elif fileExt == 'csv':
        # Data file
        outFile = dataDefFile
    else:
        print('*** Invalid file type = ', fileExt)
        return
    d = getData.DataReader(outFile, limit=datapoints)
    print('Vars = ', d.getSeriesNames())
    plotData(d, datapoints)
Example #7
def CaLingamTest(testType, paramSet):
    # Run the CaLingam discovery algorithm repeatedly on synthetic or live data
    # and report reliability, timing, and difficulty statistics.
    global VALIDATION_ERRORS, CAUSAL_ORDERS
    VALIDATION_ERRORS = {}
    CAUSAL_ORDERS = {}
    maxDifficulty = MAX_DIFFICULTY
    fails = 0.0
    datafile, runs, dataPoints, validation = paramSet[:4]
    valData = None
    totalDuration = 0
    datafileRootName = datafile.split('.')[-2].split('\\')[-1]
    #print('dfrn = ', datafileRootName)
    sa = analyzeSEM.SemAnalyzer(datafileRootName)
    difficulty = 0
    diffNorm = 0
    for i in range(runs):
        if i == 0:
            reset = True
        elif RESET_COUNT > 0 and i % RESET_COUNT == 0:
            reset = True
            print()
            print('Previous SEM:')
            print(synthDataGen.getSEM())
            #print('Resetting')
        else:
            reset = False
        prune = True
        if validation == 'SynthOrderVal':
            # Suppress pruning in order to increase perf when only testing order.
            prune = False
        if testType == 'synth':
            outFile = synthDataGen.run(datafile,
                                       samples=dataPoints,
                                       reset=reset,
                                       maxDifficulty=maxDifficulty)
            valData = synthDataGen.getValidation()
            if i == 0 or reset:
                saStats = sa.analyzeOne(dataPoints, gen=False, valData=valData)
                difficulty = round(saStats['difficulty'], 2)
                diffNorm = round(saStats['normDifficulty'], 2)
                print('difficulty = ', difficulty, ', norm difficulty = ',
                      diffNorm)
        elif testType == 'live':
            outFile = datafile
        else:
            print('*** Invalid Test Type = ', testType)
            return
        startTime = time.time()
        c = CaLingam.IC(outFile, limit=dataPoints, prune=prune)
        dag = c.getDAG()
        endTime = time.time()
        duration = endTime - startTime
        totalDuration += duration
        if len(validation) > 0:
            result = eval(validation + '(paramSet, dag, valData)')
        else:
            corder = str.join('||', dag.getVarOrder()[:5])
            if corder not in CAUSAL_ORDERS:
                CAUSAL_ORDERS[corder] = 1
                #print('Causal Order = ', dag.getVarOrder())
                # If we get a new causal order after the first time, it is considered an error
                if len(CAUSAL_ORDERS) > 1:
                    result = 0
                else:
                    result = 1
            else:
                # Got an existing causal order.  Consider that success.
                CAUSAL_ORDERS[corder] += 1
                result = 1
        if result:
            #print ('Success', )
            print('.', end='', flush=True)
        else:
            print('x', end='', flush=True)
            fails += 1
    print()
    reliability = round(1.0 - (fails / float(runs)), 2)
    #stats = 'Errors: ' + str(int(fails)) + ' / ' + str(int(i+1)) + ' -- Reliability: ' + str((1.0 - (fails / (i+1))) * 100) + '%'
    stats = ('Errors: ' + str(int(fails)) + ' / ' + str(int(runs)) +
             ' -- Reliability: ' + str(reliability * 100) + '%' +
             ' avg duration: ' + str(round(totalDuration / runs, 2)) + ' sec' +
             ' difficulty: ' + str(difficulty) +
             ' diffNorm: ' + str(diffNorm) +
             ' strength: ' + str(round(reliability * diffNorm, 2)))
    print('Stats = ', stats)
    if fails > 0:
        if len(validation) > 0:
            # Sort validation errors to show the most common first
            tuples = [(count, key) for key, count in VALIDATION_ERRORS.items()]
            tuples.sort()
            tuples.reverse()
            errStrs = [key + ':' + str(count) for count, key in tuples]
            errStr = ', '.join(errStrs)
            print('ValidationErrors = ', errStr)
        else:
            maxKey = None
            maxKeyCount = 0
            totalKeyCount = 0
            keys = CAUSAL_ORDERS.keys()
            for key in keys:
                keyCount = CAUSAL_ORDERS[key]
                if keyCount > maxKeyCount:
                    maxKeyCount = keyCount
                    maxKey = key
                totalKeyCount += keyCount
            print('Most Common Order (', maxKeyCount / totalKeyCount * 100,
                  '%) = ', maxKey)
            print()
            print('CausalOrders = ', str(CAUSAL_ORDERS))
    return