def __init__(self, pileup_data, pileup_var, samplename):
    """Compute per-genotype log-likelihoods for one sample's pileup.

    Adapted from
    https://github.com/indapa/Pgmsnp/blob/master/PythonNotebook/genotypeLikelihoodMatrix.py

    pileup_data -- sequence of
        (sample, readgroup, aligned_read, basecall, bq) tuples, one per read.
    pileup_var  -- variable id given to the likelihood factor.
    samplename  -- stored as ``self.name`` and used as the factor name.
    """
    self.pileup = pileup_data
    # The original assigned the undefined name `name` here.
    self.name = samplename

    # Enumerate every unordered genotype over the four bases (diploid only).
    self.genotypeList = []
    for ploidy in [2]:
        # The original passed the undefined name `p` as the combination size.
        for combo in combinations_with_replacement(['A', 'C', 'G', 'T'], ploidy):
            self.genotypeList.append(Genotype("".join(combo), ploidy))

    # 10 genotypes for diploid A/C/G/T; derived instead of hard-coded.
    num_genotypes = len(self.genotypeList)
    GL = Factor([pileup_var], [num_genotypes], [], samplename)

    # `length` is not a Python builtin; `len` is.
    depth = len(pileup_data)
    # np.zeros takes the shape as ONE tuple; np.zeros(depth, 10) would treat
    # 10 as the dtype argument.
    likelihood_matrix = np.zeros((depth, num_genotypes))
    for i in range(depth):
        (sample, readgroup, aligned_read, basecall, bq) = self.pileup[i]
        for j, genotype in enumerate(self.genotypeList):
            likelihood_matrix[i, j] = genotype.calculateBaseLikelihood(
                basecall, ErrorProb(bq))

    # Reads are combined by summing log-likelihoods down each genotype column.
    genotypelikelihoods = np.sum(np.log(likelihood_matrix), axis=0)
    GL.setVal(genotypelikelihoods.tolist())
    # NOTE(review): GL is never stored on self or returned, so the factor is
    # discarded when __init__ returns -- confirm how callers are meant to
    # obtain it.
def __init__(self, allelefreqs, genotypeVar, name):
    """Build a single-variable genotype factor from allele frequencies.

    allelefreqs -- one frequency per allele; its length is the allele count.
    genotypeVar -- variable id of the genotype variable.
    name        -- name given to the factor.

    Each genotype's value is the product of its two allele frequencies,
    doubled when the two alleles differ.
    """
    self.allelefreq = allelefreqs
    self.allelesToGenotypes = None
    self.genotypesToAlleles = None
    self.genotypeFactor = None

    # Lookup tables between allele pairs and genotype numbers.
    mappers = generateAlleleGenotypeMappers(len(allelefreqs))
    (self.allelesToGenotypes, self.genotypesToAlleles) = mappers
    (ngenos, ploidy) = np.shape(self.genotypesToAlleles)

    # One variable whose cardinality is the number of genotypes.
    self.genotypeFactor = Factor([genotypeVar], [], [], name)
    self.genotypeFactor.setCard([ngenos])

    values = np.zeros((np.prod(self.genotypeFactor.getCard()))).tolist()
    for i in range(ngenos):
        alleles = self.genotypesToAlleles[i, :].tolist()
        freq_product = np.prod([allelefreqs[j] for j in alleles])
        if alleles[0] != alleles[1]:
            # Two orderings of the allele pair map to the same genotype.
            freq_product = freq_product * 2
        values[i] = freq_product

    self.genotypeFactor.setVal(values)
def __init__(self, alphaList, numAlleles, geneCopyVarOne, geneCopyVarTwo,
             phenotypeVar):
    """Build the factor 'phenotype| geneCopy1, geneCopy2'.

    alphaList    -- one probability per genotype; used as-is for phenotype
                    assignment 0 and as ``1 - alpha`` for any other.
    numAlleles   -- cardinality of each of the two gene-copy variables.
    geneCopyVarOne, geneCopyVarTwo, phenotypeVar -- variable ids.
    """
    self.numalleles = numAlleles
    self.alphaList = alphaList
    self.phenotypeFactor = Factor(
        [phenotypeVar, geneCopyVarOne, geneCopyVarTwo], [], [],
        'phenotype| geneCopy1, geneCopy2')
    self.phenotypeFactor.setCard([2, numAlleles, numAlleles])

    card = self.phenotypeFactor.getCard()
    num_assignments = np.prod(card)
    values = np.zeros((1, num_assignments)).flatten().tolist()

    affectedAlphas = alphaList
    unaffectedAlphas = [1 - alpha for alpha in alphaList]

    # Only the allele-pair -> genotype table is needed here.
    (allelesToGenotypes, _) = generateAlleleGenotypeMappers(numAlleles)

    # Subtract 1 so the assignments are zero-based indices.
    assignments = IndexToAssignment(np.arange(num_assignments), card) - 1
    for z in range(num_assignments):
        # Cast to int: the assignment rows are presumably floats (the sibling
        # genotype-given-parents code casts them too), and neither NumPy
        # arrays nor lists accept non-integer indices.
        pheno, copy_one, copy_two = [int(v) for v in assignments[z]]
        genotype_num = int(allelesToGenotypes[copy_one, copy_two])
        if pheno == 0:
            values[z] = affectedAlphas[genotype_num]
        else:
            values[z] = unaffectedAlphas[genotype_num]

    self.phenotypeFactor.setVal(values)
def __init__(self, numAlleles, genotypeVarChild, genotypeVarParentOne,
             genotypeVarParentTwo, name):
    """Build the factor child genotype | parent-one genotype, parent-two genotype.

    Every value is the fraction of the parents' Punnett square (all pairings
    of one allele from each parent) that yields the child genotype.

    numAlleles -- number of alleles at the locus.
    genotypeVarChild, genotypeVarParentOne, genotypeVarParentTwo -- variable ids.
    name -- name given to the factor.
    """
    self.genotypeFactor = Factor(
        [genotypeVarChild, genotypeVarParentOne, genotypeVarParentTwo],
        [], [], name)

    # Lookup tables between allele pairs and genotype numbers.
    (self.allelesToGenotypes,
     self.genotypesToAlleles) = generateAlleleGenotypeMappers(numAlleles)
    (ngenos, ploidy) = np.shape(self.genotypesToAlleles)
    self.genotypeFactor.setCard([ngenos, ngenos, ngenos])

    card = self.genotypeFactor.getCard()
    values = np.zeros((np.prod(card))).tolist()

    # Subtract 1 so the assignments are zero-based indices.
    assignments = IndexToAssignment(np.arange(np.prod(card)), card) - 1
    for z in range(np.prod(card)):
        # Cast to int: the original cast only the child entry, but the parent
        # entries index an array too and NumPy rejects non-integer indices.
        child, parent1, parent2 = [int(v) for v in assignments[z]]
        parent1gametes = self.genotypesToAlleles[parent1, :]
        parent2gametes = self.genotypesToAlleles[parent2, :]

        # Every (allele from parent one, allele from parent two) pairing.
        zygote_list = list(itertools.product(parent1gametes, parent2gametes))

        # Count how many pairings land on each genotype.
        counts = [0.] * ngenos
        for (allele1, allele2) in zygote_list:
            genotype = int(self.allelesToGenotypes[int(allele1), int(allele2)])
            counts[genotype] += 1.

        # Divide by the square's size (4 for diploids) rather than a literal 4.
        values[z] = counts[child] / len(zygote_list)

    self.genotypeFactor.setVal(values)
class ChildCopyGivenFreqFactor(object):
    """Founder haplotype factor whose values are the allele frequencies.

    For a founder, the haplotype is proportional to the given allele
    frequency of the locus. This factor is part of the decoupled Bayesian
    genetic network, along with ChildCopyGivenParentalsFactor.
    """

    def __init__(self, alleleFreqs, geneCopyVar):
        """Create the 'founderHap' factor over geneCopyVar.

        alleleFreqs -- one frequency per allele; used directly as the values.
        geneCopyVar -- variable id of the gene-copy variable.
        """
        numAlleles = len(alleleFreqs)
        self.geneCopyFactor = Factor([geneCopyVar], [], [], 'founderHap')
        self.geneCopyFactor.setCard([numAlleles])
        self.geneCopyFactor.setVal(alleleFreqs)

    def getVar(self):
        """Return the wrapped factor's variables."""
        return self.geneCopyFactor.getVar()

    def getCard(self):
        """Return the wrapped factor's cardinalities."""
        return self.geneCopyFactor.getCard()

    def getVal(self):
        """Return the wrapped factor's values."""
        return self.geneCopyFactor.getVal()

    def getFactor(self):
        """Return the wrapped Factor object."""
        # The original returned the misspelled `self.genCopyFactor`, which
        # does not exist and raised AttributeError.
        return self.geneCopyFactor

    def __str__(self):
        return self.geneCopyFactor.__str__()
def from_xml_file(self, filename):
    """Populate this instance from the children of the file's first <root>.

    An <instruction> child supplies ``self.instruction`` from its ``text``
    attribute, a <world> child is parsed into a new World, and any other
    non-text child is parsed into a new Factor stored as ``self.root``.
    """
    document = minidom.parse(filename)
    root_element = document.getElementsByTagName("root")[0]
    for child in root_element.childNodes:
        # Skip text nodes between elements.
        if child.nodeType == child.TEXT_NODE:
            continue
        tag = child.nodeName
        if tag == "instruction":
            self.instruction = child.getAttribute("text")
            continue
        if tag == "world":
            self.world = World()
            self.world.from_xml(child.toxml())
            continue
        self.root = Factor()
        self.root.from_xml(child.toxml())
class ChildCopyGivenFreqFactor(object):
    """Founder haplotype factor whose values are the allele frequencies.

    For a founder, the haplotype is proportional to the given allele
    frequency of the locus. This factor is part of the decoupled Bayesian
    genetic network, along with ChildCopyGivenParentalsFactor.
    """

    def __init__(self, alleleFreqs, geneCopyVar):
        """Create the 'founderHap' factor over geneCopyVar.

        alleleFreqs -- one frequency per allele; used directly as the values.
        geneCopyVar -- variable id of the gene-copy variable.
        """
        numAlleles = len(alleleFreqs)
        self.geneCopyFactor = Factor([geneCopyVar], [], [], 'founderHap')
        self.geneCopyFactor.setCard([numAlleles])
        self.geneCopyFactor.setVal(alleleFreqs)

    def getVar(self):
        """Return the wrapped factor's variables."""
        return self.geneCopyFactor.getVar()

    def getCard(self):
        """Return the wrapped factor's cardinalities."""
        return self.geneCopyFactor.getCard()

    def getVal(self):
        """Return the wrapped factor's values."""
        return self.geneCopyFactor.getVal()

    def getFactor(self):
        """Return the wrapped Factor object."""
        # The original returned the misspelled `self.genCopyFactor`, which
        # does not exist and raised AttributeError.
        return self.geneCopyFactor

    def __str__(self):
        return self.geneCopyFactor.__str__()
def __init__(self, alphaList, numAlleles, geneCopyVarOne, geneCopyVarTwo,
             phenotypeVar):
    """Build the factor 'phenotype| geneCopy1, geneCopy2'.

    alphaList    -- one probability per genotype; used as-is for phenotype
                    assignment 0 and as ``1 - alpha`` for any other.
    numAlleles   -- cardinality of each of the two gene-copy variables.
    geneCopyVarOne, geneCopyVarTwo, phenotypeVar -- variable ids.
    """
    self.numalleles = numAlleles
    self.alphaList = alphaList
    self.phenotypeFactor = Factor(
        [phenotypeVar, geneCopyVarOne, geneCopyVarTwo], [], [],
        'phenotype| geneCopy1, geneCopy2')
    self.phenotypeFactor.setCard([2, numAlleles, numAlleles])

    card = self.phenotypeFactor.getCard()
    num_assignments = np.prod(card)
    values = np.zeros((1, num_assignments)).flatten().tolist()

    affectedAlphas = alphaList
    unaffectedAlphas = [1 - alpha for alpha in alphaList]

    # Only the allele-pair -> genotype table is needed here.
    (allelesToGenotypes, _) = generateAlleleGenotypeMappers(numAlleles)

    # Subtract 1 so the assignments are zero-based indices.
    assignments = IndexToAssignment(np.arange(num_assignments), card) - 1
    for z in range(num_assignments):
        # Cast to int: the assignment rows are presumably floats (the sibling
        # genotype-given-parents code casts them too), and neither NumPy
        # arrays nor lists accept non-integer indices.
        pheno, copy_one, copy_two = [int(v) for v in assignments[z]]
        genotype_num = int(allelesToGenotypes[copy_one, copy_two])
        if pheno == 0:
            values[z] = affectedAlphas[genotype_num]
        else:
            values[z] = unaffectedAlphas[genotype_num]

    self.phenotypeFactor.setVal(values)
def multiply(factor1, factor2):
    """Return the product of two factors as a new Factor.

    Each factor's array is reshaped to a 5-axis view in which the axis
    returned by getVariableIndex(var) has size 3 when that index is 1 and
    size 2 otherwise; axes for absent variables stay at size 1 so the two
    views broadcast against each other. The squeezed product is returned with
    the higher-rank factor's variables first, followed by the variables only
    the other factor has.
    """
    def _broadcast_view(factor):
        # Build the 5-axis shape for this factor and reshape its array to it.
        shape = [1] * 5
        for var in factor.variables:
            axis = getVariableIndex(var)
            shape[axis] = 3 if axis == 1 else 2
        return factor.array.reshape(tuple(shape))

    # On equal rank, factor1 counts as the large one.
    if factor1.array.ndim >= factor2.array.ndim:
        largeFactor, smallFactor = factor1, factor2
    else:
        largeFactor, smallFactor = factor2, factor1

    view1 = _broadcast_view(factor1)
    view2 = _broadcast_view(factor2)
    soln = np.squeeze(view1 * view2)

    extra_variables = set(smallFactor.variables) - set(largeFactor.variables)
    variables = largeFactor.variables + list(extra_variables)
    return Factor(variables, soln)
def returnNonFoundersFactor(genotypeVarChild, genotypeVarParentOne,
                            genotypeVarParentTwo, values,
                            factorName="child|parent 1, parent2",
                            numAlleles=4):
    """Return the factor pr(offspring_genotype | genotype_mother, genotype_father).

    `values` holds the transition probabilities, which do not change and so
    are computed once by the caller and passed in. This function only names
    the variables and sets the cardinalities from the number of alleles.

    The caller must compute `values` (e.g. with returnPunnetValues) using the
    same numAlleles, otherwise there will be a dimensionality mismatch.
    """
    factor = Factor(
        [genotypeVarChild, genotypeVarParentOne, genotypeVarParentTwo],
        [], values, factorName)

    # Only the genotype count (rows of the genotype -> alleles table) is used.
    (_, genotypesToAlleles) = generateAlleleGenotypeMappers(numAlleles)
    (ngenos, _) = np.shape(genotypesToAlleles)
    factor.setCard([ngenos, ngenos, ngenos])
    return factor
def __init__(self, allelefreqs, genotypeVar, name):
    """Build a single-variable genotype factor from allele frequencies.

    allelefreqs -- one frequency per allele; its length is the allele count.
    genotypeVar -- variable id of the genotype variable.
    name        -- name given to the factor.

    Each genotype's value is the product of its two allele frequencies,
    doubled when the two alleles differ.
    """
    self.allelefreq = allelefreqs
    self.allelesToGenotypes = None
    self.genotypesToAlleles = None
    self.genotypeFactor = None

    # Lookup tables between allele pairs and genotype numbers.
    mappers = generateAlleleGenotypeMappers(len(allelefreqs))
    (self.allelesToGenotypes, self.genotypesToAlleles) = mappers
    (ngenos, ploidy) = np.shape(self.genotypesToAlleles)

    # One variable whose cardinality is the number of genotypes.
    self.genotypeFactor = Factor([genotypeVar], [], [], name)
    self.genotypeFactor.setCard([ngenos])

    values = np.zeros((np.prod(self.genotypeFactor.getCard()))).tolist()
    for i in range(ngenos):
        alleles = self.genotypesToAlleles[i, :].tolist()
        freq_product = np.prod([allelefreqs[j] for j in alleles])
        if alleles[0] != alleles[1]:
            # Two orderings of the allele pair map to the same genotype.
            freq_product = freq_product * 2
        values[i] = freq_product

    self.genotypeFactor.setVal(values)
def __init__(self, numAlleles, geneCopyVarChild, geneCopyHapOne,
             geneCopyHapTwo):
    """Build the factor 'child|hap1,hap2' for inheriting one parental haplotype.

    The value is 1 when both parental haplotypes equal the child's allele,
    0.5 when exactly one does, and 0 otherwise.

    numAlleles -- cardinality of all three variables.
    geneCopyVarChild, geneCopyHapOne, geneCopyHapTwo -- variable ids.
    """
    self.numalleles = numAlleles
    # The original stored geneCopyVarChild here, which looks like a
    # copy-paste slip given the attribute and parameter names.
    self.hapone = geneCopyHapOne
    self.haptwo = geneCopyHapTwo

    self.geneCopyFactor = Factor(
        [geneCopyVarChild, geneCopyHapOne, geneCopyHapTwo], [], [],
        'child|hap1,hap2')
    self.geneCopyFactor.setCard(
        [self.numalleles, self.numalleles, self.numalleles])
    values = np.zeros(
        np.prod([self.numalleles, self.numalleles, self.numalleles])).tolist()

    # `index` walks the flat value list; the child allele (k) varies fastest,
    # and the loops run numAlleles ** 3 times in total.
    # The per-iteration debug `print i, j, k` was removed: it was the only
    # active print among commented-out ones and is a syntax error on Python 3.
    index = 0
    for i in range(numAlleles):          # grand(paternal) haplotype allele
        for j in range(numAlleles):      # grand(maternal) haplotype allele
            for k in range(numAlleles):  # child allele
                if j == k and i == k:
                    values[index] = 1
                elif j == k or i == k:
                    values[index] = .5
                index += 1

    self.geneCopyFactor.setVal(values)
class ChildCopyGivenParentalsFactor(object):
    """De-coupled factor for inheriting one of a parent's two haplotypes.

    Given a parent's two haplotypes, the factor's values are the probability
    of inheriting the (grand)paternal or (grand)maternal haplotype. This
    allows more flexibility in modeling inheritance than clumping a single
    parent's haplotypes into a genotype, i.e. GenotypeGivenParentsFactor.
    """

    def __init__(self, numAlleles, geneCopyVarChild, geneCopyHapOne,
                 geneCopyHapTwo):
        """Build the factor 'child|hap1,hap2'.

        The value is 1 when both parental haplotypes equal the child's
        allele, 0.5 when exactly one does, and 0 otherwise.

        numAlleles -- cardinality of all three variables.
        geneCopyVarChild, geneCopyHapOne, geneCopyHapTwo -- variable ids.
        """
        self.numalleles = numAlleles
        # The original stored geneCopyVarChild here, which looks like a
        # copy-paste slip given the attribute and parameter names.
        self.hapone = geneCopyHapOne
        self.haptwo = geneCopyHapTwo

        self.geneCopyFactor = Factor(
            [geneCopyVarChild, geneCopyHapOne, geneCopyHapTwo], [], [],
            'child|hap1,hap2')
        self.geneCopyFactor.setCard(
            [self.numalleles, self.numalleles, self.numalleles])
        values = np.zeros(
            np.prod([self.numalleles, self.numalleles,
                     self.numalleles])).tolist()

        # `index` walks the flat value list; the child allele (k) varies
        # fastest, and the loops run numAlleles ** 3 times in total.
        # The per-iteration debug `print i, j, k` was removed: it was the
        # only active print among commented-out ones and is a syntax error
        # on Python 3.
        index = 0
        for i in range(numAlleles):          # grand(paternal) haplotype
            for j in range(numAlleles):      # grand(maternal) haplotype
                for k in range(numAlleles):  # child allele
                    if j == k and i == k:
                        values[index] = 1
                    elif j == k or i == k:
                        values[index] = .5
                    index += 1

        self.geneCopyFactor.setVal(values)

    def getVar(self):
        """Return the wrapped factor's variables."""
        return self.geneCopyFactor.getVar()

    def getCard(self):
        """Return the wrapped factor's cardinalities."""
        return self.geneCopyFactor.getCard()

    def getVal(self):
        """Return the wrapped factor's values."""
        return self.geneCopyFactor.getVal()

    def getFactor(self):
        """Return the wrapped Factor object."""
        return self.geneCopyFactor

    def __str__(self):
        return self.geneCopyFactor.__str__()
def ComputeMarginal(V, F, E):
    """Compute the marginal over variables V given factors F and evidence E.

    M = ComputeMarginal(V, F, E) computes the marginal over variables V in
    the distribution induced by the set of factors F, given evidence E.

    V -- the variables kept in the marginal, e.g. [1, 2, 3] for X_1, X_2, X_3.
    F -- list of Factor objects defining the distribution.
    E -- variable/value pairs (variable first, value second); pass [] when
         there is no evidence.

    Returns the result of FactorMarginalization over the normalized,
    evidence-reduced joint. An empty F writes a message to stderr and
    returns an empty Factor.
    """
    totalFactors = len(F)

    # ObserveEvidence needs an N-by-2 array, so force E into two columns
    # (see http://stackoverflow.com/a/12576163).
    EVIDENCE = np.reshape(np.array(E), (-1, 2))

    if totalFactors == 0:
        sys.stderr.write("empty factor list given as input.\n")
        return Factor([], [], [])

    # Union of the variables of every factor
    # (see http://stackoverflow.com/a/2151553).
    per_factor_variables = [factor.getVar().tolist() for factor in F]
    union_variables = set().union(*per_factor_variables)

    # Everything that is not requested in V gets summed out.
    to_sum_out = list(union_variables.difference(V))

    # ObserveEvidence takes and returns a list of factors, hence the
    # wrapping brackets and the trailing [0].
    joint = ComputeJointDistribution(F)
    jointE = ObserveEvidence([joint], EVIDENCE)[0]

    # ObserveEvidence does not renormalize, so do it here.
    jointE_normalizedVal = jointE.getVal() / np.sum(jointE.getVal())
    jointE.setVal(jointE_normalizedVal.tolist())

    return FactorMarginalization(jointE, to_sum_out)
def get_rmw_factor_list(roe_list, stock_return_frame, market_capital_frame):
    """Return a one-column "RMW" DataFrame with one fct.RMW value per row.

    Row i is computed from row i (by position) of roe_list,
    stock_return_frame and market_capital_frame.
    """
    # Start from a zero-filled frame with as many rows as roe_list.
    row_count = len(roe_list.index)
    rmw_factor_list = pd.DataFrame(np.zeros((row_count, 1)), columns=["RMW"])

    for position in range(len(roe_list)):
        roe_row = roe_list.iloc[position]
        return_row = stock_return_frame.iloc[position]
        capital_row = market_capital_frame.iloc[position]
        rmw_factor_list.iloc[position] = fct.RMW(roe_row, return_row,
                                                 capital_row)

    return rmw_factor_list
def test_computeMarginal1(self):
    """computeMarginal over variables [2, 3] with evidence [1, 2]."""
    factor_array = [self.factorA, self.factorB, self.factorC]

    expectedFactor = pgmf.Factor(
        np.array([2, 3]),
        np.array([2, 2]),
        np.array([0.0858, 0.0468, 0.1342, 0.7332]))

    resultFactor = pgmf.computeMarginal(
        np.array([2, 3]), factor_array, np.array([1, 2]))

    # Variables and cardinalities must match exactly; values to 4 decimals.
    np.testing.assert_array_equal(resultFactor.varbs, expectedFactor.varbs)
    np.testing.assert_array_equal(resultFactor.card, expectedFactor.card)
    np.testing.assert_array_almost_equal(
        resultFactor.vals, expectedFactor.vals, decimal=4)
def __init__(self, numAlleles, geneCopyVarChild, geneCopyHapOne,
             geneCopyHapTwo):
    """Build the factor 'child|hap1,hap2' for inheriting one parental haplotype.

    The value is 1 when both parental haplotypes equal the child's allele,
    0.5 when exactly one does, and 0 otherwise.

    numAlleles -- cardinality of all three variables.
    geneCopyVarChild, geneCopyHapOne, geneCopyHapTwo -- variable ids.
    """
    self.numalleles = numAlleles
    # The original stored geneCopyVarChild here, which looks like a
    # copy-paste slip given the attribute and parameter names.
    self.hapone = geneCopyHapOne
    self.haptwo = geneCopyHapTwo

    self.geneCopyFactor = Factor(
        [geneCopyVarChild, geneCopyHapOne, geneCopyHapTwo], [], [],
        'child|hap1,hap2')
    self.geneCopyFactor.setCard(
        [self.numalleles, self.numalleles, self.numalleles])
    values = np.zeros(
        np.prod([self.numalleles, self.numalleles, self.numalleles])).tolist()

    # `index` walks the flat value list; the child allele (k) varies fastest,
    # and the loops run numAlleles ** 3 times in total.
    # The per-iteration debug `print i, j, k` was removed: it was the only
    # active print among commented-out ones and is a syntax error on Python 3.
    index = 0
    for i in range(numAlleles):          # grand(paternal) haplotype allele
        for j in range(numAlleles):      # grand(maternal) haplotype allele
            for k in range(numAlleles):  # child allele
                if j == k and i == k:
                    values[index] = 1
                elif j == k or i == k:
                    values[index] = .5
                index += 1

    self.geneCopyFactor.setVal(values)
def __init__(self, isDominant, genotypeVar, phenotypeVar, name):
    """Build a deterministic 2x3 phenotype-given-genotype factor.

    isDominant   -- 1: phenotype 1 goes with genotype 1 or 2 and the other
                    phenotype with genotype 3; 0: the pairing is reversed.
                    Any other value leaves every entry at 0.
    genotypeVar, phenotypeVar -- variable ids.
    name         -- name given to the factor.

    The finished factor is stored as ``self.phenotype``.
    """
    phenotype = Factor([phenotypeVar, genotypeVar], [2, 3], [], name)
    card = phenotype.getCard()
    phenotype.setVal(np.zeros(np.prod(card)).tolist())

    # One (phenotype, genotype) row per factor entry: 2 x 3 = 6 rows.
    assignments = IndexToAssignment(np.arange(np.prod(card)), card)
    val = np.zeros(np.prod(card))

    for i in range(np.prod([2, 3])):
        (pheno, geno) = assignments[i]
        first_two = (geno == 1 or geno == 2)
        third = (geno == 3)
        if isDominant == 1:
            # Dominant: at least one copy gives the phenotype.
            hit = first_two if pheno == 1 else third
        elif isDominant == 0:
            hit = third if pheno == 1 else first_two
        else:
            hit = False
        if hit:
            val[i] = 1

    phenotype.setVal(val.tolist())
    self.phenotype = phenotype
class ExampleInstance(object):
    """One example: an instruction string, a World and a tree of factors."""

    def __init__(self):
        self.instruction = ""
        self.world = None
        self.root = None

    def from_xml_file(self, filename):
        """Populate this instance from the children of the file's first <root>.

        An <instruction> child supplies ``self.instruction`` from its ``text``
        attribute, a <world> child is parsed into a new World, and any other
        non-text child is parsed into a new Factor stored as ``self.root``.
        """
        document = minidom.parse(filename)
        root_element = document.getElementsByTagName("root")[0]
        for child in root_element.childNodes:
            # Skip text nodes between elements.
            if child.nodeType == child.TEXT_NODE:
                continue
            tag = child.nodeName
            if tag == "instruction":
                self.instruction = child.getAttribute("text")
                continue
            if tag == "world":
                self.world = World()
                self.world.from_xml(child.toxml())
                continue
            self.root = Factor()
            self.root.from_xml(child.toxml())

    def __str__(self):
        header = "[instruction]" + self.instruction + "\n"
        world_text = self.world.__str__()
        root_text = self.root.__str__()
        return header + world_text + root_text

    def get_factors(self):
        """Return the root factor and all its descendants, parents first."""
        collected = []
        self.scan_factor(self.root, collected)
        return collected

    def scan_factor(self, factor, factor_list):
        """Append `factor`, then recursively each of its children, to factor_list."""
        factor_list.append(factor)
        for child in factor.children:
            self.scan_factor(child, factor_list)
def __init__(self, alphaList, phenotypeVar, genotypeVar, name):
    """Build a phenotype-given-genotype factor from per-genotype alphas.

    alphaList -- one probability per genotype. The factor's values are the
                 interleaved pairs (alpha, 1 - alpha), one pair per genotype
                 in order.
    phenotypeVar, genotypeVar -- variable ids.
    name -- name given to the factor.
    """
    self.phenotypeFactor = Factor([phenotypeVar, genotypeVar], [], [], name)
    self.alpha = np.array(alphaList)

    ngenotypes = len(alphaList)
    self.phenotypeFactor.setCard([2, ngenotypes])

    # The original first filled a scratch list with a stride of 1 (so each
    # pair overwrote half of the previous one) and then discarded it; only
    # the correctly interleaved list was ever used.
    values = []
    for alpha in alphaList:
        values.append(alpha)
        values.append(1 - alpha)

    self.phenotypeFactor.setVal(values)
def test_joint1(self):
    """joint() of three small factors matches the expected 2x2x2 table."""
    # Reset the fixture factors.
    self.factorA = pgmf.Factor(
        np.array([1]), np.array([2]), np.array([0.11, 0.89]))
    self.factorB = pgmf.Factor(
        np.array([2, 1]), np.array([2, 2]),
        np.array([0.59, 0.41, 0.22, 0.78]))
    self.factorC = pgmf.Factor(
        np.array([3, 2]), np.array([2, 2]),
        np.array([0.39, 0.61, 0.06, 0.94]))

    factor_array = [self.factorA, self.factorB, self.factorC]

    expectedFactor = pgmf.Factor(
        np.array([1, 2, 3]),
        np.array([2, 2, 2]),
        np.array([0.025311, 0.076362, 0.002706, 0.041652,
                  0.039589, 0.119438, 0.042394, 0.652548]))

    resultFactor = pgmf.joint(factor_array)

    # Variables and cardinalities must match exactly; values to 6 decimals.
    np.testing.assert_array_equal(resultFactor.varbs, expectedFactor.varbs)
    np.testing.assert_array_equal(resultFactor.card, expectedFactor.card)
    np.testing.assert_array_almost_equal(
        resultFactor.vals, expectedFactor.vals, decimal=6)
def __init__(self, isDominant, genotypeVar, phenotypeVar, name):
    """Build a deterministic 2x3 phenotype-given-genotype factor.

    isDominant   -- 1: phenotype 1 goes with genotype 1 or 2 and the other
                    phenotype with genotype 3; 0: the pairing is reversed.
                    Any other value leaves every entry at 0.
    genotypeVar, phenotypeVar -- variable ids.
    name         -- name given to the factor.

    The finished factor is stored as ``self.phenotype``.
    """
    phenotype = Factor([phenotypeVar, genotypeVar], [2, 3], [], name)
    card = phenotype.getCard()
    phenotype.setVal(np.zeros(np.prod(card)).tolist())

    # One (phenotype, genotype) row per factor entry: 2 x 3 = 6 rows.
    assignments = IndexToAssignment(np.arange(np.prod(card)), card)
    val = np.zeros(np.prod(card))

    for i in range(np.prod([2, 3])):
        (pheno, geno) = assignments[i]
        first_two = (geno == 1 or geno == 2)
        third = (geno == 3)
        if isDominant == 1:
            # Dominant: at least one copy gives the phenotype.
            hit = first_two if pheno == 1 else third
        elif isDominant == 0:
            hit = third if pheno == 1 else first_two
        else:
            hit = False
        if hit:
            val[i] = 1

    phenotype.setVal(val.tolist())
    self.phenotype = phenotype
def returnGenotypeGivenParentsFactor(genotypeVarChild, genotypeVarParentOne,
                                     genotypeVarParentTwo,
                                     factorName="child|parent 1, parent2",
                                     numAlleles=4):
    """Return the factor pr(offspring_genotype | genotype_mother, genotype_father).

    Basically this is a Punnett square: every value is the fraction of
    pairings of one allele from each parent that yields the child genotype.

    genotypeVarChild, genotypeVarParentOne, genotypeVarParentTwo -- variable ids.
    factorName -- name given to the factor.
    numAlleles -- number of alleles at the locus.
    """
    f1 = Factor(
        [genotypeVarChild, genotypeVarParentOne, genotypeVarParentTwo],
        [], [], factorName)

    (allelesToGenotypes,
     genotypesToAlleles) = generateAlleleGenotypeMappers(numAlleles)
    (ngenos, ploidy) = np.shape(genotypesToAlleles)
    f1.setCard([ngenos, ngenos, ngenos])

    card = f1.getCard()
    values = np.zeros((np.prod(card))).tolist()

    # Subtract 1 so the assignments are zero-based indices.
    assignments = IndexToAssignment(np.arange(np.prod(card)), card) - 1
    for z in range(np.prod(card)):
        # Cast to int: the original cast only the child entry, but the parent
        # entries index an array too and NumPy rejects non-integer indices.
        child, parent1, parent2 = [int(v) for v in assignments[z]]
        parent1gametes = genotypesToAlleles[parent1, :]
        parent2gametes = genotypesToAlleles[parent2, :]

        # Every (allele from parent one, allele from parent two) pairing.
        zygote_list = list(itertools.product(parent1gametes, parent2gametes))

        # Count how many pairings land on each genotype.
        counts = [0.] * ngenos
        for (allele1, allele2) in zygote_list:
            genotype = int(allelesToGenotypes[int(allele1), int(allele2)])
            counts[genotype] += 1.

        # Divide by the square's size (4 for diploids) rather than a literal 4.
        values[z] = counts[child] / len(zygote_list)

    f1.setVal(values)
    return f1
def main():
    """Run message passing on the ASIA network and write the marginals.

    Reads 'data/ASIA/asia.bif', builds one factor per non-root node, and
    exchanges messages between nodes and factors until the sum of each
    node's first marginal entry changes by less than 1e-5 between sweeps.
    Each node's name and marginal values (in reverse key order) are then
    written to 'results.txt', and each marginal is printed to stdout.
    """
    # `with` closes the input file; the original never closed it.
    with open('data/ASIA/asia.bif') as f:
        BIF = f.readlines()
    BIF = BIFParser.fixWhiteSpace(BIF)
    BN = BIFParser.parseBIF(BIF)

    # One factor per non-root node, over the node followed by its parents.
    factors = []
    for nodes in BN:
        if not nodes.isRoot():
            tempArray = [nodes]
            tempArray.extend(nodes.getParents())
            factors.append(Factor.Factor(nodes.getDist(), tempArray))

    converged = False
    converNum = 0
    while not converged:
        prevConverNum = converNum
        converNum = 0

        # Nodes send their marginals to the factors they belong to.
        for a in BN:
            for f in factors:
                if partOf(a, f):
                    message = a.sendMarginal(f)
                    f.receiveBelief(message, a)

        # Factors send beliefs back to their member nodes.
        for f in factors:
            for a in BN:
                if partOf(a, f):
                    message = f.sendBelief(a)
                    a.receiveMarginal(message, f)

        for a in BN:
            a.updateMarginal()
            marginal = a.getMarginal()
            # list(...) keeps this working where keys() is a view (Python 3).
            converNum += marginal[list(marginal.keys())[0]]

        if np.abs(converNum - prevConverNum) < .00001:
            converged = True

    # `with` guarantees the results file is flushed and closed on error too.
    with open("results.txt", "w") as g:
        for a in BN:
            g.write(a.getName() + " ")
            marginal = a.getMarginal()
            print(marginal)
            for key in reversed(list(marginal.keys())):
                g.write(str(marginal[key]) + " ")
            g.write("\n")
def __init__(self, numAlleles, genotypeVarChild, genotypeVarParentOne,
             genotypeVarParentTwo, name):
    """Build the factor child genotype | parent-one genotype, parent-two genotype.

    Every value is the fraction of the parents' Punnett square (all pairings
    of one allele from each parent) that yields the child genotype.

    numAlleles -- number of alleles at the locus.
    genotypeVarChild, genotypeVarParentOne, genotypeVarParentTwo -- variable ids.
    name -- name given to the factor.
    """
    self.genotypeFactor = Factor(
        [genotypeVarChild, genotypeVarParentOne, genotypeVarParentTwo],
        [], [], name)

    # Lookup tables between allele pairs and genotype numbers.
    (self.allelesToGenotypes,
     self.genotypesToAlleles) = generateAlleleGenotypeMappers(numAlleles)
    (ngenos, ploidy) = np.shape(self.genotypesToAlleles)
    self.genotypeFactor.setCard([ngenos, ngenos, ngenos])

    card = self.genotypeFactor.getCard()
    values = np.zeros((np.prod(card))).tolist()

    # Subtract 1 so the assignments are zero-based indices.
    assignments = IndexToAssignment(np.arange(np.prod(card)), card) - 1
    for z in range(np.prod(card)):
        # Cast to int: the original cast only the child entry, but the parent
        # entries index an array too and NumPy rejects non-integer indices.
        child, parent1, parent2 = [int(v) for v in assignments[z]]
        parent1gametes = self.genotypesToAlleles[parent1, :]
        parent2gametes = self.genotypesToAlleles[parent2, :]

        # Every (allele from parent one, allele from parent two) pairing.
        zygote_list = list(itertools.product(parent1gametes, parent2gametes))

        # Count how many pairings land on each genotype.
        counts = [0.] * ngenos
        for (allele1, allele2) in zygote_list:
            genotype = int(self.allelesToGenotypes[int(allele1), int(allele2)])
            counts[genotype] += 1.

        # Divide by the square's size (4 for diploids) rather than a literal 4.
        values[z] = counts[child] / len(zygote_list)

    self.genotypeFactor.setVal(values)
def ComputeJointDistribution(INPUTS):
    """Compute the joint distribution defined by a set of factors.

    Joint = ComputeJointDistribution(INPUTS)

    INPUTS -- list of Factor objects defining the distribution.

    Returns the factors folded together left to right with FactorProduct.
    A single-element list returns that element unchanged. An empty list
    writes a message to stderr and returns an empty Factor.
    """
    # `reduce` is a builtin only on Python 2; functools.reduce exists on
    # both Python 2.6+ and Python 3.
    from functools import reduce

    totalFactors = len(INPUTS)

    # Check for an empty list of INPUTS.
    if totalFactors == 0:
        sys.stderr.write("Empty factor list given as input\n")
        return Factor([], [], [])

    return reduce(lambda x, y: FactorProduct(x, y), INPUTS)
def __init__(self, alphaList, phenotypeVar, genotypeVar, name):
    """Build a phenotype-given-genotype factor from per-genotype alphas.

    alphaList -- one probability per genotype. The factor's values are the
                 interleaved pairs (alpha, 1 - alpha), one pair per genotype
                 in order.
    phenotypeVar, genotypeVar -- variable ids.
    name -- name given to the factor.
    """
    self.phenotypeFactor = Factor([phenotypeVar, genotypeVar], [], [], name)
    self.alpha = np.array(alphaList)

    ngenotypes = len(alphaList)
    self.phenotypeFactor.setCard([2, ngenotypes])

    # The original first filled a scratch list with a stride of 1 (so each
    # pair overwrote half of the previous one) and then discarded it; only
    # the correctly interleaved list was ever used.
    values = []
    for alpha in alphaList:
        values.append(alpha)
        values.append(1 - alpha)

    self.phenotypeFactor.setVal(values)
def returnGenotypePriorFounderFactor(refbase, factorVar, theta=0.001,
                                     ploidy=2):
    """Return a 'genotypePrior' factor over the A/C/G/T genotypes.

    The original author notes: "Not sure this is right, but it's simple
    enough."

    refbase   -- the reference base.
    factorVar -- variable id of the genotype prior variable.
    theta     -- heterozygosity rate, .001 by default.
    ploidy    -- overwritten by the shape of the genotype table before use,
                 so the passed value has no effect.

    Priors: homozygous reference = 1 - 3 * (theta / 2), heterozygote
    containing the reference = theta, homozygous non-reference = theta / 2,
    anything else = (theta / 2) ** 3.
    """
    bases = ['A', 'C', 'G', 'T']
    numAlleles = len(bases)

    f1 = Factor([factorVar], [], [], 'genotypePrior')
    (allelesToGenotypes,
     genotypesToAlleles) = generateAlleleGenotypeMappers(numAlleles)
    (ngenos, ploidy) = np.shape(genotypesToAlleles)
    f1.setCard([ngenos])

    values = np.zeros((np.prod(f1.getCard()))).tolist()
    for i in range(ngenos):
        genotype = indexToGenotype(i, ''.join(bases))
        (a1, a2) = list(genotype)

        if a1 == a2 and refbase not in genotype:
            # Non-reference homozygote.
            prior = (theta / 2.)
        elif a1 == a2 == refbase:
            # Homozygous reference.
            prior = 1 - (3 * (theta / 2.))
        elif a1 != a2 and refbase in genotype:
            # Heterozygote carrying the reference base.
            prior = theta
        else:
            # Tri-allelic heterozygote.
            prior = np.power([theta / 2], 3).tolist()[0]
        values[i] = prior

    f1.setVal(values)
    return f1
def FactorSum(A, B):
    """Compute the sum of two factors.

    Similar to FactorProduct; used in log space, where multiplication
    becomes addition. Based on
    https://github.com/indapa/PGM/blob/master/Prog4/FactorSum.m

    A, B -- Factor objects. If one has no variables, a message is written to
            stderr and the other is returned unchanged.

    Returns a new Factor over the union of the variables of A and B. Writes
    a message to stderr and exits with status 1 when a shared variable has
    different cardinalities in A and B.
    """
    C = Factor()

    # Check for empty factors.
    if len(A.getVar()) == 0:
        sys.stderr.write("A factor is empty!\n")
        return B
    if len(B.getVar()) == 0:
        sys.stderr.write("B factor is empty!\n")
        return A

    # Variables present in both factors must have the same cardinality.
    intersect = np.intersect1d(A.getVar(), B.getVar()).tolist()
    if len(intersect) > 0:
        iA = getIndex(A.getVar(), intersect)
        iB = getIndex(B.getVar(), intersect)
        # The original compared `a.all() == b.all() == False`, i.e. two
        # scalars, so a mismatch was never detected. Compare element-wise.
        if np.any(A.getCard()[iA] != B.getCard()[iB]):
            sys.stderr.write("dimensionality mismatch in factors!\n")
            sys.exit(1)

    # C's variables are the union of the variables of A and B.
    C.setVar(np.union1d(A.getVar(), B.getVar()).tolist())
    mapA = isMember(A.getVar(), C.getVar())
    mapB = isMember(B.getVar(), C.getVar())

    # Set the cardinality of each variable in C from A and B.
    C.setCard(np.zeros(len(C.getVar())).tolist())
    C.getCard()[mapA] = A.getCard()
    C.getCard()[mapB] = B.getCard()

    # Initialize the values of C to zero.
    C.setVal(np.zeros(np.prod(C.getCard())).tolist())

    # For every assignment of C, find the matching index into A's and B's
    # value vectors.
    assignments = IndexToAssignment(np.arange(np.prod(C.getCard())),
                                    C.getCard())
    indxA = AssignmentToIndex(assignments[:, mapA], A.getCard()) - 1
    indxB = AssignmentToIndex(assignments[:, mapB], B.getCard()) - 1

    # Add the matching entries of A and B.
    c_val = (A.getVal()[indxA.flatten().tolist()]
             + B.getVal()[indxB.flatten().tolist()])
    C.setVal(c_val.tolist())

    return C
def FactorDiv(A, B):
    """Compute the quotient A / B of two factors.

    Works like FactorProduct, except that the aligned values are divided.
    0/0 is meant to yield 0 (see page 365 in Koller and Friedman for the
    definition of factor division); each (numerator, denominator) pair is
    handed to common.zerodiv_tuple, which performs the division.

    A, B -- factor objects exposing getVar(), getCard() and getVal().

    Returns a new Factor over the union of the variables of A and B. If one
    operand has an empty scope, a message is written to stderr and the other
    operand is returned unchanged.

    Exits the process with status 1 when a variable shared by A and B has a
    different cardinality in each.
    """
    C = Factor()

    # an empty factor leaves the other operand untouched
    if len(A.getVar()) == 0:
        sys.stderr.write("A factor is empty!\n")
        return B
    if len(B.getVar()) == 0:
        sys.stderr.write("B factor is empty!\n")
        return A

    # Variables present in both factors must agree on cardinality.
    # The previous test, `x.all() == y.all() == False`, collapsed each side
    # to a single bool inside a chained comparison and therefore never fired.
    # Compare the cardinalities per shared variable instead.
    intersect = np.intersect1d(A.getVar(), B.getVar()).tolist()
    if len(intersect) > 0:
        cardOfA = dict(zip(np.asarray(A.getVar()).tolist(),
                           np.asarray(A.getCard()).tolist()))
        cardOfB = dict(zip(np.asarray(B.getVar()).tolist(),
                           np.asarray(B.getCard()).tolist()))
        if any(cardOfA[v] != cardOfB[v] for v in intersect):
            sys.stderr.write("dimensionality mismatch in factors!\n")
            sys.exit(1)

    # the scope of C is the union of the two scopes
    C.setVar(np.union1d(A.getVar(), B.getVar()).tolist())
    mapA = isMember(A.getVar(), C.getVar())
    mapB = isMember(B.getVar(), C.getVar())

    # copy the cardinalities of A and B into their positions within C
    C.setCard(np.zeros(len(C.getVar())).tolist())
    C.getCard()[mapA] = A.getCard()
    C.getCard()[mapB] = B.getCard()

    # start from an all-zero value table
    C.setVal(np.zeros(np.prod(C.getCard())).tolist())

    # every assignment of C, projected back onto the scopes of A and B;
    # the trailing -1 turns the helper's result into an index into getVal()
    assignments = IndexToAssignment(np.arange(np.prod(C.getCard())),
                                    C.getCard())
    indxA = AssignmentToIndex(assignments[:, mapA], A.getCard()) - 1
    indxB = AssignmentToIndex(assignments[:, mapB], B.getCard()) - 1

    numerator = A.getVal()[indxA.flatten().tolist()]
    denominator = B.getVal()[indxB.flatten().tolist()]

    # build a real list: a bare map() is a one-shot iterator on Python 3
    val = [common.zerodiv_tuple(pair) for pair in zip(numerator, denominator)]
    C.setVal(val)
    return C
class GenotypeGivenParentsFactor(object):
    """Factor holding the probability of a child's genotype given both
    parents: Pr(g_child | g_mother, g_father).

    The wrapped Factor is stored in self.genotypeFactor; the accessor
    methods simply delegate to it.
    """

    def __init__(self, numAlleles, genotypeVarChild, genotypeVarParentOne,
                 genotypeVarParentTwo, name):
        """Build the value table from Punnett-square crosses.

        numAlleles           -- passed to generateAlleleGenotypeMappers
        genotypeVarChild     -- variable of the child's genotype
        genotypeVarParentOne -- variable of the first parent's genotype
        genotypeVarParentTwo -- variable of the second parent's genotype
        name                 -- name given to the underlying Factor
        """
        self.genotypeFactor = Factor(
            [genotypeVarChild, genotypeVarParentOne, genotypeVarParentTwo],
            [], [], name)

        # map alleles to genotypes and genotypes to alleles
        (self.allelesToGenotypes,
         self.genotypesToAlleles) = generateAlleleGenotypeMappers(numAlleles)
        (ngenos, ploidy) = np.shape(self.genotypesToAlleles)
        self.genotypeFactor.setCard([ngenos, ngenos, ngenos])

        ntable = np.prod(self.genotypeFactor.getCard())
        # set the values to zero initially
        values = np.zeros(ntable).tolist()

        # one row per joint assignment (child, parent one, parent two),
        # shifted to zero-based
        assignments = IndexToAssignment(
            np.arange(ntable), self.genotypeFactor.getCard()) - 1

        for z in range(ntable):
            curr_assign = assignments[z]
            # Cast every index to int: the child index was already cast,
            # the parent indices were not, and NumPy rejects non-integer
            # indices. (presumably the helper can return floats -- confirm)
            childAssignment = int(curr_assign[0])
            parent1gametes = self.genotypesToAlleles[int(curr_assign[1]), :]
            parent2gametes = self.genotypesToAlleles[int(curr_assign[2]), :]

            # every pairing of one allele from each parent
            zygote_list = list(
                itertools.product(parent1gametes, parent2gametes))
            punnet_freq = [
                int(self.allelesToGenotypes[int(zygote[0]), int(zygote[1])])
                for zygote in zygote_list
            ]

            # count how often each genotype occurs among the crosses
            histc = dict.fromkeys(range(ngenos), 0.)
            for x in punnet_freq:
                histc[x] += 1.

            # normalise by the number of crosses (4 for two alleles per
            # parent) rather than by a hard-coded constant
            values[z] = histc[childAssignment] / float(len(zygote_list))

        self.genotypeFactor.setVal(values)

    def getVar(self):
        """Return the variables of the underlying factor."""
        return self.genotypeFactor.getVar()

    def getCard(self):
        """Return the cardinalities of the underlying factor."""
        return self.genotypeFactor.getCard()

    def getVal(self):
        """Return the value table of the underlying factor."""
        return self.genotypeFactor.getVal()

    def setVal(self, val):
        """Replace the value table of the underlying factor."""
        self.genotypeFactor.setVal(val)

    def getFactor(self):
        """Return the underlying Factor object."""
        return self.genotypeFactor

    def genotypeSlice(self):
        """Placeholder; not implemented."""
        pass
        #see this http://stackoverflow.com/q/4257394/1735942

    def __str__(self):
        return self.genotypeFactor.__str__()
def FactorMaxMarginalization(A, V):
    """Max out the variables in V from factor A.

    The result is a factor over the variables of A that are not in V. This
    mirrors FactorMarginalization, but each group of entries is reduced with
    np.max (handed to accum) instead of being summed.
    See section 13.2 in Koller and Friedman for more information.

    Returns A itself when A has no variables or V is empty. When no variable
    would remain, a message is written to stderr and the maximum of A's
    values is returned instead of a factor.
    """
    result = Factor()

    # nothing to max out
    if len(A.getVar()) == 0 or len(V) == 0:
        return A

    # scope of the result and where those variables sit inside A
    keptVars = np.setdiff1d(A.getVar(), V)
    keptMap = isMember(keptVars, A.getVar())

    if len(keptVars) == 0:
        sys.stderr.write("FactorMaxMarginalization: Error, resultant factor has empty scope...\n")
        return np.max(A.getVal())

    # scope, cardinality and a zeroed value table for the result
    result.setVar(keptVars.tolist())
    result.setCard(A.getCard()[keptMap])
    zeroTable = np.zeros(np.prod(result.getCard()))
    result.setVal(zeroTable.tolist())

    # all assignments of A; groupOf says which entry of the result each
    # entry of A is folded into
    allAssignments = IndexToAssignment(np.arange(np.prod(A.getCard())),
                                       A.getCard())
    groupOf = AssignmentToIndex(allAssignments[:, keptMap],
                                result.getCard()) - 1

    # reduce each group with np.max
    maxima = accum(groupOf, A.getVal(), np.max)
    result.setVal(maxima.tolist())
    return result
def LogFactor(F):
    """Return a new factor with the same variables, cardinalities and name
    as F, whose values are the natural log of F's values."""
    variables = F.getVar().tolist()
    cardinalities = F.getCard().tolist()
    logValues = np.log(F.getVal()).tolist()
    return Factor(variables, cardinalities, logValues, F.getName())
class GenotypeGivenParentsFactor(object):
    """Factor holding the probability of a child's genotype given both
    parents: Pr(g_child | g_mother, g_father).

    The wrapped Factor is stored in self.genotypeFactor; the accessor
    methods simply delegate to it.
    """

    def __init__(self, numAlleles, genotypeVarChild, genotypeVarParentOne,
                 genotypeVarParentTwo, name):
        """Build the value table from Punnett-square crosses.

        numAlleles           -- passed to generateAlleleGenotypeMappers
        genotypeVarChild     -- variable of the child's genotype
        genotypeVarParentOne -- variable of the first parent's genotype
        genotypeVarParentTwo -- variable of the second parent's genotype
        name                 -- name given to the underlying Factor
        """
        self.genotypeFactor = Factor(
            [genotypeVarChild, genotypeVarParentOne, genotypeVarParentTwo],
            [], [], name)

        # map alleles to genotypes and genotypes to alleles
        (self.allelesToGenotypes,
         self.genotypesToAlleles) = generateAlleleGenotypeMappers(numAlleles)
        (ngenos, ploidy) = np.shape(self.genotypesToAlleles)
        self.genotypeFactor.setCard([ngenos, ngenos, ngenos])

        ntable = np.prod(self.genotypeFactor.getCard())
        # set the values to zero initially
        values = np.zeros(ntable).tolist()

        # one row per joint assignment (child, parent one, parent two),
        # shifted to zero-based
        assignments = IndexToAssignment(
            np.arange(ntable), self.genotypeFactor.getCard()) - 1

        for z in range(ntable):
            curr_assign = assignments[z]
            # Cast every index to int: the child index was already cast,
            # the parent indices were not, and NumPy rejects non-integer
            # indices. (presumably the helper can return floats -- confirm)
            childAssignment = int(curr_assign[0])
            parent1gametes = self.genotypesToAlleles[int(curr_assign[1]), :]
            parent2gametes = self.genotypesToAlleles[int(curr_assign[2]), :]

            # every pairing of one allele from each parent
            zygote_list = list(
                itertools.product(parent1gametes, parent2gametes))
            punnet_freq = [
                int(self.allelesToGenotypes[int(zygote[0]), int(zygote[1])])
                for zygote in zygote_list
            ]

            # count how often each genotype occurs among the crosses
            histc = dict.fromkeys(range(ngenos), 0.)
            for x in punnet_freq:
                histc[x] += 1.

            # normalise by the number of crosses (4 for two alleles per
            # parent) rather than by a hard-coded constant
            values[z] = histc[childAssignment] / float(len(zygote_list))

        self.genotypeFactor.setVal(values)

    def getVar(self):
        """Return the variables of the underlying factor."""
        return self.genotypeFactor.getVar()

    def getCard(self):
        """Return the cardinalities of the underlying factor."""
        return self.genotypeFactor.getCard()

    def getVal(self):
        """Return the value table of the underlying factor."""
        return self.genotypeFactor.getVal()

    def setVal(self, val):
        """Replace the value table of the underlying factor."""
        self.genotypeFactor.setVal(val)

    def getFactor(self):
        """Return the underlying Factor object."""
        return self.genotypeFactor

    def genotypeSlice(self):
        """Placeholder; not implemented."""
        pass
        #see this http://stackoverflow.com/q/4257394/1735942

    def __str__(self):
        return self.genotypeFactor.__str__()
def FactorMarginalization(A, V):
    """Sum the variables in V out of factor A.

    B = FactorMarginalization(A, V) is the factor over the variables of A
    that are not in V, with each entry being the sum of the entries of A
    that agree with it.

    A factor carries three pieces of data:
      var   variables in the factor, e.g. [1 2 3]
      card  cardinalities corresponding to var, e.g. [2 2 2]
      val   value table of size prod(card)

    Returns A itself when A has no variables or V is empty. When no variable
    would remain, a message is written to stderr and None is returned.

    See also FactorProduct, IndexToAssignment and AssignmentToIndex.
    Based on matlab code found here:
    https://github.com/indapa/PGM/blob/master/Prog1/FactorMarginalization.m
    """
    marginal = Factor()

    # nothing to sum out
    if len(A.getVar()) == 0 or len(V) == 0:
        return A

    # the scope of the result is the set difference A.var \ V;
    # keptMap locates those variables inside A
    keptVars = np.array(list(set(A.getVar()) - set(V)))
    keptMap = isMember(keptVars, A.getVar())

    if len(keptVars) == 0:
        sys.stderr.write(
            "FactorMarginalization:Error, resultant factor has empty scope...\n")
        return None

    # scope, cardinality and a zeroed value table for the result
    marginal.setVar(keptVars.tolist())
    marginal.setCard(A.getCard()[keptMap])
    zeroTable = np.zeros(np.prod(marginal.getCard()))
    marginal.setVal(zeroTable.tolist())

    # all assignments of A; groupOf says which entry of the result each
    # entry of A is added into
    allAssignments = IndexToAssignment(np.arange(np.prod(A.getCard())),
                                       A.getCard())
    groupOf = AssignmentToIndex(allAssignments[:, keptMap],
                                marginal.getCard()) - 1

    # accum is a numpy take on matlab's accumarray: it combines the data in
    # each group, the groups being given by groupOf
    # see http://blogs.mathworks.com/loren/2008/02/20/under-appreciated-accumarray/
    summed = accum(groupOf, A.getVal())
    marginal.setVal(summed.tolist())
    return marginal
def __init__(self, name, desc, factors, weights=None, univPP=None):
    '''
    initialize the composite factor by specifying the calculator

    :param name: passed through to Factor.__init__
    :param desc: passed through to Factor.__init__
    :param factors: a list of Factor or CompositeFactor objects
    :param weights: could be None, a list of float, a pd.Series, or a function.
           If it is None, then equal weights will be applied.
           If it is a list of float, it must have the same length as factors,
           must not contain negative values, and is rescaled to sum to 1 when
           its sum is off by more than 0.01.
           If it is a pd.Series it is indexed by date to obtain the weights
           (dynamic weights, for backtest -- still a todo upstream).
           If it is a function it is called as weights(stockID, date)
           (dynamic weights, for backtest & live trading -- still a todo).
    :param univPP: passed through to Factor.__init__
    :return: a composite factor object
    '''
    self.NumFactors = len(factors)
    self.Factors = factors

    if weights is None:
        # equal weights; an empty factor list simply gets no weights
        if self.NumFactors > 0:
            weights = [1.0 / self.NumFactors] * self.NumFactors
        else:
            weights = []
        self.Weights = weights
    elif isinstance(weights, list):
        if len(weights) != self.NumFactors:
            print("ERROR: the length of weights is NOT equal to the number of factors")
            exit(-1)
        # `weights[weights > 0]` compared the list itself with 0 and never
        # inspected the individual weights
        if any(w < 0 for w in weights):
            print("ERROR: Negative weights")
            exit(-1)
        tot = math.fsum(weights)
        if math.fabs(tot - 1) > 0.01:
            # math.floor takes a single argument; round() gives 2 decimals
            print("WARN: the sum of weights " + str(round(tot, 2)) +
                  " is not equal to 1. ReWeight to 1.")
            # a list cannot be multiplied by a float; rescale element-wise
            weights = [w * 1.0 / tot for w in weights]
        self.Weights = weights
    elif isinstance(weights, pd.Series):
        # todo dynamic weights, for backtest
        # stored so that myCalc can look the weights up by date
        self.Weights = weights
    elif isfunction(weights):
        # todo dynamic weights, for backtest & live trading,
        # function arg is StockID,date,factorID
        # stored so that myCalc can call it
        self.Weights = weights
    else:
        print("Unknown weights type: " + str(type(weights)))
        exit(-1)

    def myCalc(stockID, date):
        '''
        aggregate factor scores with linear weights, where weights might be
        a list, or a Series, or a function that takes stockID and date, and
        return a composite score for the stock on that date

        :param stockID: wind stock id
        :param date: DateTime or date string in a format of 'yyyymmdd'
        :return: a float; NaN once more than 0.8 of the total weight belongs
                 to factors whose score is NaN
        '''
        wts = None
        if isinstance(self.Weights, list):
            wts = self.Weights
        elif isinstance(self.Weights, pd.Series):
            wts = self.Weights[date]
        elif isfunction(self.Weights):
            wts = self.Weights(stockID, date)

        compScore = 0
        nanWts = 0  # accumulated weight of the factors that scored NaN
        for i in range(self.NumFactors):
            score = wts[i] * self.Factors[i].GetScore(stockID, date)
            if np.isnan(score):
                # a missing score contributes nothing but is tracked
                score = 0
                nanWts += wts[i]
            if nanWts > 0.8:
                # too little information left: the composite is NaN
                compScore = np.nan
            elif nanWts > 0.5:
                log.debug('the total weights with NaN score is ' + str(nanWts))
                compScore += score
            else:
                compScore += score
        return compScore

    Factor.__init__(self, name, desc, myCalc, univPP)
class PhenotypeGivenGenotypeFactor(object):
    """Factor of phenotype | genotype.

    alphaList[i] is the probability of being affected given the ith
    genotype. The wrapped Factor is stored in self.phenotypeFactor; the
    accessor methods simply delegate to it.
    """

    def __init__(self, alphaList, phenotypeVar, genotypeVar, name):
        """Build the factor over [phenotypeVar, genotypeVar].

        alphaList    -- one probability per genotype
        phenotypeVar -- variable of the phenotype (cardinality 2)
        genotypeVar  -- variable of the genotype (cardinality len(alphaList))
        name         -- name given to the underlying Factor
        """
        self.phenotypeFactor = Factor([phenotypeVar, genotypeVar], [], [], name)
        self.alpha = np.array(alphaList)
        self.phenotypeFactor.setCard([2, len(alphaList)])

        # For each genotype store alpha followed by 1 - alpha.
        # An earlier fill loop wrote overlapping slots (i and i + 1) and its
        # result was discarded, so it has been removed.
        values = []
        for alpha in alphaList:
            values.append(alpha)
            values.append(1 - alpha)

        self.phenotypeFactor.setVal(values)

    def getVar(self):
        """Return the variables of the underlying factor."""
        return self.phenotypeFactor.getVar()

    def getCard(self):
        """Return the cardinalities of the underlying factor."""
        return self.phenotypeFactor.getCard()

    def getVal(self):
        """Return the value table of the underlying factor."""
        return self.phenotypeFactor.getVal()

    def setVal(self, val):
        """Replace the value table of the underlying factor."""
        self.phenotypeFactor.setVal(val)

    def getFactor(self):
        """Return the underlying Factor object."""
        return self.phenotypeFactor

    def __str__(self):
        return self.phenotypeFactor.__str__()
def eliminateVar(self, Z, E, factorList):
    """Eliminate variable Z and record the resulting clique.

    Based on https://github.com/indapa/PGM/blob/master/Prog4/EliminateVar.m
    and the Sum-product VE pseudo code in Koller and Friedman page 298.

    Z          -- the variable to be eliminated (one-based)
    E          -- numpy 2d adjacency matrix of the variables; it represents
                  the induced VE graph and is modified in place
    factorList -- list of factors

    Returns (E, factorList): the updated adjacency matrix and the new factor
    list, i.e. the factors that do not mention Z followed by the factor
    obtained by multiplying the ones that do and summing Z out.

    Also updates self.nodeList, self.factorInds and self.edges, which hold
    the clique tree under construction.
    """
    useFactors = []  # indices of the factors that contain the variable Z
    scope = []       # union of the variables of those factors

    for i in range(len(factorList)):
        factorVars = factorList[i].getVar().tolist()
        if Z in factorVars:
            # the ith factor takes part in this elimination
            useFactors.append(i)
            scope = list(set(scope).union(factorVars))

    # update edge map
    # These are the induced edges of the VE graph. Eliminating Z creates a
    # new factor over `scope`, which introduces fill edges between all of
    # its variables (see pg. 307 Koller and Friedman). Variables are one
    # based while the indices of E are zero based, hence the -1.
    for i in range(len(scope)):
        for j in range(len(scope)):
            if i != j:
                E[scope[i] - 1, scope[j] - 1] = 1
                E[scope[j] - 1, scope[i] - 1] = 1
    # once Z is eliminated, its edges are removed from the graph
    E[Z - 1, :] = 0
    E[:, Z - 1] = 0

    # indices of factorList which are not involved in this elimination,
    # in ascending order
    unusedFactors = [i for i in range(len(factorList))
                     if i not in useFactors]

    # newF is the new factor list, populated first with the unused factors;
    # newmap[k] is the new position of the kth old factor. Starting from an
    # empty list (rather than None) lets the new factor be appended even
    # when every factor contained Z.
    newF = []
    newmap = []
    if len(unusedFactors) > 0:
        newmap = np.zeros(max(unusedFactors) + 1, dtype=int).tolist()
        for i in range(len(unusedFactors)):
            newF.append(factorList[unusedFactors[i]])
            newmap[unusedFactors[i]] = i

    # multiply together all the factors that contain the variable Z
    newFactor = Factor([], [], [], 'newFactor')
    for i in range(len(useFactors)):
        newFactor = FactorProduct(newFactor, factorList[useFactors[i]])

    # marginalize Z out and append the result to the new factor list
    newFactor = FactorMarginalization(newFactor, [Z])
    if newFactor is not None:
        newF.append(newFactor)

    # keep the incoming list when there is nothing to replace it with
    if len(newF) > 0:
        factorList = newF

    ########################################################################
    # the remaining code builds the edges of the clique tree
    #
    # Add a new node with the variables of the factors that contained Z.
    # The scope of every factor generated during variable elimination is a
    # clique (pg. 309 Koller & Friedman).
    self.nodeList.append(scope)

    # newC is the total number of nodes in the clique tree
    newC = len(self.nodeList)

    # one-based position, within the new factor list, of the factor
    # produced by this node
    self.factorInds.append(len(unusedFactors) + 1)

    # update the edges of the clique tree
    for i in range(newC - 1):
        # the values in factorInds are one-based, hence the -1
        if self.factorInds[i] - 1 in useFactors:
            # node i's factor was consumed here: link it to the new node
            self.edges[i, newC - 1] = 1
            self.edges[newC - 1, i] = 1
            self.factorInds[i] = 0
        else:
            if self.factorInds[i] != 0:
                if len(unusedFactors) > 0:
                    # re-point node i at its factor's new position
                    self.factorInds[i] = newmap[self.factorInds[i] - 1] + 1

    return E, factorList
def IdentityFactor(F):
    """Return a factor with the variables and cardinalities of F whose
    values are all ones; its name is F's name with '_identity' appended."""
    variables = F.getVar().tolist()
    cardinalities = F.getCard().tolist()
    ones = np.ones(np.prod(F.getCard()))
    identityName = F.getName() + '_identity'
    return Factor(variables, cardinalities, ones, identityName)
class phenotypeGivenHaplotypesFactor(object):
    """Factor representing Pr(phenotype | paternal haplotype, maternal
    haplotype).

    Very similar to PhenotypeGivenGenotypeFactor, but de-coupled into the
    paternal and maternal alleles rather than the genotype. The wrapped
    Factor is stored in self.phenotypeFactor.
    """

    def __init__(self, alphaList, numAlleles, geneCopyVarOne, geneCopyVarTwo,
                 phenotypeVar):
        """Build the factor over [phenotypeVar, geneCopyVarOne,
        geneCopyVarTwo].

        alphaList      -- alphaList[g] is used for phenotype assignment 0
                          and 1 - alphaList[g] otherwise, where g is the
                          genotype formed by the two gene copies
        numAlleles     -- cardinality of each gene copy variable
        geneCopyVarOne -- variable of the first gene copy
        geneCopyVarTwo -- variable of the second gene copy
        phenotypeVar   -- variable of the phenotype (cardinality 2)
        """
        self.numalleles = numAlleles
        self.alphaList = alphaList
        self.phenotypeFactor = Factor(
            [phenotypeVar, geneCopyVarOne, geneCopyVarTwo], [], [],
            'phenotype| geneCopy1, geneCopy2')
        self.phenotypeFactor.setCard([2, numAlleles, numAlleles])

        ntable = np.prod(self.phenotypeFactor.getCard())
        values = np.zeros((1, ntable)).flatten().tolist()

        affectedAlphas = alphaList
        unaffectedAlphas = [1 - alpha for alpha in alphaList]

        (allelesToGenotypes,
         genotypesToAlleles) = generateAlleleGenotypeMappers(numAlleles)

        # one row per joint assignment (phenotype, copy one, copy two),
        # shifted to zero-based
        assignments = IndexToAssignment(
            np.arange(ntable), self.phenotypeFactor.getCard()) - 1

        for z in range(ntable):
            curr_assign = assignments[z]
            # cast to int before indexing: neither NumPy arrays nor lists
            # accept a non-integer index
            # (presumably the helpers can return floats -- confirm)
            genotype_num = int(allelesToGenotypes[int(curr_assign[1]),
                                                  int(curr_assign[2])])
            if curr_assign[0] == 0:
                values[z] = affectedAlphas[genotype_num]
            else:
                values[z] = unaffectedAlphas[genotype_num]

        self.phenotypeFactor.setVal(values)

    def getVar(self):
        """Return the variables of the underlying factor."""
        return self.phenotypeFactor.getVar()

    def getCard(self):
        """Return the cardinalities of the underlying factor."""
        return self.phenotypeFactor.getCard()

    def getVal(self):
        """Return the value table of the underlying factor."""
        return self.phenotypeFactor.getVal()

    def getFactor(self):
        """Return the underlying Factor object."""
        return self.phenotypeFactor

    def __str__(self):
        return self.phenotypeFactor.__str__()
def __init__(self, alleleFreqs, geneCopyVar):
    """Create the 'founderHap' factor over geneCopyVar.

    alleleFreqs -- used as the factor's values; its length is the
                   cardinality of geneCopyVar
    geneCopyVar -- the single variable of the factor
    """
    alleleCount = len(alleleFreqs)
    founder = Factor([geneCopyVar], [], [], 'founderHap')
    self.geneCopyFactor = founder
    # one state per allele, valued by the frequencies handed in
    founder.setCard([alleleCount])
    founder.setVal(alleleFreqs)
def FactorMaxMarginalization(A, V):
    """Max out the variables in V from factor A.

    The result is a factor over the variables of A that are not in V. This
    mirrors FactorMarginalization, but each group of entries is reduced with
    np.max (handed to accum) instead of being summed.
    See section 13.2 in Koller and Friedman for more information.

    Returns A itself when A has no variables or V is empty. When no variable
    would remain, a message is written to stderr and the maximum of A's
    values is returned instead of a factor.
    """
    result = Factor()

    # nothing to max out
    if len(A.getVar()) == 0 or len(V) == 0:
        return A

    # scope of the result and where those variables sit inside A
    keptVars = np.setdiff1d(A.getVar(), V)
    keptMap = isMember(keptVars, A.getVar())

    if len(keptVars) == 0:
        sys.stderr.write(
            "FactorMaxMarginalization: Error, resultant factor has empty scope...\n")
        return np.max(A.getVal())

    # scope, cardinality and a zeroed value table for the result
    result.setVar(keptVars.tolist())
    result.setCard(A.getCard()[keptMap])
    zeroTable = np.zeros(np.prod(result.getCard()))
    result.setVal(zeroTable.tolist())

    # all assignments of A; groupOf says which entry of the result each
    # entry of A is folded into
    allAssignments = IndexToAssignment(np.arange(np.prod(A.getCard())),
                                       A.getCard())
    groupOf = AssignmentToIndex(allAssignments[:, keptMap],
                                result.getCard()) - 1

    # reduce each group with np.max
    maxima = accum(groupOf, A.getVal(), np.max)
    result.setVal(maxima.tolist())
    return result
def FactorSum(A, B):
    """Compute the sum of two factors.

    Works like FactorProduct, except that the aligned values of A and B are
    added instead of multiplied. Intended for log space, where multiplication
    becomes addition. Based on
    https://github.com/indapa/PGM/blob/master/Prog4/FactorSum.m

    A, B -- factor objects exposing getVar(), getCard() and getVal().

    Returns a new Factor over the union of the variables of A and B. If one
    operand has an empty scope, a message is written to stderr and the other
    operand is returned unchanged.

    Exits the process with status 1 when a variable shared by A and B has a
    different cardinality in each.
    """
    C = Factor()

    # an empty factor leaves the other operand untouched
    if len(A.getVar()) == 0:
        sys.stderr.write("A factor is empty!\n")
        return B
    if len(B.getVar()) == 0:
        sys.stderr.write("B factor is empty!\n")
        return A

    # Variables present in both factors must agree on cardinality.
    # The previous test, `x.all() == y.all() == False`, collapsed each side
    # to a single bool inside a chained comparison and therefore never fired.
    # Compare the cardinalities per shared variable instead.
    intersect = np.intersect1d(A.getVar(), B.getVar()).tolist()
    if len(intersect) > 0:
        cardOfA = dict(zip(np.asarray(A.getVar()).tolist(),
                           np.asarray(A.getCard()).tolist()))
        cardOfB = dict(zip(np.asarray(B.getVar()).tolist(),
                           np.asarray(B.getCard()).tolist()))
        if any(cardOfA[v] != cardOfB[v] for v in intersect):
            sys.stderr.write("dimensionality mismatch in factors!\n")
            sys.exit(1)

    # the scope of C is the union of the two scopes
    C.setVar(np.union1d(A.getVar(), B.getVar()).tolist())
    mapA = isMember(A.getVar(), C.getVar())
    mapB = isMember(B.getVar(), C.getVar())

    # copy the cardinalities of A and B into their positions within C
    C.setCard(np.zeros(len(C.getVar())).tolist())
    C.getCard()[mapA] = A.getCard()
    C.getCard()[mapB] = B.getCard()

    # start from an all-zero value table
    C.setVal(np.zeros(np.prod(C.getCard())).tolist())

    # every assignment of C, projected back onto the scopes of A and B;
    # the trailing -1 turns the helper's result into an index into getVal()
    assignments = IndexToAssignment(np.arange(np.prod(C.getCard())),
                                    C.getCard())
    indxA = AssignmentToIndex(assignments[:, mapA], A.getCard()) - 1
    indxB = AssignmentToIndex(assignments[:, mapB], B.getCard()) - 1

    # add the aligned entries of the two value tables
    c_val = (A.getVal()[indxA.flatten().tolist()] +
             B.getVal()[indxB.flatten().tolist()])
    C.setVal(c_val.tolist())
    return C
# NOTE(review): the next three statements look like the tail of a routine
# whose beginning is not part of this excerpt; they normalize `answer` and
# print it.
print "\n ** FINAL NORMALIZED SOLUTION ** "
answer = normalize(answer)
print answer


def printfactorList(factorList):
    """Print every factor in factorList between a header and a footer line."""
    print " *** FACTOR LIST *** "
    for factor in factorList:
        print factor
    print " *** *********** *** "


# FACTORS
# Each Factor is built from a list of variable names and a value array.

# Pr(G)
f0 = Factor(['G'], np.array([0.90, 0.1]))

# Pr(D)
f1 = Factor(['D'], np.array([0.50, 0.25, 0.25]))

# Pr(D|F)
f2 = Factor(['D', 'F'], np.array([[0.98, 0.02], [0.40, 0.60], [0.15, 0.85]]))

# Pr(D|DS)
f3 = Factor(['D', 'DS'], np.array([[0.98, 0.02], [0.15, 0.85], [0.40, 0.60]]))

# Pr(D|S, G)
# NOTE(review): this array has shape (2, 3, 2) while the variables are listed
# as D, S, G and D has three values in f1 -- confirm the intended axis order.
f4 = Factor(['D', 'S', 'G'], np.array([[[0.98, 0.02], [0.15, 0.85], [0.15, 0.85]], [[0.998, 0.002], [0.98, 0.02], [0.98, 0.02]]]))
class phenotypeGivenHaplotypesFactor(object):
    """Factor representing Pr(phenotype | paternal haplotype, maternal
    haplotype).

    Very similar to PhenotypeGivenGenotypeFactor, but de-coupled into the
    paternal and maternal alleles rather than the genotype. The wrapped
    Factor is stored in self.phenotypeFactor.
    """

    def __init__(self, alphaList, numAlleles, geneCopyVarOne, geneCopyVarTwo,
                 phenotypeVar):
        """Build the factor over [phenotypeVar, geneCopyVarOne,
        geneCopyVarTwo].

        alphaList      -- alphaList[g] is used for phenotype assignment 0
                          and 1 - alphaList[g] otherwise, where g is the
                          genotype formed by the two gene copies
        numAlleles     -- cardinality of each gene copy variable
        geneCopyVarOne -- variable of the first gene copy
        geneCopyVarTwo -- variable of the second gene copy
        phenotypeVar   -- variable of the phenotype (cardinality 2)
        """
        self.numalleles = numAlleles
        self.alphaList = alphaList
        self.phenotypeFactor = Factor(
            [phenotypeVar, geneCopyVarOne, geneCopyVarTwo], [], [],
            'phenotype| geneCopy1, geneCopy2')
        self.phenotypeFactor.setCard([2, numAlleles, numAlleles])

        ntable = np.prod(self.phenotypeFactor.getCard())
        values = np.zeros((1, ntable)).flatten().tolist()

        affectedAlphas = alphaList
        unaffectedAlphas = [1 - alpha for alpha in alphaList]

        (allelesToGenotypes,
         genotypesToAlleles) = generateAlleleGenotypeMappers(numAlleles)

        # one row per joint assignment (phenotype, copy one, copy two),
        # shifted to zero-based
        assignments = IndexToAssignment(
            np.arange(ntable), self.phenotypeFactor.getCard()) - 1

        for z in range(ntable):
            curr_assign = assignments[z]
            # cast to int before indexing: neither NumPy arrays nor lists
            # accept a non-integer index
            # (presumably the helpers can return floats -- confirm)
            genotype_num = int(allelesToGenotypes[int(curr_assign[1]),
                                                  int(curr_assign[2])])
            if curr_assign[0] == 0:
                values[z] = affectedAlphas[genotype_num]
            else:
                values[z] = unaffectedAlphas[genotype_num]

        self.phenotypeFactor.setVal(values)

    def getVar(self):
        """Return the variables of the underlying factor."""
        return self.phenotypeFactor.getVar()

    def getCard(self):
        """Return the cardinalities of the underlying factor."""
        return self.phenotypeFactor.getCard()

    def getVal(self):
        """Return the value table of the underlying factor."""
        return self.phenotypeFactor.getVal()

    def getFactor(self):
        """Return the underlying Factor object."""
        return self.phenotypeFactor

    def __str__(self):
        return self.phenotypeFactor.__str__()
def FactorProduct(A, B):
    """Compute the product of two factors.

    C = FactorProduct(A, B) computes the product between two factors, A and
    B, where each factor is defined over a set of variables with given
    dimension. A factor carries three pieces of data:
      var   variables in the factor, e.g. [1 2 3]
      card  cardinalities corresponding to var, e.g. [2 2 2]
      val   value table of size prod(card)

    Returns a new Factor over the union of the variables of A and B. If one
    operand has an empty scope, a message is written to stderr and the other
    operand is returned unchanged.

    Exits the process with status 1 when a variable shared by A and B has a
    different cardinality in each.

    See also FactorMarginalization, IndexToAssignment, AssignmentToIndex and
    https://github.com/indapa/PGM/blob/master/Prog1/FactorProduct.m
    """
    C = Factor()

    # an empty factor leaves the other operand untouched
    if len(A.getVar()) == 0:
        sys.stderr.write("A factor is empty!\n")
        return B
    if len(B.getVar()) == 0:
        sys.stderr.write("B factor is empty!\n")
        return A

    # Variables present in both factors must agree on cardinality.
    # The previous test, `x.all() == y.all() == False`, collapsed each side
    # to a single bool inside a chained comparison and therefore never fired.
    # Compare the cardinalities per shared variable instead.
    intersect = np.intersect1d(A.getVar(), B.getVar()).tolist()
    if len(intersect) > 0:
        cardOfA = dict(zip(np.asarray(A.getVar()).tolist(),
                           np.asarray(A.getCard()).tolist()))
        cardOfB = dict(zip(np.asarray(B.getVar()).tolist(),
                           np.asarray(B.getCard()).tolist()))
        if any(cardOfA[v] != cardOfB[v] for v in intersect):
            sys.stderr.write("dimensionality mismatch in factors!\n")
            sys.exit(1)

    # the scope of C is the union of the two scopes
    C.setVar(np.union1d(A.getVar(), B.getVar()).tolist())
    mapA = isMember(A.getVar(), C.getVar())
    mapB = isMember(B.getVar(), C.getVar())

    # copy the cardinalities of A and B into their positions within C
    C.setCard(np.zeros(len(C.getVar())).tolist())
    C.getCard()[mapA] = A.getCard()
    C.getCard()[mapB] = B.getCard()

    # start from an all-zero value table
    C.setVal(np.zeros(np.prod(C.getCard())).tolist())

    # every assignment of C, projected back onto the scopes of A and B;
    # the trailing -1 turns the helper's result into an index into getVal()
    assignments = IndexToAssignment(np.arange(np.prod(C.getCard())),
                                    C.getCard())
    indxA = AssignmentToIndex(assignments[:, mapA], A.getCard()) - 1
    indxB = AssignmentToIndex(assignments[:, mapB], B.getCard()) - 1

    # multiply the aligned entries of the two value tables
    c_val = (A.getVal()[indxA.flatten().tolist()] *
             B.getVal()[indxB.flatten().tolist()])
    C.setVal(c_val.tolist())
    return C
def __init__(self, alleleFreqs, geneCopyVar):
    """Create the 'founderHap' factor over geneCopyVar.

    alleleFreqs -- used as the factor's values; its length is the
                   cardinality of geneCopyVar
    geneCopyVar -- the single variable of the factor
    """
    alleleCount = len(alleleFreqs)
    founder = Factor([geneCopyVar], [], [], 'founderHap')
    self.geneCopyFactor = founder
    # one state per allele, valued by the frequencies handed in
    founder.setCard([alleleCount])
    founder.setVal(alleleFreqs)
class PhenotypeGivenGenotypeFactor(object):
    """Factor of phenotype | genotype.

    alphaList[i] is the probability of being affected given the ith
    genotype. The wrapped Factor is stored in self.phenotypeFactor; the
    accessor methods simply delegate to it.
    """

    def __init__(self, alphaList, phenotypeVar, genotypeVar, name):
        """Build the factor over [phenotypeVar, genotypeVar].

        alphaList    -- one probability per genotype
        phenotypeVar -- variable of the phenotype (cardinality 2)
        genotypeVar  -- variable of the genotype (cardinality len(alphaList))
        name         -- name given to the underlying Factor
        """
        self.phenotypeFactor = Factor([phenotypeVar, genotypeVar], [], [],
                                      name)
        self.alpha = np.array(alphaList)
        self.phenotypeFactor.setCard([2, len(alphaList)])

        # For each genotype store alpha followed by 1 - alpha.
        # An earlier fill loop wrote overlapping slots (i and i + 1) and its
        # result was discarded, so it has been removed.
        values = []
        for alpha in alphaList:
            values.append(alpha)
            values.append(1 - alpha)

        self.phenotypeFactor.setVal(values)

    def getVar(self):
        """Return the variables of the underlying factor."""
        return self.phenotypeFactor.getVar()

    def getCard(self):
        """Return the cardinalities of the underlying factor."""
        return self.phenotypeFactor.getCard()

    def getVal(self):
        """Return the value table of the underlying factor."""
        return self.phenotypeFactor.getVal()

    def setVal(self, val):
        """Replace the value table of the underlying factor."""
        self.phenotypeFactor.setVal(val)

    def getFactor(self):
        """Return the underlying Factor object."""
        return self.phenotypeFactor

    def __str__(self):
        return self.phenotypeFactor.__str__()
def FactorDiv(A, B):
    """Compute the quotient of two factors, A / B.

    Works like FactorProduct, but the aligned values of A and B are combined
    with common.zerodiv_tuple instead of being multiplied (per the original
    author's note, 0/0 is meant to give 0). See page 365 in Koller and
    Friedman for the definition of factor division.

    A -- numerator factor
    B -- denominator factor

    Returns a new factor over the union of the variables of A and B. If A has
    no variables B is returned, and if B has no variables A is returned (a
    message is written to stderr in both cases).

    Exits the process with status 1 when a variable shared by A and B has
    different cardinalities in the two factors.
    """
    C = Factor()

    #check for empty factors
    # NOTE(review): returning B for an empty A mirrors FactorProduct; for a
    # division one might expect 1/B instead. Kept as-is for compatibility.
    if len(A.getVar()) == 0:
        sys.stderr.write("A factor is empty!\n")
        return B
    if len(B.getVar()) == 0:
        sys.stderr.write("B factor is empty!\n")
        return A

    #variables that appear in both factors must have the same cardinality
    intersect = np.intersect1d(A.getVar(), B.getVar()).tolist()
    if len(intersect) > 0:
        #positions of the shared variables inside each factor
        iA = getIndex(A.getVar(), intersect)
        iB = getIndex(B.getVar(), intersect)

        #compare the cardinalities of the shared variables element-wise
        sharedCardA = np.asarray(A.getCard())[iA]
        sharedCardB = np.asarray(B.getCard())[iB]
        if not np.array_equal(sharedCardA, sharedCardB):
            sys.stderr.write("dimensionality mismatch in factors!\n")
            sys.exit(1)

    #the scope of C is the union of the variables in factors A and B
    C.setVar(np.union1d(A.getVar(), B.getVar()).tolist())
    mapA = isMember(A.getVar(), C.getVar())
    mapB = isMember(B.getVar(), C.getVar())

    #set the cardinality of the variables in C
    C.setCard(np.zeros(len(C.getVar())).tolist())
    C.getCard()[mapA] = A.getCard()
    C.getCard()[mapB] = B.getCard()

    #initialize the values of the factor C to be zero
    C.setVal(np.zeros(np.prod(C.getCard())).tolist())

    #helper indices telling which entries of A.val and B.val to combine
    assignments = IndexToAssignment(np.arange(np.prod(C.getCard())),
                                    C.getCard())
    #re-arrange each assignment of C to the index it has in A and in B
    indxA = AssignmentToIndex(assignments[:, mapA], A.getCard()) - 1
    indxB = AssignmentToIndex(assignments[:, mapB], B.getCard()) - 1

    numerator = A.getVal()[indxA.flatten().tolist()]
    denominator = B.getVal()[indxB.flatten().tolist()]

    #build a real list so the result does not depend on map() being eager
    val = [common.zerodiv_tuple(pair) for pair in zip(numerator, denominator)]
    C.setVal(val)
    return C
def test_product1(self):
    """The product of factorA and factorB matches the expected factor."""
    expected = pgmf.Factor(
        np.array([1, 2]),
        np.array([2, 2]),
        np.array([0.0649, 0.1958, 0.0451, 0.6942]))
    actual = pgmf.product(self.factorA, self.factorB)

    # variables and cardinalities must match exactly
    for field in ('varbs', 'card'):
        np.testing.assert_array_equal(getattr(actual, field),
                                      getattr(expected, field))
    # values only need to agree to five decimals
    np.testing.assert_array_almost_equal(
        actual.vals, expected.vals, decimal=5)
from CliqueTree import *
from CliqueTreeOperations import *
from FactorOperations import *
import scipy.io as sio
import numpy as np
import pprint
import pdb

# Load the 'FactorMax' sample from the MATLAB file and rebuild its first
# input as a Factor object.
matfile = '/Users/amit/BC_Classes/PGM/Prog4/PA4Sample.mat'
mat_contents = sio.loadmat(matfile)
mat_struct = mat_contents['FactorMax']
val = mat_struct[0, 0]
input_factors = val['INPUT1'][0][0]
var = input_factors[0].flatten().tolist()
card = input_factors[1].flatten().tolist()
value = input_factors[2].flatten().tolist()

# single-argument print(...) gives the same output on Python 2 and Python 3
print(var)
print(card)
print(value)

INPUT1 = Factor(var, card, value, 'test')
INPUT2 = val['INPUT2'].flatten()
print(INPUT1)
print(INPUT2)
print(FactorMaxMarginalization(INPUT1, INPUT2))

#example used in section 13.2 pg 555 of Friedman and Koller
print("=====")
psi = Factor([1, 2, 3], [3, 2, 2],
             [.25, .05, .15, .08, 0, .09, .35, .07, .21, .16, 0, .18])
maxfactor = FactorMaxMarginalization(psi, [2])
print(maxfactor)
print(IndexToAssignment(np.arange(6), [3, 2]))
class ChildCopyGivenParentalsFactor(object):
    """De-coupled inheritance factor for a single gene copy.

    Given a parent's two haplotypes, the factor values are the probability
    of the child inheriting the (grand)paternal or (grand)maternal haplotype.
    This allows for more flexibility in modeling inheritance than clumping a
    single parent's haplotypes into a genotype, i.e.
    GenotypeGivenParentsFactor.
    """

    def __init__(self, numAlleles, geneCopyVarChild, geneCopyHapOne,
                 geneCopyHapTwo):
        """Build the factor over [child, hapOne, hapTwo].

        numAlleles       -- number of alleles; cardinality of all three
                            variables
        geneCopyVarChild -- identifier of the child's gene-copy variable
        geneCopyHapOne   -- identifier of the parent's first haplotype
        geneCopyHapTwo   -- identifier of the parent's second haplotype
        """
        self.numalleles = numAlleles
        # the parent's two haplotype variables
        self.hapone = geneCopyHapOne
        self.haptwo = geneCopyHapTwo

        self.geneCopyFactor = Factor(
            [geneCopyVarChild, geneCopyHapOne, geneCopyHapTwo], [], [],
            'child|hap1,hap2')
        self.geneCopyFactor.setCard(
            [self.numalleles, self.numalleles, self.numalleles])

        #the value list gets numAlleles^3 entries, appended in loop order
        #with the child allele (k) varying fastest
        values = []
        for i in range(numAlleles):  #alleles from one parental haplotype
            for j in range(numAlleles):  #alleles from the other haplotype
                for k in range(numAlleles):  #child alleles
                    if i == k and j == k:
                        #both parental haplotypes carry the child's allele
                        values.append(1)
                    elif i == k or j == k:
                        #exactly one parental haplotype carries it
                        values.append(.5)
                    else:
                        #neither haplotype carries the child's allele
                        values.append(0.0)

        self.geneCopyFactor.setVal(values)

    def getVar(self):
        """Return the variables of the underlying factor."""
        return self.geneCopyFactor.getVar()

    def getCard(self):
        """Return the cardinalities of the underlying factor."""
        return self.geneCopyFactor.getCard()

    def getVal(self):
        """Return the value table of the underlying factor."""
        return self.geneCopyFactor.getVal()

    def getFactor(self):
        """Return the underlying Factor object."""
        return self.geneCopyFactor

    def __str__(self):
        return self.geneCopyFactor.__str__()
def FactorProduct(A, B):
    """Compute the product of two factors.

    C = FactorProduct(A, B) computes the product between two factors, A and
    B, where each factor is defined over a set of variables with given
    dimension. The factor data structure has the following fields:
        .var    Vector of variables in the factor, e.g. [1 2 3]
        .card   Vector of cardinalities corresponding to .var, e.g. [2 2 2]
        .val    Value table of size prod(.card)

    Returns a new factor over the union of the variables of A and B. If A has
    no variables B is returned, and if B has no variables A is returned (a
    message is written to stderr in both cases).

    Exits the process with status 1 when a variable shared by A and B has
    different cardinalities in the two factors.

    See also FactorMarginalization, IndexToAssignment, AssignmentToIndex, and
    https://github.com/indapa/PGM/blob/master/Prog1/FactorProduct.m
    """
    C = Factor()

    #check for empty factors
    if len(A.getVar()) == 0:
        sys.stderr.write("A factor is empty!\n")
        return B
    if len(B.getVar()) == 0:
        sys.stderr.write("B factor is empty!\n")
        return A

    #variables that appear in both factors must have the same cardinality
    intersect = np.intersect1d(A.getVar(), B.getVar()).tolist()
    if len(intersect) > 0:
        #positions of the shared variables inside each factor
        iA = getIndex(A.getVar(), intersect)
        iB = getIndex(B.getVar(), intersect)

        #compare the cardinalities of the shared variables element-wise
        sharedCardA = np.asarray(A.getCard())[iA]
        sharedCardB = np.asarray(B.getCard())[iB]
        if not np.array_equal(sharedCardA, sharedCardB):
            sys.stderr.write("dimensionality mismatch in factors!\n")
            sys.exit(1)

    #the scope of C is the union of the variables in factors A and B
    C.setVar(np.union1d(A.getVar(), B.getVar()).tolist())
    mapA = isMember(A.getVar(), C.getVar())
    mapB = isMember(B.getVar(), C.getVar())

    #set the cardinality of the variables in C
    C.setCard(np.zeros(len(C.getVar())).tolist())
    C.getCard()[mapA] = A.getCard()
    C.getCard()[mapB] = B.getCard()

    #initialize the values of the factor C to be zero
    C.setVal(np.zeros(np.prod(C.getCard())).tolist())

    #helper indices telling which entries of A.val and B.val to multiply
    assignments = IndexToAssignment(np.arange(np.prod(C.getCard())),
                                    C.getCard())
    #re-arrange each assignment of C to the index it has in A and in B
    indxA = AssignmentToIndex(assignments[:, mapA], A.getCard()) - 1
    indxB = AssignmentToIndex(assignments[:, mapB], B.getCard()) - 1

    #multiply the aligned entries of A.val and B.val
    c_val = (A.getVal()[indxA.flatten().tolist()] *
             B.getVal()[indxB.flatten().tolist()])
    C.setVal(c_val.tolist())
    return C
def FactorMarginalization(A, V):
    """Sum the given variables out of a factor.

    B = FactorMarginalization(A, V) computes the factor with the variables
    in V summed out. The factor data structure has the following fields:
        .var    Vector of variables in the factor, e.g. [1 2 3]
        .card   Vector of cardinalities corresponding to .var, e.g. [2 2 2]
        .val    Value table of size prod(.card)

    A -- factor to marginalize
    V -- variables to sum out

    Returns the marginalized factor, whose variables are the variables of A
    that are not in V, in sorted order. A itself is returned when A has no
    variables or V is empty. When no variable would remain, a message is
    written to stderr and None is returned.

    See also FactorProduct, IndexToAssignment, and AssignmentToIndex.
    Based on matlab code found here:
    https://github.com/indapa/PGM/blob/master/Prog1/FactorMarginalization.m
    """
    #check for empty factor or variable list
    if len(A.getVar()) == 0 or len(V) == 0:
        return A

    B = Factor()

    #the scope of the new factor is the set difference between A.var and V;
    #np.setdiff1d returns it sorted, so the variable order of the result
    #does not depend on set iteration order
    Bvar = np.setdiff1d(A.getVar(), V)
    #indices of the variables of the new factor in the original factor A
    mapB = isMember(Bvar, A.getVar())

    #check to see if the new factor has empty scope
    if len(Bvar) == 0:
        sys.stderr.write("FactorMarginalization:Error, resultant factor has empty scope...\n")
        return None

    #set the marginalized factor's variable scope and cardinality
    B.setVar(Bvar.tolist())
    B.setCard(A.getCard()[mapB])
    B.setVal(np.zeros(np.prod(B.getCard())).tolist())

    #compute some helper indices
    assignments = IndexToAssignment(np.arange(np.prod(A.getCard())),
                                    A.getCard())
    #indxB tells which entry of B each value of A is summed into
    indxB = AssignmentToIndex(assignments[:, mapB], B.getCard()) - 1

    #accum is a numpy implementation of matlab accumarray: it sums the data
    #in each group, where the groups are defined by indxB
    #see http://blogs.mathworks.com/loren/2008/02/20/under-appreciated-accumarray/
    marginal_vals = accum(indxB, A.getVal())

    #store the summed-out values in the new factor
    B.setVal(marginal_vals.tolist())
    return B
class GenotypeAlleleFreqFactor(object):
    """Factor holding the probability of each genotype given the allele
    frequencies, Pr(genotype|allele_freq)."""

    def __init__(self, allelefreqs, genotypeVar, name):
        """Build the single-variable genotype factor.

        allelefreqs -- allele frequencies, one entry per allele
        genotypeVar -- identifier of the genotype variable
        name        -- name handed to the underlying Factor
        """
        self.allelefreq = allelefreqs
        self.allelesToGenotypes = None
        self.genotypesToAlleles = None
        self.genotypeFactor = None

        #mappers between alleles and genotypes; one allele per frequency
        mappers = generateAlleleGenotypeMappers(len(allelefreqs))
        (self.allelesToGenotypes, self.genotypesToAlleles) = mappers
        (genotypeCount, _ploidy) = np.shape(self.genotypesToAlleles)

        #the cardinality of the factor is the number of genotypes
        self.genotypeFactor = Factor([genotypeVar], [], [], name)
        self.genotypeFactor.setCard([genotypeCount])

        #start from an all-zero table, then fill in one entry per genotype
        table = np.zeros((np.prod(self.genotypeFactor.getCard()))).tolist()
        for g in range(genotypeCount):
            pair = self.genotypesToAlleles[g, :].tolist()
            freqProduct = np.prod([allelefreqs[a] for a in pair])
            if pair[0] != pair[1]:
                #two different alleles: the product is doubled
                freqProduct = freqProduct * 2
            table[g] = freqProduct
        self.genotypeFactor.setVal(table)

    def getVar(self):
        """Return the variables of the underlying factor."""
        return self.genotypeFactor.getVar()

    def getCard(self):
        """Return the cardinalities of the underlying factor."""
        return self.genotypeFactor.getCard()

    def getVal(self):
        """Return the value table of the underlying factor."""
        return self.genotypeFactor.getVal()

    def setVal(self, val):
        """Replace the value table of the underlying factor."""
        self.genotypeFactor.setVal(val)

    def getFactor(self):
        """Return the underlying Factor object."""
        return self.genotypeFactor

    def __str__(self):
        return self.genotypeFactor.__str__()
from Factor import *
from PGMcommon import *
from CliqueTree import *
from CliqueTreeOperations import *
from FactorOperations import *
import scipy.io as sio
import numpy as np
import pprint
import pdb

# Load the 'SumProdCalibrate' sample from the MATLAB file.
matfile = '/Users/amit/BC_Classes/PGM/Prog4/PA4Sample.mat'
mat_contents = sio.loadmat(matfile)
mat_struct = mat_contents['SumProdCalibrate']
val = mat_struct[0, 0]
input_edges = val['INPUT']['edges'][0][0]
input_cliqueList = val['INPUT']['cliqueList'][0][0][0]

# Turn every (var, card, values) record of the clique list into a Factor.
clique_list_factorObj = []
for tpl in input_cliqueList:
    (var, card, values) = tpl
    f = Factor(var[0].tolist(), card[0].tolist(), values[0].tolist(),
               'factor')
    clique_list_factorObj.append(f)

# Build the clique tree from the factors and edges, calibrate it and print
# every node of the calibrated tree.
P = CliqueTree(clique_list_factorObj, input_edges, clique_list_factorObj, [])
P = CliqueTreeCalibrate(P)

# single-argument print(...) gives the same output on Python 2 and Python 3
for f in P.getNodeList():
    print(f)
    print("==")
def testprimeFactors(self):
    """Factor.primeFactorsOf returns the expected factor list for 1..11."""
    expectedFactors = [
        (1, []),
        (2, [2]),
        (3, [3]),
        (4, [2, 2]),
        (5, [5]),
        (6, [2, 3]),
        (7, [7]),
        (8, [2, 2, 2]),
        (9, [3, 3]),
        (10, [2, 5]),
        (11, [11]),
    ]
    # checked in ascending order, stopping at the first mismatch
    for number, primes in expectedFactors:
        self.assertEqual(Factor.primeFactorsOf(number), primes)