def parse_prior(composition, alphabet, weight=1.0):
    """Parse a description of the expected monomer distribution of a sequence.

    For protein sequences (``alphabet == unambiguous_protein_alphabet``) the
    prior is built from ``aa_composition`` (the Robinson-Robinson
    frequencies) unless an explicit distribution is supplied; any other
    composition string is ignored for proteins.

    Valid compositions for nucleotides:
      - None : Use 'equiprobable'
      - 'equiprobable' : All monomers have the same probability.
      - a percentage, e.g. '45%' or a fraction '0.45':
            The fraction of CG bases for nucleotide alphabets
      - a species name, e.g. 'E. coli', 'H. sapiens' :
            Use the average CG percentage for the species' genome
            (looked up in ``std_percentCG``).
      - An explicit distribution, e.g. {'A':10, 'C':40, 'G':40, 'T':10}

    Leading and trailing whitespace in ``composition`` is ignored.

    Args:
        composition: None or a string in one of the forms listed above.
        alphabet: The sequence alphabet the prior is defined over.
        weight: Non-negative scale factor applied to the distribution.
            None is treated as 1.0.

    Returns:
        The weighted prior, with one entry per letter of ``alphabet``.

    Raises:
        ValueError: If ``weight`` is negative, the composition is empty,
            unknown or malformed (nucleotide alphabets), or the resulting
            prior does not have the same length as ``alphabet``.
    """
    if weight is None:
        weight = 1.0
    if weight < 0:
        raise ValueError("Weight cannot be negative")

    # Strip once so that every test below looks at the same text. The
    # startswith/endswith form is safe on an empty string, unlike indexing
    # with [0] / [-1].
    comp = None if composition is None else composition.strip()
    is_explicit = (
        comp is not None and comp.startswith('{') and comp.endswith('}'))

    if alphabet == unambiguous_protein_alphabet:
        # Protein prior is Robinson-Robinson unless set explicitly.
        if is_explicit:
            prior = _parse_explicit_composition(comp, alphabet, weight)
        else:
            prior = weight * na.array(aa_composition, na.float64)
    else:
        # Nucleotide prior depends on input.
        if comp is None:
            comp = 'equiprobable'

        if comp.lower() == 'equiprobable':
            prior = weight * equiprobable_distribution(len(alphabet))
        elif comp in std_percentCG:
            prior = weight * base_distribution(std_percentCG[comp])
        elif comp.endswith('%'):
            prior = weight * base_distribution(float(comp[:-1]))
        elif isfloat(comp):
            # A bare fraction is converted to the percentage that
            # base_distribution is given in the '%' branch above.
            prior = weight * base_distribution(float(comp) * 100.)
        elif is_explicit:
            prior = _parse_explicit_composition(comp, alphabet, weight)
        else:
            raise ValueError(
                "Unknown or malformed composition: %s" % composition)

    if len(prior) != len(alphabet):
        raise ValueError(
            "The sequence alphabet and composition are incompatible.")
    return prior
def parse_prior(composition, alphabet, weight=None):
    """Parse a description of the expected monomer distribution of a sequence.

    Valid compositions:
      - None or 'none' : No composition specified
      - 'auto' or 'automatic': Use the typical average distribution
            for proteins and an equiprobable distribution for
            everything else.
      - 'equiprobable' : All monomers have the same probability.
      - a percentage, e.g. '45%' or a fraction '0.45':
            The fraction of CG bases for nucleotide alphabets
      - a species name, e.g. 'E. coli', 'H. sapiens' :
            Use the average CG percentage for the species' genome
            (looked up in ``std_percentCG``).
      - An explicit distribution, e.g. {'A':10, 'C':40, 'G':40, 'T':10}

    Leading and trailing whitespace in ``composition`` is ignored.

    Args:
        composition: None or a string in one of the forms listed above.
        alphabet: The sequence alphabet the prior is defined over.
        weight: Non-negative scale factor applied to the distribution.
            When None (and an alphabet is given) it defaults to
            ``sqrt(len(alphabet))``.

    Returns:
        None when no composition is specified, otherwise the weighted
        prior with one entry per letter of ``alphabet``.

    Raises:
        ValueError: If ``weight`` is negative, the composition is empty,
            unknown or malformed, or the resulting prior does not match
            the alphabet.
    """
    if composition is None:
        return None
    comp = composition.strip()
    if comp.lower() == 'none':
        return None

    if weight is None and alphabet is not None:
        weight = sqrt(float(len(alphabet)))
    if weight < 0:
        raise ValueError("Weight cannot be negative.")

    # An empty string would otherwise fail with IndexError on comp[-1];
    # report it the same way as any other unrecognised composition.
    if not comp:
        raise ValueError("Unknown or malformed composition: %s" % composition)

    lowered = comp.lower()
    if lowered == 'equiprobable':
        prior = weight * equiprobable_distribution(len(alphabet))
    elif lowered in ('auto', 'automatic'):
        if alphabet == unambiguous_protein_alphabet:
            prior = weight * asarray(aa_composition, float64)
        else:
            prior = weight * equiprobable_distribution(len(alphabet))
    elif comp in std_percentCG:
        prior = weight * base_distribution(std_percentCG[comp])
    elif comp.endswith('%'):
        prior = weight * base_distribution(float(comp[:-1]))
    elif isfloat(comp):
        # A bare fraction is converted to the percentage that
        # base_distribution is given in the '%' branch above.
        prior = weight * base_distribution(float(comp) * 100.)
    elif comp.startswith('{') and comp.endswith('}'):
        # Reduce "{'A':10, 'C':40}" to the flat token list
        # ['A', '10', 'C', '40'] by blanking out all punctuation.
        tokens = comp[1:-1]
        tokens = tokens.replace(',', ' ').replace("'", ' ')
        tokens = tokens.replace('"', ' ').replace(':', ' ').split()
        if len(tokens) != len(alphabet) * 2:
            raise ValueError(
                "Explicit prior does not match length of alphabet")

        # -1 marks letters that have not been assigned a value yet.
        prior = -ones(len(alphabet), float64)
        try:
            for letter, value in zip(tokens[0::2], tokens[1::2]):
                index = alphabet.ord(letter)
                prior[index] = float(value)
        except ValueError:
            raise ValueError("Cannot parse explicit composition")

        # A letter given twice leaves some other letter unassigned.
        if any(prior == -1.):
            raise ValueError("Explicit prior does not match alphabet")
        prior /= sum(prior)
        prior *= weight
    else:
        raise ValueError("Unknown or malformed composition: %s" % composition)

    if len(prior) != len(alphabet):
        raise ValueError(
            "The sequence alphabet and composition are incompatible.")
    return prior
def _monomer_index(alphabet, symbol):
    """Return the position of ``symbol`` within ``alphabet``.

    Codon alphabets are searched with ``index``; every other alphabet is
    queried through its ``ord`` method.
    """
    if alphabet in [codon_dna_alphabet, codon_rna_alphabet]:
        return alphabet.index(symbol)
    return alphabet.ord(symbol)


def parse_prior(composition, alphabet, weight=None):
    """Parse a description of the expected monomer distribution of a sequence.

    Valid compositions:
      - None or 'none' : No composition specified
      - the path of an existing file : Each non-blank line holds a symbol
            and its value, separated by whitespace. An existing path
            takes precedence over every other form below.
      - 'auto' or 'automatic': Use the typical average distribution
            for proteins and an equiprobable distribution for
            everything else.
      - 'equiprobable' : All monomers have the same probability.
      - a percentage, e.g. '45%' or a fraction '0.45':
            The fraction of CG bases for nucleotide alphabets
      - An explicit distribution, e.g. {'A':10, 'C':40, 'G':40, 'T':10}

    Leading and trailing whitespace in ``composition`` is ignored.

    Args:
        composition: None or a string in one of the forms listed above.
        alphabet: The sequence alphabet the prior is defined over.
        weight: Non-negative scale factor applied to the distribution.
            When None (and an alphabet is given) it defaults to
            ``sqrt(len(alphabet))``.

    Returns:
        A two-element list ``[prior, compos]`` where ``compos`` is the
        parsed distribution and ``prior`` is ``weight * compos``. Both
        entries are None when no composition is specified.

    Raises:
        ValueError: If ``weight`` is negative, the composition (or the
            composition file) is empty, unknown or malformed, or the
            resulting distribution does not match the alphabet.
    """
    if composition is None:
        return [None, None]
    comp = composition.strip()
    if comp.lower() == 'none':
        return [None, None]

    if weight is None and alphabet is not None:
        weight = sqrt(float(len(alphabet)))
    if weight < 0:
        raise ValueError("Weight cannot be negative.")

    # An empty string would otherwise fail with IndexError on comp[-1];
    # report it the same way as any other unrecognised composition.
    if not comp:
        raise ValueError("Unknown or malformed composition: %s" % composition)

    lowered = comp.lower()
    if os.path.exists(comp):
        # -1 marks symbols that have not been assigned a value yet.
        compos = -ones(len(alphabet), float64)
        try:
            # The context manager guarantees the file is closed even when
            # a line cannot be parsed.
            with open(comp) as handle:
                for line in handle:
                    fields = line.split()
                    if not fields:
                        continue  # tolerate blank lines
                    if len(fields) < 2:
                        raise ValueError("missing value for %s" % fields[0])
                    index = _monomer_index(alphabet, fields[0])
                    compos[index] = float(fields[1])
        except ValueError:
            raise ValueError("Cannot parse composition file.")
        if any(compos == -1.):
            raise ValueError("Explicit prior does not match alphabet")
        compos /= sum(compos)
    elif lowered == 'equiprobable':
        compos = equiprobable_distribution(len(alphabet))
    elif lowered in ('auto', 'automatic'):
        if alphabet == unambiguous_protein_alphabet:
            compos = asarray(aa_composition, float64)
        else:
            compos = equiprobable_distribution(len(alphabet))
    elif comp.endswith('%'):
        compos = base_distribution(float(comp[:-1]))
    elif isfloat(comp):
        # A bare fraction is converted to the percentage that
        # base_distribution is given in the '%' branch above.
        compos = base_distribution(float(comp) * 100.)
    elif comp.startswith('{') and comp.endswith('}'):
        # Reduce "{'A':10, 'C':40}" to the flat token list
        # ['A', '10', 'C', '40'] by blanking out all punctuation.
        tokens = comp[1:-1]
        tokens = tokens.replace(',', ' ').replace("'", ' ')
        tokens = tokens.replace('"', ' ').replace(':', ' ').split()
        if len(tokens) != len(alphabet) * 2:
            raise ValueError(
                "Explicit prior does not match length of alphabet")

        compos = -ones(len(alphabet), float64)
        try:
            # Pairing by slices avoids range(len(tokens) / 2), which is a
            # TypeError on Python 3 because "/" yields a float there.
            for letter, value in zip(tokens[0::2], tokens[1::2]):
                index = _monomer_index(alphabet, letter)
                compos[index] = float(value)
        except ValueError:
            raise ValueError("Cannot parse explicit composition")

        # A symbol given twice leaves some other symbol unassigned.
        if any(compos == -1.):
            raise ValueError("Explicit prior does not match alphabet")
        compos /= sum(compos)
    else:
        raise ValueError("Unknown or malformed composition: %s" % composition)

    if len(compos) != len(alphabet):
        raise ValueError(
            "The sequence alphabet and composition are incompatible.")
    prior = weight * compos
    return [prior, compos]