Esempio n. 1
0
def parse_prior(composition, alphabet, weight=1.0):
    """
    Parse a description of the expected monomer distribution of a sequence.

    For the unambiguous protein alphabet the prior is the Robinson-Robinson
    amino-acid composition (``aa_composition``) unless ``composition`` is an
    explicit distribution; any other protein composition string is ignored
    and the Robinson-Robinson frequencies are used.

    Valid compositions for nucleotides:

    - None  :               Use 'equiprobable'
    - 'equiprobable' :      All monomers have the same probability.
    - a percentage, e.g. '45%' or a fraction '0.45':
                            The fraction of CG bases for nucleotide alphabets
    - a species name, e.g. 'E. coli', 'H. sapiens' :
                            Use the average CG percentage for the species'
                            genome (looked up in ``std_percentCG``).
    - An explicit distribution,  e.g. {'A':10, 'C':40, 'G':40, 'T':10}

    Leading and trailing whitespace in ``composition`` is ignored.

    Args:
        composition: None or a string in one of the forms listed above.
        alphabet: The sequence alphabet the prior is defined over.
        weight: Non-negative scale factor applied to the distribution.
            None is treated as 1.0.

    Returns:
        The weighted prior, with one entry per letter of ``alphabet``.

    Raises:
        ValueError: If ``weight`` is negative, if a nucleotide composition
            is empty, unknown or malformed, or if the resulting prior does
            not have the same length as ``alphabet``.
    """

    if weight is None:
        weight = 1.0
    if weight < 0:
        raise ValueError("Weight cannot be negative")

    # Strip once so that every test below (including the explicit '{...}'
    # form) sees the same, whitespace-free text.
    comp = None if composition is None else composition.strip()

    # Protein prior is Robinson-Robinson unless set explicitly
    if alphabet == unambiguous_protein_alphabet:
        # startswith/endswith are safe on an empty string, unlike [0]/[-1].
        if comp is not None and comp.startswith('{') and comp.endswith('}'):
            prior = _parse_explicit_composition(comp, alphabet, weight)
        else:
            prior = weight * na.array(aa_composition, na.float64)

    # Nucleotide prior depends on input
    else:
        if comp is None:
            comp = 'equiprobable'

        if comp.lower() == 'equiprobable':
            prior = weight * equiprobable_distribution(len(alphabet))

        elif comp in std_percentCG:
            prior = weight * base_distribution(std_percentCG[comp])

        elif comp.endswith('%'):
            prior = weight * base_distribution(float(comp[:-1]))

        elif isfloat(comp):
            # A bare fraction is converted to a percentage.
            prior = weight * base_distribution(float(comp) * 100.)

        elif comp.startswith('{') and comp.endswith('}'):
            prior = _parse_explicit_composition(comp, alphabet, weight)

        else:
            # Also reached for an empty / whitespace-only composition.
            raise ValueError("Unknown or malformed composition: %s" %
                             composition)

    if len(prior) != len(alphabet):
        raise ValueError(
            "The sequence alphabet and composition are incompatible.")

    return prior
Esempio n. 2
0
def parse_prior(composition, alphabet, weight=None):
    """Parse a description of the expected monomer distribution of a sequence.

    Valid compositions:

    - None or 'none' :      No composition specified
    - 'auto' or 'automatic': Use the typical average distribution
                            for proteins and an equiprobable distribution for
                            everything else.
    - 'equiprobable' :      All monomers have the same probability.
    - a percentage, e.g. '45%' or a fraction '0.45':
                            The fraction of CG bases for nucleotide alphabets
    - a species name, e.g. 'E. coli', 'H. sapiens' :
                            Use the average CG percentage for the species'
                            genome (looked up in ``std_percentCG``).
    - An explicit distribution,  e.g. {'A':10, 'C':40, 'G':40, 'T':10}

    Leading and trailing whitespace in ``composition`` is ignored.

    Args:
        composition: None or a string in one of the forms listed above.
        alphabet: The sequence alphabet the prior is defined over.
        weight: Non-negative scale factor applied to the distribution.
            Defaults to ``sqrt(len(alphabet))`` when an alphabet is given.

    Returns:
        None when no composition is specified, otherwise the weighted prior
        with one entry per letter of ``alphabet``.

    Raises:
        ValueError: If ``weight`` is negative, if the composition is empty,
            unknown or malformed, or if an explicit distribution does not
            match the alphabet.
    """
    if composition is None:
        return None

    comp = composition.strip()
    keyword = comp.lower()
    if keyword == 'none':
        return None

    # Reject an empty string up front; indexing it further down would
    # otherwise surface as an unrelated IndexError.
    if not comp:
        raise ValueError("Unknown or malformed composition: %s" % composition)

    if weight is None and alphabet is not None:
        weight = sqrt(float(len(alphabet)))

    if weight < 0:
        raise ValueError("Weight cannot be negative.")

    if keyword == 'equiprobable':
        prior = weight * equiprobable_distribution(len(alphabet))

    elif keyword in ('auto', 'automatic'):
        if alphabet == unambiguous_protein_alphabet:
            prior = weight * asarray(aa_composition, float64)
        else:
            prior = weight * equiprobable_distribution(len(alphabet))

    elif comp in std_percentCG:
        prior = weight * base_distribution(std_percentCG[comp])

    elif comp.endswith('%'):
        prior = weight * base_distribution(float(comp[:-1]))

    elif isfloat(comp):
        # A bare fraction is converted to a percentage.
        prior = weight * base_distribution(float(comp) * 100.)

    elif comp.startswith('{') and comp.endswith('}'):
        # Reduce "{'A':10, 'C':40}" to the flat token list
        # ['A', '10', 'C', '40'] by blanking out all punctuation.
        tokens = comp[1:-1]
        for separator in (',', "'", '"', ':'):
            tokens = tokens.replace(separator, ' ')
        tokens = tokens.split()

        if len(tokens) != len(alphabet) * 2:
            raise ValueError("Explicit prior does not match length of alphabet")

        # -1 marks letters that have not been assigned a value yet.
        prior = -ones(len(alphabet), float64)
        try:
            for letter, value in zip(tokens[0::2], tokens[1::2]):
                prior[alphabet.ord(letter)] = float(value)
        except ValueError:
            raise ValueError("Cannot parse explicit composition")

        if any(prior == -1.):
            raise ValueError("Explicit prior does not match alphabet")
        prior /= sum(prior)
        prior *= weight

    else:
        raise ValueError("Unknown or malformed composition: %s" % composition)

    if len(prior) != len(alphabet):
        raise ValueError(
            "The sequence alphabet and composition are incompatible.")
    return prior
Esempio n. 3
0
def parse_prior(composition, alphabet, weight=None):
    """Parse a description of the expected monomer distribution of a sequence.

    Valid compositions:

    - None or 'none' :      No composition specified
    - the path of an existing file:
                            One "<letter> <value>" pair per line; blank
                            lines are skipped and the values are normalised
                            to sum to one.
    - 'auto' or 'automatic': Use the typical average distribution
                            for proteins and an equiprobable distribution for
                            everything else.
    - 'equiprobable' :      All monomers have the same probability.
    - a percentage, e.g. '45%' or a fraction '0.45':
                            The fraction of CG bases for nucleotide alphabets
    - An explicit distribution,  e.g. {'A':10, 'C':40, 'G':40, 'T':10}

    Leading and trailing whitespace in ``composition`` is ignored.

    Args:
        composition: None or a string in one of the forms listed above.
        alphabet: The sequence alphabet the prior is defined over.
        weight: Non-negative scale factor applied to the distribution.
            Defaults to ``sqrt(len(alphabet))`` when an alphabet is given.

    Returns:
        ``[None, None]`` when no composition is specified, otherwise
        ``[prior, compos]`` where ``compos`` is the distribution and
        ``prior`` is ``weight * compos``.

    Raises:
        ValueError: If ``weight`` is negative, if the composition (or the
            composition file) is empty, unknown or malformed, or if an
            explicit distribution does not match the alphabet.
    """
    if composition is None:
        return [None, None]

    comp = composition.strip()
    keyword = comp.lower()
    if keyword == 'none':
        return [None, None]

    # Reject an empty string up front; indexing it further down would
    # otherwise surface as an unrelated IndexError.
    if not comp:
        raise ValueError("Unknown or malformed composition: %s" % composition)

    if weight is None and alphabet is not None:
        weight = sqrt(float(len(alphabet)))
    if weight < 0:
        raise ValueError("Weight cannot be negative.")

    def position_of(symbol):
        # Codon alphabets are addressed with index(); all others with ord().
        if alphabet in [codon_dna_alphabet, codon_rna_alphabet]:
            return alphabet.index(symbol)
        return alphabet.ord(symbol)

    # isfile (not exists) so that a directory that happens to share its name
    # with a keyword is not mistaken for a composition file.
    if os.path.isfile(comp):
        # -1 marks letters that have not been assigned a value yet.
        compos = -ones(len(alphabet), float64)
        try:
            # The context manager closes the file even if parsing fails.
            with open(comp) as handle:
                for line in handle:
                    fields = line.split()
                    if not fields:
                        continue
                    if len(fields) < 2:
                        raise ValueError("missing value")
                    compos[position_of(fields[0])] = float(fields[1])
        except ValueError:
            raise ValueError("Cannot parse composition file.")
        if any(compos == -1.):
            raise ValueError("Explicit prior does not match alphabet")
        compos /= sum(compos)

    elif keyword == 'equiprobable':
        compos = equiprobable_distribution(len(alphabet))

    elif keyword in ('auto', 'automatic'):
        if alphabet == unambiguous_protein_alphabet:
            compos = asarray(aa_composition, float64)
        else:
            compos = equiprobable_distribution(len(alphabet))

    elif comp.endswith('%'):
        compos = base_distribution(float(comp[:-1]))

    elif isfloat(comp):
        # A bare fraction is converted to a percentage.
        compos = base_distribution(float(comp) * 100.)

    elif comp.startswith('{') and comp.endswith('}'):
        # Reduce "{'A':10, 'C':40}" to the flat token list
        # ['A', '10', 'C', '40'] by blanking out all punctuation.
        tokens = comp[1:-1]
        for separator in (',', "'", '"', ':'):
            tokens = tokens.replace(separator, ' ')
        tokens = tokens.split()

        if len(tokens) != len(alphabet) * 2:
            raise ValueError("Explicit prior does not match length of alphabet")

        compos = -ones(len(alphabet), float64)
        try:
            # Walk the (letter, value) pairs directly; the previous
            # range(len(...)/2) passed a float to range() on Python 3.
            for letter, value in zip(tokens[0::2], tokens[1::2]):
                compos[position_of(letter)] = float(value)
        except ValueError:
            raise ValueError("Cannot parse explicit composition")

        if any(compos == -1.):
            raise ValueError("Explicit prior does not match alphabet")
        compos /= sum(compos)

    else:
        raise ValueError("Unknown or malformed composition: %s" % composition)

    if len(compos) != len(alphabet):
        raise ValueError(
            "The sequence alphabet and composition are incompatible.")
    prior = weight * compos
    return [prior, compos]