Example #1
0
    def protAnalysis(self, content):
        """Compute ProtParam descriptors for a sequence in three merged views.

        ``content`` is normalised with ``Parsers.normalizeSequence`` (using
        ``self.sourceType``) and then analysed with ``ProteinAnalysis``.

        Returns:
            A 3-tuple of dicts:

            1. the scalar descriptors merged with the secondary structure
               fractions as returned by ``self.flatten('proteinSecstruc', ...)``;
            2. dict 1 merged with ``self.flatten('proteinFlex', flexibility)``;
            3. dict 1 merged with ``self.flatten(list(sequence), flexibility)``.

            In every merge the entries of dict 1 take precedence on key clashes.
        """
        sequence = Parsers.normalizeSequence(content, self.sourceType)
        analyser = ProteinAnalysis(sequence)

        scalars = {
            'proteinMWeight': analyser.molecular_weight(),
            'proteinAroma': analyser.aromaticity(),
            'proteinInstab': analyser.instability_index(),
            'proteinIsoelec': analyser.isoelectric_point(),
            'proteinGravy': analyser.gravy(),
        }

        secondary = self.flatten('proteinSecstruc',
                                 analyser.secondary_structure_fraction())
        # Secondary structure keys come first; scalar values win on clashes.
        result = {**secondary, **scalars}

        profile = analyser.flexibility()
        by_prefix = self.flatten('proteinFlex', profile)
        by_residue = self.flatten(list(sequence), profile)

        return result, {**by_prefix, **result}, {**by_residue, **result}
Example #2
0
def sequence_vector(temp_window: str, window: int = 6, chemical=1):
    """
    Encode a sequence window as a list of numbers.

    The input is passed through ``clean`` and then ``windower`` (centred on the
    middle of the cleaned sequence, ``window`` residues per wing). Each residue
    of the resulting window is mapped to an integer code; "X" maps to 0, and 0
    is also used to right-pad the vector up to ``2 * window + 1`` entries.

    When ``chemical == 1`` three extra values computed on the window are
    appended: GRAVY, instability index and aromaticity.
    """
    cleaned = clean(temp_window)
    centred = windower(sequence=cleaned, position=int(len(cleaned)*.5), wing_size=window)

    codes = {"G": 1, "A": 2, "L": 3, "M": 4, "F": 5, "W": 6, "K": 7, "Q": 8, "E": 9, "S": 10, "P": 11, "V": 12, "I": 13,
             "C": 14, "Y": 15, "H": 16, "R": 17, "N": 18, "D": 19, "T": 20, "X": 0}
    expected_len = (window*2)+1

    vec = [codes[residue] for residue in centred]
    # A non-positive repeat count yields an empty list, so long windows are
    # left untouched and short ones are padded with zeros.
    vec.extend([0] * (expected_len - len(vec)))

    # Hydrophobicity and the other chemical descriptors are optional
    if chemical == 1:
        props = ProteinAnalysis(centred)
        vec.extend([props.gravy(), props.instability_index(), props.aromaticity()])

    return vec
Example #3
0
def protein_properties(seq):
    """Bundle several ProtParam descriptors of ``seq`` into a ``ProtProp``.

    seq is a Bio.Seq.Seq or str representing protein sequence.

    The instability index and GRAVY are reported as ``None`` when their
    computation raises ``KeyError``.
    """
    def _or_none(compute):
        # Run one descriptor and translate a KeyError into "not available".
        try:
            return compute()
        except KeyError:
            return None

    analyser = ProteinAnalysis(seq)

    counts = analyser.count_amino_acids()
    aromaticity = analyser.aromaticity()
    pi_value = analyser.isoelectric_point()
    instability = _or_none(analyser.instability_index)
    gravy = _or_none(analyser.gravy)

    return ProtProp(aa=str(seq),
                    gravy=gravy,
                    aromaticity=aromaticity,
                    isoelectric_point=pi_value,
                    instability=instability,
                    aa_counts=counts)
Example #4
0
def featureExtraction(train_df, test_df):
    """Replace peptide strings with ProtParam features for train and test sets.

    ``train_df`` is expected to hold the labels in column ``0`` and the peptide
    in column ``1``; the peptide is read from the first column of the frame
    obtained by stacking the relabelled training frame on top of ``test_df``.

    Peptides containing 'X' or 'Z' are not analysed; all four of their
    features are set to -1.

    Returns:
        (train_features, test_features, Y) where ``Y`` is column ``0`` of the
        original ``train_df`` and the feature frames have the peptide column
        (label ``0``) dropped.
    """
    feature_cols = ('molecular_weight', 'isoelectric_point',
                    'aromaticity', 'stability')

    n_train = len(train_df)
    Y = train_df[0]
    peptides_only = train_df.drop(columns=0).rename(columns={1: 0})
    big = pd.concat([peptides_only, test_df], ignore_index=True)
    for col in feature_cols:
        big[col] = 0.0

    for row, peptide in enumerate(big.iloc[:, 0].tolist()):
        # invalid peptide check: flag every feature with -1
        if 'X' in peptide or 'Z' in peptide:
            for col in feature_cols:
                big.at[row, col] = -1
            continue
        model = ProteinAnalysis(peptide)
        big.at[row, 'molecular_weight'] = model.molecular_weight()
        big.at[row, 'isoelectric_point'] = model.isoelectric_point()
        big.at[row, 'aromaticity'] = model.aromaticity()
        big.at[row, 'stability'] = model.instability_index()

    big = big.drop(columns=0)
    return big.iloc[:n_train, ], big.iloc[n_train:, ], Y
Example #5
0
    def _protein_parameters(self, sequence):
        """Build the physicochemical feature vector for an amino acid sequence.

        Args:
            sequence: str, amino acid sequence.

        Returns:
            property_arr: np array holding, in order: molecular weight,
            aromaticity, instability index, GRAVY, isoelectric point, the
            first three secondary structure fractions, the two molar
            extinction coefficients and ``self._net_charge(sequence)``.

        """
        analysis = ProteinAnalysis(sequence)

        scalars = [
            analysis.molecular_weight(),
            analysis.aromaticity(),
            analysis.instability_index(),
            analysis.gravy(),
            analysis.isoelectric_point(),
        ]
        fractions = analysis.secondary_structure_fraction()
        extinction = analysis.molar_extinction_coefficient()

        return np.array(
            scalars
            + [fractions[0], fractions[1], fractions[2]]
            + [extinction[0], extinction[1]]
            + [self._net_charge(sequence)]
        )
Example #6
0
def physchem_props(data):
    """Append physicochemical properties to each tab-separated input line.

    Args:
        data: iterable of tab-separated lines. The second-to-last field of
            each line (the sequon, not the full sequence) is analysed.

    Returns:
        A list of lines: a fixed header followed by every usable input line
        extended with molecular weight, GRAVY, aromaticity, instability index
        and isoelectric point. Lines whose sequon is empty or contains "X" or
        "*" are dropped.

    If the analysis of a line fails, the split line is printed and the
    process exits with status 1.
    """
    header = "ID\tclass\tindex\tsequon\tsequence\tmol_weight\tgravy\taromaticity\tinstab_index\tiso_point\n"
    new_table = [header]
    for line in data:
        stripped = line.rstrip()
        split_line = stripped.split('\t')
        seq = split_line[-2]  # Sequon, not sequence
        if "X" in seq or '*' in seq or seq == '':
            continue  # Skip non-usable sequences, only negs
        try:
            a_seq = ProteinAnalysis(seq)
            results = [
                a_seq.molecular_weight(),
                a_seq.gravy(),
                a_seq.aromaticity(),
                a_seq.instability_index(),
                a_seq.isoelectric_point(),
            ]
        except Exception:
            # Deliberately not a bare ``except``: Ctrl-C and SystemExit must
            # propagate instead of being reported as a bad input line.
            print(split_line)
            sys.exit(1)
        new_table.append(stripped + "\t{}\t{}\t{}\t{}\t{}\n".format(*results))
    return new_table
def get_biopython_features(X):
    """Return a ``(len(X), 6)`` array of ProtParam features, one row per sequence.

    Columns, in order: molecular weight, instability index, isoelectric point
    and the values returned by ``secondary_structure_fraction()``.
    """
    res = np.zeros((X.shape[0], 6))
    for row, seq in enumerate(X):
        analysed = ProteinAnalysis(seq)
        weight = analysed.molecular_weight()
        instability = analysed.instability_index()
        pi_value = analysed.isoelectric_point()
        fractions = analysed.secondary_structure_fraction()
        res[row] = np.array([weight, instability, pi_value, *fractions])

    return res
Example #8
0
def protein_analysis():
    """Controller: show the sequence form and, on submit, analyse the protein.

    Users without a ``session.username`` are redirected to ``account/log_in``.
    On a valid submission the upper-cased, ``seqClean``-ed sequence and its
    ProtParam results are stored in the session and the request is redirected
    to ``protein_analysis_output``. Otherwise the form is returned for
    rendering.
    """
    if session.username is None:
        redirect(URL(r=request, c='account', f='log_in'))
    from Bio.SeqUtils.ProtParam import ProteinAnalysis

    sequence_row = TR(
        "Amino acid sequence:  ",
        TEXTAREA(_type="text", _name="sequence", requires=IS_NOT_EMPTY()))
    submit_button = INPUT(_type="submit", _value="SUBMIT")
    form = FORM(TABLE(sequence_row, submit_button))

    if form.accepts(request.vars, session):
        cleaned = seqClean(form.vars.sequence.upper())
        session['sequence'] = cleaned
        analyser = ProteinAnalysis(cleaned)
        session['aa_count'] = analyser.count_amino_acids()
        session['percent_aa'] = analyser.get_amino_acids_percent()
        session['mw'] = analyser.molecular_weight()
        session['aromaticity'] = analyser.aromaticity()
        session['instability'] = analyser.instability_index()
        session['flexibility'] = analyser.flexibility()
        session['pI'] = analyser.isoelectric_point()
        session['sec_struct'] = analyser.secondary_structure_fraction()
        redirect(URL(r=request, f='protein_analysis_output'))
    return dict(form=form)
    def parse_pro_sequence(self, p_seq, id=None, desc=None):
        """Record a protein sequence together with its derived properties.

        Residues not contained in ``proteins`` are dropped, the remainder is
        reverse-translated with the first codon listed for each residue in
        ``dna_codons``, and GC content plus several ``ProteinAnalysis`` values
        are computed.

        All values are computed first and appended to the per-sequence lists
        only once everything succeeded, so a failure cannot leave the lists
        with different lengths. Any exception is reported on stdout and
        swallowed.

        Keyword arguments:
        p_seq -- protein sequence string
        id -- id obtained from FASTA file record (default None)
        desc -- description obtained from FASTA file record (default None)
        """
        try:
            p_seq = ''.join([pro for pro in p_seq if pro in proteins])

            # Reverse lookup built once: residue -> first codon in dna_codons
            # (the same codon the original index()-based search would pick).
            first_codon = {}
            for codon, residue in dna_codons.items():
                first_codon.setdefault(residue, codon)

            # reverse translate protein to nucleotide sequence
            n_seq = ''.join([first_codon[pro] for pro in p_seq])

            # GC content
            gc_content = self.calculate_gc_content(n_seq)

            # protein analysis methods
            analysis = ProteinAnalysis(p_seq)
            aa_percent = analysis.get_amino_acids_percent()
            weight = analysis.molecular_weight()
            instability = analysis.instability_index()
            aromaticity = analysis.aromaticity()

        except Exception as e:
            print('-'*80)
            print(f"Exception in parsing uploaded virus sequence: {e}")
            traceback.print_exc(file=sys.stdout)
            print('-'*80)
        else:
            # append fasta sequence metadata and results in one go
            self.id.append(id)
            self.description.append(desc)
            self.nucleotide_sequence.append(n_seq)
            self.protein_sequence.append(p_seq)
            self.gc_content.append(gc_content)
            self.amino_acid_dict.append(aa_percent)
            self.molecular_weight.append(weight)
            self.instability_index.append(instability)
            self.aromaticity.append(aromaticity)
def calculate_physiochemical_features(temp_dict, sequence):
    """Add ProtParam descriptors of ``sequence`` to ``temp_dict`` in place.

    The named descriptors are added first, followed by the entries of
    ``get_amino_acids_percent()``. The molar extinction coefficient stored is
    the mean of the two values returned by Biopython. Nothing is returned.
    """
    analyser = ProteinAnalysis(sequence)

    charge = analyser.charge_at_pH(7)
    instability = analyser.instability_index()
    weight = analyser.molecular_weight()
    aromaticity = analyser.aromaticity()
    first_coeff, second_coeff = analyser.molar_extinction_coefficient()
    mean_extinction = (float(first_coeff) + float(second_coeff)) / 2
    # Grand Average Hydropathy - higher value = more hydrophobic
    gravy = analyser.gravy()
    pi_value = analyser.isoelectric_point()
    helix, turn, sheet = analyser.secondary_structure_fraction()

    temp_dict.update({
        "Charge at pH7": charge,
        "Instability Index": instability,
        "Molecular Wt": weight,
        "Aromaticity": aromaticity,
        "Molar Extinction Coeff": mean_extinction,
        "Gravy": gravy,
        "Isoelectric pt": pi_value,
        "Helix Fraction": helix,
        "Turn Fraction": turn,
        "Sheet Fraction": sheet
    })

    # Added separately because get_amino_acids_percent() already returns a dict
    temp_dict.update(analyser.get_amino_acids_percent())
Example #11
0
def protparm(cudir, filename, name):
    """Classify the stability of the last record of a FASTA file.

    The file ``cudir/name/filename`` is parsed as FASTA and the instability
    index (rounded to two decimals) is computed for every record; only the
    result for the last record is returned.

    Returns:
        (ii, stab, stab_coff): the rounded instability index, "stable" with
        coefficient 1 when the index is below 40, otherwise "unstable" with
        coefficient 0.

    Raises:
        ValueError: if the file contains no FASTA record.
    """
    fasta_path = cudir + "/" + name + "/" + filename
    result = None
    # The context manager closes the handle even if parsing fails.
    with open(fasta_path) as handle:
        for fasta in SeqIO.parse(handle, "fasta"):
            # Compute the index once per record instead of once per use.
            ii = round(ProteinAnalysis(str(fasta.seq)).instability_index(), 2)
            if ii < 40:
                result = (ii, "stable", 1)
            else:
                result = (ii, "unstable", 0)
    if result is None:
        raise ValueError("no FASTA records found in " + fasta_path)
    return result
def get_protein_analysis(aa):
    """Return ProtParam features of ``aa`` as a flat list.

    Order: molecular weight, aromaticity, instability index, isoelectric
    point, GRAVY, then the secondary structure fractions.
    """
    analyser = ProteinAnalysis(aa)
    features = [
        analyser.molecular_weight(),
        analyser.aromaticity(),
        analyser.instability_index(),
        analyser.isoelectric_point(),
        analyser.gravy(),
    ]
    features.extend(analyser.secondary_structure_fraction())
    return features
Example #13
0
    def get_instability_index(self):
        """
        Compute the Biopython instability index of ``self.ProteinSequence``.

        :return: dictionary with the single key ``'Instability_index'``
        """
        analyser = ProteinAnalysis(self.ProteinSequence)
        return {'Instability_index': analyser.instability_index()}
Example #14
0
def properties(toxin_faa, antitoxin_faa, out):
    """Write pI, weight and instability of gene pairs to ``<out>.properties.txt``.

    Both FASTA files are parsed; for every record with a truthy start position
    (as returned by ``getNameAndPosition``) the isoelectric point, molecular
    weight and instability index are collected per locus. Sequences that still
    contain "*" after stripping terminal stops, or contain "X", get 0 for all
    three values as a missing-data flag. Loci are then passed through
    ``orderPairs`` and only loci with exactly two genes are written.

    Args:
        toxin_faa: path to the toxin protein FASTA file.
        antitoxin_faa: path to the antitoxin protein FASTA file.
        out: prefix of the output file name.

    Returns:
        The path of the tab-separated file that was written.
    """
    from collections import defaultdict
    from Bio import SeqIO

    # {locus: [{'start': ..., 'pI': ..., 'weight': ..., 'instability': ...}]}
    loci = defaultdict(list)
    for f in [toxin_faa, antitoxin_faa]:
        # Plain 'r': the 'U' flag is rejected by open() on Python 3.11+, and
        # universal newlines are already the default in text mode.
        with open(f, 'r') as handle:
            for record in SeqIO.parse(handle, 'fasta'):
                locus, start = getNameAndPosition(record)
                if not start:
                    continue
                aaseq = str(record.seq).strip("*")
                if "*" not in aaseq and "X" not in aaseq:
                    data = ProteinAnalysis(aaseq)
                    entry = {
                        'start': start,
                        'pI': data.isoelectric_point(),
                        'weight': data.molecular_weight(),
                        'instability': data.instability_index()
                    }
                else:
                    # Missing positions or premature stops: 0 flags missing data
                    entry = {
                        'start': start,
                        'pI': 0,
                        'weight': 0,
                        'instability': 0
                    }
                loci[locus].append(entry)

    # Order genes in a locus positionally
    loci = orderPairs(loci)

    # Write to output file
    outfile = ".".join([out, "properties", "txt"])
    with open(outfile, 'w') as o:
        header = "\t".join([
            "locus", "gene1_pI", "gene2_pI", "gene1_weight", "gene2_weight",
            "gene1_instability", "gene2_instability"
        ])

        o.write("#" + header.upper() + "\n")
        # items() exists on both Python 2 and 3; iteritems() is Python 2 only.
        for locus, gene in loci.items():
            if len(gene) != 2:
                continue
            fields = [
                locus, gene[0]['pI'], gene[1]['pI'], gene[0]['weight'],
                gene[1]['weight'], gene[0]['instability'],
                gene[1]['instability']
            ]
            o.write("\t".join([str(field) for field in fields]) + "\n")
    return outfile
Example #15
0
 def pept_counter(self):
     """Set ``self.inst`` from the instability index of ``self.pept``.

     An empty/falsy peptide gets the value 100. Otherwise the instability
     index is stored only when it is positive; ``self.inst`` is left
     untouched for a non-positive index.
     """
     if not self.pept:
         self.inst = 100
         return
     inst = ProteinAnalysis(self.pept).instability_index()
     if inst > 0:
         self.inst = inst
Example #16
0
def make_dataset(fasta):
    """Build a feature table for every sequence in ``fasta`` and pickle it.

    The class label is derived from the file name: 0 when it contains 'tar'
    (tardigrades), 1 when it contains 'pop' (poplars). For each record the
    length, molecular weight, aromaticity, instability index, flexibility
    profile, isoelectric point, secondary structure fractions and amino acid
    percentages are collected. The resulting DataFrame is printed and written
    to ``name + '_set.pkl'``.

    Raises:
        ValueError: if ``fasta`` contains neither 'tar' nor 'pop', so no
            label can be assigned.
    """
    # assign whether it's from tardigrades 'tar' or poplars 'pop'
    if 'tar' in fasta:
        target = 0
    elif 'pop' in fasta:
        target = 1
    else:
        # Fail before parsing instead of with an unbound ``target`` afterwards.
        raise ValueError(
            "cannot infer target from %r: expected 'tar' or 'pop' in the name"
            % (fasta,))

    # a list of dictionaries containing features for all sequences
    ls_features = []
    for record in SeqIO.parse(fasta, "fasta"):
        analysed_seq = ProteinAnalysis(str(record.seq))
        frac = analysed_seq.secondary_structure_fraction()

        # the dictionary containing features for a single sequence
        dict_features = {
            'length': len(record.seq),
            'mol_weight': analysed_seq.molecular_weight(),
            # was mistakenly filled with molecular_weight()
            'aromaticity': analysed_seq.aromaticity(),
            'stability': analysed_seq.instability_index(),
            'flexibility': analysed_seq.flexibility(),
            'isoelectric': analysed_seq.isoelectric_point(),
            'helix': frac[0],
            'turn': frac[1],
            'sheet': frac[2],
        }

        # merge the AAC composition of the entire sequence into the features
        dict_features.update(analysed_seq.get_amino_acids_percent())
        ls_features.append(dict_features)

    df = pd.DataFrame(ls_features)
    df['target'] = target

    print(df)
    # NOTE(review): ``name`` is not defined in this function; presumably a
    # module-level variable -- confirm against the rest of the file.
    df.to_pickle(name + '_set.pkl')
Example #17
0
def biopython_protein_analysis(inseq):
    """Utilize Biopython's ProteinAnalysis module to return general sequence properties of an amino acid string.

    For full definitions see: http://biopython.org/DIST/docs/api/Bio.SeqUtils.ProtParam.ProteinAnalysis-class.html

    Args:
        inseq: Amino acid sequence; converted with ``cast_to_str`` first.

    Returns:
        dict: Dictionary of sequence properties, every key suffixed with
        ``-biop``. Some definitions include:
        instability_index: Any value above 40 means the protein is unstable (has a short half life).
        percent_helix_naive / percent_turn_naive / percent_strand_naive:
        elements 0, 1 and 2 of ``secondary_structure_fraction()``.

    TODO:
        Finish definitions of dictionary

    """

    inseq = ssbio.protein.sequence.utils.cast_to_str(inseq)

    analysed_seq = ProteinAnalysis(inseq)

    info_dict = {}
    info_dict['amino_acids_content-biop'] = analysed_seq.count_amino_acids()
    info_dict['amino_acids_percent-biop'] = analysed_seq.get_amino_acids_percent()
    info_dict['length-biop'] = analysed_seq.length
    info_dict['monoisotopic-biop'] = analysed_seq.monoisotopic
    info_dict['molecular_weight-biop'] = analysed_seq.molecular_weight()
    info_dict['aromaticity-biop'] = analysed_seq.aromaticity()
    info_dict['instability_index-biop'] = analysed_seq.instability_index()
    # TODO: What is flexibility?
    info_dict['flexibility-biop'] = analysed_seq.flexibility()
    info_dict['isoelectric_point-biop'] = analysed_seq.isoelectric_point()

    # grand average of hydrophobicity
    info_dict['gravy-biop'] = analysed_seq.gravy()

    # Secondary structure fractions are computed once and then split into one
    # entry per element instead of re-running the analysis for each of them.
    fractions = analysed_seq.secondary_structure_fraction()
    info_dict['percent_helix_naive-biop'] = fractions[0]
    info_dict['percent_turn_naive-biop'] = fractions[1]
    info_dict['percent_strand_naive-biop'] = fractions[2]

    return info_dict
def phyChemProps(seq):
    """Return ten physicochemical descriptors of ``seq`` as a list.

    Order: aromaticity, the three secondary structure fractions, GRAVY,
    instability index, isoelectric point, molecular weight and the two molar
    extinction coefficients.
    """
    X = ProteinAnalysis(seq)
    aromaticity = X.aromaticity()
    # Multi-valued results are computed once and unpacked by index, rather
    # than being recomputed for every element.
    fractions = X.secondary_structure_fraction()
    gravy = X.gravy()
    instability = X.instability_index()
    pi_value = X.isoelectric_point()
    weight = X.molecular_weight()
    extinction = X.molar_extinction_coefficient()
    return [
        aromaticity,
        fractions[0],
        fractions[1],
        fractions[2],
        gravy,
        instability,
        pi_value,
        weight,
        extinction[0],
        extinction[1],
    ]
Example #19
0
def analysis(listofaas, outlist):
    """Append ``[molecular weight, instability index, pI]`` per protein to ``outlist``.

    Proteins whose analysis raises ``ValueError`` or ``KeyError`` are skipped
    silently; nothing is appended for them. The function returns ``None``.
    """
    for prot in listofaas:
        try:
            analyser = ProteinAnalysis(prot)
            outlist.append([
                analyser.molecular_weight(),
                analyser.instability_index(),
                analyser.isoelectric_point(),
            ])
        except (ValueError, KeyError):
            # Unanalysable sequence: leave it out of the results.
            pass
Example #20
0
def properties(toxin_faa,antitoxin_faa,out):
    """Write pI, weight and instability of gene pairs to ``<out>.properties.txt``.

    Records from both FASTA files are grouped by the locus returned by
    ``getNameAndPosition``; records without a truthy start are ignored.
    Sequences that contain "X", or "*" anywhere but at the ends, are stored
    with 0 for every property as a missing-data flag. After ``orderPairs``
    only loci holding exactly two genes are written, one line per locus.

    Args:
        toxin_faa: path to the toxin protein FASTA file.
        antitoxin_faa: path to the antitoxin protein FASTA file.
        out: prefix of the output file name.

    Returns:
        The path of the tab-separated file that was written.
    """
    # Build a dictionary of {locus:[{properties:values},{properties:values}]}
    from collections import defaultdict
    loci = defaultdict(list)
    from Bio import SeqIO
    for f in [toxin_faa,antitoxin_faa]:
        # Parse FASTA files. Mode 'r' rather than 'rU': the 'U' flag is
        # rejected by open() on Python 3.11+ and is redundant in text mode.
        with open(f,'r') as handle:
            for record in SeqIO.parse(handle,'fasta'):
                locus,start = getNameAndPosition(record)
                if not start:
                    continue
                aaseq = str(record.seq).strip("*")
                # Omit sequences with missing positions or premature stops
                # give them 0 as flag for missing data instead
                usable = "*" not in aaseq and "X" not in aaseq
                if usable:
                    data = ProteinAnalysis(aaseq)
                    loci[locus].append({ 'start':  start,
                                         'pI':     data.isoelectric_point(),
                                         'weight': data.molecular_weight(),
                                         'instability': data.instability_index() })
                else:
                    loci[locus].append({ 'start': start,
                                         'pI': 0, 'weight':0 ,
                                         'instability': 0 })

    # Order genes in a locus positionally
    loci = orderPairs(loci)

    # Write to output file
    outfile = ".".join([out,"properties","txt"])
    with open(outfile,'w') as o:
        header = "\t".join(["locus",
                            "gene1_pI","gene2_pI",
                            "gene1_weight","gene2_weight",
                            "gene1_instability","gene2_instability" ])

        o.write("#"+ header.upper() + "\n")
        # items() works on Python 2 and 3; iteritems() only exists on Python 2.
        for locus, gene in loci.items():
            if len(gene) != 2:
                continue
            first, second = gene[0], gene[1]
            line = [ str(value) for value in (
                         locus,first['pI'],second['pI'],
                         first['weight'],second['weight'],
                         first['instability'],second['instability']) ]
            o.write("\t".join(line)+"\n")
    return outfile
Example #21
0
    def __init__(self, sequence):
        """Compute and store a set of descriptors for ``sequence``.

        Values come from three sources: Biopython's ``ProteinAnalysis``,
        ``calculate_*`` helpers defined elsewhere in the project, and R
        expressions evaluated through ``r(...)`` after ``require(Peptides)``.

        Args:
            sequence: the sequence to describe. ``len()`` is taken of it and it
                is interpolated into an R string literal, so presumably a
                ``str`` of one-letter amino acid codes -- TODO confirm.
        """
        self.sequence = sequence
        self.sequence_length = len(sequence)
        analysis = ProteinAnalysis(sequence)

        self.amino_acid_percents = analysis.get_amino_acids_percent()
        self.amino_acids_composition = calculate_amino_acids_composition(sequence)
        self.aromaticity = analysis.aromaticity()
        self.instability = analysis.instability_index()
        self.flexibility = calculate_flexibility(sequence)
        # Each entry pairs a display name with a per-residue scale table; the
        # tables (hw, em, ja, bulkiness, ...) are defined outside this method.
        protein_scale_parameters = [{'name': 'Hydrophilicity', 'dictionary': hw},
                                    {'name': 'Surface accessibility', 'dictionary': em},
                                    {'name': 'Janin Interior to surface transfer energy scale', 'dictionary': ja},
                                    {'name': 'Bulkiness', 'dictionary': bulkiness},
                                    {'name': 'Polarity', 'dictionary': polarity},
                                    {'name': 'Buried residues', 'dictionary': buried_residues},
                                    {'name': 'Average area buried', 'dictionary': average_area_buried},
                                    {'name': 'Retention time', 'dictionary': retention_time}]
        self.protein_scales = calculate_protein_scales(analysis, protein_scale_parameters)
        self.isoelectric_point = analysis.isoelectric_point()
        self.secondary_structure_fraction = calculate_secondary_structure_fraction(analysis)
        self.molecular_weight = analysis.molecular_weight()
        # Stored under the name ``kyte_plot`` but holds the single GRAVY value.
        self.kyte_plot = analysis.gravy()
        self.pefing = calculate_pefing(sequence)

        # next parameters are calculated using R.Peptides
        r('require(Peptides)')
        # NOTE(review): the sequence is pasted into R source unescaped; a
        # double quote or backslash in it would break or alter the R code.
        r('sequence = "{0}"'.format(sequence))
        self.aliphatic_index = r('aindex(sequence)')[0]
        self.boman_index = r('boman(sequence)')[0]
        self.charges = calculate_charges(sequence, 1.0, 14.0, 0.5, 'Lehninger')
        # NOTE(review): this evaluates R's ``seq(sequence)``, which does not
        # look like a hydrophobicity function -- confirm the intended call.
        self.hydrophobicity = r('seq(sequence)')[0]
        # Named rotation angles passed to the hydrophobic moment and peptide
        # type helpers below.
        angles = [{'name': 'Alpha-helix', 'angle': -47},
                  {'name': '3-10-helix', 'angle': -26},
                  {'name': 'Pi-helix', 'angle': -80},
                  {'name': 'Omega', 'angle': 180},
                  {'name': 'Antiparallel beta-sheet', 'angle': 135},
                  {'name': 'Parallel beta-sheet', 'angle': 113}]
        # The polygly-polypro entry is only added when the P and G values from
        # get_amino_acids_percent() sum to more than 0.3.
        if self.amino_acid_percents['P'] + self.amino_acid_percents['G'] > 0.3:
            angles.append({'name': 'Polygly-polypro helix', 'angle': 153})
        self.hydrophobic_moments = calculate_hydrophobic_moments(sequence, angles)
        self.kidera_factors = calculate_kidera_factors(sequence)
        self.peptide_types = calculate_peptide_types(sequence, angles)
Example #22
0
    def calculate_properties_from_sequence(self):
        """
        Compute sequence-based properties of ``self.sequence`` and store them on the instance.

        Attributes set:
        avg_hydro - sum of the Eisenberg hydrophobicity values of all residues
                    (note: despite the name this is the total, not the mean)
        aromaticity, aa_percent, instability_index, isoelectric_point -
                    ProtParam parameters from Biopython's ProteinAnalysis

        A residue outside the 20 standard amino acids raises KeyError.
        """

        # Hydrophobicity -> Eisenberg scale
        eisenberg = {
            'A': 0.620,
            'R': -2.530,
            'N': -0.780,
            'D': -0.900,
            'C': 0.290,
            'Q': -0.850,
            'E': -0.740,
            'G': 0.480,
            'H': -0.400,
            'Y': 0.260,
            'I': 1.380,
            'L': 1.060,
            'K': -1.500,
            'M': 0.640,
            'F': 1.190,
            'P': 0.120,
            'S': -0.180,
            'T': -0.050,
            'W': 0.810,
            'V': 1.080
        }
        per_residue = [eisenberg[resi] for resi in self.sequence]
        self.avg_hydro = sum(per_residue)

        # ProtParam properties
        analyser = ProteinAnalysis(self.sequence)
        self.aromaticity = analyser.aromaticity()
        self.aa_percent = analyser.get_amino_acids_percent()
        self.instability_index = analyser.instability_index()
        self.isoelectric_point = analyser.isoelectric_point()
def bio_feat(record):
    """Return a flat feature array for the protein sequence of ``record``.

    Before analysis "X" is removed and the ambiguous codes are substituted:
    U -> C, B -> N, Z -> Q.

    The array holds, in order: sequence length, molecular weight, isoelectric
    point, instability index, the secondary structure fractions, the amino
    acid counts and the amino acid percentages.
    """
    # One pass over the string: drop X, substitute U/B/Z.
    substitutions = str.maketrans({"X": None, "U": "C", "B": "N", "Z": "Q"})
    raw = str(MutableSeq(record.seq))
    clean_seq = MutableSeq(raw.translate(substitutions)).toseq()

    ### features
    sequence = str(clean_seq)
    seq_length = len(sequence)
    analyser = ProteinAnalysis(sequence)
    weight = analyser.molecular_weight()
    percents = analyser.get_amino_acids_percent().values()
    pi_value = analyser.isoelectric_point()
    counts = analyser.count_amino_acids().values()
    instability = analyser.instability_index()
    fractions = analyser.secondary_structure_fraction()

    features = [seq_length, weight, pi_value, instability]
    features += list(fractions)
    features += list(counts)
    features += list(percents)
    return np.array(features)
def biochemical_properties(sequence: str) -> Dict[str, Any]:
    """Collect biochemical properties of ``sequence`` into a dictionary.

    Five values come from Biopython's ``ProteinAnalysis``; the H-bonding
    percent, melting temperature and LCC come from ``h_bonding_percent``,
    ``melting_temp`` and ``lcc.lcc_simp`` respectively.
    """
    # Define objects used for calculations
    analysis_object = ProteinAnalysis(sequence)
    # The next two objects are constructed but their results are not used
    # below; the calls are kept so construction-time behaviour is unchanged.
    PyPro.GetProDes(sequence)
    Seq(sequence)
    # TODO(Ahmed): Verify that all these calculations are actually returning reasonable values
    # For example, it says the percent composition of every amino acid is zero when I run
    # calculate_biochem_properties.biochemical_properties('qwertyipasdfghklcvnm')
    result = {}
    result['Isoelectric point'] = analysis_object.isoelectric_point()
    result['Molecular weight'] = analysis_object.molecular_weight()  # Daltons? Amu? g/mol?
    result['Aromaticity'] = analysis_object.aromaticity()
    result['Instability index'] = analysis_object.instability_index()
    result['GRAVY'] = analysis_object.gravy()
    result['H-bonding percent'] = h_bonding_percent(sequence)
    result['Melting temp'] = melting_temp(sequence)
    result['LCC'] = lcc.lcc_simp(sequence)
    return result
Example #25
0
def protein_analysis():
    """Controller: collect an amino acid sequence and store its analysis.

    Without a ``session.username`` the request is redirected to
    ``../account/log_in``. When the form is accepted, the upper-cased and
    ``seqClean``-ed sequence plus its ProtParam results are saved in the
    session and the request is redirected to ``protein_analysis_output``;
    otherwise the form is returned for rendering.
    """
    if session.username is None:
        redirect(URL(r=request,f='../account/log_in'))
    from Bio.SeqUtils.ProtParam import ProteinAnalysis

    sequence_box = TEXTAREA(_type="text", _name="sequence",
                            requires=IS_NOT_EMPTY())
    form = FORM(TABLE(
            TR("Amino acid sequence:  ", sequence_box),
            INPUT(_type="submit", _value="SUBMIT")))

    if form.accepts(request.vars,session):
        session['sequence'] = seqClean(form.vars.sequence.upper())
        analyser = ProteinAnalysis(session['sequence'])
        # (session key, bound method) pairs, evaluated in this order
        computations = (
            ('aa_count', analyser.count_amino_acids),
            ('percent_aa', analyser.get_amino_acids_percent),
            ('mw', analyser.molecular_weight),
            ('aromaticity', analyser.aromaticity),
            ('instability', analyser.instability_index),
            ('flexibility', analyser.flexibility),
            ('pI', analyser.isoelectric_point),
            ('sec_struct', analyser.secondary_structure_fraction),
        )
        for key, compute in computations:
            session[key] = compute()
        redirect(URL(r=request, f='protein_analysis_output'))
    return dict(form=form)
def get_features(seq):
    """Compute global features of a protein sequence.

    Counts, entropy and hydrophobic/polar/buried statistics are computed on
    the sequence as given. The symbols X, B, Z, ', O and U are then removed
    before the ProtParam descriptors and amino acid counts are computed.

    Parameters
    ----------
    seq : str
        protein sequence

    Return
    ----------
    dictionary:
        global features of the protein sequence

    """
    undefined_symbols = ('X', 'B', 'Z', "'", 'O', 'U')

    n_undefined = sum(1 for x in seq if x in undefined_symbols)
    length = len(seq)

    features = {}
    features['undefined_count'] = n_undefined
    features['length'] = length
    features['perc_undefined_count'] = n_undefined/length
    features['entropy'] = entropy(seq)
    features['ideal_entropy'] = entropy_ideal(len(seq))
    features['perc_entropy'] = features['entropy']/features['ideal_entropy']
    features['hydr_count'] = len([x for x in seq if x in hydrophobic_proteins])
    features['polar_count'] = len([x for x in seq if x in polar_proteins])
    features['buried'] = sum(buried[x] for x in seq if x in hydrophobic_proteins)

    # ProtParam only sees the residues that are not in undefined_symbols
    defined_only = ''.join(x for x in seq if x not in undefined_symbols)

    protein = ProteinAnalysis(defined_only)
    features['gravy'] = protein.gravy()
    features['molecular_weight'] = protein.molecular_weight()
    features['aromaticity'] = protein.aromaticity()
    features['instability_index'] = protein.instability_index()
    features['isoelectric_point'] = protein.isoelectric_point()
    helix, turn, sheet = protein.secondary_structure_fraction()
    features['helix'] = helix
    features['turn'] = turn
    features['sheet'] = sheet

    features.update(protein.count_amino_acids())
    return features
Example #27
0
    def seqs_to_features(self, seqs, no_seqs):
        """Turn the sequences into a ``(no_seqs, 32)`` feature matrix.

        ``seqs`` is an iterable of iterables of sequences; they are chained
        and each sequence fills one row. The first 23 columns hold the
        relative frequency of each letter of the alphabet below; the last
        nine hold GRAVY, length, three secondary structure fractions,
        isoelectric point, instability index, aromaticity and molecular
        weight. For the ProtParam analysis X, B and U are replaced by A.
        """
        alphabet = 'ABCDEFGHIKLMNPQRSTUVWXY'  # no JOZ
        to_alanine = str.maketrans('XBU', 'AAA')

        X = np.zeros((no_seqs, 32))
        for row, s in enumerate(chain(*seqs)):  # iterate over all sequences
            # relative amino acid frequencies
            length = len(s)
            for col, letter in enumerate(alphabet):
                X[row, col] = s.count(letter) / length

            # other analysis
            analysis = ProteinAnalysis(s.translate(to_alanine))
            weight = analysis.molecular_weight()
            aromaticity = analysis.aromaticity()
            instability = analysis.instability_index()
            pi_value = analysis.isoelectric_point()
            fractions = analysis.secondary_structure_fraction()
            gravy = analysis.gravy()  # mean hydrophobicity

            # Tail of the row, written from column -9 up to column -1.
            X[row, -9:] = [gravy, length, fractions[2], fractions[1],
                           fractions[0], pi_value, instability, aromaticity,
                           weight]
        return X
Example #28
0
def GetFeatures (My_seq):
    """Return a dictionary of ProtParam features for ``My_seq``.

    Single-valued entries: molecular weight, aromaticity, instability index
    and isoelectric point. Multi-valued entries: the flexibility list, the
    secondary structure fraction tuple, and the amino acid count and
    percentage dictionaries.
    """
    # The analysis object is built once (the original built a second,
    # discarded instance first).
    analysed_seq = ProteinAnalysis(My_seq)

    Features = {}

    # Single-valued features
    Features["Molecular_weight"] = analysed_seq.molecular_weight()
    Features["Aromaticity"] = analysed_seq.aromaticity()
    Features["Instability_index"] = analysed_seq.instability_index()
    Features["Isoelectric_point"] = analysed_seq.isoelectric_point()

    # Multi-valued features
    Features["Flexibility"] = analysed_seq.flexibility()  # list
    Features["Second_structure_fraction"] = analysed_seq.secondary_structure_fraction()  # 3-tuple
    Features["Count_amino_acids"] = analysed_seq.count_amino_acids()  # dict, 20 entries
    Features["Amino_acids_percent"] = analysed_seq.get_amino_acids_percent()  # dict, 20 entries

    return Features
def physchem_props(ara_d):
    """Calculate the physicochemical properties per protein in ara_d.

    Entries whose sequence contains "X" or "*" are skipped and left
    untouched; for a "*" sequence whose "pos" value is not an empty list the
    protein key is printed.  Every other entry receives a "Properties" dict
    with the values computed by ``ProteinAnalysis``.

    Args:
        ara_d: mapping of protein key to a dict holding at least "sequence"
            (and "pos" when the sequence contains "*").

    Returns:
        The same ``ara_d`` object, updated in place.
    """
    keys = (
        "mol_weight",
        "gravy",
        "aromaticity",
        "instab_index",
        "flexi",
        "iso_point",
        "seq_struct",
    )
    for protein in ara_d:
        entry = ara_d[protein]
        seq = entry["sequence"]
        if "X" in seq:
            continue  # Skip non-usable sequences, only negs
        if "*" in seq:
            if entry["pos"] != []:
                print(protein)
            continue
        a_seq = ProteinAnalysis(seq)
        # Values are listed in the same order as `keys` above.
        results = (
            a_seq.molecular_weight(),
            a_seq.gravy(),
            a_seq.aromaticity(),
            a_seq.instability_index(),
            a_seq.flexibility(),
            a_seq.isoelectric_point(),
            a_seq.secondary_structure_fraction(),
        )
        # Update ara_d with new physchem properties
        entry["Properties"] = dict(zip(keys, results))
    return ara_d
    def parse_nuc_sequence(self, n_seq, id=None, desc=None):
        """
        Parses a nucleotide sequence, translates it, calculates GC content and
        the protein statistics available from ProteinAnalysis() in BioPython.

        All derived values are computed first and only then appended to the
        per-record lists, so a failure while processing one record does not
        leave the lists with different lengths.

        Keyword arguments:
        n_seq -- nucleotide string sequence
        id -- id obtained from FASTA file record (default None)
        desc -- description obtained from FASTA file record (default None)

        Any exception is reported on stdout (message plus traceback) and
        swallowed; the method returns None in every case.
        """

        try:
            # translate nucleotide string sequence
            p_seq = self.translate_nucleotides(n_seq)

            # GC content
            gc_content = self.calculate_gc_content(n_seq)

            # protein analysis methods
            analysis = ProteinAnalysis(p_seq)
            amino_acid_percent = analysis.get_amino_acids_percent()
            molecular_weight = analysis.molecular_weight()
            instability_index = analysis.instability_index()
            aromaticity = analysis.aromaticity()

            # everything succeeded: record metadata and derived values together
            self.id.append(id)
            self.description.append(desc)
            self.nucleotide_sequence.append(n_seq)
            self.protein_sequence.append(p_seq)
            self.gc_content.append(gc_content)
            self.amino_acid_dict.append(amino_acid_percent)
            self.molecular_weight.append(molecular_weight)
            self.instability_index.append(instability_index)
            self.aromaticity.append(aromaticity)

        except Exception as e:
            print('-'*80)
            print(f"Exception in parsing uploaded virus sequence: {e}")
            traceback.print_exc(file=sys.stdout)
            print('-'*80)
Example #31
0
def biopython_proteinanalysis_seq(seq, scaling=False):
    """Return a flat dict of ``ProteinAnalysis`` descriptors for *seq*.

    The dict holds min/max/std of the flexibility profile, GRAVY,
    instability index, isoelectric point, both molar extinction
    coefficients, molecular weight, the three secondary-structure fractions,
    one ``percent:<aa>`` entry per amino acid and one ``prop_res_<name>``
    entry per group in the module-level ``property_residues`` mapping (the
    summed percentage of the residues in that group).

    ``scaling`` is accepted but not used by this function.
    """
    analysis = ProteinAnalysis(seq)
    flexibility = np.array(analysis.flexibility())

    features = {
        'flex:min': flexibility.min(),
        'flex:max': flexibility.max(),
        'flex:std': flexibility.std(),
        'gravy': analysis.gravy(),
        'instability_index': analysis.instability_index(),
        'isoelectric_point': analysis.isoelectric_point(),
    }

    reduced, cysteines = analysis.molar_extinction_coefficient()
    features['molar_extinction_coefficient_reduced'] = reduced
    features['molar_extinction_coefficient_cysteines'] = cysteines
    features['molecular_weight'] = analysis.molecular_weight()

    helix, turn, strand = analysis.secondary_structure_fraction()
    features['percent_helix_naive'] = helix
    features['percent_turn_naive'] = turn
    features['percent_strand_naive'] = strand

    # per-residue percentages, inserted in sorted residue order
    percent = analysis.get_amino_acids_percent()
    for residue in sorted(percent.keys()):
        features['percent:%s' % residue] = percent[residue]

    # summed percentage of every residue group
    for group, residues in list(property_residues.items()):
        features['prop_res_%s' % group] = sum(
            [percent.get(residue, 0) for residue in residues])
    return features
instidx = []
flex = []

for seq in sequences:
    X = ProteinAnalysis(str(seq))
    isoelectricPt.append(X.isoelectric_point())
    aromaticity.append(X.aromaticity())
    aminoPercent.append(X.get_amino_acids_percent())
    secstruct.append(X.secondary_structure_fraction())

    # These features throw Key & Value Errors due to non standard amino acids
    # (i.e. out of the 20 standard ones) e.g. X, U etc.
    # All of them are computed before anything is appended, so a failure
    # part-way through cannot leave the feature lists with different lengths.
    try:
        seq_features = (
            X.gravy(),
            X.molecular_weight(),
            X.instability_index(),
            X.flexibility(),
            X.protein_scale(ProtParamData.kd, 9, 0.4),
            X.protein_scale(ProtParamData.hw, 9, 0.4),
            X.protein_scale(ProtParamData.em, 9, 0.4),
        )
    except (KeyError, ValueError):
        # Placeholders: 0 for the scalar features, [0, 0] for the profiles.
        seq_features = (0, 0, 0, [0, 0], [0, 0], [0, 0], [0, 0])

    feature_lists = (gravy, molweight, instidx, flex,
                     hydrophob, hydrophil, surface)
    for feature_list, feature_value in zip(feature_lists, seq_features):
        feature_list.append(feature_value)

isoelectricPt_df = pd.DataFrame(isoelectricPt, columns=['isoelectricPt'])
print('done')
# Extract every PDB file from the input archive into a temporary directory and
# append one value per parsed record to each of the module-level feature lists.
with temppathlib.TemporaryDirectory() as tmpdir:
    # unzip the file with all the test PDBs
    with zipfile.ZipFile(args.infile, "r") as zip_:
        zip_.extractall(tmpdir.path)

        for test_pdb in tmpdir.path.glob("*.pdb"):
            for record in SeqIO.parse(test_pdb, "pdb-atom"):
                # every 'X' in the sequence is replaced by 'G' before analysis
                sequence = str(record.seq).replace('X', 'G')
                protein = ProteinAnalysis(str(sequence))
                p_len.append(len(sequence))
                mol_w.append(protein.molecular_weight())
                iso_p.append(protein.isoelectric_point())
                smell.append(protein.aromaticity())  # aromaticity
                taste_factor.append(protein.gravy())  # GRAVY score
                insta_ind.append(protein.instability_index())
                # net charge at pH 1, 7 and 14
                char_at_acid.append(protein.charge_at_pH(1))
                char_at_neutral.append(protein.charge_at_pH(7))
                char_at_base.append(protein.charge_at_pH(14))
                # elements 0, 1, 2 of the secondary-structure fraction tuple
                # (presumably helix, turn, sheet - confirm against Biopython docs)
                helter_skeler.append(protein.secondary_structure_fraction()[0])
                turnip.append(protein.secondary_structure_fraction()[1])
                garfield.append(protein.secondary_structure_fraction()[2])
                # store the count of residue x in every d_count list whose key
                # ends with that residue letter
                for x in amino_acids:
                    n = protein.count_amino_acids()[x]
                    for y in d_count.keys():
                        if y[-1] == x:
                            d_count[y].append(n)
                # same lookup for the residue percentages and d_perc
                # NOTE(review): the snippet is cut off here; the body of the
                # final `if` is missing from the file.
                for a in amino_acids:
                    m = protein.get_amino_acids_percent()[a]
                    for b in d_perc.keys():
                        if b[-1] == a:
Example #34
0
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio.SeqUtils import ProtParamData
from Bio import SeqIO
# Print the ProtParam descriptors of every record in the sample FASTA file.
with open('../../samples/pdbaa') as fh:
    for rec in SeqIO.parse(fh, 'fasta'):
        myprot = ProteinAnalysis(str(rec.seq))
        # argument-free descriptors, each computed and printed in this order
        descriptors = (
            myprot.count_amino_acids,
            myprot.get_amino_acids_percent,
            myprot.molecular_weight,
            myprot.aromaticity,
            myprot.instability_index,
            myprot.flexibility,
            myprot.isoelectric_point,
            myprot.secondary_structure_fraction,
        )
        for descriptor in descriptors:
            print(descriptor())
        # profile on the ProtParamData.kd scale, window 9, edge weight 0.4
        kd_profile = myprot.protein_scale(ProtParamData.kd, 9, .4)
        print(kd_profile)
#!/usr/bin/env python

import sys
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis

# Header row: one name per value written for each record below.  The seventh
# column holds the aromaticity, so it is labelled as such.
sys.stdout.write("ID\tMW\tIP\tgravy\tlength\tinstability\taromaticity\tSequence\n")

# One tab-separated row per FASTA record read from standard input.
for record in SeqIO.parse(sys.stdin, "fasta"):
    a = ProteinAnalysis(str(record.seq))

    properties = [
        record.id,
        a.molecular_weight(),
        a.isoelectric_point(),
        a.gravy(),
        a.length,
        a.instability_index(),
        a.aromaticity(),
        # always last column to make the output more readable
        a.sequence,
    ]
    sys.stdout.write('\t'.join(map(str, properties)) + "\n")

def main(databasePassword, schemaProteins, tableProteinInfo, tableStability):
    """Recompute N-terminal half life and instability index for all proteins.

    Reads ``UPAccession`` and ``Sequence`` from ``tableProteinInfo``, derives
    both values for every row, empties ``tableStability`` and inserts one
    ``(accession, half life, instability index)`` tuple per protein.

    A value of -1 marks a half life that could not be looked up (N-terminal
    residue without an entry, or an empty sequence) and an instability index
    that could not be calculated.

    Args:
        databasePassword: password passed to ``mysql.openConnection``.
        schemaProteins: schema passed to ``mysql.openConnection``.
        tableProteinInfo: table the accessions and sequences are read from.
        tableStability: table that is truncated and refilled.  The name is
            concatenated into the SQL statement, so it must come from a
            trusted source.
    """

    # Define N-terminus half life values (explanation http://en.wikipedia.org/wiki/N-end_rule and the ProtParam tool).
    halfLife = {'A' : 4.4, 'C' : 1.2, 'D' : 1.1, 'E' : 1.0, 'F' : 1.1, 'G' : 30.0, 'H' : 3.5, 'I' : 20.0, 'K' : 1.3,
                'L' : 5.5, 'M' : 30.0, 'N' : 1.4, 'P' : 20.0, 'Q' : 0.8, 'R' : 1.0, 'S' : 1.9, 'T' : 7.2,
                'V' : 100.0, 'W' : 2.8, 'Y' : 2.8}

    conn, cursor = mysql.openConnection(databasePassword, schemaProteins)
    try:
        # Extract all the sequences stored in the database.
        cursor = mysql.tableSELECT(cursor, 'UPAccession, Sequence', tableProteinInfo)
        results = cursor.fetchall()

        # Calculate the half life and instability index for each protein.
        stabilityTuples = []
        for row in results:
            accession, sequence = row[0], row[1]
            # -1 when the N-terminal is not an amino acid with an associated
            # half-life value (e.g. X, B, etc.); slicing keeps an empty
            # sequence from raising IndexError.
            protHalfLife = halfLife.get(sequence[:1], -1)
            analysedSeq = ProteinAnalysis(sequence)
            try:
                instabilityIndex = analysedSeq.instability_index()
            except Exception:
                # Best effort: flag the protein and carry on with the rest.
                instabilityIndex = -1
                print('\tContains invalid aa code:  %s' % (accession,))
            stabilityTuples.append((accession, protHalfLife, instabilityIndex))

        cursor.execute('TRUNCATE TABLE ' + tableStability)
        # With no proteins there is nothing to insert (and no first tuple to
        # size the placeholder list from).
        if stabilityTuples:
            values = '(' + ','.join(['%s'] * len(stabilityTuples[0])) + ')'
            mysql.tableINSERT(cursor, tableStability, values, stabilityTuples)
    finally:
        # Release the connection even when a query or calculation fails.
        mysql.closeConnection(conn, cursor)

#def instability_index(prot, sequence):
#
#    # A two-dimensional dictionary for calculating the instability index.
#    # Guruprasad K., Reddy B.V.B., Pandit M.W.    Protein Engineering 4:155-161(1990).
#    # It is based on dipeptide values, therefore the value for the dipeptide DG is DIWV['D']['G'].
#    DIWV = {'A': {'A': 1.0, 'C': 44.94, 'E': 1.0, 'D': -7.49,
#                  'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': -7.49,
#                  'K': 1.0, 'M': 1.0, 'L': 1.0, 'N': 1.0,
#                  'Q': 1.0, 'P': 20.26, 'S': 1.0, 'R': 1.0,
#                  'T': 1.0, 'W': 1.0, 'V': 1.0, 'Y': 1.0},
#            'C': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 20.26,
#                  'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': 33.60,
#                  'K': 1.0, 'M': 33.60, 'L': 20.26, 'N': 1.0,
#                  'Q': -6.54, 'P': 20.26, 'S': 1.0, 'R': 1.0,
#                  'T': 33.60, 'W': 24.68, 'V': -6.54, 'Y': 1.0},
#            'E': {'A': 1.0, 'C': 44.94, 'E': 33.60, 'D': 20.26,
#                  'G': 1.0, 'F': 1.0, 'I': 20.26, 'H': -6.54,
#                  'K': 1.0, 'M': 1.0, 'L': 1.0, 'N': 1.0,
#                  'Q': 20.26, 'P': 20.26, 'S': 20.26, 'R': 1.0,
#                  'T': 1.0, 'W': -14.03, 'V': 1.0, 'Y': 1.0},
#            'D': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 1.0,
#                  'G': 1.0, 'F': -6.54, 'I': 1.0, 'H': 1.0,
#                  'K': -7.49, 'M': 1.0, 'L': 1.0, 'N': 1.0,
#                  'Q': 1.0, 'P': 1.0, 'S': 20.26, 'R': -6.54,
#                  'T': -14.03, 'W': 1.0, 'V': 1.0, 'Y': 1.0},
#            'F': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 13.34,
#                  'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': 1.0,
#                  'K': -14.03, 'M': 1.0, 'L': 1.0, 'N': 1.0,
#                  'Q': 1.0, 'P': 20.26, 'S': 1.0, 'R': 1.0,
#                  'T': 1.0, 'W': 1.0, 'V': 1.0, 'Y': 33.601},
#            'I': {'A': 1.0, 'C': 1.0, 'E': 44.94, 'D': 1.0,
#                  'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': 13.34,
#                  'K': -7.49, 'M': 1.0, 'L': 20.26, 'N': 1.0,
#                  'Q': 1.0, 'P': -1.88, 'S': 1.0, 'R': 1.0,
#                  'T': 1.0, 'W': 1.0, 'V': -7.49, 'Y': 1.0},
#            'G': {'A': -7.49, 'C': 1.0, 'E': -6.54, 'D': 1.0,
#                  'G': 13.34, 'F': 1.0, 'I': -7.49, 'H': 1.0,
#                  'K': -7.49, 'M': 1.0, 'L': 1.0, 'N': -7.49,
#                  'Q': 1.0, 'P': 1.0, 'S': 1.0, 'R': 1.0,
#                  'T': -7.49, 'W': 13.34, 'V': 1.0, 'Y': -7.49},
#            'H': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 1.0,
#                  'G': -9.37, 'F': -9.37, 'I': 44.94, 'H': 1.0,
#                  'K': 24.68, 'M': 1.0, 'L': 1.0, 'N': 24.68,
#                  'Q': 1.0, 'P': -1.88, 'S': 1.0, 'R': 1.0,
#                  'T': -6.54, 'W': -1.88, 'V': 1.0, 'Y': 44.94},
#            'K': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 1.0,
#                  'G': -7.49, 'F': 1.0, 'I': -7.49, 'H': 1.0,
#                  'K': 1.0, 'M': 33.60, 'L': -7.49, 'N': 1.0,
#                  'Q': 24.64, 'P': -6.54, 'S': 1.0, 'R': 33.60,
#                  'T': 1.0, 'W': 1.0, 'V': -7.49, 'Y': 1.0},
#            'M': {'A': 13.34, 'C': 1.0, 'E': 1.0, 'D': 1.0,
#                  'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': 58.28,
#                  'K': 1.0, 'M': -1.88, 'L': 1.0, 'N': 1.0,
#                  'Q': -6.54, 'P': 44.94, 'S': 44.94, 'R': -6.54,
#                  'T': -1.88, 'W': 1.0, 'V': 1.0, 'Y': 24.68},
#            'L': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 1.0,
#                  'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': 1.0,
#                  'K': -7.49, 'M': 1.0, 'L': 1.0, 'N': 1.0,
#                  'Q': 33.60, 'P': 20.26, 'S': 1.0, 'R': 20.26,
#                  'T': 1.0, 'W': 24.68, 'V': 1.0, 'Y': 1.0},
#            'N': {'A': 1.0, 'C': -1.88, 'E': 1.0, 'D': 1.0,
#                  'G': -14.03, 'F': -14.03, 'I': 44.94, 'H': 1.0,
#                  'K': 24.68, 'M': 1.0, 'L': 1.0, 'N': 1.0,
#                  'Q': -6.54, 'P': -1.88, 'S': 1.0, 'R': 1.0,
#                  'T': -7.49, 'W': -9.37, 'V': 1.0, 'Y': 1.0},
#            'Q': {'A': 1.0, 'C': -6.54, 'E': 20.26, 'D': 20.26,
#                  'G': 1.0, 'F': -6.54, 'I': 1.0, 'H': 1.0,
#                  'K': 1.0, 'M': 1.0, 'L': 1.0, 'N': 1.0,
#                  'Q': 20.26, 'P': 20.26, 'S': 44.94, 'R': 1.0,
#                  'T': 1.0, 'W': 1.0, 'V': -6.54, 'Y': -6.54},
#            'P': {'A': 20.26, 'C': -6.54, 'E': 18.38, 'D': -6.54,
#                  'G': 1.0, 'F': 20.26, 'I': 1.0, 'H': 1.0,
#                  'K': 1.0, 'M': -6.54, 'L': 1.0, 'N': 1.0,
#                  'Q': 20.26, 'P': 20.26, 'S': 20.26, 'R': -6.54,
#                  'T': 1.0, 'W': -1.88, 'V': 20.26, 'Y': 1.0},
#            'S': {'A': 1.0, 'C': 33.60, 'E': 20.26, 'D': 1.0, 'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': 1.0,
#                  'K': 1.0, 'M': 1.0, 'L': 1.0, 'N': 1.0, 'Q': 20.26, 'P': 44.94, 'S': 20.26, 'R': 20.26,
#                  'T': 1.0, 'W': 1.0, 'V': 1.0, 'Y': 1.0},
#            'R': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 1.0, 'G': -7.49, 'F': 1.0, 'I': 1.0, 'H': 20.26,
#                  'K': 1.0, 'M': 1.0, 'L': 1.0, 'N': 13.34, 'Q': 20.26, 'P': 20.26, 'S': 44.94, 'R': 58.28,
#                  'T': 1.0, 'W': 58.28, 'V': 1.0, 'Y': -6.54},
#            'T': {'A': 1.0, 'C': 1.0, 'E': 20.26, 'D': 1.0, 'G': -7.49, 'F': 13.34, 'I': 1.0, 'H': 1.0,
#                  'K': 1.0, 'M': 1.0, 'L': 1.0, 'N': -14.03, 'Q': -6.54, 'P': 1.0, 'S': 1.0, 'R': 1.0,
#                  'T': 1.0, 'W': -14.03, 'V': 1.0, 'Y': 1.0},
#            'W': {'A': -14.03, 'C': 1.0, 'E': 1.0, 'D': 1.0, 'G': -9.37, 'F': 1.0, 'I': 1.0, 'H': 24.68,
#                  'K': 1.0, 'M': 24.68, 'L': 13.34, 'N': 13.34, 'Q': 1.0, 'P': 1.0, 'S': 1.0, 'R': 1.0,
#                  'T': -14.03, 'W': 1.0, 'V': -7.49, 'Y': 1.0},
#            'V': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': -14.03, 'G': -7.49, 'F': 1.0, 'I': 1.0, 'H': 1.0,
#                  'K': -1.88, 'M': 1.0, 'L': 1.0, 'N': 1.0, 'Q': 1.0, 'P': 20.26, 'S': 1.0, 'R': 1.0,
#                  'T': -7.49, 'W': 1.0, 'V': 1.0, 'Y': -6.54},
#            'Y': {'A': 24.68, 'C': 1.0, 'E': -6.54, 'D': 24.68, 'G': -7.49, 'F': 1.0, 'I': 1.0, 'H': 13.34,
#                  'K': 1.0, 'M': 44.94, 'L': 1.0, 'N': 1.0, 'Q': 1.0, 'P': 13.34, 'S': 1.0, 'R': -15.91,
#                  'T': -7.49, 'W': -9.37, 'V': 1.0, 'Y': 13.34},
#            }
#
#    score = 0.0
#    for i in range(len(sequence) - 1):
#        if DIWV.has_key(sequence[i]):
#            if DIWV[sequence[i]].has_key(sequence[i+1]):
#                score += DIWV[sequence[i]][sequence[i+1]]
#    return (10.0 / len(sequence)) * score