def find_composition(df_original):
    """Append per-sequence composition and physico-chemical features to a frame.

    For every entry of ``df_original['seq']`` a ``ProteinAnalysis`` is run on
    the whole sequence, on its first ``first_n`` characters and on its last
    ``last_n`` characters. ``codes``, ``first_n`` and ``last_n`` are read from
    the enclosing module.

    Args:
        df_original: DataFrame with a ``seq`` column. It is not modified.

    Returns:
        A new DataFrame: a copy of ``df_original`` with the feature columns
        concatenated on the right, row-aligned with the input.
    """
    df_copy = df_original.copy()

    column_names = []
    for ch in codes:
        column_names.extend(
            [ch + '_percent', ch + '_percent_first', ch + '_percent_last'])
    column_names.extend([
        'len', 'weight', 'gravy', 'flex_mean', 'flex_std',
        'ss_helix', 'ss_turn', 'ss_sheet', 'iep', 'aromaticity',
    ])

    # One dict per sequence, turned into a frame once at the end.
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call.
    rows = []
    for seq in tqdm(df_copy['seq']):
        sequence = str(seq)
        analysed = ProteinAnalysis(sequence)
        analysed_first = ProteinAnalysis(sequence[:first_n])
        analysed_last = ProteinAnalysis(sequence[-last_n:])

        row = {'len': analysed.length}
        row['ss_helix'], row['ss_turn'], row['ss_sheet'] = (
            analysed.secondary_structure_fraction())
        row['iep'] = analysed.isoelectric_point()

        # overall
        for aa, percent in analysed.get_amino_acids_percent().items():
            row[aa + '_percent'] = percent

        # first N
        for aa, percent in analysed_first.get_amino_acids_percent().items():
            row[aa + '_percent_first'] = percent

        # last N
        for aa, percent in analysed_last.get_amino_acids_percent().items():
            row[aa + '_percent_last'] = percent

        row['weight'] = analysed.molecular_weight()
        row['gravy'] = analysed.gravy()
        row['aromaticity'] = analysed.aromaticity()
        # Compute the flexibility profile once and reuse it for both stats.
        flexibility = analysed.flexibility()
        row['flex_mean'] = np.mean(flexibility)
        row['flex_std'] = np.std(flexibility)
        rows.append(row)

    # Reuse the input index so the column-wise concat below lines rows up
    # even when df_original does not have a default 0..n-1 index.
    features = pd.DataFrame(rows, index=df_copy.index)
    # Declared columns first; any residue column not derived from `codes`
    # is kept after them, as the append-based version did.
    extra_columns = [c for c in features.columns if c not in column_names]
    features = features.reindex(columns=column_names + extra_columns)

    return pd.concat([df_copy, features], axis=1)
Example #2
0
def seq_properties(file_path):
    """Summarise the sequence of a fasta file by amino acid class.

    Args:
        file_path [str]: Path of the fasta file, read with SeqIO.read

    Returns:
        total_percent_dict [dict]: Class label mapped to the value that
            count_prop returns for that class
    """
    fasta_record = SeqIO.read(file_path, 'fasta')
    composition = ProteinAnalysis(str(fasta_record.seq)).get_amino_acids_percent()

    # (label, residue group) pairs; insertion order defines the key order
    class_groups = (
        ("Acidic", acidic_aa),
        ("Basic", basic_aa),
        ("Hydroxilic", hydroxylic_aa),
        ("Amidic", amidic_aa),
        ("Aliphatic", aliphatic_aa),
        ("Aromatic", aromatic_aa),
    )
    return {
        label: count_prop(composition, residues)
        for label, residues in class_groups
    }
def calculate_residue_features(temp_dict, sequence):
    """Add composition-weighted scale totals and residue fractions to temp_dict.

    Each residue fraction of ``sequence`` is multiplied by that residue's
    entry in the module-level tables ``kd``, ``hw``, ``em`` and ``ja`` and the
    products are summed per table. ``temp_dict`` is updated in place with the
    four totals and then with the residue fractions themselves.

    Args:
        temp_dict: Dictionary that receives the new entries.
        sequence: Amino acid sequence handed to ProteinAnalysis.
    """
    fractions = ProteinAnalysis(sequence).get_amino_acids_percent()

    kd_total = 0
    hw_total = 0
    em_total = 0
    ja_total = 0
    for residue, fraction in fractions.items():
        kd_total += fraction * kd[residue]
        hw_total += fraction * hw[residue]
        em_total += fraction * em[residue]
        ja_total += fraction * ja[residue]

    scale_features = {
        "Hydrophobicity": kd_total,
        "Hydrophilicity": hw_total,
        "Surface Fractional Probability": em_total,
        "I2S Transfer Energy Scale": ja_total,
    }
    temp_dict.update(scale_features)
    temp_dict.update(fractions)
Example #4
0
def protParam(seq):
    """Print basic ProteinAnalysis statistics for a protein sequence.

    Args:
        seq: Amino acid sequence passed to ``ProteinAnalysis``.

    Returns:
        None. All results are written to standard output.
    """
    params = ProteinAnalysis(seq)
    mw = params.molecular_weight()
    c_aa = params.count_amino_acids()
    p_aa = params.get_amino_acids_percent()
    gravy = params.gravy()
    aromaticity = params.aromaticity()
    isoelectric_point = params.isoelectric_point()
    # Weighted count of W, Y and C residues.
    # NOTE(review): the label printed below says "Assuming reduced" while C
    # still contributes to the total - confirm the intended convention.
    ext_coeff = c_aa["W"] * 5690 + c_aa["Y"] * 1280 + c_aa["C"] * 120

    print("Amino acid count")
    pprint.pprint(c_aa)
    print("Amino acid percent")
    pprint.pprint(p_aa)
    print("Molecular weight")
    print("%f Da" % mw)
    print("Gravy")
    print(gravy)
    print("Isoelectric point")
    print(isoelectric_point)
    print("Aromaticity")
    print(aromaticity)
    print("Extinction coefficient: %d M-1cm-1 (Assuming reduced)" % ext_coeff)
    print("")
    def parse_pro_sequence(self, p_seq, id=None, desc=None):
        """Clean a protein sequence and record it with its derived properties.

        Characters not contained in ``proteins`` are dropped, the cleaned
        sequence is reverse translated with ``dna_codons`` and one entry is
        appended to each of the instance's parallel result lists.

        All values are computed before anything is appended, so a failure
        part-way through no longer leaves the lists with different lengths.
        Any exception is reported on stdout and swallowed, as before.

        Args:
            p_seq: Iterable of one-letter amino acid codes.
            id: Identifier stored alongside the sequence.
            desc: Description stored alongside the sequence.
        """
        try:
            p_seq = ''.join([pro for pro in p_seq if pro in proteins])

            # reverse translate protein to nucleotide sequence, using the
            # first codon listed in dna_codons for each amino acid
            first_codon = {}
            for codon, amino_acid in dna_codons.items():
                first_codon.setdefault(amino_acid, codon)
            n_seq = ''.join([first_codon[pro] for pro in p_seq])

            # GC content
            gc_content = self.calculate_gc_content(n_seq)

            # protein analysis methods
            analysis = ProteinAnalysis(p_seq)
            amino_acid_percent = analysis.get_amino_acids_percent()
            molecular_weight = analysis.molecular_weight()
            instability_index = analysis.instability_index()
            aromaticity = analysis.aromaticity()

            # everything succeeded: append fasta metadata and results together
            self.id.append(id)
            self.description.append(desc)
            self.nucleotide_sequence.append(n_seq)
            self.protein_sequence.append(p_seq)
            self.gc_content.append(gc_content)
            self.amino_acid_dict.append(amino_acid_percent)
            self.molecular_weight.append(molecular_weight)
            self.instability_index.append(instability_index)
            self.aromaticity.append(aromaticity)

        except Exception as e:
            print('-'*80)
            print(f"Exception in parsing uploaded virus sequence: {e}")
            traceback.print_exc(file=sys.stdout)
            print('-'*80)
def analyze(seq, name):
    """Print name, then the isoelectric point and residue fractions of seq."""
    protein = ProteinAnalysis(seq)
    print(name)
    report = (
        ("pI: ", protein.isoelectric_point),
        ("AA percent: ", protein.get_amino_acids_percent),
    )
    for heading, compute in report:
        print(heading)
        print(compute())
Example #7
0
def protein_analysis():
    """Show the sequence form and, once submitted, store the analysis results.

    Redirects to the account log-in action when ``session.username`` is not
    set. On an accepted form the cleaned, upper-cased sequence and the values
    computed by ``ProteinAnalysis`` are written to ``session`` and the request
    is redirected to ``protein_analysis_output``.

    Returns:
        dict: ``{'form': form}`` for rendering when nothing was submitted.
    """
    if session.username is None:
        redirect(URL(r=request, c='account', f='log_in'))
    from Bio.SeqUtils.ProtParam import ProteinAnalysis
    form = FORM(
        TABLE(
            TR(
                "Amino acid sequence:  ",
                TEXTAREA(_type="text",
                         _name="sequence",
                         requires=IS_NOT_EMPTY())),
            INPUT(_type="submit", _value="SUBMIT")))
    if form.accepts(request.vars, session):
        session['sequence'] = seqClean(form.vars.sequence.upper())
        analysis = ProteinAnalysis(session['sequence'])
        session['aa_count'] = analysis.count_amino_acids()
        session['percent_aa'] = analysis.get_amino_acids_percent()
        session['mw'] = analysis.molecular_weight()
        session['aromaticity'] = analysis.aromaticity()
        session['instability'] = analysis.instability_index()
        session['flexibility'] = analysis.flexibility()
        session['pI'] = analysis.isoelectric_point()
        session['sec_struct'] = analysis.secondary_structure_fraction()
        redirect(URL(r=request, f='protein_analysis_output'))
    return dict(form=form)
Example #8
0
def prot_param_features(seq):
    """Build a flat feature dictionary for one sequence record.

    Args:
        seq: Object whose ``.seq`` attribute holds the protein sequence.

    Returns:
        dict: ``frac_<residue>`` entries for every residue fraction, plus
        ``aromaticity``, ``isoelectric``, ``mol_weight`` and the three
        ``struc_*`` secondary structure fractions. ``mol_weight`` is omitted
        only if the weight cannot be computed even after substitution.
    """
    features = {}

    sequence = str(seq.seq)
    pa = ProteinAnalysis(sequence)

    # 1. Amino Acid Percent
    aa = pa.get_amino_acids_percent()
    features.update({"frac_{}".format(k): v for k, v in aa.items()})

    # 2. Aromaticity
    features["aromaticity"] = pa.aromaticity()

    # 3. Isoelectric Point
    features["isoelectric"] = pa.isoelectric_point()

    # 4. Molecular Weight
    try:
        features["mol_weight"] = pa.molecular_weight()
    except ValueError:
        # Retry with X -> G and B -> N. The substituted string used to be
        # built here and then discarded, so the feature silently went missing.
        replaced = sequence.replace('X', 'G').replace('B', 'N')
        try:
            features["mol_weight"] = ProteinAnalysis(replaced).molecular_weight()
        except ValueError:
            # Other unsupported letters remain: keep the earlier best-effort
            # behaviour and leave the key out rather than fail.
            pass

    # 5. Flexibility is intentionally not computed (previously commented out;
    #    it would need the same substitution plus U -> C).

    # 6. Secondary Structure Fraction
    struc = ["struc_helix", "struc_turn", "struc_sheet"]
    ss = pa.secondary_structure_fraction()
    features.update(dict(zip(struc, ss)))

    return features
 def transform(self, X):
     """Return a (len(X), len(VALID_AMINO_ACIDS)) array of residue fractions.

     Row ``i`` holds, for each entry of ``VALID_AMINO_ACIDS`` in order, the
     fraction reported by ProteinAnalysis for ``str(X[i])``; residues missing
     from the report are left at 0.0.
     """
     vec = np.zeros((len(X), len(VALID_AMINO_ACIDS)))
     for i in range(len(X)):
         # Compute the composition once per sequence rather than once per
         # (sequence, residue) pair.
         percent = ProteinAnalysis(str(X[i])).get_amino_acids_percent()
         for j, a in enumerate(VALID_AMINO_ACIDS):
             vec[i, j] = percent.get(a, 0.0)
     return vec
Example #10
0
def prot_feats_seq(seq):
    """Return one feature vector for seq.

    The vector is the concatenation of three L2-normalised parts, in this
    order: the fractions of the 20 residues listed below, the output of
    twomerFromSeq and the output of threemerFromSeq.
    """
    residue_order = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
                     'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
    text = str(seq)

    def _unit_row(values):
        # L2-normalise a 1-D collection and hand back the single row
        matrix = np.atleast_2d(np.array(values))
        return normalize(matrix, norm='l2', copy=True, axis=1,
                         return_norm=False)[0]

    analysis = ProteinAnalysis(text)
    # throws an error if 'X' in sequence. we skip such sequences
    analysis.molecular_weight()
    fractions = analysis.get_amino_acids_percent()

    features = []
    features.extend(_unit_row([fractions[code] for code in residue_order]))
    features.extend(_unit_row(twomerFromSeq(text)))
    features.extend(_unit_row(threemerFromSeq(text)))
    return np.array(features)
 def normal_charge_properties(self):
     """Print the residue fractions of the motif regions that pass all filters.

     Reads the table at ``self.train_fpi``, keeps rows with ``y == 0`` and, for
     each sequence, keeps its in-motif part when: more than 20 in-motif k-mers,
     NCPR strictly between -0.1 and 0.1, delta below 0.1, normalised score
     above 20 and FCR below 0.2. The kept parts are concatenated, analysed
     with ProteinAnalysis and the resulting fractions are printed.
     """
     df = pd.read_csv(self.train_fpi, sep='\t', index_col=0)
     df = df[df['y'] == 0]
     seqs = list(df['Sequence'])
     # Collect the pieces and join once instead of growing a string in the loop.
     selected = []
     for seq in seqs:
         ms = motif_seq.LcSeq(seq, self.k, self.lca, 'lca')
         in_seq, out_seq = ms.seq_in_motif()
         in_kmer, out_kmer = ms.overlapping_kmer_in_motif()
         if len(in_kmer) <= 20:
             continue
         ka = kappa.KappaKmers(out_kmer, out_seq)
         delta = ka.deltaForm()
         if -0.1 < ka.NCPR() < 0.1 and delta < 0.1:
             ns = norm_score.NormScore()
             score = ns.lc_norm_score([seq])[0]
             if score > 20 and ka.FCR() < 0.2:
                 selected.append(in_seq)
     analysed_seq = ProteinAnalysis(''.join(selected))
     aa_perc = analysed_seq.get_amino_acids_percent()
     print(aa_perc)
Example #12
0
def protParam(seq):
    """Print basic ProteinAnalysis statistics for a protein sequence.

    Args:
        seq: Amino acid sequence passed to ``ProteinAnalysis``.

    Returns:
        None. All results are written to standard output.
    """
    params = ProteinAnalysis(seq)
    mw = params.molecular_weight()
    c_aa = params.count_amino_acids()
    p_aa = params.get_amino_acids_percent()
    gravy = params.gravy()
    aromaticity = params.aromaticity()
    isoelectric_point = params.isoelectric_point()
    # Weighted count of W, Y and C residues.
    # NOTE(review): the label printed below says "Assuming reduced" while C
    # still contributes to the total - confirm the intended convention.
    ext_coeff = c_aa["W"] * 5690 + c_aa["Y"] * 1280 + c_aa["C"] * 120

    print("Amino acid count")
    pprint.pprint(c_aa)
    print("Amino acid percent")
    pprint.pprint(p_aa)
    print("Molecular weight")
    print("%f Da" % mw)
    print("Gravy")
    print(gravy)
    print("Isoelectric point")
    print(isoelectric_point)
    print("Aromaticity")
    print(aromaticity)
    print("Extinction coefficient: %d M-1cm-1 (Assuming reduced)" % ext_coeff)
    print("")
def calculate_physiochemical_features(temp_dict, sequence):
    """Add physico-chemical descriptors and residue fractions to temp_dict.

    ``temp_dict`` is updated in place, first with ten named descriptors
    computed by ProteinAnalysis and then with the residue fractions.

    Args:
        temp_dict: Dictionary that receives the new entries.
        sequence: Amino acid sequence handed to ProteinAnalysis.
    """
    protein = ProteinAnalysis(sequence)

    descriptors = {
        "Charge at pH7": protein.charge_at_pH(7),
        "Instability Index": protein.instability_index(),
        "Molecular Wt": protein.molecular_weight(),
        "Aromaticity": protein.aromaticity(),
    }

    # the two reported coefficients are collapsed to their midpoint
    low, high = protein.molar_extinction_coefficient()
    descriptors["Molar Extinction Coeff"] = (float(low) + float(high)) / 2

    # Grand Average Hydropathy - Higher value = More Hydrophobic
    descriptors["Gravy"] = protein.gravy()
    descriptors["Isoelectric pt"] = protein.isoelectric_point()

    helix, turn, sheet = protein.secondary_structure_fraction()
    descriptors["Helix Fraction"] = helix
    descriptors["Turn Fraction"] = turn
    descriptors["Sheet Fraction"] = sheet

    temp_dict.update(descriptors)

    # get_amino_acids_percent() already yields a dictionary, so it is merged
    # in a separate step
    temp_dict.update(protein.get_amino_acids_percent())
Example #14
0
def percentages_from_proteins(path):
    """Collect residue fractions and annotations for every feature in a file.

    The file at ``path`` is parsed as "genbank". One entry is produced per
    feature of each record.

    NOTE(review): every feature of a record is analysed on the whole
    ``record.seq`` (the feature itself is not consulted); confirm that this
    is intended rather than the feature's own translation.

    Args:
        path: Path of the file to read.

    Returns:
        tuple: ``(percents, names_list, sources_list, desc_list, taxo_list,
        keyw_list, taxid_list, sequence_list)`` where ``percents`` is an
        ``(n, 20)`` array of fractions in the fixed residue order below and
        ``sequence_list`` holds the per-entry fraction dictionaries.
    """
    names_list = []
    sequence_list = []
    sources_list = []
    desc_list = []
    taxo_list = []
    keyw_list = []
    taxid_list = []
    # The context manager closes the handle, which previously leaked.
    with open(path) as file:
        for record in parse(file, "genbank"):
            for cdsnum, _feature in enumerate(record.features):
                analysed_seq = ProteinAnalysis(str(record.seq))
                sequence_list.append(analysed_seq.get_amino_acids_percent())
                names_list.append(str(record.name) + "_CDS#" + str(cdsnum))
                sources_list.append(record.annotations['source'])
                keyw_list.append(record.annotations['keywords'])
                taxo_list.append(record.annotations['taxonomy'])
                desc_list.append(record.description)
                taxid_list.append(record.annotations["organism"])

    # List of dictionaries to the numpy array
    aas = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
           'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
    percents = np.zeros((len(sequence_list), 20))
    for i, percdict in enumerate(sequence_list):
        percents[i] = [percdict[aa] for aa in aas]
    return percents, names_list, sources_list, desc_list, taxo_list, keyw_list, taxid_list, sequence_list
 def get_aa_bins(self, seq):
     """Return the residue fractions of seq in the order 'SGEQAPDTNKRLHVYFIMCW'."""
     fractions = ProteinAnalysis(seq).get_amino_acids_percent()
     return [fractions[residue] for residue in 'SGEQAPDTNKRLHVYFIMCW']
def get_aa_percentage_vectors(X):
    """Return an (X.shape[0], 20) array of residue fractions, one row per item.

    Columns follow the module-level ``aas`` ordering.
    """
    n_rows = X.shape[0]
    res = np.zeros((n_rows, 20))
    for row, sequence in enumerate(X):
        fractions = pd.Series(ProteinAnalysis(sequence).get_amino_acids_percent())
        # index by `aas` so the column order is the same for every row
        res[row] = fractions[aas].values
    return res
Example #17
0
def protein_stats(proteome_file):
    """Print the pooled residue fractions of every record in a fasta file.

    All record sequences are concatenated, analysed once with ProteinAnalysis
    and printed as ``residue fraction`` lines sorted by residue.

    Args:
        proteome_file: Path or handle accepted by ``SeqIO.parse``.
    """
    # Join once instead of growing a string record by record.
    seq = "".join(
        str(record.seq) for record in SeqIO.parse(proteome_file, "fasta"))
    dic = ProteinAnalysis(seq).get_amino_acids_percent()

    for k, v in sorted(dic.items()):
        print(k, v)
    def amino_acid_composition_last50(self, record):
        '''
        Input:
            - record: a SeqRecord
        Output:
            - dictionary of residue fractions computed over the last 50
              characters of the record's sequence
        '''
        tail = str(record.seq)[-50:]
        return ProteinAnalysis(tail).get_amino_acids_percent()
    def amino_acid_composition(self, record):
        '''
        Input:
            - record: a SeqRecord
        Output:
            - dictionary of residue fractions computed over the record's
              whole sequence
        '''
        full_sequence = str(record.seq)
        return ProteinAnalysis(full_sequence).get_amino_acids_percent()
Example #20
0
 def _one_organism(self, fasta_in):
     """Return residue fractions of the lower-case characters in a fasta file.

     Args:
         fasta_in: File name, resolved relative to ``self.seg_dpi``.

     Returns:
         dict: Result of ``get_amino_acids_percent()`` for the concatenation
         of every lower-case character found in the file's sequences.
     """
     # Separate names for the path and the handle; the parameter used to be
     # rebound to both.
     fasta_path = os.path.join(self.seg_dpi, fasta_in)
     lowercase_parts = []
     with open(fasta_path, 'r') as handle:
         for record in SeqIO.parse(handle, 'fasta'):
             lowercase_parts.append(
                 ''.join(aa for aa in str(record.seq) if aa.islower()))
     analyzed_sequence = ProteinAnalysis(''.join(lowercase_parts))
     return analyzed_sequence.get_amino_acids_percent()
Example #21
0
def aa_frequency(outfile):
    """Print pooled residue counts and fractions for a fasta file.

    All record sequences in ``outfile`` are concatenated and analysed once;
    the counts are printed after the label ``all_seq_n`` and the fractions
    after ``all_seq_%``.

    Args:
        outfile: Path of the fasta file to read.
    """
    # The context manager closes the handle, which previously leaked.
    with open(outfile) as handle:
        all_seq = "".join(
            str(record.seq) for record in SeqIO.parse(handle, 'fasta'))
    y = ProteinAnalysis(all_seq)
    print("all_seq_n", y.count_amino_acids())
    print("all_seq_%", y.get_amino_acids_percent())
Example #22
0
def make_dataset(fasta):
    """Build a per-sequence feature table from a fasta file and pickle it.

    The class label is taken from the file name: 0 when it contains 'tar'
    (tardigrades), otherwise 1 when it contains 'pop' (poplars).

    Args:
        fasta: Path of the fasta file; must contain 'tar' or 'pop'.

    Raises:
        ValueError: If the file name contains neither 'tar' nor 'pop'.
    """
    # assign whether it's from tardigrades 'tar' or poplars 'pop'
    if 'tar' in fasta:
        target = 0
    elif 'pop' in fasta:
        target = 1
    else:
        # Fail before any parsing; previously this surfaced only at the end
        # as an unbound-variable error.
        raise ValueError(
            "cannot infer target from %r: expected 'tar' or 'pop' in the name"
            % (fasta,))

    # a list of dictionaries containing features for all sequences
    ls_features = []

    for record in SeqIO.parse(fasta, "fasta"):
        analysed_seq = ProteinAnalysis(str(record.seq))

        # the dictionary containing features for a single sequence
        dict_features = {
            'length': len(record.seq),
            'mol_weight': analysed_seq.molecular_weight(),
            # was molecular_weight() by copy-paste mistake
            'aromaticity': analysed_seq.aromaticity(),
            'stability': analysed_seq.instability_index(),
            'flexibility': analysed_seq.flexibility(),
            'isoelectric': analysed_seq.isoelectric_point(),
        }

        # compute secondary structure fraction
        frac = analysed_seq.secondary_structure_fraction()
        dict_features['helix'] = frac[0]
        dict_features['turn'] = frac[1]
        dict_features['sheet'] = frac[2]

        # merge the AAC composition of the entire sequence
        dict_features.update(analysed_seq.get_amino_acids_percent())
        ls_features.append(dict_features)

    df = pd.DataFrame(ls_features)
    df['target'] = target

    print(df)
    # NOTE(review): `name` is not defined in this function; it is presumably
    # a module-level variable - confirm.
    df.to_pickle(name + '_set.pkl')
Example #23
0
 def write_aa_comp(self):
     """Write a table of residue fractions for every training sequence.

     Reads the tab-separated file at ``self.train_fp`` and writes to
     ``self.comp_fp`` one row per input row with the columns ``Protein ID``,
     ``y`` and one fraction column per residue in ``self.aas``.

     Rows are processed in file order so that each fraction row matches its
     ``Protein ID`` and ``y``. Previously the fractions were emitted grouped
     by class (y == 0 first, then y == 1) next to labels kept in file order,
     which misaligned them for any file not already sorted by ``y``.
     """
     cols = ['Protein ID', 'y'] + [aa for aa in self.aas]
     df_train = pd.read_csv(self.train_fp, sep='\t', index_col=0)
     df_dict = {aa: [] for aa in self.aas}
     df_dict['y'] = list(df_train['y'])
     df_dict['Protein ID'] = list(df_train['Protein ID'])
     for seq in df_train['Sequence']:
         fractions = ProteinAnalysis(seq).get_amino_acids_percent()
         for aa in self.aas:
             df_dict[aa].append(fractions[aa])
     df = pd.DataFrame(df_dict, columns=cols)
     df.to_csv(self.comp_fp, sep='\t')
Example #24
0
def biopython_protein_analysis(inseq):
    """Utilize Biopython's ProteinAnalysis module to return general sequence properties of an amino acid string.

    For full definitions see: http://biopython.org/DIST/docs/api/Bio.SeqUtils.ProtParam.ProteinAnalysis-class.html

    Args:
        inseq: Amino acid sequence

    Returns:
        dict: Dictionary of sequence properties. Some definitions include:
        instability_index: Any value above 40 means the protein is unstable (has a short half life).
        secondary_structure_fraction: Percentage of protein in helix, turn or sheet

    TODO:
        Finish definitions of dictionary

    """

    inseq = ssbio.protein.sequence.utils.cast_to_str(inseq)

    analysed_seq = ProteinAnalysis(inseq)

    info_dict = {}
    info_dict['amino_acids_content-biop'] = analysed_seq.count_amino_acids()
    info_dict['amino_acids_percent-biop'] = analysed_seq.get_amino_acids_percent()
    info_dict['length-biop'] = analysed_seq.length
    info_dict['monoisotopic-biop'] = analysed_seq.monoisotopic
    info_dict['molecular_weight-biop'] = analysed_seq.molecular_weight()
    info_dict['aromaticity-biop'] = analysed_seq.aromaticity()
    info_dict['instability_index-biop'] = analysed_seq.instability_index()
    # TODO: What is flexibility?
    info_dict['flexibility-biop'] = analysed_seq.flexibility()
    info_dict['isoelectric_point-biop'] = analysed_seq.isoelectric_point()

    # grand average of hydrophobicity
    info_dict['gravy-biop'] = analysed_seq.gravy()

    # Separated secondary_structure_fraction into each definition.
    # Computed once and indexed, instead of three identical calls.
    fractions = analysed_seq.secondary_structure_fraction()
    info_dict['percent_helix_naive-biop'] = fractions[0]
    info_dict['percent_turn_naive-biop'] = fractions[1]
    info_dict['percent_strand_naive-biop'] = fractions[2]

    return info_dict
Example #25
0
 def plot_comp(self):
     """Draw overlaid bar charts of pooled residue fractions for both classes.

     Sequences from ``self.train_fp`` are pooled per class (``y == 1`` is
     labelled 'PDB', ``y == 0`` is labelled 'BC'), analysed with
     ProteinAnalysis and plotted per residue of ``self.aas`` on the current
     matplotlib axes.
     """
     df_train = pd.read_csv(self.train_fp, sep='\t', index_col=0)
     bc_seqs = list(df_train[df_train['y'] == 0]['Sequence'])
     pdb_seqs = list(df_train[df_train['y'] == 1]['Sequence'])
     residues = list(self.aas)
     positions = range(len(self.aas))

     pooled_pdb = ''.join(pdb_seqs)
     pooled_bc = ''.join(bc_seqs)
     pdb_fractions = ProteinAnalysis(pooled_pdb).get_amino_acids_percent()
     bc_fractions = ProteinAnalysis(pooled_bc).get_amino_acids_percent()

     pdb_bins = []
     bc_bins = []
     for residue in residues:
         pdb_bins.append(pdb_fractions[residue])
         bc_bins.append(bc_fractions[residue])

     series = (
         (pdb_bins, 'darkblue', 'PDB'),
         (bc_bins, 'orangered', 'BC'),
     )
     for heights, colour, legend_label in series:
         plt.bar(positions,
                 heights,
                 color=colour,
                 alpha=0.7,
                 label=legend_label,
                 align='center')
     plt.xticks(positions, residues)
     plt.xlim([-1, len(self.aas)])
     plt.legend()
     plt.xlabel('Amino Acids', size=12)
     plt.ylabel('Relative Fraction', size=12)
Example #26
0
    def solubility_rules(self):
        """
        Function to calculate some solubility rules based on recommendations of http://bioserv.rpbs.univ-paris-diderot.fr/services/SolyPep/

        Each violated rule adds one to ``self.solubility_rules_failed``.
        ``self.compute_peptide_charges()`` is called to refresh
        ``self.netCharge`` before rule 2 is evaluated.

        Output:
        solubility_rules_failed - incremented in place by the number of rules failed
        """
        # Rule N1. Number of hydrophobic or charged residues
        hydro_residues = ['V', 'I', 'L', 'M', 'F', 'W', 'C']
        charged_residues = ['H', 'R', 'K', 'D', 'E']

        count_hydro_charged = sum(
            1 for aa in self.sequence
            if aa in hydro_residues or aa in charged_residues)

        # This condition should change depending on the sequence length
        hydro_char_threshold = float(self.length_peptide) * 0.45
        if count_hydro_charged > hydro_char_threshold:
            self.solubility_rules_failed += 1

        # Rule N2. Computed peptide charge
        charge_threshold = 1
        self.compute_peptide_charges()
        # Compare against the named threshold instead of a repeated literal.
        if self.netCharge > charge_threshold:
            self.solubility_rules_failed += 1

        # Rule N3. Glycine or Proline content in the sequence
        count_gly_pro = sum(1 for aa in self.sequence if aa == "G" or aa == "P")
        # Check threshold
        if count_gly_pro > 1:
            self.solubility_rules_failed += 1

        # Rule N4. First or last amino acid charged
        if (self.sequence[0] in charged_residues
                or self.sequence[-1] in charged_residues):
            self.solubility_rules_failed += 1

        # Rule N5. Any amino acid that makes up 30% or more of the sequence.
        # NOTE(review): this comment used to say "more than 25%" while the
        # check has always been >= 0.3; the check is unchanged - confirm
        # which limit is intended.
        prot_parameters = ProteinAnalysis(self.sequence)
        aa_content = prot_parameters.get_amino_acids_percent()
        if any(fraction >= 0.3 for fraction in aa_content.values()):
            self.solubility_rules_failed += 1
Example #27
0
def aminoAcidComposition(path):
    """Return the mean residue percentage over all proteins in a pickle file.

    The file must contain a pickled mapping whose values are sequences. For
    every residue the per-protein fraction is scaled by 100 and averaged
    over the number of entries in the mapping.

    Args:
        path: Path of the pickle file.

    Returns:
        dict: Residue code mapped to its mean percentage.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    # The context manager closes the handle, which previously leaked.
    with open(path, "rb") as handle:
        dicoProt = pk.load(handle)

    n_proteins = float(len(dicoProt))
    hist = dict()
    for sequence in dicoProt.values():
        hist_temp = ProteinAnalysis(str(sequence)).get_amino_acids_percent()
        for key, fraction in hist_temp.items():
            hist[key] = hist.get(key, 0.0) + 100 * fraction / n_proteins

    return hist
Example #28
0
    def __init__(self, sequence):
        """Compute and store descriptors for one peptide sequence.

        Values come from three sources: Biopython's ProteinAnalysis,
        helper functions defined elsewhere in this module
        (``calculate_*``), and R expressions evaluated through ``r(...)``.

        Args:
            sequence: Amino acid sequence; it is also interpolated into an
                R assignment below.
        """
        self.sequence = sequence
        self.sequence_length = len(sequence)
        analysis = ProteinAnalysis(sequence)

        self.amino_acid_percents = analysis.get_amino_acids_percent()
        self.amino_acids_composition = calculate_amino_acids_composition(sequence)
        self.aromaticity = analysis.aromaticity()
        self.instability = analysis.instability_index()
        self.flexibility = calculate_flexibility(sequence)
        # Each entry pairs a display name with the scale table handed to
        # calculate_protein_scales.
        protein_scale_parameters = [{'name': 'Hydrophilicity', 'dictionary': hw},
                                    {'name': 'Surface accessibility', 'dictionary': em},
                                    {'name': 'Janin Interior to surface transfer energy scale', 'dictionary': ja},
                                    {'name': 'Bulkiness', 'dictionary': bulkiness},
                                    {'name': 'Polarity', 'dictionary': polarity},
                                    {'name': 'Buried residues', 'dictionary': buried_residues},
                                    {'name': 'Average area buried', 'dictionary': average_area_buried},
                                    {'name': 'Retention time', 'dictionary': retention_time}]
        self.protein_scales = calculate_protein_scales(analysis, protein_scale_parameters)
        self.isoelectric_point = analysis.isoelectric_point()
        self.secondary_structure_fraction = calculate_secondary_structure_fraction(analysis)
        self.molecular_weight = analysis.molecular_weight()
        # Despite the attribute name, this stores the single gravy() value.
        self.kyte_plot = analysis.gravy()
        self.pefing = calculate_pefing(sequence)

        # next parameters are calculated using R.Peptides
        r('require(Peptides)')
        # Defines an R variable named `sequence`; the text is inserted into
        # the R source without escaping.
        # NOTE(review): a sequence containing a double quote would break or
        # alter this R statement - confirm inputs are validated upstream.
        r('sequence = "{0}"'.format(sequence))
        self.aliphatic_index = r('aindex(sequence)')[0]
        self.boman_index = r('boman(sequence)')[0]
        self.charges = calculate_charges(sequence, 1.0, 14.0, 0.5, 'Lehninger')
        # NOTE(review): the hydrophobicity attribute is read from the R call
        # `seq(sequence)`; confirm this is the intended R function.
        self.hydrophobicity = r('seq(sequence)')[0]
        angles = [{'name': 'Alpha-helix', 'angle': -47},
                  {'name': '3-10-helix', 'angle': -26},
                  {'name': 'Pi-helix', 'angle': -80},
                  {'name': 'Omega', 'angle': 180},
                  {'name': 'Antiparallel beta-sheet', 'angle': 135},
                  {'name': 'Parallel beta-sheet', 'angle': 113}]
        # The extra angle is only considered when P and G together exceed
        # 0.3 of the composition.
        if self.amino_acid_percents['P'] + self.amino_acid_percents['G'] > 0.3:
            angles.append({'name': 'Polygly-polypro helix', 'angle': 153})
        self.hydrophobic_moments = calculate_hydrophobic_moments(sequence, angles)
        self.kidera_factors = calculate_kidera_factors(sequence)
        self.peptide_types = calculate_peptide_types(sequence, angles)
Example #29
0
    def calculate_properties_from_sequence(self):
        """
        Function to derive sequence-based properties of the peptide and store them on the instance

        Reads:
        self.sequence - amino acid sequence of the peptide

        Sets:
        avg_hydro - sum of the per-residue Eisenberg hydrophobicity values (not divided by the length)
        aromaticity, aa_percent, instability_index, isoelectric_point - ProtParam parameters
        """

        # Hydrophobicity -> Eisenberg scale
        eisenberg_scale = {
            'A': 0.620, 'R': -2.530, 'N': -0.780, 'D': -0.900,
            'C': 0.290, 'Q': -0.850, 'E': -0.740, 'G': 0.480,
            'H': -0.400, 'Y': 0.260, 'I': 1.380, 'L': 1.060,
            'K': -1.500, 'M': 0.640, 'F': 1.190, 'P': 0.120,
            'S': -0.180, 'T': -0.050, 'W': 0.810, 'V': 1.080,
        }
        per_residue = [eisenberg_scale[residue] for residue in self.sequence]
        self.avg_hydro = sum(per_residue)

        # ProtParam properties
        analysis = ProteinAnalysis(self.sequence)
        self.aromaticity = analysis.aromaticity()
        self.aa_percent = analysis.get_amino_acids_percent()
        self.instability_index = analysis.instability_index()
        self.isoelectric_point = analysis.isoelectric_point()
Example #30
0
 def amino_acid_analysis(self):
     """
     Adds fraction of amino acid residues (defined in RESIDUES) to data frame.

     Also adds a ``length`` column and, per row, ``IEP``; ``molecular_weight``
     is set only for sequences without X or B, and ``gravy`` only for
     sequences without U, X or B.
     """
     for res in RESIDUES:
         self.df["fraction_" + res] = (
             self.df["sequence"].str.count(res) / self.df["sequence"].str.len()
         )
     self.df["length"] = self.df["sequence"].str.len()
     for index, row in tqdm(self.df.iterrows(), total=self.df.shape[0]):
         seq = row["sequence"]
         seqanalysis = ProteinAnalysis(seq)
         # The unused get_amino_acids_percent() call was dropped; it cost a
         # full composition pass per row.
         self.df.loc[index, "IEP"] = seqanalysis.isoelectric_point()
         if "X" in seq or "B" in seq:
             continue
         self.df.loc[index, "molecular_weight"] = seqanalysis.molecular_weight()
         if "U" not in seq:
             self.df.loc[index, "gravy"] = seqanalysis.gravy()
def bio_feat(record):
    """Return a flat numeric feature vector for one sequence record.

    The sequence is cleaned first: X is removed, U becomes C, B becomes N and
    Z becomes Q. The vector holds, in order: length, molecular weight,
    isoelectric point, instability index, the secondary structure fractions,
    the residue counts and the residue fractions.

    Args:
        record: Object whose ``.seq`` attribute holds the protein sequence.

    Returns:
        numpy.ndarray: The concatenated feature values.
    """
    # One translate pass replaces the chained replace() calls; the round trip
    # through MutableSeq(...).toseq() is dropped because only the plain
    # string is ever used.
    cleanup = str.maketrans({"X": None, "U": "C", "B": "N", "Z": "Q"})
    clean_seq = str(record.seq).translate(cleanup)

    ### features
    seq_length = len(clean_seq)
    analysed_seq = ProteinAnalysis(clean_seq)
    molecular_weight = analysed_seq.molecular_weight()
    amino_percent = analysed_seq.get_amino_acids_percent().values()
    isoelectric_points = analysed_seq.isoelectric_point()
    count = analysed_seq.count_amino_acids().values()
    instability_index = analysed_seq.instability_index()
    secondary_structure_fraction = analysed_seq.secondary_structure_fraction()
    return np.array([seq_length, molecular_weight, isoelectric_points, instability_index] + list(secondary_structure_fraction) + list(count) + list(amino_percent))
Example #32
0
def is_string_aminoacids1(item):
    """Tell whether item is a string of one-letter amino acid codes.

    A ``str`` starting with ``'aminoacids1:'`` is accepted immediately. Any
    other ``str`` is accepted when it is not recognised by
    ``is_string_aminoacids3`` and the residue fractions reported by
    ProteinAnalysis add up to more than 0.99. Non-``str`` input is rejected.
    """
    if type(item) is not str:
        return False

    if item.startswith('aminoacids1:'):
        return True

    # imported lazily, only when the prefix check did not settle the answer
    from ..string_aminoacids3 import is_string_aminoacids3

    if is_string_aminoacids3(item):
        return False

    from Bio.SeqUtils.ProtParam import ProteinAnalysis
    fractions = ProteinAnalysis(item).get_amino_acids_percent()
    return sum(fractions.values()) > 0.99
Example #33
0
def protein_analysis():
    """Controller: collect a protein sequence and store its analysis.

    Visitors without ``session.username`` are redirected to the log-in
    action.  When the form is accepted, the cleaned, upper-cased sequence
    and the ``ProteinAnalysis`` results are written to ``session`` and the
    request is redirected to ``protein_analysis_output``.

    Returns:
        dict: ``{'form': form}`` for rendering while nothing was accepted.
    """
    if session.username is None:
        redirect(URL(r=request, f='../account/log_in'))

    from Bio.SeqUtils.ProtParam import ProteinAnalysis

    form = FORM(TABLE(
            TR("Amino acid sequence:  ",
               TEXTAREA(_type="text", _name="sequence",
                        requires=IS_NOT_EMPTY())),
            INPUT(_type="submit", _value="SUBMIT")))

    if form.accepts(request.vars, session):
        # Store the cleaned sequence first so it is kept in the session even
        # if the analysis below fails.
        sequence = seqClean(form.vars.sequence.upper())
        session['sequence'] = sequence

        analysis = ProteinAnalysis(sequence)
        session['aa_count'] = analysis.count_amino_acids()
        session['percent_aa'] = analysis.get_amino_acids_percent()
        session['mw'] = analysis.molecular_weight()
        session['aromaticity'] = analysis.aromaticity()
        session['instability'] = analysis.instability_index()
        session['flexibility'] = analysis.flexibility()
        session['pI'] = analysis.isoelectric_point()
        session['sec_struct'] = analysis.secondary_structure_fraction()
        redirect(URL(r=request, f='protein_analysis_output'))

    return dict(form=form)
Example #34
0
class amp:
    """Hold one peptide and the heuristics used to flag AMP-like stretches.

    The constructor takes a two-item sequence ``readed``: ``readed[0]`` is
    the peptide name and ``readed[1]`` its sequence in one-letter codes.
    """

    # Residues whose fractions are compared with the reference composition,
    # in the same order as ``baseCompose`` / ``ampCompose`` in detectAMP.
    _trackedResidues = ['C', 'R', 'W', 'H', 'K', 'D', 'E']

    def __init__(self, readed):
        self.seq = readed[1]
        self.length = len(readed[1])
        self.name = readed[0]

    def netcharge(self):
        """Store a simple net-charge estimate in ``self.net``.

        Every K, R or H residue counts +1 and every D or E residue counts -1.
        ``self.pos``, ``self.neg`` and ``self.posRe`` are (re)set as well;
        ``self.posRe`` is only initialised to 0 and never incremented.
        """
        self.pos = 'KRH'
        self.neg = 'DE'
        self.net = 0
        self.posRe = 0
        for residue in self.seq:
            if residue in self.pos:
                self.net += 1
            if residue in self.neg:
                self.net -= 1

    def hphobFract(self):
        """Store the hydrophobic residue count and fraction.

        ``self.hpn`` receives the number of residues found in ``ACFGILMPV``
        (as a float) and ``self.hpf`` that number divided by the peptide
        length.  An empty peptide gets a fraction of 0.0 instead of raising
        ``ZeroDivisionError``.
        """
        hph = 'ACFGILMPV'
        self.hpn = float(sum(1 for residue in self.seq if residue in hph))
        self.hpf = self.hpn / self.length if self.length else 0.0

    def analyzeAMP(self):
        """Compute and return the summary dict for the whole peptide.

        Returns:
            dict with keys ``'charge'``, ``'length'``, ``'hydrophobic'`` and
            ``'aminoacids'`` (the ``get_amino_acids_percent()`` mapping);
            the same dict is kept in ``self.data``.
        """
        from Bio.SeqUtils.ProtParam import ProteinAnalysis

        self.netcharge()
        self.hphobFract()
        self.pepParam = ProteinAnalysis(self.seq)

        self.data = {'charge': self.net,
                     'length': self.length,
                     'hydrophobic': self.hpf,
                     'aminoacids': self.pepParam.get_amino_acids_percent()}

        return self.data

    def _profileSubPep(self, baseCompose):
        """Score ``self.subPep`` (charge, hydrophobicity, composition ratios)."""
        from Bio.SeqUtils.ProtParam import ProteinAnalysis
        import numpy as np

        self.subPep.netcharge()
        self.subPep.hphobFract()
        self.pepParam = ProteinAnalysis(self.subPep.seq)
        self.aaPerc = self.pepParam.get_amino_acids_percent()
        self.subPepComp = [self.aaPerc[aminame]
                           for aminame in self._trackedResidues]
        self.subPepChanges = [observed / base for base, observed
                              in zip(baseCompose, self.subPepComp)]
        # The first five ratios (C, R, W, H, K) and the last two (D, E) are
        # averaged separately.
        self.upSubAvg = np.average(self.subPepChanges[:-2])
        self.downSubAvg = np.average(self.subPepChanges[-2:])

    def detectAMP(self):
        """Scan the peptide for AMP-like stretches.

        Thresholds are read from the ``[Parameters]`` section of
        ``config.ini`` in the current working directory.  Peptides longer
        than ``2 * baseWind`` are scanned with a sliding window of
        ``baseWind`` residues; shorter ones are judged as a whole.  Each
        accepted window adds 1 to the per-residue counters in
        ``self.result``; residues whose counter exceeds ``thresh`` are
        marked 1 in ``self.thrRes``.

        Side effects: sets ``self.result``, ``self.thrRes``, ``self.strRes``
        and ``self.matches`` plus the scoring attributes of the last
        evaluated window.

        Returns:
            ``'found peptide of length N'`` where N is the length of the
            first run of marked residues longer than ``minLen``, otherwise
            ``'nothing found'``.
        """
        import re
        import numpy as np
        try:
            from configparser import ConfigParser  # Python 3
        except ImportError:
            # Python 2 ships the module under its old name.
            from ConfigParser import SafeConfigParser as ConfigParser

        parser = ConfigParser()
        parser.read('config.ini')

        # Trailing values are the examples noted by the original author.
        lowNet = parser.getfloat('Parameters', 'lowNet')  # 0
        midNet = parser.getfloat('Parameters', 'midNet')  # 2
        highNet = parser.getfloat('Parameters', 'highNet')  # 6
        lowHpf = parser.getfloat('Parameters', 'lowHpf')  # 0.5
        highHpf = parser.getfloat('Parameters', 'highHpf')  # 0.9
        lowCompCoeff = parser.getfloat('Parameters', 'lowCompCoeff')  # 0.85
        highCompCoeff = parser.getfloat('Parameters', 'highCompCoeff')  # 1.5
        baseWind = parser.getint('Parameters', 'baseWind')  # 15
        thresh = parser.getint('Parameters', 'thresh')  # 6
        minLen = parser.getint('Parameters', 'minLen')  # 10

        #              C     R     W      H     K     D     E
        baseCompose = [0.01, 0.06, 0.005, 0.02, 0.06, 0.05, 0.07]
        ampCompose = [0.06, 0.09, 0.01, 0.02, 0.1, 0.02, 0.03]
        changes = [target / base
                   for base, target in zip(baseCompose, ampCompose)]
        upAvg = np.average(changes[:-2])

        self.result = [0 for _ in self.seq]
        if self.length > baseWind * 2:
            # "+ 1" so the final window, the one ending on the last residue,
            # is scored as well.
            for start in range(self.length - baseWind + 1):
                self.subPep = amp(['subPep',
                                   self.seq[start:start + baseWind]])
                self._profileSubPep(baseCompose)

                chargeOrHydrophobic = (
                    (lowNet < self.subPep.net < highNet
                     and self.subPep.hpf > lowHpf)
                    or midNet < self.subPep.net
                    or self.subPep.hpf > highHpf)
                if ((chargeOrHydrophobic
                     and self.upSubAvg > lowCompCoeff * upAvg)
                        or self.upSubAvg > highCompCoeff * upAvg):
                    for position in range(start, start + baseWind):
                        self.result[position] += 1
        else:
            self.subPep = self
            self._profileSubPep(baseCompose)

            # NOTE(review): this test differs from the windowed one above --
            # hpf is compared with lowCompCoeff*upAvg (not highHpf) and the
            # composition test is a mandatory "and" with highCompCoeff.
            # Kept exactly as written; confirm whether that is intended.
            chargeOrHydrophobic = (
                (lowNet < self.subPep.net < highNet
                 and self.subPep.hpf > lowHpf)
                or midNet < self.subPep.net
                or self.subPep.hpf > lowCompCoeff * upAvg)
            if chargeOrHydrophobic and self.upSubAvg > highCompCoeff * upAvg:
                self.result = [count + 1 for count in self.result]

        self.thrRes = [1 if count > thresh else 0 for count in self.result]
        self.strRes = ''.join([str(flag) for flag in self.thrRes])
        # Split on runs of zeros.  '0+' is required: a pattern that can match
        # the empty string ('0*') splits between every character on
        # Python >= 3.7, which would destroy the runs of ones.
        self.matches = [match for match in re.split('0+', self.strRes)
                        if len(match) > minLen]

        if len(self.matches) > 0:
            return 'found peptide of length ' + str(len(self.matches[0]))
        return 'nothing found'

    def plotPred(self):
        """Plot ``self.thrRes`` and save the figure to ``testy.pdf``.

        ``detectAMP`` is run first when no (non-empty) ``self.result`` is
        available yet.
        """
        import matplotlib.pylab as pl

        if not getattr(self, 'result', None):
            self.detectAMP()

        pl.plot(self.thrRes, '.-')
        pl.savefig('testy.pdf')
Example #35
0
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio.SeqUtils import ProtParamData
import sys
import json

# The request is a JSON object passed as the first command-line argument.
# "Sequence" holds the protein sequence and "Options" names the results to
# compute; the results are printed to stdout as one JSON object.
inp = json.loads(sys.argv[1])

seq = inp["Sequence"]
options = inp["Options"]

X = ProteinAnalysis(seq)

data = dict()

if "MW" in options:
    data["MW"] = X.molecular_weight()

if "EC280" in options:
    aa_count = X.count_amino_acids()
    # Tyr and Trp always contribute; Cys is only counted when the caller
    # states that disulfide bonds are present.
    data["EC280"] = 1490 * aa_count["Y"] + 5500 * aa_count["W"]
    if "hasDisulfide" in options:
        data["EC280"] += 62.5 * aa_count["C"]

if "PI" in options:
    data["PI"] = X.isoelectric_point()

if "AACont" in options:
    ratios = X.get_amino_acids_percent()
    # Scale the values returned by get_amino_acids_percent() by 100.
    data["AACont"] = {aa: ratios[aa] * 100. for aa in ratios}

# print() with a single argument behaves the same on Python 2 and Python 3.
print(json.dumps(data))
        # Header branch: the enclosing loop and the matching `if` start above
        # this excerpt.  Register the protein, its segments and an empty
        # sequence that the `else` branch below extends.
        protein_name = get_protein_name(line)
        protein_names_and_segments[protein_name] = get_segments(line)
        protein_names_and_sequences[protein_name] = ''
    else:
        # Sequence line: append it to the most recently registered protein.
        sequence = protein_names_and_sequences.get(protein_name)
        # `'\n' and '\r' and '\r\n'` evaluates to '\r\n', so this strips any
        # leading/trailing CR and LF characters.
        sequence += line.strip('\n' and '\r' and '\r\n')
        protein_names_and_sequences[protein_name] = sequence

for key in protein_names_and_segments.keys():

    for segment in protein_names_and_segments.get(key):

        # Presumably segment holds 1-based inclusive (start, end) positions,
        # hence the "- 1" on the start only -- confirm against get_segments.
        segment_sequence = protein_names_and_sequences.get(key)[segment[0]
                                                                - 1:segment[1]]
        # `x` is not reset anywhere in the visible code (its initialisation
        # is above this excerpt), so it accumulates every segment of every
        # protein; `z` is overwritten on each pass and ends up describing the
        # full concatenation.
        x += segment_sequence
        y = ProteinAnalysis(str(x))
        z = y.get_amino_acids_percent()


# Progress message for the command line (Python 2 print statement).
print 'parsing ' + FILE_INPUT + '\n'

# Build the output file as CSV: one row per (amino acid, value) pair of `z`.
# NOTE(review): binary mode 'wb' is the Python 2 convention for csv.writer.
with open('percent_AA_per_seg_OUTPUT.csv', 'wb') as f:
    w = csv.writer(f)
    w.writerows(z.items())

# Open the output file.
# NOTE(review): hard-coded absolute path; the CSV above is written relative
# to the current working directory -- confirm both point at the same file.
file = '/Users/simonkeng/senior-research-project/percent_AA_per_seg_OUTPUT.csv'
open_file(file)
Example #37
0
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio.SeqUtils import ProtParamData
from Bio import SeqIO
# Parse every FASTA record in ../../samples/pdbaa and print the results of
# the ProteinAnalysis methods below for each sequence, one value per line.
with open('../../samples/pdbaa') as fh:
   for rec in SeqIO.parse(fh,'fasta'):
       # ProteinAnalysis is given the sequence as a plain string.
       myprot = ProteinAnalysis(str(rec.seq))
       print(myprot.count_amino_acids())
       print(myprot.get_amino_acids_percent())
       print(myprot.molecular_weight())
       print(myprot.aromaticity())
       print(myprot.instability_index())
       print(myprot.flexibility())
       print(myprot.isoelectric_point())
       print(myprot.secondary_structure_fraction())
       # Profile over the ProtParamData.kd scale; 9 and .4 are passed as the
       # second and third positional arguments (window size and edge weight
       # per the Biopython docs -- confirm for the installed version).
       print(myprot.protein_scale(ProtParamData.kd, 9, .4))