Esempio n. 1
0
def biopython_protein_analysis(inseq):
    """Utilize Biopython's ProteinAnalysis module to return general sequence properties of an amino acid string.

    For full definitions see: http://biopython.org/DIST/docs/api/Bio.SeqUtils.ProtParam.ProteinAnalysis-class.html

    Args:
        inseq: Amino acid sequence. It is passed through
            ``ssbio.protein.sequence.utils.cast_to_str`` before being analysed.

    Returns:
        dict: Dictionary of sequence properties; every key carries a ``-biop`` suffix.
        Some definitions include:
        instability_index: Any value above 40 means the protein is unstable (has a short half life).
        percent_helix_naive, percent_turn_naive, percent_strand_naive: elements 0, 1 and 2
            of ``secondary_structure_fraction()``.

    TODO:
        Finish definitions of dictionary

    """

    inseq = ssbio.protein.sequence.utils.cast_to_str(inseq)

    analysed_seq = ProteinAnalysis(inseq)

    # The secondary structure fractions are computed once and then split by
    # position, instead of re-running the calculation for every element.
    ss_fractions = analysed_seq.secondary_structure_fraction()

    info_dict = {}
    info_dict['amino_acids_content-biop'] = analysed_seq.count_amino_acids()
    info_dict['amino_acids_percent-biop'] = analysed_seq.get_amino_acids_percent()
    info_dict['length-biop'] = analysed_seq.length
    info_dict['monoisotopic-biop'] = analysed_seq.monoisotopic
    info_dict['molecular_weight-biop'] = analysed_seq.molecular_weight()
    info_dict['aromaticity-biop'] = analysed_seq.aromaticity()
    info_dict['instability_index-biop'] = analysed_seq.instability_index()
    # TODO: What is flexibility?
    info_dict['flexibility-biop'] = analysed_seq.flexibility()
    info_dict['isoelectric_point-biop'] = analysed_seq.isoelectric_point()

    # grand average of hydrophobicity
    info_dict['gravy-biop'] = analysed_seq.gravy()

    # secondary_structure_fraction is stored as one key per element rather
    # than as a single tuple.
    info_dict['percent_helix_naive-biop'] = ss_fractions[0]
    info_dict['percent_turn_naive-biop'] = ss_fractions[1]
    info_dict['percent_strand_naive-biop'] = ss_fractions[2]

    return info_dict
def phyChemProps(seq):
    """Return ten ProteinAnalysis properties of ``seq`` as a flat list.

    Args:
        seq: amino acid sequence handed to ``ProteinAnalysis``.

    Returns:
        list: ``[aromaticity, ssf[0], ssf[1], ssf[2], gravy, instability_index,
        isoelectric_point, molecular_weight, mec[0], mec[1]]`` where ``ssf`` is
        ``secondary_structure_fraction()`` and ``mec`` is
        ``molar_extinction_coefficient()``.
    """
    X = ProteinAnalysis(seq)

    # Multi-valued results are computed once and indexed, rather than being
    # recomputed for every element that is read from them.
    fractions = X.secondary_structure_fraction()
    extinction = X.molar_extinction_coefficient()

    return [
        X.aromaticity(),
        fractions[0],
        fractions[1],
        fractions[2],
        X.gravy(),
        X.instability_index(),
        X.isoelectric_point(),
        X.molecular_weight(),
        extinction[0],
        extinction[1],
    ]
def calculate_physiochemical_features(temp_dict, sequence):
    """Add physicochemical descriptors of ``sequence`` to ``temp_dict`` in place.

    Args:
        temp_dict: dict that receives the new entries (mutated, not copied).
        sequence: amino acid sequence handed to ``ProteinAnalysis``.

    Returns:
        None. ``temp_dict`` gains the ten named descriptors below followed by
        every entry of ``get_amino_acids_percent()``.
    """
    protein = ProteinAnalysis(sequence)

    # The two molar extinction values are collapsed into their mean.
    extinction_low, extinction_high = protein.molar_extinction_coefficient()
    mean_extinction = (float(extinction_low) + float(extinction_high)) / 2

    helix, turn, sheet = protein.secondary_structure_fraction()

    temp_dict.update({
        "Charge at pH7": protein.charge_at_pH(7),
        "Instability Index": protein.instability_index(),
        "Molecular Wt": protein.molecular_weight(),
        "Aromaticity": protein.aromaticity(),
        "Molar Extinction Coeff": mean_extinction,
        # Grand average of hydropathy: higher value = more hydrophobic.
        "Gravy": protein.gravy(),
        "Isoelectric pt": protein.isoelectric_point(),
        "Helix Fraction": helix,
        "Turn Fraction": turn,
        "Sheet Fraction": sheet,
    })

    # get_amino_acids_percent() already returns a dictionary, so it is merged
    # in as-is in a second step.
    temp_dict.update(protein.get_amino_acids_percent())
Esempio n. 4
0
    def protAnalysis(self, content):
        """Compute ProteinAnalysis properties for ``content`` in three flattened views.

        ``content`` is first normalised with ``Parsers.normalizeSequence`` using
        ``self.sourceType``. ``self.flatten`` is defined elsewhere; it is used
        here as ``flatten(prefix_or_labels, values) -> dict``.

        Returns:
            tuple: ``(result, flattened, flattenedFlexDic)`` where
            ``result`` holds the scalar properties plus the flattened secondary
            structure fractions, ``flattened`` adds the flexibility values
            flattened under ``'proteinFlex'``, and ``flattenedFlexDic`` adds the
            flexibility values flattened against the list of residues.
            In each merge the entries of ``result`` win on key collisions.
        """
        content = Parsers.normalizeSequence(content, self.sourceType)
        protein = ProteinAnalysis(content)

        scalars = {
            'proteinMWeight': protein.molecular_weight(),
            'proteinAroma': protein.aromaticity(),
            'proteinInstab': protein.instability_index(),
            'proteinIsoelec': protein.isoelectric_point(),
            'proteinGravy': protein.gravy(),
        }

        # Merge the scalar results with the flattened secondary structure.
        structure = self.flatten('proteinSecstruc',
                                 protein.secondary_structure_fraction())
        result = {**structure, **scalars}

        # Two flexibility views: one keyed by prefix, one keyed by residue.
        flexibility = protein.flexibility()
        by_prefix = self.flatten('proteinFlex', flexibility)
        by_residue = self.flatten(list(content), flexibility)

        return result, {**by_prefix, **result}, {**by_residue, **result}
Esempio n. 5
0
def prot_param_features(seq):
    """Build a feature dict from ProteinAnalysis for a sequence record.

    Args:
        seq: object exposing the sequence as ``seq.seq`` (converted with ``str``).

    Returns:
        dict: ``frac_<aa>`` for every entry of ``get_amino_acids_percent()``,
        ``aromaticity``, ``isoelectric``, ``mol_weight`` (when computable) and
        ``struc_helix`` / ``struc_turn`` / ``struc_sheet``.
    """
    features = {}

    raw_sequence = str(seq.seq)
    pa = ProteinAnalysis(raw_sequence)

    # 1. Amino Acid Percent
    aa = pa.get_amino_acids_percent()
    features.update({"frac_{}".format(k): v for k, v in aa.items()})

    # 2. Aromaticity
    features["aromaticity"] = pa.aromaticity()

    # 3. Isoelectric Point
    features["isoelectric"] = pa.isoelectric_point()

    # 4. Molecular Weight
    try:
        features["mol_weight"] = pa.molecular_weight()
    except ValueError:
        # The substituted sequence used to be built and then thrown away,
        # which left "mol_weight" silently missing. Use it for a retry.
        replaced = raw_sequence.replace('X', 'G').replace('B', 'N')
        try:
            features["mol_weight"] = ProteinAnalysis(replaced).molecular_weight()
        except ValueError:
            # Other unsupported letters remain: keep the previous behaviour
            # of returning the dict without a "mol_weight" entry.
            pass

    # 5. Flexibility (disabled)
    # try:
    #     features["flexibility"] = np.mean(pa.flexibility())
    # except KeyError:
    #     replaced = str(seq.seq).replace('X', 'G').replace('B', 'N').replace('U','C')
    #     features["flexibility"] = np.mean(ProteinAnalysis(replaced).flexibility())

    # 6. Secondary Structure Fraction
    struc = ["struc_helix", "struc_turn", "struc_sheet"]
    ss = pa.secondary_structure_fraction()
    features.update(dict(zip(struc, ss)))

    return features
Esempio n. 6
0
    def _protein_parameters(self, sequence):
        """Collect physicochemical properties of an amino acid sequence.

        Args:
            sequence: str, amino acid sequence.

        Returns:
            property_arr: np array with, in order: molecular weight,
            aromaticity, instability index, gravy, isoelectric point, the first
            three secondary structure fractions, the two molar extinction
            coefficients and ``self._net_charge(sequence)``.

        """
        analysis = ProteinAnalysis(sequence)

        values = [
            analysis.molecular_weight(),
            analysis.aromaticity(),
            analysis.instability_index(),
            analysis.gravy(),
            analysis.isoelectric_point(),
        ]

        fractions = analysis.secondary_structure_fraction()
        values.extend(fractions[position] for position in range(3))

        extinction = analysis.molar_extinction_coefficient()
        values.extend((extinction[0], extinction[1]))

        values.append(self._net_charge(sequence))

        return np.array(values)
Esempio n. 7
0
 def get_secondary_structure(self):
     """Return the first three secondary structure fractions of
     ``self.sequence`` as strings formatted with two decimals.

     The result is the tuple ``(helix, turn, sheet)``.
     """
     fractions = ProteinAnalysis(self.sequence).secondary_structure_fraction()
     helix, turn, sheet = (
         "{0:0.2f}".format(fractions[position]) for position in range(3)
     )
     return helix, turn, sheet
Esempio n. 8
0
    def get_sec_struct(self):
        """
        Calculates the fraction of amino acids which tend to be in helix, turn or sheet (3 values) from biopython

        The fractions are computed once for ``self.ProteinSequence`` and then
        split by position.

        :return: dictionary with the 3 values under the keys
                 'SecStruct_helix', 'SecStruct_turn' and 'SecStruct_sheet'
        """
        analysed_seq = ProteinAnalysis(self.ProteinSequence)
        # One call instead of one per key: the result does not change between calls.
        fractions = analysed_seq.secondary_structure_fraction()

        return {
            'SecStruct_helix': fractions[0],
            'SecStruct_turn': fractions[1],
            'SecStruct_sheet': fractions[2],
        }
def get_protein_features(seq):
    """Return parallel lists of feature names and values for a protein sequence.

    The sequence is first passed through ``correct``. ``molecular_weight``,
    ``instability_index`` and ``hydrophobicity`` are helpers defined outside
    this function and are each called with the corrected sequence.

    Returns:
        tuple: ``(names, values)``; ``values`` follows the order of ``names``
        and ends with the elements of ``secondary_structure_fraction()``.
    """
    seq = correct(seq)
    analysis = ProteinAnalysis(seq)
    counts = analysis.count_amino_acids()

    # Tyr/Trp contribution shared by both extinction coefficients; the second
    # one additionally adds 125 per cysteine counted.
    tyr_trp_term = counts['Y'] * 1490 + counts['W'] * 5500

    names = [
        'length', 'weight', 'pI', 'neg_charged_residues',
        'pos_charged_residues', 'extinction_coeff1', 'extinction_coeff2',
        'instability_index', 'gravy', 'helix', 'turn', 'sheet'
    ]
    values = [
        len(seq),
        molecular_weight(seq),
        analysis.isoelectric_point(),
        counts['D'] + counts['E'],
        counts['K'] + counts['R'],
        tyr_trp_term,
        tyr_trp_term + counts['C'] * 125,
        instability_index(seq),
        hydrophobicity(seq),
        *analysis.secondary_structure_fraction(),
    ]

    return names, values
Esempio n. 10
0
def protein_analysis():
    """Show the sequence input form and, on submit, store analysis results in the session.

    Relies on framework globals (``session``, ``request``, ``redirect``,
    ``URL``, ``FORM`` and the HTML helpers) and on ``seqClean`` being defined
    elsewhere. NOTE(review): these look like web2py controller globals.

    Returns:
        dict: ``{'form': form}`` when the form was not accepted in this call.
    """
    # Visitors without a username are sent to the account log-in action.
    if session.username is None:
        redirect(URL(r=request, c='account', f='log_in'))
    from Bio.SeqUtils.ProtParam import ProteinAnalysis
    form = FORM(
        TABLE(
            TR(
                "Amino acid sequence:  ",
                TEXTAREA(_type="text",
                         _name="sequence",
                         requires=IS_NOT_EMPTY())),
            INPUT(_type="submit", _value="SUBMIT")))
    if form.accepts(request.vars, session):
        # The submitted text is upper-cased and cleaned before analysis.
        session['sequence'] = seqClean(form.vars.sequence.upper())
        X = ProteinAnalysis(session['sequence'])
        session['aa_count'] = X.count_amino_acids()
        session['percent_aa'] = X.get_amino_acids_percent()
        session['mw'] = X.molecular_weight()
        session['aromaticity'] = X.aromaticity()
        session['instability'] = X.instability_index()
        session['flexibility'] = X.flexibility()
        session['pI'] = X.isoelectric_point()
        session['sec_struct'] = X.secondary_structure_fraction()
        # The results are rendered by a separate action.
        redirect(URL(r=request, f='protein_analysis_output'))
    return dict(form=form)
def get_protein_analysis(aa):
    """Return ProteinAnalysis descriptors of ``aa`` as a flat list.

    Order: molecular weight, aromaticity, instability index, isoelectric
    point, gravy, followed by the elements of ``secondary_structure_fraction()``.
    """
    analysis = ProteinAnalysis(aa)
    descriptors = [
        analysis.molecular_weight(),
        analysis.aromaticity(),
        analysis.instability_index(),
        analysis.isoelectric_point(),
        analysis.gravy(),
    ]
    descriptors.extend(analysis.secondary_structure_fraction())
    return descriptors
    def secondary_structure(self, record):
        '''
        Input:
            - record: a SeqRecord (only ``record.seq`` is read)
        Output:
            - the value of ``ProteinAnalysis.secondary_structure_fraction()``
              for the record's sequence
        '''
        sequence = str(record.seq)
        return ProteinAnalysis(sequence).secondary_structure_fraction()
def get_biopython_features(X):
    """Build a ``(len(X), 6)`` feature matrix from ProteinAnalysis.

    Args:
        X: array-like with a ``shape`` attribute whose iteration yields one
            sequence per row.

    Returns:
        np.ndarray: per row, molecular weight, instability index, isoelectric
        point and the secondary structure fractions.
    """
    n_rows = X.shape[0]
    features = np.zeros((n_rows, 6))
    for row, sequence in enumerate(X):
        protein = ProteinAnalysis(sequence)
        features[row] = np.array([
            protein.molecular_weight(),
            protein.instability_index(),
            protein.isoelectric_point(),
            *protein.secondary_structure_fraction(),
        ])

    return features
Esempio n. 14
0
def processSeq(seq):
    ''' Build the feature vector of a protein sequence.

        Features, in output order:
        - Sequence Length
        - Aromaticity
        - Isoelectric Point
        - Molecular Weight (global, first 50, last 50)
        - Grand Average Hydropathy (Gravy)
        - Flexibility (global, first 50, last 50)
        - Amino Acid Composition (global, first 50, last 50)
        - Secondary Structure Fraction

        For sequences of 50 residues or fewer the "first 50" / "last 50"
        values are the global ones. ``calculateGravy``,
        ``calculateMolecularWeight``, ``getAAPercent`` and
        ``calculateFlexibility`` are helpers defined outside this function,
        each called as ``helper(seq, start, end)``.
    '''
    n_residues = len(seq)
    protein = ProteinAnalysis(seq)

    isoelectric = protein.isoelectric_point()
    gravy = calculateGravy(seq, 0, n_residues)
    aroma = protein.aromaticity()
    ss_frac = protein.secondary_structure_fraction()

    weight_all = calculateMolecularWeight(seq, 0, n_residues)
    composition_all = getAAPercent(seq, 0, n_residues)
    flex_all = calculateFlexibility(seq, 0, n_residues)

    # Short sequences reuse the global values for both ends.
    composition_head = composition_tail = composition_all
    weight_head = weight_tail = weight_all
    flex_head = flex_tail = flex_all

    if n_residues > 50:
        tail_start = n_residues - 50
        composition_head = getAAPercent(seq, 0, 50)
        composition_tail = getAAPercent(seq, tail_start, n_residues)
        weight_head = calculateMolecularWeight(seq, 0, 50)
        weight_tail = calculateMolecularWeight(seq, tail_start, n_residues)
        flex_head = calculateFlexibility(seq, 0, 50)
        flex_tail = calculateFlexibility(seq, tail_start, n_residues)

    scalars = [
        n_residues, aroma, isoelectric,
        weight_all, weight_head, weight_tail,
        gravy,
        flex_all, flex_head, flex_tail,
    ]
    return (scalars + composition_all + composition_head + composition_tail
            + list(ss_frac))
Esempio n. 15
0
def make_dataset(fasta):
    """Compute per-sequence features for a FASTA file and pickle them as a DataFrame.

    Args:
        fasta: path of the FASTA file. The path must contain 'tar'
            (tardigrades, target 0) or 'pop' (poplars, target 1).

    Raises:
        ValueError: if the path contains neither marker, so no target can be
            assigned.

    Side effects:
        Prints the DataFrame and writes it to ``name + '_set.pkl'``.
        NOTE(review): ``name`` is not defined in this function; it has to be
        provided at module level.
    """
    # assign whether it's from tardigrades 'tar' or poplars 'pop'
    if 'tar' in fasta:
        target = 0
    elif 'pop' in fasta:
        target = 1
    else:
        # Fail before parsing instead of hitting an unbound ``target`` later.
        raise ValueError(
            "cannot infer target: expected 'tar' or 'pop' in %r" % (fasta,))

    # a list of dictionaries containing features for all sequences
    ls_features = []

    for record in SeqIO.parse(fasta, "fasta"):
        analysed_seq = ProteinAnalysis(str(record.seq))

        # the dictionary containing features for a single sequence
        dict_features = {}

        dict_features['length'] = len(record.seq)
        dict_features['mol_weight'] = analysed_seq.molecular_weight()

        # This entry used to be filled with molecular_weight() by mistake.
        dict_features['aromaticity'] = analysed_seq.aromaticity()

        # The 'stability' key stores the instability index.
        dict_features['stability'] = analysed_seq.instability_index()
        dict_features['flexibility'] = analysed_seq.flexibility()
        dict_features['isoelectric'] = analysed_seq.isoelectric_point()

        # secondary structure fraction, one key per element
        frac = analysed_seq.secondary_structure_fraction()
        dict_features['helix'] = frac[0]
        dict_features['turn'] = frac[1]
        dict_features['sheet'] = frac[2]

        # amino acid composition of the entire sequence, merged in as-is
        dict_features.update(analysed_seq.get_amino_acids_percent())
        ls_features.append(dict_features)

    df = pd.DataFrame(ls_features)
    df['target'] = target

    print(df)
    df.to_pickle(name + '_set.pkl')
def find_composition(df_original):
    """Append sequence-derived feature columns to a copy of ``df_original``.

    For every entry of the ``'seq'`` column a row of ProteinAnalysis features
    is computed: length, secondary structure fractions, isoelectric point,
    amino acid percentages (whole sequence, first ``first_n`` residues, last
    ``last_n`` residues), weight, gravy, aromaticity and the mean / standard
    deviation of the flexibility values.

    ``codes``, ``first_n``, ``last_n`` and ``tqdm`` come from module scope.

    Rows are collected in a list and the feature frame is built once. The
    previous per-row ``DataFrame.append`` copied the frame on every iteration
    and is not available in pandas >= 2.0. As a consequence the feature
    columns get an inferred numeric dtype.

    Args:
        df_original: DataFrame with a ``'seq'`` column; it is not modified.

    Returns:
        DataFrame: the copy of ``df_original`` concatenated column-wise
        (``axis=1``, aligned on index) with the feature frame, whose index is
        0..n-1.
    """
    df_copy = df_original.copy()

    # Declared column order: per-residue triples first, then the scalars.
    column_names = []
    for ch in codes:
        column_names.extend(
            (ch + '_percent', ch + '_percent_first', ch + '_percent_last'))
    column_names.extend([
        'len', 'weight', 'gravy', 'flex_mean', 'flex_std',
        'ss_helix', 'ss_turn', 'ss_sheet', 'iep', 'aromaticity',
    ])

    rows = []
    for seq in tqdm(df_copy['seq']):
        sequence = str(seq)
        analysed = ProteinAnalysis(sequence)
        analysed_first = ProteinAnalysis(sequence[:first_n])
        analysed_last = ProteinAnalysis(sequence[-last_n:])

        row = {'len': analysed.length}
        (row['ss_helix'], row['ss_turn'],
         row['ss_sheet']) = analysed.secondary_structure_fraction()
        row['iep'] = analysed.isoelectric_point()

        # overall
        for aa, percent in analysed.get_amino_acids_percent().items():
            row[aa + '_percent'] = percent

        # first N
        for aa, percent in analysed_first.get_amino_acids_percent().items():
            row[aa + '_percent_first'] = percent

        # last N
        for aa, percent in analysed_last.get_amino_acids_percent().items():
            row[aa + '_percent_last'] = percent

        row['weight'] = analysed.molecular_weight()
        row['gravy'] = analysed.gravy()
        row['aromaticity'] = analysed.aromaticity()

        # Flexibility is computed once and summarised twice.
        flexibility = analysed.flexibility()
        row['flex_mean'] = np.mean(flexibility)
        row['flex_std'] = np.std(flexibility)
        rows.append(row)

    df = pd.DataFrame(rows)
    # Keep the declared columns first (present even when no row filled them);
    # any additional keys produced by the rows follow, as append() used to do.
    declared = set(column_names)
    extra_columns = [col for col in df.columns if col not in declared]
    df = df.reindex(columns=column_names + extra_columns)

    return pd.concat([df_copy, df], axis=1)
Esempio n. 17
0
def get_structure_perc(seq, structure="helix"):
    """Return one secondary structure fraction of ``seq``.

    ``structure`` selects the element of ``secondary_structure_fraction()``:
    ``"helix"`` the first, ``"turn"`` the second, and any other value the
    third.
    """
    helix, turn, sheets = ProteinAnalysis(seq).secondary_structure_fraction()

    if structure == "helix":
        return helix
    return turn if structure == "turn" else sheets
def bio_feat(record):
    """Return a numeric feature vector for a sequence record.

    The sequence is cleaned first: 'X' is dropped, 'U' becomes 'C', 'B'
    becomes 'N' and 'Z' becomes 'Q'.

    Args:
        record: object exposing the sequence as ``record.seq``.

    Returns:
        np.ndarray: cleaned length, molecular weight, isoelectric point,
        instability index, the secondary structure fractions, the values of
        ``count_amino_acids()`` and the values of ``get_amino_acids_percent()``
        (both in the iteration order of the returned dicts).
    """
    # One translate pass replaces the chained replace() calls. It also drops
    # the MutableSeq(...).toseq() round-trip, which only ever produced a plain
    # string and relies on an API deprecated in recent Biopython releases.
    cleanup_table = str.maketrans("UBZ", "CNQ", "X")
    clean_seq = str(record.seq).translate(cleanup_table)

    ### features
    seq_length = len(clean_seq)
    analysed_seq = ProteinAnalysis(clean_seq)
    molecular_weight = analysed_seq.molecular_weight()
    amino_percent = analysed_seq.get_amino_acids_percent().values()
    isoelectric_points = analysed_seq.isoelectric_point()
    count = analysed_seq.count_amino_acids().values()
    instability_index = analysed_seq.instability_index()
    secondary_structure_fraction = analysed_seq.secondary_structure_fraction()

    return np.array(
        [seq_length, molecular_weight, isoelectric_points, instability_index]
        + list(secondary_structure_fraction)
        + list(count)
        + list(amino_percent))
Esempio n. 19
0
def protein_analysis():
    """Show the sequence input form and, on submit, store analysis results in the session.

    Relies on framework globals (``session``, ``request``, ``redirect``,
    ``URL``, ``FORM`` and the HTML helpers) and on ``seqClean`` being defined
    elsewhere. NOTE(review): these look like web2py controller globals.

    Returns:
        dict: ``{'form': form}`` when the form was not accepted in this call.
    """
    # Visitors without a username are sent to the log-in page.
    if session.username is None:
        redirect(URL(r=request, f='../account/log_in'))
    from Bio.SeqUtils.ProtParam import ProteinAnalysis
    form = FORM(TABLE(
            TR("Amino acid sequence:  ",
               TEXTAREA(_type="text", _name="sequence",
                        requires=IS_NOT_EMPTY())),
            INPUT(_type="submit", _value="SUBMIT")))
    if form.accepts(request.vars, session):
        # The submitted text is upper-cased and cleaned before analysis.
        session['sequence'] = seqClean(form.vars.sequence.upper())
        X = ProteinAnalysis(session['sequence'])
        session['aa_count'] = X.count_amino_acids()
        session['percent_aa'] = X.get_amino_acids_percent()
        session['mw'] = X.molecular_weight()
        session['aromaticity'] = X.aromaticity()
        session['instability'] = X.instability_index()
        session['flexibility'] = X.flexibility()
        session['pI'] = X.isoelectric_point()
        session['sec_struct'] = X.secondary_structure_fraction()
        # The results are rendered by a separate action.
        redirect(URL(r=request, f='protein_analysis_output'))
    return dict(form=form)
def get_features(seq):
    """Compute global features of a protein sequence.

    Counts and entropy figures are taken from the sequence as given; the
    ProteinAnalysis figures are taken after the characters
    ``X B Z ' O U`` have been removed from it.

    ``entropy``, ``entropy_ideal``, ``hydrophobic_proteins``,
    ``polar_proteins`` and ``buried`` are defined outside this function.

    Parameters
    ----------
    seq : str
        protein sequence

    Return
    ----------
    dictionary:
        global features of the protein sequence, followed by the entries of
        ``count_amino_acids()``

    """
    undefined = ('X', 'B', 'Z', "'", 'O', 'U')

    features = {}
    features['undefined_count'] = sum(1 for residue in seq if residue in undefined)
    features['length'] = len(seq)
    features['perc_undefined_count'] = features['undefined_count'] / features['length']
    features['entropy'] = entropy(seq)
    features['ideal_entropy'] = entropy_ideal(len(seq))
    features['perc_entropy'] = features['entropy'] / features['ideal_entropy']
    features['hydr_count'] = sum(1 for residue in seq if residue in hydrophobic_proteins)
    features['polar_count'] = sum(1 for residue in seq if residue in polar_proteins)
    features['buried'] = sum(
        buried[residue] for residue in seq if residue in hydrophobic_proteins)

    # Only the filtered sequence is handed to ProteinAnalysis.
    seq = ''.join(residue for residue in seq if residue not in undefined)
    protein = ProteinAnalysis(seq)

    features['gravy'] = protein.gravy()
    features['molecular_weight'] = protein.molecular_weight()
    features['aromaticity'] = protein.aromaticity()
    features['instability_index'] = protein.instability_index()
    features['isoelectric_point'] = protein.isoelectric_point()
    helix, turn, sheet = protein.secondary_structure_fraction()
    features['helix'] = helix
    features['turn'] = turn
    features['sheet'] = sheet

    features.update(protein.count_amino_acids())
    # features.update(protein.get_amino_acids_percent())
    return features
Esempio n. 21
0
def GetFeatures (My_seq):
    """Return a dict of ProteinAnalysis features for ``My_seq``.

    Single-valued entries: ``Molecular_weight``, ``Aromaticity``,
    ``Instability_index``, ``Isoelectric_point``.
    Multi-valued entries: ``Flexibility`` (list),
    ``Second_structure_fraction`` (tuple), ``Count_amino_acids`` (dict) and
    ``Amino_acids_percent`` (dict).
    """
    Features = {}

    # A second ProteinAnalysis instance used to be created here and thrown
    # away; one instance is enough.
    analysed_seq = ProteinAnalysis(My_seq)

    # Single-valued features
    Features["Molecular_weight"] = analysed_seq.molecular_weight()
    Features["Aromaticity"] = analysed_seq.aromaticity()
    Features["Instability_index"] = analysed_seq.instability_index()
    Features["Isoelectric_point"] = analysed_seq.isoelectric_point()

    # Multi-valued features
    Features["Flexibility"] = analysed_seq.flexibility()  # list
    Features["Second_structure_fraction"] = analysed_seq.secondary_structure_fraction()  # 3-tuple
    Features["Count_amino_acids"] = analysed_seq.count_amino_acids()  # dict, 20 entries
    Features["Amino_acids_percent"] = analysed_seq.get_amino_acids_percent()  # dict, 20 entries

    return Features
Esempio n. 22
0
    def seqs_to_features(self, seqs, no_seqs):
        """Turn sequences into a ``(no_seqs, 32)`` feature matrix.

        ``seqs`` is an iterable of iterables of sequences; they are chained
        and each sequence fills one row. Columns 0-22 hold the relative
        frequency of each letter of the alphabet below. The last nine columns
        hold, from left to right: gravy, length, secondary structure fractions
        in reverse order (element 2, 1, 0), isoelectric point, instability
        index, aromaticity and molecular weight. For the ProteinAnalysis
        figures 'X', 'B' and 'U' are replaced by 'A'.
        """
        alphabet = 'ABCDEFGHIKLMNPQRSTUVWXY'  # no JOZ
        features = np.zeros((no_seqs, 32))

        for row, sequence in enumerate(chain.from_iterable(seqs)):
            # letter frequencies
            for column, letter in enumerate(alphabet):
                features[row, column] = sequence.count(letter) / len(sequence)

            # other analysis, on a copy without the ambiguous letters
            cleaned = sequence
            for ambiguous in ('X', 'B', 'U'):
                cleaned = cleaned.replace(ambiguous, 'A')
            analysis = ProteinAnalysis(cleaned)
            fractions = analysis.secondary_structure_fraction()

            features[row, -9:] = [
                analysis.gravy(),              # -9: mean hydrophobicity
                len(sequence),                 # -8
                fractions[2],                  # -7
                fractions[1],                  # -6
                fractions[0],                  # -5
                analysis.isoelectric_point(),  # -4
                analysis.instability_index(),  # -3
                analysis.aromaticity(),        # -2
                analysis.molecular_weight(),   # -1
            ]
        return features
def physchem_props(ara_d):
    """Attach physicochemical properties to every usable protein in ``ara_d``.

    Each value of ``ara_d`` is a dict with at least ``"sequence"`` and
    ``"pos"``. Sequences containing ``X`` or ``*`` are skipped; for a skipped
    ``*`` sequence whose ``"pos"`` is not the empty list, the protein key is
    printed. Every other entry gets a ``"Properties"`` dict.

    Returns:
        The same ``ara_d`` object, modified in place.
    """
    for protein in ara_d:
        entry = ara_d[protein]
        seq = entry["sequence"]

        if "X" in seq:
            continue  # Skip non-usable sequences, only negs
        if '*' in seq:
            if entry["pos"] != []:
                print(protein)
            continue

        a_seq = ProteinAnalysis(seq)
        entry["Properties"] = {
            "mol_weight": a_seq.molecular_weight(),
            "gravy": a_seq.gravy(),
            "aromaticity": a_seq.aromaticity(),
            "instab_index": a_seq.instability_index(),
            "flexi": a_seq.flexibility(),
            "iso_point": a_seq.isoelectric_point(),
            "seq_struct": a_seq.secondary_structure_fraction(),
        }
    return ara_d
Esempio n. 24
0
def add_protein_characteristics(df):
    """Return a copy of ``df`` with amino acid and property columns added.

    For every entry of ``df['sequence']`` the letters B, Z, J, X, U and O are
    replaced by D, E, L, G, C and K, and the result is analysed. One column per
    amino acid (its fraction from ``get_amino_acids_percent()``) is added first,
    followed by aromaticity, helix, turn, sheet, isoelectric_point and gravy.
    """
    df = df.copy()

    aa_dict = {
        aa: []
        for aa in ('A', 'C', 'E', 'D', 'G', 'F', 'I', 'H', 'K', 'M', 'L',
                   'N', 'Q', 'P', 'S', 'R', 'T', 'W', 'V', 'Y')
    }
    # 'flexibility' and 'instability_index' are intentionally not collected.
    prop_dict = {
        name: []
        for name in ('aromaticity', 'helix', 'turn', 'sheet',
                     'isoelectric_point', 'gravy')
    }
    substitutions = (('B', 'D'), ('Z', 'E'), ('J', 'L'),
                     ('X', 'G'), ('U', 'C'), ('O', 'K'))

    for s in df['sequence']:
        for old, new in substitutions:
            s = s.replace(old, new)
        pa = ProteinAnalysis(s)

        prop_dict['aromaticity'].append(pa.aromaticity())
        prop_dict['isoelectric_point'].append(pa.isoelectric_point())
        prop_dict['gravy'].append(pa.gravy())
        for ss, fraction in zip(('helix', 'turn', 'sheet'),
                                pa.secondary_structure_fraction()):
            prop_dict[ss].append(fraction)
        for residue, percent in pa.get_amino_acids_percent().items():
            aa_dict[residue].append(percent)

    # Amino acid columns first, then the property columns.
    for collected in (aa_dict, prop_dict):
        for column, values in collected.items():
            df[column] = values
    return df
Esempio n. 25
0
def biopython_proteinanalysis_seq(seq, scaling=False):
    """Return a dict of ProteinAnalysis descriptors for ``seq``.

    Contains the min / max / std of the flexibility values, gravy,
    instability index, isoelectric point, the two molar extinction
    coefficients, molecular weight, the three secondary structure fractions,
    one ``percent:<aa>`` entry per amino acid (sorted by letter) and one
    ``prop_res_<key>`` entry per item of the module-level
    ``property_residues``, summing the percentages of the listed residues.

    ``scaling`` is accepted but not used by this function.
    """
    analysis = ProteinAnalysis(seq)
    flexibility = np.array(analysis.flexibility())

    d = {
        'flex:min': flexibility.min(),
        'flex:max': flexibility.max(),
        'flex:std': flexibility.std(),
        'gravy': analysis.gravy(),
        'instability_index': analysis.instability_index(),
        'isoelectric_point': analysis.isoelectric_point(),
    }

    reduced, cysteines = analysis.molar_extinction_coefficient()
    d['molar_extinction_coefficient_reduced'] = reduced
    d['molar_extinction_coefficient_cysteines'] = cysteines
    d['molecular_weight'] = analysis.molecular_weight()

    helix, turn, strand = analysis.secondary_structure_fraction()
    d['percent_helix_naive'] = helix
    d['percent_turn_naive'] = turn
    d['percent_strand_naive'] = strand

    aap = analysis.get_amino_acids_percent()
    for aa in sorted(aap):
        d['percent:%s' % aa] = aap[aa]
    for key, residues in property_residues.items():
        d['prop_res_%s' % key] = sum(aap.get(x, 0) for x in residues)
    return d
Esempio n. 26
0
def openfile():
    """Ask for a PDB file, analyse its sequence and fill the module-level result globals.

    Steps, in order:
      1. Open a Tk file dialog and parse the chosen file with ``PDBParser``.
      2. Build the polypeptides and keep the sequence of the last one as ``m``.
      3. Print whole-sequence ProteinAnalysis figures and compute the
         flexibility and ``protein_scale(kd, 10, 1.0)`` profiles.
      4. Slide a 10-residue window over ``m`` and collect the first secondary
         structure fraction of each window in ``sec``.
      5. Run freesasa once per residue position and collect the selected
         areas in ``acc`` / ``access`` and their base-10 logs in ``logacc``.

    Nothing is returned; all results are published through ``global`` names.
    ``Tk``, ``PDBParser``, ``PPBuilder``, ``freesasa`` and ``math`` must be
    available at module level.
    """
    global prob, probab, te
    global my_seq
    global anti
    global structure, structure_id, filename
    global antigenicity, hydro, flex, sec
    global m, a, c, b, length, j, k
    global hydroph, flexi, access
    anti = []
    sec = []
    probab = []
    from tkinter import filedialog
    root = Tk()
    root.filename = filedialog.askopenfilename(
        initialdir="/",
        title="Select file",
        filetypes=(("pdb files", "*.pdb"), ("pdb files", "*.pdb")))
    filename = root.filename
    print(filename)
    # NOTE(review): the structure id is fixed, whatever file was selected.
    structure_id = "1e6j"
    structure = PDBParser().get_structure(structure_id, root.filename)
    ppb = PPBuilder()
    # Every iteration overwrites my_seq, so only the last polypeptide's
    # sequence is used below.
    for pp in ppb.build_peptides(structure):
        my_seq = pp.get_sequence()  # type: Seq
        print(my_seq)
    for model in structure:
        for chain in model:
            print(chain)
    sequence = list(my_seq)
    m = ''.join(sequence)
    print(m)
    length = len(m)  # type: int
    print("Sequence consist of", length, "Amino Acids")
    from Bio.SeqUtils.ProtParam import ProteinAnalysis
    analysed_seq = ProteinAnalysis(m)
    print("Molecular weight = ", analysed_seq.molecular_weight())
    print("Amino Acid Count = ", analysed_seq.count_amino_acids())
    print("Secondary structure fraction =",
          analysed_seq.secondary_structure_fraction())
    # Per-residue scale handed to protein_scale below.
    # NOTE(review): the name and values suggest the Kyte-Doolittle hydropathy
    # scale - confirm.
    kd = {
        'A': 1.8,
        'R': -4.5,
        'N': -3.5,
        'D': -3.5,
        'C': 2.5,
        'Q': -3.5,
        'E': -3.5,
        'G': -0.4,
        'H': -3.2,
        'I': 4.5,
        'L': 3.8,
        'K': -3.9,
        'M': 1.9,
        'F': 2.8,
        'P': -1.6,
        'S': -0.8,
        'T': -0.7,
        'W': -0.9,
        'Y': -1.3,
        'V': 4.2
    }
    # c / flex / flexi hold the same flexibility profile, and b / hydro /
    # hydroph the same scale profile; each is recomputed for every name.
    c = list(analysed_seq.flexibility())
    b = list(analysed_seq.protein_scale(kd, 10, 1.0))
    hydro = list(analysed_seq.protein_scale(kd, 10, 1.0))
    flex = list(analysed_seq.flexibility())
    hydroph = list(analysed_seq.protein_scale(kd, 10, 1.0))
    flexi = list(analysed_seq.flexibility())

    # Sliding window of 10 residues: the window is m[j + 1:k + 1], starting at
    # index 0, and the loop runs (length - 10) times.
    i = 1
    j = -1  # type: int
    k = 9
    while i <= (length - 10):
        print("Sequence is = ", m[j + 1:k + 1])
        print("Flexibility value = ", c[j + 1])
        print("Hydrophilicity value = ", b[j + 1])
        ana_seq = ''.join(m[j + 1:k + 1])
        analyze_seq = ProteinAnalysis(ana_seq)
        # For Secondary structure Analysis
        print("Secondary structure fraction =",
              analyze_seq.secondary_structure_fraction())
        # Only the first fraction of each window is kept.
        a = list(analyze_seq.secondary_structure_fraction())
        a = a[0]
        sec.append(a)
        i += 1
        j += 1
        k += 1
    f = length
    r = 1
    y = 10
    global acc, logacc
    acc = []
    # One freesasa run per residue position; the selection covers residues
    # r..y (a span of 10) and is shifted by one on every iteration.
    for i in range(0, f):
        str1 = "accessibility, resi "
        str2 = str(r) + "-" + str(y)
        saving = str1 + str2
        print(saving)
        r = r + 1
        y = y + 1
        # NOTE(review): this re-reads the hard-coded "1e6j.pdb" on every
        # iteration instead of the file chosen in the dialog, and rebinds the
        # global `structure` to a freesasa structure.
        structure = freesasa.Structure("1e6j.pdb")
        resulta = freesasa.calc(structure)
        area_classes = freesasa.classifyResults(resulta, structure)
        print("Total : %.2f A2" % resulta.totalArea())
        for key in area_classes:
            print(key, ": %.2f A2" % area_classes[key])
        # The first result is discarded; the calculation is repeated with
        # explicit parameters and that result is used for the selections.
        resulta = freesasa.calc(
            structure,
            freesasa.Parameters({
                'algorithm': freesasa.LeeRichards,
                'n-slices': 10
            }))
        selections = freesasa.selectArea(('alanine, resn ala', saving),
                                         structure, resulta)
        # Every selected area is appended, so `a` is overwritten here as well.
        for key in selections:
            print(key, ": %.2f A2" % selections[key])
            a = selections[key]
            acc.append(a)

    # Keep every second collected area.
    # NOTE(review): presumably this picks one of the two selections per
    # iteration; which one depends on the key order of `selections` - verify.
    l = acc[0::2]
    access = l
    print(acc)
    print(l)
    # math.log raises ValueError for a value of 0.
    logacc = [math.log(y, 10) for y in l]

    print(logacc)
Esempio n. 27
0
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio.SeqUtils import ProtParamData
from Bio import SeqIO
# Print every ProteinAnalysis report for each record of the sample FASTA file.
with open('../../samples/pdbaa') as fh:
    for rec in SeqIO.parse(fh, 'fasta'):
        myprot = ProteinAnalysis(str(rec.seq))
        # Each report is evaluated right before it is printed, one after
        # another, in this order.
        reports = (
            myprot.count_amino_acids,
            myprot.get_amino_acids_percent,
            myprot.molecular_weight,
            myprot.aromaticity,
            myprot.instability_index,
            myprot.flexibility,
            myprot.isoelectric_point,
            myprot.secondary_structure_fraction,
            lambda: myprot.protein_scale(ProtParamData.kd, 9, .4),
        )
        for report in reports:
            print(report())
# One list per property; every entry is [source file argument, value].
# `gravy_index` is expected to exist already when this part runs.
aromaticity, instability_index = [], []
# flexibility = []
isoelectric_point, secondary_structure_fraction = [], []

# The two sequence sets are processed identically; each one is labelled with
# the command line argument at the given position.
for argument_position, protein_set in ((1, sequences_a), (2, sequences_b)):
    for protein in protein_set:
        # 'X' residues are dropped before the analysis.
        analysed_seq = ProteinAnalysis(str(protein.seq).replace("X", ""))

        gravy_index.append([sys.argv[argument_position], analysed_seq.gravy()])
        aromaticity.append(
            [sys.argv[argument_position], analysed_seq.aromaticity()])
        instability_index.append(
            [sys.argv[argument_position], analysed_seq.instability_index()])
        # flexibility.append([sys.argv[argument_position], analysed_seq.flexibility()])
        isoelectric_point.append(
            [sys.argv[argument_position], analysed_seq.isoelectric_point()])
        secondary_structure_fraction.append(
            [sys.argv[argument_position],
             analysed_seq.secondary_structure_fraction()])

# Box plot showing gravy indexes
gravy_index = pd.DataFrame(gravy_index, columns=["Filename", "Gravy Index"])
Esempio n. 29
0
def openfile():
    global my_seq
    global antigenicity
    global m, a, c, b
    from tkinter import filedialog
    root = Tk()
    root.filename = filedialog.askopenfilename(
        initialdir="/",
        title="Select file",
        filetypes=(("pdb files", "*.pdb"), ("pdb files", "*.pdb")))
    print(root.filename)
    structure_id = "1e6j"
    structure = PDBParser().get_structure(structure_id, root.filename)
    ppb = PPBuilder()
    for pp in ppb.build_peptides(structure):
        my_seq = pp.get_sequence()  # type: Seq
        print(my_seq)
    for model in structure:
        for chain in model:
            print(chain)
    sequence = list(my_seq)
    m = ''.join(sequence)  # type: str
    print(m)
    length = len(m)  # type: int
    print(length)
    print("Sequence consist of", len(m), "Amino Acids")
    from Bio.SeqUtils.ProtParam import ProteinAnalysis
    analysed_seq = ProteinAnalysis(m)
    print("Molecular weight = ", analysed_seq.molecular_weight())
    print("Amino Acid Count = ", analysed_seq.count_amino_acids())
    print("Secondary structure fraction =",
          analysed_seq.secondary_structure_fraction())
    kd = {
        'A': 1.8,
        'R': -4.5,
        'N': -3.5,
        'D': -3.5,
        'C': 2.5,
        'Q': -3.5,
        'E': -3.5,
        'G': -0.4,
        'H': -3.2,
        'I': 4.5,
        'L': 3.8,
        'K': -3.9,
        'M': 1.9,
        'F': 2.8,
        'P': -1.6,
        'S': -0.8,
        'T': -0.7,
        'W': -0.9,
        'Y': -1.3,
        'V': 4.2
    }
    c = list(analysed_seq.flexibility())
    b = list(analysed_seq.protein_scale(kd, 10, 1.0))
    i = 1
    j = -1  # type: int
    k = 9
    while i <= (length - 10):
        print("Sequence is = ", m[j + 1:k + 1])
        print("Flexibility value = ", c[j + 1])
        print("Hydrophilicity value = ", b[j + 1])
        ana_seq = ''.join(m[j + 1:k + 1])
        analyze_seq = ProteinAnalysis(ana_seq)
        # For Secondary structure Analysis
        print("Secondary structure fraction =",
              analyze_seq.secondary_structure_fraction())
        a = list(analyze_seq.secondary_structure_fraction())
        global tupleall
        tupleall = (m[j + 1:k + 1], c[j + 1], b[j + 1], a)
        print(tupleall[0], tupleall[2], tupleall[1], tupleall[3])
        i = i + 1
        if a[0] >= a[1]:
            a[0] = 1
        else:
            a[0] = a[1]
        # For Hydrophilicity
        if b[j + 1] > 0.5:
            b[j + 1] = 2
        elif b[j + 1] < 0.5 or b[j + 1] > 0:
            b[j + 1] = 1
        elif b[j + 1] > 0 or b[j + 1] > -0.4:
            b[j + 1] = -1
        elif b[j + 1] < -0.4:
            b[j + 1] = -2
        else:
            b[j + 1] = 0
        # For Flexibility
        if c[j + 1] > 1.0:
            c[j + 1] = 1
        else:
            c[j + 1] = 0
        # For antigenicity Index
        antigenicity = 0.3 * b[j + 1] + 0.15 * 1 + 0.15 * c[j + 1] + 0.2 * a[0]
        print("antigenicity", antigenicity)
        j += 1
        k += 1
        # Unpack the archive into the temporary directory, then collect
        # ProtParam features for every record of every extracted *.pdb file.
        # All result containers (p_len, mol_w, ..., d_count, d_perc) are
        # defined outside this excerpt and are appended to in place.
        zip_.extractall(tmpdir.path)

        for test_pdb in tmpdir.path.glob("*.pdb"):
            for record in SeqIO.parse(test_pdb, "pdb-atom"):
                # Every 'X' in the sequence is replaced with 'G' before
                # analysis (presumably so ProtParam does not fail on unknown
                # residues -- confirm).
                sequence = str(record.seq).replace('X', 'G')
                protein = ProteinAnalysis(sequence)
                p_len.append(len(sequence))
                mol_w.append(protein.molecular_weight())
                iso_p.append(protein.isoelectric_point())
                smell.append(protein.aromaticity())
                taste_factor.append(protein.gravy())
                insta_ind.append(protein.instability_index())
                char_at_acid.append(protein.charge_at_pH(1))
                char_at_neutral.append(protein.charge_at_pH(7))
                char_at_base.append(protein.charge_at_pH(14))
                # Computed once per record instead of once per element.
                ss_fraction = protein.secondary_structure_fraction()
                helter_skeler.append(ss_fraction[0])
                turnip.append(ss_fraction[1])
                garfield.append(ss_fraction[2])
                # The count of amino acid x goes to every d_count list whose
                # key ends with the letter x.
                aa_counts = protein.count_amino_acids()
                for x in amino_acids:
                    n = aa_counts[x]
                    for y in d_count:
                        if y[-1] == x:
                            d_count[y].append(n)
                # Same routing for the percentages into d_perc.
                aa_percent = protein.get_amino_acids_percent()
                for a in amino_acids:
                    m = aa_percent[a]
                    for b in d_perc:
                        if b[-1] == a:
                            d_perc[b].append(m)
            #areas = get_area_classes(test_pdb)
            #polar_area.append(areas[0])
            #apolar_area.append(areas[1])
# Feature accumulators filled by the loop below, one append per sequence
# (presumably meant to stay index-aligned with `sequences` -- see the review
# note on the try block).
# isoelectricPt and aromaticity are also appended to; they must already exist
# as lists, since they are not created in this excerpt.
aminoPercent=[]
secstruct=[]
hydrophob=[]
hydrophil=[]
surface=[]
gravy=[]
molweight=[]
instidx=[]
flex=[]

for seq in sequences:
        # One ProteinAnalysis object per input sequence.
        X=ProteinAnalysis(str(seq))
        isoelectricPt.append(X.isoelectric_point())
        aromaticity.append(X.aromaticity())  
        aminoPercent.append(X.get_amino_acids_percent())
        secstruct.append(X.secondary_structure_fraction())

        # These features throw Key & Value Errors due to non standard amino acids
        # (i.e. out of the 20 standard ones) e.g. X, U etc
        # NOTE(review): an append that succeeds before a later call raises is
        # not rolled back, so the except branch can add a second entry (0) to
        # a list that already received a value for this sequence -- confirm
        # whether the lists are expected to stay the same length.
        try:
            gravy.append(X.gravy())
            molweight.append(X.molecular_weight())
            instidx.append(X.instability_index())
            flex.append(X.flexibility())
            # Windowed profiles from the ProtParamData kd, hw and em scale
            # tables, all called with the arguments 9 and 0.4.
            hydrophob.append(X.protein_scale(ProtParamData.kd, 9, 0.4))
            hydrophil.append(X.protein_scale(ProtParamData.hw, 9, 0.4))
            surface.append(X.protein_scale(ProtParamData.em, 9, 0.4))

        except (KeyError,ValueError):
            # Fallback value 0 for features that could not be computed.
            gravy.append(0)
            molweight.append(0)
                      "," + str(mol_w) + "," + str(ins) + "," + str(cnt) +
                      "\n")
    else:
        with open(path_ + "\\data\\output\\svm_out.txt", "a+") as s:
            s.write("-1 " + ' '.join("{}:{}".format(k, v)
                                     for k, v in a.items()) + "\n")
        with open(pth + "weka_output.arff", "a+") as w:
            w.write(' '.join("{},".format(x)
                             for x in list(aa_count.values())) + " loc\n")
        with open(pth + "tain_DL.csv", "a+") as DPL:
            DPL.write(''.join("{},".format(x)
                              for x in list(aa_count.values())) +
                      str(round(aromat, 3)) + "," +
                      str(round(fraction[0], 3)) + "," +
                      str(round(fraction[1], 3)) + "," +
                      str(round(fraction[2], 3)) + "," + str(round(iso, 3)) +
                      "," + str(mol_w) + "," + str(ins) + "," + "0" + "\n")


# Main loop: compute the ProtParam features of every sequence and pass the
# amino-acid counts plus the class label to format_output.
# aromat, fraction, iso, mol_w and ins are deliberately kept as module-level
# names: format_output is called with only (aa_count, cl) and appears to read
# the remaining features as globals.
for seq, cl in zip(seq_list, cls_list):
    analysis = ProteinAnalysis(seq)  # Biopython protein analysis package
    aa_count = analysis.count_amino_acids()  # amino acid count
    aromat = analysis.aromaticity()
    fraction = analysis.secondary_structure_fraction()
    iso = analysis.isoelectric_point()
    try:
        # Both values are formatted with two decimals; the pair is assigned
        # together so a failure in either call leaves neither half-updated.
        mol_w, ins = ("%0.2f" % analysis.molecular_weight(),
                      "%0.2f" % analysis.instability_index())
    except Exception:
        # Kept broad so feature extraction stays best-effort. The handler
        # used to be the no-op "mol_w, ins = mol_w, ins", which wrote the
        # PREVIOUS sequence's values for this one (or raised NameError when
        # the very first sequence failed). Use an explicit placeholder in the
        # same "%0.2f" text format instead.
        mol_w, ins = "0.00", "0.00"
    format_output(aa_count, cl)