def find_composition(df_original):
    """Return a copy of ``df_original`` extended with per-sequence features.

    Every value of ``df_original['seq']`` is converted to ``str`` and analysed
    with ``ProteinAnalysis`` three times: the full sequence, its first
    ``first_n`` characters and its last ``last_n`` characters.  ``codes``,
    ``first_n`` and ``last_n`` are module-level names defined outside this
    function.

    Args:
        df_original: DataFrame with a ``'seq'`` column.  It is not modified.

    Returns:
        A new DataFrame: the columns of ``df_original`` followed by the
        feature columns (``<aa>_percent``, ``<aa>_percent_first``,
        ``<aa>_percent_last`` for every entry of ``codes``, then ``len``,
        ``weight``, ``gravy``, ``flex_mean``, ``flex_std``, ``ss_helix``,
        ``ss_turn``, ``ss_sheet``, ``iep`` and ``aromaticity``).
    """
    df_copy = df_original.copy()

    column_names = []
    for ch in codes:
        column_names.extend(
            [ch + '_percent', ch + '_percent_first', ch + '_percent_last'])
    column_names.extend([
        'len', 'weight', 'gravy', 'flex_mean', 'flex_std',
        'ss_helix', 'ss_turn', 'ss_sheet', 'iep', 'aromaticity',
    ])

    # Collect one dict per sequence and build the frame once at the end.
    # Growing a DataFrame row by row with DataFrame.append copies the whole
    # frame on every iteration and the method no longer exists in pandas 2.x.
    rows = []
    for seq in tqdm(df_copy['seq']):
        sequence = str(seq)
        analysed = ProteinAnalysis(sequence)
        analysed_first = ProteinAnalysis(sequence[:first_n])
        analysed_last = ProteinAnalysis(sequence[-last_n:])

        row = {'len': analysed.length}
        (row['ss_helix'],
         row['ss_turn'],
         row['ss_sheet']) = analysed.secondary_structure_fraction()
        row['iep'] = analysed.isoelectric_point()

        # overall
        for aa, percent in analysed.get_amino_acids_percent().items():
            row[aa + '_percent'] = percent

        # first N
        for aa, percent in analysed_first.get_amino_acids_percent().items():
            row[aa + '_percent_first'] = percent

        # last N
        for aa, percent in analysed_last.get_amino_acids_percent().items():
            row[aa + '_percent_last'] = percent

        row['weight'] = analysed.molecular_weight()
        row['gravy'] = analysed.gravy()
        row['aromaticity'] = analysed.aromaticity()
        # The flexibility profile is needed twice; compute it once.
        flexibility = analysed.flexibility()
        row['flex_mean'] = np.mean(flexibility)
        row['flex_std'] = np.std(flexibility)
        rows.append(row)

    # Reuse the input index so the column-wise concat below lines rows up
    # positionally even when df_original does not have a 0..n-1 index.
    df = pd.DataFrame(rows, index=df_copy.index)

    # Declared columns first (created as NaN when no row produced them),
    # followed by any additional column the analysis produced.
    declared = set(column_names)
    extra = [name for name in df.columns if name not in declared]
    df = df.reindex(columns=column_names + extra)

    return pd.concat([df_copy, df], axis=1)
Example #2
0
def getProps(f):
    """
    Load ``f`` with ``myPDB.loader`` and summarise the result using Biopython.

    Returns a 3-tuple: the molecular weight of the loaded sequence, the
    largest value of its flexibility profile, and the sum of the loader's
    ``ASA`` attribute.
    """
    loaded = myPDB.loader(f)
    analysis = ProteinAnalysis(loaded.seq)

    weight = analysis.molecular_weight()
    peak_flexibility = np.max(analysis.flexibility())
    total_asa = np.sum(loaded.ASA)
    return weight, peak_flexibility, total_asa
Example #3
0
def protein_analysis():
    """Show the sequence form and, once submitted, store the analysis in the session.

    Visitors without ``session.username`` are redirected to the account
    controller's ``log_in`` action.  When the form is accepted, the cleaned,
    upper-cased sequence and the ``ProteinAnalysis`` results are written to
    ``session`` and the request is redirected to ``protein_analysis_output``.

    Returns:
        dict: ``{'form': form}`` for the view while no valid submission exists.
    """
    # Identity comparison is the correct test for None.
    if session.username is None:
        redirect(URL(r=request, c='account', f='log_in'))
    from Bio.SeqUtils.ProtParam import ProteinAnalysis
    form = FORM(
        TABLE(
            TR(
                "Amino acid sequence:  ",
                TEXTAREA(_type="text",
                         _name="sequence",
                         requires=IS_NOT_EMPTY())),
            INPUT(_type="submit", _value="SUBMIT")))
    if form.accepts(request.vars, session):
        session['sequence'] = seqClean(form.vars.sequence.upper())
        X = ProteinAnalysis(session['sequence'])
        session['aa_count'] = X.count_amino_acids()
        session['percent_aa'] = X.get_amino_acids_percent()
        session['mw'] = X.molecular_weight()
        session['aromaticity'] = X.aromaticity()
        session['instability'] = X.instability_index()
        session['flexibility'] = X.flexibility()
        session['pI'] = X.isoelectric_point()
        session['sec_struct'] = X.secondary_structure_fraction()
        redirect(URL(r=request, f='protein_analysis_output'))
    return dict(form=form)
Example #4
0
    def protAnalysis(self, content):
        """Analyse a protein sequence and return three result dictionaries.

        ``content`` is first normalised with ``Parsers.normalizeSequence``
        using ``self.sourceType``.

        Returns:
            tuple: ``(result, flattened, flattenedFlexDic)`` where

            * ``result`` holds the scalar properties merged with the
              flattened secondary-structure fractions,
            * ``flattened`` is ``result`` merged with the flexibility profile
              flattened under the ``'proteinFlex'`` prefix,
            * ``flattenedFlexDic`` is ``result`` merged with the flexibility
              profile flattened against the list of sequence characters.

            In every merge the entries of ``result`` win on key collisions.
        """
        content = Parsers.normalizeSequence(content, self.sourceType)
        protein = ProteinAnalysis(content)

        result = {
            'proteinMWeight': protein.molecular_weight(),
            'proteinAroma': protein.aromaticity(),
            'proteinInstab': protein.instability_index(),
            'proteinIsoelec': protein.isoelectric_point(),
            'proteinGravy': protein.gravy(),
        }

        # merge result and protein structure
        proteinStructure = protein.secondary_structure_fraction()
        protStruct = self.flatten('proteinSecstruc', proteinStructure)
        result = {**protStruct, **result}

        flexibility = protein.flexibility()
        flexibFlat = self.flatten('proteinFlex', flexibility)
        flexibAmino = self.flatten(list(content), flexibility)

        flattened = {**flexibFlat, **result}
        flattenedFlexDic = {**flexibAmino, **result}

        return result, flattened, flattenedFlexDic
Example #5
0
def make_dataset(fasta):
    """Build a feature table from a FASTA file, print it and pickle it.

    The class label is derived from the file name: ``'tar'`` (tardigrades)
    gives target 0, ``'pop'`` (poplars) gives target 1.  The resulting frame
    is written to ``name + '_set.pkl'``, where ``name`` is a module-level
    value defined outside this function.

    Args:
        fasta: Path of the FASTA file; must contain ``'tar'`` or ``'pop'``.

    Raises:
        ValueError: If ``fasta`` contains neither ``'tar'`` nor ``'pop'``.
    """
    # assign whether it's from tardigrades 'tar' or poplars 'pop'
    if 'tar' in fasta:
        target = 0
    elif 'pop' in fasta:
        target = 1
    else:
        # Fail before parsing instead of hitting an unbound ``target`` later.
        raise ValueError(
            "cannot infer target from %r: expected 'tar' or 'pop' in the name"
            % (fasta,))

    # a list of dictionaries containing features for all sequences
    ls_features = []

    for record in SeqIO.parse(fasta, "fasta"):
        analysed_seq = ProteinAnalysis(str(record.seq))

        # the dictionary containing features for a single sequence
        dict_features = {
            'length': len(record.seq),
            'mol_weight': analysed_seq.molecular_weight(),
            # The original stored molecular_weight() here by mistake.
            'aromaticity': analysed_seq.aromaticity(),
            'stability': analysed_seq.instability_index(),
            'flexibility': analysed_seq.flexibility(),
            'isoelectric': analysed_seq.isoelectric_point(),
        }

        # secondary structure fraction, in the order Biopython returns it
        frac = analysed_seq.secondary_structure_fraction()
        dict_features['helix'] = frac[0]
        dict_features['turn'] = frac[1]
        dict_features['sheet'] = frac[2]

        # amino-acid composition of the entire sequence
        dict_features.update(analysed_seq.get_amino_acids_percent())
        ls_features.append(dict_features)

    df = pd.DataFrame(ls_features)
    df['target'] = target

    print(df)
    df.to_pickle(name + '_set.pkl')
Example #6
0
    def get_flexibility(self):
        """
        Compute the Biopython flexibility profile (Vihinen, 1994) of
        ``self.ProteinSequence``.

        :return: dictionary mapping ``'flexibility_<i>'`` to the i-th value of
            the profile (the original note states the profile has
            sequence length - 9 values)
        """
        profile = ProteinAnalysis(self.ProteinSequence).flexibility()
        return {
            'flexibility_' + str(position): score
            for position, score in enumerate(profile)
        }
Example #7
0
def biopython_protein_analysis(inseq):
    """Utilize Biopython's ProteinAnalysis module to return general sequence properties of an amino acid string.

    For full definitions see: http://biopython.org/DIST/docs/api/Bio.SeqUtils.ProtParam.ProteinAnalysis-class.html

    Args:
        inseq: Amino acid sequence

    Returns:
        dict: Dictionary of sequence properties, every key carrying the
        ``-biop`` suffix. Some definitions include:
        instability_index: Any value above 40 means the protein is unstable (has a short half life).
        percent_helix_naive / percent_turn_naive / percent_strand_naive:
        the three values of ``secondary_structure_fraction()``, in order.

    TODO:
        Finish definitions of dictionary

    """

    inseq = ssbio.protein.sequence.utils.cast_to_str(inseq)

    analysed_seq = ProteinAnalysis(inseq)

    info_dict = {}
    info_dict['amino_acids_content-biop'] = analysed_seq.count_amino_acids()
    info_dict['amino_acids_percent-biop'] = analysed_seq.get_amino_acids_percent()
    info_dict['length-biop'] = analysed_seq.length
    info_dict['monoisotopic-biop'] = analysed_seq.monoisotopic
    info_dict['molecular_weight-biop'] = analysed_seq.molecular_weight()
    info_dict['aromaticity-biop'] = analysed_seq.aromaticity()
    info_dict['instability_index-biop'] = analysed_seq.instability_index()
    # TODO: What is flexibility?
    info_dict['flexibility-biop'] = analysed_seq.flexibility()
    info_dict['isoelectric_point-biop'] = analysed_seq.isoelectric_point()

    # grand average of hydrophobicity
    info_dict['gravy-biop'] = analysed_seq.gravy()

    # Separated secondary_structure_fraction into each definition.
    # Computed once instead of once per key.
    fractions = analysed_seq.secondary_structure_fraction()
    info_dict['percent_helix_naive-biop'] = fractions[0]
    info_dict['percent_turn_naive-biop'] = fractions[1]
    info_dict['percent_strand_naive-biop'] = fractions[2]

    return info_dict
Example #8
0
    def flexibility(self):  # overrides base class
        """Return the flexibility profile, tolerating unknown residues.

        When ``self._contains_unknown`` is set, every ``X``, ``U`` and ``B``
        in ``self._sequence`` is replaced by ``R`` and the profile is computed
        on that substituted sequence; otherwise the base-class implementation
        is used unchanged.
        """
        if not self._contains_unknown:
            return super(Protein, self).flexibility()

        patched = self._sequence
        for unknown in ('X', 'U', 'B'):
            patched = patched.replace(unknown, 'R')  # replace with an avg aa
        return ProteinAnalysis(patched).flexibility()
Example #9
0
def protein_analysis():
    """Show the sequence form and, once submitted, store the analysis in the session.

    Visitors without ``session.username`` are redirected to the log-in page.
    When the form is accepted, the cleaned, upper-cased sequence and the
    ``ProteinAnalysis`` results are written to ``session`` and the request is
    redirected to ``protein_analysis_output``.

    Returns:
        dict: ``{'form': form}`` for the view while no valid submission exists.
    """
    # Identity comparison is the correct test for None.
    if session.username is None:
        redirect(URL(r=request, f='../account/log_in'))
    from Bio.SeqUtils.ProtParam import ProteinAnalysis
    form = FORM(TABLE(
            TR("Amino acid sequence:  ",
               TEXTAREA(_type="text", _name="sequence",
                        requires=IS_NOT_EMPTY())),
            INPUT(_type="submit", _value="SUBMIT")))
    if form.accepts(request.vars, session):
        session['sequence'] = seqClean(form.vars.sequence.upper())
        X = ProteinAnalysis(session['sequence'])
        session['aa_count'] = X.count_amino_acids()
        session['percent_aa'] = X.get_amino_acids_percent()
        session['mw'] = X.molecular_weight()
        session['aromaticity'] = X.aromaticity()
        session['instability'] = X.instability_index()
        session['flexibility'] = X.flexibility()
        session['pI'] = X.isoelectric_point()
        session['sec_struct'] = X.secondary_structure_fraction()
        redirect(URL(r=request, f='protein_analysis_output'))
    return dict(form=form)
Example #10
0
def GetFeatures(My_seq):
    """Return a dict of Biopython ``ProteinAnalysis`` features for ``My_seq``.

    Keys: ``Molecular_weight``, ``Aromaticity``, ``Instability_index`` and
    ``Isoelectric_point`` (single values), plus ``Flexibility``,
    ``Second_structure_fraction``, ``Count_amino_acids`` and
    ``Amino_acids_percent`` (multi-valued).
    """
    Features = {}

    # The original built a second, discarded ProteinAnalysis first; one
    # instance is enough.
    analysed_seq = ProteinAnalysis(My_seq)

    # Single-valued features
    Features["Molecular_weight"] = analysed_seq.molecular_weight()
    Features["Aromaticity"] = analysed_seq.aromaticity()
    Features["Instability_index"] = analysed_seq.instability_index()
    Features["Isoelectric_point"] = analysed_seq.isoelectric_point()

    # Multi-valued features
    Features["Flexibility"] = analysed_seq.flexibility()  # list (original note: 580)
    Features["Second_structure_fraction"] = analysed_seq.secondary_structure_fraction()  # 3-tuple
    Features["Count_amino_acids"] = analysed_seq.count_amino_acids()  # dict (original note: 20)
    Features["Amino_acids_percent"] = analysed_seq.get_amino_acids_percent()  # dict (original note: 20)

    return Features
def physchem_props(ara_d):
    """Calculate the physicochemical properties per protein in ara_d.

    Each value of ``ara_d`` is a dict with at least a ``"sequence"`` entry.
    Proteins whose sequence contains ``"X"`` or ``"*"`` are skipped; for the
    ``"*"`` case the protein key is printed when its ``"pos"`` entry is not an
    empty list.  Every other protein gets a ``"Properties"`` dict with the
    keys ``mol_weight``, ``gravy``, ``aromaticity``, ``instab_index``,
    ``flexi``, ``iso_point`` and ``seq_struct``.

    Args:
        ara_d: Mapping of protein id to its record dict; updated in place.

    Returns:
        The same ``ara_d`` object.
    """
    keys = (
        "mol_weight",
        "gravy",
        "aromaticity",
        "instab_index",
        "flexi",
        "iso_point",
        "seq_struct",
    )
    for protein, record in ara_d.items():
        seq = record["sequence"]
        if "X" in seq:
            continue  # Skip non-usable sequences, only negs
        if '*' in seq:
            if record["pos"] != []:
                print(protein)
            continue

        a_seq = ProteinAnalysis(seq)
        results = (
            a_seq.molecular_weight(),
            a_seq.gravy(),
            a_seq.aromaticity(),
            a_seq.instability_index(),
            a_seq.flexibility(),
            a_seq.isoelectric_point(),
            a_seq.secondary_structure_fraction(),
        )
        # Update the record with the new physchem properties
        record["Properties"] = dict(zip(keys, results))
    return ara_d
Example #12
0
def biopython_proteinanalysis_seq(seq, scaling=False):
    """Return a flat dict of Biopython ``ProteinAnalysis`` features for ``seq``.

    The dict holds min/max/std of the flexibility profile, GRAVY, instability
    index, isoelectric point, both molar extinction coefficients, molecular
    weight, the three secondary-structure fractions, one ``percent:<aa>``
    entry per amino acid and one ``prop_res_<key>`` entry per item of the
    module-level ``property_residues`` mapping (sum of the percentages of the
    residues listed for that key).

    ``scaling`` is accepted but not used by this function.
    """
    analysis = ProteinAnalysis(seq)
    flex_profile = np.array(analysis.flexibility())

    features = {
        'flex:min': flex_profile.min(),
        'flex:max': flex_profile.max(),
        'flex:std': flex_profile.std(),
    }
    features['gravy'] = analysis.gravy()
    features['instability_index'] = analysis.instability_index()
    features['isoelectric_point'] = analysis.isoelectric_point()

    reduced, cysteines = analysis.molar_extinction_coefficient()
    features['molar_extinction_coefficient_reduced'] = reduced
    features['molar_extinction_coefficient_cysteines'] = cysteines

    features['molecular_weight'] = analysis.molecular_weight()

    helix, turn, strand = analysis.secondary_structure_fraction()
    features['percent_helix_naive'] = helix
    features['percent_turn_naive'] = turn
    features['percent_strand_naive'] = strand

    percents = analysis.get_amino_acids_percent()
    for residue in sorted(percents.keys()):
        features['percent:%s' % residue] = percents[residue]

    for group, residues in property_residues.items():
        features['prop_res_%s' % group] = sum(
            percents.get(residue, 0) for residue in residues)

    return features
flex = []

# Fill the per-sequence feature lists. Every list receives exactly one entry
# per sequence so that they stay aligned with each other.
for seq in sequences:
    X = ProteinAnalysis(str(seq))
    isoelectricPt.append(X.isoelectric_point())
    aromaticity.append(X.aromaticity())
    aminoPercent.append(X.get_amino_acids_percent())
    secstruct.append(X.secondary_structure_fraction())

    # These features throw Key & Value Errors due to non standard amino acids
    # (i.e. out of the 20 standard ones) e.g. X, U etc.
    # All values are computed BEFORE anything is appended: appending inside
    # the try block would leave a real value plus a fallback value in the
    # lists that were already filled when a later call raises.
    try:
        seq_features = (
            X.gravy(),
            X.molecular_weight(),
            X.instability_index(),
            X.flexibility(),
            X.protein_scale(ProtParamData.kd, 9, 0.4),
            X.protein_scale(ProtParamData.hw, 9, 0.4),
            X.protein_scale(ProtParamData.em, 9, 0.4),
        )
    except (KeyError, ValueError):
        seq_features = (0, 0, 0, [0, 0], [0, 0], [0, 0], [0, 0])

    feature_lists = (gravy, molweight, instidx, flex,
                     hydrophob, hydrophil, surface)
    for feature_list, feature in zip(feature_lists, seq_features):
        feature_list.append(feature)

isoelectricPt_df = pd.DataFrame(isoelectricPt, columns=['isoelectricPt'])
aromaticity_df = pd.DataFrame(aromaticity, columns=['aromaticity'])
Example #14
0
def openfile():
    """Let the user pick a PDB file, then print and store sequence properties.

    Steps performed, all results being published through module-level globals:

    1. Open a Tk file dialog and parse the chosen file with ``PDBParser``.
    2. Build peptides with ``PPBuilder``; ``my_seq`` ends up holding the
       sequence of the LAST peptide, which is the one analysed below.
    3. Print whole-sequence properties and compute the flexibility profile
       and a ``protein_scale`` profile (window 10) over the ``kd`` table.
    4. Slide a 10-residue window over the sequence and store the first value
       of each window's ``secondary_structure_fraction()`` in ``sec``.
    5. Run freesasa selections for residue ranges ``1-10``, ``2-11``, ... and
       store the areas in ``acc`` / ``access`` and their log10 in ``logacc``.
    """
    global prob, probab, te
    global my_seq
    global anti
    global structure, structure_id, filename
    global antigenicity, hydro, flex, sec
    global m, a, c, b, length, j, k
    global hydroph, flexi, access
    anti = []
    sec = []
    probab = []
    from tkinter import filedialog
    root = Tk()
    root.filename = filedialog.askopenfilename(
        initialdir="/",
        title="Select file",
        filetypes=(("pdb files", "*.pdb"), ("pdb files", "*.pdb")))
    filename = root.filename
    print(filename)
    # The structure id is a fixed label; the data comes from the chosen file.
    structure_id = "1e6j"
    structure = PDBParser().get_structure(structure_id, root.filename)
    ppb = PPBuilder()
    # Each iteration overwrites my_seq, so only the last peptide is kept.
    for pp in ppb.build_peptides(structure):
        my_seq = pp.get_sequence()  # type: Seq
        print(my_seq)
    for model in structure:
        for chain in model:
            print(chain)
    sequence = list(my_seq)
    m = ''.join(sequence)
    print(m)
    length = len(m)  # type: int
    print("Sequence consist of", length, "Amino Acids")
    from Bio.SeqUtils.ProtParam import ProteinAnalysis
    analysed_seq = ProteinAnalysis(m)
    print("Molecular weight = ", analysed_seq.molecular_weight())
    print("Amino Acid Count = ", analysed_seq.count_amino_acids())
    print("Secondary structure fraction =",
          analysed_seq.secondary_structure_fraction())
    # Per-residue scale passed to protein_scale below.
    # NOTE(review): the values look like the Kyte-Doolittle hydropathy table,
    # while the print further down labels the result "Hydrophilicity" - confirm.
    kd = {
        'A': 1.8,
        'R': -4.5,
        'N': -3.5,
        'D': -3.5,
        'C': 2.5,
        'Q': -3.5,
        'E': -3.5,
        'G': -0.4,
        'H': -3.2,
        'I': 4.5,
        'L': 3.8,
        'K': -3.9,
        'M': 1.9,
        'F': 2.8,
        'P': -1.6,
        'S': -0.8,
        'T': -0.7,
        'W': -0.9,
        'Y': -1.3,
        'V': 4.2
    }
    # The same two profiles are computed three times each and published under
    # different global names (c/flex/flexi and b/hydro/hydroph).
    c = list(analysed_seq.flexibility())
    b = list(analysed_seq.protein_scale(kd, 10, 1.0))
    hydro = list(analysed_seq.protein_scale(kd, 10, 1.0))
    flex = list(analysed_seq.flexibility())
    hydroph = list(analysed_seq.protein_scale(kd, 10, 1.0))
    flexi = list(analysed_seq.flexibility())

    # Sliding window: m[j + 1:k + 1] starts as m[0:10] and moves one residue
    # per iteration; the loop body runs (length - 10) times.
    i = 1
    j = -1  # type: int
    k = 9
    while i <= (length - 10):
        print("Sequence is = ", m[j + 1:k + 1])
        print("Flexibility value = ", c[j + 1])
        print("Hydrophilicity value = ", b[j + 1])
        ana_seq = ''.join(m[j + 1:k + 1])
        analyze_seq = ProteinAnalysis(ana_seq)
        # For Secondary structure Analysis
        print("Secondary structure fraction =",
              analyze_seq.secondary_structure_fraction())
        a = list(analyze_seq.secondary_structure_fraction())
        # Keep only the first fraction of the window (helix, per Biopython's
        # documented return order - TODO confirm for the installed version).
        a = a[0]
        sec.append(a)
        i += 1
        j += 1
        k += 1
    f = length
    r = 1
    y = 10
    global acc, logacc
    acc = []
    # One freesasa run per residue; the loop variable itself is unused, the
    # residue range is tracked through r and y.
    for i in range(0, f):
        # Selection string of the form "accessibility, resi <r>-<y>".
        str1 = "accessibility, resi "
        str2 = str(r) + "-" + str(y)
        saving = str1 + str2
        print(saving)
        r = r + 1
        y = y + 1
        # NOTE(review): this loads the hard-coded file "1e6j.pdb", not the
        # file chosen in the dialog, and rebinds the global ``structure``.
        # The structure and both calc() calls do not depend on the loop
        # variables, so they are recomputed identically on every iteration.
        structure = freesasa.Structure("1e6j.pdb")
        resulta = freesasa.calc(structure)
        area_classes = freesasa.classifyResults(resulta, structure)
        print("Total : %.2f A2" % resulta.totalArea())
        for key in area_classes:
            print(key, ": %.2f A2" % area_classes[key])
        resulta = freesasa.calc(
            structure,
            freesasa.Parameters({
                'algorithm': freesasa.LeeRichards,
                'n-slices': 10
            }))
        selections = freesasa.selectArea(('alanine, resn ala', saving),
                                         structure, resulta)
        # Every selection result is appended, so acc grows by one entry per
        # selection key per iteration; this also overwrites the global ``a``.
        for key in selections:
            print(key, ": %.2f A2" % selections[key])
            a = selections[key]
            acc.append(a)

    # Keep every second entry of acc.
    # NOTE(review): which of the two selections this picks depends on the
    # key order of the mapping returned by selectArea - confirm.
    l = acc[0::2]
    access = l
    print(acc)
    print(l)
    # math.log raises ValueError for non-positive values, so an area of 0
    # in ``l`` would abort here.
    logacc = [math.log(y, 10) for y in l]

    print(logacc)
Example #15
0
class ProteinFeatureExtractor:
    """
    Feature extraction from protein sequence for
    Machine Learning classification or deeper analysis

    Example usage:

        from features.extractors.proteins import ProteinFeatureExtractor

        pfe = ProteinFeatureExtractor(protein_sequence='MAKINELLRESTTTNSNSIGRPNLVALTRATTKLIYSDIVATQRTNQPVAA')
        pfe.get_features()
    """

    # Names of the produced features. The methods below address entries by
    # position, so the order of this list must not change.
    FEATURE_NAMES = [
        "protein_length",
        "gravy",
        "molecular_weight",
        "aromaticity",
        "instability_index",
        "isoelectric_point",
        "flexibility",
        "mec_cysteines",
        "mec_cystines",
        "ssf_helix",
        "ssf_turn",
        "ssf_sheet",
    ]

    def __init__(self, protein_sequence: str):
        """
        Normalize the sequence and create the Biopython
        ``ProteinAnalysis`` object used by all feature methods
        """

        self.protein_sequence = self._normalize(protein_sequence)

        self.protein_analysis = ProteinAnalysis(self.protein_sequence)

    @staticmethod
    def _normalize(source: Union[str, SeqRecord]) -> str:
        """
        Normalize each protein sequence
        to uppercase and without blank chars

        All whitespace is removed, not only leading and trailing
        whitespace, so a sequence pasted with line breaks or spaces
        inside it is analysed as one continuous sequence
        """

        # If source is a string
        if isinstance(source, str):
            entry = source
        # If source is a BioPython object with seq field
        else:
            entry = source.seq

        # str.split() without arguments splits on any whitespace run,
        # so joining the pieces drops every blank character
        return "".join(str(entry).upper().split())

    def _get_protein_length(self) -> int:
        """
        Protein length (number of characters of the normalized sequence)
        """

        return len(self.protein_sequence)

    def _calculate_gravy(self) -> float:
        """
        GRAVY (Grand Average of Hydropathy) index score
        is calculated by adding the hydropathy value for
        each residue and then dividing by the length of
        the protein sequence

        On the Kyte-Doolittle hydropathy scale a positive GRAVY
        value indicates a hydrophobic (non-polar) protein and a
        negative value indicates a hydrophilic (polar) protein
        """

        return self.protein_analysis.gravy()

    def _calculate_molecular_weight(self) -> float:
        """
        Molecular Weight is calculated as the sum
        of atomic masses of all atoms in the molecule
        """

        return self.protein_analysis.molecular_weight()

    def _calculate_aromaticity(self) -> float:
        """
        Aromaticity is used to describe a planar, cyclic
        molecule with a ring of resonance bonds which is
        more stable when compared to other connective or
        geometric arrangements consisting of the same set
        of atoms
        """

        return self.protein_analysis.aromaticity()

    def _calculate_instability_index(self) -> float:
        """
        Instability index gives an estimate of the stability
        of the protein in a test tube

        Any value above 40 means that the protein is unstable
        (has a short half life)
        """

        return self.protein_analysis.instability_index()

    def _calculate_isoelectric_point(self) -> float:
        """
        Isoelectric point (pI) is the pH at which net charge of
        the protein is zero. Isoelectric point is widely useful
        for choosing a buffer system for purification and
        crystallisation of a given protein
        """

        return self.protein_analysis.isoelectric_point()

    def _calculate_flexibility(self) -> float:
        """
        Flexibility is of overwhelming importance for protein function,
        because of the changes in protein structure during interactions
        with binding partners

        Returns the SUM of the flexibility profile computed by
        Biopython, i.e. a single number per protein
        """

        return sum(self.protein_analysis.flexibility())

    def _calculate_molar_extinction_coefficient(self) -> Mapping[str, float]:
        """
        Molar extinction coefficient of a protein sequence can be calculated
        from the molar extinction coefficient of amino acids which are
        Cystine, Tyrosine and Tryptophan

        Returns the two values reported by Biopython under the
        ``mec_cysteines`` and ``mec_cystines`` feature names
        """

        cysteines, cystines = self.protein_analysis.molar_extinction_coefficient()

        residues = {self.FEATURE_NAMES[7]: cysteines, self.FEATURE_NAMES[8]: cystines}

        return residues

    def _calculate_secondary_structure_fraction(self) -> Mapping[str, float]:
        """
        This function returns a mapping with the fraction of amino acids
        which tend to be in Helix, Turn or Sheet, taken in that order from
        Biopython's ``secondary_structure_fraction()``

        NOTE(review): the original documentation listed N, P, G, S for
        Turn and E, M, A, L for Sheets; the residue sets depend on the
        installed Biopython version - confirm against its documentation
        """

        helix, turn, sheet = self.protein_analysis.secondary_structure_fraction()

        fractions = {
            self.FEATURE_NAMES[9]: helix,
            self.FEATURE_NAMES[10]: turn,
            self.FEATURE_NAMES[11]: sheet,
        }

        return fractions

    def get_features(self) -> Mapping[str, Union[int, float, None]]:
        """
        Return full feature space for single protein as Python dict

        The keys are exactly the entries of ``FEATURE_NAMES``,
        in the same order
        """

        features = {
            self.FEATURE_NAMES[0]: self._get_protein_length(),
            self.FEATURE_NAMES[1]: self._calculate_gravy(),
            self.FEATURE_NAMES[2]: self._calculate_molecular_weight(),
            self.FEATURE_NAMES[3]: self._calculate_aromaticity(),
            self.FEATURE_NAMES[4]: self._calculate_instability_index(),
            self.FEATURE_NAMES[5]: self._calculate_isoelectric_point(),
            self.FEATURE_NAMES[6]: self._calculate_flexibility(),
        }

        features.update(self._calculate_molar_extinction_coefficient())

        features.update(self._calculate_secondary_structure_fraction())

        return features
Example #16
0
def _hydrophilicity_class(value):
    """Map a windowed scale value onto the discrete score -2, -1, 0, 1 or 2."""
    # The original chain joined its range tests with ``or``; its second branch
    # therefore matched every value not above 0.5 and the -1 / -2 / 0 branches
    # could never run. The thresholds are applied here as a cascade.
    if value > 0.5:
        return 2
    if value > 0:
        return 1
    if value > -0.4:
        return -1
    if value < -0.4:
        return -2
    return 0  # exactly -0.4


def openfile():
    """Let the user pick a PDB file and print a per-window antigenicity index.

    The chosen file is parsed with ``PDBParser``; the sequence of the LAST
    peptide built by ``PPBuilder`` is analysed.  For every 10-residue window
    (the loop runs ``len(sequence) - 10`` times) the function prints the
    window, its flexibility and scale values and its secondary-structure
    fractions, converts them to discrete scores and prints the resulting
    antigenicity value.

    Results are published through the module-level globals ``my_seq``, ``m``,
    ``a``, ``b``, ``c``, ``tupleall`` and ``antigenicity``; the last three of
    the per-window ones keep the values of the final window.  ``b`` and ``c``
    are overwritten in place with the discrete scores.
    """
    global my_seq
    global antigenicity
    global m, a, c, b
    global tupleall
    from tkinter import filedialog
    root = Tk()
    root.filename = filedialog.askopenfilename(
        initialdir="/",
        title="Select file",
        filetypes=(("pdb files", "*.pdb"), ("pdb files", "*.pdb")))
    print(root.filename)
    structure_id = "1e6j"
    structure = PDBParser().get_structure(structure_id, root.filename)
    ppb = PPBuilder()
    # Each iteration overwrites my_seq, so only the last peptide is kept.
    for pp in ppb.build_peptides(structure):
        my_seq = pp.get_sequence()  # type: Seq
        print(my_seq)
    for model in structure:
        for chain in model:
            print(chain)
    sequence = list(my_seq)
    m = ''.join(sequence)  # type: str
    print(m)
    length = len(m)  # type: int
    print(length)
    print("Sequence consist of", len(m), "Amino Acids")
    from Bio.SeqUtils.ProtParam import ProteinAnalysis
    analysed_seq = ProteinAnalysis(m)
    print("Molecular weight = ", analysed_seq.molecular_weight())
    print("Amino Acid Count = ", analysed_seq.count_amino_acids())
    print("Secondary structure fraction =",
          analysed_seq.secondary_structure_fraction())
    # Per-residue scale passed to protein_scale below.
    kd = {
        'A': 1.8,
        'R': -4.5,
        'N': -3.5,
        'D': -3.5,
        'C': 2.5,
        'Q': -3.5,
        'E': -3.5,
        'G': -0.4,
        'H': -3.2,
        'I': 4.5,
        'L': 3.8,
        'K': -3.9,
        'M': 1.9,
        'F': 2.8,
        'P': -1.6,
        'S': -0.8,
        'T': -0.7,
        'W': -0.9,
        'Y': -1.3,
        'V': 4.2
    }
    c = list(analysed_seq.flexibility())
    b = list(analysed_seq.protein_scale(kd, 10, 1.0))

    window = 10
    # Same iteration count as the original ``while i <= length - 10`` loop.
    for start in range(length - window):
        fragment = m[start:start + window]
        print("Sequence is = ", fragment)
        print("Flexibility value = ", c[start])
        print("Hydrophilicity value = ", b[start])
        analyze_seq = ProteinAnalysis(fragment)
        # For Secondary structure Analysis
        print("Secondary structure fraction =",
              analyze_seq.secondary_structure_fraction())
        a = list(analyze_seq.secondary_structure_fraction())
        # Snapshot of the raw values, taken before they are discretised.
        tupleall = (fragment, c[start], b[start], a)
        print(tupleall[0], tupleall[2], tupleall[1], tupleall[3])
        # Secondary-structure score: 1 when the first fraction is at least
        # the second one, otherwise the second fraction itself.
        if a[0] >= a[1]:
            a[0] = 1
        else:
            a[0] = a[1]
        # For Hydrophilicity
        b[start] = _hydrophilicity_class(b[start])
        # For Flexibility
        c[start] = 1 if c[start] > 1.0 else 0
        # For antigenicity Index
        antigenicity = (0.3 * b[start] + 0.15 * 1 + 0.15 * c[start]
                        + 0.2 * a[0])
        print("antigenicity", antigenicity)
def protein_features(protein_sequences):
    """
    Compute basic composition and physico-chemical features for proteins.

    Input: list of protein sequences (as strings), length can also be 1
    Output: a dataframe with one row per sequence: the fraction of each of
        the 20 standard amino acids, followed by length, Biopython
        properties, residue-class fractions and secondary-structure fractions
    """

    import numpy as np
    import pandas as pd
    from Bio.SeqUtils.ProtParam import ProteinAnalysis

    # Residue classes whose combined fraction is reported, keyed by the
    # output column they feed.
    residue_groups = (
        ('frac_aliphatic', 'AGILPV'),
        ('frac_uncharged_polar', 'STNQ'),
        ('frac_polar', 'QNHSTYCMW'),
        ('frac_hydrophobic', 'AGILPVF'),
        ('frac_positive', 'HKR'),
        ('frac_sulfur', 'CM'),
        ('frac_negative', 'DE'),
        ('frac_amide', 'NQ'),
        ('frac_alcohol', 'ST'),
    )

    # One list per amino acid; key order defines the leading output columns.
    aa_columns = {letter: [] for letter in 'GAVLIFPSTYQNEDWHRKMC'}

    # Remaining columns, declared in the order they appear in the result.
    other_names = (
        ['protein_length', 'mol_weight', 'aromaticity', 'instability',
         'flexibility', 'pI']
        + [column for column, _ in residue_groups]
        + ['AA_frac_helix', 'AA_frac_turn', 'AA_frac_sheet']
    )
    other_columns = {column: [] for column in other_names}

    for sequence in protein_sequences:
        size = len(sequence)
        other_columns['protein_length'].append(size)

        # residue-class fractions
        for column, letters in residue_groups:
            hits = sum(sequence.count(letter) for letter in letters)
            other_columns[column].append(hits / size)

        # Biopython properties
        analysis = ProteinAnalysis(sequence)
        other_columns['mol_weight'].append(analysis.molecular_weight())
        other_columns['aromaticity'].append(analysis.aromaticity())
        other_columns['instability'].append(analysis.instability_index())
        other_columns['flexibility'].append(np.mean(analysis.flexibility()))
        other_columns['pI'].append(analysis.isoelectric_point())
        helix, turn, sheet = analysis.secondary_structure_fraction()
        other_columns['AA_frac_helix'].append(helix)
        other_columns['AA_frac_turn'].append(turn)
        other_columns['AA_frac_sheet'].append(sheet)

        # amino-acid frequency
        for letter, values in aa_columns.items():
            values.append(sequence.count(letter) / size)

    # assemble the dataframe: amino-acid columns first, then the rest
    features_protein = pd.DataFrame.from_dict(aa_columns)
    for column, values in other_columns.items():
        features_protein[column] = np.asarray(values)

    return features_protein
Example #18
0
def get_flexibility(seq):
    """
    Return the Biopython ``ProteinAnalysis`` flexibility profile of ``seq``.
    """
    return ProteinAnalysis(seq).flexibility()
Example #19
0
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio.SeqUtils import ProtParamData
from Bio import SeqIO
# Print the ProteinAnalysis properties of every record in the sample FASTA
# file, one property per line, in a fixed order.
with open('../../samples/pdbaa') as fh:
    for rec in SeqIO.parse(fh, 'fasta'):
        myprot = ProteinAnalysis(str(rec.seq))
        for report in (
                myprot.count_amino_acids,
                myprot.get_amino_acids_percent,
                myprot.molecular_weight,
                myprot.aromaticity,
                myprot.instability_index,
                myprot.flexibility,
                myprot.isoelectric_point,
                myprot.secondary_structure_fraction,
        ):
            print(report())
        # Sliding-window profile over the ProtParamData.kd scale
        # (window 9, edge weight 0.4).
        print(myprot.protein_scale(ProtParamData.kd, 9, .4))
 def transform(self, X):
     """Return an ``(len(X), 1)`` array with one flexibility value per sequence.

     ``ProteinAnalysis.flexibility()`` returns a list of values, which cannot
     be stored in a single array cell (the original assignment raised
     ``ValueError`` for it).  The profile is therefore reduced to its mean.
     A sequence that yields an empty profile gets 0.0.
     """
     vec = np.zeros((len(X), 1))
     for i, seq in enumerate(X):
         profile = ProteinAnalysis(str(seq)).flexibility()
         vec[i, 0] = np.mean(profile) if len(profile) else 0.0
     return vec