Ejemplo n.º 1
0
def protParam(seq):
    """Print a ProtParam-style report for a protein sequence.

    A ``ProteinAnalysis`` object is built from ``seq`` and the following are
    printed in order: amino acid counts, amino acid percentages, molecular
    weight, GRAVY, isoelectric point, aromaticity and an extinction
    coefficient. Nothing is returned.

    Args:
        seq: protein sequence, passed unchanged to ``ProteinAnalysis``.
    """
    params = ProteinAnalysis(seq)
    mw = params.molecular_weight()
    c_aa = params.count_amino_acids()
    p_aa = params.get_amino_acids_percent()
    gravy = params.gravy()
    aromaticity = params.aromaticity()
    isoelectric_point = params.isoelectric_point()
    # Weighted sum of the Trp, Tyr and Cys counts.
    # NOTE(review): the message below says "Assuming reduced" while Cys still
    # contributes to the sum -- confirm that this is intended.
    ext_coeff = sum([c_aa["W"]*5690, c_aa["Y"]*1280, c_aa["C"]*120])

    print("Amino acid count")
    pprint.pprint(c_aa)
    print("Amino acid percent")
    pprint.pprint(p_aa)
    print("Molecular weight")
    print("%f Da"%mw)
    print("Gravy")
    print(gravy)
    print("Isoelectric point")
    print(isoelectric_point)
    print("Aromaticity")
    print(aromaticity)
    print("Extinction coefficient: %d M-1cm-1 (Assuming reduced)"%ext_coeff)
    print("")
Ejemplo n.º 2
0
def sequence_vector(temp_window: str, window: int = 6, chemical=1):
    """Encode a sequence window as integer codes plus optional scores.

    The input is passed through ``clean`` and then ``windower`` (called with
    ``position=int(len/2)`` and ``wing_size=window``). Every residue of the
    result is mapped to an integer code and the list is right-padded with
    zeros up to ``2 * window + 1`` entries. Code 0 stands for non-standard
    residues and for padding at the heads/tails of sequences.

    Args:
        temp_window: sequence to encode.
        window: wing size given to ``windower``; also fixes the padded length.
        chemical: when equal to 1, GRAVY, instability index and aromaticity
            of the windowed sequence are appended after the codes.

    Returns:
        list: the integer codes, followed by the three scores when
        ``chemical == 1``.
    """
    temp_window = clean(temp_window)
    temp_window = windower(sequence=temp_window, position=int(len(temp_window)*.5), wing_size=window)

    aa = {"G": 1, "A": 2, "L": 3, "M": 4, "F": 5, "W": 6, "K": 7, "Q": 8, "E": 9, "S": 10, "P": 11, "V": 12, "I": 13,
          "C": 14, "Y": 15, "H": 16, "R": 17, "N": 18, "D": 19, "T": 20, "X": 0}

    # Letters missing from the table encode as 0, as documented, instead of
    # raising KeyError.
    vec = [aa.get(residue, 0) for residue in temp_window]

    # Right-pad to the fixed length; a non-positive difference adds nothing.
    target_len = (window * 2) + 1
    vec.extend([0] * (target_len - len(vec)))

    # Hydrophobicity is optional
    if chemical == 1:
        s = ProteinAnalysis(temp_window)
        vec.append(s.gravy())
        vec.append(s.instability_index())
        vec.append(s.aromaticity())

    return vec
Ejemplo n.º 3
0
def find_gravy_stats(folders, outfile, condition, regex = None, frequency = False):
    """Write per-folder mean CDR3 GRAVY values and their overall mean/std.

    For every entry of ``folders`` the file ``<entry[0]>/5_AA-sequences.txt``
    is read as a tab-separated table. Rows whose ``Functionality`` column is
    ``'productive'`` and whose ``CDR3-IMGT`` value satisfies ``condition``
    contribute their GRAVY score, weighted by a frequency. Rows that raise
    while being processed are skipped, and folders without any usable row
    contribute no mean.

    Two files are written: ``<outfile>_means.txt`` with one mean per line and
    ``<outfile>.txt`` with the mean and standard deviation of those means.

    Args:
        folders: iterable whose items carry the folder path at index 0.
        outfile: path prefix of the two output files.
        condition: callable applied to the ``CDR3-IMGT`` string.
        regex: pattern matched against ``Sequence ID``; its first group is
            read as the integer frequency. Required when ``frequency`` is
            true.
        frequency: weight rows by the frequency parsed with ``regex``
            instead of counting every row once.

    Raises:
        ValueError: if ``frequency`` is true but no ``regex`` was given.
    """
    if frequency and regex is None:
        raise ValueError('regex is required when frequency is True')
    # Compiled once here instead of once per row.
    pat = re.compile(regex) if frequency else None

    mean_list = []
    for folder in folders:
        with open(folder[0] + '/5_AA-sequences.txt') as f:
            gravy_all = 0
            total_seqs = 0
            reader = csv.DictReader(f, delimiter = '\t')
            for row in reader:
                try:
                    if row['Functionality'] == 'productive' and condition(row['CDR3-IMGT']):
                        gravy = Prot(row['CDR3-IMGT']).gravy()
                        if frequency:
                            freq = int(pat.match(row['Sequence ID']).group(1))
                        else:
                            freq = 1
                        total_seqs += freq
                        gravy_all += gravy * freq
                except Exception:
                    # Best effort: a malformed row is skipped, but Ctrl-C and
                    # SystemExit are no longer swallowed.
                    continue
            # An explicit check replaces the swallowed ZeroDivisionError.
            if total_seqs:
                mean_list.append(gravy_all/float(total_seqs))
                print(mean_list)
    with open(outfile + '_means.txt', 'w') as out:
        for item in mean_list:
            out.write(str(item) +'\n')
    with open(outfile + '.txt', 'w') as out:
        out.write('mean CDR3 gravy,standard deviation\n')
        out.write(str(np.mean(mean_list)) + ',' + str(np.std(mean_list)))
Ejemplo n.º 4
0
    def _protein_parameters(self, sequence):
        """Build the physicochemical feature vector of an amino acid sequence.

        Args:
            sequence: str, amino acid sequence.

        Returns:
            np array holding, in order: molecular weight, aromaticity,
            instability index, GRAVY, isoelectric point, the first three
            secondary-structure fractions, the first two molar extinction
            coefficients and the result of ``self._net_charge(sequence)``.

        """
        pa = ProteinAnalysis(sequence)

        # Scalar descriptors, in output order.
        features = [
            pa.molecular_weight(),
            pa.aromaticity(),
            pa.instability_index(),
            pa.gravy(),
            pa.isoelectric_point(),
        ]

        fractions = pa.secondary_structure_fraction()
        features.extend([fractions[0], fractions[1], fractions[2]])

        extinction = pa.molar_extinction_coefficient()
        features.extend([extinction[0], extinction[1]])

        features.append(self._net_charge(sequence))

        return np.array(features)
Ejemplo n.º 5
0
def protein_properties(seq):
    """Return a ``ProtProp`` record with biochemical properties of ``seq``.

    seq is a Bio.Seq.Seq or str representing protein sequence

    The instability index and the GRAVY score are stored as ``None`` when
    their computation raises ``KeyError``.
    """
    def _none_on_key_error(compute):
        # Run one analysis method, mapping KeyError to None.
        try:
            return compute()
        except KeyError:
            return None

    analysis = ProteinAnalysis(seq)

    counts = analysis.count_amino_acids()
    aromatic = analysis.aromaticity()
    pi = analysis.isoelectric_point()
    instability = _none_on_key_error(analysis.instability_index)
    hydropathy = _none_on_key_error(analysis.gravy)

    return ProtProp(
        aa=str(seq),
        gravy=hydropathy,
        aromaticity=aromatic,
        isoelectric_point=pi,
        instability=instability,
        aa_counts=counts,
    )
def calculate_physiochemical_features(temp_dict, sequence):
    """Add physicochemical descriptors of ``sequence`` to ``temp_dict``.

    ``temp_dict`` is updated in place, first with ten named descriptors and
    then with the per-residue percentages returned by
    ``get_amino_acids_percent()``. Nothing is returned.

    Args:
        temp_dict: dictionary that receives the new entries.
        sequence: protein sequence handed to ``ProteinAnalysis``.
    """
    pa = ProteinAnalysis(sequence)

    net_charge = pa.charge_at_pH(7)
    instability = pa.instability_index()
    weight = pa.molecular_weight()
    aromatic = pa.aromaticity()
    # The two extinction coefficients are collapsed into their average.
    ext_low, ext_high = pa.molar_extinction_coefficient()
    mean_extinction = (float(ext_low) + float(ext_high)) / 2
    # Grand average of hydropathy - higher value = more hydrophobic
    hydropathy = pa.gravy()
    pi = pa.isoelectric_point()
    helix, turn, sheet = pa.secondary_structure_fraction()

    temp_dict.update({
        "Charge at pH7": net_charge,
        "Instability Index": instability,
        "Molecular Wt": weight,
        "Aromaticity": aromatic,
        "Molar Extinction Coeff": mean_extinction,
        "Gravy": hydropathy,
        "Isoelectric pt": pi,
        "Helix Fraction": helix,
        "Turn Fraction": turn,
        "Sheet Fraction": sheet,
    })

    # get_amino_acids_percent() already yields a dictionary, so it is merged
    # in a separate step.
    temp_dict.update(pa.get_amino_acids_percent())
Ejemplo n.º 7
0
    def protAnalysis(self, content):
        """Compute protein descriptors and two flexibility-augmented variants.

        ``content`` is normalized with ``Parsers.normalizeSequence`` (using
        ``self.sourceType``) before being analysed.

        Args:
            content: raw sequence content.

        Returns:
            tuple: ``(result, flattened, flattenedFlexDic)`` where ``result``
            holds the scalar descriptors plus the flattened secondary
            structure fractions, ``flattened`` adds the flexibility values
            flattened under the ``'proteinFlex'`` prefix, and
            ``flattenedFlexDic`` adds the flexibility values flattened
            against the residues of the normalized sequence. On key
            collisions the entries of ``result`` win.
        """
        content = Parsers.normalizeSequence(content, self.sourceType)
        protein = ProteinAnalysis(content)

        result = {
            'proteinMWeight': protein.molecular_weight(),
            'proteinAroma': protein.aromaticity(),
            'proteinInstab': protein.instability_index(),
            'proteinIsoelec': protein.isoelectric_point(),
            'proteinGravy': protein.gravy(),
        }

        # merge result and protein structure
        proteinStructure = protein.secondary_structure_fraction()
        protStruct = self.flatten('proteinSecstruc', proteinStructure)
        result = {**protStruct, **result}

        flexibility = protein.flexibility()
        flexibFlat = self.flatten('proteinFlex', flexibility)
        flexibAmino = self.flatten(list(content), flexibility)

        flattened = {**flexibFlat, **result}
        flattenedFlexDic = {**flexibAmino, **result}

        return result, flattened, flattenedFlexDic
Ejemplo n.º 8
0
def physchem_props(data):
    """Append physicochemical properties to tab-separated records.

    The second-to-last field of every line (the sequon) is analysed with
    ``ProteinAnalysis`` and five values (molecular weight, GRAVY,
    aromaticity, instability index, isoelectric point) are appended to the
    line. Lines whose sequon is empty or contains ``X`` or ``*`` are dropped,
    as are lines with fewer than two fields.

    Args:
        data: iterable of tab-separated text lines.

    Returns:
        list: the header line followed by one extended line per kept record.

    The process exits with status 1, after printing the offending record, if
    the analysis of a sequon fails.
    """
    header = "ID\tclass\tindex\tsequon\tsequence\tmol_weight\tgravy\taromaticity\tinstab_index\tiso_point\n"
    new_table = [header]
    for line in data:
        stripped = line.rstrip()
        split_line = stripped.split('\t')
        if len(split_line) < 2:
            continue  # Blank or single-field line: no sequon column to read
        seq = split_line[-2]  # Sequon, not sequence
        if "X" in seq or '*' in seq or seq == '':
            continue  # Skip non-usable sequences, only negs
        try:
            a_seq = ProteinAnalysis(seq)
            results = [
                a_seq.molecular_weight(),
                a_seq.gravy(),
                a_seq.aromaticity(),
                a_seq.instability_index(),
                a_seq.isoelectric_point(),
            ]
        except Exception:
            # Exception rather than a bare except, so KeyboardInterrupt and
            # SystemExit are not intercepted here.
            print(split_line)
            sys.exit(1)
        new_table.append(stripped + "\t{}\t{}\t{}\t{}\t{}\n".format(*results))
    return new_table
Ejemplo n.º 9
0
def protParam(seq):
    """Print a ProtParam-style report for a protein sequence.

    A ``ProteinAnalysis`` object is built from ``seq`` and the following are
    printed in order: amino acid counts, amino acid percentages, molecular
    weight, GRAVY, isoelectric point, aromaticity and an extinction
    coefficient. Nothing is returned.

    Args:
        seq: protein sequence, passed unchanged to ``ProteinAnalysis``.
    """
    params = ProteinAnalysis(seq)
    mw = params.molecular_weight()
    c_aa = params.count_amino_acids()
    p_aa = params.get_amino_acids_percent()
    gravy = params.gravy()
    aromaticity = params.aromaticity()
    isoelectric_point = params.isoelectric_point()
    # Weighted sum of the Trp, Tyr and Cys counts.
    # NOTE(review): the message below says "Assuming reduced" while Cys still
    # contributes to the sum -- confirm that this is intended.
    ext_coeff = sum([c_aa["W"] * 5690, c_aa["Y"] * 1280, c_aa["C"] * 120])

    print("Amino acid count")
    pprint.pprint(c_aa)
    print("Amino acid percent")
    pprint.pprint(p_aa)
    print("Molecular weight")
    print("%f Da" % mw)
    print("Gravy")
    print(gravy)
    print("Isoelectric point")
    print(isoelectric_point)
    print("Aromaticity")
    print(aromaticity)
    print("Extinction coefficient: %d M-1cm-1 (Assuming reduced)" % ext_coeff)
    print("")
Ejemplo n.º 10
0
 def get_gravy_list(self):
     """Return the normalized GRAVY scores of the entries of ``self.df.index``.

     Each index entry is analysed with ``ProteinAnalysis``; its GRAVY score
     is formatted with six decimals, the formatted values are collected into
     a NumPy array, and that array is passed to ``self.normalize``.
     """
     # NOTE(review): the array holds formatted strings, not floats -- confirm
     # that ``self.normalize`` expects this.
     formatted = [
         "{:.6f}".format(ProteinAnalysis(entry).gravy())
         for entry in self.df.index
     ]
     return self.normalize(np.array(formatted))
def get_protein_analysis(aa):
    """Return a flat list of descriptors for the amino acid sequence ``aa``.

    The list holds molecular weight, aromaticity, instability index,
    isoelectric point and GRAVY, followed by the values returned by
    ``secondary_structure_fraction()``.
    """
    pa = ProteinAnalysis(aa)
    descriptors = [
        pa.molecular_weight(),
        pa.aromaticity(),
        pa.instability_index(),
        pa.isoelectric_point(),
        pa.gravy(),
    ]
    descriptors.extend(pa.secondary_structure_fraction())
    return descriptors
Ejemplo n.º 12
0
    def get_gravy(self):
        """
        Compute the GRAVY score of ``self.ProteinSequence`` with biopython.

        :return: dictionary with the single key ``'Gravy'``
        """
        analysis = ProteinAnalysis(self.ProteinSequence)
        return {'Gravy': analysis.gravy()}
Ejemplo n.º 13
0
def biopython_protein_analysis(inseq):
    """Utilize Biopython's ProteinAnalysis module to return general sequence properties of an amino acid string.

    For full definitions see: http://biopython.org/DIST/docs/api/Bio.SeqUtils.ProtParam.ProteinAnalysis-class.html

    Args:
        inseq: Amino acid sequence

    Returns:
        dict: Dictionary of sequence properties, every key suffixed with
        ``-biop``. Some definitions include:
        instability_index: Any value above 40 means the protein is unstable (has a short half life).
        percent_helix_naive / percent_turn_naive / percent_strand_naive:
        the three values of ``secondary_structure_fraction()``, in that order.

    TODO:
        Finish definitions of dictionary

    """

    inseq = ssbio.protein.sequence.utils.cast_to_str(inseq)

    analysed_seq = ProteinAnalysis(inseq)

    info_dict = {}
    info_dict['amino_acids_content-biop'] = analysed_seq.count_amino_acids()
    info_dict['amino_acids_percent-biop'] = analysed_seq.get_amino_acids_percent()
    info_dict['length-biop'] = analysed_seq.length
    info_dict['monoisotopic-biop'] = analysed_seq.monoisotopic
    info_dict['molecular_weight-biop'] = analysed_seq.molecular_weight()
    info_dict['aromaticity-biop'] = analysed_seq.aromaticity()
    info_dict['instability_index-biop'] = analysed_seq.instability_index()
    # TODO: What is flexibility?
    info_dict['flexibility-biop'] = analysed_seq.flexibility()
    info_dict['isoelectric_point-biop'] = analysed_seq.isoelectric_point()

    # grand average of hydrophobicity
    info_dict['gravy-biop'] = analysed_seq.gravy()

    # The secondary structure fractions are computed once and then split
    # into one entry per structure type.
    fractions = analysed_seq.secondary_structure_fraction()
    info_dict['percent_helix_naive-biop'] = fractions[0]
    info_dict['percent_turn_naive-biop'] = fractions[1]
    info_dict['percent_strand_naive-biop'] = fractions[2]

    return info_dict
def find_composition(df_original):
    """Return a copy of ``df_original`` extended with composition features.

    For every value of the ``'seq'`` column the following are computed:
    residue percentages over the whole sequence, over its first ``first_n``
    residues and over its last ``last_n`` residues, plus length, molecular
    weight, GRAVY, mean and standard deviation of the flexibility profile,
    the secondary structure fractions, the isoelectric point and the
    aromaticity. ``codes``, ``first_n`` and ``last_n`` are read from the
    enclosing module.

    Args:
        df_original: DataFrame with a ``'seq'`` column; it is not modified.

    Returns:
        DataFrame: the copy and the feature table concatenated column-wise.
    """
    df_copy = df_original.copy()

    column_names = []
    for ch in codes:
        column_names.extend(
            [ch + '_percent', ch + '_percent_first', ch + '_percent_last'])
    column_names.extend([
        'len', 'weight', 'gravy', 'flex_mean', 'flex_std',
        'ss_helix', 'ss_turn', 'ss_sheet', 'iep', 'aromaticity',
    ])

    # Rows are collected first and the frame is built once: DataFrame.append
    # no longer exists in current pandas and copied the frame on every call.
    rows = []
    for seq in tqdm(df_copy['seq']):
        sequence = str(seq)
        analysed = ProteinAnalysis(sequence)
        analysed_first = ProteinAnalysis(sequence[:first_n])
        analysed_last = ProteinAnalysis(sequence[-last_n:])

        row = {'len': analysed.length}
        row['ss_helix'], row['ss_turn'], row['ss_sheet'] = analysed.secondary_structure_fraction()
        row['iep'] = analysed.isoelectric_point()

        # overall
        for aa, percent in analysed.get_amino_acids_percent().items():
            row[aa + '_percent'] = percent

        # first N
        for aa, percent in analysed_first.get_amino_acids_percent().items():
            row[aa + '_percent_first'] = percent

        # last N
        for aa, percent in analysed_last.get_amino_acids_percent().items():
            row[aa + '_percent_last'] = percent

        row['weight'] = analysed.molecular_weight()
        row['gravy'] = analysed.gravy()
        row['aromaticity'] = analysed.aromaticity()
        flexibility = analysed.flexibility()
        row['flex_mean'] = np.mean(flexibility)
        row['flex_std'] = np.std(flexibility)
        rows.append(row)

    df = pd.DataFrame(rows)
    # Declared columns come first; any column outside that list is kept and
    # placed after them, in order of first appearance.
    extra = [name for name in df.columns if name not in column_names]
    df = df.reindex(columns=column_names + extra)

    # NOTE(review): the feature table is numbered 0..n-1, so the concat
    # aligns row-by-row only when ``df_original`` uses the same index.
    return pd.concat([df_copy, df], axis=1)
def phyChemProps(seq):
    """Return ten physicochemical descriptors of a protein sequence.

    Args:
        seq: protein sequence handed to ``ProteinAnalysis``.

    Returns:
        list: aromaticity, the first three secondary-structure fractions,
        GRAVY, instability index, isoelectric point, molecular weight and
        the first two molar extinction coefficients, in that order.
    """
    X = ProteinAnalysis(seq)
    aromaticity = X.aromaticity()
    # Each multi-valued result is computed once and then indexed.
    fractions = X.secondary_structure_fraction()
    gravy = X.gravy()
    instability = X.instability_index()
    isoelectric = X.isoelectric_point()
    weight = X.molecular_weight()
    extinction = X.molar_extinction_coefficient()
    return [
        aromaticity,
        fractions[0],
        fractions[1],
        fractions[2],
        gravy,
        instability,
        isoelectric,
        weight,
        extinction[0],
        extinction[1],
    ]
Ejemplo n.º 16
0
    def __init__(self, sequence):
        """Compute and store descriptors of a peptide sequence.

        Values come from Biopython's ``ProteinAnalysis``, from helper
        functions defined elsewhere in the project and from the R package
        ``Peptides`` (called through ``r``). Every result is stored as an
        instance attribute.

        Args:
            sequence: amino acid sequence. It is pasted into an R string
                literal, so it must not contain double quotes.
        """
        self.sequence = sequence
        self.sequence_length = len(sequence)
        analysis = ProteinAnalysis(sequence)

        self.amino_acid_percents = analysis.get_amino_acids_percent()
        self.amino_acids_composition = calculate_amino_acids_composition(sequence)
        self.aromaticity = analysis.aromaticity()
        self.instability = analysis.instability_index()
        self.flexibility = calculate_flexibility(sequence)
        protein_scale_parameters = [{'name': 'Hydrophilicity', 'dictionary': hw},
                                    {'name': 'Surface accessibility', 'dictionary': em},
                                    {'name': 'Janin Interior to surface transfer energy scale', 'dictionary': ja},
                                    {'name': 'Bulkiness', 'dictionary': bulkiness},
                                    {'name': 'Polarity', 'dictionary': polarity},
                                    {'name': 'Buried residues', 'dictionary': buried_residues},
                                    {'name': 'Average area buried', 'dictionary': average_area_buried},
                                    {'name': 'Retention time', 'dictionary': retention_time}]
        self.protein_scales = calculate_protein_scales(analysis, protein_scale_parameters)
        self.isoelectric_point = analysis.isoelectric_point()
        self.secondary_structure_fraction = calculate_secondary_structure_fraction(analysis)
        self.molecular_weight = analysis.molecular_weight()
        self.kyte_plot = analysis.gravy()
        self.pefing = calculate_pefing(sequence)

        # next parameters are calculated using R.Peptides
        r('require(Peptides)')
        r('sequence = "{0}"'.format(sequence))
        self.aliphatic_index = r('aindex(sequence)')[0]
        self.boman_index = r('boman(sequence)')[0]
        self.charges = calculate_charges(sequence, 1.0, 14.0, 0.5, 'Lehninger')
        # Previously r('seq(sequence)'): seq() is base R's sequence
        # generator, not a Peptides descriptor; hydrophobicity() is the
        # Peptides function matching this attribute.
        self.hydrophobicity = r('hydrophobicity(sequence)')[0]
        angles = [{'name': 'Alpha-helix', 'angle': -47},
                  {'name': '3-10-helix', 'angle': -26},
                  {'name': 'Pi-helix', 'angle': -80},
                  {'name': 'Omega', 'angle': 180},
                  {'name': 'Antiparallel beta-sheet', 'angle': 135},
                  {'name': 'Parallel beta-sheet', 'angle': 113}]
        # The polygly-polypro angle is only considered when Pro and Gly
        # together exceed the 0.3 threshold.
        if self.amino_acid_percents['P'] + self.amino_acid_percents['G'] > 0.3:
            angles.append({'name': 'Polygly-polypro helix', 'angle': 153})
        self.hydrophobic_moments = calculate_hydrophobic_moments(sequence, angles)
        self.kidera_factors = calculate_kidera_factors(sequence)
        self.peptide_types = calculate_peptide_types(sequence, angles)
def biochemical_properties(sequence: str) -> Dict[str, Any]:
    """Collect biochemical descriptors of ``sequence`` in a dictionary.

    The keys are 'Isoelectric point', 'Molecular weight', 'Aromaticity',
    'Instability index', 'GRAVY', 'H-bonding percent', 'Melting temp' and
    'LCC'.
    """
    analysis_object = ProteinAnalysis(sequence)
    # These two objects are still constructed although nothing below reads
    # them.
    PyPro.GetProDes(sequence)
    Seq(sequence)
    # TODO(Ahmed): Verify that all these calculations are actually returning reasonable values
    # For example, it says the percent composition of every amino acid is zero when I run
    # calculate_biochem_properties.biochemical_properties('qwertyipasdfghklcvnm')
    properties = {}
    properties['Isoelectric point'] = analysis_object.isoelectric_point()
    # Daltons? Amu? g/mol?
    properties['Molecular weight'] = analysis_object.molecular_weight()
    properties['Aromaticity'] = analysis_object.aromaticity()
    properties['Instability index'] = analysis_object.instability_index()
    properties['GRAVY'] = analysis_object.gravy()
    properties['H-bonding percent'] = h_bonding_percent(sequence)
    properties['Melting temp'] = melting_temp(sequence)
    properties['LCC'] = lcc.lcc_simp(sequence)
    return properties
Ejemplo n.º 18
0
 def amino_acid_analysis(self):
     """
     Adds fraction of amino acid residues (defined in RESIDUES) to data frame.

     Also adds a ``length`` column and, row by row, ``IEP`` (isoelectric
     point), ``molecular_weight`` (only when the sequence has no X or B) and
     ``gravy`` (only when the sequence has no U, X or B).
     """
     sequences = self.df["sequence"]
     lengths = sequences.str.len()
     for res in RESIDUES:
         self.df["fraction_" + res] = sequences.str.count(res) / lengths
     self.df["length"] = lengths
     # Only the sequence column is needed, so it is iterated directly
     # instead of materialising every full row.
     for index, seq in tqdm(sequences.items(), total=self.df.shape[0]):
         seqanalysis = ProteinAnalysis(seq)
         self.df.loc[index, "IEP"] = seqanalysis.isoelectric_point()
         if "X" not in seq and "B" not in seq:
             self.df.loc[index, "molecular_weight"] = seqanalysis.molecular_weight()
         if "U" not in seq and "X" not in seq and "B" not in seq:
             self.df.loc[index, "gravy"] = seqanalysis.gravy()
def get_features(seq):
    """Compute global features of a protein sequence.

    Counts, entropy and hydrophobic/polar statistics are taken from the raw
    sequence. The characters X, B, Z, ', O and U are then removed and the
    remaining sequence is analysed with ``ProteinAnalysis``.

    Parameters
    ----------
    seq : str
        protein sequence

    Return
    ----------
    dictionary:
        global features of the protein sequence, including one entry per
        key of ``count_amino_acids()``

    """
    undefined = ('X', 'B', 'Z', "'", 'O', 'U')

    features = {}
    features['undefined_count'] = sum(1 for x in seq if x in undefined)
    features['length'] = len(seq)
    features['perc_undefined_count'] = features['undefined_count']/features['length']
    features['entropy'] = entropy(seq)
    features['ideal_entropy'] = entropy_ideal(len(seq))
    features['perc_entropy'] = features['entropy']/features['ideal_entropy']
    features['hydr_count'] = len([x for x in seq if x in hydrophobic_proteins])
    features['polar_count'] = len([x for x in seq if x in polar_proteins])
    features['buried'] = sum(buried[x] for x in seq if x in hydrophobic_proteins)

    # Only the characters outside ``undefined`` reach ProteinAnalysis.
    cleaned = ''.join(x for x in seq if x not in undefined)
    protein = ProteinAnalysis(cleaned)

    features['gravy'] = protein.gravy()
    features['molecular_weight'] = protein.molecular_weight()
    features['aromaticity'] = protein.aromaticity()
    features['instability_index'] = protein.instability_index()
    features['isoelectric_point'] = protein.isoelectric_point()
    helix, turn, sheet = protein.secondary_structure_fraction()
    features['helix'] = helix
    features['turn'] = turn
    features['sheet'] = sheet

    features.update(protein.count_amino_acids())
    return features
Ejemplo n.º 20
0
def GRAvy_ARomo(seq, genetic_code_=1, G=False, A=False):
    """Calculate Gravy or Aroma for a DNA sequence.

    The sequence is translated with the requested genetic code, stop symbols
    (``*``) are removed and the resulting protein is analysed. Only the
    requested value is computed.

    Args:
        seq (str): DNA sequence
        genetic_code_ (int): default = 1, The Genetic Codes number described by NCBI (https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi)
        G (bool): default = False
        A (bool): default = False

    Returns:
        - Gravy value if arg(G) is True

        - Aroma value if arg(A) is True (and G is not)

        - None if both args are False

    """
    from Bio.SeqUtils.ProtParam import ProteinAnalysis
    from Bio.Seq import Seq

    try:
        seq = Seq(seq)
    except Exception:
        # Best effort: keep the input as given when it cannot be wrapped in
        # a Seq. Exception (not a bare except) lets Ctrl-C through.
        pass

    translated = str(seq.translate(table=genetic_code_))
    protein = ProteinAnalysis(translated.replace("*", ""))

    # Each metric is computed only when asked for, so a failure in the other
    # one can no longer abort the call.
    if G and G == True:
        return protein.gravy()
    elif A and A == True:
        return protein.aromaticity()
Ejemplo n.º 21
0
    def seqs_to_features(self, seqs, no_seqs):
        """Turn sequences into a ``(no_seqs, 32)`` feature matrix.

        ``seqs`` is an iterable of iterables of sequences; they are chained
        and each sequence fills one row. The first 23 columns hold the
        relative frequency of each letter of the alphabet below; the last
        nine columns hold, from left to right: GRAVY, length, the third,
        second and first secondary-structure fractions, isoelectric point,
        instability index, aromaticity and molecular weight. For the
        ProteinAnalysis-based values X, B and U are replaced by A.
        """
        alphabet = 'ABCDEFGHIKLMNPQRSTUVWXY'  # no JOZ
        X = np.zeros((no_seqs, 32))
        for row, s in enumerate(chain(*seqs)):  # iterate over all sequences
            # relative letter frequencies
            X[row, :len(alphabet)] = [s.count(letter) / len(s) for letter in alphabet]

            # other analysis
            substituted = s.replace('X', 'A').replace('B', 'A').replace('U', 'A')
            analysis = ProteinAnalysis(substituted)
            weight = analysis.molecular_weight()
            aromatic = analysis.aromaticity()
            instability = analysis.instability_index()
            isoelectric = analysis.isoelectric_point()
            fractions = analysis.secondary_structure_fraction()
            length = len(s)
            hydropathy = analysis.gravy()  # mean hydrophobicity

            # Columns -9 .. -1, written in one assignment.
            X[row, -9:] = [
                hydropathy,
                length,
                fractions[2],
                fractions[1],
                fractions[0],
                isoelectric,
                instability,
                aromatic,
                weight,
            ]
        return X
Ejemplo n.º 22
0
def physchem_props(ara_d):
    """Calculate the physicochemical properties per protein in ara_d.

    Every entry of ``ara_d`` is a dictionary with a ``"sequence"`` and a
    ``"pos"`` key. Entries whose sequence contains ``X`` or ``*`` are left
    untouched; for the ``*`` case the protein key is printed when ``"pos"``
    is not an empty list. All other entries receive a ``"Properties"``
    dictionary.

    Args:
        ara_d: dictionary keyed by protein; modified in place.

    Returns:
        dict: the same ``ara_d`` object.
    """
    for protein in ara_d:
        entry = ara_d[protein]
        seq = entry["sequence"]
        if "X" in seq:
            continue  # Skip non-usable sequences, only negs
        if '*' in seq:
            if entry["pos"] != []:
                print(protein)
            continue
        a_seq = ProteinAnalysis(seq)
        # Attached only after every value has been computed.
        entry["Properties"] = {
            "mol_weight": a_seq.molecular_weight(),
            "gravy": a_seq.gravy(),
            "aromaticity": a_seq.aromaticity(),
            "instab_index": a_seq.instability_index(),
            "flexi": a_seq.flexibility(),
            "iso_point": a_seq.isoelectric_point(),
            "seq_struct": a_seq.secondary_structure_fraction(),
        }
    return ara_d
Ejemplo n.º 23
0
def add_protein_characteristics(df):
    """Return a copy of ``df`` with composition and property columns added.

    Before analysis the letters B, Z, J, X, U and O of every value in the
    ``'sequence'`` column are replaced by D, E, L, G, C and K respectively.
    One column per amino acid (its percentage) is added, followed by
    aromaticity, helix, turn, sheet, isoelectric_point and gravy.
    """
    df = df.copy()
    aa_list = [
        'A', 'C', 'E', 'D', 'G', 'F', 'I', 'H', 'K', 'M', 'L', 'N', 'Q', 'P',
        'S', 'R', 'T', 'W', 'V', 'Y'
    ]
    property_names = [
        'aromaticity', 'helix', 'turn', 'sheet', 'isoelectric_point', 'gravy'
    ]
    residue_columns = {aa: [] for aa in aa_list}
    property_columns = {name: [] for name in property_names}

    # One-pass substitution table for the letters listed in the docstring.
    substitutions = str.maketrans('BZJXUO', 'DELGCK')

    for raw in df['sequence']:
        pa = ProteinAnalysis(raw.translate(substitutions))
        property_columns['aromaticity'].append(pa.aromaticity())
        property_columns['isoelectric_point'].append(pa.isoelectric_point())
        property_columns['gravy'].append(pa.gravy())
        fractions = pa.secondary_structure_fraction()
        for fraction, ss in zip(fractions, ['helix', 'turn', 'sheet']):
            property_columns[ss].append(fraction)
        for residue, percent in pa.get_amino_acids_percent().items():
            residue_columns[residue].append(percent)

    # Residue columns first, then the property columns.
    for columns in (residue_columns, property_columns):
        for name, values in columns.items():
            df[name] = values
    return df
Ejemplo n.º 24
0
def biopython_proteinanalysis_seq(seq, scaling=False):
    """Return a dictionary of ProteinAnalysis descriptors for ``seq``.

    The dictionary holds min/max/std of the flexibility profile, GRAVY,
    instability index, isoelectric point, the two molar extinction
    coefficients, molecular weight, the three secondary-structure
    fractions, one ``percent:<aa>`` entry per amino acid and one
    ``prop_res_<key>`` entry per item of ``property_residues`` (the summed
    percentage of that item's residues).

    Args:
        seq: protein sequence handed to ``ProteinAnalysis``.
        scaling: accepted but not read by this function.
    """
    res = ProteinAnalysis(seq)
    flex = np.array(res.flexibility())

    d = {
        'flex:min': flex.min(),
        'flex:max': flex.max(),
        'flex:std': flex.std(),
        'gravy': res.gravy(),
        'instability_index': res.instability_index(),
        'isoelectric_point': res.isoelectric_point(),
    }

    first_coeff, second_coeff = res.molar_extinction_coefficient()
    d['molar_extinction_coefficient_reduced'] = first_coeff
    d['molar_extinction_coefficient_cysteines'] = second_coeff
    d['molecular_weight'] = res.molecular_weight()

    helix, turn, strand = res.secondary_structure_fraction()
    d['percent_helix_naive'] = helix
    d['percent_turn_naive'] = turn
    d['percent_strand_naive'] = strand

    aap = res.get_amino_acids_percent()
    for aa in sorted(aap):
        d['percent:%s' % aa] = aap[aa]
    for key, value in property_residues.items():
        d['prop_res_%s' % key] = sum(aap.get(x, 0) for x in value)
    return d
# Per-sequence feature lists filled by the loop below.
gravy = []
molweight = []
instidx = []
flex = []

for seq in sequences:
    X = ProteinAnalysis(str(seq))
    isoelectricPt.append(X.isoelectric_point())
    aromaticity.append(X.aromaticity())
    aminoPercent.append(X.get_amino_acids_percent())
    secstruct.append(X.secondary_structure_fraction())

    # These features throw Key & Value Errors due to non standard amino acids
    # (i.e. out of the 20 standard ones) e.g. X, U etc
    # All seven values are computed before anything is appended, so a failure
    # part-way through cannot leave the lists with different lengths.
    try:
        computed = (
            X.gravy(),
            X.molecular_weight(),
            X.instability_index(),
            X.flexibility(),
            X.protein_scale(ProtParamData.kd, 9, 0.4),
            X.protein_scale(ProtParamData.hw, 9, 0.4),
            X.protein_scale(ProtParamData.em, 9, 0.4),
        )
    except (KeyError, ValueError):
        # Placeholder values for sequences that cannot be analysed.
        computed = (0, 0, 0, [0, 0], [0, 0], [0, 0], [0, 0])

    targets = (gravy, molweight, instidx, flex, hydrophob, hydrophil, surface)
    for target, value in zip(targets, computed):
        target.append(value)
print('done')
# Extract the archive named by args.infile into a temporary directory and
# collect per-record descriptors of every PDB file found there. The lists
# appended to below are defined outside this excerpt.
with temppathlib.TemporaryDirectory() as tmpdir:
    # unzip the file with all the test PDBs
    with zipfile.ZipFile(args.infile, "r") as zip_:
        zip_.extractall(tmpdir.path)

        for test_pdb in tmpdir.path.glob("*.pdb"):
            for record in SeqIO.parse(test_pdb, "pdb-atom"):
                # 'X' residues are replaced by 'G' before the analysis.
                sequence = str(record.seq).replace('X', 'G')
                protein = ProteinAnalysis(str(sequence))
                p_len.append(len(sequence))
                mol_w.append(protein.molecular_weight())
                iso_p.append(protein.isoelectric_point())
                smell.append(protein.aromaticity())  # aromaticity
                taste_factor.append(protein.gravy())  # GRAVY score
                insta_ind.append(protein.instability_index())
                # Charge at pH 1, 7 and 14.
                char_at_acid.append(protein.charge_at_pH(1))
                char_at_neutral.append(protein.charge_at_pH(7))
                char_at_base.append(protein.charge_at_pH(14))
                # Elements 0, 1 and 2 of the secondary structure fractions.
                helter_skeler.append(protein.secondary_structure_fraction()[0])
                turnip.append(protein.secondary_structure_fraction()[1])
                garfield.append(protein.secondary_structure_fraction()[2])
                # The count of residue x goes to every d_count key whose last
                # character is x.
                for x in amino_acids:
                    n = protein.count_amino_acids()[x]
                    for y in d_count.keys():
                        if y[-1] == x:
                            d_count[y].append(n)
                # Same walk for the percentages; the body of the inner loop
                # is not part of this excerpt.
                for a in amino_acids:
                    m = protein.get_amino_acids_percent()[a]
                    for b in d_perc.keys():
Ejemplo n.º 27
0
    def extract(self):
        """Build the feature matrix for the FASTA records of ``self.infile``.

        Every record accepted by ``self.prot_check`` fills one row made of
        the relative counts of all four-letter words over seven residue
        classes, the relative counts of all tripeptides, and eight global
        values (isoelectric point, instability index, length, aromaticity,
        two molar extinction coefficients, GRAVY, molecular weight).
        Progress is sent through ``self.g_socketio`` when
        ``self.g_is_socket == 1`` and printed otherwise. Repeated record ids
        get a numeric suffix.

        Returns:
            tuple: ``(names, arr)`` with ``self.g_total_fasta`` rows each.
            Rejected records are skipped, so the trailing rows of both
            arrays stay unfilled when any record is rejected.
        """
        AA = ["A","C","D","E","F","G","H","I","K","L","M","N","P","Q","R","S","T","V","W","Y"]
        SC = ["1","2","3","4","5","6","7"]
        tri_pep = [''.join(i) for i in itertools.product(AA, repeat=3)]
        tetra_sc = [''.join(i) for i in itertools.product(SC, repeat=4)]
        # Maps every residue to the digit of its class.
        trantab2 = str.maketrans("AILMVNQSTGPCHKRDEFWY", "11111222233455566777")
        total_fasta = self.g_total_fasta
        sec_code = 0
        record_current = 0
        # 10409 = 7**4 class words + 20**3 tripeptides + 8 global values.
        # The builtin float replaces numpy.float, which no longer exists in
        # NumPy and was only an alias for it.
        arr = numpy.empty((total_fasta, 10409), dtype=float)
        names = numpy.empty((total_fasta, 1), dtype=object)
        names_dic = dict()
        for record in SeqIO.parse(self.infile, "fasta"):
            data = (record_current / total_fasta) * 100
            if self.g_is_socket == 1:
                self.g_socketio.emit('set bar', {'data': data}, room=self.g_sid)
            else:
                print('extracting features of seq ' + str(record_current+1) + ' of ' + str(total_fasta), end='\r')
            record_current += 1

            ll = len(record.seq)
            if not self.prot_check(str(record.seq)):
                print("Warning: " + record.id + " is not a valid protein sequence")
                continue
            if record.id in names_dic:
                seq_name = record.id + '_' + str(names_dic[record.id])
                names_dic[record.id] = names_dic[record.id] + 1
            else:
                seq_name = record.id
                names_dic[record.id] = 1
            seqq = str(record.seq).upper()
            # Ambiguous symbols are substituted only for the ProteinAnalysis
            # input; the word counts below use the unsubstituted sequence.
            seqqq = seqq.replace('X','A').replace('J','L').replace('*','A').replace('Z','E').replace('B','D')
            X = ProteinAnalysis(seqqq)
            class_seq = seqq.translate(trantab2)
            tt = [X.isoelectric_point(), X.instability_index(), ll, X.aromaticity(),
                  X.molar_extinction_coefficient()[0], X.molar_extinction_coefficient()[1],
                  X.gravy(), X.molecular_weight()]
            tt_n = numpy.asarray(tt, dtype=float)

            # NOTE(review): the denominators are zero for sequences of
            # length 2 and 3 respectively.
            tri_pep_count = [seqq.count(i)/(ll-2) for i in tri_pep]
            tri_pep_count_n = numpy.asarray(tri_pep_count, dtype=float)

            tetra_sc_count = [class_seq.count(i)/(ll-3) for i in tetra_sc]
            tetra_sc_count_n = numpy.asarray(tetra_sc_count, dtype=float)

            cat_n = numpy.concatenate((tetra_sc_count_n, tri_pep_count_n, tt_n))
            cat_n = cat_n.reshape((1, cat_n.shape[0]))

            arr[sec_code, :] = cat_n
            names[sec_code, 0] = seq_name
            sec_code += 1
        if self.g_is_socket == 1:
            self.g_socketio.emit('set bar', {'data': 100}, room=self.g_sid)
            self.g_socketio.emit('done features', 1, room=self.g_sid)
        print("\nDone")
        return (names, arr)
Ejemplo n.º 28
0
from Bio.SeqUtils.ProtParam import ProteinAnalysis
# Interactive ProtParam report for a sequence pasted in by the user.
# input() already returns a str, so no extra conversion is needed.
my_seq = input("manual sequence from translate.py :")
analysed_seq = ProteinAnalysis(my_seq)

# Each optional property is printed only when the answer is exactly "y".
if input("detect molecular weight y/n? :") == "y":
    print(analysed_seq.molecular_weight())

if input("detect gravy y/n? :") == "y":
    print(analysed_seq.gravy())

# The per-residue counts are always shown.
amino_acid_counts = analysed_seq.count_amino_acids()
print(amino_acid_counts)

# Wait for a final key press before the script exits.
input("enter")
Ejemplo n.º 29
0
# for a in pe_list.index:
#     if pe_list[a] == 'Predicted':
#         u_list.append(reading['acc. code'][a])

# Download the FASTA file once; later runs reuse the local copy.
if not os.path.isfile('nP20k.fasta'):
    import gzip

    link = 'http://www.peptideatlas.org/tmp/nP20k.fasta.gz'
    resp = requests.get(link)
    # Fail loudly instead of caching an HTTP error page as sequence data.
    resp.raise_for_status()
    payload = resp.content
    # The URL names a gzip archive. If the body still carries the gzip magic
    # bytes it has not been decoded yet, so unpack it before saving; a body
    # that is already plain text is written unchanged.
    if payload[:2] == b'\x1f\x8b':
        payload = gzip.decompress(payload)
    with open('nP20k.fasta', 'wb') as f_output:
        f_output.write(payload)

# One (identifier, PE level, GRAVY, wrapped description) row per match.
info = []
# Mode 'rU' was removed in Python 3.11; text mode 'r' already applies
# universal-newline handling.
with open('nP20k.fasta', 'r') as handle:
    for record in SeqIO.parse(handle, 'fasta'):
        # Each level is tested independently, so a description mentioning
        # several levels yields one row per level.
        for pe_level in ('2', '3', '4'):
            if 'PE=' + pe_level in record.description:
                analyzed_seq = ProteinAnalysis(str(record.seq))
                tup = (record.id, pe_level, analyzed_seq.gravy(),
                       textwrap.fill(record.description, 20))
                info.append(tup)

print(tabulate(info, headers=['Identifier', 'PE', 'GRAVY', 'Description']))
Ejemplo n.º 30
0
#!/usr/bin/env python

import sys
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis

# Header row. The seventh column holds a.aromaticity(), so it is labelled
# "aromaticity" (it was previously mislabelled "monoisotpoic").
sys.stdout.write("ID\tMW\tIP\tgravy\tlength\tinstability\taromaticity\tSequence\n")

# Emit one tab-separated line of ProtParam properties per FASTA record
# read from standard input.
for record in SeqIO.parse(sys.stdin, "fasta"):
    a = ProteinAnalysis(str(record.seq))

    properties = [
        record.id,
        a.molecular_weight(),
        a.isoelectric_point(),
        a.gravy(),
        a.length,
        a.instability_index(),
        a.aromaticity(),
        # always last column to make the output more readable
        a.sequence,
    ]
    sys.stdout.write('\t'.join(map(str, properties)) + "\n")

Ejemplo n.º 31
0
def get_phanns_input(fasta_list, d2vmodel):
    """Build one feature vector per protein record found in the FASTA files.

    For every record the vector is the concatenation, in this order, of:

    1. tripeptide frequencies (one per 3-letter combination of the 20
       standard residues),
    2. side-chain-group tetramer frequencies (one per 4-symbol combination
       of the group codes "1".."7"),
    3. eight ProtParam values: isoelectric point, instability index,
       sequence length, aromaticity, the two molar extinction coefficients,
       GRAVY and molecular weight,
    4. the embedding inferred by ``d2vmodel``, averaged over the three
       3-mer reading frames.

    Args:
        fasta_list: Iterable of FASTA file paths (each one is concatenated
            into a progress message and passed to ``SeqIO.parse``).
        d2vmodel: Object exposing ``infer_vector(list_of_kmers)`` whose
            results support ``+`` and ``/`` -- presumably a gensim Doc2Vec
            model; confirm against the caller.

    Returns:
        A list of ``(feature_vector, record)`` tuples in parsing order.
    """
    AA = [
        "A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q",
        "R", "S", "T", "V", "W", "Y"
    ]
    SC = ["1", "2", "3", "4", "5", "6", "7"]
    tri_pep = [''.join(i) for i in itertools.product(AA, repeat=3)]
    tetra_sc = [''.join(i) for i in itertools.product(SC, repeat=4)]
    # Maps each standard residue to its side-chain group code; any other
    # character is left unchanged by str.translate.
    trantab2 = str.maketrans("AILMVNQSTGPCHKRDEFWY", "11111222233455566777")
    kmer_size = 3
    vectors = []
    for file in fasta_list:
        print('####################' + file)
        this_prot = 0
        #         file_path = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath("__file__"))),"fasta",file + "_all_clustered.fasta")
        for record in SeqIO.parse(file, "fasta"):
            ll = len(record.seq)
            seqq = record.seq.__str__().upper()
            # ProteinAnalysis gets a copy with ambiguous/stop symbols
            # replaced by concrete residues; the k-mer counts below still
            # use the unmodified sequence.
            seqqq = seqq.replace('X', 'A').replace('J', 'L').replace(
                '*', 'A').replace('Z', 'E').replace('B', 'D')
            X = ProteinAnalysis(seqqq)
            ext_coeff = X.molar_extinction_coefficient()
            tt = [
                X.isoelectric_point(),
                X.instability_index(), ll,
                X.aromaticity(),
                ext_coeff[0],
                ext_coeff[1],
                X.gravy(),
                X.molecular_weight()
            ]
            # np.float was removed in NumPy 1.24; the builtin float is the
            # same dtype.
            tt_n = np.asarray(tt, dtype=float)
            myseq = seqq.translate(trantab2)

            # Number of k-mer windows, clamped to 1 so that sequences too
            # short to contain a k-mer give all-zero frequencies instead of
            # raising ZeroDivisionError.
            tri_windows = max(ll - 2, 1)
            tetra_windows = max(ll - 3, 1)

            # count tripeptides (str.count counts non-overlapping matches)
            tri_pep_count = [seqq.count(i) / tri_windows for i in tri_pep]
            tri_pep_count_n = np.asarray(tri_pep_count, dtype=float)

            # count tetra side chains
            tetra_sc_count = [
                myseq.count(i) / tetra_windows for i in tetra_sc
            ]
            tetra_sc_count_n = np.asarray(tetra_sc_count, dtype=float)

            # get embedding vector: infer once per reading frame offset and
            # average the results
            vec = d2vmodel.infer_vector([
                seqqq[k:k + kmer_size] for k in range(0, len(seqqq), kmer_size)
            ])
            for s in range(1, kmer_size):
                vec = vec + d2vmodel.infer_vector([
                    seqqq[k:k + kmer_size]
                    for k in range(s, len(seqqq), kmer_size)
                ])
            vec = vec / kmer_size

            cat_n = np.concatenate(
                (tri_pep_count_n, tetra_sc_count_n, tt_n, vec))
            vectors.append((cat_n, record))

            this_prot += 1
            if this_prot % 500 == 0:
                print("processing sequence # " + str(this_prot), end="\r")
    return vectors
Ejemplo n.º 32
0
#!/usr/bin/env python

import sys

from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis

# Header row. The seventh column holds a.aromaticity(), so it is labelled
# "aromaticity" (it was previously mislabelled "monoisotpoic").
sys.stdout.write("ID\tMW\tIP\tgravy\tlength\tinstability\taromaticity\tSequence\n")

# Write one tab-separated line of ProtParam properties for every FASTA
# record arriving on standard input.
for record in SeqIO.parse(sys.stdin, "fasta"):
    a = ProteinAnalysis(str(record.seq))

    properties = [
        record.id,
        a.molecular_weight(),
        a.isoelectric_point(),
        a.gravy(),
        a.length,
        a.instability_index(),
        a.aromaticity(),
        # always last column to make the output more readable
        a.sequence,
    ]
    sys.stdout.write("\t".join(map(str, properties)) + "\n")
Ejemplo n.º 33
0
plt.title("Distribution of Protein Molecular Weights")
plt.savefig("plotMolecularWeights.pdf")
plt.clf()
# plt.show()

# Each list collects [label, value] pairs, where the label is the command
# line argument associated with the group the protein belongs to.
gravy_index = []
aromaticity = []
instability_index = []
# flexibility = []
isoelectric_point = []
secondary_structure_fraction = []

# Both groups go through the same code path so that every property,
# including the secondary structure fraction, is recorded for each of them
# (the second group previously skipped it).
for arg_index, proteins in ((1, sequences_a), (2, sequences_b)):
    for protein in proteins:
        label = sys.argv[arg_index]
        # "X" residues are removed before the analysis.
        analysed_seq = ProteinAnalysis(str(protein.seq).replace("X", ""))

        gravy_index.append([label, analysed_seq.gravy()])
        aromaticity.append([label, analysed_seq.aromaticity()])
        instability_index.append([label, analysed_seq.instability_index()])
        # flexibility.append([label, analysed_seq.flexibility()])
        isoelectric_point.append([label, analysed_seq.isoelectric_point()])
        secondary_structure_fraction.append(
            [label, analysed_seq.secondary_structure_fraction()])
Ejemplo n.º 34
0
def main():
    """Derive per-protein features from protein_data.csv into features.csv.

    Steps:

    1. Read ``protein_data.csv`` (no header; column 0 is the sequence and
       column 1 is the class label).
    2. Record the raw sequence lengths, then strip the letters B, U, X and Z
       from every sequence and save the result to
       ``updated_protein_data.csv``.
    3. Re-read the cleaned file and compute, per sequence, the most frequent
       dipeptide, the most frequent amino acid, several ProtParam
       properties and the count of each of the 20 standard residues.
    4. Write everything to ``features.csv``.

    Ties for "most frequent" are resolved in favour of the earliest entry
    in the candidate list.
    """
    aa = [
        'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q',
        'R', 'S', 'T', 'V', 'W', 'Y'
    ]
    # All 400 ordered residue pairs: 'AA', 'AC', ..., 'AY', 'CA', ..., 'YY'.
    dipeptide = [first + second for first in aa for second in aa]

    sequences = pandas.read_csv('protein_data.csv', header=None)

    # Lengths (and weights) are taken from the raw sequences, i.e. before
    # the ambiguous letters are stripped below.
    lengths = []
    weights = []
    for protein in sequences.itertuples():
        lengths.append(len(str(protein[1])))
        # No weight is computed for sequences containing X or Z.
        if re.search("X|Z", protein[1]):
            molecular_weight = "?"
        else:
            molecular_weight = ProteinAnalysis(
                str(protein[1])).molecular_weight()
        weights.append(molecular_weight)

    # remove bad amino acids from sequences; the whole column is replaced
    # at once rather than through element-wise chained assignment, which
    # pandas does not guarantee to write back
    drop_ambiguous = str.maketrans('', '', 'BUXZ')
    sequences[0] = sequences[0].map(
        lambda seq: seq.translate(drop_ambiguous))
    sequences.to_csv('updated_protein_data.csv',
                     index_label=None,
                     header=None,
                     index=None)

    data = pandas.read_csv('updated_protein_data.csv', header=None)

    # Most common dipeptide and amino acid per cleaned sequence. The two
    # searches are independent: the amino-acid search no longer inherits
    # the running maximum and index from the dipeptide search, which could
    # select the wrong residue or index past the end of `aa`.
    most_frequent_di = []
    most_frequent = []
    for sequence in data[0]:
        most_frequent_di.append(max(dipeptide, key=sequence.count))
        most_frequent.append(max(aa, key=sequence.count))

    # more features
    first_aa = []
    last_aa = []
    arom = []
    ii = []
    ip = []
    mec_rc = []
    mec_db = []
    ssf_helix = []
    ssf_turn = []
    ssf_sheet = []
    gravy = []
    ph_0 = []
    ph_7 = []
    ph_14 = []
    residue_counts = {residue: [] for residue in aa}
    classes = []
    for protein in data.itertuples():
        sequence = str(protein[1])
        analyzed_protein = ProteinAnalysis(sequence)

        amino_acid = analyzed_protein.count_amino_acids()
        for residue in aa:
            residue_counts[residue].append(amino_acid.get(residue))

        first_aa.append(sequence[0])
        last_aa.append(sequence[-1])
        arom.append(analyzed_protein.aromaticity())
        ii.append(analyzed_protein.instability_index())
        ip.append(analyzed_protein.isoelectric_point())
        # Multi-valued results are computed once and then split.
        extinction = analyzed_protein.molar_extinction_coefficient()
        mec_rc.append(extinction[0])
        mec_db.append(extinction[1])
        structure = analyzed_protein.secondary_structure_fraction()
        ssf_helix.append(structure[0])
        ssf_turn.append(structure[1])
        ssf_sheet.append(structure[2])
        gravy.append(analyzed_protein.gravy())
        ph_0.append(analyzed_protein.charge_at_pH(0.0))
        ph_7.append(analyzed_protein.charge_at_pH(7.0))
        ph_14.append(analyzed_protein.charge_at_pH(14.0))
        classes.append(protein[2])

    # Column names are kept exactly as before because consumers of
    # features.csv may look them up by name.
    features = pandas.DataFrame()
    features["LENGTH"] = lengths
    #features["MOLECULAR WEIGHT"] = weights
    features["most frequent aa"] = most_frequent
    #features["first amino acids"] = first_aa
    features["last amino acid"] = last_aa
    features["most frequence dipeptide"] = most_frequent_di
    features["aromaticity"] = arom
    features["instability index"] = ii
    features["isolectric point"] = ip
    features["molecular extinction coefficient - reduced cysteines"] = mec_rc
    features["molecular extinction coefficient - disulfid bridges"] = mec_db
    features["secondary structure fraction helix"] = ssf_helix
    features["secondary structure fraction turn"] = ssf_turn
    features["secondary structure fraction sheet"] = ssf_sheet
    features["gravy"] = gravy
    features["charge at ph 0"] = ph_0
    features["charge at ph 7"] = ph_7
    features["charge at ph 14"] = ph_14
    # One count column per residue, in the order of `aa` (A, C, D, ..., Y).
    for residue in aa:
        features[residue] = residue_counts[residue]
    features["CLASS"] = classes
    features.to_csv('features.csv', index=None)