def protParam(seq):
    """Print a summary of physicochemical parameters for a protein sequence.

    Builds a ``ProteinAnalysis`` for ``seq`` and prints the amino acid
    counts and percentages, the molecular weight, GRAVY, isoelectric point,
    aromaticity and an extinction coefficient derived from the W, Y and C
    counts.

    Args:
        seq: Protein sequence accepted by ``ProteinAnalysis``.

    Returns:
        None. Every result is written to standard output.
    """
    params = ProteinAnalysis(seq)
    mw = params.molecular_weight()
    c_aa = params.count_amino_acids()
    p_aa = params.get_amino_acids_percent()
    gravy = params.gravy()
    aromaticity = params.aromaticity()
    isoelectric_point = params.isoelectric_point()
    # Weighted sum of the Trp, Tyr and Cys counts; the per-residue
    # coefficients are the ones hard-coded by the original author.
    ext_coeff = c_aa["W"] * 5690 + c_aa["Y"] * 1280 + c_aa["C"] * 120

    print("Amino acid count")
    pprint.pprint(c_aa)
    print("Amino acid percent")
    pprint.pprint(p_aa)
    print("Molecular weight")
    print("%f Da" % mw)
    print("Gravy")
    print(gravy)
    print("Isoelectric point")
    print(isoelectric_point)
    print("Aromaticity")
    print(aromaticity)
    print("Extinction coefficient: %d M-1cm-1 (Assuming reduced)" % ext_coeff)
    print("")
def sequence_vector(temp_window: str, window: int = 6, chemical=1):
    """Encode a sequence window as a list of integer residue codes.

    The input is passed through ``clean`` and then ``windower`` (centred on
    the middle of the cleaned sequence with ``wing_size=window``). Every
    residue of the resulting window is mapped to an integer; ``0`` stands
    for ``X`` and is also used to pad the vector up to ``2 * window + 1``
    entries.

    When ``chemical == 1`` three extra values computed on the window are
    appended: GRAVY, instability index and aromaticity.
    """
    residue_codes = {"G": 1, "A": 2, "L": 3, "M": 4, "F": 5, "W": 6, "K": 7,
                     "Q": 8, "E": 9, "S": 10, "P": 11, "V": 12, "I": 13,
                     "C": 14, "Y": 15, "H": 16, "R": 17, "N": 18, "D": 19,
                     "T": 20, "X": 0}

    cleaned = clean(temp_window)
    centred = windower(sequence=cleaned,
                       position=int(len(cleaned) * .5),
                       wing_size=window)

    vec = [residue_codes[residue] for residue in centred]

    # Pad short windows with zeros up to the expected length.
    expected_len = (window * 2) + 1
    if len(vec) != expected_len:
        shortfall = expected_len - len(vec)
        vec.extend(0 for _ in range(shortfall))

    # Hydrophobicity is optional
    if chemical == 1:
        analysis = ProteinAnalysis(centred)
        vec.append(analysis.gravy())
        vec.append(analysis.instability_index())
        vec.append(analysis.aromaticity())
    return vec
def find_gravy_stats(folders, outfile, condition, regex=None, frequency=False):
    """Compute the frequency-weighted mean CDR3 GRAVY per folder.

    For every entry of ``folders`` the file ``<entry[0]>/5_AA-sequences.txt``
    is read as a tab-separated table. Rows whose ``Functionality`` is
    ``'productive'`` and whose ``CDR3-IMGT`` satisfies ``condition`` add
    their GRAVY value to a weighted mean for that folder.

    Args:
        folders: Iterable of indexable items; element ``0`` is the folder path.
        outfile: Output path prefix. ``<outfile>_means.txt`` receives one
            mean per line and ``<outfile>.txt`` the overall mean and
            standard deviation.
        condition: Callable applied to the ``CDR3-IMGT`` string.
        regex: Pattern whose first group, matched against ``Sequence ID``,
            is the integer weight of the row. Only used when ``frequency``
            is true.
        frequency: When true, weight each row by the count extracted with
            ``regex``; otherwise every row has weight 1.

    Returns:
        None. Results are written to the two output files.
    """
    mean_list = []
    pattern = None  # compiled once, on the first row that needs it
    for folder in folders:
        gravy_all = 0
        total_seqs = 0
        with open(folder[0] + '/5_AA-sequences.txt') as f:
            reader = csv.DictReader(f, delimiter='\t')
            for row in reader:
                try:
                    if (row['Functionality'] == 'productive'
                            and condition(row['CDR3-IMGT'])):
                        gravy = Prot(row['CDR3-IMGT']).gravy()
                        if frequency:
                            if pattern is None:
                                pattern = re.compile(regex)
                            info = pattern.match(row['Sequence ID'])
                            freq = int(info.group(1))
                        else:
                            freq = 1
                        total_seqs += freq
                        gravy_all += gravy * freq
                except Exception:
                    # Best effort, as in the original: rows that cannot be
                    # processed are skipped. Unlike a bare ``except`` this
                    # no longer swallows KeyboardInterrupt / SystemExit.
                    continue
        # A folder without any usable row contributes no mean.
        if total_seqs:
            mean_list.append(gravy_all / float(total_seqs))
            print(mean_list)

    with open(outfile + '_means.txt', 'w') as out:
        for item in mean_list:
            out.write(str(item) + '\n')
    with open(outfile + '.txt', 'w') as out:
        out.write('mean CDR3 gravy,standard deviation\n')
        out.write(str(np.mean(mean_list)) + ',' + str(np.std(mean_list)))
def _protein_parameters(self, sequence):
    """Calculates physicochemical properties for the amino acid sequence.

    The returned vector holds, in order: molecular weight, aromaticity,
    instability index, GRAVY, isoelectric point, the three secondary
    structure fractions, the two molar extinction coefficients and the net
    charge from ``self._net_charge``.

    Args:
      sequence: str, amino acid sequence.

    Returns:
      property_arr: np array, vector of properties.
    """
    analysis = ProteinAnalysis(sequence)
    scalar_props = [
        analysis.molecular_weight(),
        analysis.aromaticity(),
        analysis.instability_index(),
        analysis.gravy(),
        analysis.isoelectric_point(),
    ]
    fractions = analysis.secondary_structure_fraction()
    extinction = analysis.molar_extinction_coefficient()
    tail = [
        fractions[0],
        fractions[1],
        fractions[2],
        extinction[0],
        extinction[1],
        self._net_charge(sequence),
    ]
    return np.array(scalar_props + tail)
def protein_properties(seq):
    """Return a ProtProp record with some protein biochemical properties.

    seq is a Bio.Seq.Seq or str representing protein sequence.

    The instability index and GRAVY are set to ``None`` when computing them
    raises ``KeyError``.
    """
    def _none_on_key_error(compute):
        # Run ``compute`` and map a KeyError to None.
        try:
            return compute()
        except KeyError:
            return None

    analysis = ProteinAnalysis(seq)
    counts = analysis.count_amino_acids()
    aromaticity = analysis.aromaticity()
    iso_point = analysis.isoelectric_point()
    instability = _none_on_key_error(analysis.instability_index)
    gravy = _none_on_key_error(analysis.gravy)

    return ProtProp(aa=str(seq),
                    gravy=gravy,
                    aromaticity=aromaticity,
                    isoelectric_point=iso_point,
                    instability=instability,
                    aa_counts=counts)
def calculate_physiochemical_features(temp_dict, sequence):
    """Add physicochemical features of ``sequence`` to ``temp_dict`` in place.

    Ten named properties are written first, followed by the per-residue
    percentages from ``get_amino_acids_percent()``. The two molar extinction
    coefficients are averaged into a single value. Nothing is returned.
    """
    analysis = ProteinAnalysis(sequence)

    charge = analysis.charge_at_pH(7)
    instability = analysis.instability_index()
    weight = analysis.molecular_weight()
    aroma = analysis.aromaticity()

    ext_low, ext_high = analysis.molar_extinction_coefficient()
    mean_extinction = (float(ext_low) + float(ext_high)) / 2

    # Grand Average Hydropathy - higher value = more hydrophobic
    hydropathy = analysis.gravy()
    iso_point = analysis.isoelectric_point()
    helix, turn, sheet = analysis.secondary_structure_fraction()

    temp_dict.update({
        "Charge at pH7": charge,
        "Instability Index": instability,
        "Molecular Wt": weight,
        "Aromaticity": aroma,
        "Molar Extinction Coeff": mean_extinction,
        "Gravy": hydropathy,
        "Isoelectric pt": iso_point,
        "Helix Fraction": helix,
        "Turn Fraction": turn,
        "Sheet Fraction": sheet,
    })

    # Added separately because get_amino_acids_percent() already returns a
    # dictionary of its own.
    temp_dict.update(analysis.get_amino_acids_percent())
def protAnalysis(self, content):
    """Compute protein properties for ``content`` and return three dicts.

    ``content`` is first normalised with ``Parsers.normalizeSequence`` using
    ``self.sourceType``.

    Returns:
        A 3-tuple ``(result, flattened, flattenedFlexDic)``:

        * ``result``: the flattened secondary structure fractions merged
          with molecular weight, aromaticity, instability index,
          isoelectric point and GRAVY.
        * ``flattened``: ``result`` plus the flexibility values flattened
          under the ``'proteinFlex'`` prefix.
        * ``flattenedFlexDic``: ``result`` plus the flexibility values
          flattened against the list of residues of ``content``.

        On key clashes the entries of ``result`` win in every merge.
    """
    content = Parsers.normalizeSequence(content, self.sourceType)
    protein = ProteinAnalysis(content)

    result = dict()
    result['proteinMWeight'] = protein.molecular_weight()
    result['proteinAroma'] = protein.aromaticity()
    result['proteinInstab'] = protein.instability_index()
    result['proteinIsoelec'] = protein.isoelectric_point()
    result['proteinGravy'] = protein.gravy()

    proteinStructure = protein.secondary_structure_fraction()
    protStruct = self.flatten('proteinSecstruc', proteinStructure)
    result = {**protStruct, **result}  # merge result and protein structure

    flexibility = protein.flexibility()
    flexibFlat = self.flatten('proteinFlex', flexibility)
    flexibAmino = self.flatten(list(content), flexibility)
    flattened = {**flexibFlat, **result}
    flattenedFlexDic = {**flexibAmino, **result}
    return result, flattened, flattenedFlexDic
def physchem_props(data):
    """Append physicochemical properties to each tab-separated line of data.

    The second-to-last column of every line (the sequon) is analysed with
    ``ProteinAnalysis`` and five values are appended to the line: molecular
    weight, GRAVY, aromaticity, instability index and isoelectric point.

    Lines whose sequon is empty or contains ``X`` or ``*`` are dropped, and
    blank lines are ignored.

    Args:
        data: Iterable of tab-separated text lines.

    Returns:
        list[str]: A header line followed by the extended lines.

    Raises:
        SystemExit: With status 1 (after printing the offending fields) when
            a property cannot be computed for a line.
    """
    header = "ID\tclass\tindex\tsequon\tsequence\tmol_weight\tgravy\taromaticity\tinstab_index\tiso_point\n"
    new_table = [header]
    for line in data:
        stripped = line.rstrip()
        if not stripped:
            # A blank line has no columns; indexing it would raise IndexError.
            continue
        split_line = stripped.split('\t')
        seq = split_line[-2]  # Sequon, not sequence
        if "X" in seq or '*' in seq or seq == '':
            continue  # Skip non-usable sequences, only negs
        try:
            a_seq = ProteinAnalysis(seq)
            results = [
                a_seq.molecular_weight(),
                a_seq.gravy(),
                a_seq.aromaticity(),
                a_seq.instability_index(),
                a_seq.isoelectric_point(),
            ]
        except Exception:
            # Report which record failed, then abort the run.
            print(split_line)
            sys.exit(1)
        new_table.append(stripped + "\t{}\t{}\t{}\t{}\t{}\n".format(*results))
    return new_table
def protParam(seq):
    """Print a summary of physicochemical parameters for a protein sequence.

    Builds a ``ProteinAnalysis`` for ``seq`` and prints the amino acid
    counts and percentages, the molecular weight, GRAVY, isoelectric point,
    aromaticity and an extinction coefficient derived from the W, Y and C
    counts.

    Args:
        seq: Protein sequence accepted by ``ProteinAnalysis``.

    Returns:
        None. Every result is written to standard output.
    """
    params = ProteinAnalysis(seq)
    mw = params.molecular_weight()
    c_aa = params.count_amino_acids()
    p_aa = params.get_amino_acids_percent()
    gravy = params.gravy()
    aromaticity = params.aromaticity()
    isoelectric_point = params.isoelectric_point()
    # Weighted sum of the Trp, Tyr and Cys counts; the per-residue
    # coefficients are the ones hard-coded by the original author.
    ext_coeff = c_aa["W"] * 5690 + c_aa["Y"] * 1280 + c_aa["C"] * 120

    print("Amino acid count")
    pprint.pprint(c_aa)
    print("Amino acid percent")
    pprint.pprint(p_aa)
    print("Molecular weight")
    print("%f Da" % mw)
    print("Gravy")
    print(gravy)
    print("Isoelectric point")
    print(isoelectric_point)
    print("Aromaticity")
    print(aromaticity)
    print("Extinction coefficient: %d M-1cm-1 (Assuming reduced)" % ext_coeff)
    print("")
def get_gravy_list(self):
    """Return the normalized GRAVY values of the sequences in ``self.df.index``.

    Each GRAVY value is formatted to six decimal places, the formatted
    values are collected into a numpy array and that array is handed to
    ``self.normalize``.
    """
    # NOTE(review): the values are kept as formatted strings, so the array
    # passed to normalize() has a string dtype - confirm that is intended.
    formatted = [
        "{:.6f}".format(ProteinAnalysis(sequence).gravy())
        for sequence in self.df.index
    ]
    return self.normalize(np.array(formatted))
def get_protein_analysis(aa):
    """Return a flat list of protein properties for the sequence ``aa``.

    The list holds molecular weight, aromaticity, instability index,
    isoelectric point and GRAVY, followed by the values returned by
    ``secondary_structure_fraction()``.
    """
    analysis = ProteinAnalysis(aa)
    features = [
        analysis.molecular_weight(),
        analysis.aromaticity(),
        analysis.instability_index(),
        analysis.isoelectric_point(),
        analysis.gravy(),
    ]
    features.extend(analysis.secondary_structure_fraction())
    return features
def get_gravy(self):
    """
    Calculates Gravy from sequence (1 value) from biopython

    :return: dictionary with the value of gravy under the key 'Gravy'
    """
    analysis = ProteinAnalysis(self.ProteinSequence)
    return {'Gravy': analysis.gravy()}
def biopython_protein_analysis(inseq):
    """Utilize Biopython's ProteinAnalysis module to return general sequence properties of an amino acid string.

    For full definitions see: http://biopython.org/DIST/docs/api/Bio.SeqUtils.ProtParam.ProteinAnalysis-class.html

    Args:
        inseq: Amino acid sequence

    Returns:
        dict: Dictionary of sequence properties. Some definitions include:
        instability_index: Any value above 40 means the protein is unstable (has a short half life).
        secondary_structure_fraction: Percentage of protein in helix, turn or sheet

    TODO:
        Finish definitions of dictionary

    """
    inseq = ssbio.protein.sequence.utils.cast_to_str(inseq)

    analysed_seq = ProteinAnalysis(inseq)

    info_dict = {}
    info_dict['amino_acids_content-biop'] = analysed_seq.count_amino_acids()
    info_dict['amino_acids_percent-biop'] = analysed_seq.get_amino_acids_percent()
    info_dict['length-biop'] = analysed_seq.length
    info_dict['monoisotopic-biop'] = analysed_seq.monoisotopic
    info_dict['molecular_weight-biop'] = analysed_seq.molecular_weight()
    info_dict['aromaticity-biop'] = analysed_seq.aromaticity()
    info_dict['instability_index-biop'] = analysed_seq.instability_index()
    # TODO: What is flexibility?
    info_dict['flexibility-biop'] = analysed_seq.flexibility()
    info_dict['isoelectric_point-biop'] = analysed_seq.isoelectric_point()

    # grand average of hydrophobicity
    info_dict['gravy-biop'] = analysed_seq.gravy()

    # The secondary structure fractions are stored one per key; compute the
    # tuple once instead of once per key.
    fractions = analysed_seq.secondary_structure_fraction()
    info_dict['percent_helix_naive-biop'] = fractions[0]
    info_dict['percent_turn_naive-biop'] = fractions[1]
    info_dict['percent_strand_naive-biop'] = fractions[2]

    return info_dict
def find_composition(df_original):
    """Return a copy of ``df_original`` extended with per-sequence features.

    For each value of the ``'seq'`` column the function computes length,
    molecular weight, GRAVY, mean and standard deviation of the flexibility
    profile, secondary structure fractions, isoelectric point, aromaticity
    and the amino acid percentages of the whole sequence, of its first
    ``first_n`` residues and of its last ``last_n`` residues.

    ``codes``, ``first_n`` and ``last_n`` are read from the enclosing
    module.

    Args:
        df_original: DataFrame with a ``'seq'`` column. It is not modified.

    Returns:
        DataFrame: the copied input with the feature columns appended.
    """
    df_copy = df_original.copy()

    column_names = []
    for ch in codes:
        column_names.extend(
            (ch + '_percent', ch + '_percent_first', ch + '_percent_last'))
    column_names.extend([
        'len', 'weight', 'gravy', 'flex_mean', 'flex_std',
        'ss_helix', 'ss_turn', 'ss_sheet', 'iep', 'aromaticity',
    ])

    # Collect one dict per sequence and build the frame once at the end:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call.
    rows = []
    for seq in tqdm(df_copy['seq']):
        sequence = str(seq)
        analysed = ProteinAnalysis(sequence)
        analysed_first = ProteinAnalysis(sequence[:first_n])
        analysed_last = ProteinAnalysis(sequence[-last_n:])

        row = {'len': analysed.length}
        (row['ss_helix'],
         row['ss_turn'],
         row['ss_sheet']) = analysed.secondary_structure_fraction()
        row['iep'] = analysed.isoelectric_point()

        # overall
        for aa, percent in analysed.get_amino_acids_percent().items():
            row[aa + '_percent'] = percent
        # first N
        for aa, percent in analysed_first.get_amino_acids_percent().items():
            row[aa + '_percent_first'] = percent
        # last N
        for aa, percent in analysed_last.get_amino_acids_percent().items():
            row[aa + '_percent_last'] = percent

        row['weight'] = analysed.molecular_weight()
        row['gravy'] = analysed.gravy()
        row['aromaticity'] = analysed.aromaticity()
        flexibility = analysed.flexibility()
        row['flex_mean'] = np.mean(flexibility)
        row['flex_std'] = np.std(flexibility)
        rows.append(row)

    # Reuse the input index so the column-wise concat lines rows up even
    # when df_original does not have a default RangeIndex.
    features = pd.DataFrame(rows, index=df_copy.index)
    extra = [name for name in features.columns if name not in column_names]
    features = features.reindex(columns=column_names + extra)

    return pd.concat([df_copy, features], axis=1)
def phyChemProps(seq):
    """Return ten physicochemical properties of ``seq`` as a list.

    Order of the returned values: aromaticity, the three secondary
    structure fractions, GRAVY, instability index, isoelectric point,
    molecular weight and the two molar extinction coefficients.
    """
    X = ProteinAnalysis(seq)
    aromaticity = X.aromaticity()
    # Each multi-valued property is computed once and then unpacked by index.
    fractions = X.secondary_structure_fraction()
    gravy = X.gravy()
    instability = X.instability_index()
    iso_point = X.isoelectric_point()
    weight = X.molecular_weight()
    extinction = X.molar_extinction_coefficient()
    return [
        aromaticity,
        fractions[0],
        fractions[1],
        fractions[2],
        gravy,
        instability,
        iso_point,
        weight,
        extinction[0],
        extinction[1],
    ]
def __init__(self, sequence):
    """Compute and store a set of descriptors for ``sequence``.

    Values come from three sources: a ``ProteinAnalysis`` object, helper
    functions named ``calculate_*`` that are defined outside this block,
    and R expressions evaluated through ``r``.

    Args:
        sequence: Amino acid sequence; it is interpolated verbatim into an
            R string literal below.
    """
    self.sequence = sequence
    self.sequence_length = len(sequence)
    analysis = ProteinAnalysis(sequence)
    self.amino_acid_percents = analysis.get_amino_acids_percent()
    self.amino_acids_composition = calculate_amino_acids_composition(sequence)
    self.aromaticity = analysis.aromaticity()
    self.instability = analysis.instability_index()
    self.flexibility = calculate_flexibility(sequence)
    # Each entry pairs a display name with a scale dictionary defined
    # outside this block.
    protein_scale_parameters = [{'name': 'Hydrophilicity', 'dictionary': hw},
                                {'name': 'Surface accessibility', 'dictionary': em},
                                {'name': 'Janin Interior to surface transfer energy scale', 'dictionary': ja},
                                {'name': 'Bulkiness', 'dictionary': bulkiness},
                                {'name': 'Polarity', 'dictionary': polarity},
                                {'name': 'Buried residues', 'dictionary': buried_residues},
                                {'name': 'Average area buried', 'dictionary': average_area_buried},
                                {'name': 'Retention time', 'dictionary': retention_time}]
    self.protein_scales = calculate_protein_scales(analysis, protein_scale_parameters)
    self.isoelectric_point = analysis.isoelectric_point()
    self.secondary_structure_fraction = calculate_secondary_structure_fraction(analysis)
    self.molecular_weight = analysis.molecular_weight()
    self.kyte_plot = analysis.gravy()
    self.pefing = calculate_pefing(sequence)
    # next parameters are calculated using R.Peptides
    r('require(Peptides)')
    # NOTE(review): the sequence is pasted into R source without escaping;
    # a value containing a double quote would break the statement - confirm
    # that inputs are validated upstream.
    r('sequence = "{0}"'.format(sequence))
    self.aliphatic_index = r('aindex(sequence)')[0]
    self.boman_index = r('boman(sequence)')[0]
    self.charges = calculate_charges(sequence, 1.0, 14.0, 0.5, 'Lehninger')
    # NOTE(review): the R expression evaluated here is `seq(sequence)`;
    # confirm this is the intended call for a hydrophobicity value.
    self.hydrophobicity = r('seq(sequence)')[0]
    angles = [{'name': 'Alpha-helix', 'angle': -47},
              {'name': '3-10-helix', 'angle': -26},
              {'name': 'Pi-helix', 'angle': -80},
              {'name': 'Omega', 'angle': 180},
              {'name': 'Antiparallel beta-sheet', 'angle': 135},
              {'name': 'Parallel beta-sheet', 'angle': 113}]
    # One extra angle is added when the P and G percentages together
    # exceed 0.3.
    if self.amino_acid_percents['P'] + self.amino_acid_percents['G'] > 0.3:
        angles.append({'name': 'Polygly-polypro helix', 'angle': 153})
    self.hydrophobic_moments = calculate_hydrophobic_moments(sequence, angles)
    self.kidera_factors = calculate_kidera_factors(sequence)
    self.peptide_types = calculate_peptide_types(sequence, angles)
def biochemical_properties(sequence: str) -> Dict[str, Any]:
    """Return a dictionary of biochemical properties for ``sequence``.

    The keys are 'Isoelectric point', 'Molecular weight', 'Aromaticity',
    'Instability index', 'GRAVY', 'H-bonding percent', 'Melting temp' and
    'LCC'.
    """
    # Define objects used for calculations
    analysis = ProteinAnalysis(sequence)
    # The next two objects are constructed exactly as in the original even
    # though nothing below reads them.
    PyPro.GetProDes(sequence)
    Seq(sequence)

    # TODO(Ahmed): Verify that all these calculations are actually returning reasonable values
    # For example, it says the percent composition of every amino acid is zero when I run
    # calculate_biochem_properties.biochemical_properties('qwertyipasdfghklcvnm')
    properties: Dict[str, Any] = {}
    properties['Isoelectric point'] = analysis.isoelectric_point()
    properties['Molecular weight'] = analysis.molecular_weight()  # Daltons? Amu? g/mol?
    properties['Aromaticity'] = analysis.aromaticity()
    properties['Instability index'] = analysis.instability_index()
    properties['GRAVY'] = analysis.gravy()
    properties['H-bonding percent'] = h_bonding_percent(sequence)
    properties['Melting temp'] = melting_temp(sequence)
    properties['LCC'] = lcc.lcc_simp(sequence)
    return properties
def amino_acid_analysis(self):
    """
    Adds fraction of amino acid residues (defined in RESIDUES) to data frame.

    Also adds the sequence length and, row by row, the isoelectric point
    ("IEP"), the molecular weight (only for sequences without X or B) and
    GRAVY (only for sequences without U, X or B).
    """
    for residue in RESIDUES:
        occurrences = self.df["sequence"].str.count(residue)
        lengths = self.df["sequence"].str.len()
        self.df["fraction_" + residue] = occurrences / lengths

    self.df["length"] = self.df["sequence"].str.len()

    rows = tqdm(self.df.iterrows(), total=self.df.shape[0])
    for index, row in rows:
        seq = row["sequence"]
        analysis = ProteinAnalysis(seq)
        # The original computes the percentages here and never uses the
        # result; the call is kept so behaviour is unchanged.
        analysis.get_amino_acids_percent()
        self.df.loc[index, "IEP"] = analysis.isoelectric_point()

        if "X" in seq or "B" in seq:
            continue
        self.df.loc[index, "molecular_weight"] = analysis.molecular_weight()
        if "U" not in seq:
            self.df.loc[index, "gravy"] = analysis.gravy()
def get_features(seq):
    """get global features from a protein sequence

    Counts, entropy figures and hydrophobic/polar tallies are computed on
    the sequence as given; the ProteinAnalysis-based properties are computed
    after the characters X, B, Z, ', O and U have been removed.

    Parameters
    ----------
    seq : str
        protein sequence

    Return
    ----------
    dictionary:
        global features of the protein sequence
    """
    undefined = ['X', 'B', 'Z', "'", 'O', 'U']

    features = {}
    features['undefined_count'] = sum(1 for x in seq if x in undefined)
    features['length'] = len(seq)
    features['perc_undefined_count'] = (
        features['undefined_count'] / features['length'])
    features['entropy'] = entropy(seq)
    features['ideal_entropy'] = entropy_ideal(len(seq))
    features['perc_entropy'] = features['entropy'] / features['ideal_entropy']
    features['hydr_count'] = len([x for x in seq if x in hydrophobic_proteins])
    features['polar_count'] = len([x for x in seq if x in polar_proteins])
    features['buried'] = sum(buried[x] for x in seq if x in hydrophobic_proteins)

    # Drop the undefined characters before the ProteinAnalysis calls.
    defined_only = ''.join(x for x in seq if x not in undefined)
    protein = ProteinAnalysis(defined_only)
    features['gravy'] = protein.gravy()
    features['molecular_weight'] = protein.molecular_weight()
    features['aromaticity'] = protein.aromaticity()
    features['instability_index'] = protein.instability_index()
    features['isoelectric_point'] = protein.isoelectric_point()
    helix, turn, sheet = protein.secondary_structure_fraction()
    features['helix'] = helix
    features['turn'] = turn
    features['sheet'] = sheet
    features.update(protein.count_amino_acids())
    return features
def GRAvy_ARomo(seq, genetic_code_=1, G=False, A=False):
    """calculating Gravy and Aroma for DNA sequence.

    The sequence is translated with the given genetic code, stop symbols
    ("*") are removed and the requested property of the protein is
    returned. Only the requested property is computed, so a failure in the
    other one cannot abort the call.

    Args:
        seq (str): DNA sequence
        genetic_code_(int): default = 1, The Genetic Codes number described by NCBI (https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi)
        G (bool): default = False
        A (bool): default = False

    Returns:
        - Gravy value if arg(G) is True
        - Aroma value if arg(A) is True
        - None if both args are False
    """
    from Bio.SeqUtils.ProtParam import ProteinAnalysis
    from Bio.Seq import Seq

    try:
        seq = Seq(seq)
    except Exception:
        # Best effort, as in the original: if the input cannot be wrapped
        # in a Seq it is used as given. KeyboardInterrupt and SystemExit
        # are no longer swallowed.
        pass

    translate_seq = str(seq.translate(table=genetic_code_))
    protein_seq = ProteinAnalysis(translate_seq.replace("*", ""))

    # Only flag values that compare equal to True select a property.
    if G and G == True:  # noqa: E712
        return protein_seq.gravy()
    elif A and A == True:  # noqa: E712
        return protein_seq.aromaticity()
    return None
def seqs_to_features(self, seqs, no_seqs):
    """ Extract the features from the sequences.

    Returns a ``(no_seqs, 32)`` array. The first 23 columns hold the
    relative frequency of each letter of the alphabet below; the last nine
    columns hold, from the end backwards: molecular weight, aromaticity,
    instability index, isoelectric point, the three secondary structure
    fractions, sequence length and GRAVY.
    """
    alphabet = 'ABCDEFGHIKLMNPQRSTUVWXY'  # no JOZ
    features = np.zeros((no_seqs, 32))

    # iterate over all sequences
    for row, s in enumerate(chain(*seqs)):
        # get amino acid frequencies
        for col, letter in enumerate(alphabet):
            features[row, col] = s.count(letter) / len(s)

        # other analysis, with X, B and U replaced by A
        substituted = s.replace('X', 'A').replace('B', 'A').replace('U', 'A')
        analysis = ProteinAnalysis(substituted)
        weight = analysis.molecular_weight()
        aroma = analysis.aromaticity()
        instability = analysis.instability_index()
        iso_point = analysis.isoelectric_point()
        fractions = analysis.secondary_structure_fraction()

        features[row, -1] = weight
        features[row, -2] = aroma
        features[row, -3] = instability
        features[row, -4] = iso_point
        features[row, -5] = fractions[0]
        features[row, -6] = fractions[1]
        features[row, -7] = fractions[2]
        features[row, -8] = len(s)
        features[row, -9] = analysis.gravy()  # mean hydrophobicity
    return features
def physchem_props(ara_d):
    """Calculate the physicochemical properties per protein in ara_d.

    Every entry whose ``"sequence"`` contains neither ``X`` nor ``*`` gets a
    ``"Properties"`` dictionary with the keys ``mol_weight``, ``gravy``,
    ``aromaticity``, ``instab_index``, ``flexi``, ``iso_point`` and
    ``seq_struct``. Entries containing ``*`` are skipped, and their key is
    printed when their ``"pos"`` list is not empty.

    Args:
        ara_d: dict mapping a protein identifier to a dict with at least
            the keys ``"sequence"`` and ``"pos"``. It is updated in place.

    Returns:
        The same ``ara_d`` object.
    """
    keys = (
        "mol_weight",
        "gravy",
        "aromaticity",
        "instab_index",
        "flexi",
        "iso_point",
        "seq_struct",
    )
    for protein, entry in ara_d.items():
        seq = entry["sequence"]
        if "X" in seq:
            continue  # Skip non-usable sequences, only negs
        if '*' in seq:
            if entry["pos"] != []:
                print(protein)
            continue

        a_seq = ProteinAnalysis(seq)
        results = [
            a_seq.molecular_weight(),
            a_seq.gravy(),
            a_seq.aromaticity(),
            a_seq.instability_index(),
            a_seq.flexibility(),
            a_seq.isoelectric_point(),
            a_seq.secondary_structure_fraction(),
        ]
        # Update ara_d with the new physchem properties
        entry["Properties"] = dict(zip(keys, results))
    return ara_d
def add_protein_characteristics(df):
    """Return a copy of ``df`` with per-sequence protein property columns.

    For every value of the ``'sequence'`` column the letters B, Z, J, X, U
    and O are replaced by D, E, L, G, C and K respectively. One column per
    amino acid (its percentage) is then added, plus the columns
    'aromaticity', 'helix', 'turn', 'sheet', 'isoelectric_point' and
    'gravy'. The input frame is left untouched.
    """
    df = df.copy()
    aa_list = [
        'A', 'C', 'E', 'D', 'G', 'F', 'I', 'H', 'K', 'M', 'L', 'N', 'Q', 'P',
        'S', 'R', 'T', 'W', 'V', 'Y'
    ]
    aa_dict = {aa: [] for aa in aa_list}
    prop_dict = {
        'aromaticity': [],
        'helix': [],
        'turn': [],
        'sheet': [],
        'isoelectric_point': [],
        'gravy': []
    }
    substitutions = (('B', 'D'), ('Z', 'E'), ('J', 'L'),
                     ('X', 'G'), ('U', 'C'), ('O', 'K'))
    structure_names = ['helix', 'turn', 'sheet']

    for s in df['sequence']:
        for old, new in substitutions:
            s = s.replace(old, new)
        pa = ProteinAnalysis(s)
        prop_dict['aromaticity'].append(pa.aromaticity())
        prop_dict['isoelectric_point'].append(pa.isoelectric_point())
        prop_dict['gravy'].append(pa.gravy())
        for fraction, name in zip(pa.secondary_structure_fraction(),
                                  structure_names):
            prop_dict[name].append(fraction)
        for aa, percent in pa.get_amino_acids_percent().items():
            aa_dict[aa].append(percent)

    for columns in (aa_dict, prop_dict):
        for name, values in columns.items():
            df[name] = values
    return df
def biopython_proteinanalysis_seq(seq, scaling=False):
    """Return a flat dict of ProteinAnalysis-derived features for ``seq``.

    The dict holds flexibility statistics, GRAVY, instability index,
    isoelectric point, both molar extinction coefficients, molecular
    weight, the three secondary structure fractions, one ``percent:<aa>``
    entry per amino acid and one ``prop_res_<name>`` entry per group in
    ``property_residues`` (the summed percentages of the group's residues).

    ``scaling`` is accepted but not used by this function.
    """
    analysis = ProteinAnalysis(seq)
    features = {}

    flexibility = np.array(analysis.flexibility())
    features['flex:min'] = flexibility.min()
    features['flex:max'] = flexibility.max()
    features['flex:std'] = flexibility.std()

    features['gravy'] = analysis.gravy()
    features['instability_index'] = analysis.instability_index()
    features['isoelectric_point'] = analysis.isoelectric_point()

    reduced, cysteines = analysis.molar_extinction_coefficient()
    features['molar_extinction_coefficient_reduced'] = reduced
    features['molar_extinction_coefficient_cysteines'] = cysteines

    features['molecular_weight'] = analysis.molecular_weight()

    helix, turn, strand = analysis.secondary_structure_fraction()
    features['percent_helix_naive'] = helix
    features['percent_turn_naive'] = turn
    features['percent_strand_naive'] = strand

    percents = analysis.get_amino_acids_percent()
    for aa in sorted(percents.keys()):
        features['percent:%s' % aa] = percents[aa]

    for group, residues in list(property_residues.items()):
        features['prop_res_%s' % group] = sum(
            [percents.get(residue, 0) for residue in residues])
    return features
gravy=[]
molweight=[]
instidx=[]
flex=[]

# Collect one entry per sequence in every feature list.
for seq in sequences:
    X=ProteinAnalysis(str(seq))
    isoelectricPt.append(X.isoelectric_point())
    aromaticity.append(X.aromaticity())
    aminoPercent.append(X.get_amino_acids_percent())
    secstruct.append(X.secondary_structure_fraction())

    # These features throw Key & Value Errors due to non standard amino acids
    # (i.e. out of the 20 standard ones) e.g. X, U etc
    # All seven values are computed before anything is appended, so an
    # exception part-way through cannot leave the lists with different
    # lengths.
    try:
        seq_gravy = X.gravy()
        seq_molweight = X.molecular_weight()
        seq_instidx = X.instability_index()
        seq_flex = X.flexibility()
        seq_hydrophob = X.protein_scale(ProtParamData.kd, 9, 0.4)
        seq_hydrophil = X.protein_scale(ProtParamData.hw, 9, 0.4)
        seq_surface = X.protein_scale(ProtParamData.em, 9, 0.4)
    except (KeyError,ValueError):
        # Placeholder values for sequences that cannot be analysed.
        seq_gravy = 0
        seq_molweight = 0
        seq_instidx = 0
        seq_flex = [0,0]
        seq_hydrophob = [0,0]
        seq_hydrophil = [0,0]
        seq_surface = [0,0]

    gravy.append(seq_gravy)
    molweight.append(seq_molweight)
    instidx.append(seq_instidx)
    flex.append(seq_flex)
    hydrophob.append(seq_hydrophob)
    hydrophil.append(seq_hydrophil)
    surface.append(seq_surface)
print('done') with temppathlib.TemporaryDirectory() as tmpdir: # unzip the file with all the test PDBs with zipfile.ZipFile(args.infile, "r") as zip_: zip_.extractall(tmpdir.path) for test_pdb in tmpdir.path.glob("*.pdb"): for record in SeqIO.parse(test_pdb, "pdb-atom"): sequence = str(record.seq).replace('X', 'G') protein = ProteinAnalysis(str(sequence)) p_len.append(len(sequence)) mol_w.append(protein.molecular_weight()) iso_p.append(protein.isoelectric_point()) smell.append(protein.aromaticity()) taste_factor.append(protein.gravy()) insta_ind.append(protein.instability_index()) char_at_acid.append(protein.charge_at_pH(1)) char_at_neutral.append(protein.charge_at_pH(7)) char_at_base.append(protein.charge_at_pH(14)) helter_skeler.append(protein.secondary_structure_fraction()[0]) turnip.append(protein.secondary_structure_fraction()[1]) garfield.append(protein.secondary_structure_fraction()[2]) for x in amino_acids: n = protein.count_amino_acids()[x] for y in d_count.keys(): if y[-1] == x: d_count[y].append(n) for a in amino_acids: m = protein.get_amino_acids_percent()[a] for b in d_perc.keys():
def extract(self):
    """Build the feature matrix for every record of ``self.infile``.

    Each accepted FASTA record yields one row of 10409 values: the
    frequencies of the 7**4 side-chain-class tetramers, the frequencies of
    the 20**3 tripeptides, and eight global properties (isoelectric point,
    instability index, length, aromaticity, the two molar extinction
    coefficients, GRAVY and molecular weight).

    Records rejected by ``self.prot_check`` are reported and skipped.
    Progress is emitted over ``self.g_socketio`` when ``self.g_is_socket``
    is 1 and printed otherwise.

    Returns:
        tuple: ``(names, arr)``. ``names`` is a ``(g_total_fasta, 1)``
        object array of record names, with ``_<n>`` appended to repeated
        ids. ``arr`` is a ``(g_total_fasta, 10409)`` float array. Rows past
        the number of accepted records stay ``None`` / zero.
    """
    AA = ["A", "C", "D", "E", "F", "G", "H", "I", "K", "L",
          "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y"]
    SC = ["1", "2", "3", "4", "5", "6", "7"]
    tri_pep = [''.join(i) for i in itertools.product(AA, repeat=3)]
    tetra_sc = [''.join(i) for i in itertools.product(SC, repeat=4)]
    # Maps each residue to the digit of its side-chain class.
    trantab2 = str.maketrans("AILMVNQSTGPCHKRDEFWY", "11111222233455566777")

    total_fasta = self.g_total_fasta
    sec_code = 0
    record_current = 0
    # zeros instead of empty: rows belonging to skipped records must not
    # contain uninitialised memory. ``float`` replaces the ``numpy.float``
    # alias, which was removed in NumPy 1.24.
    arr = numpy.zeros((total_fasta, 10409), dtype=float)
    names = numpy.empty((total_fasta, 1), dtype=object)
    names_dic = dict()

    for record in SeqIO.parse(self.infile, "fasta"):
        data = (record_current / total_fasta) * 100
        if self.g_is_socket == 1:
            self.g_socketio.emit('set bar', {'data': data}, room=self.g_sid)
        else:
            print('extracting features of seq ' + str(record_current+1) + ' of ' + str(total_fasta), end='\r')
        record_current += 1

        ll = len(record.seq)
        if not self.prot_check(str(record.seq)):
            print("Warning: " + record.id + " is not a valid protein sequence")
            continue

        # Disambiguate repeated record ids with a running counter.
        if record.id in names_dic:
            seq_name = record.id + '_' + str(names_dic[record.id])
            names_dic[record.id] = names_dic[record.id] + 1
        else:
            seq_name = record.id
            names_dic[record.id] = 1

        seqq = record.seq.__str__().upper()
        # Replace the symbols handled below with a concrete residue before
        # the ProteinAnalysis calls.
        seqqq = seqq.replace('X', 'A').replace('J', 'L').replace('*', 'A').replace('Z', 'E').replace('B', 'D')
        X = ProteinAnalysis(seqqq)
        myseq = seqq.translate(trantab2)

        extinction = X.molar_extinction_coefficient()
        tt = [X.isoelectric_point(), X.instability_index(), ll,
              X.aromaticity(), extinction[0], extinction[1],
              X.gravy(), X.molecular_weight()]
        tt_n = numpy.asarray(tt, dtype=float)

        # A sequence shorter than the k-mer cannot contain it, so every
        # count is 0; clamping the divisor avoids ZeroDivisionError for
        # sequences of length 2 or 3 and leaves longer ones unchanged.
        tri_norm = max(ll - 2, 1)
        tetra_norm = max(ll - 3, 1)
        tri_pep_count_n = numpy.asarray(
            [seqq.count(i) / tri_norm for i in tri_pep], dtype=float)
        tetra_sc_count_n = numpy.asarray(
            [myseq.count(i) / tetra_norm for i in tetra_sc], dtype=float)

        cat_n = numpy.concatenate((tetra_sc_count_n, tri_pep_count_n, tt_n))
        arr[sec_code, :] = cat_n
        names[sec_code, 0] = seq_name
        sec_code += 1

    if self.g_is_socket == 1:
        self.g_socketio.emit('set bar', {'data': 100}, room=self.g_sid)
        self.g_socketio.emit('done features', 1, room=self.g_sid)
    print("\nDone")
    return (names, arr)
from Bio.SeqUtils.ProtParam import ProteinAnalysis

# Interactive helper: ask for a protein sequence, then optionally print its
# molecular weight and GRAVY, and finally its amino acid counts.
sequence_text = str(input("manual sequence from translate.py :"))
analysis = ProteinAnalysis(sequence_text)

wants_weight = str(input("detect molecular weight y/n? :"))
if wants_weight == "y":
    print(analysis.molecular_weight())

wants_gravy = str(input("detect gravy y/n? :"))
if wants_gravy == "y":
    print(analysis.gravy())

print(analysis.count_amino_acids())
# Wait for the user before the script ends.
input("enter")
# for a in pe_list.index:
#     if pe_list[a] == 'Predicted':
#         u_list.append(reading['acc. code'][a])
import gzip

# Download the FASTA file once and cache it next to the script.
if not os.path.isfile('nP20k.fasta'):
    link = 'http://www.peptideatlas.org/tmp/nP20k.fasta.gz'
    resp = requests.get(link)
    # Do not cache an HTTP error page as if it were sequence data.
    resp.raise_for_status()
    with open('nP20k.fasta', 'wb') as f_output:
        f_output.write(resp.content)

# The URL points at a gzip archive and the payload is stored unchanged, so
# the cached file may be compressed. Detect that from the gzip magic bytes
# and pick the matching opener.
with open('nP20k.fasta', 'rb') as probe:
    is_gzipped = probe.read(2) == b'\x1f\x8b'
opener = gzip.open if is_gzipped else open

info = []
# Text mode 'rt' replaces 'rU', which Python 3.11 no longer accepts.
with opener('nP20k.fasta', 'rt') as handle:
    for record in SeqIO.parse(handle, 'fasta'):
        # One table row per PE level (2, 3, 4) named in the description.
        for level in ('2', '3', '4'):
            if 'PE=' + level in record.description:
                analyzed_seq = ProteinAnalysis(str(record.seq))
                tup = (record.id, level, analyzed_seq.gravy(),
                       textwrap.fill(record.description, 20))
                info.append(tup)

print(tabulate(info, headers=['Identifier', 'PE', 'GRAVY', 'Description']))
#!/usr/bin/env python
"""Print a tab-separated table of protein properties for FASTA on stdin."""
import sys

from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis

# The seventh column holds the aromaticity value appended below; the header
# previously labelled it "monoisotpoic".
sys.stdout.write("ID\tMW\tIP\tgravy\tlength\tinstability\taromaticity\tSequence\n")

for record in SeqIO.parse(sys.stdin, "fasta"):
    a = ProteinAnalysis(str(record.seq))

    properties = [
        record.id,
        a.molecular_weight(),
        a.isoelectric_point(),
        a.gravy(),
        a.length,
        a.instability_index(),
        a.aromaticity(),
        # always last column to make the output more readable
        a.sequence,
    ]
    sys.stdout.write("\t".join(map(str, properties)) + "\n")
def get_phanns_input(fasta_list, d2vmodel):
    """Build one feature vector per record of the given FASTA files.

    Each vector concatenates, in this order: the frequencies of the 20**3
    tripeptides, the frequencies of the 7**4 side-chain-class tetramers,
    eight global properties (isoelectric point, instability index, length,
    aromaticity, the two molar extinction coefficients, GRAVY and molecular
    weight) and an embedding obtained by averaging ``d2vmodel.infer_vector``
    over the three reading frames of 3-mers.

    Args:
        fasta_list: Iterable of FASTA file paths.
        d2vmodel: Object exposing ``infer_vector(list_of_kmers)``.

    Returns:
        list: ``(feature_vector, record)`` tuples in input order.
    """
    AA = [
        "A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q",
        "R", "S", "T", "V", "W", "Y"
    ]
    SC = ["1", "2", "3", "4", "5", "6", "7"]
    tri_pep = [''.join(i) for i in itertools.product(AA, repeat=3)]
    tetra_sc = [''.join(i) for i in itertools.product(SC, repeat=4)]
    # Maps each residue to the digit of its side-chain class.
    trantab2 = str.maketrans("AILMVNQSTGPCHKRDEFWY", "11111222233455566777")
    kmer_size = 3
    vectors = []

    for file in fasta_list:
        print('####################' + file)
        this_prot = 0  # per-file counter, used only for progress output
        for record in SeqIO.parse(file, "fasta"):
            ll = len(record.seq)
            seqq = record.seq.__str__().upper()
            # Replace the symbols handled below with a concrete residue
            # before the ProteinAnalysis calls.
            seqqq = seqq.replace('X', 'A').replace('J', 'L').replace(
                '*', 'A').replace('Z', 'E').replace('B', 'D')
            X = ProteinAnalysis(seqqq)
            extinction = X.molar_extinction_coefficient()
            tt = [
                X.isoelectric_point(),
                X.instability_index(), ll,
                X.aromaticity(),
                extinction[0],
                extinction[1],
                X.gravy(),
                X.molecular_weight()
            ]
            # ``float`` replaces the ``np.float`` alias, which was removed
            # in NumPy 1.24.
            tt_n = np.asarray(tt, dtype=float)
            myseq = seqq.translate(trantab2)

            # A sequence shorter than the k-mer cannot contain it, so every
            # count is 0; clamping the divisor avoids ZeroDivisionError for
            # sequences of length 2 or 3 and leaves longer ones unchanged.
            tri_norm = max(ll - 2, 1)
            tetra_norm = max(ll - 3, 1)
            # count tripeptides
            tri_pep_count_n = np.asarray(
                [seqq.count(i) / tri_norm for i in tri_pep], dtype=float)
            # count tetra side chains
            tetra_sc_count_n = np.asarray(
                [myseq.count(i) / tetra_norm for i in tetra_sc], dtype=float)

            # get embedding vector: mean over the kmer_size reading frames
            vec = d2vmodel.infer_vector([
                seqqq[k:k + kmer_size]
                for k in range(0, len(seqqq), kmer_size)
            ])
            for s in range(1, kmer_size):
                vec = vec + d2vmodel.infer_vector([
                    seqqq[k:k + kmer_size]
                    for k in range(s, len(seqqq), kmer_size)
                ])
            vec = vec / kmer_size

            cat_n = np.concatenate(
                (tri_pep_count_n, tetra_sc_count_n, tt_n, vec))
            vectors.append((cat_n, record))
            this_prot += 1
            if this_prot % 500 == 0:
                print("processing sequence # " + str(this_prot), end="\r")
    return vectors
#!/usr/bin/env python
"""Print a tab-separated table of protein properties for FASTA on stdin."""
import sys

from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis

# The seventh column holds the aromaticity value appended below; the header
# previously labelled it "monoisotpoic".
sys.stdout.write("ID\tMW\tIP\tgravy\tlength\tinstability\taromaticity\tSequence\n")

for record in SeqIO.parse(sys.stdin, "fasta"):
    a = ProteinAnalysis(str(record.seq))

    properties = [
        record.id,
        a.molecular_weight(),
        a.isoelectric_point(),
        a.gravy(),
        a.length,
        a.instability_index(),
        a.aromaticity(),
        # always last column to make the output more readable
        a.sequence,
    ]
    sys.stdout.write("\t".join(map(str, properties)) + "\n")
# Finish the molecular-weight figure and write it to disk.
plt.title("Distribution of Protein Molecular Weights")
plt.savefig("plotMolecularWeights.pdf")
plt.clf()
# plt.show()

# Each list collects [label, value] pairs, where the label is the command
# line argument associated with the sequence set (argv[1] for sequences_a,
# argv[2] for sequences_b).
gravy_index = []
aromaticity = []
instability_index = []
# flexibility = []
isoelectric_point = []
secondary_structure_fraction = []

# "X" is removed from every sequence before it is analysed.
for protein in sequences_a:
    analysed_seq = ProteinAnalysis(str(protein.seq).replace("X", ""))
    gravy_index.append([sys.argv[1], analysed_seq.gravy()])
    aromaticity.append([sys.argv[1], analysed_seq.aromaticity()])
    instability_index.append([sys.argv[1], analysed_seq.instability_index()])
    # flexibility.append([sys.argv[1], analysed_seq.flexibility()])
    isoelectric_point.append([sys.argv[1], analysed_seq.isoelectric_point()])
    secondary_structure_fraction.append(
        [sys.argv[1], analysed_seq.secondary_structure_fraction()])

# NOTE(review): in the visible code this second loop does not append to
# secondary_structure_fraction, unlike the first one - confirm whether the
# statement continues beyond this excerpt or is missing.
for protein in sequences_b:
    analysed_seq = ProteinAnalysis(str(protein.seq).replace("X", ""))
    gravy_index.append([sys.argv[2], analysed_seq.gravy()])
    aromaticity.append([sys.argv[2], analysed_seq.aromaticity()])
    instability_index.append([sys.argv[2], analysed_seq.instability_index()])
    # flexibility.append([sys.argv[2], analysed_seq.flexibility()])
    isoelectric_point.append([sys.argv[2], analysed_seq.isoelectric_point()])
def _most_frequent_motif(sequence, motifs):
    """Return the motif from *motifs* that occurs most often in *sequence*.

    Occurrences are counted with ``str.count`` (non-overlapping). Ties go to
    the earliest motif in *motifs*; if no motif occurs at all, ``motifs[0]``
    is returned.
    """
    return max(motifs, key=sequence.count)


def main():
    """Turn ``protein_data.csv`` into a per-protein feature table.

    Reads the header-less ``protein_data.csv`` (first column: protein
    sequence, second column: class label), strips the residue codes B, U, X
    and Z from every sequence, writes the cleaned table to
    ``updated_protein_data.csv`` and finally writes ``features.csv`` with one
    row per protein: sequence length, most frequent residue and dipeptide,
    last residue, ProtParam descriptors, charge at pH 0/7/14, the count of
    each of the 20 standard residues and the class label.

    Raises:
        ValueError: if a sequence is empty after the ambiguous residue codes
            have been removed (no features can be derived from it).
    """
    aa = list("ACDEFGHIKLMNPQRSTVWY")
    # All 400 ordered residue pairs: 'AA', 'AC', ..., 'YY'.
    dipeptide = [first + second for first in aa for second in aa]

    sequences = pandas.read_csv('protein_data.csv', header=None)

    # LENGTH is measured on the raw sequence, i.e. before the ambiguous
    # residue codes are stripped below.
    lengths = [len(str(raw_sequence)) for raw_sequence in sequences[0]]

    # Remove bad amino acids from the sequences. The column is replaced as a
    # whole: element-wise chained assignment (``sequences[0][i] = ...``) is
    # not guaranteed to write through to the frame.
    bad_residues = str.maketrans('', '', 'BUXZ')
    sequences[0] = [
        raw_sequence.translate(bad_residues) for raw_sequence in sequences[0]
    ]
    sequences.to_csv('updated_protein_data.csv',
                     index_label=None,
                     header=None,
                     index=None)

    # Column order of features.csv; "LENGTH" is inserted in front afterwards.
    # The "MOLECULAR WEIGHT" and "first amino acids" columns are not emitted.
    feature_columns = [
        "most frequent aa",
        "last amino acid",
        "most frequence dipeptide",
        "aromaticity",
        "instability index",
        "isolectric point",
        "molecular extinction coefficient - reduced cysteines",
        "molecular extinction coefficient - disulfid bridges",
        "secondary structure fraction helix",
        "secondary structure fraction turn",
        "secondary structure fraction sheet",
        "gravy",
        "charge at ph 0",
        "charge at ph 7",
        "charge at ph 14",
        *aa,
        "CLASS",
    ]

    # Features are computed from the file that was just written.
    data = pandas.read_csv('updated_protein_data.csv', header=None)
    rows = []
    for position, protein in enumerate(data.itertuples()):
        sequence = protein[1]
        if not isinstance(sequence, str) or not sequence:
            raise ValueError(
                "row %d of updated_protein_data.csv has no residues left "
                "after removing B/U/X/Z" % position)

        analyzed_protein = ProteinAnalysis(sequence)
        residue_counts = analyzed_protein.count_amino_acids()
        extinction = analyzed_protein.molar_extinction_coefficient()
        structure = analyzed_protein.secondary_structure_fraction()

        row = {
            # Each search starts from scratch, so the residue result never
            # depends on the dipeptide result.
            "most frequent aa": _most_frequent_motif(sequence, aa),
            "last amino acid": sequence[-1],
            "most frequence dipeptide": _most_frequent_motif(sequence,
                                                             dipeptide),
            "aromaticity": analyzed_protein.aromaticity(),
            "instability index": analyzed_protein.instability_index(),
            "isolectric point": analyzed_protein.isoelectric_point(),
            "molecular extinction coefficient - reduced cysteines":
                extinction[0],
            "molecular extinction coefficient - disulfid bridges":
                extinction[1],
            "secondary structure fraction helix": structure[0],
            "secondary structure fraction turn": structure[1],
            "secondary structure fraction sheet": structure[2],
            "gravy": analyzed_protein.gravy(),
            "charge at ph 0": analyzed_protein.charge_at_pH(0.0),
            "charge at ph 7": analyzed_protein.charge_at_pH(7.0),
            "charge at ph 14": analyzed_protein.charge_at_pH(14.0),
        }
        for residue in aa:
            row[residue] = residue_counts.get(residue)
        row["CLASS"] = protein[2]
        rows.append(row)

    features = pandas.DataFrame(rows, columns=feature_columns)
    # Raises ValueError if the cleaned file does not have as many rows as the
    # input, instead of silently misaligning lengths and features.
    features.insert(0, "LENGTH", lengths)
    features.to_csv('features.csv', index=None)