Beispiel #1
0
def protein_analysis():
    """Show the sequence-entry form and, once accepted, compute protein statistics.

    Visitors without ``session.username`` are sent to the ``account/log_in``
    action.  When the form is accepted, the cleaned upper-cased sequence and
    every ``ProteinAnalysis`` result are stored in ``session`` and the request
    is redirected to ``protein_analysis_output``.

    Returns:
        dict: ``{'form': form}`` for the view while no form has been accepted.
    """
    # Identity comparison is the correct test for None.
    if session.username is None:
        redirect(URL(r=request, c='account', f='log_in'))
    from Bio.SeqUtils.ProtParam import ProteinAnalysis
    form = FORM(
        TABLE(
            TR(
                "Amino acid sequence:  ",
                TEXTAREA(_type="text",
                         _name="sequence",
                         requires=IS_NOT_EMPTY())),
            INPUT(_type="submit", _value="SUBMIT")))
    if form.accepts(request.vars, session):
        session['sequence'] = seqClean(form.vars.sequence.upper())
        X = ProteinAnalysis(session['sequence'])
        # Each result is stored under its own session key for the output page.
        session['aa_count'] = X.count_amino_acids()
        session['percent_aa'] = X.get_amino_acids_percent()
        session['mw'] = X.molecular_weight()
        session['aromaticity'] = X.aromaticity()
        session['instability'] = X.instability_index()
        session['flexibility'] = X.flexibility()
        session['pI'] = X.isoelectric_point()
        session['sec_struct'] = X.secondary_structure_fraction()
        redirect(URL(r=request, f='protein_analysis_output'))
    return dict(form=form)
Beispiel #2
0
def prot_feats(filename):
    """Build a feature matrix for every usable sequence in a FASTA file.

    Args:
        filename: path or handle accepted by ``SeqIO.parse(..., "fasta")``.

    Returns:
        tuple: ``(XX, ids)`` where ``XX`` is a NumPy array with one feature
        row per kept record (as produced by ``prot_feats_seq``) and ``ids``
        is the list of the matching record ids, in file order.
    """
    XX = []
    ids = []

    for rec in SeqIO.parse(filename, "fasta"):
        sequence = str(rec.seq)
        X = ProteinAnalysis(sequence)
        try:
            # molecular_weight() raises when the sequence holds a letter it
            # cannot weigh (e.g. 'X'); such records are skipped.
            X.molecular_weight()
            feats = list(prot_feats_seq(sequence))
        except Exception:
            # Best-effort: drop the record, but (unlike a bare ``except``)
            # let KeyboardInterrupt / SystemExit propagate.
            continue
        XX.append(feats)
        ids.append(rec.id)

    return np.array(XX), ids
Beispiel #3
0
def binaryFeatureTable(PosSeqFiles, NegSeqFiles):
    """Return a DataFrame of amino-acid percentages for two labelled groups.

    Every record read from ``PosSeqFiles`` is labelled ``Class`` 1 and every
    record from ``NegSeqFiles`` is labelled ``Class`` 0.  Each row also holds
    the sequence ``Length`` and the record ``ID``.  Positive rows come first.
    """
    rows = []
    # (label, files) pairs; the positive group is processed before the negative one.
    for label, paths in ((1, PosSeqFiles), (0, NegSeqFiles)):
        for path in paths:
            for rec in readfasta(path):
                row = ProteinAnalysis(str(rec.seq)).get_amino_acids_percent()
                row['Class'] = label
                row['Length'] = len(rec.seq)
                row['ID'] = rec.id
                rows.append(row)

    return pd.DataFrame(rows)
Beispiel #4
0
def find_gravy_stats(folders, outfile, condition, regex = None, frequency = False):
    """Compute the frequency-weighted mean CDR3 GRAVY score per folder.

    For each entry of ``folders`` the file ``<entry[0]>/5_AA-sequences.txt``
    is read as a tab-separated table.  Rows whose ``Functionality`` is
    ``'productive'`` and whose ``CDR3-IMGT`` satisfies ``condition`` add
    their GRAVY score to the folder mean.

    Args:
        folders: iterable of sequences whose first item is a folder path.
        outfile: output path prefix; ``<outfile>_means.txt`` receives one
            mean per line and ``<outfile>.txt`` the mean and standard
            deviation of those means.
        condition: callable applied to the ``CDR3-IMGT`` string.
        regex: pattern matched against ``Sequence ID``; group 1 is read as
            the integer frequency.  Required when ``frequency`` is true.
        frequency: weight each row by the frequency parsed from its id
            instead of counting it once.

    Raises:
        ValueError: if ``frequency`` is true but no ``regex`` was given
            (previously every row was silently skipped in that case).
    """
    if frequency:
        if regex is None:
            raise ValueError("regex is required when frequency=True")
        # Compile once instead of once per row.
        pattern = re.compile(regex)
    else:
        pattern = None

    mean_list = []
    for folder in folders:
        gravy_all = 0
        total_seqs = 0
        with open(folder[0] + '/5_AA-sequences.txt') as f:
            reader = csv.DictReader(f, delimiter = '\t')
            for row in reader:
                try:
                    if not (row['Functionality'] == 'productive'
                            and condition(row['CDR3-IMGT'])):
                        continue
                    gravy = Prot(row['CDR3-IMGT']).gravy()
                    if pattern is not None:
                        freq = int(pattern.match(row['Sequence ID']).group(1))
                    else:
                        freq = 1
                except Exception:
                    # Best-effort: a malformed row (missing column, residue
                    # the scale cannot score, id that does not match) is
                    # skipped.  ``condition`` is caller code, so the set of
                    # possible exceptions cannot be narrowed further here.
                    continue
                total_seqs += freq
                gravy_all += gravy * freq
        # A folder without any usable row contributes no mean.
        if total_seqs:
            mean_list.append(gravy_all/float(total_seqs))
            print(mean_list)
    with open(outfile + '_means.txt', 'w') as out:
        for item in mean_list:
            out.write(str(item) +'\n')
    with open(outfile + '.txt', 'w') as out:
        out.write('mean CDR3 gravy,standard deviation\n')
        out.write(str(np.mean(mean_list)) + ',' + str(np.std(mean_list)))
Beispiel #5
0
    def sample_protein(self):
        """Sample one concrete codon per tribase and summarise the result.

        For each of a tribase's positions a random integer ``r`` in 1..100 is
        drawn and the first entry whose running sum reaches ``r`` is marked
        with 1 in a 3x4 one-hot ``codon`` matrix (the weights presumably sum
        to 100 -- if they never reach ``r`` the row stays all zero).

        Returns:
            tuple: ``(codons, gc, w)`` -- the element-wise sum of the
            ``codons`` of every sampled ``Tribase``, ``GC(code)`` of the
            sampled sequence, and the molecular weight of its translation
            (0 when the weight cannot be computed).
        """
        codons = len(self.parameters.b2c.codons) * [0]
        code = ""
        for tribase in self.tribases:
            codon = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]
            for i, base in enumerate(tribase.bases):
                r = int(100 * random()) + 1
                cumsum = 0
                for j, weight in enumerate(base):
                    cumsum += weight
                    if cumsum >= r:
                        codon[i][j] = 1
                        break

            t = Tribase(codon, self.parameters.b2c)
            code += translate_triplets(codon)
            codons = [i + j for i, j in zip(codons, t.codons)]
        PA = ProteinAnalysis(translate(code))
        gc = GC(code)
        try:
            w = PA.molecular_weight()
        except (KeyError, ValueError):
            # Raised for letters without a defined weight (for example a
            # stop symbol in the translation); report 0 as before, without
            # swallowing unrelated errors the way a bare ``except`` did.
            w = 0
        return codons, gc, w
def get_biopython_features(X):
    """Return an ``(n, 6)`` array of per-sequence ProteinAnalysis features.

    Columns: molecular weight, instability index, isoelectric point, then the
    three values returned by ``secondary_structure_fraction()``.
    """
    res = np.zeros((X.shape[0], 6))
    for row, seq in enumerate(X):
        analysis = ProteinAnalysis(seq)
        weight = analysis.molecular_weight()
        instability = analysis.instability_index()
        iso_point = analysis.isoelectric_point()
        fractions = list(analysis.secondary_structure_fraction())
        res[row] = np.array([weight, instability, iso_point] + fractions)

    return res
    def test(self, positive_file, negative_file, sequence_position=10):
        """Evaluate ``self.clf`` on windows taken from two sequence files.

        Every non-header line whose character at ``sequence_position`` equals
        ``self.amino_acid`` yields one feature vector; lines from
        ``positive_file`` are labelled 1 and lines from ``negative_file`` 0.
        Negative lines containing ``X`` or ``U`` are skipped (the positive
        file is not filtered that way, as in the original code).  The pairs
        are shuffled, predicted, and passed to ``report``.

        Args:
            positive_file: path of the file with positive examples.
            negative_file: path of the file with negative examples.
            sequence_position: index of the residue of interest in each line.

        Raises:
            ValueError: if neither file yields a usable window.
        """
        window_len = 2 * self.window + 1

        def collect(path, excluded=()):
            """Return the feature vectors of all matching lines in *path*."""
            features = []
            with open(path) as handle:
                for line in handle:
                    # Skip headers and lines too short to index (previously
                    # an IndexError).
                    if ">" in line or len(line) <= sequence_position:
                        continue
                    if line[sequence_position] != self.amino_acid:
                        continue
                    if any(symbol in line for symbol in excluded):
                        continue
                    temp_window = ProteinAnalysis(
                        windower(line, sequence_position,
                                 self.window).strip("\t"))
                    features.append(featurify(temp_window, window_len))
            return features

        positives = collect(positive_file)
        negatives = collect(negative_file, excluded=("X", "U"))
        temp = ([(feat, 1) for feat in positives]
                + [(feat, 0) for feat in negatives])
        if not temp:
            # zip(*[]) cannot be unpacked; fail with a readable message.
            raise ValueError("no test windows found in the given files")
        random.shuffle(temp)
        test_features, test_labels = zip(*temp)

        test_results = self.clf.predict(test_features)
        report(results=test_results, answers=test_labels, classy=self.clf)
def get_protein_features(seq):
    """Return parallel lists of feature names and values for one sequence.

    The sequence is first passed through ``correct``.  The two extinction
    coefficients use 1490 per Tyr and 5500 per Trp; the second one adds 125
    per Cys.
    """
    cleaned = correct(seq)
    analysis = ProteinAnalysis(cleaned)
    weight = molecular_weight(cleaned)
    iso_point = analysis.isoelectric_point()
    counts = analysis.count_amino_acids()

    acidic = counts['D'] + counts['E']
    basic = counts['K'] + counts['R']
    ext_without_cys = counts['Y'] * 1490 + counts['W'] * 5500
    ext_with_cys = counts['Y'] * 1490 + counts['W'] * 5500 + counts['C'] * 125

    instability = instability_index(cleaned)
    gravy = hydrophobicity(cleaned)
    fractions = list(analysis.secondary_structure_fraction())

    names = [
        'length', 'weight', 'pI', 'neg_charged_residues',
        'pos_charged_residues', 'extinction_coeff1', 'extinction_coeff2',
        'instability_index', 'gravy', 'helix', 'turn', 'sheet'
    ]
    values = [
        len(cleaned), weight, iso_point, acidic, basic,
        ext_without_cys, ext_with_cys, instability, gravy
    ] + fractions

    return names, values
Beispiel #9
0
def feat_extract(sequences):
    """Compute a feature dictionary for each protein sequence.

    Each dictionary (a ``defaultdict(float)``) holds the sequence length,
    aromaticity, isoelectric point, the molecular weight (only when the
    sequence has no X/O/U/B), the relative frequency of every letter, a count
    per property group in ``dic_properties``, and letter frequencies over the
    first and the last 50 residues (both normalised by 50).

    Args:
        sequences: iterable of amino-acid strings.

    Returns:
        list: one feature dictionary per input sequence, in input order.
    """
    list_dict_feat = []
    for sequence in sequences:

        protein = ProteinAnalysis(sequence)
        sequence_feat = defaultdict(float)
        sequence_len = len(sequence)

        sequence_feat["sequence_length"] = sequence_len
        sequence_feat["aromaticty"] = protein.aromaticity()
        sequence_feat["isoeletric_point"] = protein.isoelectric_point()
        # The weight is only computed for sequences without these letters.
        if not any(code in sequence for code in ('X', 'O', 'U', 'B')):
            sequence_feat["molecular_weight"] = protein.molecular_weight()
        for letter in sequence:
            sequence_feat["relative_fre_{}".format(letter)] += 1 / sequence_len
            for group in dic_properties:
                if letter in dic_properties[group]:
                    sequence_feat['freq_{}'.format(group)] += 1
        for letter in sequence[:50]:
            sequence_feat["relative_fre_start{}".format(letter)] += 1 / 50
        # Last 50 residues INCLUDING the final one; the previous slice
        # [-51:-1] was off by one and dropped the C-terminal residue.
        for letter in sequence[-50:]:
            sequence_feat["relative_fre_end{}".format(letter)] += 1 / 50
        list_dict_feat.append(sequence_feat)
    return list_dict_feat
Beispiel #10
0
 def calc_isoelectric_point(self) -> float:
     """
     Compute the isoelectric point of this object's sequence.

     Uses Biopython's ProteinAnalysis
     (biopython.org/DIST/docs/api/Bio.SeqUtils.ProtParam-pysrc.html).
     :return: the isoelectric point of the sequence from ``get_seq()``
     """
     return ProteinAnalysis(self.get_seq()).isoelectric_point()
Beispiel #11
0
 def get_secondary_structure(self):
     """Return the helix, turn and sheet fractions as two-decimal strings."""
     fractions = ProteinAnalysis(self.sequence).secondary_structure_fraction()
     # The first three entries are read in the order helix, turn, sheet.
     helix, turn, sheet = ("{0:0.2f}".format(fractions[k]) for k in range(3))
     return helix, turn, sheet
Beispiel #12
0
def _toPeptide(sequence, molecule, genetic_code=1, to_stop=True):
    '''
    Private function - Takes a sequence (DNA/RNA/amino acid) and
    processes it to return a ProteinAnalysis object.

    @param sequence String: Nucleotide (DNA/RNA) or amino acid
    sequence.
    @param molecule String: Defines the type of molecule
    (case-insensitive). Three options are allowed: 'peptide' for amino
    acid sequences, 'DNA' for DNA sequences (requires transcription and
    translation), and 'RNA' for RNA sequence (requires translation).
    @param genetic_code Integer: Genetic code number to be used for
    translation. Default = 1 (Standard Code). For more information,
    see <https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi>
    @param to_stop Boolean: Flag to stop translation when first stop
    codon is encountered. Default = True.
    @return: Bio.SeqUtils.ProtParam.ProteinAnalysis object
    @raise ValueError: if molecule is not 'peptide', 'RNA' or 'DNA'
    (previously this surfaced as an UnboundLocalError).
    '''
    kind = molecule.lower()
    if kind == 'peptide':
        return ProteinAnalysis(sequence)
    if kind == 'rna':
        rna = Seq(str(sequence), generic_rna)
    elif kind == 'dna':
        # DNA is transcribed first, then translated like RNA.
        rna = Seq(str(sequence), generic_dna).transcribe()
    else:
        raise ValueError(
            "molecule must be 'peptide', 'RNA' or 'DNA', got %r" % (molecule,))
    peptide = rna.translate(genetic_code, to_stop=to_stop)
    return ProteinAnalysis(str(peptide))
Beispiel #13
0
def print_features(fasta_file, data_dict, annot):
    """Append per-record features from a FASTA file to ``data_dict``.

    Args:
        fasta_file: path or handle accepted by ``SeqIO.parse(..., "fasta")``.
        data_dict: mapping with list values under the keys ``readID``,
            ``class``, ``len``, ``orflen``, ``pI`` and ``GC%``; it is
            extended in place.
        annot: ``"coding"`` is stored as 1 and ``"noncoding"`` as 0; any
            other value is stored unchanged.

    Returns:
        The same ``data_dict`` object.
    """
    if annot == "coding":
        annot = 1
    elif annot == "noncoding":
        annot = 0

    for seq in SeqIO.parse(fasta_file, "fasta"):
        seqid = seq.id
        seqDNA = seq.seq.upper()
        seqlen = len(seqDNA)
        seqCDS, orf_integrity = FindCDS(seqDNA).longest_orf()
        # Translate the longest ORF and drop stop symbols at either end.
        Prot = PA(str(seqCDS.translate().strip("*")))
        seqProt = Prot.sequence
        orflen = len(seqProt)
        # The pI is reported as 0.0 when there is no protein to analyse.
        isoelectric_point = Prot.isoelectric_point() if orflen > 0 else 0.0
        # Guard against empty records, which used to raise ZeroDivisionError.
        if seqlen:
            gc = (seqDNA.count("G") + seqDNA.count("C")) * 100.0 / seqlen
        else:
            gc = 0.0

        data_dict["readID"].append(seqid)
        data_dict["class"].append(annot)
        data_dict["len"].append(seqlen)
        data_dict["orflen"].append(orflen)
        data_dict["pI"].append(isoelectric_point)
        data_dict["GC%"].append(gc)

    return data_dict
def my_own_filtering(input_file, output_file, filt_gc=45, filt_arom=0.01):
    """Filter nucleotide records by GC content and aromaticity of the translation.

    Records whose ``SeqUtils.GC`` value is at least ``filt_gc`` and whose
    translated protein has an aromaticity of at least ``filt_arom`` are kept.
    The translations of the kept records are written as FASTA to
    ``output_file`` and the kept fraction is printed.

    Args:
        input_file: path of the nucleotide FASTA file to read.
        output_file: path of the protein FASTA file to write (it was
            previously ignored in favour of a hard-coded ``'my_fasta'``).
        filt_gc: minimum GC value.
        filt_arom: minimum aromaticity.
    """
    sequences = {}
    c = 0

    with open(input_file, "r") as content:
        for record in SeqIO.parse(content, "fasta"):
            c += 1

            # GC content of the nucleotide sequence.
            calc_gc = SeqUtils.GC(record.seq)

            # Aromaticity of the translated protein.
            prot_seq = record.seq.translate()
            calc_arom = ProteinAnalysis(str(prot_seq)).aromaticity()

            if calc_gc >= filt_gc and calc_arom >= filt_arom:
                # Keep the translation (the original referenced the
                # nonexistent attribute ``record.se``); records sharing an
                # id overwrite each other, as before.
                sequences[record.id] = prot_seq

    records = [
        SeqRecord(seq, id=seq_id, description="")
        for seq_id, seq in sequences.items()
    ]

    # The context manager closes the file even if writing fails.
    with open(output_file, 'w') as write_file:
        SeqIO.write(records, write_file, 'fasta')

    # Fraction of records kept; an empty input reports 0.0 instead of
    # raising ZeroDivisionError.
    print(len(records) / c if c else 0.0)
Beispiel #15
0
def protparams(aa_seq, vstarts, vstops):
    """Compute parameters for sub-peptides of a polypeptide.

    The values help assess the potential of each peptide as a
    crystallization candidate.  Every ``(start, stop)`` combination with
    ``int(start) < int(stop)`` is evaluated on ``aa_seq[start:stop]``.

    Args:
        aa_seq: amino-acid sequence (string or Seq object).
        vstarts: iterable of start indices (anything ``int()`` accepts).
        vstops: iterable of stop indices (anything ``int()`` accepts).

    Returns:
        tuple: three parallel lists ``(MWs, pIs, epsilons)`` -- molecular
        weight in kDa rounded to 1 decimal, isoelectric point rounded to 1
        decimal, and the extinction coefficient divided by the molecular
        weight in Dalton, rounded to 2 decimals.
    """
    MWs = []
    pIs = []
    epsilons = []
    # Convert once instead of on every comparison and slice.
    stops = [int(stop) for stop in vstops]
    for start in vstarts:
        start = int(start)
        for stop in stops:
            if start >= stop:
                continue
            params = PA(aa_seq[start:stop])  # works with string or Seq objects
            mw_dalton = params.molecular_weight()
            MW = round(mw_dalton / 1000, 1)  # in kiloDalton, rounded to 1 decimal
            pI = round(params.isoelectric_point(), 1)
            # Formula from protparam (web.expasy.org/protparam):
            # Epsilon (Prot) = (N(Tyr)*Ext(Tyr) + N(Trp)*Ext(Trp)
            #                   + N(Cystine)*Ext(Cystine)) / MW in Dalton
            aa_dict = params.count_amino_acids()  # {'aa': count}, one-letter codes
            # Divide by the unrounded weight: the rounded kDa value loses
            # precision and can round to 0 for very short slices.
            epsilon = round((aa_dict['Y'] * 1490 + aa_dict['W'] * 5500 +
                             aa_dict['C'] * 125) / mw_dalton, 2)
            MWs.append(MW)
            pIs.append(pI)
            epsilons.append(epsilon)
    return MWs, pIs, epsilons
Beispiel #16
0
def sequence_vector(temp_window: str, window: int = 6, chemical=1):
    """
    Encode a sequence window as a list of integers, one per residue.

    The input is passed through ``clean`` and re-windowed around its midpoint
    with ``windower``.  Each of the 20 listed amino acids maps to 1..20 and
    "X" maps to 0; 0 is also used to pad the vector to ``2 * window + 1``
    entries.  When ``chemical == 1`` the GRAVY score, instability index and
    aromaticity of the window are appended.
    """
    cleaned = clean(temp_window)
    centered = windower(sequence=cleaned, position=int(len(cleaned)*.5), wing_size=window)

    # Residue codes in their numbering order: G=1, A=2, ..., T=20; X=0.
    codes = dict(zip("GALMFWKQESPVICYHRNDT", range(1, 21)))
    codes["X"] = 0

    vec = [codes[residue] for residue in centered]
    # Pad short windows with zeros (a non-positive shortfall adds nothing).
    target_len = (window*2)+1
    vec.extend([0] * (target_len - len(vec)))

    # Hydrophobicity is optional
    if chemical == 1:
        analysis = ProteinAnalysis(centered)
        vec.extend([
            analysis.gravy(),
            analysis.instability_index(),
            analysis.aromaticity(),
        ])

    return vec
def calculate_residue_features(temp_dict, sequence):
    """Add composition-weighted scale averages and residue fractions to ``temp_dict``.

    Each residue fraction of ``sequence`` is multiplied by the residue's
    value in the ``kd``, ``hw``, ``em`` and ``ja`` scales and summed.  The
    four sums and the fractions themselves are written into ``temp_dict`` in
    place; nothing is returned.
    """
    aa_percent = ProteinAnalysis(sequence).get_amino_acids_percent()

    kd_sum = 0
    hw_sum = 0
    ja_sum = 0
    em_sum = 0
    for residue, fraction in aa_percent.items():
        kd_sum += fraction * kd[residue]
        hw_sum += fraction * hw[residue]
        em_sum += fraction * em[residue]
        ja_sum += fraction * ja[residue]

    temp_dict.update({
        "Hydrophobicity": kd_sum,
        "Hydrophilicity": hw_sum,
        "Surface Fractional Probability": em_sum,
        "I2S Transfer Energy Scale": ja_sum,
    })
    temp_dict.update(aa_percent)
Beispiel #18
0
    def analyzeCleaves(self):
        """Enumerate digested peptides, allowing up to ``self.misses`` missed cleavages.

        ``self.sites`` holds 0-based cleavage indices into ``self.peptide``.
        Every fragment accepted by ``self.checkLenWeight`` is appended to
        ``self.dpeps`` as
        ``[sequence, length, molecular weight, j, left flank, right flank,
        "first-last"]`` where ``j`` is the number of skipped cleavage sites
        and the positions are 1-based.
        """

        # i iterates through the cleavage sites.
        # j iterates over missed cleavages: j cleavage site(s) are skipped when
        # the peptide is cut out between two sites.
        
        for i in range(len(self.sites)):
            end = False
            for j in range(self.misses+1):
                # Left flank: everything up to and including site i.
                l = self.peptide[:self.sites[i]+1]
                try:
                    # Right flank starts after site i+j+1; dp is the piece between
                    # site i (exclusive) and site i+j+1 (inclusive).
                    r = self.peptide[self.sites[i+j+1]+1:]
                    dp = self.peptide[self.sites[i]+1:self.sites[i+j+1]+1]
                except IndexError:
                    # Reaching this block means there is no site i+j+1, i.e. the end
                    # of the input string has been found: dp runs to the end.
                    # Set end to True to stop going through missed cleavages, no more exist.
                    r = ''
                    dp = self.peptide[self.sites[i]+1:]
                    end = True
                if i == 0:
                    # For the first site also emit the N-terminal piece, which ends at
                    # site j and has an empty left flank.
                    l = self.peptide[:self.sites[i+j]+1]
                    if self.checkLenWeight(l):
                        self.dpeps.append([l,len(l),ProteinAnalysis(str(l)).molecular_weight(),j,'',dp+r,str(1)+'-'+str(len(l))])
                # NOTE(review): when i == 0 and j > 0, l was reassigned above and now
                # extends past sites[i], so the left flank stored below overlaps dp --
                # confirm this is intended.
                if self.checkLenWeight(dp):
                    self.dpeps.append([dp,len(dp),ProteinAnalysis(str(dp)).molecular_weight(),j,l,r,str(self.sites[i]+2)+'-'+str(self.sites[i]+len(dp)+1)])
                if end:
                    break
 def normal_charge_properties(self):
     """Print the amino-acid percentages of selected in-motif regions.

     Reads the tab-separated table at ``self.train_fpi``, keeps rows with
     ``y == 0`` and concatenates the in-motif part of every sequence that
     passes all filters below; the pooled string is analysed and printed.
     """
     df = pd.read_csv(self.train_fpi, sep='\t', index_col=0)
     df = df[df['y'] == 0]
     pooled = ''
     for seq in list(df['Sequence']):
         ms = motif_seq.LcSeq(seq, self.k, self.lca, 'lca')
         in_seq, out_seq = ms.seq_in_motif()
         in_kmer, out_kmer = ms.overlapping_kmer_in_motif()
         # Guard clauses, evaluated in the same order as the nested checks.
         if len(in_kmer) <= 20:
             continue
         ka = kappa.KappaKmers(out_kmer, out_seq)
         delta = ka.deltaForm()
         if not (ka.NCPR() > -0.1 and ka.NCPR() < 0.1):
             continue
         if not delta < 0.1:
             continue
         score = norm_score.NormScore().lc_norm_score([seq])[0]
         if not score > 20:
             continue
         if ka.FCR() < 0.2:
             pooled += in_seq
     aa_perc = ProteinAnalysis(pooled).get_amino_acids_percent()
     print(aa_perc)
Beispiel #20
0
def aa_comp_calc():
    """Write the amino-acid composition of every input record to a TSV file.

    Reads ``args.input`` in format ``args.in_format`` and writes
    ``<args.output>/aa_comp.tsv`` with one row per record: the record id
    followed by the fraction of each of the 20 amino acids.  Fractions are
    relative to the sequence length after removing ``-``, ``X`` and ``*``.
    """
    peptides = [
        'A', 'G', 'P', 'S', 'T', 'C', 'F', 'W', 'Y', 'H', 'R', 'K', 'M', 'I',
        'L', 'V', 'N', 'D', 'E', 'Q'
    ]
    # makedirs also creates missing parents and tolerates an existing
    # directory (no check-then-create race).
    os.makedirs(args.output, exist_ok=True)
    with open(args.input, 'r') as infile, open(f'{args.output}/aa_comp.tsv',
                                               'w') as outfile:
        outfile.write('Taxon\t' + '\t'.join(peptides) + '\n')

        for record in SeqIO.parse(infile, format=args.in_format):
            sequence = str(record.seq)
            count_dict = ProteinAnalysis(sequence).count_amino_acids()
            length = len(
                sequence.replace("-", "").replace("X", "").replace("*", ""))

            if length:
                fields = [
                    f'{float(count_dict[pep]) / length}'
                    if pep in count_dict else '0'
                    for pep in peptides
                ]
            else:
                # No countable residues (e.g. an all-gap sequence): report
                # zeros instead of raising ZeroDivisionError.
                fields = ['0'] * len(peptides)

            outfile.write(f'{record.id}\t' + '\t'.join(fields) + '\n')
Beispiel #21
0
def prot_feats_seq(seq):
    """Return the concatenated, L2-normalised composition features of ``seq``.

    The vector consists of three parts, each normalised on its own: the
    fractions of the 20 amino acids, the values from ``twomerFromSeq`` and
    the values from ``threemerFromSeq``.
    """
    residues = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
                'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
    sequence = str(seq)

    analysis = ProteinAnalysis(sequence)
    # Raises if the sequence contains 'X'; callers skip such sequences.
    analysis.molecular_weight()
    percent = analysis.get_amino_acids_percent()

    def unit_row(values):
        """L2-normalise *values* as a single row and return that row."""
        row = np.atleast_2d(np.array(values))
        return normalize(row, norm='l2', copy=True, axis=1, return_norm=False)[0]

    features = []
    features.extend(unit_row([percent[residue] for residue in residues]))
    features.extend(unit_row(twomerFromSeq(sequence)))
    features.extend(unit_row(threemerFromSeq(sequence)))

    return np.array(features)
Beispiel #22
0
def percentages_from_proteins(path):
    """Collect amino-acid percentages and annotations from a GenBank file.

    One entry is produced per feature of every record.  Note that the
    percentages are computed from the whole ``record.seq`` for each feature,
    not from the feature's own span.

    Args:
        path: path of the GenBank file.

    Returns:
        tuple: ``(percents, names, sources, descriptions, taxonomies,
        keywords, organisms, percent_dicts)`` where ``percents`` is an
        ``(n, 20)`` array with columns in the alphabetical one-letter order
        listed below and the remaining items are parallel lists.
    """
    names_list = []
    sequence_list = []
    sources_list = []
    desc_list = []
    taxo_list = []
    keyw_list = []
    taxid_list = []
    # The context manager closes the file, which the original never did.
    with open(path) as handle:
        for record in parse(handle, "genbank"):
            for cdsnum, _feature in enumerate(record.features):
                analysed_seq = ProteinAnalysis(str(record.seq))
                sequence_list.append(analysed_seq.get_amino_acids_percent())
                names_list.append(str(record.name) + "_CDS#" + str(cdsnum))
                sources_list.append(record.annotations['source'])
                keyw_list.append(record.annotations['keywords'])
                taxo_list.append(record.annotations['taxonomy'])
                desc_list.append(record.description)
                taxid_list.append(record.annotations["organism"])

    # Convert the list of dictionaries to a NumPy array.
    aas = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
           'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
    percents = np.zeros((len(sequence_list), 20))
    for i, percdict in enumerate(sequence_list):
        percents[i] = [percdict[aa] for aa in aas]
    return percents, names_list, sources_list, desc_list, taxo_list, keyw_list, taxid_list, sequence_list
Beispiel #23
0
def seq_properties(file_path):
    """Summarise the amino-acid composition of a single-record FASTA file by group.

    Args:
        file_path [str]: Path of the FASTA file (must hold exactly one record)

    Returns:
        total_percent_dict [dict]: Result of ``count_prop`` for each residue
        group, computed from the sequence's amino-acid percentages
    """
    record = SeqIO.read(file_path, 'fasta')
    percent = ProteinAnalysis(str(record.seq)).get_amino_acids_percent()

    # (output key, residue group) pairs in output order.
    groups = (
        ("Acidic", acidic_aa),
        ("Basic", basic_aa),
        ("Hydroxilic", hydroxylic_aa),
        ("Amidic", amidic_aa),
        ("Aliphatic", aliphatic_aa),
        ("Aromatic", aromatic_aa),
    )
    return {label: count_prop(percent, members) for label, members in groups}
 def transform(self, X):
     """Return an ``(len(X), len(VALID_AMINO_ACIDS))`` array of residue fractions.

     Row ``i`` holds, for each entry of ``VALID_AMINO_ACIDS``, its fraction
     in ``str(X[i])``; amino acids missing from the analysis result get 0.0.
     """
     vec = np.zeros((len(X), len(VALID_AMINO_ACIDS)))
     for i in range(len(X)):
         # Compute the composition once per sequence rather than once per
         # amino acid.
         percent = ProteinAnalysis(str(X[i])).get_amino_acids_percent()
         for j, a in enumerate(VALID_AMINO_ACIDS):
             vec[i, j] = percent.get(a, 0.0)
     return vec
Beispiel #25
0
def aa_composition(seq):
    """Return the residue counts of ``seq`` as ``'<letter>:\\t<count>'`` strings.

    The list follows a fixed residue order and ends with one empty string
    (the original string-split implementation produced it, so it is kept).
    """
    counts = ProteinAnalysis(seq).count_amino_acids()

    # Output order of the residues.
    order = "ACEDGFIHKMLNQPSRTWVY"
    aacomp = ['%s:\t%i' % (residue, counts[residue]) for residue in order]
    # Trailing empty element, see docstring.
    aacomp.append('')
    return aacomp
Beispiel #26
0
def getProps(f):
    """
    Return (molecular weight, maximum flexibility, summed ASA) for a structure.

    The structure is loaded with ``myPDB.loader``; weight and flexibility
    come from Biopython's ProteinAnalysis of its ``seq``.
    """
    loaded = myPDB.loader(f)
    analysis = ProteinAnalysis(loaded.seq)
    weight = analysis.molecular_weight()
    peak_flexibility = np.max(analysis.flexibility())
    total_asa = np.sum(loaded.ASA)
    return weight, peak_flexibility, total_asa
def analyze(seq, name):
    """Print ``name`` followed by the pI and amino-acid percentages of ``seq``."""
    analysis = ProteinAnalysis(seq)
    print(name)
    # Each value is computed right before it is printed, after its label.
    reports = (
        ("pI: ", analysis.isoelectric_point),
        ("AA percent: ", analysis.get_amino_acids_percent),
    )
    for label, compute in reports:
        print(label)
        print(compute())
Beispiel #28
0
def download(filelist: list, q: Queue, lock: Lock, cursor: sqlite3.Cursor, conn: sqlite3.Connection, dir_name: str):
    """
    Download PDB entries, summarise them and store one row per entry.

    For every id the ``pdb<id>.ent`` file is fetched into ``dir_name/<id>``,
    parsed, and a row with header data, compound/chain/residue counts, the
    summed molecular weight and EC numbers is inserted into ``Structures``.

    :param filelist: PDB ids to process; ids repeated in the list are
        handled once
    :param q: queue that receives each successfully stored id and a final
        ``None`` once the list is exhausted
    :param lock: lock held while the database is written
    :param cursor: cursor used for the INSERT
    :param conn: connection committed after each successful INSERT
    :param dir_name: directory under which one sub-directory per id is created
    """
    status_path = 'status_tmp.txt'
    # Parameterized statement: header text comes from downloaded files and
    # must never be spliced into the SQL string.
    insert_sql = (
        "INSERT INTO Structures (IDPDB, NAME, HEAD, METHOD, RESOLUTION, "
        "NCOMP, NCHAIN, NRES, MMASS, EC) "
        "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
    )
    with open(status_path, 'w') as f:
        f.write('')
    # Ids already handled in this run.  The original compared against
    # readlines() output, whose trailing newlines meant nothing ever matched
    # (and the handle was leaked).
    processed = set()
    pdbl = PDBList()
    ppb = PPBuilder()
    for file in filelist:
        if file in processed:
            continue
        target_dir = os.path.join(dir_name, file)
        pdb_path = os.path.join(target_dir, 'pdb{:s}.ent'.format(file))
        pdbl.retrieve_pdb_file(file, pdir=target_dir, file_format='pdb')
        if not os.path.exists(pdb_path):
            print("File with ID PDB: {:s} not found!".format(file))
            continue
        parser = PDBParser()
        # Use the real id as structure id (it was the literal '{:s}').
        structure = parser.get_structure(file, pdb_path)
        name = parser.header.get('name', '')
        head = parser.header.get('head', '')
        method = parser.header.get('structure_method', '')
        res = parser.header.get('resolution', '')
        try:
            resolution = round(float(res), 2)
        except (TypeError, ValueError):
            # Missing resolution: store NULL instead of crashing on the
            # number format as before.
            resolution = None
        ncomp = 0
        nchain = 0
        eclist = []
        for values in parser.header['compound'].values():
            ncomp += 1
            nchain += len(values['chain'].split(','))
            eclist.append(values.get('ec', '') or values.get('ec_number', ''))
        ec = ", ".join(eclist)
        nres = 0
        mmass = 0
        for pp in ppb.build_peptides(structure):
            seq = pp.get_sequence()
            nres += len(seq)
            mmass += int(ProteinAnalysis(str(seq)).molecular_weight())
        try:
            with lock:
                try:
                    cursor.execute(insert_sql, (
                        file, name, head, method, resolution,
                        ncomp, nchain, nres, mmass, ec))
                except sqlite3.DatabaseError as err:
                    print("Error: ", err)
                else:
                    print("Download Done for ID PDB: {:s}".format(file))
                    conn.commit()
                    q.put(file)
        finally:
            # Record the id whether or not the insert succeeded, as before.
            processed.add(file)
            with open(status_path, 'at') as f:
                f.write(file + '\n')
    os.remove(status_path)
    q.put(None)
Beispiel #29
0
 def get_gravy_list(self):
     """Return the normalised GRAVY scores of every sequence in ``self.df.index``.

     Each score is formatted with six decimals (as a string) before the
     array is handed to ``self.normalize``.
     """
     formatted = [
         "{:.6f}".format(ProteinAnalysis(entry).gravy())
         for entry in self.df.index
     ]
     return self.normalize(np.array(formatted))
def get_aa_percentage_vectors(X):
    """Return an ``(n, 20)`` array of amino-acid fractions, one row per sequence.

    Columns follow the order of the module-level ``aas`` so every row uses
    the same layout.
    """
    n_rows = X.shape[0]
    res = np.zeros((n_rows, 20))
    for row, seq in enumerate(X):
        percent = pd.Series(ProteinAnalysis(seq).get_amino_acids_percent())
        # Select by label to fix the column order.
        res[row] = percent[aas].values
    return res
Beispiel #31
0
 def test_alternative_weights(self):
     """Test Lanthipeptide.alt_weights

     The expected value is the average-mass weight of the core minus 18.02.
     """
     core = "MAGICHATS"
     self.lant.core = core
     expected = ProteinAnalysis(core, monoisotopic=False).molecular_weight()
     # One Ser/Thr is assumed to be dehydrated, but not the other
     expected -= 18.02
     self.assertEqual([expected], self.lant.alternative_weights)
Beispiel #32
0
def getMF(subSeq):
    """Return the molecular formula of a peptide as ``"C..H..N..O..S.."``.

    The atoms of the free amino acids are summed using the residue counts
    from ProteinAnalysis, then one water (2 H, 1 O) is removed per peptide
    bond, where the number of bonds is the counted residues minus one.
    """
    # Atom counts of each free amino acid as (C, H, N, O, S).
    atoms_by_residue = {
        'A': (3, 7, 1, 2, 0),
        'R': (6, 14, 4, 2, 0),
        'N': (4, 8, 2, 3, 0),
        'D': (4, 7, 1, 4, 0),
        'C': (3, 7, 1, 2, 1),
        'Q': (5, 10, 2, 3, 0),
        'E': (5, 9, 1, 4, 0),
        'G': (2, 5, 1, 2, 0),
        'H': (6, 9, 3, 2, 0),
        'I': (6, 13, 1, 2, 0),
        'L': (6, 13, 1, 2, 0),
        'K': (6, 14, 2, 2, 0),
        'M': (5, 11, 1, 2, 1),
        'F': (9, 11, 1, 2, 0),
        'P': (5, 9, 1, 2, 0),
        'S': (3, 7, 1, 3, 0),
        'T': (4, 9, 1, 3, 0),
        'W': (11, 12, 2, 2, 0),
        'Y': (9, 11, 1, 3, 0),
        'V': (5, 11, 1, 2, 0),
    }

    residue_counts = ProteinAnalysis(subSeq).count_amino_acids()

    carbon = hydrogen = nitrogen = oxygen = sulfur = 0
    residues = 0
    for residue, count in residue_counts.items():
        atoms = atoms_by_residue.get(residue)
        if atoms is None:
            # Residues without an entry in the table are ignored.
            continue
        residues += count
        carbon += count * atoms[0]
        hydrogen += count * atoms[1]
        nitrogen += count * atoms[2]
        oxygen += count * atoms[3]
        sulfur += count * atoms[4]

    # Correct the totals for the water lost at each peptide bond.
    bonds = residues - 1
    hydrogen -= bonds * 2
    oxygen -= bonds
    return ("C" + str(carbon) + "H" + str(hydrogen) + "N" + str(nitrogen)
            + "O" + str(oxygen) + "S" + str(sulfur))
Beispiel #33
0
def generate_plot(key, my_seq):
    """Plot a sliding-window score profile of ``my_seq`` with its detected peaks.

    The profile comes from ``protein_scale`` using the module-level
    ``amino_acids`` scale with a window of 21 and ``edge=0.75``.  Peaks from
    ``peakdetect`` are drawn as red and green markers and the figure is
    saved to ``figs/<key>.png``.

    Args:
        key: label used in the legend and in the output file name.
        my_seq: amino-acid sequence to analyse.
    """
    analysed_seq = ProteinAnalysis(my_seq)

    window_size = 21
    # Floor division keeps these integers on Python 3 as well; plain ``/``
    # produced floats there, which broke range(), list indexing and the
    # ``lookahead`` argument.
    half_window = (window_size + 1) // 2

    scale = analysed_seq.protein_scale(param_dict=amino_acids, window=window_size, edge=0.75)

    # Sequence positions of the window centres.
    x = list(range(half_window, len(scale) + half_window))

    lookahead = 7
    minp, maxp = peakdetect(scale, lookahead=(lookahead + 1) // 2)

    start = min(x) - 1

    xpeaks = [xp[0] + half_window for xp in minp]
    ypeaks = [scale[xpi - half_window] for xpi in xpeaks]

    t_x = np.array(scale)
    added_min = np.where(t_x < 0.9)[0]

    print(added_min)

    xdpeaks = [xdp[0] + half_window for xdp in maxp]
    ydpeaks = [scale[xdpi - half_window] for xdpi in xdpeaks]

    num_pos = np.where(np.array(ydpeaks) < 0.9)[0].size
    print(num_pos)
    if num_pos == 0 and len(added_min) != 0:
        # No detected peak lies below 0.9 although some scores do: add one
        # of them by hand.
        added_val = [scale[i] for i in list(added_min)]
        # NOTE(review): the "- start + 2" offset is kept unchanged; it can
        # yield a negative or out-of-range index -- confirm the intent.
        minimum = added_val.index(min(added_val)) - start + 2
        print(added_min[minimum])
        print(added_val[minimum])
        xdpeaks.append(added_min[minimum])
        ydpeaks.append(added_val[minimum])

    print("maxs:", np.array(xpeaks) + start)
    print("mins:", np.array(xdpeaks) + start)
    plt.clf()
    plt.plot(x, scale, 'b', xpeaks, ypeaks, 'ro', xdpeaks, ydpeaks, 'go')
    plt.grid(True)
    plt.legend(['Scores for ' + key])
    plt.xlabel('Position')
    plt.ylabel('Score')
    plt.savefig('figs/' + key + '.png')
Beispiel #34
0
def properties(toxin_faa,antitoxin_faa,out):
    """Write a per-locus table of pI, weight and instability for gene pairs.

    Both FASTA files are parsed; every record is assigned to a locus by
    ``getNameAndPosition`` and its properties are computed with
    ``ProteinAnalysis``.  Loci are then passed through ``orderPairs`` and
    every locus holding exactly two genes is written as one tab-separated
    line to ``<out>.properties.txt``.

    :param toxin_faa: path to the first protein FASTA file.
    :param antitoxin_faa: path to the second protein FASTA file.
    :param out: prefix of the output file name.
    :return: the name of the file that was written.
    """
    from collections import defaultdict
    from Bio import SeqIO

    # {locus: [{'start': ..., 'pI': ..., 'weight': ..., 'instability': ...}, ...]}
    loci = defaultdict(list)
    for fasta_path in [toxin_faa, antitoxin_faa]:
        with open(fasta_path, 'rU') as handle:
            for record in SeqIO.parse(handle, 'fasta'):
                locus, start = getNameAndPosition(record)
                if not start:
                    continue
                aaseq = str(record.seq).strip("*")
                if "*" in aaseq or "X" in aaseq:
                    # Internal stop or unknown residue: keep the gene but
                    # flag its properties as missing with zeros.
                    entry = {'start': start,
                             'pI': 0,
                             'weight': 0,
                             'instability': 0}
                else:
                    data = ProteinAnalysis(aaseq)
                    entry = {'start': start,
                             'pI': data.isoelectric_point(),
                             'weight': data.molecular_weight(),
                             'instability': data.instability_index()}
                loci[locus].append(entry)

    # Order the genes of each locus by position.
    loci = orderPairs(loci)

    # Write the table.
    outfile = ".".join([out, "properties", "txt"])
    columns = ["locus",
               "gene1_pI", "gene2_pI",
               "gene1_weight", "gene2_weight",
               "gene1_instability", "gene2_instability"]
    with open(outfile, 'w') as o:
        o.write("#" + "\t".join(columns).upper() + "\n")
        for locus, gene in loci.iteritems():
            # Only complete pairs are reported.
            if len(gene) != 2:
                continue
            fields = [locus]
            for prop in ('pI', 'weight', 'instability'):
                fields.append(gene[0][prop])
                fields.append(gene[1][prop])
            o.write("\t".join([str(field) for field in fields]) + "\n")
    return outfile
Beispiel #35
0
def _numbered_sequence_div(sequence):
    """Render ``sequence`` as a monospace DIV with position numbers.

    Residues are separated into groups of 10 and lines of 60.  Each line
    starts with the 1-based position of its first residue; a line break is
    preceded by the position count reached so far.  Every residue is a SPAN
    whose title is its 1-based position.
    """
    seq_div=DIV(_style='font-family:monospace',_class='raw-sequence')
    # Width of the widest position number plus one separating space.
    spacer=len(str(len(sequence)))+1
    for i,pos in enumerate(sequence):
        if i==0:
            seq_div.append(XML((str(i+1)+' ').rjust(spacer).replace(' ','&nbsp;')))
        if i%10==0 and i!=0:
            seq_div.append(' ')
        if i%60==0 and i!=0:
            seq_div.append(XML((str(i)).ljust(spacer).replace(' ','&nbsp;')))
            seq_div.append(BR())
            seq_div.append(XML((str(i+1)+' ').rjust(spacer).replace(' ','&nbsp;')))
        seq_div.append(SPAN(pos,_class='seq-position',_title = i+1))
    return seq_div


def draw_sequence(sequence, mode = 'simple', alphabet = None):
    """Build the HTML helper tree used to display ``sequence``.

    :param sequence: the sequence to render.
    :param mode: ``'simple'`` returns only the numbered sequence DIV;
        ``'protparams'`` returns a DIV holding the numbered sequence plus a
        small table with length, molecular weight and isoelectric point.
    :param alphabet: accepted for interface compatibility; not used.
    :return: the rendered DIV, or ``None`` for any other ``mode``.
    """
    if mode == 'simple':
        return _numbered_sequence_div(sequence)

    if mode == 'protparams':
        from Bio.SeqUtils.ProtParam import ProteinAnalysis
        returndiv = DIV()
        returndiv.append(_numbered_sequence_div(sequence))
        returndiv.append(H3('Protein Parameters'))
        params_table = TABLE(_style= "width:200px;")

        protpar=ProteinAnalysis(sequence)
        params_table.append(TR(SPAN('Length:',_class = 'line-header'), '%i aa'%len(sequence)))
        # MW and pI are best-effort: sequences with residues Biopython cannot
        # weigh simply get no row.  Older Biopython raises KeyError for such
        # residues, newer releases raise ValueError, so both are tolerated.
        try:
            params_table.append(TR(SPAN('MW:',_class = 'line-header'), '%i KDa'%round(protpar.molecular_weight()/1000,0)))
        except (KeyError, ValueError):
            pass
        try:
            params_table.append(TR(SPAN('pI:',_class = 'line-header'), '%1.2f'%protpar.isoelectric_point()))
        except (KeyError, ValueError):
            pass
        returndiv.append(params_table)
        return returndiv

    return None
def main():
    """Write the isoelectric point of every protein in a FASTA file to TSV.

    Reads the hard-coded input FASTA file, computes the isoelectric point of
    each record with ``ProteinAnalysis`` and writes one ``id, pI`` row per
    record (tab-separated, no header row) to the hard-coded output path.
    """
    inputfile = "/isi/olga/xin/Halophile_project/output/20160421/SS37_aa.faa"
    outputfile = "/isi/olga/xin/Halophile_project/output/20160421/SS37_reads_isp.txt"
    seqid = []
    ieps = []
    # The context manager closes the input handle even if parsing or the
    # analysis of a record raises; the original left it open.
    with open(inputfile, 'rU') as f:
        for record in SeqIO.parse(f, "fasta"):
            seqid.append(record.id)
            seq_pa = ProteinAnalysis(str(record.seq))
            ieps.append(seq_pa.isoelectric_point())
    read_ieps = np.column_stack((seqid, ieps))
    df = pd.DataFrame(read_ieps)
    df.to_csv(outputfile, sep = '\t', header = False)
Beispiel #37
0
def getMW_average(subSeq):
    """Return the average-mass molecular weight of the peptide ``subSeq``.

    The weight is the sum of the average residue masses of all amino acids
    counted by ``ProteinAnalysis.count_amino_acids`` plus one water
    (18.015) for the free termini.  Counted keys that are not in the
    residue table are ignored.

    :param subSeq: amino-acid sequence as a string.
    :return: molecular weight as a float.
    """
    waterMass = 18.015

    # Average residue masses (amino acid minus one water).
    dictOfAmino = {'A':71.0788,
                   'R':156.1875,
                   'N':114.1038,
                   'D':115.0886,
                   'C':103.1388,
                   'Q':128.1307,
                   'E':129.1155,
                   'G':57.0519,
                   'H':137.1411,
                   'I':113.1594,
                   'L':113.1594,
                   'K':128.1741,
                   'M':131.1926,
                   'F':147.1766,
                   'P':97.1167,
                   'S':87.0782,
                   'T':101.1051,
                   'W':186.2132,
                   'Y':163.1760,
                   'V':99.1326}

    counts = ProteinAnalysis(subSeq).count_amino_acids()

    # Iterate in the order of the count table so the floating-point sum is
    # accumulated exactly as before.
    molecularWeight = 0.0
    for residue in counts:
        if residue in dictOfAmino:
            molecularWeight = molecularWeight + (counts[residue]*dictOfAmino[residue])

    return molecularWeight + waterMass
Beispiel #38
0
def getMW_mono(subSeq):
    """Return the monoisotopic molecular weight of the peptide ``subSeq``.

    The weight is the sum of the monoisotopic residue masses of all amino
    acids counted by ``ProteinAnalysis.count_amino_acids`` plus one water
    for the free termini.  Counted keys that are not in the residue table
    are ignored.

    :param subSeq: amino-acid sequence as a string.
    :return: monoisotopic molecular weight as a float.
    """
    # Monoisotopic mass of H2O (2 * 1.007825 + 15.994915).  The previous
    # value, 18.015, is the *average* mass of water and does not belong
    # with the monoisotopic residue masses below.
    waterMass = 18.010565

    # Monoisotopic residue masses (amino acid minus one water).
    dictOfAmino = {'A':71.03711,
                   'R':156.10111,
                   'N':114.04293,
                   'D':115.02694,
                   'C':103.00919,
                   'Q':128.05858,
                   'E':129.04259,
                   'G':57.02146,
                   'H':137.05891,
                   'I':113.08406,
                   'L':113.08406,
                   'K':128.09496,
                   'M':131.04049,
                   'F':147.06841,
                   'P':97.05276,
                   'S':87.03203,
                   'T':101.04768,
                   'W':186.07931,
                   'Y':163.06333,
                   'V':99.06841}

    counts = ProteinAnalysis(subSeq).count_amino_acids()

    molecularWeight = 0.0
    for residue in counts:
        if residue in dictOfAmino:
            molecularWeight = molecularWeight + (counts[residue]*dictOfAmino[residue])

    return molecularWeight + waterMass
Beispiel #39
0
def protParam(seq):
    """Print a summary of ProtParam properties for the sequence ``seq``.

    All properties are computed first, then printed: amino-acid counts and
    fractions, molecular weight, GRAVY, isoelectric point, aromaticity and
    an extinction coefficient derived from the W, Y and C counts.

    :param seq: amino-acid sequence as a string.
    """
    analysis = ProteinAnalysis(seq)
    mw = analysis.molecular_weight()
    counts = analysis.count_amino_acids()
    fractions = analysis.get_amino_acids_percent()
    gravy = analysis.gravy()
    aromaticity = analysis.aromaticity()
    isoelectric_point = analysis.isoelectric_point()

    # Per-residue contributions to the extinction coefficient.
    ext_coeff = counts["W"]*5690 + counts["Y"]*1280 + counts["C"]*120
    # Extinction coefficient per unit mass; computed but not reported.
    mgml = ext_coeff * (1./mw)

    for title, table in (("Amino acid count", counts),
                         ("Amino acid percent", fractions)):
        print(title)
        pprint.pprint(table)

    print("Molecular weight")
    print("%f Da"%mw)
    for title, value in (("Gravy", gravy),
                         ("Isoelectric point", isoelectric_point),
                         ("Aromaticity", aromaticity)):
        print(title)
        print(value)
    print("Extinction coefficient: %d M-1cm-1 (Assuming reduced)"%ext_coeff)
    print("")
def get_protein_analysis(aa):
    """Return a flat feature list for the amino-acid sequence ``aa``.

    The list holds, in order: molecular weight, aromaticity, instability
    index, isoelectric point, GRAVY, followed by the values returned by
    ``secondary_structure_fraction()``.
    """
    analysis = ProteinAnalysis(aa)
    features = [
        analysis.molecular_weight(),
        analysis.aromaticity(),
        analysis.instability_index(),
        analysis.isoelectric_point(),
        analysis.gravy(),
    ]
    features.extend(analysis.secondary_structure_fraction())
    return features
Beispiel #41
0
	def analyzeAMP(self):
		"""Compute and return the summary properties of this peptide.

		Runs ``netcharge()`` and ``hphobFract()``, analyses ``self.seq`` with
		``ProteinAnalysis`` and stores the result in ``self.data``.

		:return: dict with keys ``'charge'``, ``'length'``, ``'hydrophobic'``
			and ``'aminoacids'`` (the amino-acid fractions of the sequence).
		"""
		# Indentation uses tabs only: the original mixed tabs and spaces,
		# which Python 3 rejects with a TabError.
		from Bio.SeqUtils.ProtParam import ProteinAnalysis

		self.netcharge()
		self.hphobFract()
		self.pepParam = ProteinAnalysis(self.seq)

		self.data = {'charge': self.net,
				'length': self.length,
				'hydrophobic':self.hpf,
				'aminoacids': self.pepParam.get_amino_acids_percent()}

		return self.data
Beispiel #42
0
def protein_analysis():
    """Controller: show the protein-analysis form and process a submission.

    Visitors without ``session.username`` are redirected to the log-in
    action.  When the form is accepted, the cleaned, upper-cased sequence is
    analysed with ``ProteinAnalysis``, every result is stored in the session
    and the request is redirected to ``protein_analysis_output``.

    :return: ``dict(form=form)`` for the view when nothing was submitted.
    """
    # Identity test is the idiomatic (and unambiguous) check for None.
    if session.username is None:
        redirect(URL(r=request,f='../account/log_in'))
    from Bio.SeqUtils.ProtParam import ProteinAnalysis
    form = FORM(TABLE(
            TR("Amino acid sequence:  ",
               TEXTAREA(_type="text", _name="sequence",
                        requires=IS_NOT_EMPTY())),
            INPUT(_type="submit", _value="SUBMIT")))
    if form.accepts(request.vars,session):
        session['sequence'] = seqClean(form.vars.sequence.upper())
        X = ProteinAnalysis(session['sequence'])
        # Results are kept in the session so the output action can render
        # them after the redirect.
        session['aa_count'] = X.count_amino_acids()
        session['percent_aa'] = X.get_amino_acids_percent()
        session['mw'] = X.molecular_weight()
        session['aromaticity'] = X.aromaticity()
        session['instability'] = X.instability_index()
        session['flexibility'] = X.flexibility()
        session['pI'] = X.isoelectric_point()
        session['sec_struct'] = X.secondary_structure_fraction()
        redirect(URL(r=request, f='protein_analysis_output'))
    return dict(form=form)
Beispiel #43
0
def main():  # program that asks for a protein FASTA file and reports its parameters
    """Read a FASTA path from stdin, then print and save the protein's parameters.

    The sequence returned by ``read_fasta`` is analysed with
    ``ProteinAnalysis``.  Molecular weight, amino-acid counts and the
    isoelectric point are printed and also written, one per line, to
    ``Valgu_parameetrid.txt`` in the current directory.
    """
    fasta = input()
    sequence = read_fasta(fasta)
    print(sequence)
    analysed_seq = ProteinAnalysis(str(sequence))

    # Each value is computed once and reused for both the console and the file.
    molecular_weight = analysed_seq.molecular_weight()
    print("\n","Molekulaarmass:",molecular_weight)
    amino_acid_count = analysed_seq.count_amino_acids()
    print("\n","Aminohapete arv:",amino_acid_count)
    isoelectric_point = analysed_seq.isoelectric_point()
    print("\n","Isoelektriline punkt:",isoelectric_point)

    # The context manager closes the file even if a write fails.
    with open("Valgu_parameetrid.txt", "w") as text_file:
        text_file.write(str(molecular_weight))
        text_file.write("\n")
        text_file.write(str(amino_acid_count))
        text_file.write("\n")
        text_file.write(str(isoelectric_point))
Beispiel #44
0
    def __init__(self, sequence):
        """Compute and store all descriptors of the peptide ``sequence``.

        Values come from three sources: Biopython's ``ProteinAnalysis``,
        ``calculate_*`` helpers defined elsewhere, and expressions evaluated
        through ``r(...)``.

        :param sequence: amino-acid sequence as a string.
        """
        self.sequence = sequence
        self.sequence_length = len(sequence)
        analysis = ProteinAnalysis(sequence)

        self.amino_acid_percents = analysis.get_amino_acids_percent()
        self.amino_acids_composition = calculate_amino_acids_composition(sequence)
        self.aromaticity = analysis.aromaticity()
        self.instability = analysis.instability_index()
        self.flexibility = calculate_flexibility(sequence)
        # Each entry pairs a display name with a per-residue scale dictionary
        # defined elsewhere in the module.
        protein_scale_parameters = [{'name': 'Hydrophilicity', 'dictionary': hw},
                                    {'name': 'Surface accessibility', 'dictionary': em},
                                    {'name': 'Janin Interior to surface transfer energy scale', 'dictionary': ja},
                                    {'name': 'Bulkiness', 'dictionary': bulkiness},
                                    {'name': 'Polarity', 'dictionary': polarity},
                                    {'name': 'Buried residues', 'dictionary': buried_residues},
                                    {'name': 'Average area buried', 'dictionary': average_area_buried},
                                    {'name': 'Retention time', 'dictionary': retention_time}]
        self.protein_scales = calculate_protein_scales(analysis, protein_scale_parameters)
        self.isoelectric_point = analysis.isoelectric_point()
        self.secondary_structure_fraction = calculate_secondary_structure_fraction(analysis)
        self.molecular_weight = analysis.molecular_weight()
        # NOTE(review): despite the attribute name, this stores the value
        # returned by gravy(), not plot data -- confirm against consumers.
        self.kyte_plot = analysis.gravy()
        self.pefing = calculate_pefing(sequence)

        # next parameters are calculated using R.Peptides
        # (``r`` is presumably an rpy2-style R interpreter handle -- verify)
        r('require(Peptides)')
        # NOTE(review): the sequence is interpolated into R source unescaped;
        # a quote or backslash in ``sequence`` would change the R code.
        r('sequence = "{0}"'.format(sequence))
        self.aliphatic_index = r('aindex(sequence)')[0]
        self.boman_index = r('boman(sequence)')[0]
        self.charges = calculate_charges(sequence, 1.0, 14.0, 0.5, 'Lehninger')
        # NOTE(review): this evaluates R's ``seq`` on the string; it does not
        # look like a hydrophobicity calculation -- confirm the intended call.
        self.hydrophobicity = r('seq(sequence)')[0]
        # Named angle values handed to the hydrophobic-moment and peptide-type
        # helpers below.
        angles = [{'name': 'Alpha-helix', 'angle': -47},
                  {'name': '3-10-helix', 'angle': -26},
                  {'name': 'Pi-helix', 'angle': -80},
                  {'name': 'Omega', 'angle': 180},
                  {'name': 'Antiparallel beta-sheet', 'angle': 135},
                  {'name': 'Parallel beta-sheet', 'angle': 113}]
        # The extra angle is considered only when P and G together exceed 0.3
        # (assumes get_amino_acids_percent() returns fractions -- TODO confirm).
        if self.amino_acid_percents['P'] + self.amino_acid_percents['G'] > 0.3:
            angles.append({'name': 'Polygly-polypro helix', 'angle': 153})
        self.hydrophobic_moments = calculate_hydrophobic_moments(sequence, angles)
        self.kidera_factors = calculate_kidera_factors(sequence)
        self.peptide_types = calculate_peptide_types(sequence, angles)
W = {'C':11,'H':12,'N':2, 'O':2, 'S':0}
Y = {'C':9, 'H':11,'N':1, 'O':3, 'S':0}
V = {'C':5, 'H':11,'N':1, 'O':2, 'S':0}



dictOfAmino = {'A':A,'R':R,'N':N,'D':D,'C':C,'Q':Q, 'E':E, 'G':G,'H':H,'I':I,'L':L,'K':K,'M':M,'F':F,'P':P,'S':S,'T':T,'W':W,'Y':Y,'V':V}

print "Note output file is appended if same file is selected twice molecular formulas \n for both runs will be present in output file"
fileName = raw_input("Protein FASTA file to generate molecular formulas for: ")
outFileName = raw_input("Output file name (include .txt): ")

fasta_file = open(fileName, "rU")
for record in SeqIO.parse(fasta_file, "fasta"):
	myseq = str(record.seq)
	analysis = ProteinAnalysis(myseq)
	listofaminoacids.append(analysis.count_amino_acids())


	
for i in listofaminoacids:
        carbonTotal = 0
        hydrogenTotal = 0
        oxygenTotal = 0
        nitrogenTotal = 0
        sulfurTotal = 0
        peptideBonds = 0
        
        for value in i:
                for amino in dictOfAmino:
                        
Beispiel #46
0
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio.SeqUtils import ProtParamData
from Bio import SeqIO
# Print every ProtParam property for each record of the sample FASTA file.
with open('../../samples/pdbaa') as fh:
    for rec in SeqIO.parse(fh, 'fasta'):
        myprot = ProteinAnalysis(str(rec.seq))
        # Argument-free properties, printed in a fixed order.
        for compute in (myprot.count_amino_acids,
                        myprot.get_amino_acids_percent,
                        myprot.molecular_weight,
                        myprot.aromaticity,
                        myprot.instability_index,
                        myprot.flexibility,
                        myprot.isoelectric_point,
                        myprot.secondary_structure_fraction):
            print(compute())
        # Sliding-window profile on the ``kd`` scale: window 9, edge weight 0.4.
        print(myprot.protein_scale(ProtParamData.kd, 9, .4))
def main(argv):
    """Add a sampled ``intensity`` to each FASTA header and write the result.

    Every entry yielded by ``nextEntry`` has any existing ``intensity`` item
    removed from its ``[# ... #]`` meta block and a new one, drawn with
    ``sampleAbundance(mu, sigma)``, appended.  If BioPython is importable,
    entries can also be filtered by molecular weight.  Optionally only a
    subset of the entries is kept (the first N, or N at random).

    Command-line options are read by ``argparse`` from ``sys.argv``.

    :param argv: accepted for interface compatibility; not used.
    :raises RuntimeError: if the interpreter is older than Python 2.7.
    """
    ## we use ArgumentParser, which requires 2.7
    if sys.version_info < (2, 7):
        # A real exception type: raising a plain string is itself a TypeError.
        raise RuntimeError("This script requires python 2.7 or greater")

    ## add weight filtering functionality if BioPython is available
    try:
        from Bio.SeqUtils.ProtParam import ProteinAnalysis
        has_biopython = True
    except ImportError:
        has_biopython = False

    parser = argparse.ArgumentParser(description='Add abundance to FASTA files.')
    parser.add_argument('infile', type=argparse.FileType('r'), help='Input FASTA file')
    parser.add_argument('outfile', type=argparse.FileType('w'), help='Output FASTA file')

    parser.add_argument('--mu', dest='mu', action='store', default=3, help='mean of gaussian in log space')
    parser.add_argument('--sigma', dest='sigma', action='store', default=1, help='sd of gaussian in log space')
    parser.add_argument('--sample', dest='sample', action='store', default=0, help='Number of entries to keep (for sampling a bigger FASTA file)')
    parser.add_argument('--random', dest='random', action='store_true', help='Randomly shuffle entries before sampling (only if --sample is given). If not given, the first \'X\' samples are used.')
    if has_biopython:
        parser.add_argument('--weight_low', dest='weight_low', action='store', default=0, help='minimum molecular weight of protein')
        parser.add_argument('--weight_up', dest='weight_up', action='store', default=0, help='Maximum molecular weight of protein (use 0 for unlimited)')
    else:
        print("Warning: protein weight filtering not supported, as BioPython module is not installed.")

    ## argument parsing
    args = parser.parse_args()
    fileobj = args.infile
    fileoutobj = args.outfile
    sample_size = int(args.sample)
    sample_random = bool(args.random)
    mu = float(args.mu)
    sigma = float(args.sigma)
    if has_biopython:
        weight_low = float(args.weight_low)
        weight_up = float(args.weight_up)
        if weight_up <= 0:
            weight_up = sys.float_info.max

    ## matches the "[# key=value, ... #]" meta block of a header
    meta_pattern = re.compile(r"\[# *(.*) *#\]")

    ## BioPython does not like some AA letters - they need replacement:
    ##   U (Selenocystein) -> C (Cystein)
    ##   X (unknown)       -> P (Proline) [arbitrary choice - but weight of 115 is very close to averagine]
    ##   B (Asn or Asp)    -> N (Asparagine)
    ##   Z (Gln or Glu)    -> Q (Glutamine)
    ##   J (Leu or Ile)    -> L (Leucine)
    substitutions = (("U", "C"), ("X", "P"), ("B", "N"), ("Z", "Q"), ("J", "L"))

    ## list of final entries
    fasta_entries = []

    for entry in nextEntry(fileobj):
        header = entry.header
        ## keep every existing meta item except a previous 'intensity'
        m = meta_pattern.search(header)
        other = []
        if m:
            header_new = header.replace(m.group(0), "")  ## delete meta
            for element in m.group(1).split(','):
                if element.find("intensity") == -1:
                    other.append(element)
        else:
            header_new = header  ## nothing to replace

        ## create new metainfo array
        other.append("intensity=" + str(sampleAbundance(mu, sigma)))

        entry.header = header_new.rstrip() + "[# " + (", ").join(other) + " #]"

        if has_biopython:
            sequence = "".join(entry.sequence.split("\n"))
            for unsupported, replacement in substitutions:
                sequence = sequence.replace(unsupported, replacement)
            weight = ProteinAnalysis(sequence).molecular_weight()
            if not (weight_low <= weight <= weight_up):
                continue

        fasta_entries.append(entry.header + "\n" + entry.sequence)

        ## only read to sample size (the rest is thrown away anyways)
        if sample_size > 0 and not sample_random:
            if len(fasta_entries) >= sample_size:
                break

    ## select subset (if required)
    if sample_size > 0:
        indices = list(range(len(fasta_entries)))
        ## random sampling only makes sense if we take a subset
        if sample_random and sample_size < len(fasta_entries):
            random.shuffle(indices)
        ## never ask for more entries than exist; the original raised
        ## IndexError when --sample exceeded the number of entries read
        keep = min(sample_size, len(indices))
        fasta_entries = [fasta_entries[i] for i in indices[:keep]]

    ## write to file
    for entry in fasta_entries:
        fileoutobj.write(entry)
        protein_name = get_protein_name(line)
        protein_names_and_segments[protein_name] = get_segments(line)
        protein_names_and_sequences[protein_name] = ''
    else:
        sequence = protein_names_and_sequences.get(protein_name)
        sequence += line.strip('\n' and '\r' and '\r\n')
        protein_names_and_sequences[protein_name] = sequence

# Walk over every segment of every protein, appending each segment's
# subsequence to the running string ``x`` and recomputing the amino-acid
# fractions of everything accumulated so far.
# NOTE(review): ``z`` is overwritten on every iteration, so only the fractions
# of the final accumulated string reach the CSV below -- confirm this is the
# intended result rather than one row set per segment.
for key in protein_names_and_segments.keys():

    for segment in protein_names_and_segments.get(key):

        # segment[0] is used as a 1-based inclusive start and segment[1] as
        # the slice end.
        segment_sequence = protein_names_and_sequences.get(key)[segment[0]
                                                                - 1:segment[1]]
        x += segment_sequence
        y = ProteinAnalysis(str(x))
        z = y.get_amino_acids_percent()


# visual for command line
print 'parsing ' + FILE_INPUT + '\n'

# build the output file as CSV (one "residue,fraction" row per amino acid)
with open('percent_AA_per_seg_OUTPUT.csv', 'wb') as f:
    w = csv.writer(f)
    w.writerows(z.items())

# opens the output file
# NOTE(review): this absolute path is hard-coded, while the CSV above is
# written relative to the current working directory.
file = '/Users/simonkeng/senior-research-project/percent_AA_per_seg_OUTPUT.csv'
open_file(file)
Beispiel #49
0
class amp:
    """Stores all data of a peptide and predicts antimicrobial regions.

    Indentation is spaces only: the original mixed tabs and spaces, which
    Python 3 rejects with a TabError.
    """

    def __init__(self, readed):
        """Create a peptide from a ``(name, sequence)`` pair.

        :param readed: indexable with the name at position 0 and the
            amino-acid sequence at position 1.
        """
        self.seq = readed[1]
        self.length = len(readed[1])
        self.name = readed[0]

    def netcharge(self):
        """Set ``self.net`` to (#K + #R + #H) - (#D + #E) for the sequence."""
        self.pos = 'KRH'
        self.neg = 'DE'
        self.net = 0
        self.posRe = 0
        for residue in self.seq:
            if residue in self.pos:
                self.net += 1
            if residue in self.neg:
                self.net -= 1

    def hphobFract(self):
        """Set ``self.hpn`` (count) and ``self.hpf`` (fraction) of hydrophobic residues.

        Residues in ``ACFGILMPV`` are counted.  An empty sequence gives a
        fraction of 0.0 instead of raising ZeroDivisionError.
        """
        hph = 'ACFGILMPV'
        self.hpf = 0.
        for residue in self.seq:
            if residue in hph:
                self.hpf += 1
        self.hpn = self.hpf
        if self.length:
            self.hpf = self.hpf/self.length
        else:
            self.hpf = 0.

    def analyzeAMP(self):
        """Compute and return the summary properties of this peptide.

        :return: dict with keys ``'charge'``, ``'length'``, ``'hydrophobic'``
            and ``'aminoacids'`` (the amino-acid fractions of the sequence).
        """
        from Bio.SeqUtils.ProtParam import ProteinAnalysis

        self.netcharge()
        self.hphobFract()
        self.pepParam = ProteinAnalysis(self.seq)

        self.data = {'charge': self.net,
                     'length': self.length,
                     'hydrophobic': self.hpf,
                     'aminoacids': self.pepParam.get_amino_acids_percent()}

        return self.data

    def _updateComposition(self, baseCompose):
        """Refresh the composition statistics of ``self.subPep``.

        Sets ``pepParam``, ``aaPerc``, ``subPepComp``, ``subPepChanges``,
        ``upSubAvg`` (mean ratio over C, R, W, H, K) and ``downSubAvg``
        (mean ratio over D, E), each ratio being the residue fraction
        divided by the matching entry of ``baseCompose``.
        """
        from Bio.SeqUtils.ProtParam import ProteinAnalysis
        import numpy as np

        self.pepParam = ProteinAnalysis(self.subPep.seq)
        self.aaPerc = self.pepParam.get_amino_acids_percent()
        self.subPepComp = [self.aaPerc[aminame] for aminame in ['C','R','W','H','K','D','E']]
        self.subPepChanges = [k[1]/k[0] for k in zip(baseCompose, self.subPepComp)]
        self.upSubAvg = np.average(self.subPepChanges[:-2])
        self.downSubAvg = np.average(self.subPepChanges[-2:])

    def detectAMP(self):
        """Scan the sequence with a floating window and report AMP-like stretches.

        Thresholds are read from the ``[Parameters]`` section of
        ``config.ini``.  Sequences longer than two windows are scanned window
        by window and every residue of a passing window gets one vote;
        shorter sequences are evaluated as a whole.  Residues with more than
        ``thresh`` votes are marked, and runs of marked residues longer than
        ``minLen`` are kept in ``self.matches``.

        :return: ``'found peptide of length N'`` for the first kept run, or
            ``'nothing found'``.
        """
        import re
        import numpy as np
        # ConfigParser was renamed to configparser in Python 3, and
        # SafeConfigParser no longer exists in recent Python 3 releases.
        try:
            import configparser
            parser = configparser.ConfigParser()
        except ImportError:
            import ConfigParser
            parser = ConfigParser.SafeConfigParser()
        parser.read('config.ini')

        lowNet = parser.getfloat('Parameters','lowNet') #0
        midNet = parser.getfloat('Parameters','midNet')#2
        highNet = parser.getfloat('Parameters','highNet')#6
        lowHpf = parser.getfloat('Parameters','lowHpf')#0.5
        highHpf = parser.getfloat('Parameters','highHpf')#0.9
        lowCompCoeff = parser.getfloat('Parameters','lowCompCoeff')#0.85
        highCompCoeff = parser.getfloat('Parameters','highCompCoeff')#1.5
        baseWind = parser.getint('Parameters','baseWind')#15
        thresh = parser.getint('Parameters','thresh')#6
        minLen = parser.getint('Parameters','minLen')#10
        #               C    R     W    H    K    D    E
        baseCompose = [0.01,0.06,0.005,0.02,0.06,0.05,0.07]
        ampCompose  = [0.06,0.09,0.01, 0.02,0.1, 0.02,0.03]
        changes = [pair[1]/pair[0] for pair in zip(baseCompose, ampCompose)]
        upAvg = np.average(changes[:-2])

        self.result = [0 for _ in self.seq]
        if self.length > baseWind*2:
            # NOTE(review): range(length - baseWind) never evaluates the last
            # full window (start = length - baseWind); kept as in the
            # original -- confirm whether that is intended.
            for i in range(self.length-baseWind):
                self.subPep = amp(['subPep', self.seq[i:i+baseWind]])
                self.subPep.netcharge()
                self.subPep.hphobFract()
                self._updateComposition(baseCompose)
                chargeOrHydrophobic = (
                    (lowNet < self.subPep.net < highNet and self.subPep.hpf > lowHpf)
                    or (midNet < self.subPep.net)
                    or (self.subPep.hpf > highHpf))
                if (chargeOrHydrophobic and self.upSubAvg > lowCompCoeff*upAvg) or \
                        self.upSubAvg > highCompCoeff*upAvg:
                    for aa in range(i, i+baseWind):
                        self.result[aa] += 1
        else:
            self.subPep = self
            self.subPep.netcharge()
            self.subPep.hphobFract()
            self._updateComposition(baseCompose)
            # NOTE(review): unlike the windowed branch, this compares hpf
            # with lowCompCoeff*upAvg and requires upSubAvg above
            # highCompCoeff*upAvg.  Reproduced exactly as in the original;
            # it may be a copy/paste slip -- confirm.
            if ((lowNet < self.subPep.net < highNet and self.subPep.hpf > lowHpf)
                    or (midNet < self.subPep.net)
                    or (self.subPep.hpf > lowCompCoeff*upAvg)) and \
                    self.upSubAvg > highCompCoeff*upAvg:
                self.result = [votes+1 for votes in self.result]

        self.thrRes = [1 if val > thresh else 0 for val in self.result]
        self.strRes = ''.join([str(flag) for flag in self.thrRes])
        # Collect the runs of '1'.  re.split('0*', ...) did this on Python 2
        # but splits between every character on Python 3.7+, where empty
        # matches are honoured.
        self.matches = re.findall('1+', self.strRes)
        self.matches = [match for match in self.matches if len(match) > minLen]

        if len(self.matches) > 0:
            return 'found peptide of length ' + str(len(self.matches[0]))
        return 'nothing found'

    def plotPred(self):
        """Plot the thresholded per-residue prediction to ``testy.pdf``.

        Runs ``detectAMP()`` first when no (non-empty) result is available.
        """
        import matplotlib.pylab as pl
        try:
            self.result[0]
        except (AttributeError, IndexError):
            # Not predicted yet (or empty result): compute it now.
            self.detectAMP()

        pl.plot(self.thrRes,'.-')
        pl.savefig('testy.pdf')
Beispiel #50
0
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio.SeqUtils import ProtParamData
import sys
import json

# Request: a JSON object passed as the first command-line argument, with a
# "Sequence" string and an "Options" collection naming the values to compute.
inp = json.loads(sys.argv[1])

seq = inp["Sequence"]

X = ProteinAnalysis(seq)

data = dict()

if "MW" in inp["Options"]:
    data["MW"] = X.molecular_weight()

if "EC280" in inp["Options"]:
    aa_count = X.count_amino_acids()
    # The cysteine term is added only when disulfides are declared.
    if "hasDisulfide" in inp["Options"]:
        data["EC280"] = 1490 * aa_count["Y"] + 5500 * aa_count["W"] + 62.5 * aa_count["C"]
    else:
        data["EC280"] = 1490 * aa_count["Y"] + 5500 * aa_count["W"]

if "PI" in inp["Options"]:
    data["PI"] = X.isoelectric_point()

if "AACont" in inp["Options"]:
    ratios = X.get_amino_acids_percent()
    # Scale each value by 100 for the report.
    data["AACont"] = {aa: ratios[aa] * 100. for aa in ratios}

# Parenthesised form prints the same on Python 2 and is valid on Python 3.
print(json.dumps(data))
Beispiel #51
0


# Output widget; its initial text documents the CSV layout of the rows below.
text_out = QtGui.QTextEdit('Ribosomal Protein CSV : format (Protein Description,Mwt,pI)') # text out widget

data_mwt = []
y_axis = []
# Same list object as data_mwt, under the name used for plotting.
x_axis = data_mwt

# Collect every ribosomal protein lighter than 20000 and list it in the widget.
for record in SeqIO.parse(seq_file, "fasta"):
    temp_seq = str(record.seq)
    analysis_seq = ProteinAnalysis(temp_seq)
    is_ribosomal = ("ribosomal protein" in record.description
                    or "ribosomal subunit" in record.description)
    if is_ribosomal and analysis_seq.molecular_weight() < 20000:
        mwt_text = '%.2f' % (analysis_seq.molecular_weight())
        data_mwt.append(mwt_text)
        y_axis.append(1)

        text_out.setTextColor(QColor('blue'))
        row = [str(len(data_mwt)),
               record.description,
               mwt_text,
               '%.2f' % (analysis_seq.isoelectric_point())]
        text_out.append(",".join(row))
Beispiel #52
0
class Peptide(PolyIon):
    """A single protein chain in solution.

    Every property of a Peptide is derived from an analysis of its
    amino-acid sequence.
    """

    _state = {'name': 'Name of the peptide.',
              'sequence': 'Amino acid sequence of the peptide.'
              }

    _sequence = None
    _analysis = None

    # TODO: move h to function or constants. Unify with pitts?
    _h_max = 1
    _h_min = 2./3.
    _h = 5./6.

    def __init__(self, name=None, sequence=None):
        """Store the name and sequence and pre-compute the sequence analysis."""
        self._name = name
        self._sequence = sequence
        self._analysis = ProteinAnalysis(str(self.sequence))

    @property
    def molecular_weight(self):
        """Molecular weight of the sequence, computed as a protein."""
        return SeqUtils.molecular_weight(self.sequence, 'protein')

    def charge(self, pH=None, ionic_strength=None, temperature=None,
               moment=1):
        """Return the time-averaged charge of the peptide.

        :param pH: pH at which to evaluate the charge.
        :param ionic_strength: passed to the context resolution.
        :param temperature: passed to the context resolution.
        :param moment: exponent applied to the charge before returning.
        """
        pH, ionic_strength, temperature = \
            self._resolve_context(pH, ionic_strength, temperature)

        # Start from the side-chain pK tables and add the terminal groups
        # when the terminal residues have specific values.
        positive = dict(positive_pKs)
        negative = dict(negative_pKs)

        first_residue = self.sequence[0]
        last_residue = self.sequence[-1]

        if first_residue in pKnterminal:
            positive['Nterm'] = pKnterminal[first_residue]
        if last_residue in pKcterminal:
            negative['Cterm'] = pKcterminal[last_residue]

        residue_counts = self._analysis.count_amino_acids()
        calculator = IsoelectricPoint(self.sequence, residue_counts)
        net_charge = calculator._chargeR(pH, positive, negative)
        return net_charge**moment

    def isoelectric_point(self, ionic_strength=None, temperature=None):
        """Return the isoelectric point of the peptide.

        Both arguments are accepted but currently unused.
        """
        return self._analysis.isoelectric_point()

    def volume(self):
        """Return the approximate volume of the folded peptide in m^3."""
        mass_per_molecule = self.molecular_weight / avogadro
        return mass_per_molecule / self.density() / lpm3 / gpkg

    def radius(self):
        """Return the approximate radius of the folded peptide in m."""
        # Radius of the sphere that has the peptide's volume.
        scaled_volume = self.volume() * 3. / 4. / pi
        return scaled_volume ** (1. / 3.)

    def density(self):
        """Return the approximate density of the folded peptide in kg/L."""
        decay = exp(-self.molecular_weight / 13.)
        return 1.410 + 0.145 * decay

    def mobility(self, pH=None, ionic_strength=None, temperature=None):
        """Return the effective mobility of the ion in m^2/V/s.

        If a context solution is available, mobility uses the full
        Onsager-Fuoss correction to mobility. Otherwise, the Robinson-Stokes
        model is used.

        :param pH: pH at which to evaluate the charge.
        :param ionic_strength: used for the solvent's ``debye`` call.
        :param temperature: used for the solvent's viscosity and ``debye`` calls.
        """
        pH, ionic_strength, temperature = \
            self._resolve_context(pH, ionic_strength, temperature)

        driving_charge = self.charge(pH) * elementary_charge
        viscosity = self._solvent.viscosity(temperature)
        drag = 6 * pi * viscosity * self.radius()
        screening = 1 + self.radius() / \
            self._solvent.debye(ionic_strength, temperature)
        return driving_charge / (drag * screening) * self._h
Beispiel #53
0
 def __init__(self, name=None, sequence=None):
     """Store the name and sequence and pre-compute the sequence analysis.

     :param name: name of the peptide.
     :param sequence: amino-acid sequence of the peptide.
     """
     self._name = name
     self._sequence = sequence
     # Reads the public ``sequence`` attribute, which is not defined in this
     # method -- presumably it exposes ``_sequence``; verify on the class.
     self._analysis = ProteinAnalysis(str(self.sequence))
#!/usr/bin/env python

import sys
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis

# Column names, in exactly the order the values are written for each record.
# The seventh column holds ProteinAnalysis.aromaticity(); it used to be
# mislabelled "monoisotpoic".
COLUMNS = ("ID", "MW", "IP", "gravy", "length", "instability",
           "aromaticity", "Sequence")
sys.stdout.write("\t".join(COLUMNS) + "\n")

# Read FASTA records from stdin and emit one tab-separated row per record.
for record in SeqIO.parse(sys.stdin, "fasta"):
    a = ProteinAnalysis(str(record.seq))

    properties = [
        record.id,
        a.molecular_weight(),
        a.isoelectric_point(),
        a.gravy(),
        a.length,
        a.instability_index(),
        a.aromaticity(),
        # always last column to make the output more readable
        a.sequence,
    ]
    sys.stdout.write("\t".join(map(str, properties)) + "\n")

Beispiel #55
0
	def detectAMP(self):
		"""Scan the sequence for stretches that pass the AMP-likeness test.

		Thresholds are read from the ``[Parameters]`` section of ``config.ini``
		(resolved relative to the current working directory).

		* If ``self.length`` is greater than ``2 * baseWind``, a window of
		  ``baseWind`` residues slides along ``self.seq``. Each window is
		  wrapped in an ``amp`` object and tested on net charge, hydrophobic
		  fraction and residue composition; every position of a passing
		  window gets its counter in ``self.result`` incremented.
		* Otherwise the whole peptide is tested once and, if it passes, every
		  counter is incremented by one.

		Positions whose counter exceeds ``thresh`` are flagged, and runs of
		flagged positions longer than ``minLen`` are kept in ``self.matches``.

		Side effects: sets ``self.result``, ``self.subPep``, ``self.pepParam``,
		``self.aaPerc``, ``self.subPepComp``, ``self.subPepChanges``,
		``self.upSubAvg``, ``self.downSubAvg``, ``self.thrRes``,
		``self.strRes`` and ``self.matches``.

		:return: ``'found peptide of length N'`` where N is the length of the
			first retained run, or ``'nothing found'``.
		"""
		from Bio.SeqUtils.ProtParam import ProteinAnalysis
		import re
		import numpy as np
		try:
			# Python 3: ConfigParser here is what Python 2 calls SafeConfigParser.
			from configparser import ConfigParser
		except ImportError:
			# Python 2
			from ConfigParser import SafeConfigParser as ConfigParser

		parser = ConfigParser()
		parser.read('config.ini')

		# Thresholds for the floating-window search; the trailing comments are
		# the example values noted by the original author.
		lowNet = parser.getfloat('Parameters','lowNet') #0
		midNet = parser.getfloat('Parameters','midNet')#2
		highNet = parser.getfloat('Parameters','highNet')#6
		lowHpf = parser.getfloat('Parameters','lowHpf')#0.5
		highHpf = parser.getfloat('Parameters','highHpf')#0.9
		lowCompCoeff = parser.getfloat('Parameters','lowCompCoeff')#0.85
		highCompCoeff = parser.getfloat('Parameters','highCompCoeff')#1.5
		baseWind = parser.getint('Parameters','baseWind')#15
		thresh = parser.getint('Parameters','thresh')#6
		minLen = parser.getint('Parameters','minLen')#10

		# Reference residue fractions, in the order given by ``residues``.
		residues    = ['C', 'R', 'W',  'H', 'K', 'D', 'E']
		baseCompose = [0.01,0.06,0.005,0.02,0.06,0.05,0.07]
		ampCompose  = [0.06,0.09,0.01, 0.02,0.1, 0.02,0.03]
		changes = [ampFrac/baseFrac for baseFrac, ampFrac in zip(baseCompose,ampCompose)]
		# Average enrichment over the first five residues (C, R, W, H, K).
		upAvg = np.average(changes[:-2])

		def profile(pep):
			# Run the charge, hydrophobicity and composition calculations for
			# ``pep`` and store every intermediate on ``self``.
			self.subPep = pep
			self.subPep.netcharge()
			self.subPep.hphobFract()
			self.pepParam = ProteinAnalysis(self.subPep.seq)
			self.aaPerc = self.pepParam.get_amino_acids_percent()
			self.subPepComp = [self.aaPerc[aminame] for aminame in residues]
			self.subPepChanges = [k[1]/k[0] for k in zip(baseCompose,self.subPepComp)]
			self.upSubAvg = np.average(self.subPepChanges[:-2])
			self.downSubAvg = np.average(self.subPepChanges[-2:])

		self.result = [0 for i in self.seq]
		if self.length > baseWind*2:
			# NOTE(review): the last window starts at length-baseWind-1, so the
			# final residue is never covered -- confirm this is intended.
			for i in range(self.length-baseWind):
				profile(amp(['subPep',self.seq[i:i+baseWind]]))
				net = self.subPep.net
				hpf = self.subPep.hpf
				chargeOrHydrophobic = (
					(lowNet < net < highNet and hpf > lowHpf) or
					(midNet < net) or
					(hpf > highHpf)
				)
				if (
					(chargeOrHydrophobic and self.upSubAvg > lowCompCoeff*upAvg) or
					self.upSubAvg > highCompCoeff*upAvg
				):
					for aa in range(i,i+baseWind):
						self.result[aa] += 1
		else:
			profile(self)
			net = self.subPep.net
			hpf = self.subPep.hpf
			# NOTE(review): this test differs from the windowed one above (hpf
			# is compared with lowCompCoeff*upAvg, and the composition test is
			# mandatory) -- kept as written; confirm it is not a copy/paste slip.
			if (
				(
					(lowNet < net < highNet and hpf > lowHpf) or
					(midNet < net) or
					(hpf > lowCompCoeff*upAvg)
				) and
				self.upSubAvg > highCompCoeff*upAvg
			):
				self.result = [count+1 for count in self.result]

		# Flag positions whose counter exceeds the threshold.
		self.thrRes = [1 if val > thresh else 0 for val in self.result]
		self.strRes = ''.join([str(flag) for flag in self.thrRes])
		# Split on runs of zeros. '0+' (not '0*') is required: since Python 3.7
		# re.split also splits on empty matches, which would cut the string
		# between every character.
		self.matches = [run for run in re.split('0+',self.strRes) if len(run) > minLen]

		if len(self.matches) > 0:
			return 'found peptide of length ' + str(len(self.matches[0]))
		else:
			return 'nothing found'
Beispiel #56
0
def iso_e(protS):
    """Compute the isoelectric point of a protein sequence.

    :param protS: protein sequence as a string.
    :return: the value of ``ProteinAnalysis(protS).isoelectric_point()``.
    """
    # Imported lazily so Biopython is only needed when this is called.
    from Bio.SeqUtils.ProtParam import ProteinAnalysis

    return ProteinAnalysis(protS).isoelectric_point()
Beispiel #57
0
import collections
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import sys

# For every record in the FASTA file named by the first command-line
# argument, print one "<record id> <residue>:<count>" line per amino acid,
# least frequent residue first (ties broken alphabetically).
for rec in SeqIO.parse(sys.argv[1], "fasta"):
    x = ProteinAnalysis(str(rec.seq))
    # ProteinAnalysis is not a mapping; the per-residue counts come from
    # count_amino_acids(), which returns a dict.
    counts = x.count_amino_acids()
    for key, val in sorted(counts.items(), key=lambda item: (item[1], item[0])):
        print("%s %s:%s" % (rec.id, key, val))
def main(databasePassword, schemaProteins, tableProteinInfo, tableStability):
    """Recompute the stability table from the stored protein sequences.

    Reads ``UPAccession`` and ``Sequence`` from ``tableProteinInfo``, derives
    an N-terminal half-life and an instability index for every protein,
    empties ``tableStability`` and inserts one
    ``(accession, half life, instability index)`` row per protein.

    :param databasePassword: password passed to ``mysql.openConnection``.
    :param schemaProteins: schema passed to ``mysql.openConnection``.
    :param tableProteinInfo: table the sequences are read from.
    :param tableStability: table that is truncated and refilled.

    A value of ``-1`` marks a half-life or instability index that could not
    be determined.
    """
    # Define N-terminus half life values (explanation http://en.wikipedia.org/wiki/N-end_rule and the ProtParam tool).
    halfLife = {'A' : 4.4, 'C' : 1.2, 'D' : 1.1, 'E' : 1.0, 'F' : 1.1, 'G' : 30.0, 'H' : 3.5, 'I' : 20.0, 'K' : 1.3,
                'L' : 5.5, 'M' : 30.0, 'N' : 1.4, 'P' : 20.0, 'Q' : 0.8, 'R' : 1.0, 'S' : 1.9, 'T' : 7.2,
                'V' : 100.0, 'W' : 2.8, 'Y' : 2.8}

    # Extract all the sequences stored in the database.
    conn, cursor = mysql.openConnection(databasePassword, schemaProteins)
    cursor = mysql.tableSELECT(cursor, 'UPAccession, Sequence', tableProteinInfo)
    results = cursor.fetchall()

    # Calculate the half life and instability index for each protein.
    stabilityTuples = []
    for row in results:
        accession, sequence = row[0], row[1]
        # -1 when the N-terminal residue has no associated half-life value
        # (e.g. X, B, etc.). Slicing keeps an empty sequence from raising.
        protHalfLife = halfLife.get(sequence[:1], -1)
        analysedSeq = ProteinAnalysis(sequence)
        try:
            instabilityIndex = analysedSeq.instability_index()
        except Exception:
            # Best effort: a sequence the index cannot be computed for (e.g.
            # one with non-standard residue codes) is recorded as -1.
            instabilityIndex = -1
            print('\tContains invalid aa code:  %s' % (accession,))
        stabilityTuples.append((accession, protHalfLife, instabilityIndex))

    # The table name is concatenated into the statement because identifiers
    # cannot be bound as parameters; tableStability must be a trusted value.
    cursor.execute('TRUNCATE TABLE ' + tableStability)
    if stabilityTuples:
        # One %s placeholder per column, e.g. '(%s,%s,%s)'.
        values = '(' + ','.join(['%s'] * len(stabilityTuples[0])) + ')'
        mysql.tableINSERT(cursor, tableStability, values, stabilityTuples)
    mysql.closeConnection(conn, cursor)

#def instability_index(prot, sequence):
#
#    # A two-dimensional dictionary for calculating the instability index.
#    # Guruprasad K., Reddy B.V.B., Pandit M.W.    Protein Engineering 4:155-161(1990).
#    # It is based on dipeptide values; therefore the value for the dipeptide DG is DIWV['D']['G'].
#    DIWV = {'A': {'A': 1.0, 'C': 44.94, 'E': 1.0, 'D': -7.49,
#                  'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': -7.49,
#                  'K': 1.0, 'M': 1.0, 'L': 1.0, 'N': 1.0,
#                  'Q': 1.0, 'P': 20.26, 'S': 1.0, 'R': 1.0,
#                  'T': 1.0, 'W': 1.0, 'V': 1.0, 'Y': 1.0},
#            'C': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 20.26,
#                  'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': 33.60,
#                  'K': 1.0, 'M': 33.60, 'L': 20.26, 'N': 1.0,
#                  'Q': -6.54, 'P': 20.26, 'S': 1.0, 'R': 1.0,
#                  'T': 33.60, 'W': 24.68, 'V': -6.54, 'Y': 1.0},
#            'E': {'A': 1.0, 'C': 44.94, 'E': 33.60, 'D': 20.26,
#                  'G': 1.0, 'F': 1.0, 'I': 20.26, 'H': -6.54,
#                  'K': 1.0, 'M': 1.0, 'L': 1.0, 'N': 1.0,
#                  'Q': 20.26, 'P': 20.26, 'S': 20.26, 'R': 1.0,
#                  'T': 1.0, 'W': -14.03, 'V': 1.0, 'Y': 1.0},
#            'D': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 1.0,
#                  'G': 1.0, 'F': -6.54, 'I': 1.0, 'H': 1.0,
#                  'K': -7.49, 'M': 1.0, 'L': 1.0, 'N': 1.0,
#                  'Q': 1.0, 'P': 1.0, 'S': 20.26, 'R': -6.54,
#                  'T': -14.03, 'W': 1.0, 'V': 1.0, 'Y': 1.0},
#            'F': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 13.34,
#                  'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': 1.0,
#                  'K': -14.03, 'M': 1.0, 'L': 1.0, 'N': 1.0,
#                  'Q': 1.0, 'P': 20.26, 'S': 1.0, 'R': 1.0,
#                  'T': 1.0, 'W': 1.0, 'V': 1.0, 'Y': 33.601},
#            'I': {'A': 1.0, 'C': 1.0, 'E': 44.94, 'D': 1.0,
#                  'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': 13.34,
#                  'K': -7.49, 'M': 1.0, 'L': 20.26, 'N': 1.0,
#                  'Q': 1.0, 'P': -1.88, 'S': 1.0, 'R': 1.0,
#                  'T': 1.0, 'W': 1.0, 'V': -7.49, 'Y': 1.0},
#            'G': {'A': -7.49, 'C': 1.0, 'E': -6.54, 'D': 1.0,
#                  'G': 13.34, 'F': 1.0, 'I': -7.49, 'H': 1.0,
#                  'K': -7.49, 'M': 1.0, 'L': 1.0, 'N': -7.49,
#                  'Q': 1.0, 'P': 1.0, 'S': 1.0, 'R': 1.0,
#                  'T': -7.49, 'W': 13.34, 'V': 1.0, 'Y': -7.49},
#            'H': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 1.0,
#                  'G': -9.37, 'F': -9.37, 'I': 44.94, 'H': 1.0,
#                  'K': 24.68, 'M': 1.0, 'L': 1.0, 'N': 24.68,
#                  'Q': 1.0, 'P': -1.88, 'S': 1.0, 'R': 1.0,
#                  'T': -6.54, 'W': -1.88, 'V': 1.0, 'Y': 44.94},
#            'K': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 1.0,
#                  'G': -7.49, 'F': 1.0, 'I': -7.49, 'H': 1.0,
#                  'K': 1.0, 'M': 33.60, 'L': -7.49, 'N': 1.0,
#                  'Q': 24.64, 'P': -6.54, 'S': 1.0, 'R': 33.60,
#                  'T': 1.0, 'W': 1.0, 'V': -7.49, 'Y': 1.0},
#            'M': {'A': 13.34, 'C': 1.0, 'E': 1.0, 'D': 1.0,
#                  'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': 58.28,
#                  'K': 1.0, 'M': -1.88, 'L': 1.0, 'N': 1.0,
#                  'Q': -6.54, 'P': 44.94, 'S': 44.94, 'R': -6.54,
#                  'T': -1.88, 'W': 1.0, 'V': 1.0, 'Y': 24.68},
#            'L': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 1.0,
#                  'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': 1.0,
#                  'K': -7.49, 'M': 1.0, 'L': 1.0, 'N': 1.0,
#                  'Q': 33.60, 'P': 20.26, 'S': 1.0, 'R': 20.26,
#                  'T': 1.0, 'W': 24.68, 'V': 1.0, 'Y': 1.0},
#            'N': {'A': 1.0, 'C': -1.88, 'E': 1.0, 'D': 1.0,
#                  'G': -14.03, 'F': -14.03, 'I': 44.94, 'H': 1.0,
#                  'K': 24.68, 'M': 1.0, 'L': 1.0, 'N': 1.0,
#                  'Q': -6.54, 'P': -1.88, 'S': 1.0, 'R': 1.0,
#                  'T': -7.49, 'W': -9.37, 'V': 1.0, 'Y': 1.0},
#            'Q': {'A': 1.0, 'C': -6.54, 'E': 20.26, 'D': 20.26,
#                  'G': 1.0, 'F': -6.54, 'I': 1.0, 'H': 1.0,
#                  'K': 1.0, 'M': 1.0, 'L': 1.0, 'N': 1.0,
#                  'Q': 20.26, 'P': 20.26, 'S': 44.94, 'R': 1.0,
#                  'T': 1.0, 'W': 1.0, 'V': -6.54, 'Y': -6.54},
#            'P': {'A': 20.26, 'C': -6.54, 'E': 18.38, 'D': -6.54,
#                  'G': 1.0, 'F': 20.26, 'I': 1.0, 'H': 1.0,
#                  'K': 1.0, 'M': -6.54, 'L': 1.0, 'N': 1.0,
#                  'Q': 20.26, 'P': 20.26, 'S': 20.26, 'R': -6.54,
#                  'T': 1.0, 'W': -1.88, 'V': 20.26, 'Y': 1.0},
#            'S': {'A': 1.0, 'C': 33.60, 'E': 20.26, 'D': 1.0, 'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': 1.0,
#                  'K': 1.0, 'M': 1.0, 'L': 1.0, 'N': 1.0, 'Q': 20.26, 'P': 44.94, 'S': 20.26, 'R': 20.26,
#                  'T': 1.0, 'W': 1.0, 'V': 1.0, 'Y': 1.0},
#            'R': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 1.0, 'G': -7.49, 'F': 1.0, 'I': 1.0, 'H': 20.26,
#                  'K': 1.0, 'M': 1.0, 'L': 1.0, 'N': 13.34, 'Q': 20.26, 'P': 20.26, 'S': 44.94, 'R': 58.28,
#                  'T': 1.0, 'W': 58.28, 'V': 1.0, 'Y': -6.54},
#            'T': {'A': 1.0, 'C': 1.0, 'E': 20.26, 'D': 1.0, 'G': -7.49, 'F': 13.34, 'I': 1.0, 'H': 1.0,
#                  'K': 1.0, 'M': 1.0, 'L': 1.0, 'N': -14.03, 'Q': -6.54, 'P': 1.0, 'S': 1.0, 'R': 1.0,
#                  'T': 1.0, 'W': -14.03, 'V': 1.0, 'Y': 1.0},
#            'W': {'A': -14.03, 'C': 1.0, 'E': 1.0, 'D': 1.0, 'G': -9.37, 'F': 1.0, 'I': 1.0, 'H': 24.68,
#                  'K': 1.0, 'M': 24.68, 'L': 13.34, 'N': 13.34, 'Q': 1.0, 'P': 1.0, 'S': 1.0, 'R': 1.0,
#                  'T': -14.03, 'W': 1.0, 'V': -7.49, 'Y': 1.0},
#            'V': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': -14.03, 'G': -7.49, 'F': 1.0, 'I': 1.0, 'H': 1.0,
#                  'K': -1.88, 'M': 1.0, 'L': 1.0, 'N': 1.0, 'Q': 1.0, 'P': 20.26, 'S': 1.0, 'R': 1.0,
#                  'T': -7.49, 'W': 1.0, 'V': 1.0, 'Y': -6.54},
#            'Y': {'A': 24.68, 'C': 1.0, 'E': -6.54, 'D': 24.68, 'G': -7.49, 'F': 1.0, 'I': 1.0, 'H': 13.34,
#                  'K': 1.0, 'M': 44.94, 'L': 1.0, 'N': 1.0, 'Q': 1.0, 'P': 13.34, 'S': 1.0, 'R': -15.91,
#                  'T': -7.49, 'W': -9.37, 'V': 1.0, 'Y': 13.34},
#            }
#
#    score = 0.0
#    for i in range(len(sequence) - 1):
#        if DIWV.has_key(sequence[i]):
#            if DIWV[sequence[i]].has_key(sequence[i+1]):
#                score += DIWV[sequence[i]][sequence[i+1]]
#    return (10.0 / len(sequence)) * score
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio import SeqIO
import sys
# Print the isoelectric point of every record in the FASTA file named by the
# first command-line argument. The path is handed straight to SeqIO.parse,
# which opens and closes the file itself, so no handle is leaked and the
# 'rU' open mode (removed in Python 3.11) is not needed. Records are
# processed one at a time instead of being loaded into a list first.
for record in SeqIO.parse(sys.argv[1], "fasta"):
    prot = ProteinAnalysis(str(record.seq))
    print(prot.isoelectric_point())
Beispiel #60
-1
 def calc_isoelectric_point(self) -> float:
     """
     Compute the isoelectric point of this object's sequence with Biopython's
     ``ProteinAnalysis``
     (see http://biopython.org/DIST/docs/api/Bio.SeqUtils.ProtParam-pysrc.html).

     :return: the isoelectric point of the sequence returned by ``get_seq()``
     """
     return ProteinAnalysis(self.get_seq()).isoelectric_point()