# Per-peptide accumulators; the loop appends one entry per peptide that has UIDs.
uniq_pept_count = []
pept_probab = []
# amino acids flanking each peptide inside its protein sequence ...
aa_before = []
aa_after = []
for pept in uniq_pept:
    pep_dat_pept, uids = extract_uids(pept, pep_info)
    # nothing is recorded for peptides that come back without UIDs ...
    if not uids:
        continue
    interesting_peptide.append(pept)
    uids_list.append(uids)
    #################################
    # fasta record selected for this group of UIDs, together with its length ...
    _1, prot_len_fasta, prot_seq_fasta = get_single_fasta(uids, fasta)
    uid_of_maxlen_list.append(_1)
    prot_len.append(prot_len_fasta)
    prot_seq_str = str(prot_seq_fasta.seq)
    prot_fasta.append(prot_seq_str)
    # start/stop of the peptide within the protein (1-based per the original author's note) ...
    peptide_start_in_protein = ms.stupid_aligner(pept, prot_seq_fasta)
    peptide_stop_in_protein = peptide_start_in_protein + len(pept)
    pept_positions.append(peptide_start_in_protein)
    # long protein name, with commas replaced by spaces ...
    prot_name.append(prot_seq_fasta.description.replace(',', ' '))
    # first of the unique 'Exclusive unique peptide count' values found in pep_dat_pept ...
    uniq_pept_count_val = pep_dat_pept['Exclusive unique peptide count'].unique()[0]
    uniq_pept_count.append(uniq_pept_count_val)
    # single-target unpacking: raises ValueError unless there is exactly one unique probability ...
    (pept_probab_val,) = pep_dat_pept['Best Peptide identification probability'].unique()
    pept_probab.append(pept_probab_val)
    #################################
    # BEWARE: 1-BASED INDEXING ALL THE WAY ACROSS SO FAR...
    # a peptide starting right at the N-terminus has no preceding amino acid: record 'START'
    if peptide_start_in_protein > 1:
        preceding_aa = prot_seq_str[peptide_start_in_protein - 2]
    else:
        preceding_aa = 'START'
    aa_before.append(preceding_aa)
    # NOTE(review): aa_after is never appended to in this fragment - the statement
    # handling the amino acid after the peptide (or 'END' at the C-terminus) is missing here.
# Example no. 2
0
#   to be continued ...
#   to be continued ...

# extracting UID from protein accession numbers ...
# this way we return None for the Unknown entries ...
def extract_uid(line):
    """Return the second '|'-separated field of `line`, or None when `line` has no '|'."""
    fields = line.split('|')
    return fields[1] if len(fields) > 1 else None

# get a single unique Uniprot ID ...
pep_info['uid'] = pep_info['Protein accession numbers'].apply(extract_uid)
# fetch protein sequence for each of the Uid-s ...
# manual switch: the fetch/alignment below only runs when this is flipped to True
fetching = False
if fetching:
    # print("...") with a single string behaves the same on Python 2 and Python 3
    print("fetching from uniprot.org ...")
    pep_info['fasta'] = pep_info['uid'].apply(lambda uid: ms.get_uniprot(session, uid))
    print("fetching complete")
    # Align peptide sequence to the extracted protein sequence and find the peptide starting position ...
    # each row is unpacked as (peptide sequence, fasta) into ms.stupid_aligner
    pep_info['my_start'] = pep_info[['Peptide sequence', 'fasta']].apply(
        lambda row: ms.stupid_aligner(*row), axis='columns')



# c = ['Protein name',
# 'Protein accession numbers',
# 'Database sources',
# 'Exclusive unique peptide count',
# 'Peptide sequence',
# 'Previous amino acid',
# 'Next amino acid',
# 'Peptide start index',
# 'Peptide stop index',
# 'Star Category',
# 'Assigned',
# 'Other Proteins',
# Per-peptide accumulators; the loop appends one entry per peptide that has UIDs.
uniq_pept_count = []
pept_probab = []
# amino acids immediately before/after each peptide inside its protein sequence ...
aa_before = []
aa_after = []
for pept in uniq_pept:
    pep_dat_pept,uids = extract_uids(pept,pep_info)
    if uids:
        interesting_peptide.append(pept)
        uids_list.append(uids)
        #################################
        _1,prot_len_fasta,prot_seq_fasta = get_single_fasta(uids,fasta)
        uid_of_maxlen_list.append(_1)
        prot_len.append(prot_len_fasta)
        prot_seq_str = str(prot_seq_fasta.seq)
        prot_fasta.append(prot_seq_str)
        # NOTE(review): assumes ms.stupid_aligner returns a 1-BASED start position - confirm against the ms module.
        peptide_start_in_protein = ms.stupid_aligner(pept,prot_seq_fasta)
        # 1-based position of the first residue AFTER the peptide ...
        peptide_stop_in_protein = peptide_start_in_protein + len(pept)
        # reuse the start computed above instead of aligning a second time ...
        pept_positions.append(peptide_start_in_protein)
        prot_name.append(prot_seq_fasta.description.replace(',',' ')) # long protein name here ...
        # uniq peptide count taken from pep_dat_pept, for definition look up extract_uids...
        uniq_pept_count_val = pep_dat_pept['Exclusive unique peptide count'].unique()[0]
        # uniq_pept_count_val, = pep_dat_pept['Exclusive unique peptide count'][pep_dat_pept['Exclusive unique peptide count']>0].unique()
        uniq_pept_count.append(uniq_pept_count_val)
        # some kind of peptide probability (like a quality score from experimental data)?!
        # single-target unpacking: raises ValueError unless there is exactly one unique value
        pept_probab_val, = pep_dat_pept['Best Peptide identification probability'].unique()
        pept_probab.append(pept_probab_val)
        #################################
        # BEWARE: positions are 1-based, Python string indexing is 0-based.
        # The residue preceding the peptide sits at 1-based (start-1), i.e. 0-based (start-2).
        # A peptide starting right at the N-terminus has no preceding residue: call it START
        # (without the guard, index -1 would silently wrap around to the last residue).
        aa_before.append(prot_seq_str[peptide_start_in_protein-2] if peptide_start_in_protein>1 else 'START')
        # The residue following the peptide sits at 1-based stop, i.e. 0-based (stop-1).
        # A peptide ending right at the C-terminus has no following residue: call it END.
        aa_after.append(prot_seq_str[peptide_stop_in_protein-1] if peptide_stop_in_protein<=len(prot_seq_str) else 'END')
#########################################
dict_df = {