def create_peptides_fasta(input_path, fasta_input, peps_df, extention=15):
    """Write every peptide, padded with flanking residues, to a FASTA file.

    Each record of ``input_path + fasta_input`` is translated, and for every
    row of ``peps_df`` whose ``seq_id`` equals the record id one output
    record is written: up to ``extention`` residues of the translated
    sequence before the peptide, the row's ``biological_peptide``, and up to
    ``extention`` residues after it.

    The output file is ``input_path + 'peptides_extanded_by' +
    str(extention) + '_from' + fasta_input``.

    Args:
        input_path: Prefix (directory) of both the input and output file.
        fasta_input: Name of the input FASTA file.
        peps_df: DataFrame with the columns ``seq_id``,
            ``in_frame_coordinates_base0``, ``biological_peptide``,
            ``edited`` and ``permutation_coor_base0``.
        extention: Maximum number of flanking residues kept on each side.
    """
    out_name = (input_path + 'peptides_extanded_by' + str(extention) +
                '_from' + fasta_input)

    with open(out_name, 'w') as out_handle, \
            open(input_path + fasta_input, "r") as in_handle:
        writer = FastaWriter(out_handle, wrap=None)
        writer.write_header()

        for record in SeqIO.parse(in_handle, "fasta"):
            prot = record.seq.translate()
            record_peps = peps_df[peps_df['seq_id'] == record.id]

            for i, row in record_peps.iterrows():
                rna_pep_coor = row['in_frame_coordinates_base0'].split('_')
                # Floor division: the results are used as slice indices, and
                # true division would produce floats on Python 3.
                pep_start = int(rna_pep_coor[1]) // 3
                pep_end = int(rna_pep_coor[2]) // 3
                seq_start = max(0, pep_start - extention)
                seq_end = min(pep_end + extention, len(prot))

                upstream = prot[seq_start:pep_start]
                downstream = prot[min(pep_end + 1, len(prot)):seq_end]
                extented_pep = (upstream + row['biological_peptide'] +
                                downstream)

                # The id carries the flank boundaries converted back to
                # nucleotide coordinates (x3) and the DataFrame index.
                coords = str(seq_start * 3) + '_' + str(seq_end * 3)
                if not row['edited']:
                    seq_id = (record.id + '_original_' + coords +
                              '_pep_id_' + str(i))
                else:
                    seq_id = (record.id + '_' + coords + '_editing_range' +
                              row['permutation_coor_base0'] + '_pep_id_' +
                              str(i))
                writer.write_record(
                    SeqRecord(extented_pep, id=seq_id, description=''))

        writer.write_footer()
Beispiel #2
0
def cut_fasta_by_len(fa_file, len_cutoff, outdir, prefix, suffix):
    """Copy the records of a FASTA file that are at least len_cutoff long.

    The output goes to ``<outdir>/<prefix>.ge<len_cutoff><suffix>``. If that
    file already exists and is non-empty it is returned as-is without
    reading ``fa_file``.

    Args:
        fa_file: Input FASTA path; opened with gzip when it ends in ".gz".
        len_cutoff: Minimum record length (``len(rec)``) to keep.
        outdir: Output directory, created if missing.
        prefix: Leading part of the output file name.
        suffix: Trailing part of the output file name.

    Returns:
        Path of the filtered FASTA file.
    """
    # https://stackoverflow.com/questions/273192/how-can-i-create-a-directory-if-it-does-not-exist
    # Creating first and tolerating EEXIST avoids the check-then-create race
    # when another thread or process creates the path concurrently.
    try:
        os.makedirs(outdir)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

    cut_fa_file = os.path.join(outdir,
                               prefix + ".ge" + str(len_cutoff) + suffix)
    if os.path.exists(cut_fa_file) and os.path.getsize(cut_fa_file) > 0:
        return cut_fa_file

    if fa_file.endswith(".gz"):
        in_h = gzip.open(fa_file, 'rt')
    else:
        in_h = open(fa_file, 'r')

    # Both handles are context-managed so the input is closed even when
    # parsing or writing raises.
    with in_h, open(cut_fa_file, 'w') as out_h:
        # Uses the lower-level FastaIterator/FastaWriter pair rather than
        # SeqIO.parse()/SeqIO.write().
        writer = FastaWriter(out_h)
        writer.write_header()
        for rec in FastaIterator(in_h):
            if len(rec) >= len_cutoff:
                writer.write_record(rec)
        writer.write_footer()
    return cut_fa_file
def create_proteins_for_each_peptide(input_path,
                                     fasta_input,
                                     output_path,
                                     final_peptides,
                                     allow_change_in_cleavage_sites=False):
    """Write the native protein and one edited protein per edited peptide.

    For every sequence in ``input_path + fasta_input`` the translated
    (native) protein is written with id ``<seq_id>|original``. Then, for each
    row of ``final_peptides`` that is flagged ``edited`` and belongs to that
    sequence, a protein is written in which the region given by
    ``permutation_coor_base0`` is replaced by the row's
    ``biological_extended_peptide``.

    Output file: ``output_path + 'proteins_per_peptide_from_' + fasta_input``.

    Args:
        input_path: Prefix (directory) of the input FASTA file.
        fasta_input: Name of the input FASTA file.
        output_path: Prefix (directory) of the output file.
        final_peptides: DataFrame with the columns ``edited``, ``seq_id``,
            ``permutation_coor_base0``, ``biological_extended_peptide``,
            ``editing_combinations_relative_to_coding_seq_base0`` and, when
            cleavage-site changes are not allowed, ``N_terminus``,
            ``C_terminus`` and ``cancelled_cs_in_pep``.
        allow_change_in_cleavage_sites: When False, peptides whose row
            reports a changed N/C terminus or a cancelled cleavage site are
            skipped.
    """
    final_edited_peptides = final_peptides[final_peptides['edited']]

    # seq-id -> mRNA sequence, from the input fasta file
    with open(input_path + fasta_input, "r") as in_handle:
        sequences_dict = {
            record.id: record.seq
            for record in SeqIO.parse(in_handle, "fasta")
        }

    out_name = output_path + 'proteins_per_peptide_from_' + fasta_input
    with open(out_name, 'w') as out_handle:
        writer = FastaWriter(out_handle, wrap=None)
        writer.write_header()

        for key, mrna_sequence in sequences_dict.items():
            # first write the native protein
            writer.write_record(
                SeqRecord(mrna_sequence.translate(),
                          id=key + '|original',
                          description=''))

            edited_peptides = final_edited_peptides[
                final_edited_peptides['seq_id'] == key]

            n = 1
            for _, row in edited_peptides.iterrows():
                # The original looked these values up through an undefined
                # name (final_peps_df); the current row holds the same cells.
                if not allow_change_in_cleavage_sites and (
                        row['N_terminus'] != 'no_change'
                        or row['C_terminus'] != 'no_change'
                        or row['cancelled_cs_in_pep']):
                    continue

                permutation_coor = tuple(
                    int(x) for x in row['permutation_coor_base0'].split('_')
                    if x != '')
                # NOTE(review): the downstream part is translated as well so
                # the record is protein throughout; previously the raw mRNA
                # tail was appended to the protein. Assumes
                # permutation_coor[1] is the last base of a codon - confirm.
                protein = (
                    mrna_sequence[:permutation_coor[0]].translate() +
                    row['biological_extended_peptide'] +
                    mrna_sequence[permutation_coor[1] + 1:].translate())
                comb_id = key + '|edited_' + str(n) + '\t' + str(
                    row['editing_combinations_relative_to_coding_seq_base0'])
                writer.write_record(
                    SeqRecord(protein, id=comb_id, description=''))
                n += 1

        writer.write_footer()
Beispiel #4
0
def writeFasta(fb,seqList):
    """Write all records of seqList to the handle fb through a FastaWriter.

    Raises:
        ValueError: if seqList contains no records.
    """
    if not len(seqList) > 0:
        raise ValueError("No data to Persist.")

    fasta_out = FastaWriter(fb)
    fasta_out.write_header()
    for entry in seqList:
        fasta_out.write_record(entry)
    fasta_out.write_footer()
def create_fully_edited_proteins_fasta(input_path, fasta_input, output_path):
    """Write a native and, if it has sites, a fully edited protein per record.

    For each record of ``input_path + fasta_input`` the site lists of every
    mismatch type in ``all_mm`` are extracted from the record description
    (text following ``<mm>_base0: `` up to the first ``]``). The native
    translation is written with id ``<id>_original``; when at least one site
    exists, the sequence edited at all sites is translated and written with
    id ``<id>_fully_edited``.

    Output file:
    ``output_path + 'fully_edited_and_native_proteins_from_' + fasta_input``.

    Args:
        input_path: Prefix (directory) of the input FASTA file.
        fasta_input: Name of the input FASTA file.
        output_path: Prefix (directory) of the output file.
    """
    # One compiled pattern per mismatch type. Raw strings keep ``\s`` a regex
    # escape instead of an invalid string escape.
    mm_headers = {
        mm: re.compile(r'(?<=' + mm + r'_base0:\s).*?]')
        for mm in all_mm
    }

    out_name = (output_path + 'fully_edited_and_native_proteins_from_' +
                fasta_input)
    with open(out_name, 'w') as out_handle, \
            open(input_path + fasta_input, "r") as in_handle:
        writer = FastaWriter(out_handle, wrap=None)
        writer.write_header()

        for record in SeqIO.parse(in_handle, "fasta"):
            # NOTE(security): eval() executes whatever text the header
            # contains. Only use this on trusted FASTA files; if the headers
            # always hold plain list literals, ast.literal_eval would be a
            # safer parser - confirm before switching.
            sites_dict = {
                mm: sorted(
                    eval(
                        find_by_regex_in_header(record.description,
                                                mm_headers[mm])))
                for mm in all_mm
            }
            sites_number = sum(len(sites_dict[mm]) for mm in all_mm)
            length = len(record.seq)
            comb = tuple(sites_dict[mm] for mm in all_mm)

            protein_basic_description = ''

            # translate native protein
            writer.write_record(
                SeqRecord(record.seq.translate(),
                          id=record.id + '_original',
                          description=protein_basic_description))

            if not sites_number:
                continue

            protein_description = (
                protein_basic_description +
                '| editing_combinations_base0_wrt_to_coding_sequence: ' +
                str(comb))
            edited_seq = Seq(
                edit_rna_as_peptide(str(record.seq), (0, length - 1), comb),
                generic_dna)
            writer.write_record(
                SeqRecord(edited_seq.translate(),
                          id=record.id + '_fully_edited',
                          description=protein_description))

            # Diagnostic output for edited sequences whose length is not a
            # multiple of three.
            if len(edited_seq) % 3:
                print(record.id)
                print(len(record.seq))
                print(len(edited_seq))

        writer.write_footer()
def trierFastaByDomain(tgtDomain,fastaDict,step2List,writeFileName,formatFunc):
    """Write the FASTA records whose parsed domain equals tgtDomain.

    Each item of step2List is split by formatFunc into ten fields; when the
    ``domain`` field matches tgtDomain, the record stored in fastaDict under
    the ``gID`` field is written to writeFileName. Ids missing from
    fastaDict are skipped silently.

    Args:
        tgtDomain: Domain value to select.
        fastaDict: Mapping from gID to the record to write.
        step2List: Iterable of raw entries understood by formatFunc.
        writeFileName: Path of the FASTA file to create.
        formatFunc: Callable returning the 10-tuple
            (score, gName, domain, gID, ARC, RF, reverse, begin, end, desc).
    """
    # ``with`` closes the file even if formatFunc or the writer raises.
    with open(writeFileName,'w') as fb:
        writer = FastaWriter(fb)
        writer.write_header()
        for record in step2List:
            score,gName,domain,gID,ARC,RF,reverse,begin,end,desc = formatFunc(record)
            if domain != tgtDomain:
                continue
            # Identity test instead of the Python 2 only ``<>`` operator,
            # which also avoided relying on the record's own comparison.
            match = fastaDict.get(gID)
            if match is not None:
                writer.write_record(match)
        writer.write_footer()
Beispiel #7
0
def reheader_fasta(fa_in, fa_out, header_function, in_gz, gz):
    """Rewrite a FASTA file, deriving each record's ids via header_function.

    Args:
        fa_in: Input FASTA path.
        fa_out: Output FASTA path.
        header_function: Passed to ``FastaIterator`` as ``title2ids``.
        in_gz: When true, fa_in is opened with gzip in text mode.
        gz: When true, fa_out is written through ``bgzf.BgzfWriter``;
            otherwise as a plain text file.
    """
    in_h = gzip.open(fa_in, 'rt') if in_gz else open(fa_in, 'r')
    # try/finally so both handles are released even when parsing or writing
    # fails part-way; the output is closed before the input, as before.
    try:
        out_h = bgzf.BgzfWriter(fa_out, 'wb') if gz else open(fa_out, 'w')
        try:
            writer = FastaWriter(out_h)
            writer.write_header()
            for rec in FastaIterator(in_h, title2ids=header_function):
                writer.write_record(rec)
            writer.write_footer()
        finally:
            out_h.close()
    finally:
        in_h.close()
Beispiel #8
0
 def make_qiime_output(self):
     """Write the trimmed, renamed reads to ``self.qiime_fasta.path``.

     Every item yielded by ``self.only_used.parse_barcodes()`` has its read
     renamed to ``<sample>_<running count> <old id>``, gets a description of
     the form ``orig_bc=<bar> new_bc=<bar> bc_diffs=0`` (``<bar>`` being the
     first ``self.pool.bar_len`` bases of the read), and is written with
     ``self.trim_fwd`` leading and ``self.trim_rev`` trailing positions
     removed.
     """
     # A reverse trim of 0 must keep the whole tail: ``-0`` as a slice end
     # would produce an empty record, so an open-ended slice is used instead.
     trim_end = -self.trim_rev if self.trim_rev else None
     # Per-sample running number used in the new read ids #
     counter = defaultdict(int)
     # ``with`` closes the file even if a read fails to process #
     with open(self.qiime_fasta.path, 'w') as handle:
         writer = FastaWriter(handle, wrap=0)
         writer.write_header()
         for r in self.only_used.parse_barcodes():
             sample_name = r.first.sample.short_name
             counter[sample_name] += 1
             r.read.id = '%s_%i %s' % (sample_name, counter[sample_name], r.read.id)
             bar_seq = r.read.seq[0:self.pool.bar_len]
             r.read.description = "orig_bc=%s new_bc=%s bc_diffs=0" % (bar_seq, bar_seq)
             writer.write_record(r.read[self.trim_fwd:trim_end])
         writer.write_footer()
def cleanUpFasta(fname,fastaDict,step2List,Step2FormatSepFunc,seuil=1e-3):
    """Write the sub-sequences described by step2List to a FASTA file.

    Each entry is split by Step2FormatSepFunc into ten fields. Entries whose
    score exceeds ``seuil`` are skipped. For the rest, the sequence stored in
    fastaDict under ``gName`` is taken as a string, reversed character by
    character when ``reverse`` is truthy, sliced to ``[begin:end]`` and
    written with name ``gName``, id ``gId`` and description ``desc``.
    Entries whose ``gName`` is missing from fastaDict are reported on stdout
    and skipped.

    Args:
        fname: Path of the FASTA file to create.
        fastaDict: Mapping from gName to a record exposing ``.seq``.
        step2List: Iterable of raw entries understood by Step2FormatSepFunc.
        Step2FormatSepFunc: Callable returning the 10-tuple
            (score, gName, domain, gId, ARC, RF, reverse, begin, end, desc).
        seuil: Score threshold; entries scoring above it are dropped.
    """
    with open(fname,'w') as fb:
        writer = FastaWriter(fb)
        writer.write_header()
        for line in step2List:
            score,gName,domain,gId,ARC,RF,reverse,begin,end,desc = Step2FormatSepFunc(line)
            if score > seuil:
                continue
            # Only the dictionary lookup is guarded, so a KeyError raised
            # elsewhere is not misreported as a missing sequence.
            try:
                source = fastaDict[gName]
            except KeyError:
                print("[%s] not exists in fasta dictionary.\n" % gName)
                continue
            # str() replaces the Seq.tostring() method, which newer
            # Biopython releases no longer provide.
            code = str(source.seq)
            if reverse:
                code = code[::-1]
            record = SeqRecord(Seq(code[begin:end],generic_dna),name=gName,id=gId,description=desc)
            writer.write_record(record)
        writer.write_footer()
Beispiel #10
0
 def make_qiime_output(self):
     """Write the trimmed, renamed reads to ``self.qiime_fasta.path``.

     For each item from ``self.only_used.parse_barcodes()`` the read id
     becomes ``<sample>_<running count> <old id>`` and the description
     becomes ``orig_bc=<bar> new_bc=<bar> bc_diffs=0``, where ``<bar>`` is
     the first ``self.pool.bar_len`` bases of the read. The read is written
     without its first ``self.trim_fwd`` and last ``self.trim_rev``
     positions.
     """
     # With trim_rev == 0 the slice end ``-0`` would select nothing, so use
     # an open-ended slice when there is nothing to trim at the end.
     if self.trim_rev:
         trim_end = -self.trim_rev
     else:
         trim_end = None
     # Counter: reads seen so far per sample #
     counter = defaultdict(int)
     # The context manager closes the file on success and on error #
     with open(self.qiime_fasta.path, 'w') as handle:
         writer = FastaWriter(handle, wrap=0)
         writer.write_header()
         for r in self.only_used.parse_barcodes():
             sample_name = r.first.sample.short_name
             counter[sample_name] += 1
             r.read.id = '%s_%i %s' % (sample_name, counter[sample_name],
                                       r.read.id)
             bar_seq = r.read.seq[0:self.pool.bar_len]
             r.read.description = "orig_bc=%s new_bc=%s bc_diffs=0" % (bar_seq,
                                                                       bar_seq)
             writer.write_record(r.read[self.trim_fwd:trim_end])
         writer.write_footer()
Beispiel #11
0
def create_in_frame_rna_file_from_anovar_results_and_coding_mrna_seqs_final_sites_dfs(fasta_file,output_name,out_path,mm_df_dict,stop_as_bad_records,met_as_good_records,last_is_stop,variants_to_use = []):
    """
    Split coding sequences into "good" and "bad" FASTA files and annotate
    the good ones with their editing sites.

    input - coding sequences as fasta file

            sites (wrt to coding sequence) dataframe - result of read_editing_sites_wrt_coding_seqs
            after ucsc_id column is set to index
            different dataframes for different mm types

    output - fasta file in the format of proteomics simulator
             some of the values in the header will be useless because the input includes the coding sequences
             so this function does not trim the sequences.

    Args:
        fasta_file: Path of the input FASTA file. Record ids are expected to
            contain at least two ';'-separated fields; the first two form
            the output record id.
        output_name: Base name of the output files
            (``<out_path><output_name>.fasta`` and
            ``<out_path>bad_seqs_<output_name>.fasta``).
        out_path: Prefix (directory) of the output files.
        mm_df_dict: Maps each mismatch type in ``all_mm`` to a DataFrame
            indexed by record id with a ``position_base1`` column, or None.
        stop_as_bad_records: Reject records whose translation contains '*'.
        met_as_good_records: Reject records whose first codon is not 'M'.
        last_is_stop: Drop the last codon from the output sequence and
            reject records whose last codon does not translate to '*'.
        variants_to_use: When non-empty, only record ids in this collection
            are processed. The default list is never mutated.
    """
    n_bad = 0
    n_good = 0
    sites_good = 0
    sites_bad = 0

    with open(out_path + output_name + '.fasta', 'w') as good_handle, \
            open(out_path + 'bad_seqs_' + output_name + '.fasta', 'w') as bad_handle, \
            open(fasta_file, "r") as in_handle:
        writer = FastaWriter(good_handle, wrap=None)
        writer_bad = FastaWriter(bad_handle, wrap=None)
        writer.write_header()
        writer_bad.write_header()

        for record in SeqIO.parse(in_handle, "fasta"):
            split_header = record.id.split(';')
            rec_id = split_header[0] + ';' + split_header[1]

            # a non-empty variants_to_use acts as a whitelist of record ids
            if len(variants_to_use) and rec_id not in variants_to_use:
                continue

            # 0-based site positions of this record, per mismatch type
            mm_loc_dict = {}
            for mm in all_mm:
                sites = mm_df_dict[mm]
                if sites is None:
                    mm_list = []
                else:
                    try:
                        mm_list = [int(k)-1 for k in sites.loc[[rec_id]]['position_base1']]
                    except KeyError:
                        # the record has no sites of this type
                        mm_list = []
                mm_loc_dict[mm] = mm_list

            description_str = ''.join(
                '| ' + mm + '_base0: ' + str(mm_loc_dict[mm])
                for mm in mm_loc_dict)

            # upper-case the four nucleotides; other characters are kept
            coding_seq = record.seq[0:-3] if last_is_stop else record.seq
            final_sequence = str(coding_seq).replace('a','A').replace('g','G').replace('t','T').replace('c','C')

            good_record = True
            if stop_as_bad_records:
                if '*' in Seq(str(final_sequence), generic_dna).translate():
                    good_record = False
            if met_as_good_records:
                if Seq(str(record.seq[0:3]), generic_dna).translate() != 'M':
                    good_record = False
            if last_is_stop:
                if Seq(str(record.seq[-3:len(record.seq)]), generic_dna).translate() != '*':
                    good_record = False

            n_sites = sum(len(mm_loc_dict[mm]) for mm in all_mm)
            if not good_record:
                writer_bad.write_record(record)
                n_bad += 1
                sites_bad += n_sites
            else:
                # Drop the trailing partial codon. The former slice end
                # ``-len(seq)%3`` parsed as ``(-len(seq)) % 3`` and cut the
                # sequence down to one or two characters.
                remainder = len(final_sequence) % 3
                if remainder:
                    final_sequence = final_sequence[:-remainder]
                current_record = SeqRecord(Seq(final_sequence,generic_dna), id = rec_id, description = description_str)
                writer.write_record(current_record)
                n_good += 1
                sites_good += n_sites

        writer.write_footer()
        if n_bad:
            writer_bad.write_footer()

    print(str(n_good) + ' good sequence with ' + str(sites_good) + 'sites')
    print(str(n_bad) + ' bad sequence with ' + str(sites_bad) + 'sites')
# Copy to the output FASTA every record of the input FASTA whose name
# appears in the header row of a tab-delimited OTU table.
# Usage: <script> ORIGINAL_FASTA OTU_TABLE OUTPUT_FASTA
import csv
import itertools
import sys

from Bio import SeqIO
from Bio.SeqIO.FastaIO import FastaWriter

original_file = sys.argv[1]
otu_table = sys.argv[2]
project_file = sys.argv[3]

## read in the csv file and get header names
# Text mode with newline='' is what the csv module expects on Python 3.
with open(otu_table, newline='') as table_normalized_otus:
    reader = csv.reader(table_normalized_otus, delimiter="\t")
    headers = next(reader)
print(headers)

# A set makes the per-record membership test O(1).
header_names = set(headers)

# Mode 'w' truncates any previous output; both files are closed on exit.
with open(original_file, "r") as total_fasta_handle, \
        open(project_file, 'w') as project_fasta:
    writer = FastaWriter(project_fasta, wrap=None)
    writer.write_header()

    for records in SeqIO.parse(total_fasta_handle, "fasta"):
        if records.name in header_names:
            writer.write_record(records)

    writer.write_footer()
def create_edited_proteins_all_represented_combinations(
        input_path,
        fasta_input,
        output_path,
        final_peps_df,
        max_edits_per_pep=None,
        allow_change_in_cleavage_sites=False):
    """Write one protein per distinct editing combination of each sequence.

    For each ``seq_id`` in ``final_peps_df`` all editing combinations listed
    in ``editing_combinations_relative_to_coding_seq_base0`` are collected.
    Each distinct combination is written once: the all-empty combination as
    the native translation (id ``<seq_id>_original``), any other as the
    translation of the edited sequence (id ``<seq_id>_edited_<n>``).

    Output file:
    ``output_path + 'proteins_per_combination_from_' + fasta_input``.

    Args:
        input_path: Prefix (directory) of the input FASTA file.
        fasta_input: Name of the input FASTA file.
        output_path: Prefix (directory) of the output file.
        final_peps_df: DataFrame with the columns ``seq_id``,
            ``editing_combinations_relative_to_coding_seq_base0`` and, when
            cleavage-site changes are not allowed, ``N_terminus``,
            ``C_terminus`` and ``cancelled_cs_in_pep``. Not modified.
        max_edits_per_pep: When not None, combinations with more sites than
            this are skipped.
        allow_change_in_cleavage_sites: When False, a sequence is skipped
            entirely if the first row for its ``seq_id`` reports a changed
            N/C terminus or a cancelled cleavage site.
    """
    # seq-id -> mRNA sequence, from the input fasta file
    with open(input_path + fasta_input, "r") as in_handle:
        sequences_dict = {
            record.id: record.seq
            for record in SeqIO.parse(in_handle, "fasta")
        }

    # all editing combinations per seq_id (a list of lists of combinations)
    comps_editing_combs = final_peps_df.groupby('seq_id')[
        'editing_combinations_relative_to_coding_seq_base0'].aggregate(
            lambda x: list(x))
    # only seq_id-level data is needed from here on: keep the first row per
    # seq_id and index by it
    per_seq_df = final_peps_df.drop_duplicates(
        subset='seq_id', keep='first').set_index('seq_id')

    # the combination that stands for the unedited sequence
    unedited_comb = ([], [], [], [], [], [], [], [], [], [], [], [])

    out_name = output_path + 'proteins_per_combination_from_' + fasta_input
    with open(out_name, 'w') as out_handle:
        writer = FastaWriter(out_handle, wrap=None)
        writer.write_header()

        # Series.items() replaces iteritems(), which pandas 2.0 removed.
        for index, combs_nested_list in comps_editing_combs.items():
            mrna = sequences_dict[index]
            length = len(mrna)
            protein_basic_description = ''

            # The cleavage-site flags depend only on the sequence, so they
            # are checked once instead of once per combination.
            if not allow_change_in_cleavage_sites and (
                    per_seq_df.loc[index, 'N_terminus'] != 'no_change'
                    or per_seq_df.loc[index, 'C_terminus'] != 'no_change'
                    or per_seq_df.loc[index, 'cancelled_cs_in_pep']):
                continue

            # Combinations hold lists and are unhashable, so the "already
            # written" bookkeeping stays a list.
            written_combs = []
            n = 1
            for sublist in combs_nested_list:
                for comb in sublist:
                    if max_edits_per_pep is not None:
                        n_sites = sum(1 for edit_type in comb
                                      for site in edit_type)
                        if n_sites > max_edits_per_pep:
                            continue
                    if comb in written_combs:
                        continue

                    if comb == unedited_comb:  # the original sequence
                        comb_id = index + '_original'
                        protein = mrna.translate()
                        protein_description = protein_basic_description
                    else:
                        comb_id = index + '_edited_' + str(n)
                        protein_description = (
                            protein_basic_description +
                            '| editing_combinations_base0_wrt_to_coding_sequence: '
                            + str(comb))
                        protein = Seq(
                            edit_rna_as_peptide(str(mrna), (0, length - 1),
                                                comb),
                            generic_dna).translate()
                        n += 1
                    written_combs.append(comb)
                    writer.write_record(
                        SeqRecord(protein,
                                  id=comb_id,
                                  description=protein_description))

        writer.write_footer()