from Bio import SeqIO
from Bio.SeqIO.FastaIO import FastaWriter
from Bio.SeqRecord import SeqRecord


def create_peptides_fasta(input_path, fasta_input, peps_df, extention=15):
    """For each peptide in peps_df, write the peptide together with up to
    `extention` flanking residues taken from the translated transcript."""
    writer = FastaWriter(
        open(input_path + 'peptides_extanded_by' + str(extention) + '_from' + fasta_input, 'w'),
        wrap=None)
    writer.write_header()
    for record in SeqIO.parse(open(input_path + fasta_input, "r"), "fasta"):
        prot = record.seq.translate()
        for i, row in peps_df[peps_df['seq_id'] == record.id].iterrows():
            rna_pep_coor = row['in_frame_coordinates_base0'].split('_')
            # integer division converts nucleotide coordinates to amino-acid coordinates
            pep_start = int(rna_pep_coor[1]) // 3
            pep_end = int(rna_pep_coor[2]) // 3
            seq_start = max(0, pep_start - extention)
            seq_end = min(pep_end + extention, len(prot))
            extented_pep = (prot[seq_start:pep_start] + row['biological_peptide']
                            + prot[min(pep_end + 1, len(prot)):seq_end])
            if not row['edited']:
                seq_id = (record.id + '_original_' + str(seq_start * 3) + '_'
                          + str(seq_end * 3) + '_pep_id_' + str(i))
            else:
                seq_id = (record.id + '_' + str(seq_start * 3) + '_' + str(seq_end * 3)
                          + '_editing_range' + row['permutation_coor_base0']
                          + '_pep_id_' + str(i))
            writer.write_record(SeqRecord(extented_pep, id=seq_id, description=''))
    writer.write_footer()
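# A minimal usage sketch of create_peptides_fasta. Everything below is hypothetical:
# the paths, the transcript FASTA, and the single-row peps_df. The column names mirror
# the ones the function reads, and 'in_frame_coordinates_base0' is assumed to be an
# underscore-joined string whose second and third fields are the peptide's nucleotide
# start/end within the coding sequence.
import pandas as pd

peps_df = pd.DataFrame([{
    'seq_id': 'comp1',                          # must match a record id in the FASTA
    'in_frame_coordinates_base0': 'frame_30_59',
    'biological_peptide': 'MSSPEPTIDE',
    'edited': False,
    'permutation_coor_base0': '',
}])
create_peptides_fasta('/data/peptides/', 'transcripts.fasta', peps_df, extention=15)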
import errno
import gzip
import os

from Bio.SeqIO.FastaIO import FastaIterator, FastaWriter


def cut_fasta_by_len(fa_file, len_cutoff, outdir, prefix, suffix):
    # https://stackoverflow.com/questions/273192/how-can-i-create-a-directory-if-it-does-not-exist
    # Defeats race condition when another thread created the path
    #if not os.path.exists(outdir):
    #    os.mkdir(outdir)
    try:
        os.makedirs(outdir)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

    cut_fa_file = os.path.join(outdir, prefix + ".ge" + str(len_cutoff) + suffix)
    if os.path.exists(cut_fa_file) and (os.path.getsize(cut_fa_file) > 0):
        return cut_fa_file

    if fa_file.endswith(".gz"):
        in_h = gzip.open(fa_file, 'rt')
    else:
        in_h = open(fa_file, 'r')
    with open(cut_fa_file, 'w') as out_h:
        #for rec in SeqIO.parse(in_h, 'fasta'):
        #    if len(rec.seq) >= len_cutoff:
        #        SeqIO.write(rec, out_h, 'fasta')
        # yes, the SeqIO.parse() API is simpler to use and easier to understand,
        # but try a different method and you will find something
        writer = FastaWriter(out_h)
        writer.write_header()
        for rec in FastaIterator(in_h):
            if len(rec) >= len_cutoff:
                writer.write_record(rec)
        writer.write_footer()
    in_h.close()
    return cut_fa_file
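# A minimal usage sketch of cut_fasta_by_len with hypothetical paths: keep only
# records of at least 500 bp from a (possibly gzipped) assembly.
filtered_fa = cut_fasta_by_len("assembly.fa.gz", 500, "filtered", "assembly", ".fa")
print(filtered_fa)  # filtered/assembly.ge500.fa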
from Bio import SeqIO
from Bio.SeqIO.FastaIO import FastaWriter
from Bio.SeqRecord import SeqRecord


def create_proteins_for_each_peptide(input_path, fasta_input, output_path,
                                     final_peptides, allow_change_in_cleavage_sites=False):
    """
    for each sequence create the native protein and create a version of that
    protein for each peptide
    """
    final_edited_peptides = final_peptides[final_peptides['edited']]

    #create a seq-id:sequence dictionary from input fasta file
    sequences_dict = {}
    for record in SeqIO.parse(open(input_path + fasta_input, "r"), "fasta"):
        sequences_dict.update({record.id: record.seq})

    writer = FastaWriter(
        open(output_path + 'proteins_per_peptide_from_' + fasta_input, 'w'), wrap=None)
    writer.write_header()

    for key, mrna_sequence in sequences_dict.items():
        #first print the native protein
        comb_id = key + '|original'
        protein = mrna_sequence.translate()
        writer.write_record(SeqRecord(protein, id=comb_id, description=''))

        edited_peptides = final_edited_peptides[final_edited_peptides['seq_id'] == key]
        n = 1
        for index, row in edited_peptides.iterrows():
            #flag editing combination for print\dont print in proteins file
            edit_prot = True
            if not allow_change_in_cleavage_sites and edit_prot:
                # look the peptide up in the final_peptides table passed to this function
                if (final_peptides.loc[index, 'N_terminus'] != 'no_change'
                        or final_peptides.loc[index, 'C_terminus'] != 'no_change'
                        or final_peptides.loc[index, 'cancelled_cs_in_pep']):
                    edit_prot = False

            if edit_prot:
                permutation_coor = tuple(
                    int(x) for x in row['permutation_coor_base0'].split('_') if x != '')
                protein = (mrna_sequence[:permutation_coor[0]].translate()
                           + row['biological_extended_peptide']
                           + mrna_sequence[permutation_coor[1] + 1:])
                comb_id = (key + '|edited_' + str(n) + '\t'
                           + str(row['editing_combinations_relative_to_coding_seq_base0']))
                writer.write_record(SeqRecord(protein, id=comb_id, description=''))
                n += 1

    writer.write_footer()
from Bio.SeqIO.FastaIO import FastaWriter


def writeFasta(fb, seqList):
    if len(seqList) <= 0:
        raise ValueError("No data to Persist.")
    writer = FastaWriter(fb)
    writer.write_header()
    for record in seqList:
        writer.write_record(record)
    writer.write_footer()
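# A minimal usage sketch of writeFasta with made-up sequences: build SeqRecord
# objects in memory and persist them to a hypothetical output file.
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

records = [
    SeqRecord(Seq("ATGGCCATTGTAATGGGCCGC"), id="seq1", description=""),
    SeqRecord(Seq("ATGAAACGCATTAGCACCACC"), id="seq2", description=""),
]
with open("example_out.fasta", "w") as fb:
    writeFasta(fb, records)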
import re

from Bio import SeqIO
from Bio.Alphabet import generic_dna  # Biopython < 1.78
from Bio.Seq import Seq
from Bio.SeqIO.FastaIO import FastaWriter
from Bio.SeqRecord import SeqRecord


def create_fully_edited_proteins_fasta(input_path, fasta_input, output_path):
    """
    for each sequence create a native protein version and a fully edited version
    """
    # one lookbehind regex per mismatch type, matching the site list in the record header
    mm_headers = {mm: re.compile(r'(?<=' + mm + r'_base0:\s).*?]') for mm in all_mm}

    writer = FastaWriter(
        open(output_path + 'fully_edited_and_native_proteins_from_' + fasta_input, 'w'),
        wrap=None)
    writer.write_header()

    for record in SeqIO.parse(open(input_path + fasta_input, "r"), "fasta"):
        sites_dict = {
            mm: sorted(eval(find_by_regex_in_header(record.description, mm_headers[mm])))
            for mm in all_mm
        }
        sites_number = sum([len(sites_dict[mm]) for mm in all_mm])
        length = len(record.seq)
        comb = tuple([sites_dict[mm] for mm in all_mm])
        protein_basic_description = ''

        #translate native protein
        seq_id = record.id + '_original'
        protein = record.seq.translate()
        writer.write_record(
            SeqRecord(protein, id=seq_id, description=protein_basic_description))

        if sites_number:
            seq_id = record.id + '_fully_edited'
            protein_description = (protein_basic_description
                                   + '| editing_combinations_base0_wrt_to_coding_sequence: '
                                   + str(comb))
            edited_seq = Seq(
                edit_rna_as_peptide(str(record.seq), (0, length - 1), comb), generic_dna)
            protein = edited_seq.translate()
            writer.write_record(
                SeqRecord(protein, id=seq_id, description=protein_description))
            if len(edited_seq) % 3:
                print(record.id)
                print(len(record.seq))
                print(len(edited_seq))

    writer.write_footer()
from Bio.SeqIO.FastaIO import FastaWriter


def trierFastaByDomain(tgtDomain, fastaDict, step2List, writeFileName, formatFunc):
    # keep only the records whose domain matches tgtDomain
    fb = open(writeFileName, 'w')
    writer = FastaWriter(fb)
    writer.write_header()
    for record in step2List:
        score, gName, domain, gID, ARC, RF, reverse, begin, end, desc = formatFunc(record)
        if domain == tgtDomain:
            if fastaDict.get(gID) is not None:
                writer.write_record(fastaDict.get(gID))
            # else:
            #     print("[%s] does not exist in the file" % gID)
    writer.write_footer()
    fb.close()
import gzip

from Bio import bgzf
from Bio.SeqIO.FastaIO import FastaIterator, FastaWriter


def reheader_fasta(fa_in, fa_out, header_function, in_gz, gz):
    if in_gz:
        in_h = gzip.open(fa_in, 'rt')
    else:
        in_h = open(fa_in, 'r')
    if gz:
        out_h = bgzf.BgzfWriter(fa_out, 'wb')
    else:
        out_h = open(fa_out, 'w')
    writer = FastaWriter(out_h)
    writer.write_header()
    # title2ids lets the iterator rewrite each record's id/name/description
    for rec in FastaIterator(in_h, title2ids=header_function):
        writer.write_record(rec)
    writer.write_footer()
    out_h.close()
    in_h.close()
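# A minimal usage sketch of reheader_fasta, under assumptions: the input/output
# paths are hypothetical and first_token is a made-up header_function. A title2ids
# callback receives the raw FASTA title line (without the leading '>') and returns
# an (id, name, description) tuple.
def first_token(title):
    # keep only the first whitespace-delimited token as both id and name
    token = title.split()[0]
    return token, token, ""

reheader_fasta("transcripts.fa.gz", "transcripts.renamed.fa.bgz",
               header_function=first_token, in_gz=True, gz=True)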
def make_qiime_output(self):
    # Prepare fasta writer #
    handle = open(self.qiime_fasta.path, 'w')
    writer = FastaWriter(handle, wrap=0)
    writer.write_header()
    # Counter #
    counter = defaultdict(int)
    # Do it #
    for r in self.only_used.parse_barcodes():
        sample_name = r.first.sample.short_name
        counter[sample_name] += 1
        r.read.id = '%s_%i %s' % (sample_name, counter[sample_name], r.read.id)
        bar_seq = r.read.seq[0:self.pool.bar_len]
        r.read.description = "orig_bc=%s new_bc=%s bc_diffs=0" % (bar_seq, bar_seq)
        writer.write_record(r.read[self.trim_fwd:-self.trim_rev])
    # Close #
    writer.write_footer()
    handle.close()
from Bio.Alphabet import generic_dna  # Biopython < 1.78
from Bio.Seq import Seq
from Bio.SeqIO.FastaIO import FastaWriter
from Bio.SeqRecord import SeqRecord


def cleanUpFasta(fname, fastaDict, step2List, Step2FormatSepFunc, seuil=1e-3):
    with open(fname, 'w') as fb:
        writer = FastaWriter(fb)
        writer.write_header()
        for line in step2List:
            try:
                score, gName, domain, gId, ARC, RF, reverse, begin, end, desc = Step2FormatSepFunc(line)
                if score > seuil:
                    # print("[%s] score [%f] > threshold [%f].\n" % (gName, score, seuil))
                    continue
                code = str(fastaDict[gName].seq)
                if reverse:
                    code = code[::-1]
                record = SeqRecord(Seq(code[begin:end], generic_dna),
                                   name=gName, id=gId, description=desc)
                writer.write_record(record)
            except KeyError:
                print("[%s] does not exist in the fasta dictionary.\n" % gName)
                continue
        writer.write_footer()
from Bio import SeqIO
from Bio.Alphabet import generic_dna  # Biopython < 1.78
from Bio.Seq import Seq
from Bio.SeqIO.FastaIO import FastaWriter
from Bio.SeqRecord import SeqRecord


def create_in_frame_rna_file_from_anovar_results_and_coding_mrna_seqs_final_sites_dfs(
        fasta_file, output_name, out_path, mm_df_dict, stop_as_bad_records,
        met_as_good_records, last_is_stop, variants_to_use=[]):
    """
    input - coding sequences as a fasta file
    sites (wrt coding sequence) dataframes - the result of read_editing_sites_wrt_coding_seqs
    after the ucsc_id column is set as index, with a separate dataframe per mismatch type
    output - a fasta file in the format of the proteomics simulator
    some of the values in the header will be useless because the input already contains
    the coding sequences, so this function does not trim the sequences.
    """
    n_bad = 0
    n_good = 0
    sites_good = 0
    sites_bad = 0
    writer = FastaWriter(open(out_path + output_name + '.fasta', 'w'), wrap=None)
    writer_bad = FastaWriter(open(out_path + 'bad_seqs_' + output_name + '.fasta', 'w'), wrap=None)
    writer.write_header()
    writer_bad.write_header()

    for record in SeqIO.parse(open(fasta_file, "r"), "fasta"):
        mm_loc_dict = {}
        split_header = record.id.split(';')
        rec_id = split_header[0] + ';' + split_header[1]

        use_variant = True
        if len(variants_to_use):
            #if a non-empty list is passed for variants_to_use, flag variants that are not in the list so they will not be included in the output
            if rec_id not in variants_to_use:
                use_variant = False

        if use_variant:
            for mm in all_mm:
                if mm_df_dict[mm] is None:
                    mm_list = []
                else:
                    sites = mm_df_dict[mm]
                    try:
                        mm_list = [int(k) - 1 for k in sites.loc[[rec_id]]['position_base1']]
                    except KeyError:
                        mm_list = []
                mm_loc_dict.update({mm: mm_list})

            # prot_start_nuc = 1
            # prot_end_nuc = len(final_sequence)
            # if last_is_stop:
            #     prot_end_nuc = prot_end_nuc - 3
            # prot_start = 'first_met_in_original_orf'
            # prot_end = 'original_sense_strand_orf_end'
            # strand = '+'
            # orf_start = 1
            # orf_end = len(record.seq) - 3

            mm_str = ''
            for mm in mm_loc_dict:
                mm_str += '| ' + mm + '_base0: ' + str(mm_loc_dict[mm])
            # description_str = mm_str + ' | prot_start: ' + str(prot_start) + ' | prot_end: ' + str(prot_end) + ' | strand: ' + strand + ' | prot_start_nuc: ' + str(prot_start_nuc) + ' | prot_end_nuc: ' + str(prot_end_nuc) + ' | original_orf_start: ' + str(orf_start) + ' | original_orf_end: ' + str(orf_end)
            description_str = mm_str

            if last_is_stop:
                final_sequence = str(record.seq[0:-3]).replace('a', 'A').replace('g', 'G').replace('t', 'T').replace('c', 'C')
            else:
                final_sequence = str(record.seq).replace('a', 'A').replace('g', 'G').replace('t', 'T').replace('c', 'C')

            good_record = True
            if stop_as_bad_records:
                if '*' in Seq(str(final_sequence), generic_dna).translate():
                    good_record = False
            if met_as_good_records:
                if Seq(str(record.seq[0:3]), generic_dna).translate() != 'M':
                    good_record = False
            if last_is_stop:
                if Seq(str(record.seq[-3:len(record.seq)]), generic_dna).translate() != '*':
                    good_record = False

            if not good_record:
                writer_bad.write_record(record)
                n_bad += 1
                sites_bad += sum([len(mm_loc_dict[mm]) for mm in all_mm])
            else:
                if len(final_sequence) % 3:
                    # trim the trailing partial codon so the length is a multiple of 3
                    final_sequence = final_sequence[0:-(len(final_sequence) % 3)]
                current_record = SeqRecord(Seq(final_sequence, generic_dna),
                                           id=rec_id, description=description_str)
                writer.write_record(current_record)
                n_good += 1
                sites_good += sum([len(mm_loc_dict[mm]) for mm in all_mm])

    writer.write_footer()
    if n_bad:
        writer_bad.write_footer()
    print(str(n_good) + ' good sequences with ' + str(sites_good) + ' sites')
    print(str(n_bad) + ' bad sequences with ' + str(sites_bad) + ' sites')
import csv
import sys

from Bio import SeqIO
from Bio.SeqIO.FastaIO import FastaWriter

original_file = sys.argv[1]
otu_table = sys.argv[2]
project_file = sys.argv[3]

total_fasta = SeqIO.parse(open(original_file, "r"), "fasta")

# truncate the output file, then reopen it in append mode
project_fasta = open(project_file, 'w')
project_fasta.close()
project_fasta = open(project_file, 'a')

## read in the csv file and get header names
table_normalized_otus = open(otu_table, 'r', newline='')
reader = csv.reader(table_normalized_otus, delimiter="\t")
headers = next(reader)
print(headers)

writer = FastaWriter(project_fasta, wrap=None)
writer.write_header()
for records in total_fasta:
    # print(records.name)
    if records.name in headers:
        writer.write_record(records)
writer.write_footer()
from Bio import SeqIO
from Bio.Alphabet import generic_dna  # Biopython < 1.78
from Bio.Seq import Seq
from Bio.SeqIO.FastaIO import FastaWriter
from Bio.SeqRecord import SeqRecord


def create_edited_proteins_all_represented_combinations(
        input_path, fasta_input, output_path, final_peps_df,
        max_edits_per_pep=None, allow_change_in_cleavage_sites=False):
    """
    for each sequence create the native protein and create a version of that protein
    for each editing combination represented by each edited peptide
    """
    #create a seq-id:sequence dictionary from input fasta file
    sequences_dict = {}
    for record in SeqIO.parse(open(input_path + fasta_input, "r"), "fasta"):
        sequences_dict.update({record.id: record.seq})

    writer = FastaWriter(
        open(output_path + 'proteins_per_combination_from_' + fasta_input, 'w'), wrap=None)
    writer.write_header()

    #creating a series of all editing combinations per protein
    # comps_editing_combs = final_peps_df.groupby('seq_id').agg({'editing_combinations_relative_to_sense_orf_base0': lambda x: sorted([comb for sublist in list(x) for comb in sublist])})
    comps_editing_combs = final_peps_df.groupby('seq_id')[
        'editing_combinations_relative_to_coding_seq_base0'].aggregate(lambda x: list(x))

    #for each seq_id, iterate over all editing combinations and create edited peptides
    final_peps_df = final_peps_df.drop_duplicates(
        subset='seq_id', keep='first')  #removing duplicates as only data at seq_id level is now needed
    final_peps_df.set_index('seq_id', inplace=True)

    for index, combs_nested_list in comps_editing_combs.items():
        written_combs = []
        n = 1
        protein_basic_description = ''
        length = len(sequences_dict[index])
        flattened_comb_list = [c for l in combs_nested_list for c in l]
        for comb in flattened_comb_list:
            #flag editing combination for print\dont print in proteins file
            edit_prot = True
            if max_edits_per_pep is not None:
                if len([site for edit_type in comb for site in edit_type]) > max_edits_per_pep:
                    edit_prot = False
            if not allow_change_in_cleavage_sites and edit_prot:
                if (final_peps_df.loc[index, 'N_terminus'] != 'no_change'
                        or final_peps_df.loc[index, 'C_terminus'] != 'no_change'
                        or final_peps_df.loc[index, 'cancelled_cs_in_pep']):
                    edit_prot = False

            #edit the protein and write it to file if the combination was not already written and does not exceed the allowed editing events
            if comb not in written_combs and edit_prot:
                if comb == ([], [], [], [], [], [], [], [], [], [], [], []):
                    #the original sequence
                    comb_id = index + '_original'
                    protein = sequences_dict[index].translate()
                    protein_description = protein_basic_description
                else:
                    comb_id = index + '_edited_' + str(n)
                    protein_description = (protein_basic_description
                                           + '| editing_combinations_base0_wrt_to_coding_sequence: '
                                           + str(comb))
                    protein = Seq(
                        edit_rna_as_peptide(str(sequences_dict[index]), (0, length - 1), comb),
                        generic_dna).translate()
                    n += 1
                written_combs.append(comb)
                writer.write_record(
                    SeqRecord(protein, id=comb_id, description=protein_description))

    writer.write_footer()