def write_annotations_to_output(output_dic, output_file, summary_file):
    '''
    Write annotated sequences to a tab-delimited output file.

    Each row of summary_file is read in turn and its amino acid sequence is
    looked up in output_dic. For every annotation stored under that sequence
    one line is written: the summary columns followed by the annotation
    columns. A summary row therefore produces zero or more output lines.

    Parameters:
        output_dic: dict keyed by amino acid sequence. Each value maps every
            annotation column name to a list; index i across those lists
            describes the i-th annotation of that sequence.
        output_file: path of the file to write (opened in write mode, so
            existing content is replaced).
        summary_file: path of the tab-delimited summary file. Its first row
            is treated as the header.

    Returns:
        Number of annotation lines written (the header is not counted).

    Raises:
        StopIteration: if summary_file has no header row.
        ValueError: if a summary column name is missing from the header.
            Column positions are resolved once, right after the header is
            read, so this is raised before any data row is processed.
    '''
    # Summary file column names; the unpacking also checks that exactly
    # eight names are returned.
    (gene_name_colname, miso_event_colname, reading_frame_colname,
     nucleotide_seq_colname, amino_acid_seq_colname, gene_id_colname,
     transcript_id_colname, exon_number_colname) = get_summary_file_colnames()
    summary_colnames = [gene_name_colname, miso_event_colname,
                        reading_frame_colname, nucleotide_seq_colname,
                        amino_acid_seq_colname, gene_id_colname,
                        transcript_id_colname, exon_number_colname]

    # Annotation column names.
    start_colname, end_colname, descript_colname = get_uniprot_subkeys()
    exon_start_colname = 'exon_start'
    exon_end_colname = 'exon_end'
    feature_colname = 'feature'
    annotation_colnames = [feature_colname, start_colname, end_colname,
                           descript_colname, exon_start_colname,
                           exon_end_colname]

    writecount = 0

    # NOTE(review): 'wb'/'rb' are the modes the Python 2 csv module expects;
    # under Python 3 they would have to become text mode with newline=''.
    # The context managers guarantee both files are closed even when a row
    # fails half way through.
    with open(output_file, 'wb') as outfile:
        mywriter = csv.writer(outfile, delimiter='\t')

        # Write header to output file. Order matters.
        mywriter.writerow(summary_colnames + annotation_colnames)

        with open(summary_file, 'rb') as readfile:
            myreader = csv.reader(readfile, delimiter='\t')
            readheader = next(myreader)

            # Resolve column positions once instead of once per written cell.
            aa_seq_index = readheader.index(amino_acid_seq_colname)
            summary_indices = [readheader.index(colname)
                               for colname in summary_colnames]

            for row in myreader:
                # The amino acid sequence is the key into output_dic.
                aa_seq = row[aa_seq_index]
                if aa_seq not in output_dic:
                    # No associated annotations, skip to next row.
                    continue

                annotations = output_dic[aa_seq]
                annotation_count = len(annotations[feature_colname])
                if not annotation_count:
                    continue

                # The summary part is identical for every annotation of
                # this row, so build it once.
                summary_values = [row[idx] for idx in summary_indices]

                # One output line per annotation of this sequence.
                for annote_index in range(annotation_count):
                    annotation_values = [annotations[colname][annote_index]
                                         for colname in annotation_colnames]
                    mywriter.writerow(summary_values + annotation_values)
                    writecount += 1

    return writecount
def write_annotations_to_output(output_dic, output_file, summary_file):
    '''
    Write annotated sequences to a tab-delimited output file.

    Each row of summary_file is read in turn and its amino acid sequence is
    looked up in output_dic. For every annotation stored under that sequence
    one line is written: the summary columns followed by the annotation
    columns. A summary row therefore produces zero or more output lines.

    Parameters:
        output_dic: dict keyed by amino acid sequence. Each value maps every
            annotation column name to a list; index i across those lists
            describes the i-th annotation of that sequence.
        output_file: path of the file to write (opened in write mode, so
            existing content is replaced).
        summary_file: path of the tab-delimited summary file. Its first row
            is treated as the header.

    Returns:
        Number of annotation lines written (the header is not counted).

    Raises:
        StopIteration: if summary_file has no header row.
        ValueError: if a summary column name is missing from the header.
            Column positions are resolved once, right after the header is
            read, so this is raised before any data row is processed.
    '''
    # Summary file column names; the unpacking also checks that exactly
    # eight names are returned.
    (gene_name_colname, miso_event_colname, reading_frame_colname,
     nucleotide_seq_colname, amino_acid_seq_colname, gene_id_colname,
     transcript_id_colname, exon_number_colname) = get_summary_file_colnames()
    summary_colnames = [
        gene_name_colname, miso_event_colname, reading_frame_colname,
        nucleotide_seq_colname, amino_acid_seq_colname, gene_id_colname,
        transcript_id_colname, exon_number_colname
    ]

    # Annotation column names.
    start_colname, end_colname, descript_colname = get_uniprot_subkeys()
    exon_start_colname = 'exon_start'
    exon_end_colname = 'exon_end'
    feature_colname = 'feature'
    annotation_colnames = [
        feature_colname, start_colname, end_colname, descript_colname,
        exon_start_colname, exon_end_colname
    ]

    writecount = 0

    # NOTE(review): 'wb'/'rb' are the modes the Python 2 csv module expects;
    # under Python 3 they would have to become text mode with newline=''.
    # The context managers guarantee both files are closed even when a row
    # fails half way through.
    with open(output_file, 'wb') as outfile:
        mywriter = csv.writer(outfile, delimiter='\t')

        # Write header to output file. Order matters.
        mywriter.writerow(summary_colnames + annotation_colnames)

        with open(summary_file, 'rb') as readfile:
            myreader = csv.reader(readfile, delimiter='\t')
            readheader = next(myreader)

            # Resolve column positions once instead of once per written cell.
            aa_seq_index = readheader.index(amino_acid_seq_colname)
            summary_indices = [readheader.index(colname)
                               for colname in summary_colnames]

            for row in myreader:
                # The amino acid sequence is the key into output_dic.
                aa_seq = row[aa_seq_index]
                if aa_seq not in output_dic:
                    # No associated annotations, skip to next row.
                    continue

                annotations = output_dic[aa_seq]
                annotation_count = len(annotations[feature_colname])
                if not annotation_count:
                    continue

                # The summary part is identical for every annotation of
                # this row, so build it once.
                summary_values = [row[idx] for idx in summary_indices]

                # One output line per annotation of this sequence.
                for annote_index in range(annotation_count):
                    annotation_values = [annotations[colname][annote_index]
                                         for colname in annotation_colnames]
                    mywriter.writerow(summary_values + annotation_values)
                    writecount += 1

    return writecount
def append_dic_if_feature_within_start_end(exon_start, exon_end,
                                           amino_acid_seq, uniprot_dic,
                                           gene_key, feature, output_dic):
    '''
    Record every annotation of one uniprot feature that overlaps an exon.

    The start, end and description lists stored at
    uniprot_dic[gene_key][feature] are walked in parallel. An annotation is
    skipped when exon_start > feature_end or exon_end < feature_start;
    otherwise its values are appended to the lists kept under
    output_dic[amino_acid_seq], which is created on first use.

    Layout of output_dic once something has matched:
        {amino_acid_seq: {start_subkey: [...], end_subkey: [...],
                          descript_subkey: [...], 'exon_start': [...],
                          'exon_end': [...], 'feature': [...]}}
    where the first three key names are those returned by
    get_uniprot_subkeys().

    Returns output_dic (modified in place) and the number of annotations
    appended by this call.
    '''
    # Key names used both to read uniprot_dic and to store the results.
    uniprot_start_key, uniprot_end_key, uniprot_descript_key = \
        get_uniprot_subkeys()

    # Keys of a per-sequence record, in the order their values are stored.
    record_keys = [uniprot_start_key, uniprot_end_key, uniprot_descript_key,
                   'exon_start', 'exon_end', 'feature']

    feature_record = uniprot_dic[gene_key][feature]
    starts = feature_record[uniprot_start_key]
    ends = feature_record[uniprot_end_key]
    descriptions = feature_record[uniprot_descript_key]

    n_matches = 0
    for feat_start, feat_end, feat_descript in zip(starts, ends,
                                                   descriptions):
        if exon_start > feat_end or exon_end < feat_start:
            # Annotation lies entirely outside the exon range.
            continue

        # First match for this sequence: give every key its own empty list.
        if amino_acid_seq not in output_dic:
            output_dic[amino_acid_seq] = dict((key, [])
                                              for key in record_keys)

        record = output_dic[amino_acid_seq]
        record_values = [feat_start, feat_end, feat_descript,
                         exon_start, exon_end, feature]
        for key, value in zip(record_keys, record_values):
            record[key].append(value)
        n_matches += 1

    return output_dic, n_matches
def append_dic_if_feature_within_start_end(exon_start, exon_end,
                                           amino_acid_seq, uniprot_dic,
                                           gene_key, feature, output_dic):
    '''
    Record every annotation of one uniprot feature that overlaps an exon.

    The start, end and description lists stored at
    uniprot_dic[gene_key][feature] are walked in parallel. An annotation is
    skipped when exon_start > feature_end or exon_end < feature_start;
    otherwise its values are appended to the lists kept under
    output_dic[amino_acid_seq], which is created on first use.

    Layout of output_dic once something has matched:
        {amino_acid_seq: {start_subkey: [...], end_subkey: [...],
                          descript_subkey: [...], 'exon_start': [...],
                          'exon_end': [...], 'feature': [...]}}
    where the first three key names are those returned by
    get_uniprot_subkeys().

    Returns output_dic (modified in place) and the number of annotations
    appended by this call.
    '''
    # Key names used both to read uniprot_dic and to store the results.
    uniprot_start_key, uniprot_end_key, uniprot_descript_key = \
        get_uniprot_subkeys()

    # Keys of a per-sequence record, in the order their values are stored.
    record_keys = [
        uniprot_start_key, uniprot_end_key, uniprot_descript_key,
        'exon_start', 'exon_end', 'feature'
    ]

    feature_record = uniprot_dic[gene_key][feature]
    starts = feature_record[uniprot_start_key]
    ends = feature_record[uniprot_end_key]
    descriptions = feature_record[uniprot_descript_key]

    n_matches = 0
    for feat_start, feat_end, feat_descript in zip(starts, ends,
                                                   descriptions):
        if exon_start > feat_end or exon_end < feat_start:
            # Annotation lies entirely outside the exon range.
            continue

        # First match for this sequence: give every key its own empty list.
        if amino_acid_seq not in output_dic:
            output_dic[amino_acid_seq] = dict((key, [])
                                              for key in record_keys)

        record = output_dic[amino_acid_seq]
        record_values = [
            feat_start, feat_end, feat_descript,
            exon_start, exon_end, feature
        ]
        for key, value in zip(record_keys, record_values):
            record[key].append(value)
        n_matches += 1

    return output_dic, n_matches