def get_info_from_fimo_output(fimo_path, writeobj, fimo_dic, remove_repeats=False):
    '''
    Collect per-pattern q-value statistics from a FIMO tab-delimited file.

    FIMO output is not assumed to be ordered, so every row is inspected.
    For each pattern name a sub-dictionary is created in fimo_dic (or the
    existing one is reused and extended) with these keys:
        'q-value'             list of q-values, kept as the strings read
        'sequence name'       list of sequence names already counted
        'pattern_name'        the pattern name (easier to retrieve later)
        'median_q_value'      stats_functions.median of the float q-values
        'n_motif_occurences'  number of q-values kept

    A row is skipped when its sequence name was already recorded for the
    same pattern, so each (pattern, sequence) pair is counted once.

    The summary keys are (re)computed for every entry of fimo_dic, not
    only for the patterns seen in this file.

    Parameters:
        fimo_path: path of the FIMO text output. Its first row must be a
            header containing the columns '#pattern name',
            'sequence name' and 'q-value'.
        writeobj: not used by this function; kept for existing callers.
        fimo_dic: dictionary that is updated in place.
        remove_repeats: not used by this function (default False);
            repeated sequence names are always skipped.

    Returns:
        fimo_dic, the same object that was passed in.

    Raises:
        StopIteration: the file has no header row.
        ValueError: one of the expected columns is missing from the header.
    '''
    # Column names expected in the FIMO header row.
    pattern_name_colname = '#pattern name'
    qval_colname = 'q-value'
    seq_name_colname = 'sequence name'
    # Keys written into every per-pattern sub-dictionary.
    pattern_name_subkey = 'pattern_name'
    qval_med_subkey = 'median_q_value'
    n_occurs_subkey = 'n_motif_occurences'

    # The csv module wants a binary handle on Python 2 but a text handle
    # opened with newline='' on Python 3. `str is bytes` only holds on 2.
    is_py2 = str is bytes
    open_mode = 'rb' if is_py2 else 'r'
    open_kwargs = {} if is_py2 else {'newline': ''}

    # pattern name -> set of sequence names already recorded. It mirrors
    # the stored list so the duplicate test is O(1) instead of a list scan.
    seen_seqs = {}

    with open(fimo_path, open_mode, **open_kwargs) as readfile:
        myreader = csv.reader(readfile, delimiter='\t')
        header = next(myreader)

        # Resolve the column positions once rather than for every row.
        seq_idx = header.index(seq_name_colname)
        pattern_idx = header.index(pattern_name_colname)
        qval_idx = header.index(qval_colname)

        for row in myreader:
            seq_name = row[seq_idx]
            pattern_name = row[pattern_idx]
            qval = row[qval_idx]

            # First time we meet this pattern: start with empty lists.
            if pattern_name not in fimo_dic:
                fimo_dic[pattern_name] = {qval_colname: [],
                                          seq_name_colname: []}
            pattern_info = fimo_dic[pattern_name]

            # Seed the lookup set from whatever the entry already holds,
            # so sequences recorded by the caller beforehand still count.
            if pattern_name not in seen_seqs:
                seen_seqs[pattern_name] = set(pattern_info[seq_name_colname])

            # Same pattern and same sequence as an earlier row: skip it,
            # otherwise one sequence would be counted many times
            # (too many RBPs).
            if seq_name in seen_seqs[pattern_name]:
                continue
            seen_seqs[pattern_name].add(seq_name)

            pattern_info[qval_colname].append(qval)
            pattern_info[seq_name_colname].append(seq_name)

    # Add pattern name, median q-value and occurrence count to each entry.
    # Only the inner dictionaries are modified while iterating.
    for pat_name, pattern_info in fimo_dic.items():
        pattern_info[pattern_name_subkey] = pat_name
        qval_list = [float(q) for q in pattern_info[qval_colname]]
        pattern_info[qval_med_subkey] = stats_functions.median(qval_list)
        pattern_info[n_occurs_subkey] = len(qval_list)

    return fimo_dic
def get_info_from_fimo_output(fimo_path, writeobj, fimo_dic, remove_repeats=False):
    """
    Collect per-pattern q-value statistics from a FIMO tab-delimited file.

    FIMO output is not assumed to be ordered, so every row is inspected.
    For each pattern name a sub-dictionary is created in fimo_dic (or the
    existing one is reused and extended) with these keys:
        "q-value"             list of q-values, kept as the strings read
        "sequence name"       list of sequence names already counted
        "pattern_name"        the pattern name (easier to retrieve later)
        "median_q_value"      stats_functions.median of the float q-values
        "n_motif_occurences"  number of q-values kept

    A row is skipped when its sequence name was already recorded for the
    same pattern, so each (pattern, sequence) pair is counted once.

    The summary keys are (re)computed for every entry of fimo_dic, not
    only for the patterns seen in this file.

    Parameters:
        fimo_path: path of the FIMO text output. Its first row must be a
            header containing the columns "#pattern name",
            "sequence name" and "q-value".
        writeobj: not used by this function; kept for existing callers.
        fimo_dic: dictionary that is updated in place.
        remove_repeats: not used by this function (default False);
            repeated sequence names are always skipped.

    Returns:
        fimo_dic, the same object that was passed in.

    Raises:
        StopIteration: the file has no header row.
        ValueError: one of the expected columns is missing from the header.
    """
    # Column names expected in the FIMO header row.
    pattern_name_colname = "#pattern name"
    qval_colname = "q-value"
    seq_name_colname = "sequence name"
    # Keys written into every per-pattern sub-dictionary.
    pattern_name_subkey = "pattern_name"
    qval_med_subkey = "median_q_value"
    n_occurs_subkey = "n_motif_occurences"

    # The csv module wants a binary handle on Python 2 but a text handle
    # opened with newline="" on Python 3. `str is bytes` only holds on 2.
    is_py2 = str is bytes
    open_mode = "rb" if is_py2 else "r"
    open_kwargs = {} if is_py2 else {"newline": ""}

    # pattern name -> set of sequence names already recorded. It mirrors
    # the stored list so the duplicate test is O(1) instead of a list scan.
    seen_seqs = {}

    with open(fimo_path, open_mode, **open_kwargs) as readfile:
        myreader = csv.reader(readfile, delimiter="\t")
        header = next(myreader)

        # Resolve the column positions once rather than for every row.
        seq_idx = header.index(seq_name_colname)
        pattern_idx = header.index(pattern_name_colname)
        qval_idx = header.index(qval_colname)

        for row in myreader:
            seq_name = row[seq_idx]
            pattern_name = row[pattern_idx]
            qval = row[qval_idx]

            # First time we meet this pattern: start with empty lists.
            if pattern_name not in fimo_dic:
                fimo_dic[pattern_name] = {
                    qval_colname: [],
                    seq_name_colname: [],
                }
            pattern_info = fimo_dic[pattern_name]

            # Seed the lookup set from whatever the entry already holds,
            # so sequences recorded by the caller beforehand still count.
            if pattern_name not in seen_seqs:
                seen_seqs[pattern_name] = set(pattern_info[seq_name_colname])

            # Same pattern and same sequence as an earlier row: skip it,
            # otherwise one sequence would be counted many times
            # (too many RBPs).
            if seq_name in seen_seqs[pattern_name]:
                continue
            seen_seqs[pattern_name].add(seq_name)

            pattern_info[qval_colname].append(qval)
            pattern_info[seq_name_colname].append(seq_name)

    # Add pattern name, median q-value and occurrence count to each entry.
    # Only the inner dictionaries are modified while iterating.
    for pat_name, pattern_info in fimo_dic.items():
        pattern_info[pattern_name_subkey] = pat_name
        qval_list = [float(q) for q in pattern_info[qval_colname]]
        pattern_info[qval_med_subkey] = stats_functions.median(qval_list)
        pattern_info[n_occurs_subkey] = len(qval_list)

    return fimo_dic
def get_info_from_fimo_output2(fimo_path, writeobj, fimo_dic, convert_to_fraction=False):
    '''
    Collect per-gene q-value statistics from a FIMO tab-delimited file.

    Second attempt at getting info from fimo: rows are grouped by gene
    name (the text before the first comma of the pattern name, which is
    expected to look like GENENAME,MOTIF,IorD) so that a gene shows up
    only ONCE for a given sequence.

    For each gene a sub-dictionary is created in fimo_dic (or the existing
    one is reused and extended) with these keys:
        'q-value'             list of q-values, kept as the strings read
        'sequence name'       list of sequence names already counted
        'pattern_name'        the gene name
        'median_q_value'      stats_functions.median of the float q-values
        'n_motif_occurences'  number of q-values kept, or that number
                              divided by the sequence count read from
                              fimo.html when convert_to_fraction is True

    The summary keys are (re)computed for every entry of fimo_dic, not
    only for the genes seen in this file.

    Parameters:
        fimo_path: path of the FIMO text output. Its first row must be a
            header containing the columns '#pattern name',
            'sequence name' and 'q-value'.
        writeobj: not used by this function; kept for existing callers.
        fimo_dic: dictionary that is updated in place.
        convert_to_fraction: True to store occurrences as a fraction of
            the number of input sequences, False to store the raw count.
            For any other value a message is printed for each gene and
            'n_motif_occurences' is left unset.

    Returns:
        fimo_dic, the same object that was passed in.

    Raises:
        StopIteration: the file has no header row.
        ValueError: one of the expected columns is missing from the header.
    '''
    # Column names expected in the FIMO header row.
    pattern_name_colname = '#pattern name'
    qval_colname = 'q-value'
    seq_name_colname = 'sequence name'
    # Keys written into every per-gene sub-dictionary.
    pattern_name_subkey = 'pattern_name'
    qval_med_subkey = 'median_q_value'
    n_occurs_subkey = 'n_motif_occurences'

    # The csv module wants a binary handle on Python 2 but a text handle
    # opened with newline='' on Python 3. `str is bytes` only holds on 2.
    is_py2 = str is bytes
    open_mode = 'rb' if is_py2 else 'r'
    open_kwargs = {} if is_py2 else {'newline': ''}

    # gene name -> set of sequence names already recorded. It mirrors the
    # stored list so the duplicate test is O(1) instead of a list scan.
    seen_seqs = {}

    with open(fimo_path, open_mode, **open_kwargs) as readfile:
        myreader = csv.reader(readfile, delimiter='\t')
        header = next(myreader)

        # Resolve the column positions once rather than for every row.
        seq_idx = header.index(seq_name_colname)
        pattern_idx = header.index(pattern_name_colname)
        qval_idx = header.index(qval_colname)

        for row in myreader:
            seq_name = row[seq_idx]
            pattern_name = row[pattern_idx]
            qval = row[qval_idx]

            # Pattern name is expected to be GENENAME,MOTIF,IorD; keep
            # only the gene part.
            gene_name = pattern_name.split(',')[0]

            # First time we meet this gene: start with empty lists.
            if gene_name not in fimo_dic:
                fimo_dic[gene_name] = {qval_colname: [],
                                       seq_name_colname: []}
            gene_info = fimo_dic[gene_name]

            # Seed the lookup set from whatever the entry already holds,
            # so sequences recorded by the caller beforehand still count.
            if gene_name not in seen_seqs:
                seen_seqs[gene_name] = set(gene_info[seq_name_colname])

            # Same gene and same sequence as an earlier row: skip it,
            # otherwise one sequence would be counted many times
            # (too many RBPs).
            if seq_name in seen_seqs[gene_name]:
                continue
            seen_seqs[gene_name].add(seq_name)

            gene_info[qval_colname].append(qval)
            gene_info[seq_name_colname].append(seq_name)

    # Decide the output mode once. Equality (not identity or truthiness)
    # is kept on purpose so values such as 1 and 0 behave as they always
    # did, while anything else still reaches the warning branch.
    want_fraction = convert_to_fraction == True  # noqa: E712
    want_count = convert_to_fraction == False  # noqa: E712

    # Sequence count from fimo.html; read lazily and only once because it
    # does not depend on the gene.
    n_fasta_seqs = None

    # Only the inner dictionaries are modified while iterating.
    for gene_name, gene_info in fimo_dic.items():
        gene_info[pattern_name_subkey] = gene_name
        qval_list = [float(q) for q in gene_info[qval_colname]]
        gene_info[qval_med_subkey] = stats_functions.median(qval_list)
        n_occurences = len(qval_list)

        if want_fraction:
            if n_fasta_seqs is None:
                # fimo.html sits next to the fimo text file.
                # NOTE(review): rownumber=42 is presumably the line of
                # fimo.html holding the sequence count - confirm against
                # get_number_of_sequences_from_fimohtml.
                fimo_html_path = os.path.join(os.path.dirname(fimo_path),
                                              'fimo.html')
                n_fasta_seqs = get_number_of_sequences_from_fimohtml(
                    fimo_html_path, rownumber=42)
            gene_info[n_occurs_subkey] = float(n_occurences) / n_fasta_seqs
        elif want_count:
            # Just the occurrences, no dividing by fasta seqs.
            gene_info[n_occurs_subkey] = n_occurences
        else:
            # One-tuple so a tuple argument is formatted, not unpacked.
            print('Expected convert to fraction '
                  'to be True or False. %s found.' % (convert_to_fraction,))

    return fimo_dic
def get_info_from_fimo_output2(fimo_path, writeobj, fimo_dic, convert_to_fraction=False):
    """
    Collect per-gene q-value statistics from a FIMO tab-delimited file.

    Second attempt at getting info from fimo: rows are grouped by gene
    name (the text before the first comma of the pattern name, which is
    expected to look like GENENAME,MOTIF,IorD) so that a gene shows up
    only ONCE for a given sequence.

    For each gene a sub-dictionary is created in fimo_dic (or the existing
    one is reused and extended) with these keys:
        "q-value"             list of q-values, kept as the strings read
        "sequence name"       list of sequence names already counted
        "pattern_name"        the gene name
        "median_q_value"      stats_functions.median of the float q-values
        "n_motif_occurences"  number of q-values kept, or that number
                              divided by the sequence count read from
                              fimo.html when convert_to_fraction is True

    The summary keys are (re)computed for every entry of fimo_dic, not
    only for the genes seen in this file.

    Parameters:
        fimo_path: path of the FIMO text output. Its first row must be a
            header containing the columns "#pattern name",
            "sequence name" and "q-value".
        writeobj: not used by this function; kept for existing callers.
        fimo_dic: dictionary that is updated in place.
        convert_to_fraction: True to store occurrences as a fraction of
            the number of input sequences, False to store the raw count.
            For any other value a message is printed for each gene and
            "n_motif_occurences" is left unset.

    Returns:
        fimo_dic, the same object that was passed in.

    Raises:
        StopIteration: the file has no header row.
        ValueError: one of the expected columns is missing from the header.
    """
    # Column names expected in the FIMO header row.
    pattern_name_colname = "#pattern name"
    qval_colname = "q-value"
    seq_name_colname = "sequence name"
    # Keys written into every per-gene sub-dictionary.
    pattern_name_subkey = "pattern_name"
    qval_med_subkey = "median_q_value"
    n_occurs_subkey = "n_motif_occurences"

    # The csv module wants a binary handle on Python 2 but a text handle
    # opened with newline="" on Python 3. `str is bytes` only holds on 2.
    is_py2 = str is bytes
    open_mode = "rb" if is_py2 else "r"
    open_kwargs = {} if is_py2 else {"newline": ""}

    # gene name -> set of sequence names already recorded. It mirrors the
    # stored list so the duplicate test is O(1) instead of a list scan.
    seen_seqs = {}

    with open(fimo_path, open_mode, **open_kwargs) as readfile:
        myreader = csv.reader(readfile, delimiter="\t")
        header = next(myreader)

        # Resolve the column positions once rather than for every row.
        seq_idx = header.index(seq_name_colname)
        pattern_idx = header.index(pattern_name_colname)
        qval_idx = header.index(qval_colname)

        for row in myreader:
            seq_name = row[seq_idx]
            pattern_name = row[pattern_idx]
            qval = row[qval_idx]

            # Pattern name is expected to be GENENAME,MOTIF,IorD; keep
            # only the gene part.
            gene_name = pattern_name.split(",")[0]

            # First time we meet this gene: start with empty lists.
            if gene_name not in fimo_dic:
                fimo_dic[gene_name] = {
                    qval_colname: [],
                    seq_name_colname: [],
                }
            gene_info = fimo_dic[gene_name]

            # Seed the lookup set from whatever the entry already holds,
            # so sequences recorded by the caller beforehand still count.
            if gene_name not in seen_seqs:
                seen_seqs[gene_name] = set(gene_info[seq_name_colname])

            # Same gene and same sequence as an earlier row: skip it,
            # otherwise one sequence would be counted many times
            # (too many RBPs).
            if seq_name in seen_seqs[gene_name]:
                continue
            seen_seqs[gene_name].add(seq_name)

            gene_info[qval_colname].append(qval)
            gene_info[seq_name_colname].append(seq_name)

    # Decide the output mode once. Equality (not identity or truthiness)
    # is kept on purpose so values such as 1 and 0 behave as they always
    # did, while anything else still reaches the warning branch.
    want_fraction = convert_to_fraction == True  # noqa: E712
    want_count = convert_to_fraction == False  # noqa: E712

    # Sequence count from fimo.html; read lazily and only once because it
    # does not depend on the gene.
    n_fasta_seqs = None

    # Only the inner dictionaries are modified while iterating.
    for gene_name, gene_info in fimo_dic.items():
        gene_info[pattern_name_subkey] = gene_name
        qval_list = [float(q) for q in gene_info[qval_colname]]
        gene_info[qval_med_subkey] = stats_functions.median(qval_list)
        n_occurences = len(qval_list)

        if want_fraction:
            if n_fasta_seqs is None:
                # fimo.html sits next to the fimo text file.
                # NOTE(review): rownumber=42 is presumably the line of
                # fimo.html holding the sequence count - confirm against
                # get_number_of_sequences_from_fimohtml.
                fimo_html_path = os.path.join(
                    os.path.dirname(fimo_path), "fimo.html"
                )
                n_fasta_seqs = get_number_of_sequences_from_fimohtml(
                    fimo_html_path, rownumber=42
                )
            gene_info[n_occurs_subkey] = float(n_occurences) / n_fasta_seqs
        elif want_count:
            # Just the occurrences, no dividing by fasta seqs.
            gene_info[n_occurs_subkey] = n_occurences
        else:
            # One-tuple so a tuple argument is formatted, not unpacked.
            print(
                "Expected convert to fraction "
                "to be True or False. %s found." % (convert_to_fraction,)
            )

    return fimo_dic