def plot_distributions(meme_gerp_scores, null_gerp_scores, mylabels, mytitle):
    """Plot the MEME and null GERP score lists as labelled density curves.

    Each score list is handed to ``plot_functions.plot_density`` together
    with ``mytitle`` and its label; afterwards a legend is added and the
    figure is shown.

    Args:
        meme_gerp_scores: scores paired with the first entry of ``mylabels``.
        null_gerp_scores: scores paired with the second entry of ``mylabels``.
        mylabels: labels, in the same order as the two score arguments.
            Pairing stops at the shorter of the two sequences.
        mytitle: title forwarded to every ``plot_density`` call.
    """
    score_sets = [meme_gerp_scores, null_gerp_scores]
    labelled_sets = zip(score_sets, mylabels)
    for scores, curve_label in labelled_sets:
        plot_functions.plot_density(scores, mytitle, curve_label)
    # NOTE(review): legend/show are assumed to run once, after all curves
    # have been drawn -- confirm against the original indentation.
    plt.legend()
    plt.show()
def main():
    """Compare sequence lengths read from two directories of .summary files.

    Command line: ``directory1 directory2`` where directory1 is the directory
    of interest and directory2 is the control.  Every file ending in
    ``.summary`` in each directory is passed to
    ``get_aa_length_from_anchor_file`` and the returned lengths are pooled
    per directory.  For each pool the mean, the median (the upper middle
    element when the count is even) and the [min,max] range are printed, and
    both pools are then passed to ``plot_functions.plot_density``.

    Exits with status 1 when the wrong number of arguments is given or when
    a directory yields no lengths (the statistics would otherwise divide by
    zero).
    """
    usage = 'usage: %prog [opt] directory1 directory2'\
        '\nTwo arguments must be specified in command line:\n'\
        '1) Directory of interest containing .summary files\n'\
        '2) Directory to which to compare containing .summary files (control)\n'
    parser = OptionParser(usage=usage)
    (_, args) = parser.parse_args()

    if len(args) != 2:
        print('Requires 2 arguments to be specified in command line')
        print(usage)
        sys.exit(1)

    # parse args
    dir0 = args[0]
    dir1 = args[1]

    # get all files containing *.summary in each directory. Those are
    # anchor input files.
    ext = '.summary'
    anchor_files0 = \
        [os.path.join(dir0, f) for f in os.listdir(dir0) if f.endswith(ext)]
    anchor_files1 = \
        [os.path.join(dir1, f) for f in os.listdir(dir1) if f.endswith(ext)]

    # Read each file, retrieving the lengths of each amino acid sequence
    aa_lengths0 = []
    aa_lengths1 = []
    for file0 in anchor_files0:
        aa_lengths0 += get_aa_length_from_anchor_file(file0)
    for file1 in anchor_files1:
        aa_lengths1 += get_aa_length_from_anchor_file(file1)

    groups = zip(['xeno', 'control'], [aa_lengths0, aa_lengths1])
    for group_name, lengths in groups:
        if not lengths:
            # Mean/median/min/max are undefined for an empty pool.
            print('No lengths found for %s; nothing to summarize.'
                  % group_name)
            sys.exit(1)
        print('Mean for %s' % group_name)
        print(sum(lengths) / float(len(lengths)))
        print('Median for %s' % group_name)
        print(sorted(lengths)[len(lengths) // 2])
        print('[min,max] for %s' % group_name)
        print('[%s,%s]' % (min(lengths), max(lengths)))

    plot_functions.plot_density(
        [aa_lengths0, aa_lengths1],
        mytitle='Density plot of exon lengths',
        xlabel='Nucleotide length',
        ylabel='Density',
        labels_lists=['Cassette Exons', 'Constitutive Exons'],
        smoothness=0.1,
        legend_pos=1,
        ymin=0,
        ymax=0.025,
        xmin=0,
        xmax=200)
def main():
    """Compare sequence lengths read from two directories of .summary files.

    Command line: ``directory1 directory2`` where directory1 is the directory
    of interest and directory2 is the control.  Every file ending in
    ``.summary`` in each directory is passed to
    ``get_aa_length_from_anchor_file`` and the returned lengths are pooled
    per directory.  For each pool the mean, the median (the upper middle
    element when the count is even) and the [min,max] range are printed, and
    both pools are then passed to ``plot_functions.plot_density``.

    Exits with status 1 when the wrong number of arguments is given or when
    a directory yields no lengths (the statistics would otherwise divide by
    zero).
    """
    usage = 'usage: %prog [opt] directory1 directory2'\
        '\nTwo arguments must be specified in command line:\n'\
        '1) Directory of interest containing .summary files\n'\
        '2) Directory to which to compare containing .summary files (control)\n'
    parser = OptionParser(usage=usage)
    (_, args) = parser.parse_args()

    if len(args) != 2:
        print('Requires 2 arguments to be specified in command line')
        print(usage)
        sys.exit(1)

    # parse args
    dir0 = args[0]
    dir1 = args[1]

    # get all files containing *.summary in each directory. Those are
    # anchor input files.
    ext = '.summary'
    anchor_files0 = \
        [os.path.join(dir0, f) for f in os.listdir(dir0) if f.endswith(ext)]
    anchor_files1 = \
        [os.path.join(dir1, f) for f in os.listdir(dir1) if f.endswith(ext)]

    # Read each file, retrieving the lengths of each amino acid sequence
    aa_lengths0 = []
    aa_lengths1 = []
    for file0 in anchor_files0:
        aa_lengths0 += get_aa_length_from_anchor_file(file0)
    for file1 in anchor_files1:
        aa_lengths1 += get_aa_length_from_anchor_file(file1)

    groups = zip(['xeno', 'control'], [aa_lengths0, aa_lengths1])
    for group_name, lengths in groups:
        if not lengths:
            # Mean/median/min/max are undefined for an empty pool.
            print('No lengths found for %s; nothing to summarize.'
                  % group_name)
            sys.exit(1)
        print('Mean for %s' % group_name)
        print(sum(lengths) / float(len(lengths)))
        print('Median for %s' % group_name)
        print(sorted(lengths)[len(lengths) // 2])
        print('[min,max] for %s' % group_name)
        print('[%s,%s]' % (min(lengths), max(lengths)))

    plot_functions.plot_density(
        [aa_lengths0, aa_lengths1],
        mytitle='Density plot of exon lengths',
        xlabel='Nucleotide length',
        ylabel='Density',
        labels_lists=['Cassette Exons', 'Constitutive Exons'],
        smoothness=0.1,
        legend_pos=1,
        ymin=0,
        ymax=0.025,
        xmin=0,
        xmax=200)
def _collect_motif_segments(meme_dic, plot_settings_dic):
    """Flatten meme_dic into parallel lists, one entry per (event, region)."""
    plot_dic = {'start': [], 'end': [], 'color': [], 'ypos': [],
                'motif_number': []}
    # NOTE(review): the y-position counter is assumed to advance once per
    # event, so all regions of one event share a row -- confirm against the
    # original indentation.
    for event_count, event in enumerate(meme_dic):
        ypos = event_count / 10.0
        for region in meme_dic[event]:
            hit = meme_dic[event][region]
            motif_number = hit['motif_number'][0]
            settings = plot_settings_dic[region]
            # offset start and end depending on region
            start = hit['motif_relative_start'][0] + settings['offset']
            end = hit['motif_relative_end'][0] + settings['offset']
            try:
                # subtract motif number by 1 to get 0-based numbering
                color = settings['color'][motif_number - 1]
            except IndexError:
                print('%s %s' % (region, motif_number))
                print('Ran out of colors, using yellow as default.')
                color = 'yellow'
            plot_dic['start'].append(start)
            plot_dic['end'].append(end)
            plot_dic['color'].append(color)
            plot_dic['ypos'].append(ypos)
            plot_dic['motif_number'].append(
                '%s:Motif %s' % (region, motif_number))
    return plot_dic


def _group_starts_by_label(plot_dic):
    """Group start positions by region/motif label; return (starts, colors)."""
    grouped = {}
    for label, color, start in zip(plot_dic['motif_number'],
                                   plot_dic['color'],
                                   plot_dic['start']):
        entry = grouped.setdefault(
            label, {'densitystarts': [], 'densitycolors': color})
        entry['densitystarts'].append(start)
    starts_list = []
    colors_list = []
    for label in grouped:
        starts_list.append(grouped[label]['densitystarts'])
        colors_list.append(grouped[label]['densitycolors'])
    return starts_list, colors_list


def _draw_gene_model(ax, offsets, rect_height=0.002, rect_length=10):
    """Draw exon rectangles, intron lines and intron-break ticks."""
    # add rectangles representing exons
    rectstarts = [offsets[0], offsets[2], offsets[4]]
    rcolors = ['cyan', 'yellow', 'cyan']
    for start, color in zip(rectstarts, rcolors):
        ax.add_patch(add_rectangles(start, height=rect_height,
                                    length=rect_length, color=color))
    # draw intron lines connecting exons
    istarts = [offsets[0], offsets[1], offsets[2], offsets[3]]
    iends = [offsets[1] - rect_length, offsets[2] - rect_length,
             offsets[3] - rect_length, offsets[4] - rect_length]
    for start, end in zip(istarts, iends):
        plt.hlines(y=-rect_height / 2., xmin=start, xmax=end,
                   color='black', linewidths=1.5)
    # draw vertical lines representing break in intron
    breakstarts = [iends[0], istarts[1], iends[2], istarts[3]]
    for bstart in breakstarts:
        plt.vlines(bstart, ymin=-rect_height, ymax=0,
                   color='black', linewidths=1)


def main():
    """Plot MEME motif positions across the intronic regions of each event.

    Command line: one positional argument, the pkl file from
    summarize_meme_results.  The ``-r/--plot_raw_locations`` option takes
    True or False (any letter case): True draws one horizontal segment per
    motif hit via ``plot_functions.plot_hline_segments``; False (the
    default) draws a density plot of motif start positions over a schematic
    of exons and introns.

    Exits with status 1 when the argument count is wrong or the option value
    is not a recognised boolean.
    """
    usage = 'usage: %prog meme_gerp_summary_pkl\n'\
        'Requires one argument:\n'\
        '1) pkl file from summarize_meme_results'
    parser = OptionParser(usage=usage)
    parser.add_option('-r', '--plot_raw_locations', dest='raw_locations',
                      default=False,
                      help='Boolean value. True=horizontal '
                           'line segment plot. False=density plot')
    (options, args) = parser.parse_args()
    if len(args) != 1:
        print('Requires 1 argument to be specified in commandline')
        print(usage)
        sys.exit(1)
    pklpath = args[0]

    # The option arrives as a string (or the boolean default); compare
    # case-insensitively so "true"/"false" are accepted as well.
    raw_flag = str(options.raw_locations).strip().lower()
    if raw_flag == 'true':
        raw_locations = True
        print('Plotting raw locations...')
    elif raw_flag == 'false':
        raw_locations = False
        print('Plotting density plot...')
    else:
        print('--plot_raw_locations option must be '
              'True or False. %s found.' % options.raw_locations)
        sys.exit(1)

    # get dics from pkl
    meme_dic = get_dic_from_pklpath(pklpath)
    print(meme_dic)

    # x offset at which each region (and the final exon) starts
    offsets = [0, 110, 220, 330, 440]

    # Set Motif 1 to #CC6666, Motif 2 to #33CCCC, Motif 3 to "green".
    # The colors you want will depend on the discovered meme motif number.
    plot_settings_dic = {
        'intron_1_5p': {'offset': offsets[0],
                        'color': ['#CC6666', '#CC6666', '#CC6666']},
        'intron_1_3p': {'offset': offsets[1],
                        'color': ['green', 'black', 'yellow']},
        'intron_2_5p': {'offset': offsets[2],
                        'color': ['red']},
        'intron_2_3p': {'offset': offsets[3],
                        'color': ['#33CCCC', 'black']},
    }

    # collect plot information: start, end, color, y position
    plot_dic = _collect_motif_segments(meme_dic, plot_settings_dic)

    if raw_locations:
        plot_functions.plot_hline_segments(starts=plot_dic['start'],
                                           stops=plot_dic['end'],
                                           ypos=plot_dic['ypos'],
                                           colors=plot_dic['color'],
                                           labels=plot_dic['motif_number'])
        return

    starts_list, colors_list = _group_starts_by_label(plot_dic)

    # add number of sites in labels
    # NOTE(review): legend labels are numbered by iteration order of the
    # groups, not by the region/motif key they were grouped under.
    labels_with_nsites = [
        'Motif %s (%s sites)' % (number, len(startlist))
        for number, startlist in enumerate(starts_list, 1)
    ]
    for startlist in starts_list:
        print('Number of guys: %s' % len(startlist))
        print('Min/Max: %s/%s' % (min(startlist), max(startlist)))

    ax = plt.figure().add_subplot(111)
    _draw_gene_model(ax, offsets)

    plot_functions.plot_density(
        values_lists=starts_list,
        labels_lists=labels_with_nsites,
        colors_list=colors_list,
        mytitle='Intronic distribution of MEME motifs',
        xlabel='Genic region',
        ylabel='Density',
        xmin=-20,
        xmax=450,
        smoothness=0.1,
        legend_pos=2,
        ymin=-0.01,
        ymax=0.075,
        showplot=False)
    # dont show xaxis
    plt.setp(ax.get_xticklabels(), visible=False)
    plt.show()
def _collect_motif_segments(meme_dic, plot_settings_dic):
    """Flatten meme_dic into parallel lists, one entry per (event, region)."""
    plot_dic = {'start': [], 'end': [], 'color': [], 'ypos': [],
                'motif_number': []}
    # NOTE(review): the y-position counter is assumed to advance once per
    # event, so all regions of one event share a row -- confirm against the
    # original indentation.
    for event_count, event in enumerate(meme_dic):
        ypos = event_count / 10.0
        for region in meme_dic[event]:
            hit = meme_dic[event][region]
            motif_number = hit['motif_number'][0]
            settings = plot_settings_dic[region]
            # offset start and end depending on region
            start = hit['motif_relative_start'][0] + settings['offset']
            end = hit['motif_relative_end'][0] + settings['offset']
            try:
                # subtract motif number by 1 to get 0-based numbering
                color = settings['color'][motif_number - 1]
            except IndexError:
                print('%s %s' % (region, motif_number))
                print('Ran out of colors, using yellow as default.')
                color = 'yellow'
            plot_dic['start'].append(start)
            plot_dic['end'].append(end)
            plot_dic['color'].append(color)
            plot_dic['ypos'].append(ypos)
            plot_dic['motif_number'].append(
                '%s:Motif %s' % (region, motif_number))
    return plot_dic


def _group_starts_by_label(plot_dic):
    """Group start positions by region/motif label; return (starts, colors)."""
    grouped = {}
    for label, color, start in zip(plot_dic['motif_number'],
                                   plot_dic['color'],
                                   plot_dic['start']):
        entry = grouped.setdefault(
            label, {'densitystarts': [], 'densitycolors': color})
        entry['densitystarts'].append(start)
    starts_list = []
    colors_list = []
    for label in grouped:
        starts_list.append(grouped[label]['densitystarts'])
        colors_list.append(grouped[label]['densitycolors'])
    return starts_list, colors_list


def _draw_gene_model(ax, offsets, rect_height=0.002, rect_length=10):
    """Draw exon rectangles, intron lines and intron-break ticks."""
    # add rectangles representing exons
    rectstarts = [offsets[0], offsets[2], offsets[4]]
    rcolors = ['cyan', 'yellow', 'cyan']
    for start, color in zip(rectstarts, rcolors):
        ax.add_patch(add_rectangles(start, height=rect_height,
                                    length=rect_length, color=color))
    # draw intron lines connecting exons
    istarts = [offsets[0], offsets[1], offsets[2], offsets[3]]
    iends = [offsets[1] - rect_length, offsets[2] - rect_length,
             offsets[3] - rect_length, offsets[4] - rect_length]
    for start, end in zip(istarts, iends):
        plt.hlines(y=-rect_height / 2., xmin=start, xmax=end,
                   color='black', linewidths=1.5)
    # draw vertical lines representing break in intron
    breakstarts = [iends[0], istarts[1], iends[2], istarts[3]]
    for bstart in breakstarts:
        plt.vlines(bstart, ymin=-rect_height, ymax=0,
                   color='black', linewidths=1)


def main():
    """Plot MEME motif positions across the intronic regions of each event.

    Command line: one positional argument, the pkl file from
    summarize_meme_results.  The ``-r/--plot_raw_locations`` option takes
    True or False (any letter case): True draws one horizontal segment per
    motif hit via ``plot_functions.plot_hline_segments``; False (the
    default) draws a density plot of motif start positions over a schematic
    of exons and introns.

    Exits with status 1 when the argument count is wrong or the option value
    is not a recognised boolean.
    """
    usage = 'usage: %prog meme_gerp_summary_pkl\n'\
        'Requires one argument:\n'\
        '1) pkl file from summarize_meme_results'
    parser = OptionParser(usage=usage)
    parser.add_option('-r', '--plot_raw_locations', dest='raw_locations',
                      default=False,
                      help='Boolean value. True=horizontal '
                           'line segment plot. False=density plot')
    (options, args) = parser.parse_args()
    if len(args) != 1:
        print('Requires 1 argument to be specified in commandline')
        print(usage)
        sys.exit(1)
    pklpath = args[0]

    # The option arrives as a string (or the boolean default); compare
    # case-insensitively so "true"/"false" are accepted as well.
    raw_flag = str(options.raw_locations).strip().lower()
    if raw_flag == 'true':
        raw_locations = True
        print('Plotting raw locations...')
    elif raw_flag == 'false':
        raw_locations = False
        print('Plotting density plot...')
    else:
        print('--plot_raw_locations option must be '
              'True or False. %s found.' % options.raw_locations)
        sys.exit(1)

    # get dics from pkl
    meme_dic = get_dic_from_pklpath(pklpath)
    print(meme_dic)

    # x offset at which each region (and the final exon) starts
    offsets = [0, 110, 220, 330, 440]

    # Set Motif 1 to #CC6666, Motif 2 to #33CCCC, Motif 3 to "green".
    # The colors you want will depend on the discovered meme motif number.
    plot_settings_dic = {
        'intron_1_5p': {'offset': offsets[0],
                        'color': ['#CC6666', '#CC6666', '#CC6666']},
        'intron_1_3p': {'offset': offsets[1],
                        'color': ['green', 'black', 'yellow']},
        'intron_2_5p': {'offset': offsets[2],
                        'color': ['red']},
        'intron_2_3p': {'offset': offsets[3],
                        'color': ['#33CCCC', 'black']},
    }

    # collect plot information: start, end, color, y position
    plot_dic = _collect_motif_segments(meme_dic, plot_settings_dic)

    if raw_locations:
        plot_functions.plot_hline_segments(starts=plot_dic['start'],
                                           stops=plot_dic['end'],
                                           ypos=plot_dic['ypos'],
                                           colors=plot_dic['color'],
                                           labels=plot_dic['motif_number'])
        return

    starts_list, colors_list = _group_starts_by_label(plot_dic)

    # add number of sites in labels
    # NOTE(review): legend labels are numbered by iteration order of the
    # groups, not by the region/motif key they were grouped under.
    labels_with_nsites = [
        'Motif %s (%s sites)' % (number, len(startlist))
        for number, startlist in enumerate(starts_list, 1)
    ]
    for startlist in starts_list:
        print('Number of guys: %s' % len(startlist))
        print('Min/Max: %s/%s' % (min(startlist), max(startlist)))

    ax = plt.figure().add_subplot(111)
    _draw_gene_model(ax, offsets)

    plot_functions.plot_density(
        values_lists=starts_list,
        labels_lists=labels_with_nsites,
        colors_list=colors_list,
        mytitle='Intronic distribution of MEME motifs',
        xlabel='Genic region',
        ylabel='Density',
        xmin=-20,
        xmax=450,
        smoothness=0.1,
        legend_pos=2,
        ymin=-0.01,
        ymax=0.075,
        showplot=False)
    # dont show xaxis
    plt.setp(ax.get_xticklabels(), visible=False)
    plt.show()
def main():
    """Compare GERP scores of motifs with and without tomtom RBP matches.

    Command line (four positional arguments):
        1) tab-delimited text file from
           summarize_meme_results_with_gerp_scores
        2) inclusion fasta file
        3) exclusion fasta file
        4) meme dir containing meme results

    For every row and region of the summary file, the non-empty
    ``<region>:avg_rs_score`` value is stored under a motif id built from
    the ``<region>:motif_number`` value and the event's inclusion/exclusion
    status.  Scores are then split by whether ``create_tomtom_key`` for that
    motif and region is present in the tomtom hits, and each group is passed
    to ``plot_functions.plot_density``.

    Exits with status 1 when fewer than four arguments are given.
    """
    usage = 'usage: %prog meme_summary_filepath incl_fasta excl_fasta '\
        'meme_dir\n'\
        'Requires four input arguments:\n'\
        '1) textfile output from '\
        'summarize_meme_results_with_gerp_scores\n'\
        '2) inclusion fasta file\n'\
        '3) exclusion fasta file\n'\
        '4) meme dir containing meme results'
    parser = OptionParser(usage=usage)
    (_, args) = parser.parse_args()

    # Exactly four positional arguments are read below.
    if len(args) < 4:
        print('Four arguments need to be specified in command line.\n')
        print(usage)
        sys.exit(1)
    meme_summarypath = args[0]
    incl_fasta = args[1]
    excl_fasta = args[2]
    meme_dir = args[3]

    # define column name suffix (string after colon in colname)
    gerp_str = 'avg_rs_score'
    motif_numb_str = 'motif_number'
    # miso event has no col name suffix, this is entire colname
    miso_colname = 'miso_event'
    # define rel path to tomtom files from meme dir
    rel_path = os.path.join('rbp_matches', 'candidate_rbps.txt')
    # define plot title
    mytitle = 'GERP Score Comparison: Hits vs Non-Hits'

    # get dictionary containing inclusion and exclusion for miso event
    incl_excl_dic = miso_events.get_inclusion_exclusion(incl_file=incl_fasta,
                                                        excl_file=excl_fasta)
    tomtom_dic = miso_events.get_tomtom_hits(meme_dir, rel_path)

    region_gerp_scores = {}  # gerp scores, indexed by region.
    with open(meme_summarypath, 'rb') as readfile:
        myreader = csv.reader(readfile, delimiter='\t')
        header = next(myreader)
        regions = get_regions(header)

        # Resolve column positions once instead of once per row and region.
        miso_col = header.index(miso_colname)
        region_columns = []
        for region in regions:
            # init output dic with an empty mapping per region
            region_gerp_scores[region] = {}
            gerp_col = header.index(':'.join([region, gerp_str]))
            motif_col = header.index(':'.join([region, motif_numb_str]))
            region_columns.append((region, gerp_col, motif_col))

        for row in myreader:
            # get gerp score in each region.
            for region, gerp_col, motif_col in region_columns:
                gerp_score = row[gerp_col]
                motif_numb = row[motif_col]
                incl_or_excl = incl_excl_dic[row[miso_col]]
                motif_id = ' '.join(['Motif', motif_numb, incl_or_excl])
                # beware of empty values: compare by value, not identity.
                if gerp_score != '':
                    scores = region_gerp_scores[region].setdefault(
                        motif_id, [])
                    scores.append(float(gerp_score))

    # Split the scores by whether the motif has a tomtom hit.
    avg_scores_in_tomtom = []
    avg_scores_not_in_tomtom = []
    for region in region_gerp_scores:
        for motif_id in region_gerp_scores[region]:
            scores = region_gerp_scores[region][motif_id]
            if create_tomtom_key(motif_id, region) in tomtom_dic:
                avg_scores_in_tomtom += scores
            else:
                avg_scores_not_in_tomtom += scores

    labelled_scores = zip(
        [avg_scores_in_tomtom, avg_scores_not_in_tomtom],
        ['Motif with matching RBPs', 'Motif without matching RBPs'])
    for avg_scores, mylabel in labelled_scores:
        plot_functions.plot_density(avg_scores, mytitle=mytitle,
                                    mylabel=mylabel)
    plt.legend()
    plt.show()
def main():
    """Compare conservation of MEME motifs against control regions.

    Command line (two positional arguments):
        1) pkl file from summarize_meme_results: non-null
        2) pkl file from summarize_meme_results: null-mode
    Options: ``-t/--threshold`` (float, default 2.0) is the score at or
    above which ``gerp_utilities.conserved_regions`` is asked to count an
    element as conserved; ``-y/--ymax`` (float, default 0.03) is the y limit
    of the density plot.

    The ``avg_rs_score`` values of both inputs are drawn as a density plot,
    conserved elements are counted in each, Fisher's exact test is run on the
    resulting 2x2 table, and the conserved fractions are shown as a bar plot.

    Exits with status 1 when fewer than two arguments are given or when
    either input yields no scores (the fractions would divide by zero).
    """
    usage = 'usage: %prog [options] non_null_pkl null_pkl\n'\
        'Requires two input arguments:\n'\
        '1) pkl file from summarize_meme_results: non-null\n'\
        '2) pkl file from summarize_meme_results: null-mode\n'
    parser = OptionParser(usage=usage)
    parser.add_option('-t', '--threshold', dest='score_threshold',
                      type='float', default=2.0,
                      help='Float, threshold for what one considers '
                           'conserved.')
    parser.add_option('-y', '--ymax', dest='ymax', type='float',
                      default=0.03,
                      help='Y max for density plot')
    (options, args) = parser.parse_args()
    if len(args) < 2:
        print('Two arguments need to be specified in command line.\n')
        print(usage)
        sys.exit(1)
    non_null_pklpath = args[0]
    null_pklpath = args[1]

    # parse ops (optparse has already converted the value to float)
    score_threshold = options.score_threshold

    # get dics from pkl
    non_null_dic = get_dic_from_pklpath(non_null_pklpath)
    null_dic = get_dic_from_pklpath(null_pklpath)

    non_null_gerp_scores = get_gerp_scores(non_null_dic,
                                           gerpkey='avg_rs_score')
    null_gerp_scores = get_gerp_scores(null_dic, gerpkey='avg_rs_score')

    n_total_in_meme = len(non_null_gerp_scores)
    n_total_in_null = len(null_gerp_scores)
    if n_total_in_meme == 0 or n_total_in_null == 0:
        # Fractions below are undefined without at least one score per set.
        print('No GERP scores to compare (meme: %s, control: %s).'
              % (n_total_in_meme, n_total_in_null))
        sys.exit(1)

    plot_functions.plot_density([non_null_gerp_scores, null_gerp_scores],
                                mytitle='Density plot of conservation scores',
                                labels_lists=['MEME motifs', 'Controls'],
                                xlabel='GERP conservation score',
                                ylabel='Density',
                                xmin=-4, xmax=4, ymax=options.ymax,
                                smoothness=0.15,
                                drawvline=score_threshold)

    # find how many conserved regions are in each.
    n_conserved_in_meme = gerp_utilities.conserved_regions(
        non_null_gerp_scores, fraction=False, threshold=score_threshold)
    n_conserved_in_null = gerp_utilities.conserved_regions(
        null_gerp_scores, fraction=False, threshold=score_threshold)
    n_not_conserved_in_meme = n_total_in_meme - n_conserved_in_meme
    n_not_conserved_in_null = n_total_in_null - n_conserved_in_null

    print('Threshold: %s' % score_threshold)
    print('Number of conserved elements: %s' % n_conserved_in_meme)
    print('Number of conserved elements found in control: %s'
          % n_conserved_in_null)

    # Perform fisher's exact test
    contingency = [[n_conserved_in_meme, n_conserved_in_null],
                   [n_not_conserved_in_meme, n_not_conserved_in_null]]
    oddsratio, pvalue = fisher_exact(contingency)
    print('Fishers Exact Test, Oddsratio: %s. Pvalue: %s'
          % (oddsratio, pvalue))

    # plot distributions
    mylabels = ['Meme motifs', 'Control region']
    mytitle = 'Fraction of elements conserved compared to control region'

    # Plot bargraphs
    frac_conserved_meme = float(n_conserved_in_meme) / n_total_in_meme
    frac_conserved_null = float(n_conserved_in_null) / n_total_in_null
    myvals = [frac_conserved_meme, frac_conserved_null]
    plot_functions.plot_barplot(
        myvals, mytitle, mylabels,
        ylabel='Fraction of elements conserved',
        mytext1="%i/%i" % (n_conserved_in_meme, n_total_in_meme),
        mytext2='%i/%i' % (n_conserved_in_null, n_total_in_null),
        mytext3="*Fisher's Exact Test\nP-value=%.2e" % pvalue,
        ymin=0, ymax=1, width=0.5)
    plt.show()
def main():
    """Compare GERP scores of motifs with and without tomtom RBP matches.

    Command line (four positional arguments):
        1) tab-delimited text file from
           summarize_meme_results_with_gerp_scores
        2) inclusion fasta file
        3) exclusion fasta file
        4) meme dir containing meme results

    For every row and region of the summary file, the non-empty
    ``<region>:avg_rs_score`` value is stored under a motif id built from
    the ``<region>:motif_number`` value and the event's inclusion/exclusion
    status.  Scores are then split by whether ``create_tomtom_key`` for that
    motif and region is present in the tomtom hits, and each group is passed
    to ``plot_functions.plot_density``.

    Exits with status 1 when fewer than four arguments are given.
    """
    usage = 'usage: %prog meme_summary_filepath incl_fasta excl_fasta '\
        'meme_dir\n'\
        'Requires four input arguments:\n'\
        '1) textfile output from '\
        'summarize_meme_results_with_gerp_scores\n'\
        '2) inclusion fasta file\n'\
        '3) exclusion fasta file\n'\
        '4) meme dir containing meme results'
    parser = OptionParser(usage=usage)
    (_, args) = parser.parse_args()

    # Exactly four positional arguments are read below.
    if len(args) < 4:
        print('Four arguments need to be specified in command line.\n')
        print(usage)
        sys.exit(1)
    meme_summarypath = args[0]
    incl_fasta = args[1]
    excl_fasta = args[2]
    meme_dir = args[3]

    # define column name suffix (string after colon in colname)
    gerp_str = 'avg_rs_score'
    motif_numb_str = 'motif_number'
    # miso event has no col name suffix, this is entire colname
    miso_colname = 'miso_event'
    # define rel path to tomtom files from meme dir
    rel_path = os.path.join('rbp_matches', 'candidate_rbps.txt')
    # define plot title
    mytitle = 'GERP Score Comparison: Hits vs Non-Hits'

    # get dictionary containing inclusion and exclusion for miso event
    incl_excl_dic = miso_events.get_inclusion_exclusion(incl_file=incl_fasta,
                                                        excl_file=excl_fasta)
    tomtom_dic = miso_events.get_tomtom_hits(meme_dir, rel_path)

    region_gerp_scores = {}  # gerp scores, indexed by region.
    with open(meme_summarypath, 'rb') as readfile:
        myreader = csv.reader(readfile, delimiter='\t')
        header = next(myreader)
        regions = get_regions(header)

        # Resolve column positions once instead of once per row and region.
        miso_col = header.index(miso_colname)
        region_columns = []
        for region in regions:
            # init output dic with an empty mapping per region
            region_gerp_scores[region] = {}
            gerp_col = header.index(':'.join([region, gerp_str]))
            motif_col = header.index(':'.join([region, motif_numb_str]))
            region_columns.append((region, gerp_col, motif_col))

        for row in myreader:
            # get gerp score in each region.
            for region, gerp_col, motif_col in region_columns:
                gerp_score = row[gerp_col]
                motif_numb = row[motif_col]
                incl_or_excl = incl_excl_dic[row[miso_col]]
                motif_id = ' '.join(['Motif', motif_numb, incl_or_excl])
                # beware of empty values: compare by value, not identity.
                if gerp_score != '':
                    scores = region_gerp_scores[region].setdefault(
                        motif_id, [])
                    scores.append(float(gerp_score))

    # Split the scores by whether the motif has a tomtom hit.
    avg_scores_in_tomtom = []
    avg_scores_not_in_tomtom = []
    for region in region_gerp_scores:
        for motif_id in region_gerp_scores[region]:
            scores = region_gerp_scores[region][motif_id]
            if create_tomtom_key(motif_id, region) in tomtom_dic:
                avg_scores_in_tomtom += scores
            else:
                avg_scores_not_in_tomtom += scores

    labelled_scores = zip(
        [avg_scores_in_tomtom, avg_scores_not_in_tomtom],
        ['Motif with matching RBPs', 'Motif without matching RBPs'])
    for avg_scores, mylabel in labelled_scores:
        plot_functions.plot_density(avg_scores, mytitle=mytitle,
                                    mylabel=mylabel)
    plt.legend()
    plt.show()
def main():
    """Compare conservation of MEME motifs against control regions.

    Command line (two positional arguments):
        1) pkl file from summarize_meme_results: non-null
        2) pkl file from summarize_meme_results: null-mode
    Options: ``-t/--threshold`` (float, default 2.0) is the score at or
    above which ``gerp_utilities.conserved_regions`` is asked to count an
    element as conserved; ``-y/--ymax`` (float, default 0.03) is the y limit
    of the density plot.

    The ``avg_rs_score`` values of both inputs are drawn as a density plot,
    conserved elements are counted in each, Fisher's exact test is run on the
    resulting 2x2 table, and the conserved fractions are shown as a bar plot.

    Exits with status 1 when fewer than two arguments are given or when
    either input yields no scores (the fractions would divide by zero).
    """
    usage = 'usage: %prog [options] non_null_pkl null_pkl\n'\
        'Requires two input arguments:\n'\
        '1) pkl file from summarize_meme_results: non-null\n'\
        '2) pkl file from summarize_meme_results: null-mode\n'
    parser = OptionParser(usage=usage)
    parser.add_option('-t', '--threshold', dest='score_threshold',
                      type='float', default=2.0,
                      help='Float, threshold for what one considers '
                           'conserved.')
    parser.add_option('-y', '--ymax', dest='ymax', type='float',
                      default=0.03,
                      help='Y max for density plot')
    (options, args) = parser.parse_args()
    if len(args) < 2:
        print('Two arguments need to be specified in command line.\n')
        print(usage)
        sys.exit(1)
    non_null_pklpath = args[0]
    null_pklpath = args[1]

    # parse ops (optparse has already converted the value to float)
    score_threshold = options.score_threshold

    # get dics from pkl
    non_null_dic = get_dic_from_pklpath(non_null_pklpath)
    null_dic = get_dic_from_pklpath(null_pklpath)

    non_null_gerp_scores = get_gerp_scores(non_null_dic,
                                           gerpkey='avg_rs_score')
    null_gerp_scores = get_gerp_scores(null_dic, gerpkey='avg_rs_score')

    n_total_in_meme = len(non_null_gerp_scores)
    n_total_in_null = len(null_gerp_scores)
    if n_total_in_meme == 0 or n_total_in_null == 0:
        # Fractions below are undefined without at least one score per set.
        print('No GERP scores to compare (meme: %s, control: %s).'
              % (n_total_in_meme, n_total_in_null))
        sys.exit(1)

    plot_functions.plot_density([non_null_gerp_scores, null_gerp_scores],
                                mytitle='Density plot of conservation scores',
                                labels_lists=['MEME motifs', 'Controls'],
                                xlabel='GERP conservation score',
                                ylabel='Density',
                                xmin=-4, xmax=4, ymax=options.ymax,
                                smoothness=0.15,
                                drawvline=score_threshold)

    # find how many conserved regions are in each.
    n_conserved_in_meme = gerp_utilities.conserved_regions(
        non_null_gerp_scores, fraction=False, threshold=score_threshold)
    n_conserved_in_null = gerp_utilities.conserved_regions(
        null_gerp_scores, fraction=False, threshold=score_threshold)
    n_not_conserved_in_meme = n_total_in_meme - n_conserved_in_meme
    n_not_conserved_in_null = n_total_in_null - n_conserved_in_null

    print('Threshold: %s' % score_threshold)
    print('Number of conserved elements: %s' % n_conserved_in_meme)
    print('Number of conserved elements found in control: %s'
          % n_conserved_in_null)

    # Perform fisher's exact test
    contingency = [[n_conserved_in_meme, n_conserved_in_null],
                   [n_not_conserved_in_meme, n_not_conserved_in_null]]
    oddsratio, pvalue = fisher_exact(contingency)
    print('Fishers Exact Test, Oddsratio: %s. Pvalue: %s'
          % (oddsratio, pvalue))

    # plot distributions
    mylabels = ['Meme motifs', 'Control region']
    mytitle = 'Fraction of elements conserved compared to control region'

    # Plot bargraphs
    frac_conserved_meme = float(n_conserved_in_meme) / n_total_in_meme
    frac_conserved_null = float(n_conserved_in_null) / n_total_in_null
    myvals = [frac_conserved_meme, frac_conserved_null]
    plot_functions.plot_barplot(
        myvals, mytitle, mylabels,
        ylabel='Fraction of elements conserved',
        mytext1="%i/%i" % (n_conserved_in_meme, n_total_in_meme),
        mytext2='%i/%i' % (n_conserved_in_null, n_total_in_null),
        mytext3="*Fisher's Exact Test\nP-value=%.2e" % pvalue,
        ymin=0, ymax=1, width=0.5)
    plt.show()