def initialize_genome(genome_name):
    """Make sure the 2bit genome and its derived files are available locally.

    The genome file ``<genome_name>.2bit`` is looked up in the directory
    returned by ``determine_path('genomes')``.  It is (re-)downloaded when it
    is missing, cannot be opened, or fails the MD5 check.  Afterwards the
    chromosome-lengths file and the MEME background file are generated next
    to it unless they already exist.

    Args:
        genome_name: Genome assembly name, used to build every filename and
            the download URL (e.g. ``hg19``).

    The process exits with status 1 when the download fails.
    """
    from bioutilities import Genome_2bit

    # urlretrieve lives in different modules on Python 2 and Python 3.
    try:
        from urllib import urlretrieve
    except ImportError:
        from urllib.request import urlretrieve

    info('Initializing Genome:%s' % genome_name)
    genome_directory = determine_path('genomes')
    info('genome_directory: %s' % genome_directory)

    genome_filename = os.path.join(genome_directory,
                                   "%s.2bit" % genome_name)
    chr_len_filename = os.path.join(genome_directory,
                                    "%s_chr_lengths.txt" % genome_name)
    meme_bg_filename = os.path.join(genome_directory,
                                    "%s_meme_bg" % genome_name)

    # Download unless an existing file both opens and passes the MD5 check.
    download_genome = True
    if os.path.exists(genome_filename):
        try:
            Genome_2bit(genome_filename, verbose=True)
            if check_md5sum(genome_filename, genome_name):
                download_genome = False
                info('File %s exists. Skipping genome download' %
                     genome_filename)
        except (Exception, SystemExit):
            # NOTE(review): SystemExit is kept in the net in case the helpers
            # bail out through sys.exit(); Ctrl-C is no longer swallowed.
            error("Unable to check MD5 sum. Downloading genome.")

    if download_genome:
        info('Sorry I need the genome file to perform the analysis. '
             'Downloading...')
        # NOTE(review): URL template restored (it needs two %s placeholders
        # for the arguments below) -- confirm against the UCSC download site.
        urlpath = ("http://hgdownload.cse.ucsc.edu/goldenPath/%s/bigZips/"
                   "%s.2bit" % (genome_name, genome_name))
        info('Downloading %s in %s...' % (urlpath, genome_filename))
        try:
            with TqdmUpTo(unit='B',
                          unit_scale=True,
                          mininterval=30,
                          miniters=1,
                          desc=urlpath.split('/')[-1]) as t:
                urlretrieve(urlpath,
                            filename=genome_filename,
                            reporthook=t.update_to,
                            data=None)
            info('Downloaded %s in %s:' % (urlpath, genome_filename))
        except IOError as e:
            error("Can't retrieve %r to %r: %s" %
                  (urlpath, genome_filename, e))
            info('Sorry I need the genome file to perform the analysis. '
                 'Exiting...')
            sys.exit(1)

    check_file(genome_filename)
    genome = Genome_2bit(genome_filename, verbose=True)

    if os.path.exists(chr_len_filename):
        info('File %s exists, skipping generation' % chr_len_filename)
    else:
        info('Extracting chromosome lengths')
        genome.write_chr_len(chr_len_filename)
        info('Done!')

    if os.path.exists(meme_bg_filename):
        info('File %s exists, skipping generation' % meme_bg_filename)
    else:
        info('Calculating nucleotide frequencies....')
        genome.write_meme_background(meme_bg_filename)
        info('Done!')
def _build_motifs_parser():
    """Build the command-line parser for the HAYSTACK motifs tool."""
    parser = argparse.ArgumentParser(description='HAYSTACK Parameters')

    # mandatory
    parser.add_argument(
        'bed_target_filename',
        type=str,
        help='A bed file containing the target coordinates on the genome of reference')
    parser.add_argument(
        'genome_name',
        type=str,
        help='Genome assembly to use from UCSC (for example hg19, mm9, etc.)')

    # optional
    parser.add_argument(
        '--bed_bg_filename',
        type=str,
        help="A bed file containing the backround coordinates on the genome of reference (default random sampled regions from the genome)",
        default='random_background')
    parser.add_argument(
        '--meme_motifs_filename',
        type=str,
        help='Motifs database in MEME format (default JASPAR CORE 2016)')
    parser.add_argument(
        '--nucleotide_bg_filename',
        type=str,
        help='Nucleotide probability for the background in MEME format (default precomupted on the Genome)')
    parser.add_argument(
        '--p_value',
        type=float,
        help='FIMO p-value for calling a motif hit significant (deafult: 1e-4)',
        default=1e-4)
    parser.add_argument(
        '--no_c_g_correction',
        help='Disable the matching of the C+G density of the background',
        action='store_true')
    parser.add_argument(
        '--c_g_bins',
        type=int,
        help='Number of bins for the C+G density correction (default: 8)',
        default=8)
    parser.add_argument('--mask_repetitive',
                        help='Mask repetitive sequences',
                        action='store_true')
    parser.add_argument(
        '--n_target_coordinates',
        type=int,
        help='Number of target coordinates to use (default: all)',
        default=np.inf)
    parser.add_argument(
        '--use_entire_bg',
        help='Use the entire background file (use only when the cg correction is disabled)',
        action='store_true')
    parser.add_argument(
        '--bed_score_column',
        type=int,
        help='Column in the bedfile that represents the score (default: 5)',
        default=5)
    # Help text states the value actually used as default below.
    parser.add_argument(
        '--bg_target_ratio',
        type=int,
        help='Background size/Target size ratio (default: 2)',
        default=2)
    parser.add_argument(
        '--bootstrap',
        help='Enable the bootstrap if the target set or the background set are too small, choices: True, False (default: False)',
        action='store_true')
    parser.add_argument(
        '--temp_directory',
        help='Directory to store temporary files (default: /tmp)',
        default='/tmp')
    parser.add_argument(
        '--no_random_sampling_target',
        help='Select the best --n_target_coordinates using the score column from the target file instead of randomly select them',
        action='store_true')
    parser.add_argument('--name',
                        help='Define a custom output filename for the report',
                        default='')
    parser.add_argument(
        '--internal_window_length',
        type=int,
        help='Window length in bp for the enrichment (default: average lenght of the target sequences)')
    parser.add_argument(
        '--window_length',
        type=int,
        help='Window length in bp for the profiler (default:internal_window_length*5)')
    parser.add_argument(
        '--min_central_enrichment',
        type=float,
        help='Minimum central enrichment to report a motif (default:>1.0)',
        default=1.0)
    parser.add_argument('--disable_ratio',
                        help='Disable target/bg ratio filter',
                        action='store_true')
    parser.add_argument(
        '--dump',
        help='Dump all the intermediate data, choices: True, False (default: False)',
        action='store_true')
    parser.add_argument('--output_directory',
                        type=str,
                        help='Output directory (default: current directory)',
                        default='')
    # Help text matches the fallback computed in main() (length // 5).
    parser.add_argument(
        '--smooth_size',
        type=int,
        help='Size in bp for the smoothing window (default: internal_window_length/5)')
    parser.add_argument(
        '--gene_annotations_filename',
        type=str,
        help='Optional gene annotations file from the UCSC Genome Browser in bed format to map each region to its closes gene')
    parser.add_argument(
        '--gene_ids_to_names_filename',
        type=str,
        help='Optional mapping file between gene ids to gene names (relevant only if --gene_annotation_filename is used)')
    parser.add_argument(
        '--n_processes',
        type=int,
        help='Specify the number of processes to use. The default is #cores available.',
        default=mp.cpu_count())
    parser.add_argument('--version',
                        help='Print version and exit.',
                        action='version',
                        version='Version %s' % HAYSTACK_VERSION)
    return parser


def main():
    """Command-line entry point of the HAYSTACK motif enrichment analysis.

    Parses the command line, loads (or offers to download) the genome,
    selects target and background coordinates, scans both sets for motifs,
    computes per-motif support, ratio, Fisher p-value, q-value and central
    enrichment, and writes an HTML report with logos, profiles, region files
    and (optionally) closest-gene lists.  With ``--dump`` the intermediate
    data are saved as well.

    Always terminates the process through ``sys.exit``: status 0 on success,
    status 1 when a required input is missing.
    """
    import codecs

    print('\n[H A Y S T A C K M O T I F S]')
    print('\n-MOTIF ENRICHMENT ANALYSIS- [Luca Pinello - [email protected]]\n')
    print('Version %s\n' % HAYSTACK_VERSION)

    ngram_correction = 'g'

    # Read every option explicitly instead of exec()-ing it into the local
    # namespace.
    args = _build_motifs_parser().parse_args()
    bed_target_filename = args.bed_target_filename
    genome_name = args.genome_name
    bed_bg_filename = args.bed_bg_filename
    meme_motifs_filename = args.meme_motifs_filename
    nucleotide_bg_filename = args.nucleotide_bg_filename
    p_value = args.p_value
    c_g_bins = args.c_g_bins
    mask_repetitive = args.mask_repetitive
    n_target_coordinates = args.n_target_coordinates
    use_entire_bg = args.use_entire_bg
    bg_target_ratio = args.bg_target_ratio
    bootstrap = args.bootstrap
    temp_directory = args.temp_directory
    name = args.name
    internal_window_length = args.internal_window_length
    window_length = args.window_length
    min_central_enrichment = args.min_central_enrichment
    disable_ratio = args.disable_ratio
    dump = args.dump
    output_directory = args.output_directory
    smooth_size = args.smooth_size
    gene_annotations_filename = args.gene_annotations_filename
    gene_ids_to_names_filename = args.gene_ids_to_names_filename
    n_processes = args.n_processes

    # The command line is 1-based, the bed reader takes a 0-based column.
    bed_score_column = args.bed_score_column - 1
    c_g_correction = not args.no_c_g_correction
    random_sampling_target = not args.no_random_sampling_target

    check_file(bed_target_filename)

    if not bed_bg_filename == 'random_background':
        check_file(bed_bg_filename)

    if meme_motifs_filename:
        check_file(meme_motifs_filename)
    else:
        # NOTE(review): default database filename restored to match the
        # "JASPAR CORE 2016" default named in the help -- confirm it exists
        # in the motif_databases folder.
        meme_motifs_filename = os.path.join(
            determine_path('motif_databases'),
            'JASPAR_CORE_2016_vertebrates.meme')

    annotation_directory = determine_path('gene_annotations')
    if gene_annotations_filename:
        if which('java') is None:
            error('The mapping to the closest gene requires Java free available from:')
            use_gene_annotations = False
        else:
            check_file(gene_annotations_filename)
            info('Using %s as gene annotations file' %
                 gene_annotations_filename)
            use_gene_annotations = True
    else:
        gene_annotations_filename = os.path.join(
            annotation_directory, '%s_genes.bed' % genome_name)
        gene_ids_to_names_filename = os.path.join(
            annotation_directory, '%s_genes_id_to_names' % genome_name)

        if os.path.exists(gene_annotations_filename) and os.path.exists(
                gene_ids_to_names_filename):
            use_gene_annotations = True
        else:
            use_gene_annotations = False
            info('No gene annotations file specified')

    target_name = ntpath.basename(bed_target_filename.replace('.bed', ''))
    bg_name = ntpath.basename(bed_bg_filename.replace('.bed', ''))

    if name:
        directory_name = 'HAYSTACK_MOTIFS_on_' + name
    else:
        directory_name = 'HAYSTACK_on_' + target_name + '_VS_' + bg_name

    if output_directory:
        output_directory = os.path.join(output_directory, directory_name)
    else:
        output_directory = directory_name

    info('###PARAMETERS USED###\n\t\t -TARGET: %s \n\t\t -BACKGROUND: %s \n\t\t -BG_TARGET_RATIO: %s\n\t\t -C+G CORRECTION: %s\n\t\t -MASKING REPETITIVE: %s\n\t\t -COORDINATES TO ANALYZE: %s\n\t\t -OUTPUT DIRECTORY: %s\n'
         % (bed_target_filename, bed_bg_filename, str(bg_target_ratio),
            str(c_g_correction), str(mask_repetitive),
            'ALL' if np.isinf(n_target_coordinates) else str(n_target_coordinates),
            output_directory))

    info('Initializing Genome:%s' % genome_name)
    genome_directory = determine_path('genomes')
    genome_2bit = os.path.join(genome_directory, genome_name + '.2bit')

    if os.path.exists(genome_2bit):
        genome = Genome_2bit(genome_2bit)
    else:
        info("\nIt seems you don't have the required genome file.")
        if query_yes_no('Should I download it for you?'):
            sb.call('haystack_download_genome %s' % genome_name,
                    shell=True,
                    env=system_env)
            if os.path.exists(genome_2bit):
                info('Genome correctly downloaded!')
                genome = Genome_2bit(genome_2bit)
            else:
                error('Sorry I cannot download the required file for you. Check your Internet connection.')
                sys.exit(1)
        else:
            error('Sorry I need the genome file to perform the analysis. Exiting...')
            sys.exit(1)

    if not nucleotide_bg_filename:
        nucleotide_bg_filename = os.path.join(genome_directory,
                                              genome_name + '_meme_bg')

    check_file(nucleotide_bg_filename)

    COMMAND_USED = ' '.join(sys.argv)

    info('Loading Target coordinates from bed:%s' % bed_target_filename)
    target_coords = Coordinate.bed_to_coordinates(bed_target_filename,
                                                  cl_score=bed_score_column)

    if len(target_coords) == 0:
        info('No coordinates to analyze in your input file. Exiting.')
        sys.exit(1)

    # Window lengths are kept even; by default the internal window is the
    # average length of the target regions.
    if internal_window_length:
        info('Using the user defined internal window length:%d' %
             internal_window_length)
        if internal_window_length % 2:
            internal_window_length += 1
    else:
        internal_window_length = int(
            np.mean([len(c) for c in target_coords]))
        if internal_window_length % 2:
            internal_window_length += 1
        info('Using the average length of target coordinates as internal window length:%d'
             % internal_window_length)

    # Applied whichever way the internal window was chosen, so the '%d'
    # below never receives None.
    if not window_length:
        window_length = internal_window_length * 5

    info('Total window length:%d' % window_length)

    if not smooth_size:
        smooth_size = internal_window_length // 5

    target_coords = Coordinate.coordinates_of_intervals_around_center(
        target_coords, internal_window_length)

    if len(target_coords) > n_target_coordinates:
        if random_sampling_target:
            info('Sampling %d coordinates among the %d total' %
                 (n_target_coordinates, len(target_coords)))
            target_coords = random.sample(target_coords,
                                          n_target_coordinates)
        else:
            info('Selecting the best %d coordinates among the %d total' %
                 (n_target_coordinates, len(target_coords)))
            sorted_idxs_by_score = np.argsort(
                [c.score for c in target_coords])[::-1]
            target_coords = [
                target_coords[idx]
                for idx in sorted_idxs_by_score[:n_target_coordinates]
            ]
    else:
        if random_sampling_target and bootstrap and not np.isinf(
                n_target_coordinates):
            warn('Number of target regions < %d' % n_target_coordinates)
            info('bootstrapping to obtain enough target regions')
            target_coords = sample_wr(target_coords, n_target_coordinates)
        else:
            info('Using all the %d target coordinates' % len(target_coords))

    info('Extracting Motifs in target coordinates')
    (positive_matrix, motifs_profiles_in_sequences, idxs_seqs_with_motif,
     motif_coords_in_seqs_with_motif, motif_names,
     motif_ids) = parallel_fimo_scanning(
         target_coords,
         meme_motifs_filename,
         genome,
         nucleotide_bg_filename,
         temp_directory=temp_directory,
         p_value=p_value,
         mask_repetitive=mask_repetitive,
         window_length=window_length,
         internal_window_length=internal_window_length,
         num_consumers=n_processes)

    n_target_coordinates = len(target_coords)  # fix for the bootstrap!

    if bed_bg_filename == 'random_background':
        info('Extracting Random Coordinates from the genome...')

        if c_g_correction:
            info('Calculating the C+G content of the target coordinates')
            bg_coords = []
            c_g_content_target = calculate_average_ngram_presence(
                target_coords, genome, ngram_correction)

            info('Extract a Matching C+G Background')
            bins = np.hstack((np.linspace(0, 1, c_g_bins), np.inf))

            for _ in range(bg_target_ratio):
                for idx_c, c in enumerate(target_coords):
                    c_bin = np.nonzero(
                        np.histogram(c_g_content_target[idx_c],
                                     bins)[0])[0][0]

                    # Redraw a same-length region on the same chromosome
                    # until it falls in the C+G bin of the target region.
                    c_random_bin = -1
                    while c_random_bin != c_bin:
                        random_bpstart = np.random.randint(
                            1, genome.chr_len[c.chr_id] - len(c) + 1)
                        c_random = Coordinate(c.chr_id, random_bpstart,
                                              random_bpstart + len(c) - 1)
                        seq = genome.extract_sequence(c_random)
                        c_g_content_c_random = (
                            seq.count('c') + seq.count('g')) / float(len(c))
                        c_random_bin = np.nonzero(
                            np.histogram(c_g_content_c_random,
                                         bins)[0])[0][0]

                    bg_coords.append(c_random)

            c_g_content_bg = calculate_average_ngram_presence(
                bg_coords, genome, ngram_correction)

            debug('original: ' +
                  str(np.histogram(c_g_content_target, bins)[0]))
            debug('obtained:' + str(np.histogram(c_g_content_bg, bins)[0]))
        else:
            bg_coords = get_random_coordinates(target_coords, genome)

        info('Done!')
    else:
        info('Loading Background Coordinates from:%s' % bed_bg_filename)
        bg_coords = Coordinate.bed_to_coordinates(bed_bg_filename)
        bg_coords = Coordinate.coordinates_of_intervals_around_center(
            bg_coords, internal_window_length)

        if use_entire_bg:
            bg_target_ratio = float(len(bg_coords)) / n_target_coordinates
            info('Using all the coordinates in the BG, BG/TG:%f' %
                 bg_target_ratio)

        if c_g_correction:
            info('Calculating the C+G content')
            c_g_content_target = calculate_average_ngram_presence(
                target_coords, genome, ngram_correction)
            c_g_content_bg = calculate_average_ngram_presence(
                bg_coords, genome, ngram_correction)

            info('Extract a Matching C+G Background')
            bins = np.hstack((np.linspace(0, 1, c_g_bins), np.inf))
            target_hist = np.histogram(c_g_content_target, bins)[0]
            bg_hist = np.histogram(c_g_content_bg, bins)[0]
            ratios = bg_hist / (target_hist * 1.0)
            debug('original:%s' % target_hist)
            debug('bg:%s' % bg_hist)
            debug('ratios:%s' % ratios)

            # Largest background/target multiple that every well-populated
            # bin (>5% of the targets) can sustain, capped at the requested
            # ratio.
            usable_bins = (~np.isnan(ratios) & ~np.isinf(ratios) &
                           (ratios > 0) &
                           (target_hist / float(target_hist.sum()) > 0.05))
            K_MATCH = min(bg_target_ratio, ratios[usable_bins].min())
            debug('K_MATCH:%d' % K_MATCH)

            to_match = np.int32(np.floor(K_MATCH * target_hist))
            debug('to_match:%s' % to_match)

            idxs_corrected_bg = np.array([], dtype=int)
            for idx_bin in range(len(bins) - 1):
                idxs_matching_regions = np.nonzero(
                    (c_g_content_bg >= bins[idx_bin]) &
                    (c_g_content_bg < bins[idx_bin + 1]))[0]
                n_to_take = min(len(idxs_matching_regions),
                                to_match[idx_bin])
                to_take = np.random.permutation(
                    len(idxs_matching_regions))[:n_to_take]
                idxs_corrected_bg = np.hstack(
                    (idxs_corrected_bg, idxs_matching_regions[to_take]))

            debug('original:%s' % target_hist)
            debug('K:%d' % K_MATCH)
            debug('to sample:%s' % to_match)
            debug('obtained:%s' %
                  np.histogram(c_g_content_bg[idxs_corrected_bg], bins)[0])

            bg_coords = [bg_coords[idx] for idx in idxs_corrected_bg]
            c_g_content_bg = calculate_average_ngram_presence(
                bg_coords, genome, ngram_correction)
            debug(np.histogram(c_g_content_bg, bins)[0])

            if np.array_equal(K_MATCH * target_hist,
                              np.histogram(c_g_content_bg, bins)[0]):
                info('C+G content perfectly matched!\n\ttarget:%s\n\tbg :%s' %
                     (target_hist, np.histogram(c_g_content_bg, bins)[0]))
            else:
                warn('C+G content not perfectly matched\n\ttarget:%s\n\tbg :%s'
                     % (target_hist, np.histogram(c_g_content_bg, bins)[0]))
                debug(target_hist / np.histogram(c_g_content_bg, bins)[0])

        # NOTE(review): this down-sampling/bootstrap step is placed inside
        # the user-supplied-background branch -- confirm against the
        # upstream layout.
        if len(bg_coords) >= bg_target_ratio * n_target_coordinates:
            bg_coords = random.sample(
                bg_coords, int(bg_target_ratio * n_target_coordinates))
        elif bootstrap and len(bg_coords) < (
                bg_target_ratio * n_target_coordinates * 0.95):
            # allow a small tolerance!
            info('bootstrapping to obtain enough background regions')
            bg_coords = sample_wr(
                bg_coords, int(bg_target_ratio * n_target_coordinates))
            # target_hist and bins only exist when the C+G correction ran.
            if c_g_correction:
                c_g_content_bg = calculate_average_ngram_presence(
                    bg_coords, genome, ngram_correction)
                debug('After bootstrap:\n\ttarget:%s\n\tbg :%s' %
                      (target_hist, np.histogram(c_g_content_bg, bins)[0]))

    info('Extracting Motifs in background coordinates')
    negative_matrix, motifs_profiles_in_bg, idxs_seqs_with_motif_bg = parallel_fimo_scanning(
        bg_coords,
        meme_motifs_filename,
        genome,
        nucleotide_bg_filename,
        temp_directory=temp_directory,
        p_value=p_value,
        mask_repetitive=mask_repetitive,
        window_length=window_length,
        internal_window_length=internal_window_length,
        num_consumers=n_processes)[0:3]

    # allocate data for reports
    N_MOTIFS = len(motif_ids)
    fisher_p_values = np.zeros(N_MOTIFS)
    central_enrichment = np.zeros(N_MOTIFS)

    N_seq_p = positive_matrix.shape[0]
    N_seq_n = negative_matrix.shape[0]

    profile_presence_p = (positive_matrix > 0).sum(0)
    profile_presence_n = (negative_matrix > 0).sum(0)

    support_p = profile_presence_p / float(N_seq_p)
    support_n = profile_presence_n / float(N_seq_n)

    # Slice bounds of the internal window inside the profile; floor division
    # keeps them integers.
    internal_bpstart = window_length // 2 - internal_window_length // 2
    internal_bpend = window_length // 2 + internal_window_length // 2

    for idx, motif_id in enumerate(motif_ids):
        fisher_p_values[idx] = stats.fisher_exact(
            [[profile_presence_p[idx], N_seq_p - profile_presence_p[idx]],
             [profile_presence_n[idx], N_seq_n - profile_presence_n[idx]]])[1]

        profile = motifs_profiles_in_sequences[motif_id]
        central_enrichment[idx] = profile[
            internal_bpstart:internal_bpend].mean() / np.hstack(
                [profile[:internal_bpstart], profile[internal_bpend:]]).mean()

    motif_ratios = (support_p + 0.01) / (support_n + 0.01)

    # Fundamental: motifs with low target support get a neutral ratio.
    if not disable_ratio:
        motif_ratios[support_p < 0.03] = 1

    rankings = stats.rankdata(-motif_ratios)

    # keep only motifs enriched in the target unless the filter is disabled
    if not disable_ratio:
        idxs_to_keep = np.nonzero(motif_ratios > 1)[0]
    else:
        idxs_to_keep = list(range(len(motif_ratios)))

    rankings = rankings[idxs_to_keep]
    motif_ratios = motif_ratios[idxs_to_keep]
    support_p = support_p[idxs_to_keep]
    support_n = support_n[idxs_to_keep]
    fisher_p_values = fisher_p_values[idxs_to_keep]
    central_enrichment = central_enrichment[idxs_to_keep]

    motif_ids = [motif_ids[_] for _ in idxs_to_keep]
    motif_names = [motif_names[_] for _ in idxs_to_keep]
    motif_idxs = [_ for _ in idxs_to_keep]

    try:
        # we test the ones only with ratio >1
        qvalues = estimate_qvalues(fisher_p_values)
    except Exception:
        # Keep the report going, but make sure qvalues exists for it.
        warn('Unable to estimate q-values for p-values: %s' %
             fisher_p_values)
        qvalues = np.ones(len(fisher_p_values)) * np.nan

    # generate reports in html
    info('Generating HTML report...')
    imgs_directory = os.path.join(output_directory, 'images')
    genes_list_directory = os.path.join(output_directory, 'genes_lists')
    motif_regions_directory = os.path.join(output_directory,
                                           'motifs_regions')

    # create folders
    if not os.path.exists(imgs_directory):
        os.makedirs(imgs_directory)
    if use_gene_annotations and not os.path.exists(genes_list_directory):
        os.makedirs(genes_list_directory)
    if not os.path.exists(motif_regions_directory):
        os.makedirs(motif_regions_directory)

    templates_directory = determine_path('extra') + '/templates/'
    j2_env = Environment(loader=FileSystemLoader(templates_directory),
                         trim_blocks=True)
    info('DIRECTORY:%s' % templates_directory)
    template = j2_env.get_template('report_template.html')

    # copy haystack logo and bg
    shutil.copyfile(templates_directory + 'haystack_logo.png',
                    os.path.join(imgs_directory, 'haystack_logo.png'))
    shutil.copyfile(templates_directory + 'noise.png',
                    os.path.join(imgs_directory, 'noise.png'))

    motifs_dump = []
    for i in np.argsort(rankings):
        if not ((support_p[i] >= 0.03 or disable_ratio)
                and fisher_p_values[i] < 0.01
                and (motif_ratios[i] > 1 or disable_ratio)
                and central_enrichment[i] > min_central_enrichment):
            continue

        motif_id = motif_ids[i]
        info('Generating logo and profile for:' + motif_id)

        # create motif logo (png and pdf)
        img_logo = os.path.join(imgs_directory, 'logo_' + motif_id)
        generate_weblogo(motif_id,
                         meme_motifs_filename,
                         img_logo,
                         title=motif_id,
                         SEQLOGO=determine_path('extra') + '/seqlogo')
        generate_weblogo(motif_id,
                         meme_motifs_filename,
                         img_logo,
                         title=motif_id,
                         SEQLOGO=determine_path('extra') + '/seqlogo',
                         file_format='pdf')
        # fix the weblogo prefix problem
        img_logo_url = os.path.join('images', 'logo_' + motif_id + '.png')

        # create motif enrichment profile
        img_profile = os.path.join(imgs_directory,
                                   'profile_' + motif_id + '.png')
        motif_profile_target = motifs_profiles_in_sequences[
            motif_id] / N_seq_p
        motif_profile_bg = motifs_profiles_in_bg[motif_id] / N_seq_n

        generate_motif_profile(motif_profile_target,
                               motif_profile_bg,
                               motif_id,
                               img_profile,
                               smooth_size=smooth_size,
                               window_size=window_length)
        img_profile_url = os.path.join('images',
                                       'profile_' + motif_id + '.png')

        # create regions
        info('Extracting regions with:' + motif_id)
        regions = os.path.join(motif_regions_directory,
                               motif_id + '_motif_region_in_target.bed')
        with open(regions, 'w+') as outfile:
            outfile.write(
                'Chromosome\tStart\tEnd\tMotif hits inside region\tNumber of hits\n')
            for c, locations in motif_coords_in_seqs_with_motif[
                    motif_id].items():
                outfile.write('\t'.join([
                    c.chr_id,
                    str(c.bpstart),
                    str(c.bpend),
                    ';'.join(['-'.join(map(str, map(int, l)))
                              for l in locations]),
                    str(len(locations))
                ]) + '\n')
        regions_url = os.path.join('motifs_regions',
                                   motif_id + '_motif_region_in_target.bed')

        # map closest downstream genes
        genes_url = None
        if use_gene_annotations:
            info('Mapping regions with:%s to the clostest genes' % motif_id)
            peak_annotator_path = os.path.join(determine_path('extra/'),
                                               'PeakAnnotator.jar')
            if gene_ids_to_names_filename:
                sb.call('java -jar ' + peak_annotator_path +
                        ' -u TSS -p %s -a %s -s %s -o %s >/dev/null 2>&1'
                        % (regions, gene_annotations_filename,
                           gene_ids_to_names_filename, genes_list_directory),
                        shell=True,
                        env=system_env)
            else:
                sb.call('java -jar ' + peak_annotator_path +
                        ' -u TSS -p %s -a %s -o %s >/dev/null 2>&1'
                        % (regions, gene_annotations_filename,
                           genes_list_directory),
                        shell=True,
                        env=system_env)
            genes_url = os.path.join(
                'genes_lists', motif_id + '_motif_region_in_target.tss.bed')

        motifs_dump.append({
            'id': motif_id,
            'name': motif_names[i],
            'support_p': support_p[i] * 100,
            'support_n': support_n[i] * 100,
            'ratio': motif_ratios[i],
            'rank': float(rankings[i]),
            'pvalue': fisher_p_values[i],
            'qvalue': qvalues[i],
            'central_enrichment': central_enrichment[i],
            'img_logo': img_logo_url,
            'img_profile': img_profile_url,
            'regions': regions_url,
            'genes': genes_url,
            'idx_motif': motif_idxs[i]
        })

    report_filename = os.path.join(output_directory, "Haystack_report.html")
    with codecs.open(report_filename, "w", "utf-8") as report:
        report.write(
            template.render(motifs_dump=motifs_dump,
                            bed_target_filename=bed_target_filename,
                            bed_bg_filename=bed_bg_filename,
                            N_TARGET=N_seq_p,
                            N_BG=N_seq_n,
                            meme_motifs_filename=meme_motifs_filename,
                            COMMAND_USED=COMMAND_USED,
                            use_gene_annotations=use_gene_annotations))

    if dump:
        info('Saving all the intermediate data on: %s ...' %
             output_directory)
        dump_directory = os.path.join(output_directory, 'dump')

        if not os.path.exists(dump_directory):
            os.makedirs(dump_directory)

        np.save(os.path.join(dump_directory, 'matrix_' + target_name),
                positive_matrix)
        np.save(os.path.join(dump_directory, 'matrix_BG_' + target_name),
                negative_matrix)

        # Each pickle is written through a context manager so the handle is
        # flushed and closed, and in binary mode as pickle expects.
        pickles = [
            (motifs_dump, target_name + '_motif_dumps.pickle'),
            (idxs_seqs_with_motif, target_name + '_motif_seqs_idxs.pickle'),
            (idxs_seqs_with_motif_bg, bg_name + '_motif_seqs_idxs.pickle'),
            (motif_coords_in_seqs_with_motif,
             target_name + '_motif_coords_in_seqs_with_motif.pickle'),
        ]
        for payload, pickle_name in pickles:
            with open(os.path.join(dump_directory, pickle_name),
                      'wb') as pickle_file:
                cp.dump(payload, pickle_file)

        Coordinate.coordinates_to_bed(
            target_coords,
            os.path.join(
                dump_directory,
                'Target_coordinates_selected_on_' + target_name + '.bed'),
            minimal_format=False)
        Coordinate.coordinates_to_bed(
            bg_coords,
            os.path.join(dump_directory,
                         'BG_coordinates_selected_on_' + bg_name + '.bed'),
            minimal_format=True)

    info('All done! Ciao!')
    sys.exit(0)
fh.setFormatter(formatter) logger.addHandler(fh) ch = logging.StreamHandler() ch.setLevel(logging.DEBUG) ch.setFormatter(formatter) logger.addHandler(ch) for pam in pams: if not all(x in ['A', 'T', 'C', 'G', '[', ']'] for x in set(list(pam))): logger.error('The PAM %s includes an unidentifiable character. Please only use A, T, C, G bases.\nBrackets [] can be used to specify multiple bases at a single position.\nFor example: NGG = [ATCG]GG ...' % pam) sys.exit(1) ##### Input genome'Importing %s genome ...' % genome) genome = Genome_2bit(genome) ##### Input regions file'Reading in %s file ...' % input_f) total_regions = 0 bed_dict = {} with open(input_f, 'r') as f: for line in f: total_regions += 1 if 'csv' in input_f: line = line.strip('\n').split(',') else: line = line.strip('\n').split()
def main(): print '\n[H A Y S T A C K H O T S P O T]' print( '\n-SELECTION OF VARIABLE REGIONS- [Luca Pinello - [email protected]]\n' ) print 'Version %s\n' % HAYSTACK_VERSION if which('samtools') is None: error( 'Haystack requires samtools free available at:' ) sys.exit(1) if which('bedtools') is None: error( 'Haystack requires bedtools free available at:' ) sys.exit(1) if which('bedGraphToBigWig') is None: info( 'To generate the bigwig files Haystack requires bedGraphToBigWig please download from here: and add to your PATH' ) #mandatory parser = argparse.ArgumentParser(description='HAYSTACK Parameters') parser.add_argument( 'samples_filename_or_bam_folder', type=str, help= 'A tab delimeted file with in each row (1) a sample name, (2) the path to the corresponding bam filename. Alternatively it is possible to specify a folder containing some .bam files to analyze.' ) parser.add_argument( 'genome_name', type=str, help='Genome assembly to use from UCSC (for example hg19, mm9, etc.)') #optional parser.add_argument('--bin_size', type=int, help='bin size to use(default: 500bp)', default=500) parser.add_argument('--disable_quantile_normalization', help='Disable quantile normalization (default: False)', action='store_true') parser.add_argument( '--th_rpm', type=float, help= 'Percentile on the signal intensity to consider for the hotspots (default: 99)', default=99) parser.add_argument( '--transformation', type=str, help= 'Variance stabilizing transformation among: none, log2, angle (default: angle)', default='angle', choices=['angle', 'log2', 'none']) parser.add_argument('--recompute_all', help='Ignore any file previously precalculated', action='store_true') parser.add_argument( '--z_score_high', type=float, help='z-score value to select the specific regions(default: 1.5)', default=1.5) parser.add_argument( '--z_score_low', type=float, help='z-score value to select the not specific regions(default: 0.25)', default=0.25) parser.add_argument('--name', help='Define a custom 
output filename for the report', default='') parser.add_argument('--output_directory', type=str, help='Output directory (default: current directory)', default='') parser.add_argument( '--use_X_Y', help= 'Force to process the X and Y chromosomes (default: not processed)', action='store_true') parser.add_argument( '--max_regions_percentage', type=float, help= 'Upper bound on the %% of the regions selected (deafult: 0.1, 0.0=0%% 1.0=100%%)', default=0.1) parser.add_argument( '--depleted', help= 'Look for cell type specific regions with depletion of signal instead of enrichment', action='store_true') parser.add_argument( '--input_is_bigwig', help= 'Use the bigwig format instead of the bam format for the input. Note: The files must have extension .bw', action='store_true') parser.add_argument('--version', help='Print version and exit.', action='version', version='Version %s' % HAYSTACK_VERSION) args = parser.parse_args() args_dict = vars(args) for key, value in args_dict.items(): exec('%s=%s' % (key, repr(value))) if input_is_bigwig: extension_to_check = '.bw' info('Input is set BigWig (.bw)') else: extension_to_check = '.bam' info('Input is set compressed SAM (.bam)') #check folder or sample filename if os.path.isfile(samples_filename_or_bam_folder): BAM_FOLDER = False bam_filenames = [] sample_names = [] with open(samples_filename_or_bam_folder) as infile: for line in infile: if not line.strip(): continue if line.startswith( '#'): #skip optional header line or empty lines info('Skipping header/comment line:%s' % line) continue fields = line.strip().split() n_fields = len(fields) if n_fields == 2: sample_names.append(fields[0]) bam_filenames.append(fields[1]) else: error('The samples file format is wrong!') sys.exit(1) else: if os.path.exists(samples_filename_or_bam_folder): BAM_FOLDER = True bam_filenames = glob.glob( os.path.join(samples_filename_or_bam_folder, '*' + extension_to_check)) if not bam_filenames: error('No bam/bigwig files to analyze in %s. Exiting.' 
% samples_filename_or_bam_folder) sys.exit(1) sample_names = [ os.path.basename(bam_filename).replace(extension_to_check, '') for bam_filename in bam_filenames ] else: error("The file or folder %s doesn't exist. Exiting." % samples_filename_or_bam_folder) sys.exit(1) #check all the files before starting info('Checking samples files location...') for bam_filename in bam_filenames: check_file(bam_filename) info('Initializing Genome:%s' % genome_name) genome_directory = determine_path('genomes') genome_2bit = os.path.join(genome_directory, genome_name + '.2bit') if os.path.exists(genome_2bit): genome = Genome_2bit(genome_2bit) else: info("\nIt seems you don't have the required genome file.") if query_yes_no('Should I download it for you?'):'haystack_download_genome %s' % genome_name, shell=True, env=system_env) if os.path.exists(genome_2bit): info('Genome correctly downloaded!') genome = Genome_2bit(genome_2bit) else: error( 'Sorry I cannot download the required file for you. Check your Internet connection.' ) sys.exit(1) else: error( 'Sorry I need the genome file to perform the analysis. Exiting...' 
) sys.exit(1) chr_len_filename = os.path.join(genome_directory, "%s_chr_lengths.txt" % genome_name) check_file(chr_len_filename) if name: directory_name = 'HAYSTACK_HOTSPOTS_on_%s' % name else: directory_name = 'HAYSTACK_HOTSPOTS' if output_directory: output_directory = os.path.join(output_directory, directory_name) else: output_directory = directory_name if not os.path.exists(output_directory): os.makedirs(output_directory) genome_sorted_bins_file = os.path.join( output_directory, '%s.%dbp.bins.sorted.bed' % (os.path.basename(genome_name), bin_size)) tracks_directory = os.path.join(output_directory, 'TRACKS') if not os.path.exists(tracks_directory): os.makedirs(tracks_directory) intermediate_directory = os.path.join(output_directory, 'INTERMEDIATE') if not os.path.exists(intermediate_directory): os.makedirs(intermediate_directory) if not os.path.exists(genome_sorted_bins_file) or recompute_all: info('Creating bins of %dbp for %s in %s' % (bin_size, chr_len_filename, genome_sorted_bins_file)) 'bedtools makewindows -g %s -w %s | bedtools sort -i stdin |' % (chr_len_filename, bin_size) + "perl -nle 'print " + '"$_\t$.";' + "' /dev/stdin> %s" % genome_sorted_bins_file, shell=True, env=system_env) #convert bam files to genome-wide rpm tracks for base_name, bam_filename in zip(sample_names, bam_filenames): info('Processing:%s' % bam_filename) rpm_filename = os.path.join(tracks_directory, '%s.bedgraph' % base_name) sorted_rpm_filename = os.path.join(tracks_directory, '%s_sorted.bedgraph' % base_name) mapped_sorted_rpm_filename = os.path.join( tracks_directory, '%s_mapped_sorted.bedgraph' % base_name) binned_rpm_filename = os.path.join( intermediate_directory, '%s.%dbp.rpm' % (base_name, bin_size)) bigwig_filename = os.path.join(tracks_directory, '' % base_name) if input_is_bigwig and which('bigWigAverageOverBed'): if not os.path.exists(binned_rpm_filename) or recompute_all: cmd = 'bigWigAverageOverBed %s %s /dev/stdout | sort -s -n -k 1,1 | cut -f5 > %s' % ( 
bam_filename, genome_sorted_bins_file, binned_rpm_filename), shell=True, env=system_env) shutil.copy2(bam_filename, bigwig_filename) else: if not os.path.exists(binned_rpm_filename) or recompute_all: info('Computing Scaling Factor...') cmd = 'samtools view -c -F 512 %s' % bam_filename #print cmd proc = sb.Popen(cmd, stdout=sb.PIPE, shell=True, env=system_env) (stdout, stderr) = proc.communicate() #print stdout,stderr scaling_factor = (1.0 / float(stdout.strip())) * 1000000 info('Scaling Factor: %e' % scaling_factor) info('Building BedGraph RPM track...') cmd = 'samtools view -b -F 512 %s | bamToBed | slopBed -r %s -l 0 -s -i stdin -g %s | genomeCoverageBed -g %s -i stdin -bg -scale %.32f > %s' % ( bam_filename, bin_size, chr_len_filename, chr_len_filename, scaling_factor, rpm_filename) #print cmd proc =, shell=True, env=system_env) if which('bedGraphToBigWig'): if not os.path.exists(bigwig_filename) or recompute_all: info('Converting BedGraph to BigWig') cmd = 'bedGraphToBigWig %s %s %s' % ( rpm_filename, chr_len_filename, bigwig_filename) proc =, shell=True, env=system_env) else: info( 'Sorry I cannot create the bigwig file.\nPlease download and install bedGraphToBigWig from here: and add to your PATH' ) if not os.path.exists(binned_rpm_filename) or recompute_all: info('Make constant binned (%dbp) rpm values file' % bin_size) #cmd='bedtools sort -i %s | bedtools map -a %s -b stdin -c 4 -o mean -null 0.0 | cut -f5 > %s' %(rpm_filename,genome_sorted_bins_file,binned_rpm_filename),shell=True,env=system_env) cmd = 'sort -k1,1 -k2,2n %s > %s' % (rpm_filename, sorted_rpm_filename) proc =, shell=True, env=system_env) cmd = 'bedtools map -a %s -b %s -c 4 -o mean -null 0.0 > %s' % ( genome_sorted_bins_file, sorted_rpm_filename, mapped_sorted_rpm_filename) proc =, shell=True, env=system_env) cmd = 'cut -f5 %s > %s' % (mapped_sorted_rpm_filename, binned_rpm_filename) proc =, shell=True, env=system_env) try: os.remove(rpm_filename) os.remove(sorted_rpm_filename) 
os.remove(mapped_sorted_rpm_filename) except: pass #load coordinates of bins coordinates_bin = pd.read_csv(genome_sorted_bins_file, names=['chr_id', 'bpstart', 'bpend'], sep='\t', header=None, usecols=[0, 1, 2]) N_BINS = coordinates_bin.shape[0] if not use_X_Y: coordinates_bin = coordinates_bin.ix[ (coordinates_bin['chr_id'] != 'chrX') & (coordinates_bin['chr_id'] != 'chrY')] #load all the tracks info('Loading the processed tracks') df_chip = {} for state_file in glob.glob(os.path.join(intermediate_directory, '*.rpm')): col_name = os.path.basename(state_file).replace('.rpm', '') df_chip[col_name] = pd.read_csv(state_file, squeeze=True, header=None) info('Loading:%s' % col_name) df_chip = pd.DataFrame(df_chip) if disable_quantile_normalization: info('Skipping quantile normalization...') else: info('Normalizing the data...') df_chip = pd.DataFrame(quantile_normalization(df_chip.values), columns=df_chip.columns, index=df_chip.index) if which('bedGraphToBigWig'): #write quantile normalized tracks coord_quantile = coordinates_bin.copy() for col in df_chip: if disable_quantile_normalization: normalized_output_filename = os.path.join( tracks_directory, '%s.bedgraph' % os.path.basename(col)) else: normalized_output_filename = os.path.join( tracks_directory, '%s_quantile_normalized.bedgraph' % os.path.basename(col)) normalized_output_filename_bigwig = normalized_output_filename.replace( '.bedgraph', '.bw') if not os.path.exists( normalized_output_filename_bigwig) or recompute_all: info('Writing binned track: %s' % normalized_output_filename_bigwig) coord_quantile['rpm_normalized'] = df_chip.ix[:, col] coord_quantile.dropna().to_csv(normalized_output_filename, sep='\t', header=False, index=False) cmd = 'bedGraphToBigWig %s %s %s' % ( normalized_output_filename, chr_len_filename, normalized_output_filename_bigwig) proc =, shell=True, env=system_env) try: os.remove(normalized_output_filename) except: pass else: info( 'Sorry I cannot creat the bigwig file.\nPlease download and 
install bedGraphToBigWig from here: and add to your PATH' ) #th_rpm=np.min(df_chip.apply(lambda x: np.percentile(x,th_rpm))) th_rpm = find_th_rpm(df_chip, th_rpm) info('Estimated th_rpm:%s' % th_rpm) df_chip_not_empty = df_chip.ix[(df_chip > th_rpm).any(1), :] if transformation == 'log2': df_chip_not_empty = df_chip_not_empty.applymap(log2_transform) info('Using log2 transformation') elif transformation == 'angle': df_chip_not_empty = df_chip_not_empty.applymap(angle_transform) info('Using angle transformation') else: info('Using no transformation') iod_values = df_chip_not_empty.var(1) / df_chip_not_empty.mean(1) ####calculate the inflation point a la superenhancers scores = iod_values min_s = np.min(scores) max_s = np.max(scores) N_POINTS = len(scores) x = np.linspace(0, 1, N_POINTS) y = sorted((scores - min_s) / (max_s - min_s)) m = smooth((np.diff(y) / np.diff(x)), 50) m = m - 1 m[m <= 0] = np.inf m[:int(len(m) * (1 - max_regions_percentage))] = np.inf idx_th = np.argmin(m) + 1 #print idx_th, th_iod = sorted(iod_values)[idx_th] #print th_iod hpr_idxs = iod_values > th_iod #print len(iod_values),len(hpr_idxs),sum(hpr_idxs), sum(hpr_idxs)/float(len(hpr_idxs)), info('Selected %f%% regions (%d)' % (sum(hpr_idxs) / float(len(hpr_idxs)) * 100, sum(hpr_idxs))) coordinates_bin['iod'] = iod_values #we remove the regions "without" signal in any of the cell types coordinates_bin.dropna(inplace=True) #create a track for IGV bedgraph_iod_track_filename = os.path.join(tracks_directory, 'VARIABILITY.bedgraph') bw_iod_track_filename = os.path.join(tracks_directory, '') if not os.path.exists(bw_iod_track_filename) or recompute_all: info('Generating variability track in bigwig format in:%s' % bw_iod_track_filename) coordinates_bin.to_csv(bedgraph_iod_track_filename, sep='\t', header=False, index=False)'bedGraphToBigWig %s %s %s' % (bedgraph_iod_track_filename, chr_len_filename, bw_iod_track_filename), shell=True, env=system_env) try: os.remove(bedgraph_iod_track_filename) 
except: pass #Write the HPRs bedgraph_hpr_filename = os.path.join( tracks_directory, 'SELECTED_VARIABILITY_HOTSPOT.bedgraph') to_write = coordinates_bin.ix[hpr_idxs[hpr_idxs].index] to_write.dropna(inplace=True) to_write['bpstart'] = to_write['bpstart'].astype(int) to_write['bpend'] = to_write['bpend'].astype(int) to_write.to_csv(bedgraph_hpr_filename, sep='\t', header=False, index=False) bed_hpr_fileaname = os.path.join(output_directory, 'SELECTED_VARIABILITY_HOTSPOT.bed') if not os.path.exists(bed_hpr_fileaname) or recompute_all: info('Writing the HPRs in: %s' % bed_hpr_fileaname)'sort -k1,1 -k2,2n %s | bedtools merge -i stdin > %s' % (bedgraph_hpr_filename, bed_hpr_fileaname), shell=True, env=system_env) #os.remove(bedgraph_hpr_filename) df_chip_hpr = df_chip_not_empty.ix[hpr_idxs, :] df_chip_hpr_zscore = df_chip_hpr.apply(zscore, axis=1) specific_regions_directory = os.path.join(output_directory, 'SPECIFIC_REGIONS') if not os.path.exists(specific_regions_directory): os.makedirs(specific_regions_directory) if depleted: z_score_high = -z_score_high z_score_low = -z_score_low #write target info('Writing Specific Regions for each cell line...') coord_zscore = coordinates_bin.copy() for col in df_chip_hpr_zscore: regions_specific_filename = 'Regions_specific_for_%s_z_%.2f.bedgraph' % ( os.path.basename(col).replace('.rpm', ''), z_score_high) specific_output_filename = os.path.join(specific_regions_directory, regions_specific_filename) specific_output_bed_filename = specific_output_filename.replace( '.bedgraph', '.bed') if not os.path.exists(specific_output_bed_filename) or recompute_all: if depleted: coord_zscore['z-score'] = df_chip_hpr_zscore.ix[ df_chip_hpr_zscore.ix[:, col] < z_score_high, col] else: coord_zscore['z-score'] = df_chip_hpr_zscore.ix[ df_chip_hpr_zscore.ix[:, col] > z_score_high, col] coord_zscore.dropna().to_csv(specific_output_filename, sep='\t', header=False, index=False) info('Writing:%s' % specific_output_bed_filename)'sort -k1,1 -k2,2n %s | 
bedtools merge -i stdin > %s' % (specific_output_filename, specific_output_bed_filename), shell=True, env=system_env) #write background info('Writing Background Regions for each cell line...') coord_zscore = coordinates_bin.copy() for col in df_chip_hpr_zscore: regions_bg_filename = 'Background_for_%s_z_%.2f.bedgraph' % ( os.path.basename(col).replace('.rpm', ''), z_score_low) bg_output_filename = os.path.join( specific_regions_directory, 'Background_for_%s_z_%.2f.bedgraph' % (os.path.basename(col).replace('.rpm', ''), z_score_low)) bg_output_bed_filename = bg_output_filename.replace( '.bedgraph', '.bed') if not os.path.exists(bg_output_bed_filename) or recompute_all: if depleted: coord_zscore['z-score'] = df_chip_hpr_zscore.ix[ df_chip_hpr_zscore.ix[:, col] > z_score_low, col] else: coord_zscore['z-score'] = df_chip_hpr_zscore.ix[ df_chip_hpr_zscore.ix[:, col] < z_score_low, col] coord_zscore.dropna().to_csv(bg_output_filename, sep='\t', header=False, index=False) info('Writing:%s' % bg_output_bed_filename)'sort -k1,1 -k2,2n -i %s | bedtools merge -i stdin > %s' % (bg_output_filename, bg_output_bed_filename), shell=True, env=system_env) ###plot selection pl.figure() pl.title('Selection of the HPRs') pl.plot(x, y, 'r', lw=3) pl.plot(x[idx_th], y[idx_th], '*', markersize=20) pl.hold(True) x_ext = np.linspace(-0.1, 1.2, N_POINTS) y_line = (m[idx_th] + 1.0) * (x_ext - x[idx_th]) + y[idx_th] pl.plot(x_ext, y_line, '--k', lw=3) pl.xlim(0, 1.1) pl.ylim(0, 1) pl.xlabel('Fraction of bins') pl.ylabel('Score normalized') pl.savefig( os.path.join(output_directory, 'SELECTION_OF_VARIABILITY_HOTSPOT.pdf')) pl.close() igv_session_filename = os.path.join(output_directory, 'OPEN_ME_WITH_IGV.xml') info('Creating an IGV session file (.xml) in: %s' % igv_session_filename) session = ET.Element("Session") session.set("genome", genome_name) session.set("hasGeneTrack", "true") session.set("version", "7") resources = ET.SubElement(session, "Resources") panel = ET.SubElement(session, 
"Panel") resource_items = [] track_items = [] hpr_iod_scores = scores[scores > th_iod] min_h = np.mean(hpr_iod_scores) - 2 * np.std(hpr_iod_scores) max_h = np.mean(hpr_iod_scores) + 2 * np.std(hpr_iod_scores) mid_h = np.mean(hpr_iod_scores) #write the tracks for sample_name in sample_names: if disable_quantile_normalization: track_full_path = os.path.join( output_directory, 'TRACKS', '' % (sample_name, bin_size)) else: track_full_path = os.path.join( output_directory, 'TRACKS', '' % (sample_name, bin_size)) track_filename = rem_base_path(track_full_path, output_directory) if os.path.exists(track_full_path): resource_items.append(ET.SubElement(resources, "Resource")) resource_items[-1].set("path", track_filename) track_items.append(ET.SubElement(panel, "Track")) track_items[-1].set('color', "0,0,178") track_items[-1].set('id', track_filename) track_items[-1].set("name", sample_name) resource_items.append(ET.SubElement(resources, "Resource")) resource_items[-1].set( "path", rem_base_path(bw_iod_track_filename, output_directory)) track_items.append(ET.SubElement(panel, "Track")) track_items[-1].set('color', "178,0,0") track_items[-1].set('id', rem_base_path(bw_iod_track_filename, output_directory)) track_items[-1].set('renderer', "HEATMAP") track_items[-1].set( "colorScale", "ContinuousColorScale;%e;%e;%e;%e;0,153,255;255,255,51;204,0,0" % (mid_h, min_h, mid_h, max_h)) track_items[-1].set("name", 'VARIABILITY') resource_items.append(ET.SubElement(resources, "Resource")) resource_items[-1].set("path", rem_base_path(bed_hpr_fileaname, output_directory)) track_items.append(ET.SubElement(panel, "Track")) track_items[-1].set('color', "178,0,0") track_items[-1].set('id', rem_base_path(bed_hpr_fileaname, output_directory)) track_items[-1].set('renderer', "HEATMAP") track_items[-1].set( "colorScale", "ContinuousColorScale;%e;%e;%e;%e;0,153,255;255,255,51;204,0,0" % (mid_h, min_h, mid_h, max_h)) track_items[-1].set("name", 'HOTSPOTS') for sample_name in sample_names: 
track_full_path = glob.glob( os.path.join(output_directory, 'SPECIFIC_REGIONS', 'Regions_specific_for_%s*.bedgraph' % sample_name))[0] specific_track_filename = rem_base_path(track_full_path, output_directory) if os.path.exists(track_full_path): resource_items.append(ET.SubElement(resources, "Resource")) resource_items[-1].set("path", specific_track_filename) track_items.append(ET.SubElement(panel, "Track")) track_items[-1].set('color', "178,0,0") track_items[-1].set('id', specific_track_filename) track_items[-1].set('renderer', "HEATMAP") track_items[-1].set( "colorScale", "ContinuousColorScale;%e;%e;%e;%e;0,153,255;255,255,51;204,0,0" % (mid_h, min_h, mid_h, max_h)) track_items[-1].set("name", 'REGION SPECIFIC FOR %s' % sample_name) tree = ET.ElementTree(session) tree.write(igv_session_filename, xml_declaration=True) info('All done! Ciao!') sys.exit(0)