def bed_to_bedgraph_by_strand(in_files, out_bedgraphs):
    """Extend reads to the full fragment length and make one bedgraph per strand.

    in_files is a pair: (BED file of reads, chromosome sizes file handed to
    slopBed via -g).
    out_bedgraphs is indexed as [0] for the "+" strand output and [1] for the
    "-" strand output.
    """
    in_bed, in_chrom_sizes = in_files
    # number of bases added to each read: fragment length minus the tag length
    extension = (cfg.getint("DEFAULT", "fragment_size")
                 - cfg.getint("DEFAULT", "tag_size"))
    genome = cfg.get("DEFAULT", "genome")
    chrom_sizes_prefix = genome_path()
    template = (
        "slopBed -i %s -s -r %s -l 0 -g %s | "
        """awk '{if ($6 == "%s") print $0}' | """
        "bedItemOverlapCount %s -chromSize=%s.chrom.sizes stdin > %s"
    )
    # The original body referred to an undefined name (out_bedgraph) instead
    # of the out_bedgraphs parameter, which raised NameError on every call.
    for strand, out_index in (("+", 0), ("-", 1)):
        cmd = template % (
            in_bed,
            extension,
            in_chrom_sizes,
            strand,
            genome,
            chrom_sizes_prefix,
            out_bedgraphs[out_index],
        )
        sys_call(cmd)
def quest_to_wig(in_reads, out_glob, out_template, in_dir, chrom_sizes):
    """Decompress the four QuEST wig profiles found under *in_dir*.

    Each <in_dir>/tracks/wig_profiles/<name>.profile.wig.gz is gunzipped into
    out_template % name.  in_reads, out_glob and chrom_sizes are accepted but
    not used in this function.
    """
    profile_names = (
        'background_unnormalized',
        'background_normalized',
        'ChIP_normalized',
        'ChIP_unnormalized',
    )
    wig_pattern = in_dir + '/tracks/wig_profiles/%s.profile.wig.gz'
    for profile in profile_names:
        compressed = wig_pattern % profile
        unpacked = out_template % profile
        sys_call('gunzip -c -d %s > %s' % (compressed, unpacked))
def merge_comparison_types(in_files, out_merged):
    """Concatenate the comparison-type tables into one file for plotting.

    The first input is copied in full; every later input is appended with its
    first line (the header) removed.
    """
    first_table, other_tables = in_files[0], in_files[1:]
    sys_call('cat %s > %s ' % (first_table, out_merged), file_log=False)
    for table in other_tables:
        # sed 1d drops the header line of the appended file
        sys_call('sed 1d %s >> %s' % (table, out_merged), file_log=False)
def remove_nonoverlapping_reads(in_bed, out_bed, min_read_count):
    """Keep only the mapped reads that overlap enough other reads.

    The BED file is intersected with itself and the count in the last column
    is compared with *min_read_count* + 1; surviving lines are cut back to
    their first six columns and written to *out_bed*.
    """
    # the file is intersected against itself, so presumably each read's count
    # includes its own overlap -- hence the + 1 on the threshold
    threshold = min_read_count + 1
    pipeline = ("intersectBed -wa -c -a %s -b %s | awk '$(NF) >= %s' |"
                "cut -f 1,2,3,4,5,6 > %s")
    sys_call(pipeline % (in_bed, in_bed, threshold, out_bed), file_log=False)
def run_quest(in_reads, out_peaks, chrom_sizes):
    """Run QuEST on the treatment and control reads and collect its peaks.

    The treatment file is the first entry of *in_reads* containing '.treat.',
    the control the first containing '.control.'.  QuEST writes into
    "<treatment>_output"; its accepted peak calls are copied to *out_peaks*.
    """
    in_treat = [f for f in in_reads if '.treat.' in f][0]
    in_control = [f for f in in_reads if '.control.' in f][0]
    # canned answers are piped into the script's interactive prompts
    quest_cmd = ('echo "y\n1\n2\ny\n" | generate_QuEST_parameters.pl -QuEST_align_ChIP %s '
                 '-QuEST_align_RX_noIP %s -gt %s -ap %s_output -silent')
    sys_call(quest_cmd % (in_treat, in_control, chrom_sizes, in_treat))
    accepted_calls = '%s_output/calls/peak_caller.ChIP.out.accepted' % in_treat
    shutil.copy(accepted_calls, out_peaks)
def clip_and_sort_peaks(in_bed, out_sorted):
    """Clip BED regions with bedClip, then sort them into *out_sorted*.

    bedClip is given "<genome_path()>.chrom.sizes" and writes to a temporary
    file; that file is then sorted by column 1 and numerically by column 2.
    """
    with tempfile.NamedTemporaryFile() as clipped:
        clip_cmd = "bedClip %s %s.chrom.sizes %s"
        sys_call(clip_cmd % (in_bed, genome_path(), clipped.name))
        sort_cmd = r"sort -t $'\t' -k 1,1 -k 2,2n -S 2G %s > %s"
        sys_call(sort_cmd % (clipped.name, out_sorted))
def discover_nmica_motifs(in_fasta, out_motifs):
    """Discover sequence motifs in peaks by running nestedMICA.

    Runs nminfer on *in_fasta* with the configured 'nmica_params', moves the
    resulting motifs.xms next to the fasta file (".fasta" replaced by
    ".motifs.xms"), parses it and pickles the parsed motifs to *out_motifs*.
    """
    cmd = 'nminfer -seqs %s %s ' % (in_fasta,
                                    cfg.get('motifs', 'nmica_params'))
    sys_call(cmd)
    motifs_name = in_fasta.replace('.fasta', '.motifs.xms')
    sys_call('mv motifs.xms %s' % motifs_name)
    motifs = sequence_motif.parse_xms_motifs(motifs_name)
    # close the pickle file deterministically instead of leaking the handle;
    # the open mode is left unchanged so the on-disk format stays the same
    with open(out_motifs, 'w') as outfile:
        pickle.dump(motifs, outfile)
def discover_meme_motifs(in_fasta, out_motifs):
    """Discover sequence motifs in peaks by running meme.

    meme writes into "<out_motifs>_meme_out"; its meme.txt is parsed and the
    parsed motifs are pickled to *out_motifs*.
    """
    meme_dir = '%s_meme_out' % out_motifs
    cmd = 'meme %s %s -oc %s ' % (
        in_fasta, cfg.get('motifs', 'meme_params'), meme_dir)
    sys_call(cmd)
    motifs = sequence_motif.parseMemeMotifs('%s/meme.txt' % meme_dir)
    # close the pickle file deterministically instead of leaking the handle;
    # the open mode is left unchanged so the on-disk format stays the same
    with open(out_motifs, 'w') as outfile:
        pickle.dump(motifs, outfile)
def deploy_track_files(in_files, out_header):
    """Copy UCSC track files to the configured remote directory.

    The destination is the 'remote_ssh_dir' option of the 'visualization'
    section, in "host:directory" form.  The directory is created over ssh
    before the first copy, each file in *in_files* is sent with scp, and
    *out_header* is touched at the end.
    """
    remote = cfg.get("visualization", "remote_ssh_dir")
    remote_parts = remote.split(":")
    remote_host, remote_dir = remote_parts[0], remote_parts[1]
    for position, in_track in enumerate(in_files):
        if position == 0:
            # the mkdir does not depend on the track, so run it once instead
            # of opening an extra ssh connection for every file
            sys_call("ssh %s mkdir -p %s" % (remote_host, remote_dir))
        sys_call("scp %s %s" % (in_track, remote))
    touch(out_header)
def discover_meme_motifs(in_fasta, out_motifs):
    """Discover sequence motifs in peaks by running meme.

    meme writes into "<out_motifs>_meme_out"; its meme.txt is parsed and the
    parsed motifs are pickled to *out_motifs*.
    """
    meme_dir = '%s_meme_out' % out_motifs
    cmd = 'meme %s %s -oc %s ' % (
        in_fasta, cfg.get('motifs', 'meme_params'), meme_dir)
    sys_call(cmd)
    motifs = sequence_motif.parseMemeMotifs('%s/meme.txt' % meme_dir)
    # close the pickle file deterministically instead of leaking the handle;
    # the open mode is left unchanged so the on-disk format stays the same
    with open(out_motifs, 'w') as outfile:
        pickle.dump(motifs, outfile)
def deploy_track_files(in_files, out_header):
    """Copy UCSC track files to the configured remote directory.

    The destination is the 'remote_ssh_dir' option of the 'visualization'
    section, in "host:directory" form.  The directory is created over ssh
    before the first copy, each file in *in_files* is sent with scp, and
    *out_header* is touched at the end.
    """
    remote = cfg.get('visualization', 'remote_ssh_dir')
    remote_parts = remote.split(':')
    remote_host, remote_dir = remote_parts[0], remote_parts[1]
    for position, in_track in enumerate(in_files):
        if position == 0:
            # the mkdir does not depend on the track, so run it once instead
            # of opening an extra ssh connection for every file
            sys_call('ssh %s mkdir -p %s' % (remote_host, remote_dir))
        sys_call('scp %s %s' % (in_track, remote))
    touch(out_header)
def clip_and_sort_peaks(in_bed, out_sorted):
    """Clip BED regions with bedClip, then sort them into *out_sorted*.

    bedClip is given "<genome_path()>.chrom.sizes" and writes to a temporary
    file; that file is then sorted by column 1 and numerically by column 2.
    """
    with tempfile.NamedTemporaryFile() as clipped:
        clip_cmd = 'bedClip %s %s.chrom.sizes %s'
        sys_call(clip_cmd % (in_bed, genome_path(), clipped.name))
        sort_cmd = r"sort -t $'\t' -k 1,1 -k 2,2n -S 2G %s > %s"
        sys_call(sort_cmd % (clipped.name, out_sorted))
def merge_strands(in_files, out_merged):
    """Concatenate the strand-specific analyses into one file for plotting.

    The first input is written out whole; each later input is appended with
    its first line (the header) removed.
    """
    leading, trailing = in_files[0], in_files[1:]
    sys_call('cat %s > %s ' % (leading, out_merged), file_log=False)
    for strand_file in trailing:
        # sed 1d drops the header line of the appended file
        sys_call('sed 1d %s >> %s' % (strand_file, out_merged),
                 file_log=False)
def run_mosaik_align(in_files, out_align):
    """Align reads to the reference using MosaikAligner.

    in_files must hold exactly five entries: reads archive, reference
    archive, jump database key file, and two entries that are ignored.
    The '_keys.jmp' suffix is removed from the jump file name before it is
    passed to -j.
    """
    in_reads, in_genome_dat, jump_keys, _unused_a, _unused_b = in_files
    jump_prefix = jump_keys.replace('_keys.jmp', '')
    hash_size = cfg.getint('mapping', 'mosaik_hash_size')
    extra_params = cfg.get('mapping', 'mosaik_params')
    template = 'MosaikAligner -in %s -ia %s -j %s -out %s -hs %s %s'
    sys_call(template % (in_reads, in_genome_dat, jump_prefix, out_align,
                         hash_size, extra_params))
def bed_to_bedgraph(in_files, out_bedgraph):
    """Extend reads to the full fragment length and build a bedgraph.

    in_files is a pair: (BED file of reads, chromosome sizes file handed to
    slopBed via -g).  The extended reads are piped into bedItemOverlapCount,
    whose output is redirected to *out_bedgraph*.
    """
    in_bed, in_chrom_sizes = in_files
    # number of bases added to each read: fragment length minus the tag length
    extension = (cfg.getint('DEFAULT', 'fragment_size')
                 - cfg.getint('DEFAULT', 'tag_size'))
    genome = cfg.get('DEFAULT', 'genome')
    pipeline = ('slopBed -i %s -s -r %s -l 0 -g %s | '
                'bedItemOverlapCount %s -chromSize=%s.chrom.sizes stdin > %s')
    sys_call(pipeline % (in_bed, extension, in_chrom_sizes, genome,
                         genome_path(), out_bedgraph))
def get_microRNA(_, out_mirna):
    """Retrieve microRNA genes from UCSC and write them as six-column BED.

    Downloads wgRna.txt.gz for the configured genome into the current
    directory, decompresses it, and writes chrom, start, end,
    "<name>_<type>", score and strand for every row to *out_mirna*.

    Raises ValueError if a row does not have exactly ten tab-separated fields.
    """
    url = 'http://hgdownload.cse.ucsc.edu/goldenPath/%s/database/wgRna.txt.gz'
    url = url % cfg.get('DEFAULT', 'genome')
    sys_call('wget -N -P . %s' % url)
    sys_call('gunzip -f wgRna.txt.gz')
    with open(out_mirna, 'w') as outfile:
        # the input table is now closed deterministically as well
        with open('wgRna.txt') as infile:
            for line in infile:
                # underscore-prefixed fields are unused; names avoid shadowing
                # the builtins bin() and type()
                (_bin, chrom, start, end, name, score, strand,
                 _thick_start, _thick_end, rna_type) = \
                    line.strip().split('\t')
                outfile.write('\t'.join([chrom, start, end,
                                         name + '_' + rna_type,
                                         score, strand]) + '\n')
def get_ssaha2_hashtable(in_genome, out_ssaha2):
    """Build an ssaha2 hash table for the sequences in a .fasta file.

    ssaha2Build writes five files to disk, each prefixed by the hash name
    (*out_ssaha2*), with the extensions .base, .body, .head, .name and .size.
    """
    # TODO: add useful parameters to cmd and config file
    build_args = (out_ssaha2, in_genome)
    sys_call('ssaha2Build -save %s %s' % build_args)
def run_glitr(in_files, out_peaks):
    """Call peaks with GLITR.

    The treatment file is the first entry of *in_files* containing '.treat.',
    the control the first containing '.control.'.  GLITR runs inside a fresh
    "<treatment>.GLITR_out" directory and its allChIP.FDR_*PercentFDR output
    is copied to *out_peaks*.
    """
    in_treat = [f for f in in_files if '.treat.' in f][0]
    in_control = [f for f in in_files if '.control.' in f][0]
    work_dir = in_treat + '.GLITR_out'
    genome = cfg.get('DEFAULT', 'genome').upper()
    extra_params = cfg.get('peaks', 'glitr_params')
    # inputs are addressed as ../<file> because the command cd's into work_dir
    glitr_cmd = ('rm -r %s; mkdir %s; cd %s; GLITR.pl --CHIP=../%s '
                 '--CONTROL=../%s --GENOME=%s %s ')
    sys_call(glitr_cmd % (work_dir, work_dir, work_dir, in_treat, in_control,
                          genome, extra_params))
    sys_call('cp %s/allChIP.FDR_*PercentFDR %s' % (work_dir, out_peaks))
def get_microRNA(_, out_mirna):
    """Retrieve microRNA genes from UCSC and write them as six-column BED.

    Downloads wgRna.txt.gz for the configured genome into the current
    directory, decompresses it, and writes chrom, start, end,
    "<name>_<type>", score and strand for every row to *out_mirna*.

    Raises ValueError if a row does not have exactly ten tab-separated fields.
    """
    url = 'http://hgdownload.cse.ucsc.edu/goldenPath/%s/database/wgRna.txt.gz'
    url = url % cfg.get('DEFAULT', 'genome')
    sys_call('wget -N -P . %s' % url)
    sys_call('gunzip -f wgRna.txt.gz')
    with open(out_mirna, 'w') as outfile:
        # the input table is now closed deterministically as well
        with open('wgRna.txt') as infile:
            for line in infile:
                # underscore-prefixed fields are unused; names avoid shadowing
                # the builtins bin() and type()
                (_bin, chrom, start, end, name, score, strand,
                 _thick_start, _thick_end, rna_type) = \
                    line.strip().split('\t')
                outfile.write('\t'.join([chrom, start, end,
                                         name + '_' + rna_type,
                                         score, strand]) + '\n')
def bed_to_bedgraph(in_files, out_bedgraph):
    """Extend reads to the full fragment length and build a bedgraph.

    in_files is a pair: (BED file of reads, chromosome sizes file handed to
    slopBed via -g).  The extended reads are piped into bedItemOverlapCount,
    whose output is redirected to *out_bedgraph*.
    """
    in_bed, in_chrom_sizes = in_files
    # number of bases added to each read: fragment length minus the tag length
    extension = (cfg.getint("DEFAULT", "fragment_size")
                 - cfg.getint("DEFAULT", "tag_size"))
    genome = cfg.get("DEFAULT", "genome")
    pipeline = ("slopBed -i %s -s -r %s -l 0 -g %s | "
                "bedItemOverlapCount %s -chromSize=%s.chrom.sizes stdin > %s")
    sys_call(pipeline % (in_bed, extension, in_chrom_sizes, genome,
                         genome_path(), out_bedgraph))
def run_ssaha2(in_fastq, out_ssaha2):
    """Map reads with ssaha2 using the prebuilt hash table.

    The hash table name comes from the 'ssaha2_hash_name' option of the
    'mapping' section (see get_ssaha2_hashtable).  Per the ssaha2 manual, the
    command maps DNA sequence reads onto a genomic reference sequence using a
    combination of word hashing and dynamic programming.
    """
    # TODO: add useful parameters to cmd and config file
    hash_name = cfg.get('mapping', 'ssaha2_hash_name')
    template = 'ssaha2 -outfile %s -disk 1 -save %s %s'
    sys_call(template % (out_ssaha2, hash_name, in_fastq))
def run_macs(in_files, out_peaks, max_fdr):
    """Call peaks with MACS (v1.3), apply an FDR cutoff, and write summits.

    in_files[0] is the pair (treatment reads, control reads); the treatment
    name must match r'(.*\.treat)(.*)\.mapped_reads'.  Two BED files are
    written: *out_peaks* with the peaks passing the FDR cutoff, and
    "<out_peaks>_summits.<peak_summit_size>_around" with a window of
    peak_summit_size centred on each passing peak's midpoint.
    """
    in_treat, in_control = in_files[0]
    matches = re.search(r'(.*\.treat)(.*)\.mapped_reads', in_treat).groups()
    name = matches[0] + matches[1] + '.macs.peaks'
    # NOTE(review): the max_fdr argument is overwritten with the configured
    # value, so whatever the caller passes is ignored -- confirm this is
    # intended (run_macs14 uses its argument).
    max_fdr = cfg.getfloat('peaks', 'max_FDR')
    cmd = 'macs -t %s -c %s --name=%s %s' % (in_treat, in_control, name,
                                             cfg.get('peaks', 'macs_params'))
    sys_call(cmd)
    peaks_xls = name + '_peaks.xls'

    # convert to proper bedfile- ints for score and + for strand
    with open(out_peaks, 'w') as outfile:
        with open(peaks_xls) as infile:
            for index, line in enumerate(itertools.ifilter(bedCommentFilter,
                                                           infile)):
                fields = line.split('\t')
                if fields[0] == 'chr':
                    continue  # skip header
                start = str(max(0, int(fields[1])))
                score = str(max(0, min(1000, int(float(fields[6])))))
                fdr = float(fields[8])
                if fdr <= max_fdr:
                    outfile.write('\t'.join([fields[0], start, fields[2],
                                             'MACS_peak_%s' % (index + 1),
                                             score]) + '\t+\n')

    # take region surrounding the peak center as the summit
    summit_size = cfg.getint('peaks', 'peak_summit_size')
    # floor division keeps coordinates integral even under true division
    # (from __future__ import division, or Python 3)
    half_summit = summit_size // 2
    with open(out_peaks + '_summits.%s_around' %
              cfg.get('peaks', 'peak_summit_size'), 'w') as outfile:
        with open(peaks_xls) as infile:
            for index, line in enumerate(itertools.ifilter(bedCommentFilter,
                                                           infile)):
                fields = line.strip().split('\t')
                if fields[0] == 'chr':
                    continue  # skip header
                score = str(max(0, min(1000, int(float(fields[6])))))
                p_start, p_stop = max(0, int(fields[1])), int(fields[2])
                p_center = p_start + (p_stop - p_start) // 2
                s_start = p_center - half_summit
                s_stop = p_center + half_summit
                fdr = float(fields[8])
                if fdr <= max_fdr:
                    outfile.write('\t'.join([fields[0], str(s_start),
                                             str(s_stop),
                                             'MACS_peak_%s' % (index + 1),
                                             score]) + '\t+\n')
def get_nearest_features(in_files, _, out_pattern):
    """Calculate the distance from each peak to the nearest features.

    in_files is (peaks BED, chromosome sizes file, feature file, ...).  For
    every feature file, closestBed is run once on the real peaks and once on
    a shuffleBed-shuffled copy of them; the two distance columns are written
    side by side to out_pattern % feature_file under the header
    "<feature_file>\tRandom".

    Raises RuntimeError if no feature files are given.
    """
    print(in_files)
    print(out_pattern)
    in_peaks, chrom_sizes, all_features = (in_files[0], in_files[1],
                                           in_files[2:])
    if len(all_features) == 0:
        raise RuntimeError("No features present to compare to!")

    def read_distances(path):
        """Return the integer in the last column of each non-blank line."""
        with open(path) as infile:
            return [int(line.strip().split('\t')[-1])
                    for line in infile if line.strip()]

    # only the name is needed: the shell commands write to the path themselves
    tmp_handle = tempfile.NamedTemporaryFile(delete=False)
    tmp_name = tmp_handle.name
    tmp_handle.close()
    try:
        for in_feature in all_features:
            cmd = 'closestBed -a %s -b %s -t first -D ref > %s' % (
                in_peaks, in_feature, tmp_name)
            sys_call(cmd, file_log=False)
            observed = read_distances(tmp_name)

            cmd = 'shuffleBed -chrom -i %s -g %s | closestBed -a stdin -b %s -t first -D ref > %s' % (
                in_peaks, chrom_sizes, in_feature, tmp_name)
            sys_call(cmd, file_log=False)
            shuffled = read_distances(tmp_name)

            with open(out_pattern % in_feature, 'w') as outfile:
                outfile.write('\t'.join([in_feature, 'Random']) + '\n')  # header
                # one row per peak: observed distance, shuffled distance
                for pair in zip(observed, shuffled):
                    outfile.write('\t'.join(map(str, pair)) + '\n')
    finally:
        # remove the temporary file even when a command or parse step fails
        os.unlink(tmp_name)
def find_nearby_genes(in_files, out_genes):
    """Report which genes are within a certain distance of a peak.

    in_files[0] is the pair (peaks BED, genes BED).  closestBed pairs each
    peak with a gene; lines whose last column (the distance) is at most the
    'nearby_genes_max_dist' option of the 'genes' section in absolute value
    are copied unchanged to *out_genes*.
    """
    in_peaks, in_genes = in_files[0]
    # read the threshold once rather than once per line
    max_dist = cfg.getint('genes', 'nearby_genes_max_dist')
    # only the name is needed: the shell command writes to the path itself
    tmp_handle = tempfile.NamedTemporaryFile(delete=False)
    tmp_output = tmp_handle.name
    tmp_handle.close()
    try:
        cmd = 'closestBed -a %s -b %s -t first -d > %s' % (
            in_peaks, in_genes, tmp_output)
        sys_call(cmd)
        with open(tmp_output) as infile:
            with open(out_genes, 'w') as outfile:
                for line in infile:
                    stripped = line.strip()
                    if not stripped:
                        continue  # skip blank lines
                    dist = int(stripped.split('\t')[-1])
                    if abs(dist) <= max_dist:
                        outfile.write(line)
    finally:
        # remove the temporary file even when a command or parse step fails
        os.unlink(tmp_output)
def get_nearest_features(in_files, _, out_pattern):
    """Calculate the distance from each peak to the nearest features.

    in_files is (peaks BED, chromosome sizes file, feature file, ...).  For
    every feature file, closestBed is run once on the real peaks and once on
    a shuffleBed-shuffled copy of them; the two distance columns are written
    side by side to out_pattern % feature_file under the header
    "<feature_file>\tRandom".

    Raises RuntimeError if no feature files are given.
    """
    print(in_files)
    print(out_pattern)
    in_peaks, chrom_sizes, all_features = (in_files[0], in_files[1],
                                           in_files[2:])
    if len(all_features) == 0:
        raise RuntimeError("No features present to compare to!")

    def read_distances(path):
        """Return the integer in the last column of each non-blank line."""
        with open(path) as infile:
            return [int(line.strip().split('\t')[-1])
                    for line in infile if line.strip()]

    # only the name is needed: the shell commands write to the path themselves
    tmp_handle = tempfile.NamedTemporaryFile(delete=False)
    tmp_name = tmp_handle.name
    tmp_handle.close()
    try:
        for in_feature in all_features:
            cmd = 'closestBed -a %s -b %s -t first -D ref > %s' % (
                in_peaks, in_feature, tmp_name)
            sys_call(cmd, file_log=False)
            observed = read_distances(tmp_name)

            cmd = 'shuffleBed -chrom -i %s -g %s | closestBed -a stdin -b %s -t first -D ref > %s' % (
                in_peaks, chrom_sizes, in_feature, tmp_name)
            sys_call(cmd, file_log=False)
            shuffled = read_distances(tmp_name)

            with open(out_pattern % in_feature, 'w') as outfile:
                outfile.write('\t'.join([in_feature, 'Random']) + '\n')  # header
                # one row per peak: observed distance, shuffled distance
                for pair in zip(observed, shuffled):
                    outfile.write('\t'.join(map(str, pair)) + '\n')
    finally:
        # remove the temporary file even when a command or parse step fails
        os.unlink(tmp_name)
def run_macs14(in_files, out_peaks, max_fdr):
    """Call peaks using MACS (v1.4) and apply a maximum FDR threshold.

    in_files[0] is the pair (treatment reads, control reads); the treatment
    name must match r'(.*\.treat)(.*)\.mapped_reads'.  Two BED files are
    written: *out_peaks* with the peaks whose FDR is at most *max_fdr*, and
    "<out_peaks>_summits.<peak_summit_size>_around" with each kept summit
    widened by half of peak_summit_size on both sides.
    """
    in_treat, in_control = in_files[0]
    matches = re.search(r'(.*\.treat)(.*)\.mapped_reads', in_treat).groups()
    name = matches[0] + matches[1] + '.macs14.peaks'
    cmd = 'macs14 -t %s -c %s --name=%s %s --diag' % (
        in_treat, in_control, name, cfg.get('peaks', 'macs14_params'))
    sys_call(cmd)
    peaks_to_keep = set()

    # convert to proper bedfile- ints for score and + for strand
    with open(out_peaks, 'w') as outfile:
        with open(name + '_peaks.xls') as infile:
            for index, line in enumerate(itertools.ifilter(bedCommentFilter,
                                                           infile)):
                fields = line.split('\t')
                if fields[0] == 'chr':
                    continue  # skip header
                start = str(max(0, int(fields[1])))
                score = str(max(0, min(1000, int(float(fields[6])))))
                fdr = float(fields[8])
                if fdr <= max_fdr:
                    outfile.write('\t'.join([fields[0], start, fields[2],
                                             'MACS14_peak_%s' % (index + 1),
                                             score]) + '\t+\n')
                    peaks_to_keep.add(index)

    # take region surrounding the peak summit
    summit_size = cfg.getint('peaks', 'peak_summit_size')
    # floor division keeps coordinates integral even under true division
    # (from __future__ import division, or Python 3)
    half_summit = summit_size // 2
    with open(out_peaks + '_summits.%s_around' %
              cfg.get('peaks', 'peak_summit_size'), 'w') as outfile:
        with open(name + '_summits.bed') as infile:
            # NOTE(review): indexes recorded from the .xls enumeration (which
            # may count a header row) are matched against indexes from this
            # file's enumeration -- confirm the two line up.
            for index, line in enumerate(itertools.ifilter(bedCommentFilter,
                                                           infile)):
                fields = line.strip().split('\t')
                if fields[0] == 'chr':
                    continue  # skip header
                # score is taken from the last column of the summits file
                score = str(max(0, min(1000, int(float(fields[-1])))))
                start = str(max(0, int(fields[1]) - half_summit))
                stop = str(int(fields[2]) + half_summit)
                if index in peaks_to_keep:
                    outfile.write('\t'.join([fields[0], start, stop,
                                             'MACS_peak_%s' % (index + 1),
                                             score]) + '\t+\n')
def bed_to_bedgraph_by_strand(in_files, out_bedgraphs):
    """Extend reads to the full fragment length and make one bedgraph per strand.

    in_files is a pair: (BED file of reads, chromosome sizes file handed to
    slopBed via -g).
    out_bedgraphs is indexed as [0] for the "+" strand output and [1] for the
    "-" strand output.
    """
    in_bed, in_chrom_sizes = in_files
    # number of bases added to each read: fragment length minus the tag length
    extension = (cfg.getint('DEFAULT', 'fragment_size')
                 - cfg.getint('DEFAULT', 'tag_size'))
    genome = cfg.get('DEFAULT', 'genome')
    chrom_sizes_prefix = genome_path()
    template = (
        'slopBed -i %s -s -r %s -l 0 -g %s | '
        """awk '{if ($6 == "%s") print $0}' | """
        'bedItemOverlapCount %s -chromSize=%s.chrom.sizes stdin > %s'
    )
    # The original body referred to an undefined name (out_bedgraph) instead
    # of the out_bedgraphs parameter, which raised NameError on every call.
    for strand, out_index in (('+', 0), ('-', 1)):
        cmd = template % (
            in_bed,
            extension,
            in_chrom_sizes,
            strand,
            genome,
            chrom_sizes_prefix,
            out_bedgraphs[out_index],
        )
        sys_call(cmd)
def get_refseq_genes(_, out_genes):
    """Download the UCSC refGene table for the configured genome.

    refGene.txt.gz is fetched into the current directory, decompressed, and
    the resulting refGene.txt is moved to *out_genes*.
    """
    genome = cfg.get('DEFAULT', 'genome')
    table_url = ('http://hgdownload.cse.ucsc.edu/goldenPath/%s/database/refGene.txt.gz'
                 % genome)
    download_cmd = 'wget -N -P . %s' % table_url
    sys_call(download_cmd)
    sys_call('gunzip -f refGene.txt.gz')
    move_cmd = 'mv refGene.txt %s' % out_genes
    sys_call(move_cmd)
def run_macs14_no_control(in_treat, out_peaks):
    """Call peaks using MACS (v1.4) without control data.

    MACS is run with --name=<out_peaks>.  Every peak in
    "<out_peaks>_peaks.xls" is written to *out_peaks* as BED (no FDR filter),
    and "<out_peaks>_summits.<peak_summit_size>_around" receives each kept
    summit widened by half of peak_summit_size on both sides.
    """
    cmd = 'macs14 -t %s --name=%s %s' % (in_treat, out_peaks,
                                         cfg.get('peaks', 'macs14_params'))
    sys_call(cmd)
    peaks_to_keep = set()

    # convert to proper bedfile- ints for score and + for strand
    with open(out_peaks, 'w') as outfile:
        with open(out_peaks + '_peaks.xls') as infile:
            for index, line in enumerate(itertools.ifilter(bedCommentFilter,
                                                           infile)):
                fields = line.split('\t')
                if fields[0] == 'chr':
                    continue  # skip header
                start = str(max(0, int(fields[1])))
                score = str(max(0, min(1000, int(float(fields[6])))))
                outfile.write('\t'.join([fields[0], start, fields[2],
                                         'MACS14_peak_%s' % (index + 1),
                                         score]) + '\t+\n')
                peaks_to_keep.add(index)

    # take region surrounding the peak summit
    summit_size = cfg.getint('peaks', 'peak_summit_size')
    # floor division keeps coordinates integral even under true division
    # (from __future__ import division, or Python 3)
    half_summit = summit_size // 2
    with open(out_peaks + '_summits.%s_around' %
              cfg.get('peaks', 'peak_summit_size'), 'w') as outfile:
        with open(out_peaks + '_summits.bed') as infile:
            for index, line in enumerate(itertools.ifilter(bedCommentFilter,
                                                           infile)):
                fields = line.strip().split('\t')
                if fields[0] == 'chr':
                    continue  # skip header
                # score is taken from the last column of the summits file
                score = str(max(0, min(1000, int(float(fields[-1])))))
                start = str(max(0, int(fields[1]) - half_summit))
                stop = str(int(fields[2]) + half_summit)
                if index in peaks_to_keep:
                    outfile.write('\t'.join([fields[0], start, stop,
                                             'MACS_peak_%s' % (index + 1),
                                             score]) + '\t+\n')
def maq_map_reads(in_files, out_map):
    """Align reads to the reference with "maq match".

    The first two entries of *in_files* are passed to maq, in order, after
    the output name *out_map*.  Inputs are in binary format and the output is
    in .map format.
    """
    first_input, second_input = in_files[0], in_files[1]
    sys_call('maq match %s %s %s' % (out_map, first_input, second_input))
def get_polyA_DB(_, out_db, genome_build):
    """Download the UCSC polyaDb table for *genome_build* into *out_db*.

    The table is streamed through gunzip and its first tab-separated column
    is dropped (cut -f 2-) before it is written out.
    """
    pipeline = r"curl 'http://hgdownload.cse.ucsc.edu/goldenPath/%s/database/polyaDb.txt.gz' | gunzip - | cut -d $'\t' -f 2- > %s"
    sys_call(pipeline % (genome_build, out_db), file_log=False)
def wig_to_bigwig(in_wig, out_bigwig):
    """Build a bigwig file from *in_wig* with wigToBigWig.

    Chromosome sizes are read from "<genome_path()>.chrom.sizes".
    """
    chrom_sizes = '%s.chrom.sizes' % genome_path()
    sys_call('wigToBigWig %s %s %s' % (in_wig, chrom_sizes, out_bigwig))
def bedgraph_to_bigwig(in_bedgraph, out_bigwig):
    """Build a .bigwig from a bedgraph file for viewing on UCSC.

    Chromosome sizes are read from "<genome_path()>.chrom.sizes".
    """
    chrom_sizes = '%s.chrom.sizes' % genome_path()
    sys_call('bedGraphToBigWig %s %s %s' % (in_bedgraph, chrom_sizes,
                                            out_bigwig))
def wig_to_bigwig(in_wig, out_bigwig):
    """Build a bigwig file from *in_wig* with wigToBigWig.

    Chromosome sizes are read from "<genome_path()>.chrom.sizes".
    """
    chrom_sizes = "%s.chrom.sizes" % genome_path()
    sys_call("wigToBigWig %s %s %s" % (in_wig, chrom_sizes, out_bigwig))
def remove_terminal_exon(in_files, out_bed):
    """Write the entries of the BED input that intersectBed -v keeps.

    in_files is the pair (BED file, exon file); "intersectBed -v" is run with
    the BED file as -a and the exon file as -b, and its output goes to
    *out_bed*.
    """
    in_bed, exon_file = in_files
    filter_cmd = 'intersectBed -v -a %s -b %s > %s'
    sys_call(filter_cmd % (in_bed, exon_file, out_bed), file_log=False)
def convert_gff3_genes_to_bed(in_gff3, out_gene_pred):
    """Convert gff3 genes to UCSC's genePred format with gff3ToGenePred."""
    convert_cmd = 'gff3ToGenePred %s %s' % (in_gff3, out_gene_pred)
    sys_call(convert_cmd, file_log=False)
def maq_index_reads(in_fastq, out_bfq):
    """Convert reads from .fastq to BFQ with "maq fastq2bfq".

    BFQ is a binary representation of the read sequences.
    """
    conversion = (in_fastq, out_bfq)
    sys_call('maq fastq2bfq %s %s' % conversion)
def bedgraph_to_bigwig(in_bedgraph, out_bigwig):
    """Build a .bigwig from a bedgraph file for viewing on UCSC.

    Chromosome sizes are read from "<genome_path()>.chrom.sizes".
    """
    chrom_sizes = "%s.chrom.sizes" % genome_path()
    sys_call("bedGraphToBigWig %s %s %s" % (in_bedgraph, chrom_sizes,
                                            out_bigwig))
def uniquefy_downsample_reads(in_files, out_files):
    """Uniquefy sequence reads, then downsample to equal unique tag counts.

    in_files is the pair (treatment reads, control reads), both ending in
    'mapped_reads'.  Each file is sorted and uniquefied; if the unique counts
    differ, the larger set is randomly subsampled to the size of the smaller
    one, once per configured 'num_down_samples'.  Output names are derived
    from the input names ('matched_size_<i>.mapped_reads'); *out_files* is
    not used here.  Nothing is written when downsampling is disabled or the
    counts are already equal.

    Raises RuntimeError if an input name does not end in 'mapped_reads'.
    """
    # WARNING: this is a circular dependency.  It has to be included at
    # runtime; a top-level import would cause this module to load only
    # half way.  We import here because we need to call this function
    # directly, and not just when using ruffus.
    from hts_waterworks.visualize import bed_uniquefy

    if not cfg.getboolean('peaks', 'downsample_reads'):
        with log_mtx:
            log.debug('NOT downsampling the sequence reads!')
        return

    in_treat, in_control = in_files
    out_treat_template = re.sub(r'mapped_reads$',
                                'matched_size_%s.mapped_reads', in_treat)
    out_control_template = re.sub(r'mapped_reads$',
                                  'matched_size_%s.mapped_reads', in_control)
    if out_treat_template == in_treat:
        raise RuntimeError('regex substitution failed from %s to %s' % (
            in_treat, out_treat_template))
    if out_control_template == in_control:
        raise RuntimeError('regex substitution failed from %s to %s' % (
            in_control, out_control_template))

    def count_lines(path):
        """Return the number of lines in *path*, closing the file afterwards."""
        with open(path) as handle:
            return sum(1 for _ in handle)

    tmp_t_sorted = tempfile.NamedTemporaryFile(delete=False).name
    tmp_c_sorted = tempfile.NamedTemporaryFile(delete=False).name
    tmp_t_unique = tempfile.NamedTemporaryFile(delete=False).name
    tmp_c_unique = tempfile.NamedTemporaryFile(delete=False).name
    tmp_files = [tmp_t_sorted, tmp_t_unique, tmp_c_sorted, tmp_c_unique]
    try:
        # sort the reads
        bed_clip_and_sort(in_treat, tmp_t_sorted)
        bed_clip_and_sort(in_control, tmp_c_sorted)

        # uniquefy the reads
        max_reads = cfg.getint('visualization', 'uniquefy_track_max_reads')
        bed_uniquefy(tmp_t_sorted, tmp_t_unique, max_reads)
        bed_uniquefy(tmp_c_sorted, tmp_c_unique, max_reads)

        total_treat = count_lines(tmp_t_unique)
        total_control = count_lines(tmp_c_unique)
        if total_treat == total_control:
            with log_mtx:
                log.debug('No downsampling required-- tag counts identical')
        else:
            # downsample num_down_samples times
            for i in xrange(cfg.getint('peaks', 'num_down_samples')):
                out_treat = out_treat_template % i
                out_control = out_control_template % i
                if total_treat > total_control:
                    # reduce number of treatment reads
                    inds_to_keep = set(random.sample(xrange(total_treat),
                                                     total_control))
                    in_orig, out_orig = tmp_c_unique, out_control
                    in_subset, out_subset = tmp_t_unique, out_treat
                else:
                    # reduce number of control reads
                    inds_to_keep = set(random.sample(xrange(total_control),
                                                     total_treat))
                    in_orig, out_orig = tmp_t_unique, out_treat
                    in_subset, out_subset = tmp_c_unique, out_control
                # the smaller set is copied as-is
                sys_call('cp %s %s' % (in_orig, out_orig))
                # subset the tags
                with open(in_subset) as infile:
                    with open(out_subset, 'w') as outfile:
                        outfile.writelines(
                            line for line_index, line in enumerate(infile)
                            if line_index in inds_to_keep)
    finally:
        # remove the temporary files even when a step above fails
        for f in tmp_files:
            if os.path.exists(f):
                os.unlink(f)
def maq_view_reads(in_map, out_map):
    """Write a human readable .map view of *in_map* with "maq mapview"."""
    view_cmd = 'maq mapview %s > %s' % (in_map, out_map)
    sys_call(view_cmd)
def bed_to_bigbed(in_bed, out_bigbed):
    """Build a .bigbed from a BED file for viewing on the UCSC browser.

    Chromosome sizes are read from "<genome_path()>.chrom.sizes".
    """
    chrom_sizes = "%s.chrom.sizes" % genome_path()
    sys_call("bedToBigBed %s %s %s" % (in_bed, chrom_sizes, out_bigbed))
def bed_to_bigbed(in_bed, out_bigbed):
    """Build a .bigbed from a BED file for viewing on the UCSC browser.

    Chromosome sizes are read from "<genome_path()>.chrom.sizes".
    """
    chrom_sizes = '%s.chrom.sizes' % genome_path()
    sys_call('bedToBigBed %s %s %s' % (in_bed, chrom_sizes, out_bigbed))
def maq_index_reference(in_fasta, out_bfa):
    """Convert the reference from .fasta to BFA with "maq fasta2bfa".

    BFA is a binary representation of the reference sequences.
    """
    conversion = (in_fasta, out_bfa)
    sys_call('maq fasta2bfa %s %s' % conversion)
def merge_adjacent_reads(in_bed, out_pattern, window_width, iterations, out_merged, out_pileup, min_read_count):
    """Reassign read ends to the rounded mean end of nearby reads.

    The input BED is sorted by chromosome and end coordinate, then clustered
    *iterations* times.  A cluster is a run of consecutive reads on one
    chromosome whose end coordinate lies within *window_width* of the first
    read of the cluster.  Every read of a written cluster is emitted as a
    1-bp interval ending at the cluster's rounded mean coordinate.
    Intermediate iterations write to "<in_bed>.merge_adjacent_<i>"; the last
    one writes to *out_merged* and additionally writes one line per cluster
    (with the cluster size as score) to *out_pileup*.

    out_pattern is not used in this function.
    """
    # helper functions for parsing bed files
    # keeps non-blank lines; the startswith('"') clause cannot change the
    # result, since a line starting with '"' already does not start with '#'
    filter_lines = lambda l: l.strip() and (not l.startswith('#') or \
                                            l.startswith('"'))
    read_bed_lines = lambda infile: itertools.ifilter(filter_lines, infile)
    # sort the input by chrom, stop
    tmpfile = in_bed + '.merged_adjacent_sorted'
    cmd = r"sort -t $'\t' -k 1,1 -k 3g,3 %s > %s" % (in_bed, tmpfile)
    print cmd
    sys_call(cmd, file_log=False)
    p_file = tmpfile
    outfile_pileup = None  # used on last iteration to generate the final pileup
    for i in range(iterations):
        print 'merge iteration %s' % i
        # read in from output of previous iteration
        # NOTE(review): this input handle is never closed explicitly
        infile = read_bed_lines(open(p_file))
        # output to a temp file except on the last iteration
        if i != iterations - 1:
            p_file = in_bed + '.merge_adjacent_%s' % i
        else:
            p_file = out_merged
            outfile_pileup = open(out_pileup, 'w')
        outfile = open(p_file, 'w')
        # parse first line to seed the current cluster (p_* variables)
        (chrom, start, stop, name, score, strand) = infile.next().strip().split('\t')[:6]
        # NOTE(review): for a '-' strand first read the seed is int(start),
        # while later '-' reads use int(start) + 1 -- confirm which is meant
        if strand == '+':
            p_chrom, p_stops, p_names, p_strands = (chrom, [int(stop)], [name], [strand])
        else:
            p_chrom, p_stops, p_names, p_strands = (chrom, [int(start)], [name], [strand])
        print 'first line:', chrom, start, stop, name, score, strand
        for index, line in enumerate(infile):
            try:
                (chrom, start, stop, name, score, strand) = line.strip().split('\t')[:6]
            except:
                # show the offending line, then re-raise the unpacking error
                print index, 'this line:', line
                raise
            # the coordinate used for clustering depends on the strand
            if strand == '+':
                stop = int(stop)
            else:
                stop = int(start) + 1
            # is next read too far from first recorded?
            if p_chrom != chrom or (len(p_stops) > 0 and abs(p_stops[0] - stop) > window_width):
                if len(p_stops) == 0 or len(p_names) == 0:
                    print 'error!'
                    print line
                    print p_stops, p_names, p_strands
                    # NOTE(review): bare raise with no exception being handled
                    raise
                # clusters with min_read_count or fewer reads are dropped
                if len(p_stops) > min_read_count:
                    avg = int(round(sum(p_stops) / float(len(p_stops))))
                    # write out reads in this cluster, using avg as coordinate
                    outfile.writelines('\t'.join([p_chrom, str(max(0, avg-1)), str(avg), n_name, '0', n_strand]) + '\n' for n_name, n_strand in zip(p_names, p_strands))
                    if outfile_pileup is not None:
                        # one pileup line per cluster, scored by cluster size
                        outfile_pileup.write('\t'.join([p_chrom, str(max(0, avg-1)), str(avg), p_names[0], str(len(p_stops)), p_strands[0]]) + '\n')
                # reset our record
                p_chrom = chrom
                p_stops = [stop]
                p_names = [name]
                p_strands = [strand]
            # otherwise, the next read is within the window, on same chrom
            else:
                p_stops.append(stop)
                p_names.append(name)
                p_strands.append(strand)
        # output anything left in queue after EOF
        # NOTE(review): unlike the in-loop case, this final cluster is written
        # without the min_read_count check -- confirm this is intended
        if len(p_stops) > 0:
            avg = int(round(sum(p_stops) / float(len(p_stops))))
            # write out reads in this cluster, using avg as coordinate
            outfile.writelines('\t'.join([chrom, str(max(0, avg-1)), str(avg), n_name, '0', n_strand]) + '\n' for n_name, n_strand in zip(p_names, p_strands))
            if outfile_pileup is not None:
                outfile_pileup.write('\t'.join([chrom, str(max(0, avg-1)), str(avg), p_names[0], str(len(p_stops)), p_strands[0]]) + '\n')
        # close this iteration's outputs so the next iteration can read them
        if outfile_pileup is not None:
            outfile_pileup.close()
        outfile.close()