def sample_genome_short(_, out_samples):
    """Draw short genomic samples used for the motif threshold score.

    Builds a command line for ``sampling.main`` from the output path, the
    configured worldbase genome, a fixed sample length of 30 and the
    ``motifs.motif_threshold_sample_size`` option.  The first argument is
    ignored.
    """
    option_values = (
        out_samples,
        cfg.get('DEFAULT', 'worldbase_genome'),
        cfg.get('motifs', 'motif_threshold_sample_size'),
    )
    command_line = (
        '%s --genome=%s --sample_length=30 --num_samples=%s ' % option_values)
    sampling.main(shlex.split(command_line))
def bed_to_bedgraph_by_strand(in_files, out_bedgraphs):
    """Extend reads to the full fragment length and build one bedgraph per strand.

    in_files      -- (BED file of reads, chromosome sizes file)
    out_bedgraphs -- (plus-strand bedgraph path, minus-strand bedgraph path)

    Reads are extended by ``fragment_size - tag_size`` with slopBed, split
    by strand with awk, and counted with bedItemOverlapCount.
    """
    in_bed, in_chrom_sizes = in_files
    # the original referenced an undefined name `out_bedgraph`; index the
    # parameter up front so a short tuple fails before any command runs
    targets = (('+', out_bedgraphs[0]), ('-', out_bedgraphs[1]))
    template = (
        """slopBed -i %s -s -r %s -l 0 -g %s | awk '{if ($6 == "%s") print $0}' | """
        + "bedItemOverlapCount %s -chromSize=%s.chrom.sizes stdin > %s"
    )
    for strand, out_path in targets:
        cmd = template % (
            in_bed,
            cfg.getint("DEFAULT", "fragment_size") - cfg.getint("DEFAULT", "tag_size"),
            in_chrom_sizes,
            strand,
            cfg.get("DEFAULT", "genome"),
            genome_path(),
            out_path,
        )
        sys_call(cmd)
def motif_enrichment_control(in_files, out_enrichment):
    """Compute a motif's enrichment in peaks relative to control data.

    ``in_files[0]`` is unpacked as (motif file, peak file, control sample).
    ``motif_significance.main`` is called once for every z-score in the
    comma-separated ``motifs.motif_zscores`` option, always with the same
    output file argument.
    """
    motif_file, peak_file, control_sample = in_files[0]
    template = ('%s --motif_file=%s --bg_samples=%s --genome=%s '
                '--output_file=%s --zscore=%s')
    for zscore in cfg.get('motifs', 'motif_zscores').split(','):
        genome = cfg.get('DEFAULT', 'worldbase_genome')
        command_line = template % (peak_file, motif_file, control_sample,
                                   genome, out_enrichment, zscore)
        motif_significance.main(shlex.split(command_line))
def motif_enrichment_control(in_files, out_enrichment):
    """Determine how enriched a motif is in peaks versus control data.

    The first element of ``in_files`` holds (motifs, peaks, control sample).
    For each z-score configured in ``motifs.motif_zscores`` (comma
    separated) ``motif_significance.main`` is run with the same output file.
    """
    motifs_path, peaks_path, control_path = in_files[0]
    zscore_list = cfg.get('motifs', 'motif_zscores').split(',')
    for zscore in zscore_list:
        fields = (peaks_path, motifs_path, control_path,
                  cfg.get('DEFAULT', 'worldbase_genome'),
                  out_enrichment, zscore)
        arguments = shlex.split(
            '%s --motif_file=%s --bg_samples=%s --genome=%s'
            ' --output_file=%s --zscore=%s' % fields)
        motif_significance.main(arguments)
def refseq_genes_to_regions(in_genes, out_pattern):
    """Make regions (promoter, downstream, 5'UTR, etc.) from refseq genes.

    Forwards the promoter/downstream size and extension options of the
    ``genes`` config section to ``makeGeneStructure.main``.  ``out_pattern``
    is not used inside the body.
    """
    option_names = ('promoter_size', 'promoter_extend',
                    'downstream_size', 'downstream_extend')
    values = [in_genes]
    for option in option_names:
        values.append(cfg.get('genes', option))
    command_line = ('%s --promoter_size=%s --promoter_extend=%s '
                    '--downstream_size=%s --downstream_extend=%s '
                    '--with_gene_name') % tuple(values)
    makeGeneStructure.main(shlex.split(command_line))
def refseq_genes_to_regions(in_genes, out_pattern):
    """Split refseq genes into regions (promoter, downstream, 5'UTR, ...).

    Reads four sizing options from the ``genes`` config section and hands
    them, with the gene file, to ``makeGeneStructure.main``.  The
    ``out_pattern`` argument is accepted but not referenced.
    """
    promoter_size = cfg.get('genes', 'promoter_size')
    promoter_extend = cfg.get('genes', 'promoter_extend')
    downstream_size = cfg.get('genes', 'downstream_size')
    downstream_extend = cfg.get('genes', 'downstream_extend')
    template = ('%s --promoter_size=%s --promoter_extend=%s'
                ' --downstream_size=%s --downstream_extend=%s'
                ' --with_gene_name')
    arguments = shlex.split(template % (in_genes, promoter_size,
                                        promoter_extend, downstream_size,
                                        downstream_extend))
    makeGeneStructure.main(arguments)
def run_glitr(in_files, out_peaks):
    """Call peaks with GLITR.

    Picks the first input containing '.treat.' and the first containing
    '.control.', runs GLITR.pl inside a freshly recreated
    ``<treat>.GLITR_out`` directory, then copies the FDR result file(s) to
    ``out_peaks``.
    """
    treat_reads = [f for f in in_files if '.treat.' in f][0]
    control_reads = [f for f in in_files if '.control.' in f][0]
    work_dir = treat_reads + '.GLITR_out'
    template = ('rm -r %s; mkdir %s; cd %s; GLITR.pl --CHIP=../%s '
                '--CONTROL=../%s --GENOME=%s %s ')
    glitr_cmd = template % (
        work_dir, work_dir, work_dir, treat_reads, control_reads,
        cfg.get('DEFAULT', 'genome').upper(),
        cfg.get('peaks', 'glitr_params'))
    sys_call(glitr_cmd)
    copy_cmd = 'cp %s/allChIP.FDR_*PercentFDR %s' % (work_dir, out_peaks)
    sys_call(copy_cmd)
def trim_reads(in_fastq, out_fastq):
    """Trim leading and/or trailing bases from all reads.

    Pipes ``cat in_fastq`` into ``fastx_trimmer``, keeping bases from
    ``filtering.trim_start`` to ``filtering.trim_end``.

    Raises CalledProcessError if either stage of the pipeline exits with a
    non-zero status.
    """
    cmd1 = 'cat %s' % in_fastq
    cmd2 = 'fastx_trimmer -o %s -f %s -l %s' % (
        out_fastq, cfg.get('filtering', 'trim_start'),
        cfg.get('filtering', 'trim_end'))
    p1 = Popen([cmd1], stdout=PIPE, shell=True)
    p2 = Popen([cmd2], stdin=p1.stdout, shell=True)
    # close our copy of the read end so p1 is not left blocked on a full
    # pipe if p2 exits early
    p1.stdout.close()
    p2.communicate()
    # returncode stays None until the child is reaped, so the original
    # check on p1 could never fire
    p1.wait()
    # report the consumer first: when it dies, the producer's failure is
    # usually just a consequence (broken pipe)
    if p2.returncode:
        raise CalledProcessError(p2.returncode, cmd2)
    if p1.returncode:
        raise CalledProcessError(p1.returncode, cmd1)
def trim_reads(in_fastq, out_fastq):
    """Trim leading and/or trailing bases from all reads.

    Streams ``in_fastq`` through ``fastx_trimmer`` using the
    ``filtering.trim_start`` / ``filtering.trim_end`` options.

    Raises CalledProcessError when ``cat`` or ``fastx_trimmer`` exits with a
    non-zero status.
    """
    cmd1 = 'cat %s' % in_fastq
    cmd2 = 'fastx_trimmer -o %s -f %s -l %s' % (
        out_fastq,
        cfg.get('filtering', 'trim_start'),
        cfg.get('filtering', 'trim_end'))
    p1 = Popen([cmd1], stdout=PIPE, shell=True)
    p2 = Popen([cmd2], stdin=p1.stdout, shell=True)
    # the parent must release the pipe's read end, otherwise p1 can block
    # forever if p2 stops reading
    p1.stdout.close()
    p2.communicate()
    # p1 was never waited on, so its returncode was always None before
    p1.wait()
    # check the consumer first; a producer failure after a consumer crash
    # is normally only a broken pipe
    if p2.returncode:
        raise CalledProcessError(p2.returncode, cmd2)
    if p1.returncode:
        raise CalledProcessError(p1.returncode, cmd1)
def run_macs(in_files, out_peaks, max_fdr):
    """Call peaks with MACS (v1.3) and write FDR-filtered BED files.

    Writes ``out_peaks`` (peak regions) and
    ``out_peaks + '_summits.<size>_around'`` (a window of
    ``peaks.peak_summit_size`` around each peak center).  The FDR cutoff is
    read from ``peaks.max_FDR``; the ``max_fdr`` argument is overwritten by
    that value.
    """
    in_treat, in_control = in_files[0]
    prefix, suffix = re.search(r'(.*\.treat)(.*)\.mapped_reads',
                               in_treat).groups()
    name = prefix + suffix + '.macs.peaks'
    max_fdr = cfg.getfloat('peaks', 'max_FDR')
    sys_call('macs -t %s -c %s --name=%s %s' % (
        in_treat, in_control, name, cfg.get('peaks', 'macs_params')))
    xls_path = name + '_peaks.xls'

    # convert to a proper BED file: integer scores and '+' for strand
    with open(out_peaks, 'w') as peaks_out:
        with open(xls_path) as xls:
            rows = itertools.ifilter(bedCommentFilter, xls)
            for peak_number, row in enumerate(rows, 1):
                columns = row.split('\t')
                if columns[0] == 'chr':
                    continue  # skip header
                start = str(max(0, int(columns[1])))
                score = str(max(0, min(1000, int(float(columns[6])))))
                fdr = float(columns[8])
                if fdr <= max_fdr:
                    peaks_out.write('\t'.join([
                        columns[0], start, columns[2],
                        'MACS_peak_%s' % peak_number, score, '+']) + '\n')

    # take the region surrounding the peak center as the summit
    summit_size = cfg.getint('peaks', 'peak_summit_size')
    summits_path = out_peaks + '_summits.%s_around' % \
        cfg.get('peaks', 'peak_summit_size')
    with open(summits_path, 'w') as summits_out:
        with open(xls_path) as xls:
            rows = itertools.ifilter(bedCommentFilter, xls)
            for peak_number, row in enumerate(rows, 1):
                columns = row.strip().split('\t')
                if columns[0] == 'chr':
                    continue  # skip header
                score = str(max(0, min(1000, int(float(columns[6])))))
                p_start = max(0, int(columns[1]))
                p_stop = int(columns[2])
                p_center = p_start + (p_stop - p_start) // 2
                half_window = summit_size // 2
                s_start = p_center - half_window
                s_stop = p_center + half_window
                fdr = float(columns[8])
                if fdr <= max_fdr:
                    summits_out.write('\t'.join([
                        columns[0], str(s_start), str(s_stop),
                        'MACS_peak_%s' % peak_number, score, '+']) + '\n')
def remove_internal_priming(in_bed, out_bed):
    """Filter out reads that look like internal priming events.

    A read is dropped when the 10 nt downstream of it contain a run of 6
    consecutive A's or at least 7 A's in total.  For '-' strand reads the 10
    nt before ``start`` are inspected and T's are counted instead.  Reads
    passing the filter are copied unchanged to ``out_bed``.

    Raises RuntimeError for a strand other than '+' or '-'.
    """
    wb_genome = worldbase(cfg.get('DEFAULT', 'worldbase_genome'))
    with open(out_bed, 'w') as outfile:
        # the input was previously opened without ever being closed
        with open(in_bed) as infile:
            for line in infile:
                chrom, start, stop, name, score, strand = \
                    line.strip().split('\t')
                start, stop = int(start), int(stop)
                if strand not in ['+', '-']:
                    raise RuntimeError("unknown strand", strand, line)
                if strand == '+':
                    try:
                        downstream = str(
                            wb_genome[chrom][stop:stop + 10]).upper()
                    except IndexError:
                        # no sequence past the read: nothing to count
                        downstream = ''
                    base = 'A'
                else:
                    try:
                        downstream = str(
                            wb_genome[chrom][max(0, start - 10):start]).upper()
                    except IndexError:
                        downstream = ''
                    # sequence is read off the '+' strand, so count T's
                    base = 'T'
                down_A = downstream.count(base)
                down_consecutive_A = downstream.count(base * 6)
                # filter if 6+ consecutive A's or 7+ A's downstream
                if down_consecutive_A < 1 and down_A < 7:
                    outfile.write(line)
def get_refseq_genes(_, out_genes):
    """Fetch the refGene table from UCSC for the configured genome.

    Downloads ``refGene.txt.gz`` into the current directory with wget,
    gunzips it and moves the result to ``out_genes``.  The first argument
    is ignored.
    """
    genome = cfg.get('DEFAULT', 'genome')
    url = ('http://hgdownload.cse.ucsc.edu/goldenPath/%s/database/'
           'refGene.txt.gz') % genome
    download_cmd = 'wget -N -P . %s' % url
    sys_call(download_cmd)
    sys_call('gunzip -f refGene.txt.gz')
    move_cmd = 'mv refGene.txt %s' % out_genes
    sys_call(move_cmd)
def sample_genome_like_peaks(in_peaks, out_files):
    """Sample from the genome, keeping the sample widths the same as peaks.

    in_peaks  -- BED file of peaks; only the widths (stop - start) are used
    out_files -- first two entries are the sequence output (FASTA-like) and
                 the BED file of sampled locations

    Raises RuntimeError if the peaks file contains no peaks.
    """
    out_sample, out_locations = out_files[:2]
    # read the widths inside a with-block so the peaks file gets closed
    with open(in_peaks) as peaks_file:
        peak_lengths = array(
            'i', (stop - start for chrom, start, stop, strand
                  in readBedLines(peaks_file)))
    if not peak_lengths:
        raise RuntimeError("Peaks file %s is empty!" % in_peaks)
    wb_genome = worldbase(cfg.get('DEFAULT', 'worldbase_genome'))
    s = sampling.sample_genome(
        wb_genome, peak_lengths,
        sampleSize=cfg.getint('motifs', 'motif_significance_sample_size'),
        excludeRepeat=cfg.getboolean('motifs', 'sampling_exclude_repeats'),
        excludeN=cfg.getboolean('motifs', 'sampling_exclude_N'),
        ignoreCharacters='_', weighted=True)
    with open(out_sample, 'w') as outfile:
        with open(out_locations, 'w') as outlocations:
            for index, line in enumerate(s):
                outfile.write('>%s\n%s\n' % (index, line))
                outlocations.write('\t'.join([
                    line.id, str(line.start), str(line.stop), str(index),
                    '0', '+' if line.orientation == 1 else '-'
                ]) + '\n')
def sample_control_like_peaks(in_peaks, out_files):
    """Sample from the control IgG, with similar widths as the peaks.

    in_peaks  -- BED file of peaks; its name is also rewritten to locate
                 the matching control mapped-reads file
    out_files -- first two entries are the sequence output and the BED file
                 of sampled locations

    Raises RuntimeError if the peaks file contains no peaks.
    """
    out_sample, out_locations = out_files[:2]
    # read the widths inside a with-block so the peaks file gets closed
    with open(in_peaks) as peaks_file:
        peak_lengths = array(
            'i', (stop - start for chrom, start, stop, strand
                  in readBedLines(peaks_file)))
    if not peak_lengths:
        raise RuntimeError("Peaks file %s is empty!" % in_peaks)
    wb_genome = worldbase(cfg.get('DEFAULT', 'worldbase_genome'))
    # do the dance to map peaks back to their control raw reads
    control_bed = re.sub(r'treat', 'control', in_peaks)
    control_bed = re.sub(r'\.top[\d]+\.peaks$', '', control_bed)
    control_bed = re.sub(r'_summits\.[\d]+_around', '', control_bed)
    control_bed = re.sub(r'peaks', 'mapped_reads', control_bed)
    control_bed = re.sub(r'\.(macs(14)*|arem|glitr)', '', control_bed)
    with open(control_bed) as control_file:
        with open(out_locations, 'w') as outlocations:
            s = sampling.sample_middles(
                wb_genome, peak_lengths, control_file,
                sampleSize=cfg.getint('motifs',
                                      'motif_significance_sample_size'))
            with open(out_sample, 'w') as outfile:
                for index, seq in enumerate(s):
                    # repr() gives location, str() gives sequence
                    outfile.write('>%s_%s\n%s\n' % (index, repr(seq),
                                                    str(seq)))
                    outlocations.write('\t'.join([
                        seq.id, str(seq.start), str(seq.stop), str(index),
                        '0', '+' if seq.orientation == 1 else '-'
                    ]) + '\n')
def motif_select_random_seqs(in_fasta, out_pattern):
    """Split a fasta file into several chunks so motif discovery is easier.

    Writes ``<name>.small_sample.<i>.fasta`` files, each holding a random
    sample of at most ``motifs.motif_chunk_size`` sequences.  One chunk is
    written when the input has no more sequences than the chunk size,
    otherwise ``motifs.motif_num_chunks`` chunks.  ``out_pattern`` is not
    used inside the body.
    """
    name = re.search('(.*).fasta', in_fasta).groups()[0]
    with open(in_fasta) as infile:
        seqs = list(parseFastaLines(infile))
    # read the options as integers: cfg.get() returns strings, which made
    # the size comparison meaningless and cannot be used as a loop bound
    chunk_size = cfg.getint('motifs', 'motif_chunk_size')
    if len(seqs) <= chunk_size:
        num_chunks = 1
    else:
        num_chunks = cfg.getint('motifs', 'motif_num_chunks')
    sample_size = min(len(seqs), chunk_size)
    # get a random sample of peaks
    for i in range(num_chunks):
        with open(name + '.small_sample.%s.fasta' % i, 'w') as outfile:
            subset = random.sample(seqs, sample_size)
            outfile.writelines('>%s\n%s\n' % (s[0].strip(), s[1].strip())
                               for s in subset)
def sample_control_like_peaks(in_peaks, out_files):
    """Sample from the control IgG, with similar widths as the peaks.

    in_peaks  -- BED file of peaks; the control mapped-reads file name is
                 derived from it by a series of regex substitutions
    out_files -- first two entries are the sequence output and the BED file
                 of sampled locations

    Raises RuntimeError if the peaks file contains no peaks.
    """
    out_sample, out_locations = out_files[:2]
    # the peaks file used to be opened inline and never closed
    with open(in_peaks) as peaks_file:
        peak_lengths = array(
            'i', (stop - start for chrom, start, stop, strand
                  in readBedLines(peaks_file)))
    if not peak_lengths:
        raise RuntimeError("Peaks file %s is empty!" % in_peaks)
    wb_genome = worldbase(cfg.get('DEFAULT', 'worldbase_genome'))
    # do the dance to map peaks back to their control raw reads
    substitutions = (
        (r'treat', 'control'),
        (r'\.top[\d]+\.peaks$', ''),
        (r'_summits\.[\d]+_around', ''),
        (r'peaks', 'mapped_reads'),
        (r'\.(macs(14)*|arem|glitr)', ''),
    )
    control_bed = in_peaks
    for pattern, replacement in substitutions:
        control_bed = re.sub(pattern, replacement, control_bed)
    with open(control_bed) as control_file:
        with open(out_locations, 'w') as outlocations:
            s = sampling.sample_middles(
                wb_genome, peak_lengths, control_file,
                sampleSize=cfg.getint('motifs',
                                      'motif_significance_sample_size'))
            with open(out_sample, 'w') as outfile:
                for index, seq in enumerate(s):
                    # repr() gives location, str() gives sequence
                    outfile.write('>%s_%s\n%s\n' % (index, repr(seq),
                                                    str(seq)))
                    outlocations.write('\t'.join([
                        seq.id, str(seq.start), str(seq.stop), str(index),
                        '0', '+' if seq.orientation == 1 else '-'
                    ]) + '\n')
def discover_nmica_motifs(in_fasta, out_motifs):
    """Discover sequence motifs in peaks by running nestedMICA.

    Runs ``nminfer`` on ``in_fasta``, renames the resulting ``motifs.xms``
    to ``<in_fasta without .fasta>.motifs.xms``, parses it and pickles the
    parsed motifs to ``out_motifs``.
    """
    cmd = 'nminfer -seqs %s %s ' % (in_fasta,
                                    cfg.get('motifs', 'nmica_params'))
    sys_call(cmd)
    motifs_name = in_fasta.replace('.fasta', '.motifs.xms')
    sys_call('mv motifs.xms %s' % motifs_name)
    motifs = sequence_motif.parse_xms_motifs(motifs_name)
    # close the pickle file deterministically instead of relying on garbage
    # collection; mode is unchanged so existing readers keep working
    with open(out_motifs, 'w') as outfile:
        pickle.dump(motifs, outfile)
def get_peak_sequence(in_peaks, out_fasta):
    """Write a fasta file with the sequence of the peak summits.

    The summit BED file name is derived from ``out_fasta`` by removing
    '.fasta'; ``in_peaks`` itself is not referenced.  The work is delegated
    to ``get_bed_sequence.main``.
    """
    summits_bed = out_fasta.replace('.fasta', '')
    genome = cfg.get('DEFAULT', 'worldbase_genome')
    command_line = '--genome=%s %s %s' % (genome, summits_bed, out_fasta)
    get_bed_sequence.main(shlex.split(command_line))
def get_peak_sequence(in_peaks, out_fasta):
    """Extract the peak summit sequences into a fasta file.

    The input BED path is ``out_fasta`` with '.fasta' removed (``in_peaks``
    is not used); ``get_bed_sequence.main`` does the extraction against the
    configured worldbase genome.
    """
    bed_path = out_fasta.replace('.fasta', '')
    fields = (cfg.get('DEFAULT', 'worldbase_genome'), bed_path, out_fasta)
    arguments = shlex.split('--genome=%s %s %s' % fields)
    get_bed_sequence.main(arguments)
def motif_select_random_seqs(in_fasta, out_pattern):
    """Split a fasta file into several chunks so motif discovery is easier.

    Each output ``<name>.small_sample.<i>.fasta`` holds a random sample of
    at most ``motifs.motif_chunk_size`` sequences.  A single chunk is
    produced when the input is no larger than the chunk size, otherwise
    ``motifs.motif_num_chunks`` chunks.  ``out_pattern`` is not used inside
    the body.
    """
    name = re.search('(.*).fasta', in_fasta).groups()[0]
    with open(in_fasta) as infile:
        seqs = list(parseFastaLines(infile))
    # cfg.get() yields strings; integer options are needed both for the
    # size comparison and for the number of loop iterations
    chunk_size = cfg.getint('motifs', 'motif_chunk_size')
    num_chunks = 1
    if len(seqs) > chunk_size:
        num_chunks = cfg.getint('motifs', 'motif_num_chunks')
    sample_size = min(len(seqs), chunk_size)
    # get a random sample of peaks
    for i in range(num_chunks):
        with open(name + '.small_sample.%s.fasta' % i, 'w') as outfile:
            subset = random.sample(seqs, sample_size)
            outfile.writelines('>%s\n%s\n' % (s[0].strip(), s[1].strip())
                               for s in subset)
def deploy_track_files(in_files, out_header):
    """Copy UCSC track files to the public location.

    ``visualization.remote_ssh_dir`` is read as ``host:directory``.  For
    every track the remote directory is created over ssh and the file is
    copied with scp; ``out_header`` is touched afterwards.
    """
    remote = cfg.get("visualization", "remote_ssh_dir")
    pieces = remote.split(":")
    host = pieces[0]
    directory = pieces[1]
    for track in in_files:
        mkdir_cmd = "ssh %s mkdir -p %s" % (host, directory)
        sys_call(mkdir_cmd)
        copy_cmd = "scp %s %s" % (track, remote)
        sys_call(copy_cmd)
    touch(out_header)
def discover_meme_motifs(in_fasta, out_motifs):
    """Discover sequence motifs in peaks by running meme.

    Runs ``meme`` with ``motifs.meme_params`` into ``<out_motifs>_meme_out``,
    parses ``meme.txt`` from that directory and pickles the parsed motifs to
    ``out_motifs``.
    """
    cmd = 'meme %s %s -oc %s_meme_out ' % (
        in_fasta, cfg.get('motifs', 'meme_params'), out_motifs)
    sys_call(cmd)
    motifs = sequence_motif.parseMemeMotifs(
        '%s_meme_out/meme.txt' % out_motifs)
    # close the pickle file deterministically instead of relying on garbage
    # collection; mode is unchanged so existing readers keep working
    with open(out_motifs, 'w') as outfile:
        pickle.dump(motifs, outfile)
def maq_map_to_bed(in_map, out_bed):
    """Convert a maq map file to BED format.

    The read length is estimated as the integer mean length of field 14 in
    up to the first ten lines.  Every line then becomes one BED record using
    fields 1-3 as chrom, start and strand, with name and score taken from
    ``mapping.maq_bed_name`` / ``mapping.maq_bed_score``.  An empty input
    produces an empty output file.
    """
    with open(in_map) as infile:
        # use the first ten reads (or fewer, if the file is shorter) to
        # determine read length; reading past EOF used to raise IndexError
        sampled = []
        for _ in range(10):
            line = infile.readline()
            if not line:
                break
            sampled.append(len(line.split('\t')[14]))
        read_length = sum(sampled) // len(sampled) if sampled else 0
        infile.seek(0)
        with open(out_bed, 'w') as outfile:
            # constant for the whole file, so look them up once
            name = cfg.get('mapping', 'maq_bed_name')
            score = cfg.get('mapping', 'maq_bed_score')
            for line in infile:
                fields = line.strip().split('\t')
                chrom, start, strand = fields[1], fields[2], fields[3]
                stop = int(start) + read_length + 1  # stop is fencepost after
                outfile.write('\t'.join([chrom, str(start), str(stop),
                                         str(name), str(score),
                                         str(strand)]) + '\n')
def discover_meme_motifs(in_fasta, out_motifs):
    """Discover sequence motifs in peaks by running meme.

    ``meme`` writes into ``<out_motifs>_meme_out`` (using
    ``motifs.meme_params``); its ``meme.txt`` is parsed and the motifs are
    pickled to ``out_motifs``.
    """
    meme_params = cfg.get('motifs', 'meme_params')
    sys_call('meme %s %s -oc %s_meme_out ' % (in_fasta, meme_params,
                                              out_motifs))
    motifs = sequence_motif.parseMemeMotifs(
        '%s_meme_out/meme.txt' % out_motifs)
    # the output handle was previously left for the garbage collector to
    # close; the file mode is kept as-is for compatibility with readers
    with open(out_motifs, 'w') as outfile:
        pickle.dump(motifs, outfile)
def deploy_track_files(in_files, out_header):
    """Publish UCSC track files by copying them to the remote directory.

    The ``visualization.remote_ssh_dir`` option is split on ':' into host
    and directory.  Each track triggers an ``ssh mkdir -p`` followed by an
    ``scp``; finally ``out_header`` is touched.
    """
    remote = cfg.get('visualization', 'remote_ssh_dir')
    host_and_dir = remote.split(':')
    remote_host, remote_dir = host_and_dir[0], host_and_dir[1]
    ensure_dir_cmd = 'ssh %s mkdir -p %s' % (remote_host, remote_dir)
    for in_track in in_files:
        sys_call(ensure_dir_cmd)
        sys_call('scp %s %s' % (in_track, remote))
    touch(out_header)
def run_mosaik_align(in_files, out_align):
    """Align reads to the reference using MosaikAligner.

    ``in_files`` is unpacked as (reads, genome .dat, genome jump database,
    two ignored entries).  The '_keys.jmp' suffix is removed from the jump
    database name before it is passed with ``-j``.
    """
    # example invocation:
    # MosaikAligner -in sequence_archives/c_elegans_chr2_test.dat
    #   -out sequence_archives/c_elegans_chr2_test_aligned.dat
    #   -ia reference/c.elegans_chr2.dat -hs 14 -act 17 -mm 2 -m unique
    reads, genome_dat, genome_jump, _, _ = in_files
    jump_prefix = genome_jump.replace('_keys.jmp', '')
    hash_size = cfg.getint('mapping', 'mosaik_hash_size')
    extra_params = cfg.get('mapping', 'mosaik_params')
    sys_call('MosaikAligner -in %s -ia %s -j %s -out %s -hs %s %s' % (
        reads, genome_dat, jump_prefix, out_align, hash_size, extra_params))
def run_macs14(in_files, out_peaks, max_fdr):
    """Call peaks using MACS (v1.4) and apply a maximum FDR threshold.

    Writes ``out_peaks`` (peaks with FDR <= ``max_fdr``) and
    ``out_peaks + '_summits.<size>_around'`` (summits widened by half of
    ``peaks.peak_summit_size`` on each side, for the kept peaks only).
    """
    in_treat, in_control = in_files[0]
    prefix, suffix = re.search(r'(.*\.treat)(.*)\.mapped_reads',
                               in_treat).groups()
    name = prefix + suffix + '.macs14.peaks'
    sys_call('macs14 -t %s -c %s --name=%s %s --diag' % (
        in_treat, in_control, name, cfg.get('peaks', 'macs14_params')))

    kept_rows = set()
    # convert to a proper BED file: integer scores and '+' for strand
    with open(out_peaks, 'w') as peaks_out:
        with open(name + '_peaks.xls') as xls:
            rows = itertools.ifilter(bedCommentFilter, xls)
            for row_index, row in enumerate(rows):
                columns = row.split('\t')
                if columns[0] == 'chr':
                    continue  # skip header
                start = str(max(0, int(columns[1])))
                score = str(max(0, min(1000, int(float(columns[6])))))
                fdr = float(columns[8])
                if fdr <= max_fdr:
                    peaks_out.write('\t'.join([
                        columns[0], start, columns[2],
                        'MACS14_peak_%s' % (row_index + 1), score, '+'
                    ]) + '\n')
                    kept_rows.add(row_index)

    # take the region surrounding the peak summit
    # NOTE(review): rows are matched between the .xls and the summits .bed
    # purely by their position after comment filtering -- confirm both
    # files line up (the .xls has a header row that is skipped above).
    summit_size = cfg.getint('peaks', 'peak_summit_size')
    summits_path = out_peaks + '_summits.%s_around' % \
        cfg.get('peaks', 'peak_summit_size')
    with open(summits_path, 'w') as summits_out:
        with open(name + '_summits.bed') as summits_in:
            rows = itertools.ifilter(bedCommentFilter, summits_in)
            for row_index, row in enumerate(rows):
                columns = row.strip().split('\t')
                if columns[0] == 'chr':
                    continue  # skip header
                # score is number of reads at summit
                score = str(max(0, min(1000, int(float(columns[-1])))))
                start = str(max(0, int(columns[1]) - summit_size // 2))
                stop = str(int(columns[2]) + summit_size // 2)
                if row_index in kept_rows:
                    summits_out.write('\t'.join([
                        columns[0], start, stop,
                        'MACS_peak_%s' % (row_index + 1), score, '+'
                    ]) + '\n')
def get_microRNA(_, out_mirna):
    """Retrieve microRNA genes from UCSC and write them as 6-column BED.

    Downloads and unpacks the ``wgRna`` table for the configured genome into
    the current directory, then writes chrom, start, end, ``name_type``,
    score and strand for each row to ``out_mirna``.  The first argument is
    ignored.
    """
    url = 'http://hgdownload.cse.ucsc.edu/goldenPath/%s/database/wgRna.txt.gz'
    url = url % cfg.get('DEFAULT', 'genome')
    sys_call('wget -N -P . %s' % url)
    sys_call('gunzip -f wgRna.txt.gz')
    with open(out_mirna, 'w') as outfile:
        # the table used to be opened inline and never closed
        with open('wgRna.txt') as infile:
            for line in infile:
                # locals renamed so the builtins `bin` and `type` are not
                # shadowed; unused columns carry a leading underscore
                (_bin, chrom, start, end, name, score, strand,
                 _thick_start, _thick_end, rna_type) = \
                    line.strip().split('\t')
                outfile.write('\t'.join([chrom, start, end,
                                         name + '_' + rna_type, score,
                                         strand]) + '\n')
def consensus_enrichment(in_files, out_enrichment):
    """Compute consensus-motif enrichment against genomic samples.

    ``in_files`` is laid out as [samples, peaks, consensus file, ...].
    ``motif_significance.main`` is run once per consensus file, always with
    the same output file argument.
    """
    background, peaks = in_files[:2]
    template = ('%s --consensus_file=%s --bg_samples=%s --genome=%s '
                '--output_file=%s ')
    for consensus_file in in_files[2:]:
        genome = cfg.get('DEFAULT', 'worldbase_genome')
        command_line = template % (peaks, consensus_file, background,
                                   genome, out_enrichment)
        motif_significance.main(shlex.split(command_line))
def bed_to_bedgraph_by_strand(in_files, out_bedgraphs):
    """Extend reads to the full fragment length and build per-strand bedgraphs.

    in_files      -- (BED file of reads, chromosome sizes file)
    out_bedgraphs -- (plus-strand bedgraph path, minus-strand bedgraph path)

    Each read is extended by ``fragment_size - tag_size`` with slopBed,
    filtered to one strand with awk and counted with bedItemOverlapCount.
    """
    in_bed, in_chrom_sizes = in_files
    # the body used to reference `out_bedgraph`, which is not defined here;
    # resolve both outputs first so a short tuple fails before any command
    plus_out, minus_out = out_bedgraphs[0], out_bedgraphs[1]
    template = ("""slopBed -i %s -s -r %s -l 0 -g %s | awk '{if ($6 == "%s") print $0}' | """ +
                'bedItemOverlapCount %s -chromSize=%s.chrom.sizes stdin > %s')
    for strand, out_path in (('+', plus_out), ('-', minus_out)):
        extension = (cfg.getint('DEFAULT', 'fragment_size') -
                     cfg.getint('DEFAULT', 'tag_size'))
        cmd = template % (in_bed, extension, in_chrom_sizes, strand,
                          cfg.get('DEFAULT', 'genome'), genome_path(),
                          out_path)
        sys_call(cmd)
def consensus_enrichment(in_files, out_enrichment):
    """Determine a consensus motif's enrichment vs. genomic samples.

    The first two inputs are the background samples and the peaks; every
    remaining input is a consensus file that is passed, one at a time, to
    ``motif_significance.main`` together with the same output file.
    """
    samples_path, peaks_path = in_files[:2]
    consensus_files = in_files[2:]
    for consensus_path in consensus_files:
        fields = (peaks_path, consensus_path, samples_path,
                  cfg.get('DEFAULT', 'worldbase_genome'), out_enrichment)
        arguments = shlex.split(
            '%s --consensus_file=%s --bg_samples=%s --genome=%s'
            ' --output_file=%s ' % fields)
        motif_significance.main(arguments)
def bed_to_bedgraph(in_files, out_bedgraph):
    """Extend reads to the full fragment length and create a bedgraph.

    ``in_files`` is (BED file of reads, chromosome sizes file).  Reads are
    extended by ``fragment_size - tag_size`` with slopBed and the coverage
    is computed by bedItemOverlapCount into ``out_bedgraph``.
    """
    reads_bed, chrom_sizes = in_files
    extension = (cfg.getint('DEFAULT', 'fragment_size') -
                 cfg.getint('DEFAULT', 'tag_size'))
    genome = cfg.get('DEFAULT', 'genome')
    pipeline = ('slopBed -i %s -s -r %s -l 0 -g %s | '
                'bedItemOverlapCount %s -chromSize=%s.chrom.sizes stdin > %s')
    sys_call(pipeline % (reads_bed, extension, chrom_sizes, genome,
                         genome_path(), out_bedgraph))
def run_bowtie(in_fastq, out_bowtie):
    """Align reads to the reference using Bowtie.

    Pipes ``zcat in_fastq`` into ``bowtie`` (index from ``genome_path()``,
    options from ``mapping.bowtie_params``), writing to ``out_bowtie``.

    Raises CalledProcessError if either stage of the pipeline exits with a
    non-zero status.
    """
    cmd1 = 'zcat %s' % in_fastq
    cmd2 = 'bowtie %s %s - %s' % (genome_path(),
                                  cfg.get('mapping', 'bowtie_params'),
                                  out_bowtie)
    p1 = Popen([cmd1], stdout=PIPE, shell=True)
    p2 = Popen([cmd2], stdin=p1.stdout, shell=True)
    # close our copy of the read end so p1 is not left blocked on a full
    # pipe if p2 exits early
    p1.stdout.close()
    p2.communicate()
    # returncode stays None until the child is reaped, so the original
    # check on p1 could never fire
    p1.wait()
    # report the consumer first: when it dies, the producer's failure is
    # usually just a consequence (broken pipe)
    if p2.returncode:
        raise CalledProcessError(p2.returncode, cmd2)
    if p1.returncode:
        raise CalledProcessError(p1.returncode, cmd1)
def motif_enrichment_genomic(in_files, out_pattern, out_template):
    """Determine a motif's enrichment vs. genomic samples.

    in_files     -- [motif file, [peak files, control sample, ...]]
    out_template -- %-template receiving the z-score to build each output
                    file name
    ``out_pattern`` is not used inside the body.

    For each peak file whose ``<peak>.similar.genomic.sample`` is among the
    control samples, ``motif_significance.main`` is run once per z-score in
    ``motifs.motif_zscores``.
    """
    in_motifs = in_files[0]
    # NOTE(review): in_files[1][0] is iterated below as a collection of peak
    # files -- confirm the caller passes a list rather than a single path.
    in_peaks = in_files[1][0]
    in_control_samples = [x for x in in_files[1][1:] if x.endswith('sample')]
    for peak_file in in_peaks:
        # get the similar control data
        wanted = peak_file + '.similar.genomic.sample'
        cur_control = [x for x in in_control_samples if x == wanted]
        for c in cur_control:
            # (an unused `short_control` local was computed here; removed)
            for zscore in cfg.get('motifs', 'motif_zscores').split(','):
                outfile = out_template % (zscore)
                args = shlex.split(
                    '%s --motif_file=%s --bg_samples=%s '
                    '--genome=%s --output_file=%s --zscore=%s' % (
                        peak_file, in_motifs, c,
                        cfg.get('DEFAULT', 'worldbase_genome'),
                        outfile, zscore))
                print(args)
                motif_significance.main(args)
def bed_to_bedgraph(in_files, out_bedgraph):
    """Build a bedgraph from reads extended to the full fragment length.

    ``in_files`` holds the reads BED file and the chromosome sizes file.
    slopBed lengthens each read by ``fragment_size - tag_size`` and
    bedItemOverlapCount writes the coverage to ``out_bedgraph``.
    """
    in_bed, in_chrom_sizes = in_files
    fields = (
        in_bed,
        cfg.getint("DEFAULT", "fragment_size") - cfg.getint("DEFAULT", "tag_size"),
        in_chrom_sizes,
        cfg.get("DEFAULT", "genome"),
        genome_path(),
        out_bedgraph,
    )
    slop_stage = "slopBed -i %s -s -r %s -l 0 -g %s | "
    count_stage = "bedItemOverlapCount %s -chromSize=%s.chrom.sizes stdin > %s"
    sys_call((slop_stage + count_stage) % fields)
def clip_adapter(in_fastq, out_fastq):
    """Remove adapter sequence from raw reads.

    Pipes ``cat in_fastq`` into ``fastx_clipper`` with the adapter from
    ``filtering.adapter_sequence``, writing to ``out_fastq``.

    Raises CalledProcessError if either stage of the pipeline exits with a
    non-zero status.
    """
    cmd1 = 'cat %s' % in_fastq
    cmd2 = 'fastx_clipper -o %s -a %s' % (
        out_fastq, cfg.get('filtering', 'adapter_sequence'))
    p1 = Popen([cmd1], stdout=PIPE, shell=True)
    p2 = Popen([cmd2], stdin=p1.stdout, shell=True)
    # close our copy of the read end so p1 is not left blocked on a full
    # pipe if p2 exits early
    p1.stdout.close()
    p2.communicate()
    # returncode stays None until the child is reaped, so the original
    # check on p1 could never fire
    p1.wait()
    # report the consumer first: when it dies, the producer's failure is
    # usually just a consequence (broken pipe)
    if p2.returncode:
        raise CalledProcessError(p2.returncode, cmd2)
    if p1.returncode:
        raise CalledProcessError(p1.returncode, cmd1)
def clip_adapter(in_fastq, out_fastq):
    """Remove adapter sequence from raw reads.

    Streams ``in_fastq`` through ``fastx_clipper`` using the adapter
    configured in ``filtering.adapter_sequence``.

    Raises CalledProcessError when ``cat`` or ``fastx_clipper`` exits with a
    non-zero status.
    """
    cmd1 = 'cat %s' % in_fastq
    cmd2 = 'fastx_clipper -o %s -a %s' % (
        out_fastq,
        cfg.get('filtering', 'adapter_sequence'))
    p1 = Popen([cmd1], stdout=PIPE, shell=True)
    p2 = Popen([cmd2], stdin=p1.stdout, shell=True)
    # the parent must release the pipe's read end, otherwise p1 can block
    # forever if p2 stops reading
    p1.stdout.close()
    p2.communicate()
    # p1 was never waited on, so its returncode was always None before
    p1.wait()
    # check the consumer first; a producer failure after a consumer crash
    # is normally only a broken pipe
    if p2.returncode:
        raise CalledProcessError(p2.returncode, cmd2)
    if p1.returncode:
        raise CalledProcessError(p1.returncode, cmd1)
def get_microRNA(_, out_mirna):
    """Retrieve microRNA genes from UCSC and write them as 6-column BED.

    The ``wgRna`` table of the configured genome is downloaded and unpacked
    in the current directory; each row is written to ``out_mirna`` as chrom,
    start, end, ``name_type``, score, strand.  The first argument is
    ignored.
    """
    genome = cfg.get('DEFAULT', 'genome')
    url = ('http://hgdownload.cse.ucsc.edu/goldenPath/%s/database/'
           'wgRna.txt.gz') % genome
    sys_call('wget -N -P . %s' % url)
    sys_call('gunzip -f wgRna.txt.gz')
    with open(out_mirna, 'w') as outfile:
        # open the table in a with-block so the handle is closed
        with open('wgRna.txt') as infile:
            for line in infile:
                # avoid shadowing the builtins `bin` and `type`
                (_bin, chrom, start, end, name, score, strand,
                 _thick_start, _thick_end, rna_type) = \
                    line.strip().split('\t')
                record = [chrom, start, end, name + '_' + rna_type,
                          score, strand]
                outfile.write('\t'.join(record) + '\n')
def run_ssaha2(in_fastq, out_ssaha2):
    """Map reads with ssaha2 using a prebuilt hash table.

    The hash table name comes from ``mapping.ssaha2_hash_name`` (built by
    get_ssaha2_hashtable).  Per the ssaha2 manual, the tool maps DNA
    sequence reads onto a genomic reference sequence using a combination of
    word hashing and dynamic programming.
    """
    # TODO: add useful parameters to cmd and config file
    hash_name = cfg.get('mapping', 'ssaha2_hash_name')
    command = 'ssaha2 -outfile %s -disk 1 -save %s %s' % (
        out_ssaha2, hash_name, in_fastq)
    sys_call(command)
def run_macs14_no_control(in_treat, out_peaks):
    """Call peaks using MACS (v1.4) without control data.

    MACS is run with ``--name=out_peaks``; its ``_peaks.xls`` output is
    converted into the BED file ``out_peaks`` and its ``_summits.bed``
    output into ``out_peaks + '_summits.<size>_around'``, each summit
    widened by half of ``peaks.peak_summit_size`` on both sides.
    """
    sys_call('macs14 -t %s --name=%s %s' % (
        in_treat, out_peaks, cfg.get('peaks', 'macs14_params')))

    kept_rows = set()
    # convert to a proper BED file: integer scores and '+' for strand
    with open(out_peaks, 'w') as peaks_out:
        with open(out_peaks + '_peaks.xls') as xls:
            rows = itertools.ifilter(bedCommentFilter, xls)
            for row_index, row in enumerate(rows):
                columns = row.split('\t')
                if columns[0] == 'chr':
                    continue  # skip header
                start = str(max(0, int(columns[1])))
                score = str(max(0, min(1000, int(float(columns[6])))))
                peaks_out.write('\t'.join([
                    columns[0], start, columns[2],
                    'MACS14_peak_%s' % (row_index + 1), score, '+'
                ]) + '\n')
                kept_rows.add(row_index)

    # take the region surrounding the peak summit
    summit_size = cfg.getint('peaks', 'peak_summit_size')
    summits_path = out_peaks + '_summits.%s_around' % \
        cfg.get('peaks', 'peak_summit_size')
    with open(summits_path, 'w') as summits_out:
        with open(out_peaks + '_summits.bed') as summits_in:
            rows = itertools.ifilter(bedCommentFilter, summits_in)
            for row_index, row in enumerate(rows):
                columns = row.strip().split('\t')
                if columns[0] == 'chr':
                    continue  # skip header
                # score is number of reads at summit
                score = str(max(0, min(1000, int(float(columns[-1])))))
                start = str(max(0, int(columns[1]) - summit_size // 2))
                stop = str(int(columns[2]) + summit_size // 2)
                if row_index in kept_rows:
                    summits_out.write('\t'.join([
                        columns[0], start, stop,
                        'MACS_peak_%s' % (row_index + 1), score, '+'
                    ]) + '\n')
def motif_enrichment_genomic(in_files, out_pattern, out_template):
    """Determine a motif's enrichment vs. genomic samples.

    in_files     -- [motif file, [peak files, control sample, ...]]
    out_template -- %-template that is filled with the z-score to name each
                    output file
    ``out_pattern`` is not used inside the body.

    ``motif_significance.main`` is run for every (peak file, z-score) pair
    whose ``<peak>.similar.genomic.sample`` exists among the control
    samples; z-scores come from ``motifs.motif_zscores``.
    """
    in_motifs = in_files[0]
    # NOTE(review): in_files[1][0] is looped over as if it were a list of
    # peak files -- verify against the caller that it is not a single path.
    in_peaks = in_files[1][0]
    in_control_samples = [sample for sample in in_files[1][1:]
                          if sample.endswith('sample')]
    for peak_file in in_peaks:
        # get the similar control data
        expected = peak_file + '.similar.genomic.sample'
        for c in [s for s in in_control_samples if s == expected]:
            # the unused `short_control` local has been dropped
            for zscore in cfg.get('motifs', 'motif_zscores').split(','):
                outfile = out_template % (zscore)
                args = shlex.split(
                    '%s --motif_file=%s --bg_samples=%s '
                    '--genome=%s --output_file=%s --zscore=%s' % (
                        peak_file, in_motifs, c,
                        cfg.get('DEFAULT', 'worldbase_genome'),
                        outfile, zscore))
                print(args)
                motif_significance.main(args)
def run_tophat(in_fastq, out_tophat):
    """Gapped alignment of reads to the reference using TopHat.

    Pipes ``zcat in_fastq`` into ``tophat``, writing into the directory
    ``<in_fastq>_tophat_out`` with the GTF ``hg19.refseq_genes.gff``.
    ``out_tophat`` is not referenced by the body.

    Raises CalledProcessError if either stage of the pipeline exits with a
    non-zero status.
    """
    cmd1 = 'zcat %s' % in_fastq
    cmd2 = 'tophat %s - %s --output-dir=%s --GTF %s' % (
        genome_path(), cfg.get('mapping', 'bowtie_params'),
        '%s_tophat_out' % in_fastq, 'hg19.refseq_genes.gff')
    p1 = Popen([cmd1], stdout=PIPE, shell=True)
    p2 = Popen([cmd2], stdin=p1.stdout, shell=True)
    # close our copy of the read end so p1 is not left blocked on a full
    # pipe if p2 exits early
    p1.stdout.close()
    p2.communicate()
    # returncode stays None until the child is reaped, so the original
    # check on p1 could never fire
    p1.wait()
    # report the consumer first: when it dies, the producer's failure is
    # usually just a consequence (broken pipe)
    if p2.returncode:
        raise CalledProcessError(p2.returncode, cmd2)
    if p1.returncode:
        raise CalledProcessError(p1.returncode, cmd1)
def make_track_headers(in_files, out_header):
    """Write a UCSC track header line for every visualization file.

    '.bigwig' and '.bigbed' files are supported; each header points at
    ``visualization.public_url_base`` + '/' + file name, which is also
    printed.

    Raises RuntimeError for any other file extension.
    """
    extras_by_suffix = (
        ('.bigwig', 'type=bigWig'),
        ('.bigbed', 'type=bigBed itemRgb="On"'),
    )
    with open(out_header, 'w') as header_file:
        for track in in_files:
            for suffix, extras in extras_by_suffix:
                if track.endswith(suffix):
                    track_extras = extras
                    break
            else:
                raise RuntimeError("Unrecognized file type: %s" % track)
            url = cfg.get('visualization', 'public_url_base') + '/' + track
            print(url)
            # remove cruft from the names
            label = re.sub(r'(mapped_reads|clipped|sorted|colored|\.)+',
                           ' ', track)
            header_file.write(
                'track %s name="%s" description="%s" bigDataUrl=%s\n' % (
                    track_extras, label, label, url))
def gene_ontology(in_peaks, out_files):
    """Calculate the significance of the peaks near genes using BioConductor.

    out_files is (out_genes, out_go, out_raw).  The R pipeline is only
    assembled and printed here, not executed; ``out_raw`` is then touched.
    """
    out_genes, out_go, out_raw = out_files
    # The template has four placeholders but was given five values, which
    # raised TypeError.  R's stdout is redirected to out_raw.
    # NOTE(review): out_go is not written by this command -- confirm that
    # dropping it from the substitution matches the intent.
    # A ';' was added after the RangedData(...) statement, which otherwise
    # ran straight into source(...) on the same line.
    template = (
        "echo ' "
        'peaks = read.table("%s", header=FALSE, sep="\t"); '
        'peaks = data.frame(chr=as.factor(peaks[,1]), '
        'start=as.numeric(peaks[,2]), end=as.numeric(peaks[,3])); '
        'peaks = RangedData(IRanges(start=peaks[,2], end=peaks[,3]), '
        'space=peaks[,1]); '
        'source("http://bioconductor.org/biocLite.R"); '
        'biocLite("ChIPpeakAnno"); library(ChIPpeakAnno); '
        'mart<-useMart(biomart="ensembl",dataset="%s"); '
        'tss = getAnnotation(mart, featureType="TSS"); '
        'annopeaks = annotatePeakInBatch(peaks[, ], AnnotationData=tss); '
        'write.table(annopeaks, file="%s", sep="\t"); '
        "' | R --vanilla --slave > %s")
    cmd = template % (in_peaks, cfg.get('DEFAULT', 'R_mart'), out_genes,
                      out_raw)
    print(cmd)
    touch(out_raw)
def gene_ontology(in_peaks, out_files):
    """Calculate the significance of the peaks near genes using BioConductor.

    ``out_files`` unpacks to (out_genes, out_go, out_raw).  The function
    builds the shell/R command and prints it without running it, then
    touches ``out_raw``.
    """
    out_genes, out_go, out_raw = out_files
    r_statements = [
        'peaks = read.table("%s", header=FALSE, sep="\t");',
        'peaks = data.frame(chr=as.factor(peaks[,1]), '
        'start=as.numeric(peaks[,2]), end=as.numeric(peaks[,3]));',
        # trailing ';' added: this statement previously ran straight into
        # the following source(...) call
        'peaks = RangedData(IRanges(start=peaks[,2], end=peaks[,3]), '
        'space=peaks[,1]);',
        'source("http://bioconductor.org/biocLite.R");',
        'biocLite("ChIPpeakAnno");',
        'library(ChIPpeakAnno);',
        'mart<-useMart(biomart="ensembl",dataset="%s");',
        'tss = getAnnotation(mart, featureType="TSS");',
        'annopeaks = annotatePeakInBatch(peaks[, ], AnnotationData=tss);',
        'write.table(annopeaks, file="%s", sep="\t");',
    ]
    template = ("echo ' " + ' '.join(r_statements) +
                " ' | R --vanilla --slave > %s")
    # Four placeholders, four values: the original passed five, which made
    # the formatting raise TypeError.  R's stdout goes to out_raw.
    # NOTE(review): out_go is left unwritten by this command -- confirm.
    cmd = template % (in_peaks, cfg.get('DEFAULT', 'R_mart'), out_genes,
                      out_raw)
    print(cmd)
    touch(out_raw)
def make_track_headers(in_files, out_header):
    """Create UCSC track headers for all the visualization files.

    One header line is written to ``out_header`` per input file; the
    bigDataUrl is ``visualization.public_url_base`` joined with the file
    name, and each URL is printed as it is built.

    Raises RuntimeError when a file is neither '.bigwig' nor '.bigbed'.
    """
    header_template = ('track %s name="%s" description="%s" '
                       "bigDataUrl=%s\n")
    with open(out_header, "w") as outfile:
        for in_track in in_files:
            is_bigwig = in_track.endswith(".bigwig")
            if not is_bigwig and not in_track.endswith(".bigbed"):
                raise RuntimeError("Unrecognized file type: %s" % in_track)
            if is_bigwig:
                track_extras = "type=bigWig"
            else:
                track_extras = 'type=bigBed itemRgb="On"'
            url = cfg.get("visualization", "public_url_base") + "/" + in_track
            print(url)
            # remove cruft from the names
            short_name = re.sub(
                r"(mapped_reads|clipped|sorted|colored|\.)+", " ", in_track)
            outfile.write(header_template % (
                track_extras, short_name, short_name, url))
def glitr_range_to_bed(in_range, out_bed):
    """Convert GLITR ranges to BED format, using peak centers as summits.

    Writes ``out_bed`` (one record per range, score = integer fold change)
    and ``out_bed + '_summits.<size>_around'`` (a window of
    ``peaks.peak_summit_size`` around each range's center).
    """
    # the raw option text names the summit file (as the MACS tasks do); the
    # integer value is needed for arithmetic, which failed on the string
    # that cfg.get() returns
    summit_label = cfg.get('peaks', 'peak_summit_size')
    summit_size = cfg.getint('peaks', 'peak_summit_size')
    half_window = summit_size // 2
    with open(in_range) as infile:
        with open(out_bed, 'w') as outfile:
            with open(out_bed + '_summits.%s_around' % summit_label,
                      'w') as outfile_summits:
                for i, line in enumerate(infile):
                    fields = line.strip('\n').split('\t')
                    chrom, start, stop = parse_ucsc_range(fields[0])
                    start = max(0, start)
                    peak_name = 'GLITR_peak_%s' % (i + 1)
                    score = str(int(float(fields[3])))  # fold change
                    outfile.write('\t'.join([
                        chrom, str(start), str(stop), peak_name, score,
                        '+']) + '\n')
                    # take bases around center as summit
                    center = start + (stop - start) // 2
                    outfile_summits.write('\t'.join([
                        chrom, str(center - half_window),
                        str(center + half_window), peak_name, score,
                        '+']) + '\n')