Example #1
0
def sample_genome_short(_, out_samples):
    """Sample the genome for the motif threshold score.

    The first argument is ignored.  out_samples is handed to sampling.main
    as the leading positional argument, followed by the configured genome,
    a fixed sample length of 30 and the configured number of samples.
    """
    genome = cfg.get('DEFAULT', 'worldbase_genome')
    num_samples = cfg.get('motifs', 'motif_threshold_sample_size')
    command_line = '''%s --genome=%s --sample_length=30 --num_samples=%s
                       ''' % (out_samples, genome, num_samples)
    sampling.main(shlex.split(command_line))
Example #2
0
def bed_to_bedgraph_by_strand(in_files, out_bedgraphs):
    """Extend reads to the full fragment length and build one bedgraph per strand.

    in_files -- (in_bed, in_chrom_sizes)
    out_bedgraphs -- sequence whose first item receives the '+' strand
        bedgraph and whose second item receives the '-' strand bedgraph

    Each strand is produced by a slopBed | awk | bedItemOverlapCount shell
    pipeline run through sys_call.
    """
    in_bed, in_chrom_sizes = in_files
    # reads are extended on their 3' side by the part of the fragment that
    # was not sequenced
    extension = (cfg.getint("DEFAULT", "fragment_size")
                 - cfg.getint("DEFAULT", "tag_size"))
    genome = cfg.get("DEFAULT", "genome")
    chrom_prefix = genome_path()
    pipeline = (
        """slopBed -i %s -s -r %s -l 0 -g %s | awk '{if ($6 == "%s") print $0}' | """
        "bedItemOverlapCount %s -chromSize=%s.chrom.sizes stdin > %s"
    )
    # The original body indexed an undefined name (out_bedgraph); the
    # parameter is out_bedgraphs.
    targets = (("+", out_bedgraphs[0]), ("-", out_bedgraphs[1]))
    for strand, out_path in targets:
        sys_call(pipeline % (in_bed, extension, in_chrom_sizes, strand,
                             genome, chrom_prefix, out_path))
Example #3
0
def sample_genome_short(_, out_samples):
    """Run genomic sampling for the threshold score.

    The first argument is unused; out_samples becomes the leading positional
    argument given to sampling.main.
    """
    sampler_options = (
        out_samples,
        cfg.get('DEFAULT', 'worldbase_genome'),
        cfg.get('motifs', 'motif_threshold_sample_size'),
    )
    cmdline = '''%s --genome=%s --sample_length=30 --num_samples=%s
                       ''' % sampler_options
    argv = shlex.split(cmdline)
    sampling.main(argv)
Example #4
0
def motif_enrichment_control(in_files, out_enrichment):
    """Measure a motif's enrichment against control data.

    in_files[0] is (motif file, peak file, control sample file).
    motif_significance.main is called once for every entry of the
    comma-separated ``[motifs] motif_zscores`` setting, each time with
    out_enrichment as the output file.
    """
    in_motifs, in_peaks, in_control_sample = in_files[0]
    zscores = cfg.get('motifs', 'motif_zscores').split(',')
    for zscore in zscores:
        values = (in_peaks, in_motifs, in_control_sample,
                  cfg.get('DEFAULT', 'worldbase_genome'),
                  out_enrichment, zscore)
        command_line = '''%s --motif_file=%s --bg_samples=%s --genome=%s
                              --output_file=%s --zscore=%s''' % values
        motif_significance.main(shlex.split(command_line))
Example #5
0
def motif_enrichment_control(in_files, out_enrichment):
    """Determine how enriched a motif is relative to control data.

    The first element of in_files unpacks to the motif file, the peak file
    and the control sample file.  One motif_significance run is made per
    z-score configured in ``[motifs] motif_zscores``.
    """
    motifs_path, peaks_path, control_path = in_files[0]
    for zscore in cfg.get('motifs', 'motif_zscores').split(','):
        genome = cfg.get('DEFAULT', 'worldbase_genome')
        argv = shlex.split('''%s --motif_file=%s --bg_samples=%s --genome=%s
                              --output_file=%s --zscore=%s''' % (
            peaks_path, motifs_path, control_path, genome,
            out_enrichment, zscore))
        motif_significance.main(argv)
Example #6
0
def refseq_genes_to_regions(in_genes, out_pattern):
    """Build regions (promoter, downstream, 5UTR, etc) from refseq genes.

    Region sizes are read from the ``[genes]`` config section and passed to
    makeGeneStructure.main.  out_pattern is not used inside this function.
    """
    option_names = ('promoter_size', 'promoter_extend',
                    'downstream_size', 'downstream_extend')
    region_sizes = tuple([cfg.get('genes', option) for option in option_names])
    command_line = '''%s --promoter_size=%s --promoter_extend=%s
                          --downstream_size=%s --downstream_extend=%s
                          --with_gene_name''' % ((in_genes,) + region_sizes)
    makeGeneStructure.main(shlex.split(command_line))
Example #7
0
def refseq_genes_to_regions(in_genes, out_pattern):
    """Turn refseq genes into regions (promoter, downstream, 5UTR, etc).

    All sizes come from the ``[genes]`` config section; gene names are kept
    (--with_gene_name).  out_pattern is accepted but not read here.
    """
    promoter_size = cfg.get('genes', 'promoter_size')
    promoter_extend = cfg.get('genes', 'promoter_extend')
    downstream_size = cfg.get('genes', 'downstream_size')
    downstream_extend = cfg.get('genes', 'downstream_extend')
    cmdline = '''%s --promoter_size=%s --promoter_extend=%s
                          --downstream_size=%s --downstream_extend=%s
                          --with_gene_name''' % (
        in_genes, promoter_size, promoter_extend,
        downstream_size, downstream_extend)
    argv = shlex.split(cmdline)
    makeGeneStructure.main(argv)
def run_glitr(in_files, out_peaks):
    """Call peaks with GLITR.

    The treatment and control inputs are the first entries of in_files whose
    names contain '.treat.' and '.control.' respectively.  GLITR runs inside
    a freshly recreated '<treat>.GLITR_out' directory, and its
    allChIP.FDR_*PercentFDR output is then copied to out_peaks.
    """
    treat_files = [f for f in in_files if '.treat.' in f]
    in_treat = treat_files[0]
    control_files = [f for f in in_files if '.control.' in f]
    in_control = control_files[0]
    work_dir = in_treat + '.GLITR_out'
    glitr_template = ('rm -r %s; mkdir %s; cd %s; GLITR.pl --CHIP=../%s '
                      '--CONTROL=../%s --GENOME=%s %s ')
    genome_name = cfg.get('DEFAULT', 'genome').upper()
    extra_params = cfg.get('peaks', 'glitr_params')
    sys_call(glitr_template % (work_dir, work_dir, work_dir, in_treat,
                               in_control, genome_name, extra_params))
    copy_cmd = 'cp %s/allChIP.FDR_*PercentFDR %s' % (work_dir, out_peaks)
    sys_call(copy_cmd)
Example #9
0
def trim_reads(in_fastq, out_fastq):
    """Trim leading and/or trailing bases from all reads.

    Pipes ``cat in_fastq`` into ``fastx_trimmer``, keeping bases from
    ``[filtering] trim_start`` through ``[filtering] trim_end``.

    Raises CalledProcessError if either stage of the pipeline exits with a
    non-zero status.
    """
    cmd1 = 'cat %s' % in_fastq
    cmd2 = 'fastx_trimmer -o %s -f %s -l %s' % (
        out_fastq,
        cfg.get('filtering', 'trim_start'),
        cfg.get('filtering', 'trim_end'))
    p1 = Popen(cmd1, stdout=PIPE, shell=True)
    p2 = Popen(cmd2, stdin=p1.stdout, shell=True)
    # Close the parent's copy of the pipe so p1 gets SIGPIPE if p2 exits first.
    p1.stdout.close()
    p2.communicate()
    # returncode stays None until the process has been waited on, so without
    # this wait() a failing first stage was never reported.
    p1.wait()
    if p1.returncode:
        raise CalledProcessError(p1.returncode, cmd1)
    if p2.returncode:
        raise CalledProcessError(p2.returncode, cmd2)
def trim_reads(in_fastq, out_fastq):
    """Trim leading and/or trailing bases from all reads.

    The reads are streamed with ``cat`` into ``fastx_trimmer``; the first and
    last base to keep come from ``[filtering] trim_start`` / ``trim_end``.

    Raises CalledProcessError when ``cat`` or ``fastx_trimmer`` fails.
    """
    first_base = cfg.get('filtering', 'trim_start')
    last_base = cfg.get('filtering', 'trim_end')
    cmd1 = 'cat %s' % in_fastq
    cmd2 = 'fastx_trimmer -o %s -f %s -l %s' % (out_fastq, first_base,
                                                last_base)
    p1 = Popen(cmd1, stdout=PIPE, shell=True)
    p2 = Popen(cmd2, stdin=p1.stdout, shell=True)
    # The child owns the read end now; dropping ours lets p1 see SIGPIPE.
    p1.stdout.close()
    p2.communicate()
    # Reap the producer: its returncode is None (falsy) until wait()/poll(),
    # which previously made the check below a no-op.
    p1.wait()
    for proc, cmd in ((p1, cmd1), (p2, cmd2)):
        if proc.returncode:
            raise CalledProcessError(proc.returncode, cmd)
def run_macs(in_files, out_peaks, max_fdr):
    """Call peaks with MACS (v1.3).

    Runs ``macs`` on the treatment/control pair and converts the resulting
    ``<name>_peaks.xls`` table into two BED-style files:

    * ``out_peaks`` -- peaks whose FDR is at most the threshold, with the
      score clamped to 0..1000 and the strand column fixed to ``+``.
    * ``out_peaks + '_summits.<size>_around'`` -- for the same peaks, a
      window of ``[peaks] peak_summit_size`` around the peak centre, which is
      treated as the summit.

    in_files -- sequence whose first item is (treatment reads, control reads);
        the treatment name must match ``*.treat*.mapped_reads``
    out_peaks -- path of the filtered peak file to write
    max_fdr -- NOTE(review): overwritten below with the ``[peaks] max_FDR``
        config value, so the value passed by the caller is ignored; confirm
        that this is intended.
    """
    in_treat, in_control = in_files[0]
    # name MACS output after the treatment file, minus the .mapped_reads suffix
    matches = re.search(r'(.*\.treat)(.*)\.mapped_reads', in_treat).groups()
    name = matches[0] + matches[1] + '.macs.peaks'
    # replaces the max_fdr argument with the configured threshold
    max_fdr = cfg.getfloat('peaks', 'max_FDR')
    cmd = 'macs -t %s -c %s --name=%s %s' % (in_treat, in_control, name,
                                               cfg.get('peaks', 'macs_params'))
    sys_call(cmd)
    
    # First pass over the xls table: write a BED file with an integer score
    # and '+' as the strand.
    with open(out_peaks, 'w') as outfile:
        with open(name + '_peaks.xls') as infile:
            # index counts every line that passes bedCommentFilter, including
            # the header row if the filter lets it through
            for index, line in enumerate(itertools.ifilter(
                                        bedCommentFilter, infile)):
                fields = line.split('\t')
                if fields[0] == 'chr':
                    continue # skip the column-header row
                start = str(max(0, int(fields[1])))  # never below 0
                # column 7 is used as the score, clamped to 0..1000
                score = str(max(0, min(1000, int(float(fields[6])))))
                # column 9 is compared against the FDR threshold
                fdr = float(fields[8])
                if fdr <= max_fdr:
                    outfile.write('\t'.join([fields[0], start, fields[2],
                                        'MACS_peak_%s' % (index + 1), score]) +
                                    '\t+\n')
    # Second pass over the same table: take the region surrounding the peak
    # center as the summit.
    summit_size = cfg.getint('peaks', 'peak_summit_size')
    with open(out_peaks + '_summits.%s_around' % \
                        cfg.get('peaks', 'peak_summit_size'), 'w') as outfile:
        with open(name + '_peaks.xls') as infile:
            for index, line in enumerate(itertools.ifilter(bedCommentFilter,
                                                                    infile)):
                fields = line.strip().split('\t')
                if fields[0] == 'chr':
                    continue # skip the column-header row
                score = str(max(0, min(1000, int(float(fields[6])))))
                p_start, p_stop = max(0, int(fields[1])), int(fields[2])
                # '/' on ints: integer division under Python 2, which this
                # code relies on (it also uses itertools.ifilter)
                p_center = p_start + (p_stop - p_start) / 2
                # NOTE(review): s_start is not clamped at 0, unlike start in
                # the first pass
                s_start = p_center - summit_size / 2
                s_stop = p_center + summit_size / 2
                fdr = float(fields[8])
                if fdr <= max_fdr:
                    outfile.write('\t'.join([fields[0], str(s_start),
                                    str(s_stop),
                                    'MACS_peak_%s' % (index + 1), score])
                                        + '\t+\n')
Example #12
0
def remove_internal_priming(in_bed, out_bed):
    """Filter out reads that look like internal-priming artifacts.

    A read is dropped when the 10 nt downstream of it contain 6 consecutive
    A's, or 7 or more A's in total.  For '-' strand reads the 10 nt before
    the read start are inspected and T's are counted instead.

    in_bed -- 6-column, tab-separated BED file of reads
    out_bed -- path receiving the reads that pass the filter

    Raises RuntimeError if a read's strand is neither '+' nor '-'.
    """
    wb_genome = worldbase(cfg.get('DEFAULT', 'worldbase_genome'))
    with open(out_bed, 'w') as outfile:
        # the input used to be opened inline and was never closed
        with open(in_bed) as infile:
            for line in infile:
                chrom, start, stop, name, score, strand = \
                    line.strip().split('\t')
                start, stop = int(start), int(stop)
                if strand not in ['+', '-']:
                    raise RuntimeError("unknown strand", strand, line)
                if strand == '+':
                    base = 'A'
                    try:
                        downstream = str(
                            wb_genome[chrom][stop:stop+10]).upper()
                    except IndexError:
                        # nothing to inspect, so the read is kept
                        downstream = ''
                else:
                    # same test on the opposite strand: T's before the start
                    base = 'T'
                    try:
                        downstream = str(
                            wb_genome[chrom][max(0, start-10):start]).upper()
                    except IndexError:
                        downstream = ''
                down_A = downstream.count(base)
                down_consecutive_A = downstream.count(base * 6)
                # keep only reads without a 6-base run and with fewer than 7
                # matching bases downstream
                if down_consecutive_A < 1 and down_A < 7:
                    outfile.write(line)
Example #13
0
def get_refseq_genes(_, out_genes):
    """Download the refseq gene table (refGene) from UCSC.

    The table for the configured genome is fetched with wget into the
    current directory, uncompressed, and moved to out_genes as-is.  The
    first argument is ignored.
    """
    genome = cfg.get('DEFAULT', 'genome')
    url_template = \
        'http://hgdownload.cse.ucsc.edu/goldenPath/%s/database/refGene.txt.gz'
    download_cmd = 'wget -N -P . %s' % (url_template % genome)
    sys_call(download_cmd)
    sys_call('gunzip -f refGene.txt.gz')
    move_cmd = 'mv refGene.txt %s' % out_genes
    sys_call(move_cmd)
Example #14
0
def sample_genome_like_peaks(in_peaks, out_files):
    """Sample from the genome, keeping the sample widths the same as peaks.

    in_peaks -- BED file of peaks; only the peak widths are used
    out_files -- sequence whose first two items are the path for the sampled
        sequences (fasta-style) and the path for their locations (BED-style)

    Raises RuntimeError if in_peaks contains no peaks.
    """
    out_sample, out_locations = out_files[:2]
    # read the peaks inside a with-block so the handle is closed again
    with open(in_peaks) as peaks_file:
        peak_lengths = array(
            'i', (stop - start
                  for chrom, start, stop, strand in readBedLines(peaks_file)))
    if not peak_lengths:
        raise RuntimeError("Peaks file %s is empty!" % in_peaks)

    wb_genome = worldbase(cfg.get('DEFAULT', 'worldbase_genome'))
    samples = sampling.sample_genome(
        wb_genome,
        peak_lengths,
        sampleSize=cfg.getint('motifs', 'motif_significance_sample_size'),
        excludeRepeat=cfg.getboolean('motifs', 'sampling_exclude_repeats'),
        excludeN=cfg.getboolean('motifs', 'sampling_exclude_N'),
        ignoreCharacters='_',
        weighted=True)
    with open(out_sample, 'w') as outfile:
        with open(out_locations, 'w') as outlocations:
            for index, seq in enumerate(samples):
                # formatting the sample with %s writes its sequence
                outfile.write('>%s\n%s\n' % (index, seq))
                strand = '+' if seq.orientation == 1 else '-'
                outlocations.write('\t'.join([
                    seq.id, str(seq.start), str(seq.stop),
                    str(index), '0', strand]) + '\n')
Example #15
0
def sample_control_like_peaks(in_peaks, out_files):
    """Sample from the control IgG, with similar widths as the peaks.

    in_peaks -- BED file of peaks; its name is also used to derive the name
        of the matching control mapped_reads file
    out_files -- sequence whose first two items are the path for the sampled
        sequences (fasta-style) and the path for their locations (BED-style)

    Raises RuntimeError if in_peaks contains no peaks.
    """
    out_sample, out_locations = out_files[:2]
    # read the peaks inside a with-block so the handle is closed again
    with open(in_peaks) as peaks_file:
        peak_lengths = array('i', (stop - start
                                   for chrom, start, stop, strand in
                                   readBedLines(peaks_file)))
    if not peak_lengths:
        raise RuntimeError("Peaks file %s is empty!" % in_peaks)
    wb_genome = worldbase(cfg.get('DEFAULT', 'worldbase_genome'))
    # do the dance to map peaks back to their control raw reads: undo, in
    # order, each suffix the peak-calling steps added to the file name
    control_bed = re.sub(r'treat', 'control', in_peaks)
    control_bed = re.sub(r'\.top[\d]+\.peaks$', '', control_bed)
    control_bed = re.sub(r'_summits\.[\d]+_around', '', control_bed)
    control_bed = re.sub(r'peaks', 'mapped_reads', control_bed)
    control_bed = re.sub(r'\.(macs(14)*|arem|glitr)', '', control_bed)
    with open(control_bed) as control_file:
        with open(out_locations, 'w') as outlocations:
            sample_size = cfg.getint('motifs',
                                     'motif_significance_sample_size')
            samples = sampling.sample_middles(wb_genome, peak_lengths,
                                              control_file,
                                              sampleSize=sample_size)
            with open(out_sample, 'w') as outfile:
                for index, seq in enumerate(samples):
                    # repr() gives location, str() gives sequence
                    outfile.write('>%s_%s\n%s\n' %
                                  (index, repr(seq), str(seq)))
                    strand = '+' if seq.orientation == 1 else '-'
                    outlocations.write('\t'.join([
                        seq.id, str(seq.start), str(seq.stop),
                        str(index), '0', strand]) + '\n')
Example #16
0
def motif_select_random_seqs(in_fasta, out_pattern):
    """Split a fasta file into several chunks so motif discovery is easier.

    Writes '<name>.small_sample.<i>.fasta' files, each holding a random
    sample of at most ``[motifs] motif_chunk_size`` sequences.  One chunk is
    written when the input fits in a single chunk, otherwise
    ``[motifs] motif_num_chunks`` chunks are written.  out_pattern is not
    used inside this function.
    """
    name = re.search(r'(.*)\.fasta', in_fasta).groups()[0]
    # Read the settings as integers: comparing len(seqs) with the raw config
    # string is always True under Python 2, and xrange() rejects a string.
    chunk_size = cfg.getint('motifs', 'motif_chunk_size')
    with open(in_fasta) as infile:
        seqs = list(parseFastaLines(infile))
    if len(seqs) <= chunk_size:
        num_chunks = 1
    else:
        num_chunks = cfg.getint('motifs', 'motif_num_chunks')
    sample_size = min(len(seqs), chunk_size)
    # get a random sample of peaks for every chunk
    for i in xrange(num_chunks):
        with open(name + '.small_sample.%s.fasta' % i, 'w') as outfile:
            subset = random.sample(seqs, sample_size)
            outfile.writelines('>%s\n%s\n' % (s[0].strip(), s[1].strip())
                               for s in subset)
Example #17
0
def remove_internal_priming(in_bed, out_bed):
    """Remove reads that probably stem from internal priming.

    Reads that map to genomic locations with 6 consecutive downstream A's,
    or with 7 of the 10 downstream nt being A's, are filtered out.  On the
    '-' strand the 10 nt preceding the read start are checked for T's.

    in_bed -- 6-column, tab-separated BED file of reads
    out_bed -- path receiving the reads that survive

    Raises RuntimeError if a read's strand is neither '+' nor '-'.
    """
    wb_genome = worldbase(cfg.get('DEFAULT', 'worldbase_genome'))
    with open(out_bed, 'w') as outfile:
        # open the input in a with-block; it was previously left unclosed
        with open(in_bed) as reads:
            for line in reads:
                (chrom, start, stop,
                 name, score, strand) = line.strip().split('\t')
                start, stop = int(start), int(stop)
                if strand not in ['+', '-']:
                    raise RuntimeError("unknown strand", strand, line)
                try:
                    if strand == '+':
                        flank = wb_genome[chrom][stop:stop+10]
                    else:
                        flank = wb_genome[chrom][max(0, start-10):start]
                    downstream = str(flank).upper()
                except IndexError:
                    # no flank to inspect: nothing counts against the read
                    downstream = ''
                # poly-A downstream appears as poly-T on the minus strand
                base = 'A' if strand == '+' else 'T'
                has_run = downstream.count(base * 6) >= 1
                too_many = downstream.count(base) >= 7
                # filter if 6+ consecutive A's or 7+ A's downstream
                if not (has_run or too_many):
                    outfile.write(line)
Example #18
0
def sample_control_like_peaks(in_peaks, out_files):
    """Sample from the control IgG, with similar widths as the peaks.

    in_peaks -- BED file of peaks; the control mapped_reads file name is
        derived from this path
    out_files -- sequence whose first two items are the fasta-style sample
        output path and the BED-style locations output path

    Raises RuntimeError if in_peaks contains no peaks.
    """
    out_sample, out_locations = out_files[:2]
    # close the peaks file once its widths have been collected
    with open(in_peaks) as peaks_file:
        widths = (stop - start
                  for chrom, start, stop, strand in readBedLines(peaks_file))
        peak_lengths = array('i', widths)
    if not peak_lengths:
        raise RuntimeError("Peaks file %s is empty!" % in_peaks)
    wb_genome = worldbase(cfg.get('DEFAULT', 'worldbase_genome'))
    # do the dance to map peaks back to their control raw reads; the
    # substitutions are order-dependent
    control_bed = in_peaks
    for pattern, replacement in (
            (r'treat', 'control'),
            (r'\.top[\d]+\.peaks$', ''),
            (r'_summits\.[\d]+_around', ''),
            (r'peaks', 'mapped_reads'),
            (r'\.(macs(14)*|arem|glitr)', '')):
        control_bed = re.sub(pattern, replacement, control_bed)
    with open(control_bed) as control_file:
        with open(out_locations, 'w') as outlocations:
            samples = sampling.sample_middles(
                wb_genome,
                peak_lengths,
                control_file,
                sampleSize=cfg.getint('motifs',
                                      'motif_significance_sample_size'))
            with open(out_sample, 'w') as outfile:
                for index, seq in enumerate(samples):
                    # repr() gives location, str() gives sequence
                    outfile.write('>%s_%s\n%s\n' %
                                  (index, repr(seq), str(seq)))
                    outlocations.write('\t'.join([
                        seq.id,
                        str(seq.start),
                        str(seq.stop),
                        str(index), '0', '+' if seq.orientation == 1 else '-'
                    ]) + '\n')
Example #19
0
def get_refseq_genes(_, out_genes):
    """Fetch the UCSC refGene table for the configured genome.

    Steps: wget the gzipped table into the current directory, gunzip it,
    then move the plain-text table to out_genes.  The first argument is
    unused.
    """
    source = ('http://hgdownload.cse.ucsc.edu/goldenPath/%s/database/refGene.txt.gz'
              % cfg.get('DEFAULT', 'genome'))
    sys_call('wget -N -P . %s' % source)
    sys_call('gunzip -f refGene.txt.gz')
    sys_call('mv refGene.txt %s' % out_genes)
Example #20
0
def discover_nmica_motifs(in_fasta, out_motifs):
    """Discover sequence motifs in peaks by running nestedMICA.

    nminfer writes 'motifs.xms' into the current directory; that file is
    renamed after in_fasta ('.fasta' -> '.motifs.xms'), parsed, and the
    parsed motifs are pickled to out_motifs.
    """
    cmd = 'nminfer -seqs %s %s ' % (in_fasta, cfg.get('motifs', 'nmica_params'))
    sys_call(cmd)
    motifs_name = in_fasta.replace('.fasta', '.motifs.xms')
    sys_call('mv motifs.xms %s' % motifs_name)
    motifs = sequence_motif.parse_xms_motifs(motifs_name)
    # write through a with-block so the pickle is flushed and closed
    with open(out_motifs, 'w') as outfile:
        pickle.dump(motifs, outfile)
Example #21
0
def get_peak_sequence(in_peaks, out_fasta):
    """Get a fasta file for peak summits.

    The BED file given to get_bed_sequence is out_fasta with '.fasta'
    removed; in_peaks itself is not read here.
    """
    summits_bed = out_fasta.replace('.fasta', '')
    genome = cfg.get('DEFAULT', 'worldbase_genome')
    command_line = '''--genome=%s %s %s''' % (genome, summits_bed, out_fasta)
    get_bed_sequence.main(shlex.split(command_line))
Example #22
0
def get_peak_sequence(in_peaks, out_fasta):
    """Write the fasta sequences of the peak summits.

    The input BED name is derived from out_fasta by dropping '.fasta';
    the in_peaks argument is not consulted.
    """
    in_summits = out_fasta.replace('.fasta', '')
    fill = (cfg.get('DEFAULT', 'worldbase_genome'), in_summits, out_fasta)
    argv = shlex.split('''--genome=%s %s %s''' % fill)
    get_bed_sequence.main(argv)
Example #23
0
def motif_select_random_seqs(in_fasta, out_pattern):
    """Split a fasta file into several chunks so motif discovery is easier.

    Every chunk '<name>.small_sample.<i>.fasta' is an independent random
    sample of up to ``[motifs] motif_chunk_size`` sequences.  A single chunk
    is produced when all sequences fit into one chunk; otherwise
    ``[motifs] motif_num_chunks`` chunks are produced.  out_pattern is not
    read by this function.
    """
    name = re.search(r'(.*)\.fasta', in_fasta).groups()[0]
    with open(in_fasta) as infile:
        seqs = list(parseFastaLines(infile))
    # getint, not get: an int-vs-str comparison is always True in Python 2,
    # which pinned num_chunks to 1, and a str chunk count breaks xrange().
    chunk_size = cfg.getint('motifs', 'motif_chunk_size')
    num_chunks = 1
    if len(seqs) > chunk_size:
        num_chunks = cfg.getint('motifs', 'motif_num_chunks')
    # get a random sample of peaks
    for i in xrange(num_chunks):
        with open(name + '.small_sample.%s.fasta' % i, 'w') as outfile:
            subset = random.sample(seqs, min(len(seqs), chunk_size))
            outfile.writelines('>%s\n%s\n' % (s[0].strip(), s[1].strip())
                               for s in subset)
Example #24
0
def deploy_track_files(in_files, out_header):
    """Copy UCSC tracks to the public url, then touch out_header.

    The destination is ``[visualization] remote_ssh_dir`` in 'host:directory'
    form.  The remote directory is created with ``mkdir -p`` before the
    first copy.

    Raises IndexError if the configured destination contains no ':'.
    """
    remote = cfg.get("visualization", "remote_ssh_dir")
    # split on the first ':' only so a directory containing ':' stays whole
    remote_parts = remote.split(":", 1)
    remote_host, remote_dir = remote_parts[0], remote_parts[1]
    dir_ready = False
    for in_track in in_files:
        if not dir_ready:
            # one mkdir is enough; still skipped when there is nothing to copy
            sys_call("ssh %s mkdir -p %s" % (remote_host, remote_dir))
            dir_ready = True
        sys_call("scp %s %s" % (in_track, remote))
    touch(out_header)
Example #25
0
def discover_nmica_motifs(in_fasta, out_motifs):
    """Discover sequence motifs in peaks by running nestedMICA.

    Runs nminfer on in_fasta, renames the resulting 'motifs.xms' to
    '<in_fasta minus .fasta>.motifs.xms', parses it and pickles the parsed
    motifs into out_motifs.
    """
    nmica_params = cfg.get('motifs', 'nmica_params')
    sys_call('nminfer -seqs %s %s ' % (in_fasta, nmica_params))
    motifs_name = in_fasta.replace('.fasta', '.motifs.xms')
    sys_call('mv motifs.xms %s' % motifs_name)
    motifs = sequence_motif.parse_xms_motifs(motifs_name)
    # the output handle used to be left open; close it deterministically
    with open(out_motifs, 'w') as pickle_file:
        pickle.dump(motifs, pickle_file)
Example #26
0
def discover_meme_motifs(in_fasta, out_motifs):
    """Discover sequence motifs in peaks by running meme.

    meme writes into the '<out_motifs>_meme_out' directory; its meme.txt is
    parsed and the parsed motifs are pickled to out_motifs.
    """
    cmd = 'meme %s %s -oc %s_meme_out ' % (
        in_fasta, cfg.get('motifs', 'meme_params'), out_motifs)
    sys_call(cmd)
    motifs = sequence_motif.parseMemeMotifs('%s_meme_out/meme.txt' %
                                            out_motifs)
    # write through a with-block so the pickle is flushed and closed
    with open(out_motifs, 'w') as outfile:
        pickle.dump(motifs, outfile)
Example #27
0
def maq_map_to_bed(in_map, out_bed):
    """Convert a maq map file to BED format.

    in_map -- tab-separated maq map text; columns 2-4 supply chrom, start
        and strand, and column 15 is used to measure the read length
    out_bed -- path of the 6-column BED file to write

    The BED name and score columns are the constants configured as
    ``[mapping] maq_bed_name`` and ``[mapping] maq_bed_score``.
    """
    # constant for every row, so look them up once
    name = str(cfg.get('mapping', 'maq_bed_name'))
    score = str(cfg.get('mapping', 'maq_bed_score'))
    with open(in_map) as infile:
        # use (up to) the first ten reads to determine read length; files
        # with fewer than ten reads used to raise IndexError here
        sampled = []
        for _ in range(10):
            first_line = infile.readline()
            if not first_line:
                break
            sampled.append(len(first_line.split('\t')[14]))
        # floor division keeps the integer average Python 2's '/' produced
        read_length = sum(sampled) // len(sampled) if sampled else 0
        infile.seek(0)
        with open(out_bed, 'w') as outfile:
            for line in infile:
                fields = line.strip().split('\t')
                chrom, start, strand = fields[1], fields[2], fields[3]
                stop = int(start) + read_length + 1  # stop is fencepost after
                outfile.write('\t'.join([chrom, str(start), str(stop),
                                         name, score, str(strand)])
                              + '\n')
Example #28
0
def discover_meme_motifs(in_fasta, out_motifs):
    """Discover sequence motifs in peaks by running meme.

    Runs meme with ``[motifs] meme_params`` into '<out_motifs>_meme_out',
    parses the meme.txt found there and pickles the motifs to out_motifs.
    """
    meme_dir = '%s_meme_out' % out_motifs
    cmd = 'meme %s %s -oc %s ' % (in_fasta,
                                  cfg.get('motifs', 'meme_params'),
                                  meme_dir)
    sys_call(cmd)
    motifs = sequence_motif.parseMemeMotifs('%s/meme.txt' % meme_dir)
    # the output handle used to be left open; close it deterministically
    with open(out_motifs, 'w') as pickle_file:
        pickle.dump(motifs, pickle_file)
Example #29
0
def maq_map_to_bed(in_map, out_bed):
    """Convert a maq map file to BED format.

    Each input row becomes one 6-column BED row: chrom, start and strand are
    taken from columns 2-4 of the map file, stop is start plus the estimated
    read length plus one, and name/score are the ``[mapping] maq_bed_name``
    and ``[mapping] maq_bed_score`` settings.
    """
    bed_name = str(cfg.get('mapping', 'maq_bed_name'))
    bed_score = str(cfg.get('mapping', 'maq_bed_score'))
    with open(in_map) as infile:
        # Average the length of column 15 over the first ten reads, or over
        # all of them when the file is shorter (this used to crash).
        lengths = []
        while len(lengths) < 10:
            probe = infile.readline()
            if not probe:
                break
            lengths.append(len(probe.split('\t')[14]))
        if lengths:
            # integer average, as Python 2's '/' gave for ints
            read_length = sum(lengths) // len(lengths)
        else:
            read_length = 0
        infile.seek(0)  # rewind: the sampled reads must be converted too
        with open(out_bed, 'w') as outfile:
            for line in infile:
                fields = line.strip().split('\t')
                chrom, start, strand = fields[1], fields[2], fields[3]
                stop = int(start) + read_length + 1  # stop is fencepost after
                row = [chrom, str(start), str(stop),
                       bed_name, bed_score, str(strand)]
                outfile.write('\t'.join(row) + '\n')
Example #30
0
def deploy_track_files(in_files, out_header):
    """Copy UCSC tracks to the public url, then touch out_header.

    ``[visualization] remote_ssh_dir`` names the destination as
    'host:directory'; the directory is created on the host before the first
    track is copied with scp.

    Raises IndexError if the configured destination contains no ':'.
    """
    remote = cfg.get('visualization', 'remote_ssh_dir')
    # only the first ':' separates host from path
    pieces = remote.split(':', 1)
    remote_host = pieces[0]
    remote_dir = pieces[1]
    need_mkdir = True
    for in_track in in_files:
        if need_mkdir:
            # create the directory once instead of once per track
            sys_call('ssh %s mkdir -p %s' % (remote_host, remote_dir))
            need_mkdir = False
        sys_call('scp %s %s' % (in_track, remote))
    touch(out_header)
Example #31
0
def run_mosaik_align(in_files, out_align):
    """Align reads to the reference using MosaikAligner.

    in_files holds five items: the reads archive, the genome archive, the
    jump database '_keys.jmp' file, and two entries that are ignored.
    """
    # Example invocation:
    # MosaikAligner -in sequence_archives/c_elegans_chr2_test.dat -out sequence_archives/c_elegans_chr2_test_aligned.dat -ia reference/c.elegans_chr2.dat -hs 14 -act 17 -mm 2 -m unique
    in_reads, in_genome_dat, jump_keys, _, _ = in_files
    # -j is given the jump file name with its '_keys.jmp' suffix removed
    jump_stem = jump_keys.replace('_keys.jmp', '')
    hash_size = cfg.getint('mapping', 'mosaik_hash_size')
    extra_params = cfg.get('mapping', 'mosaik_params')
    sys_call('MosaikAligner -in %s -ia %s -j %s -out %s -hs %s  %s' % (
        in_reads, in_genome_dat, jump_stem, out_align,
        hash_size, extra_params))
Example #32
0
def run_mosaik_align(in_files, out_align):
    """Run MosaikAligner to align reads against the reference.

    in_files is a 5-item sequence (reads, genome archive, jump keys file and
    two unused entries); the alignment is written to out_align.
    """
    # e.g. MosaikAligner -in sequence_archives/c_elegans_chr2_test.dat -out sequence_archives/c_elegans_chr2_test_aligned.dat -ia reference/c.elegans_chr2.dat -hs 14 -act 17 -mm 2 -m unique
    in_reads, in_genome_dat, in_genome_jump = in_files[0], in_files[1], in_files[2]
    if len(in_files) != 5:
        # keep the strict 5-way unpacking failure of the original
        _, _, _, _, _ = in_files
    in_genome_jump = in_genome_jump.replace('_keys.jmp', '')
    template = 'MosaikAligner -in %s -ia %s -j %s -out %s -hs %s  %s'
    values = (in_reads, in_genome_dat, in_genome_jump, out_align,
              cfg.getint('mapping', 'mosaik_hash_size'),
              cfg.get('mapping', 'mosaik_params'))
    sys_call(template % values)
def run_macs14(in_files, out_peaks, max_fdr):
    """Call peaks using MACS (v1.4). Apply a maximum FDR threshold.

    Runs ``macs14`` on the treatment/control pair, then writes two BED-style
    files:

    * ``out_peaks`` -- rows of ``<name>_peaks.xls`` whose FDR is at most
      max_fdr, with the score clamped to 0..1000 and strand fixed to ``+``.
    * ``out_peaks + '_summits.<size>_around'`` -- rows of
      ``<name>_summits.bed`` widened by half of ``[peaks] peak_summit_size``
      on each side, kept only when their row index was kept in the first
      file.

    in_files -- sequence whose first item is (treatment reads, control reads);
        the treatment name must match ``*.treat*.mapped_reads``
    out_peaks -- path of the filtered peak file to write
    max_fdr -- largest FDR value a peak may have to be kept
    """
    in_treat, in_control = in_files[0]
    # name MACS output after the treatment file, minus the .mapped_reads suffix
    matches = re.search(r'(.*\.treat)(.*)\.mapped_reads', in_treat).groups()
    name = matches[0] + matches[1] + '.macs14.peaks'
    cmd = 'macs14 -t %s -c %s --name=%s %s --diag' % (in_treat, in_control, name,
                                             cfg.get('peaks', 'macs14_params'))
    sys_call(cmd)
    # enumerate() indices of the xls rows that passed the FDR filter
    peaks_to_keep = set()
    # convert to proper bedfile- ints for score and + for strand
    with open(out_peaks, 'w') as outfile:
        with open(name + '_peaks.xls') as infile:
            # index counts every line that passes bedCommentFilter, including
            # the header row if the filter lets it through
            for index, line in enumerate(itertools.ifilter(bedCommentFilter,
                                                                    infile)):
                fields = line.split('\t')
                if fields[0] == 'chr':
                    continue # skip the column-header row
                start = str(max(0, int(fields[1])))  # never below 0
                # column 7 is used as the score, clamped to 0..1000
                score = str(max(0, min(1000, int(float(fields[6])))))
                # column 9 is compared against the FDR threshold
                fdr = float(fields[8])
                if fdr <= max_fdr:
                    outfile.write('\t'.join([fields[0], start, fields[2],
                                        'MACS14_peak_%s' % (index + 1), score])
                                                + '\t+\n')
                    peaks_to_keep.add(index)
    # take region surrounding the peak summit
    summit_size = cfg.getint('peaks', 'peak_summit_size')
    with open(out_peaks + '_summits.%s_around' % \
                        cfg.get('peaks', 'peak_summit_size'), 'w') as outfile:
        with open(name + '_summits.bed') as infile:
            # NOTE(review): rows are matched to the xls rows purely by
            # enumerate() index. If the xls header row is counted above but
            # this file has no such row, the two indices are off by one --
            # confirm against bedCommentFilter and real MACS output.
            for index, line in enumerate(itertools.ifilter(bedCommentFilter,
                                                                    infile)):
                fields = line.strip().split('\t')
                if fields[0] == 'chr':
                    continue # skip the column-header row
                # score is number of reads at summit
                score = str(max(0, min(1000, int(float(fields[-1])))))
                # '/' on ints: integer division under Python 2, which this
                # code relies on (it also uses itertools.ifilter)
                start = str(max(0, int(fields[1]) - summit_size / 2))
                stop = str(int(fields[2]) + summit_size / 2)
                if index in peaks_to_keep:
                    # NOTE(review): names use the 'MACS_peak_' prefix here but
                    # 'MACS14_peak_' in out_peaks -- confirm this is intended
                    outfile.write('\t'.join([fields[0], start, stop,
                                        'MACS_peak_%s' % (index + 1), score])
                                            + '\t+\n')
Example #34
0
def get_microRNA(_, out_mirna):
    """Retrieve microRNA genes (the wgRna table) from UCSC.

    The table for the configured genome is downloaded with wget, unzipped to
    'wgRna.txt' in the current directory and rewritten to out_mirna as
    6-column BED: chrom, start, end, '<name>_<type>', score, strand.  The
    first argument is ignored.
    """
    url = 'http://hgdownload.cse.ucsc.edu/goldenPath/%s/database/wgRna.txt.gz'
    url = url % cfg.get('DEFAULT', 'genome')
    sys_call('wget -N -P . %s' % url)
    sys_call('gunzip -f wgRna.txt.gz')
    with open(out_mirna, 'w') as outfile:
        # the table used to be opened inline and was never closed
        with open('wgRna.txt') as infile:
            for line in infile:
                # every row must have exactly these ten columns; the locals
                # avoid shadowing the builtins bin and type
                (_bin, chrom, start, end, name, score, strand,
                 _thick_start, _thick_end, rna_type) = \
                    line.strip().split('\t')
                outfile.write('\t'.join([chrom, start, end,
                                         name + '_' + rna_type,
                                         score, strand]) + '\n')
Example #35
0
def consensus_enrichment(in_files, out_enrichment):
    """Measure a consensus motif's enrichment against genomic samples.

    in_files starts with the background samples and the peaks; every
    remaining entry is a consensus file, and motif_significance.main is run
    once for each of them with out_enrichment as the output file.
    """
    in_samples, in_peaks = in_files[:2]
    for consensus_file in in_files[2:]:
        genome = cfg.get('DEFAULT', 'worldbase_genome')
        command_line = '''%s --consensus_file=%s --bg_samples=%s --genome=%s
                              --output_file=%s ''' % (
            in_peaks, consensus_file, in_samples, genome, out_enrichment)
        motif_significance.main(shlex.split(command_line))
Example #36
0
def bed_to_bedgraph_by_strand(in_files, out_bedgraphs):
    """Extend reads to the full fragment length and create a bedgraph per strand.

    in_files -- (in_bed, in_chrom_sizes)
    out_bedgraphs -- sequence of two output paths: item 0 gets the '+' strand
        coverage, item 1 the '-' strand coverage

    Both files come from a slopBed | awk | bedItemOverlapCount pipeline run
    through sys_call.
    """
    in_bed, in_chrom_sizes = in_files
    # extend each read by the unsequenced remainder of its fragment
    extend_by = (cfg.getint('DEFAULT', 'fragment_size')
                 - cfg.getint('DEFAULT', 'tag_size'))

    def strand_command(strand, out_path):
        # shell pipeline that keeps only reads on the given strand
        return ("""slopBed -i %s -s -r %s -l 0 -g %s | awk '{if ($6 == "%s") print $0}' | """
                'bedItemOverlapCount %s -chromSize=%s.chrom.sizes stdin > %s') % (
                    in_bed, extend_by, in_chrom_sizes, strand,
                    cfg.get('DEFAULT', 'genome'), genome_path(), out_path)

    # The body used to index an undefined name (out_bedgraph), which raised
    # NameError; the parameter is out_bedgraphs.
    sys_call(strand_command('+', out_bedgraphs[0]))
    sys_call(strand_command('-', out_bedgraphs[1]))
Example #37
0
def consensus_enrichment(in_files, out_enrichment):
    """Determine how enriched each consensus motif is vs. genomic samples.

    The first two entries of in_files are the background samples and the
    peaks; the rest are consensus files, processed one at a time.
    """
    in_samples, in_peaks = in_files[:2]
    in_consensuses = in_files[2:]
    for in_con in in_consensuses:
        fill = (in_peaks, in_con, in_samples,
                cfg.get('DEFAULT', 'worldbase_genome'), out_enrichment)
        argv = shlex.split(
            '''%s --consensus_file=%s --bg_samples=%s --genome=%s
                              --output_file=%s ''' % fill)
        motif_significance.main(argv)
Example #38
0
def bed_to_bedgraph(in_files, out_bedgraph):
    """Extend reads to the full fragment length and create a bedgraph.

    in_files is (in_bed, in_chrom_sizes).  Reads are extended by
    fragment_size - tag_size with slopBed and the coverage computed by
    bedItemOverlapCount is written to out_bedgraph.
    """
    in_bed, in_chrom_sizes = in_files
    extend_by = (cfg.getint('DEFAULT', 'fragment_size')
                 - cfg.getint('DEFAULT', 'tag_size'))
    genome = cfg.get('DEFAULT', 'genome')
    chrom_prefix = genome_path()
    pipeline = ('slopBed -i %s -s -r %s -l 0 -g %s | '
                'bedItemOverlapCount %s -chromSize=%s.chrom.sizes stdin > %s')
    sys_call(pipeline % (in_bed, extend_by, in_chrom_sizes,
                         genome, chrom_prefix, out_bedgraph))
Example #39
0
def run_bowtie(in_fastq, out_bowtie):
    """Align reads to the reference using Bowtie.

    The gzipped in_fastq is decompressed with ``zcat`` and piped into
    ``bowtie`` (reading from '-'), which writes to out_bowtie.

    Raises CalledProcessError if either stage of the pipeline exits with a
    non-zero status.
    """
    cmd1 = 'zcat %s' % in_fastq
    cmd2 = 'bowtie %s %s - %s' % (genome_path(),
                                  cfg.get('mapping', 'bowtie_params'),
                                  out_bowtie)
    p1 = Popen(cmd1, stdout=PIPE, shell=True)
    p2 = Popen(cmd2, stdin=p1.stdout, shell=True)
    # Close the parent's copy of the pipe so p1 gets SIGPIPE if p2 exits first.
    p1.stdout.close()
    p2.communicate()
    # returncode stays None until the process has been waited on, so without
    # this wait() a failing zcat was never reported.
    p1.wait()
    if p1.returncode:
        raise CalledProcessError(p1.returncode, cmd1)
    if p2.returncode:
        raise CalledProcessError(p2.returncode, cmd2)
Example #40
0
def motif_enrichment_genomic(in_files, out_pattern, out_template):
    """Determine a motif's enrichment vs. genomic samples

    For each peak file that has a control named
    ``<peak_file>.similar.genomic.sample`` among the inputs,
    motif_significance.main is run once per z-score listed (comma separated)
    in the ``motif_zscores`` option of the ``motifs`` config section.

    Args:
        in_files: ``[motif_file, [peak_files, other, ...]]``.  Entries after
            the first in the inner list whose names end in 'sample' are the
            candidate controls.
        out_pattern: not used by this function.
        out_template: %-format string that receives the z-score and yields
            the output file name.
    """
    in_motifs = in_files[0]
    in_peaks = in_files[1][0]
    # A real list (not a lazy filter object) because it is rescanned for
    # every peak file.
    in_control_samples = [name for name in in_files[1][1:]
                          if name.endswith('sample')]

    for peak_file in in_peaks:
        # get the similar control data
        wanted = peak_file + '.similar.genomic.sample'
        for c in in_control_samples:
            if c != wanted:
                continue
            for zscore in cfg.get('motifs', 'motif_zscores').split(','):
                outfile = out_template % (zscore)
                args = shlex.split('%s --motif_file=%s --bg_samples=%s '
                                   '--genome=%s --output_file=%s --zscore=%s' %
                                   (peak_file, in_motifs, c,
                                    cfg.get('DEFAULT', 'worldbase_genome'),
                                    outfile, zscore))
                # call form prints the same text under Python 2 and 3
                print(args)
                motif_significance.main(args)
Example #41
0
def bed_to_bedgraph(in_files, out_bedgraph):
    """Extend reads to the full fragment length and create a bedgraph from them

    in_files unpacks to ``(bed_file, chrom_sizes_file)``.  A two-stage shell
    pipeline is built (``slopBed`` feeding ``bedItemOverlapCount``) and
    handed to sys_call; the final stage redirects into out_bedgraph.
    """
    in_bed, in_chrom_sizes = in_files
    stages = [
        'slopBed -i %s -s -r %s -l 0 -g %s' % (
            in_bed,
            # number of bases each read is extended by
            cfg.getint('DEFAULT', 'fragment_size') - cfg.getint('DEFAULT', 'tag_size'),
            in_chrom_sizes,
        ),
        'bedItemOverlapCount %s -chromSize=%s.chrom.sizes stdin > %s' % (
            cfg.get('DEFAULT', 'genome'),
            genome_path(),
            out_bedgraph,
        ),
    ]
    sys_call(' | '.join(stages))
Example #42
0
def run_bowtie(in_fastq, out_bowtie):
    """Align reads to reference using Bowtie

    Runs ``zcat in_fastq`` and feeds its stdout to ``bowtie``.  The bowtie
    command line is made of genome_path(), the ``bowtie_params`` option from
    the ``mapping`` config section, a literal ``-`` and out_bowtie.

    Raises:
        CalledProcessError: when the aligner or the decompressor exits with a
            non-zero status.
    """
    cmd1 = 'zcat %s' % in_fastq
    cmd2 = 'bowtie %s %s - %s' % (
        genome_path(), cfg.get('mapping', 'bowtie_params'), out_bowtie)
    p1 = Popen([cmd1], stdout=PIPE, shell=True)
    p2 = Popen([cmd2], stdin=p1.stdout, shell=True)
    # The parent must not keep the pipe's read end open, or p1 may block on a
    # full pipe after p2 has gone away.
    p1.stdout.close()
    p2.communicate()
    # Popen.returncode stays None until wait()/poll(); reap p1 so that its
    # exit status can actually be checked.
    p1.wait()
    # Report the downstream failure first: it is usually the root cause when
    # both stages fail.
    if p2.returncode:
        raise CalledProcessError(p2.returncode, cmd2)
    if p1.returncode:
        raise CalledProcessError(p1.returncode, cmd1)
def clip_adapter(in_fastq, out_fastq):
    """Remove adapter sequence from raw reads

    Pipes ``cat in_fastq`` into ``fastx_clipper``, which is told to write to
    out_fastq (``-o``) and is given the ``adapter_sequence`` option of the
    ``filtering`` config section (``-a``).

    Raises:
        CalledProcessError: if either stage of the pipeline exits non-zero.
    """
    cmd1 = 'cat %s' % in_fastq
    cmd2 = 'fastx_clipper -o %s -a %s' % (
        out_fastq, cfg.get('filtering', 'adapter_sequence'))
    p1 = Popen([cmd1], stdout=PIPE, shell=True)
    p2 = Popen([cmd2], stdin=p1.stdout, shell=True)
    # Release the parent's handle on the pipe so the producer is not left
    # blocked if the consumer terminates early.
    p1.stdout.close()
    p2.communicate()
    # Without waiting, p1.returncode is still None and a failing `cat`
    # (e.g. missing input file) would go unnoticed.
    p1.wait()
    # Consumer first, so its failure is not masked by the upstream broken pipe.
    if p2.returncode:
        raise CalledProcessError(p2.returncode, cmd2)
    if p1.returncode:
        raise CalledProcessError(p1.returncode, cmd1)
Example #44
0
def clip_adapter(in_fastq, out_fastq):
    """Remove adapter sequence from raw reads

    The raw reads are streamed with ``cat`` into ``fastx_clipper``; the
    clipper's output file is out_fastq and the adapter is read from the
    ``adapter_sequence`` option in the ``filtering`` config section.

    Raises:
        CalledProcessError: when ``fastx_clipper`` or ``cat`` exits with a
            non-zero status.
    """
    cmd1 = 'cat %s' % in_fastq
    cmd2 = 'fastx_clipper -o %s -a %s' % (out_fastq,
                                    cfg.get('filtering', 'adapter_sequence'))
    p1 = Popen([cmd1], stdout=PIPE, shell=True)
    p2 = Popen([cmd2], stdin=p1.stdout, shell=True)
    # Only p2 should hold the read end of the pipe from here on.
    p1.stdout.close()
    p2.communicate()
    # Reap the producer: returncode is None until the process is waited on,
    # so the p1 check below would otherwise never trigger.
    p1.wait()
    # Downstream status is checked first; an upstream broken pipe is usually
    # only a consequence of it.
    if p2.returncode:
        raise CalledProcessError(p2.returncode, cmd2)
    if p1.returncode:
        raise CalledProcessError(p1.returncode, cmd1)
Example #45
0
def get_microRNA(_, out_mirna):
    """Retrieve microRNA genes from UCSC

    Downloads ``wgRna.txt.gz`` for the genome named in the DEFAULT config
    section into the current directory, decompresses it, and rewrites every
    row into out_mirna as six tab-separated columns:
    chrom, start, end, ``<name>_<type>``, score, strand.

    Raises:
        ValueError: from the tuple unpacking, if a row does not split into
            exactly ten tab-separated fields.
    """
    url = 'http://hgdownload.cse.ucsc.edu/goldenPath/%s/database/wgRna.txt.gz'
    url = url % cfg.get('DEFAULT', 'genome')
    sys_call('wget -N -P . %s' % url)
    sys_call('gunzip -f wgRna.txt.gz')
    with open(out_mirna, 'w') as outfile:
        # open the table in a with-block so the handle is closed
        # deterministically instead of being left to the garbage collector
        with open('wgRna.txt') as infile:
            for line in infile:
                # underscore-prefixed fields are unpacked only to enforce the
                # ten-column layout; names avoid shadowing bin() and type()
                (_bin, chrom, start, end, name, score, strand, _thick_start,
                 _thick_end, rna_type) = line.strip().split('\t')
                outfile.write('\t'.join(
                    [chrom, start, end, name + '_' + rna_type, score, strand])
                    + '\n')
Example #46
0
def run_ssaha2(in_fastq, out_ssaha2):
    """Run ssaha2 against the prebuilt hash table from get_ssaha2_hashtable.

    The ssaha2 command maps DNA sequence reads onto a genomic reference
    sequence using a combination of word hashing and dynamic programming
    (from the ssaha2 manual).  The hash table name comes from the
    ``ssaha2_hash_name`` option of the ``mapping`` config section and the
    result is written to out_ssaha2.
    """
    #TODO: add useful parameters to cmd and config file
    hash_name = cfg.get('mapping', 'ssaha2_hash_name')
    sys_call('ssaha2 -outfile %s -disk 1 -save %s %s'
             % (out_ssaha2, hash_name, in_fastq))
Example #47
0
def run_ssaha2(in_fastq, out_ssaha2):
    """Map reads with ssaha2 using the hash table built by get_ssaha2_hashtable.

    ssaha2 maps DNA sequence reads onto a genomic reference sequence using a
    combination of word hashing and dynamic programming (from the ssaha2
    manual).  The command is assembled from out_ssaha2, the
    ``ssaha2_hash_name`` config option (section ``mapping``) and in_fastq,
    then executed through sys_call.
    """
    #TODO: add useful parameters to cmd and config file
    template = 'ssaha2 -outfile %s -disk 1 -save %s %s'
    values = (out_ssaha2, cfg.get('mapping', 'ssaha2_hash_name'), in_fastq)
    sys_call(template % values)
def run_macs14_no_control(in_treat, out_peaks):
    """Call peaks using MACS (v1.4) without control data

    Runs ``macs14`` with out_peaks as the ``--name`` prefix and the
    ``macs14_params`` option of the ``peaks`` config section, then derives
    two files from its output:

    * out_peaks: rows of ``<out_peaks>_peaks.xls`` rewritten as six-column
      BED lines (start clamped at 0, integer score clamped to 0..1000,
      strand always '+').
    * ``<out_peaks>_summits.<N>_around``: rows of ``<out_peaks>_summits.bed``
      widened by half of N on each side, where N is the ``peak_summit_size``
      option of the ``peaks`` config section.
    """
    cmd = 'macs14 -t %s --name=%s %s' % (in_treat, out_peaks,
                                         cfg.get('peaks', 'macs14_params'))
    sys_call(cmd)
    # enumerate indices of the xls rows that were written out below
    peaks_to_keep = set()
    # convert to proper bedfile- ints for score and + for strand
    with open(out_peaks, 'w') as outfile:
        with open(out_peaks + '_peaks.xls') as infile:
            # bedCommentFilter is defined elsewhere; presumably it drops
            # comment lines - confirm.  itertools.ifilter is Python 2 only.
            # The header row is skipped *after* enumeration, so it still
            # consumes an index value.
            for index, line in enumerate(itertools.ifilter(bedCommentFilter,
                                                                    infile)):
                fields = line.split('\t')
                if fields[0] == 'chr':
                    continue # skip header
                start = str(max(0, int(fields[1])))
                # column 7 of the xls row is used as the score
                score = str(max(0, min(1000, int(float(fields[6])))))
                outfile.write('\t'.join([fields[0], start, fields[2],
                                        'MACS14_peak_%s' % (index + 1), score])
                                                + '\t+\n')
                peaks_to_keep.add(index)
    # take region surrounding the peak summit
    summit_size = cfg.getint('peaks', 'peak_summit_size')
    # the file name uses the raw config text, the arithmetic the parsed int
    with open(out_peaks + '_summits.%s_around' % \
                        cfg.get('peaks', 'peak_summit_size'), 'w') as outfile:
        with open(out_peaks + '_summits.bed') as infile:
            # NOTE(review): rows are matched to the xls rows purely by
            # enumerate index; this assumes both files line up row for row
            # after filtering (including any header row) - confirm.
            for index, line in enumerate(itertools.ifilter(bedCommentFilter,
                                                                    infile)):
                fields = line.strip().split('\t')
                if fields[0] == 'chr':
                    continue # skip header
                # score is number of reads at summit
                score = str(max(0, min(1000, int(float(fields[-1])))))
                # `/` on two ints is floor division under Python 2
                start = str(max(0, int(fields[1]) - summit_size / 2))
                stop = str(int(fields[2]) + summit_size / 2)
                if index in peaks_to_keep:
                    # NOTE(review): the name prefix here is 'MACS_peak_' while
                    # the peak file above uses 'MACS14_peak_' - confirm that
                    # the difference is intended.
                    outfile.write('\t'.join([fields[0], start, stop,
                                        'MACS_peak_%s' % (index + 1), score])
                                            + '\t+\n')
Example #49
0
def motif_enrichment_genomic(in_files, out_pattern, out_template):
    """Determine a motif's enrichment vs. genomic samples

    Runs motif_significance.main for every combination of (peak file with a
    matching ``<peak_file>.similar.genomic.sample`` control, z-score), where
    the z-scores are the comma-separated ``motif_zscores`` option of the
    ``motifs`` config section.

    Args:
        in_files: ``[motif_file, [peak_files, other, ...]]``; inner entries
            after the first that end in 'sample' are candidate controls.
        out_pattern: not used by this function.
        out_template: %-format string; formatted with the z-score to obtain
            the ``--output_file`` value.
    """
    in_motifs = in_files[0]
    in_peaks = in_files[1][0]
    # Materialised as a list: it is scanned once per peak file, which a
    # one-shot filter iterator would not survive.
    in_control_samples = [candidate for candidate in in_files[1][1:]
                          if candidate.endswith('sample')]

    for peak_file in in_peaks:
        # get the similar control data
        expected_control = peak_file + '.similar.genomic.sample'
        cur_control = [candidate for candidate in in_control_samples
                       if candidate == expected_control]
        for c in cur_control:
            for zscore in cfg.get('motifs', 'motif_zscores').split(','):
                outfile = out_template % (zscore)
                args = shlex.split(
                    '%s --motif_file=%s --bg_samples=%s '
                    '--genome=%s --output_file=%s --zscore=%s' %
                    (peak_file, in_motifs, c,
                     cfg.get('DEFAULT', 'worldbase_genome'), outfile, zscore))
                # call form: same output on Python 2, valid syntax on Python 3
                print(args)
                motif_significance.main(args)
Example #50
0
def run_tophat(in_fastq, out_tophat):
    """Gapped alignment of reads to reference using TopHat

    Streams ``zcat in_fastq`` into ``tophat``.  The tophat command line is
    built from genome_path(), a literal ``-``, the ``bowtie_params`` option
    of the ``mapping`` config section, ``--output-dir=<in_fastq>_tophat_out``
    and the hard-coded annotation ``hg19.refseq_genes.gff``.  out_tophat is
    not used when building the command.

    Raises:
        CalledProcessError: if either stage of the pipeline exits non-zero.
    """
    cmd1 = 'zcat %s' % in_fastq
    cmd2 = 'tophat %s - %s --output-dir=%s --GTF %s' % (
        genome_path(),
        cfg.get('mapping', 'bowtie_params'),
        '%s_tophat_out' % in_fastq,
        'hg19.refseq_genes.gff')
    p1 = Popen([cmd1], stdout=PIPE, shell=True)
    p2 = Popen([cmd2], stdin=p1.stdout, shell=True)
    # Close the parent's copy of the read end so the producer cannot block
    # forever on a full pipe once the consumer has exited.
    p1.stdout.close()
    p2.communicate()
    # returncode is None until the child is waited on; reap p1 so a failed
    # decompression is actually detected.
    p1.wait()
    # Consumer first, so its failure is not reported as an upstream broken pipe.
    if p2.returncode:
        raise CalledProcessError(p2.returncode, cmd2)
    if p1.returncode:
        raise CalledProcessError(p1.returncode, cmd1)
Example #51
0
def run_tophat(in_fastq, out_tophat):
    """Gapped alignment of reads to reference using TopHat

    ``zcat in_fastq`` is piped into ``tophat``, which is given genome_path(),
    a literal ``-``, the ``bowtie_params`` option (config section
    ``mapping``), an output directory named ``<in_fastq>_tophat_out`` and the
    fixed GTF file ``hg19.refseq_genes.gff``.  The out_tophat argument does
    not appear in the command.

    Raises:
        CalledProcessError: when ``tophat`` or ``zcat`` exits with a non-zero
            status.
    """
    cmd1 = 'zcat %s' % in_fastq
    cmd2 = 'tophat %s - %s --output-dir=%s --GTF %s' % (genome_path(),
                                  cfg.get('mapping', 'bowtie_params'),
                                  '%s_tophat_out' % in_fastq,
                                  'hg19.refseq_genes.gff')
    p1 = Popen([cmd1], stdout=PIPE, shell=True)
    p2 = Popen([cmd2], stdin=p1.stdout, shell=True)
    # From here on only p2 should hold the pipe's read end.
    p1.stdout.close()
    p2.communicate()
    # Collect p1's exit status; it is never set unless the process is waited
    # on, so the check below would otherwise be dead code.
    p1.wait()
    # Downstream status first: it is the likelier root cause when both fail.
    if p2.returncode:
        raise CalledProcessError(p2.returncode, cmd2)
    if p1.returncode:
        raise CalledProcessError(p1.returncode, cmd1)
Example #52
0
def make_track_headers(in_files, out_header):
    """For all the visualization files, create UCSC track headers

    Writes one ``track ...`` line per input file to out_header.  The track
    URL is the ``public_url_base`` option of the ``visualization`` config
    section joined to the file name with '/', and is also printed to stdout.

    Raises:
        RuntimeError: if a file name ends in neither '.bigwig' nor '.bigbed'.
            out_header has already been opened for writing at that point.
    """
    with open(out_header, 'w') as outfile:
        for in_track in in_files:
            if in_track.endswith('.bigwig'):
                track_extras = 'type=bigWig'
            elif in_track.endswith('.bigbed'):
                track_extras = 'type=bigBed itemRgb="On"'
            else:
                raise RuntimeError("Unrecognized file type: %s" % in_track)
            url = cfg.get('visualization', 'public_url_base') + '/' + in_track
            # call form prints the same text on Python 2 and parses on Python 3
            print(url)
            # remove cruft from the names: each run of the listed words
            # and/or dots collapses to a single space
            short_name = re.sub(r'(mapped_reads|clipped|sorted|colored|\.)+',
                                ' ', in_track)
            track_str = 'track %s name="%s" description="%s" ' \
                        'bigDataUrl=%s\n' % (track_extras, short_name,
                                             short_name, url)
            outfile.write(track_str)
Example #53
0
def gene_ontology(in_peaks, out_files):
    """Build the BioConductor command that annotates peaks with nearby genes

    Assembles a shell command that pipes an R script (ChIPpeakAnno
    ``annotatePeakInBatch`` against TSS annotation from the biomart dataset
    named by the ``R_mart`` option of the DEFAULT config section) into R.
    The command is only printed, not executed; afterwards out_raw is created
    via touch().

    Args:
        in_peaks: peak table read by the R script.
        out_files: triple ``(out_genes, out_go, out_raw)``.
    """
    out_genes, out_go, out_raw = out_files
    # The template has exactly four placeholders: peak table, biomart dataset,
    # write.table target and the redirect of R's stdout.  Supplying a fifth
    # value made the formatting raise TypeError ("not all arguments
    # converted") on every call.
    # NOTE(review): out_raw is left out because it is produced by touch()
    # below; that out_go is the intended redirect target is an assumption -
    # confirm.
    cmd = """echo '
    peaks = read.table("%s", header=FALSE, sep="\t");
    peaks = data.frame(chr=as.factor(peaks[,1]), start=as.numeric(peaks[,2]),
                        end=as.numeric(peaks[,3]));
    peaks = RangedData(IRanges(start=peaks[,2], end=peaks[,3]), space=peaks[,1])
    source("http://bioconductor.org/biocLite.R");
    biocLite("ChIPpeakAnno");
    library(ChIPpeakAnno);
    mart<-useMart(biomart="ensembl",dataset="%s");
    tss = getAnnotation(mart, featureType="TSS");
    annopeaks = annotatePeakInBatch(peaks[, ], AnnotationData=tss);
    write.table(annopeaks, file="%s", sep="\t");
    ' | R --vanilla --slave > %s""" % (in_peaks, cfg.get(
        'DEFAULT', 'R_mart'), out_genes, out_go)
    # call form prints the same text on Python 2 and parses on Python 3
    print(cmd)
    touch(out_raw)
Example #54
0
def gene_ontology(in_peaks, out_files):
    """Build the BioConductor command that annotates peaks with nearby genes

    The R script in the command reads in_peaks, fetches TSS annotation from
    the biomart dataset named by the ``R_mart`` option (DEFAULT config
    section), runs ChIPpeakAnno's ``annotatePeakInBatch`` and writes the
    table with ``write.table``.  This function prints the command without
    running it, then creates out_raw via touch().

    Args:
        in_peaks: peak table read by the R script.
        out_files: triple ``(out_genes, out_go, out_raw)``.
    """
    out_genes, out_go, out_raw = out_files
    # Four placeholders in the template, so exactly four values: passing all
    # three outputs as well made the % operator raise TypeError every time.
    # NOTE(review): out_raw is dropped from the tuple since touch() creates it
    # below; mapping out_go to R's stdout redirect is an assumption - confirm.
    cmd = """echo '
    peaks = read.table("%s", header=FALSE, sep="\t");
    peaks = data.frame(chr=as.factor(peaks[,1]), start=as.numeric(peaks[,2]),
                        end=as.numeric(peaks[,3]));
    peaks = RangedData(IRanges(start=peaks[,2], end=peaks[,3]), space=peaks[,1])
    source("http://bioconductor.org/biocLite.R");
    biocLite("ChIPpeakAnno");
    library(ChIPpeakAnno);
    mart<-useMart(biomart="ensembl",dataset="%s");
    tss = getAnnotation(mart, featureType="TSS");
    annopeaks = annotatePeakInBatch(peaks[, ], AnnotationData=tss);
    write.table(annopeaks, file="%s", sep="\t");
    ' | R --vanilla --slave > %s""" % (in_peaks, cfg.get('DEFAULT', 'R_mart'),
                                       out_genes, out_go)
    # call form prints the same text on Python 2 and parses on Python 3
    print(cmd)
    touch(out_raw)
Example #55
0
def make_track_headers(in_files, out_header):
    """For all the visualization files, create UCSC track headers

    For each input file one ``track ...`` line is appended to out_header; the
    file's URL (``public_url_base`` from the ``visualization`` config section
    + '/' + file name) is echoed to stdout as well.

    Raises:
        RuntimeError: for a file name that ends in neither ".bigwig" nor
            ".bigbed".  out_header is already open for writing when this
            happens.
    """
    with open(out_header, "w") as outfile:
        for in_track in in_files:
            if in_track.endswith(".bigwig"):
                track_extras = "type=bigWig"
            elif in_track.endswith(".bigbed"):
                track_extras = 'type=bigBed itemRgb="On"'
            else:
                raise RuntimeError("Unrecognized file type: %s" % in_track)
            url = cfg.get("visualization", "public_url_base") + "/" + in_track
            # parenthesised so the line is valid on Python 3; the output on
            # Python 2 is unchanged
            print(url)
            # remove cruft from the names: runs of the listed words and/or
            # dots become one space
            short_name = re.sub(r"(mapped_reads|clipped|sorted|colored|\.)+", " ", in_track)
            track_str = 'track %s name="%s" description="%s" ' "bigDataUrl=%s\n" % (
                track_extras,
                short_name,
                short_name,
                url,
            )
            outfile.write(track_str)
Example #56
0
def glitr_range_to_bed(in_range, out_bed):
    """Convert GLITR ranges to BED format, use peak centers as summits

    Reads tab-separated lines from in_range: the first column is parsed by
    parse_ucsc_range into (chrom, start, stop) and the fourth column is the
    fold change.  Two files are written:

    * out_bed: one six-column BED line per range (integer fold change as the
      score, strand always '+').
    * ``<out_bed>_summits.<N>_around``: a window of about N bases centred on
      each range, where N is the ``peak_summit_size`` option of the ``peaks``
      config section.
    """
    # The raw config text names the summit file; the arithmetic needs an int
    # (dividing the string returned by cfg.get raised TypeError).
    summit_size = cfg.getint('peaks', 'peak_summit_size')
    summit_label = cfg.get('peaks', 'peak_summit_size')
    half_summit = summit_size // 2
    with open(in_range) as infile:
        with open(out_bed, 'w') as outfile:
            with open(out_bed + '_summits.%s_around' % summit_label, 'w') \
                                                            as outfile_summits:
                for i, line in enumerate(infile):
                    fields = line.strip('\n').split('\t')
                    # assumes parse_ucsc_range yields integer coordinates -
                    # TODO confirm against its definition
                    chrom, start, stop = parse_ucsc_range(fields[0])
                    start = max(0, start)
                    foldchange = fields[3]
                    outfile.write('\t'.join([chrom, str(start), str(stop),
                                             'GLITR_peak_%s'%(i+1),
                                             str(int(float(foldchange))),'+'])
                                                + '\n')
                    # take bases around center as summit; floor division keeps
                    # the coordinates integral on Python 2 and 3 alike
                    center = start + (stop - start) // 2
                    # clamp at 0 so a range near the chromosome start cannot
                    # produce a negative BED coordinate
                    center_start = max(0, center - half_summit)
                    center_stop = center + half_summit
                    outfile_summits.write('\t'.join([chrom, str(center_start),
                                    str(center_stop), 'GLITR_peak_%s'%(i+1),
                                    str(int(float(foldchange))),'+']) + '\n')