def run():
    """Command-line entry point: build the nucleotide mutation table and plot it.

    Parses the command-line arguments, passes the input file and the output
    directory through ``functions.checkExistence`` / ``functions.checkPath``,
    lets ``main`` write ``<prefix>_nuc_mutations.table`` and then invokes the
    ``plotNucleotideProbabilities.R`` script (located next to this module) to
    render ``<prefix>_nuc_mutations.pdf``. The table is deleted afterwards
    when ``args.remove`` is set.
    """
    args = create_parser().parse_args()

    functions.checkExistence(args.inputfile)
    functions.checkPath(args.outdir)

    table_path = os.path.join(args.outdir,
                              args.prefix + '_nuc_mutations.table')
    pdf_path = os.path.join(args.outdir, args.prefix + '_nuc_mutations.pdf')
    main(args.inputfile, table_path, args.coverage, args.verbose)

    # The R script is shipped alongside this module.
    module_dir = os.path.dirname(os.path.realpath(__file__))
    r_script = os.path.join(module_dir, 'plotNucleotideProbabilities.R')

    r_call = ['R', '-q', '--slave', '-f %r' % r_script, '--args']
    r_call.extend(['%r' % table_path, '%r' % pdf_path, args.limit])
    execute(r_call)

    if args.remove:
        os.remove(table_path)
Beispiel #2
0
def run():
    """Command-line entry point: build sense/anti-sense matrices and plot them.

    Collects one matrix per strand through ``main``, stores both with
    ``saveMat`` and passes the resulting tables to ``plotHeatMapSmall.R``
    (located next to this module) to render one PNG per strand. Both tables
    are deleted afterwards when ``args.remove`` is set.
    """
    args = create_parser().parse_args()

    functions.checkExistence(args.parclip)
    functions.checkExistence(args.gff)

    # All output files share a prefix that encodes the run parameters.
    out_prefix = '%s_mat_sm_up%s_do%s_min%s_max%s_xbins%s_ybins%s' % (
        args.prefix, args.upstream, args.downstream, args.min, args.max,
        args.xbins, args.ybins)

    def in_outputdir(suffix):
        # Path of an output file inside the requested output directory.
        return os.path.join(args.outputdir, out_prefix + suffix)

    sense_table = in_outputdir('_sense.table')
    asense_table = in_outputdir('_asense.table')
    sense_png = in_outputdir('_sense.png')
    asense_png = in_outputdir('_asense.png')

    sense_result = main(args.parclip, args.gff, args.upstream,
                        args.downstream, True, args.min, args.max,
                        args.verbose, args.xbins, args.ybins,
                        'Collecting data from sense strand')
    asense_result = main(args.parclip, args.gff, args.upstream,
                         args.downstream, False, args.min, args.max,
                         args.verbose, args.xbins, args.ybins,
                         'Collecting data from anti-sense strand')

    total = args.upstream + args.max + 1 + args.downstream
    for table_path, result in ((sense_table, sense_result),
                               (asense_table, asense_result)):
        saveMat(table_path, result[0], args.upstream, args.downstream,
                result[1], args.xbins, total)

    # Upstream length expressed in x-bin units; handed to the R script.
    start = args.upstream / total * args.xbins

    r_script = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            'plotHeatMapSmall.R')
    r_call = ['R', '-q', '--slave', '-f %r' % r_script, '--args']
    r_call += ['%r' % sense_table, '%r' % asense_table]
    r_call += ['%r' % sense_png, '%r' % asense_png]
    r_call += [
        0.98,  # hard-coded qvalue
        start,
        args.ypx,
        args.xpx,
    ]
    execute(r_call)

    if args.remove:
        for table_path in (sense_table, asense_table):
            os.remove(table_path)
Beispiel #3
0
    def cleanup(self, keep_intermed=False):
        """Remove the registered temporary and intermediate files.

        Nothing is removed when ``self._keep_all`` is set. Otherwise every
        entry of ``self._tmp_files`` is deleted with ``rm -rf`` and, unless
        ``keep_intermed`` is true, every entry of ``self._intermed_files``
        as well.

        Args:
            keep_intermed (:obj:`boolean`): if set to True, intermediate files are not removed
        """
        if self._keep_all:
            return

        for tmp_path in self._tmp_files:
            logger.debug('%s is marked as a temporary file, cleaning up',
                         tmp_path)
            execute('rm -rf %s' % tmp_path, exit=False)

        if keep_intermed:
            return

        for intermed_path in self._intermed_files:
            logger.debug('%s is marked as an intermediate file, cleaning up',
                         intermed_path)
            execute('rm -rf %s' % intermed_path, exit=False)
def run():
    """Command-line entry point: count k-mers per position and plot them.

    Loads the PAR-CLIP sites, optionally removes sites via
    ``remove_gff_sites``, sorts them in descending order by ``args.key`` and
    takes the ``args.start:args.stop`` slice. The sequences of that slice are
    passed to ``getKmerOccurences``, which receives the output table path, and
    the table is plotted with ``plotKmerPerPosition.R`` (located next to this
    module). The table is deleted afterwards when ``args.remove`` is set.
    """
    args = create_parser().parse_args()

    sites = ParclipSiteContainer.from_file(args.inputfile)
    if args.filterGFF != '':
        sites.remove_gff_sites(args.filterGFF, args.awidth)
    sites.sort(by=args.key, ascending=False)

    with EfficientGenome(args.genome) as genome:
        selected = sites[args.start:args.stop]
        seqs = selected.get_all_sequences(genome, args.width)

    # Output names encode the parameters of this run.
    prefix = '%s_kmerPerPosition_kmer%s_start%s_stop%s_width%s_sort_%s' % (
        args.prefix, args.kmer, args.start, args.stop, args.width, args.key)
    table_path = os.path.join(args.outdir, prefix + '.table')
    pdf_path = os.path.join(args.outdir, prefix + '.pdf')

    # Sequence length passed on is 2 * width + 1.
    getKmerOccurences(seqs,
                      2 * args.width + 1,
                      table_path,
                      kmer=(args.kmer - 1),
                      verbose=args.verbose)

    r_script = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            'plotKmerPerPosition.R')
    r_call = ['R', '-q', '--slave', '-f %s' % r_script, '--args']
    r_call += [table_path, pdf_path, args.width, 0, args.width + 1]
    execute(r_call)

    if args.remove:
        os.remove(table_path)
Beispiel #5
0
def run():
    """Command-line entry point: run XXmotif on a slice of PAR-CLIP sites.

    Loads the sites, optionally removes sites via ``remove_gff_sites``,
    sorts them in descending order by ``args.key`` and writes the
    ``args.start:args.stop`` slice as FASTA. XXmotif is then executed on that
    file, followed by the ``plotDistribution.R`` script expected under
    ``<outdir>/tmp`` and, when ``args.plotPWM`` is positive, the
    ``weblogo.R`` script. ``<outdir>/tmp`` is removed at the end unless
    ``args.keep_tmp_files`` is set.
    """
    args = create_parser().parse_args()
    module_dir = os.path.dirname(os.path.realpath(__file__))

    file_prefix = '%s_xxmotif_start%s_stop%s_width%s_sort_%s' % (
        args.prefix, args.start, args.stop, args.width, args.key)

    sites = ParclipSiteContainer.from_file(args.inputfile)
    if args.filterGFF != '':
        sites.remove_gff_sites(args.filterGFF, args.awidth)
    sites.sort(by=args.key, ascending=False)
    selected = sites[args.start:args.stop]

    fasta_path = os.path.join(args.outdir, file_prefix + '.fa')
    with EfficientGenome(args.genome) as genome:
        selected.save2Fasta(genome, fasta_path, width=args.width)

    xxmotif_call = ['XXmotif', args.outdir, fasta_path, '--zoops']
    xxmotif_call.append('--merge-motif-threshold LOW')
    xxmotif_call.append('--max-match-positions 10')
    if args.negSet:
        xxmotif_call.append('--negSet %s' % args.negSet)
    execute(xxmotif_call)

    tmp_dir = os.path.join(args.outdir, 'tmp')
    distribution_script = os.path.join(tmp_dir, 'plotDistribution.R')
    execute(['R', '-q', '--slave', '-f %r' % distribution_script, '--args',
             '%r' % args.outdir])

    # The logo plot is optional and only requested for a positive plotPWM.
    if args.plotPWM > 0:
        logo_script = os.path.join(module_dir, '..', 'plots', 'weblogo.R')
        pwm_file = os.path.join(args.outdir, file_prefix + '.pwm')
        execute(['R', '-q', '--slave', '-f %s' % logo_script, '--args',
                 pwm_file, args.outdir, file_prefix, args.plotPWM])

    if not args.keep_tmp_files:
        shutil.rmtree(tmp_dir, ignore_errors=True)
Beispiel #6
0
 def execute(self):
     """Lazily run the queued commands, yielding each result in turn.

     This is a generator: a command is only passed to the module-level
     ``execute`` when the caller advances the iteration.
     """
     pending = iter(self._cmds)
     for command in pending:
         outcome = execute(command)
         yield outcome
Beispiel #7
0
def main(parclip, outdir, prefix, genomepath, negset, gfffile, kmer, key,
         useQuantiles, verbose, args):
    """Compute k-mer log-odds over bins of PAR-CLIP sites and plot them.

    The sites are sorted in descending order by ``key`` and split into bins;
    for every bin ``getkmerLogs`` is called and its result collected. The
    collected values are written with ``sortAndSave`` and plotted with the
    ``plotKmerLogOdds.R`` script located next to this module.

    Args:
        parclip: path of the PAR-CLIP site table.
        outdir: directory receiving the ``.table`` and ``.pdf`` output.
        prefix: prefix of the output file names.
        genomepath: path handed to ``EfficientGenome``.
        negset: path handed to ``loadNegTable``.
        gfffile: optional GFF file; when not ``None`` it is handed to
            ``remove_gff_sites`` before sorting.
        kmer: k-mer length.
        key: column used for sorting the sites.
        useQuantiles: if true, use 1000-site bins centred on fixed quantiles
            (after an initial bin of the first 1000 sites); otherwise use
            consecutive 1000-site bins up to 50000 sites.
        verbose: if true, report progress via ``functions.showProgress``.
        args: parsed command-line namespace; only ``keep_tmp_files`` is read.
    """
    scriptPath = os.path.dirname(os.path.realpath(__file__))
    plot_script = os.path.join(scriptPath, 'plotKmerLogOdds.R')
    pc = ParclipSiteContainer.from_file(parclip)

    if gfffile is not None:
        pc.remove_gff_sites(gfffile)
    pc.sort(by=key, ascending=False)

    kmers = functions.makekmers(kmer, list('ACGT'))[kmer - 1]
    negfreq = loadNegTable(negset)

    with EfficientGenome(genomepath) as genomeseq:
        allfreqs = []
        fileprefix = '%s_logodds_%smer_sort_%s' % (prefix, kmer, key)
        if useQuantiles:
            fileprefix = fileprefix + '_quantiles'
            # The first bin always covers the first 1000 sites.
            allfreqs.append(
                getkmerLogs(pc, genomeseq, negfreq, kmers, 0, 1000, 15))
            quantiles = [
                0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 0.125, 0.15, 0.175, 0.2,
                0.225, 0.25, 0.275, 0.3, 0.325, 0.35, 0.375, 0.4, 0.45, 0.5,
                0.55, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9
            ]
            stop = 1000
            # Label of the bin processed last, used in the overlap warning.
            # The initial bin is not a quantile, so it gets a descriptive
            # label instead.
            prev_label = 'top-1000'
            for count, q in enumerate(quantiles, start=1):
                if verbose:
                    functions.showProgress(
                        count, len(quantiles),
                        'Getting kmer log-odds from quantiles...')
                old_stop = stop
                center = functions.getQuantileIndex(len(pc), q)
                start = max(center - 500, 0)
                stop = center + 500
                if stop > len(pc) - 2:
                    # Not enough sites left for a full bin.
                    break
                overlap = old_stop - (stop - 500)
                if overlap > 0:
                    # Report the previous and the current bin. The original
                    # code printed the current quantile twice.
                    msg_pat = 'Bin %s and %s are overlapping by %s sites!'
                    print(msg_pat % (prev_label, q, overlap),
                          file=sys.stderr)
                allfreqs.append(
                    getkmerLogs(pc, genomeseq, negfreq, kmers, start, stop,
                                15))
                prev_label = q
        else:
            maxsize = 50000
            stepsize = 1000
            start = 0
            stop = 1000
            # Consecutive bins until the sites or the size limit run out.
            while stop <= len(pc) - 2 and stop <= maxsize:
                if verbose:
                    functions.showProgress(
                        stop, maxsize, 'Getting kmer log-odds from bins...')
                allfreqs.append(
                    getkmerLogs(pc, genomeseq, negfreq, kmers, start, stop,
                                15))
                start = stop
                stop = stop + stepsize
            print()
            print('STOP at: %s' % stop)

    table_file = os.path.join(outdir, fileprefix + '.table')
    pdf_file = os.path.join(outdir, fileprefix + '.pdf')
    sortAndSave(allfreqs, table_file, kmers)

    cmd = [
        'R',
        '-q',
        '--slave',
        '-f %r' % plot_script,
        '--args',
        '%r' % table_file,
        '%r' % pdf_file,
    ]
    execute(cmd)

    if not args.keep_tmp_files:
        os.remove(table_file)
Beispiel #8
0
def main():
    """Build bootstrapped sense/anti-sense occupancy profiles and plot them.

    Reads the PAR-CLIP table and the GFF annotation given on the command
    line, keeps the annotation records whose length lies within
    ``[min_ts_len, max_ts_len]``, and for each strand orientation writes

    * a table with the mean smoothed profile over all kept records, and
    * a table with one mean profile per bootstrap iteration.

    The four tables are then handed to ``plotCenterBothEnds_bs.R`` (located
    next to this module) and deleted afterwards when ``args.cleanup`` is set.
    """
    parser = create_parser()
    args = parser.parse_args()

    # Seed NumPy's global RNG; it drives the bootstrap resampling below.
    np.random.seed(args.seed)

    pc = ParclipSiteContainer.from_file(args.pc_table)

    gff_records = []
    with open(args.gff_file) as gff_handle:
        # NOTE: rebinds `parser`; the argument parser is not used again.
        parser = GFF3Parser(gff_handle)
        for rec in parser.parse():
            # Keep records whose length (end - start + 1) is within bounds.
            if args.min_ts_len <= rec.end - rec.start + 1 <= args.max_ts_len:
                gff_records.append(rec)

    # Length of one concatenated (upstream window + downstream window)
    # profile. NOTE(review): the "+ 2" presumably accounts for both windows
    # being inclusive of their end positions - confirm against
    # get_occ_profile.
    cut_len = args.downstream_bp + args.upstream_bp + 2 * args.gene_bp + 2

    gene_bp = args.gene_bp
    upstream_bp = args.upstream_bp
    downstream_bp = args.downstream_bp

    def aggregate_data(gff_rec, sense=True):
        """Return the smoothed profile around both ends of one record.

        The profile is queried on the annotated strand when ``sense`` is
        true and on the opposite strand otherwise. The windows around the
        two ends are smoothed separately and concatenated.
        """
        def rev(strand):
            # Opposite strand symbol.
            return '+' if strand == '-' else '-'

        chrom = gff_rec.seqid
        start = gff_rec.start
        end = gff_rec.end
        smooth_window = args.smooth_window
        anno_strand = gff_rec.strand

        query_strand = anno_strand if sense else rev(anno_strand)
        if anno_strand == '+':
            values_upstream = pc.get_occ_profile(chrom, start - upstream_bp,
                                                 start + gene_bp, query_strand)
            values_dostream = pc.get_occ_profile(chrom, end - gene_bp,
                                                 end + downstream_bp,
                                                 query_strand)
        else:
            # Any strand other than '+': the windows are taken around the
            # opposite coordinate and reversed.
            values_upstream = pc.get_occ_profile(chrom, end - gene_bp,
                                                 end + upstream_bp,
                                                 query_strand)[::-1]
            values_dostream = pc.get_occ_profile(chrom, start - downstream_bp,
                                                 start + gene_bp,
                                                 query_strand)[::-1]

        # Centered rolling mean; min_periods=0 lets the window shrink at the
        # edges instead of producing NaN.
        upstr = pd.Series(values_upstream).rolling(window=smooth_window,
                                                   center=True,
                                                   min_periods=0).mean()
        dostr = pd.Series(values_dostream).rolling(window=smooth_window,
                                                   center=True,
                                                   min_periods=0).mean()
        return np.hstack((upstr, dostr))

    def write_out_data(sense, out_file, bs_file):
        """Write the mean profile and the bootstrap profiles for one strand.

        ``out_file`` receives a single tab-separated line with the mean
        profile; ``bs_file`` receives one tab-separated line per bootstrap
        iteration.
        """
        # NOTE(review): data_dict is published as a module-level global,
        # presumably so that calc_profile (defined outside this block) can
        # read it inside the pool workers - confirm. It must be filled
        # before the Pool below is created.
        global data_dict
        data_dict = {}
        for i, gff_rec in enumerate(gff_records):
            data = aggregate_data(gff_rec, sense)
            data_dict[i] = data
        ids = np.array(list(data_dict.keys()))

        # actual smoothed curve
        with open(out_file, 'w') as out:
            avg_vec = np.zeros(cut_len)
            for ts_ind in ids:
                avg_vec += data_dict[ts_ind]
            avg_vec /= len(ids)
            print(*avg_vec, sep='\t', file=out)

        with open(bs_file, 'w') as out:
            with Pool(args.n_processes) as pool:
                jobs = []
                for bs in range(args.n_bs_iterations):
                    # Resample record indices (np.random.choice draws with
                    # replacement by default) in the parent process, then
                    # hand the sample to a worker.
                    ts_ind = np.random.choice(ids, size=len(ids))
                    job = pool.apply_async(calc_profile,
                                           args=(ts_ind, cut_len))
                    jobs.append(job)

                # Collect results in submission order.
                for job in jobs:
                    res_vec = job.get()
                    print(*res_vec, sep='\t', file=out)

    # All output files share a prefix that encodes the run parameters.
    prefix_fmt = '%s_centerBoth_up%s_gene%s_do%s_min%s_max%s'
    fmt_args = (
        args.prefix,
        args.upstream_bp,
        args.gene_bp,
        args.downstream_bp,
        args.min_ts_len,
        args.max_ts_len,
    )
    fn_prefix = prefix_fmt % fmt_args

    sense_table = os.path.join(args.outputdir, fn_prefix + '_sense.table')
    sense_bs_table = os.path.join(args.outputdir,
                                  fn_prefix + '_sense_bs.table')

    asense_table = os.path.join(args.outputdir, fn_prefix + '_asense.table')
    asense_bs_table = os.path.join(args.outputdir,
                                   fn_prefix + '_asense_bs.table')

    write_out_data(True, sense_table, sense_bs_table)
    write_out_data(False, asense_table, asense_bs_table)

    scriptPath = os.path.dirname(os.path.realpath(__file__))
    plot_script = os.path.join(scriptPath, 'plotCenterBothEnds_bs.R')

    # Fall back to the file prefix when no title was given.
    if not args.title:
        title = args.prefix
    else:
        title = args.title

    outfile_pdf = os.path.join(args.outputdir,
                               fn_prefix + '_sm%s.pdf' % args.smooth_window)

    # String arguments are passed in repr() form (i.e. quoted).
    plot_cmd = [
        'R',
        '-q',
        '--slave',
        '-f %r' % plot_script,
        '--args',
        '%r' % sense_table,
        '%r' % sense_bs_table,
        '%r' % asense_table,
        '%r' % asense_bs_table,
        '%r' % outfile_pdf,
        '%r' % title,
        args.upstream_bp,
        args.downstream_bp,
        args.gene_bp,
        args.smooth_window,
        '%r' % args.labelCenterA,
        '%r' % args.labelBody,
        '%r' % args.labelCenterB,
    ]
    execute(plot_cmd)
    if args.cleanup:
        os.remove(sense_table)
        os.remove(sense_bs_table)
        os.remove(asense_table)
        os.remove(asense_bs_table)
def run():
    """Command-line entry point: collect sense/anti-sense tables and plot them.

    Calls ``main`` once per strand orientation to write the two tables, then
    invokes ``plotCenterBothEnds.R`` (located next to this module) to render
    the PDF. With ``args.verbose`` the elapsed time of the collection step is
    printed. Both tables are deleted afterwards when ``args.remove`` is set.
    """
    module_dir = os.path.dirname(os.path.realpath(__file__))
    r_script = os.path.join(module_dir, 'plotCenterBothEnds.R')
    args = create_parser().parse_args()

    # Fall back to the file prefix when no title was given.
    title = args.title if args.title else args.prefix

    fn_prefix = '%s_centerBoth_up%s_gene%s_do%s_min%s_max%s' % (
        args.prefix, args.upstream, args.gene, args.downstream, args.min,
        args.max)
    sense_path = os.path.join(args.outputdir, fn_prefix + '_sense.table')
    asense_path = os.path.join(args.outputdir, fn_prefix + '_asense.table')
    pdf_path = os.path.join(args.outputdir,
                            fn_prefix + '_sm%s.pdf' % args.plotSmooth)

    started = datetime.datetime.now()

    strand_jobs = (
        (True, sense_path, 'Collecting PAR-CLIP sense data'),
        (False, asense_path, 'Collecting PAR-CLIP anti-sense data'),
    )
    for is_sense, table_path, message in strand_jobs:
        main(args.parclip, table_path, args.gff, args.downstream,
             args.upstream, args.gene, is_sense, args.min, args.max,
             args.verbose, message)

    if args.verbose:
        elapsed = datetime.datetime.now() - started
        print()
        print('time: %s seconds' % elapsed.seconds)

    r_call = ['R', '-q', '--slave', '-f %r' % r_script, '--args']
    r_call += ['%r' % sense_path, '%r' % asense_path, '%r' % pdf_path,
               '%r' % title]
    r_call += [args.upstream, args.downstream, args.gene, args.plotSmooth]
    r_call += ['%r' % args.labelCenterA, '%r' % args.labelBody,
               '%r' % args.labelCenterB]
    execute(r_call)

    if args.remove:
        for table_path in (sense_path, asense_path):
            os.remove(table_path)