Esempio n. 1
0
def gcn(args):
    """
    %prog gcn gencode.v26.exonunion.bed data/*.vcf.gz

    Compile gene copy number based on CANVAS results.
    """
    # NOTE: the docstring above doubles as the usage text handed to
    # OptionParser, so it must stay in "%prog ..." form.
    p = OptionParser(gcn.__doc__)
    p.set_cpus()
    p.set_tmpdir(tmpdir="tmp")
    p.set_outfile()
    opts, args = p.parse_args(args)

    # One exon BED plus at least one VCF is required; otherwise show the help.
    if len(args) < 2:
        sys.exit(not p.print_help())

    exonbed = args[0]
    canvasvcfs = args[1:]
    tsvfile = opts.outfile
    tmpdir = opts.tmpdir

    # Create the scratch directory before pointing the tempdir at it.
    mkdir(tmpdir)
    set_tempdir(tmpdir)

    df = vcf_to_df(canvasvcfs, exonbed, opts.cpus)
    # One output per suffix (presumably average and median copy number --
    # confirm against df_to_tsv).
    for suffix in (".avgcn", ".medcn"):
        df_to_tsv(df, tsvfile, suffix)
Esempio n. 2
0
def cleanup_unwriteable():
    """
    Reset to normal tempdir operation.

    Removes the path named by the global ``unwriteable`` if it exists, then
    points pybedtools back at the global ``test_tempdir``.
    """
    import subprocess

    if os.path.exists(unwriteable):
        # Argument list rather than a shell string, so a path containing
        # spaces or shell metacharacters is passed to rm verbatim.
        subprocess.call(['rm', '-rf', unwriteable])
    pybedtools.set_tempdir(test_tempdir)
Esempio n. 3
0
def calculate_coverage(bamfile_name, output_dir):
    """Return the genome coverage of ``bamfile_name`` as a DataFrame.

    pybedtools scratch files go to ``<output_dir>/tmp`` (created if missing).
    ``pybedtools.cleanup()`` is always called before returning, including when
    the coverage computation raises.

    :param bamfile_name: path handed to ``pybedtools.BedTool``.
    :param output_dir: directory under which the ``tmp`` scratch dir is made.
    :return: DataFrame with columns ``contig``, ``pos`` and ``depth``.
    """
    tmp_dir = f'{output_dir}/tmp'
    os.makedirs(tmp_dir, exist_ok=True)
    pybedtools.set_tempdir(tmp_dir)
    try:
        bed = pybedtools.BedTool(bamfile_name)
        return bed.genome_coverage(dz=True).to_dataframe(
            names=['contig', 'pos', 'depth'])
    finally:
        # Runs on success and on failure so scratch files never accumulate.
        pybedtools.cleanup()
Esempio n. 4
0
def test_stream():
    """
    Stream and file-based equality, both whole-file and Interval by
    Interval
    """
    a = pybedtools.example_bedtool('a.bed')
    b = pybedtools.example_bedtool('b.bed')
    c = a.intersect(b)

    # make an unwriteable dir...
    orig_tempdir = pybedtools.get_tempdir()
    if os.path.exists('unwriteable'):
        os.system('rm -rf unwriteable')
    os.system('mkdir unwriteable')
    os.system('chmod -w unwriteable')

    # The tempdir is switched back and forth below; the finally clause
    # restores it and removes the directory even when an assertion fails, so
    # later tests are not left pointing at the unwriteable directory.
    try:
        # ...set that to the new tempdir
        pybedtools.set_tempdir('unwriteable')

        # this should really not be written anywhere
        d = a.intersect(b, stream=True)

        assert_raises(NotImplementedError, c.__eq__, d)
        d_contents = d.fn.read()
        c_contents = open(c.fn).read()
        assert d_contents == c_contents

        # reconstruct d and check Interval-by-Interval equality
        pybedtools.set_tempdir('unwriteable')
        d = a.intersect(b, stream=True)

        for i, j in zip(c, d):
            assert str(i) == str(j)

        # Now do something similar with GFF files.
        a = pybedtools.example_bedtool('a.bed')
        f = pybedtools.example_bedtool('d.gff')

        # file-based
        pybedtools.set_tempdir(orig_tempdir)
        g1 = f.intersect(a)

        # streaming
        pybedtools.set_tempdir('unwriteable')
        g2 = f.intersect(a, stream=True)

        for i, j in zip(g1, g2):
            assert str(i) == str(j)

        # this was segfaulting at one point, just run to make sure
        g3 = f.intersect(a, stream=True)
        for i in iter(g3):
            # Function-call form prints the same thing on Python 2 and 3.
            print(i)

        for row in f.cut(range(3), stream=True):
            row[0], row[1], row[2]
            assert_raises(IndexError, row.__getitem__, 3)
    finally:
        pybedtools.set_tempdir(orig_tempdir)
        os.system('rm -fr unwriteable')
Esempio n. 5
0
 def __init__(self, id, home_dir):
     """Set up the file paths used to compare callable loci for one sample.

     Creates ``<home_dir>callable_comparison/<id>/`` if it does not exist
     and points the pybedtools tempdir at ``<home_dir>/tmp_dir/``.

     :param id: sample identifier embedded in every path.
     :param home_dir: path prefix; concatenated directly in front of
         ``callable_comparison/`` and ``FASTQ/``.
     """
     aligners = ['star', 'hisat2']
     self.aligner_ls = aligners
     self.id = id

     subdir = '{}callable_comparison/{}/'.format(home_dir, id)
     self.subdir = subdir
     if not os.path.exists(subdir):
         os.mkdir(subdir)

     # original input file (output from GATK callableloci), one per aligner
     self.call_loci_ls = []
     for aligner in aligners:
         self.call_loci_ls.append(
             '{}FASTQ/{}/{}_{}_callable.bed'.format(home_dir, id, id, aligner))

     def _callable_bed(label):
         # Intermediate BED path inside this sample's comparison directory.
         return '{}callable_{}.bed'.format(subdir, label)

     # intermediate files: per aligner, then intersection and union
     self.call_only_ls = [_callable_bed(aligner) for aligner in aligners]
     self.call_inter = _callable_bed('inter')
     self.call_union = _callable_bed('union')
     self.callable_ls = self.call_only_ls + [self.call_inter, self.call_union]

     # Name -> path lookup, in the same order as callable_ls.
     self.callable_dict = dict(
         zip(['star', 'hisat2', 'intersect', 'union'], self.callable_ls))
     # Every path starts with a length of 0.
     self.len_dict = dict.fromkeys(self.callable_ls, 0)
     # Strip the trailing "/" of subdir before adding the suffix.
     self.len_loc = subdir[:-1] + '_lengths.txt'
     pybedtools.set_tempdir(home_dir + '/tmp_dir/')
Esempio n. 6
0
def cleanup_unwriteable():
    """
    Reset to normal tempdir operation.

    Removes the path named by the global ``unwriteable`` if it exists, then
    points pybedtools back at the global ``test_tempdir``.
    """
    import subprocess

    if os.path.exists(unwriteable):
        # Argument list rather than a shell string, so a path containing
        # spaces or shell metacharacters is passed to rm verbatim.
        subprocess.call(['rm', '-rf', unwriteable])
    pybedtools.set_tempdir(test_tempdir)
Esempio n. 7
0
def genotype_intervals(intervals_file=None, bam=None, workdir=None, window=GT_WINDOW, isize_mean=ISIZE_MEAN, isize_sd=ISIZE_SD, normal_frac_threshold=GT_NORMAL_FRAC):
    """Genotype every interval of ``intervals_file`` against one BAM file.

    Each interval is passed through ``parse_interval`` and
    ``genotype_interval``; the genotype is appended as an extra field and the
    result is written to ``<workdir>/genotyped.bed``.

    :param intervals_file: BED file of intervals to genotype.
    :param bam: path of the BAM file opened with ``pysam.Samfile``.
    :param workdir: output and pybedtools temp directory (created if missing).
    :param window: forwarded to ``genotype_interval``.
    :param isize_mean: mean insert size.
    :param isize_sd: insert size standard deviation; the accepted insert-size
        range is mean +/- 3 sd, floored at 0.
    :param normal_frac_threshold: forwarded to ``genotype_interval``.
    :return: path of the genotyped BED file.
    :raises Exception: any failure is logged, printed and re-raised.
    """
    func_logger = logging.getLogger("%s-%s" % (genotype_intervals.__name__, multiprocessing.current_process()))

    if workdir and not os.path.isdir(workdir):
        os.makedirs(workdir)

    pybedtools.set_tempdir(workdir)

    genotyped_intervals = []
    start_time = time.time()

    isize_min = max(0, isize_mean - 3 * isize_sd)
    isize_max = isize_mean + 3 * isize_sd

    # Bound before the try so the finally clause can tell whether the file
    # was actually opened.
    bam_handle = None
    try:
        bam_handle = pysam.Samfile(bam, "rb")
        for interval in pybedtools.BedTool(intervals_file):
            chrom, start, end, sv_type, svlen = parse_interval(interval)
            genotype = genotype_interval(chrom, start, end, sv_type, svlen, bam_handle, isize_min, isize_max, window, normal_frac_threshold)
            fields = interval.fields + [genotype]
            genotyped_intervals.append(pybedtools.create_interval_from_list(fields))
        bedtool = pybedtools.BedTool(genotyped_intervals).moveto(os.path.join(workdir, "genotyped.bed"))
    except Exception:
        func_logger.error('Caught exception in worker thread')

        # This prints the type, value, and stack trace of the
        # current exception being handled.
        traceback.print_exc()

        print()
        # Bare raise keeps the original traceback on both Python 2 and 3.
        raise
    finally:
        # Release the BAM handle on success and on failure.
        if bam_handle is not None:
            bam_handle.close()
    func_logger.info("Genotyped %d intervals in %g minutes" % (len(genotyped_intervals), (time.time() - start_time)/60.0))

    return bedtool.fn
Esempio n. 8
0
def annotate(bed, input, bedout, rnazout):
    """Intersect RNAz results with an annotation BED and write the outcome.

    The RNAz records read from ``input`` are converted to BED, intersected
    strand-specifically with ``bed``, and handed to ``bedtornaz`` together
    with ``bedout`` and ``rnazout``.

    :return: 1 on success; ``None`` if anything raised, in which case the
        traceback has been printed to stderr.
    """
    try:
        # Make sure we do not write somewhere we are not supposed to
        pybedtools.set_tempdir('.')
        annotation = pybedtools.BedTool(bed)
        rnaz = readrnaz(input)
        rnaz_bed = pybedtools.BedTool(rnaztobed(rnaz), from_string=True)

        # intersect strand specific, keep all info on a and b files
        hits = rnaz_bed.intersect(annotation, wa=True, wb=True, s=True)

        bedtornaz(hits, rnaz, bedout, rnazout)
    except Exception as err:
        # Best effort: report the failure on stderr and fall through to None.
        report = tb.TracebackException.from_exception(err)
        print(''.join(report.format()), file=sys.stderr)
        return None

    return 1
Esempio n. 9
0
File: cnv.py Progetto: xuanblo/jcvi
def gcn(args):
    """
    %prog gcn gencode.v26.exonunion.bed data/*.vcf.gz

    Compile gene copy number based on CANVAS results.
    """
    # NOTE: the docstring above doubles as the usage text handed to
    # OptionParser, so it must stay in "%prog ..." form.
    p = OptionParser(gcn.__doc__)
    p.set_cpus()
    p.set_tmpdir(tmpdir="tmp")
    p.set_outfile()
    opts, args = p.parse_args(args)

    # One exon BED plus at least one VCF is required; otherwise show the help.
    if len(args) < 2:
        sys.exit(not p.print_help())

    exonbed = args[0]
    canvasvcfs = args[1:]
    tsvfile = opts.outfile
    tmpdir = opts.tmpdir

    # Create the scratch directory before pointing the tempdir at it.
    mkdir(tmpdir)
    set_tempdir(tmpdir)

    df = vcf_to_df(canvasvcfs, exonbed, opts.cpus)
    # One output per suffix (presumably average and median copy number --
    # confirm against df_to_tsv).
    for suffix in (".avgcn", ".medcn"):
        df_to_tsv(df, tsvfile, suffix)
Esempio n. 10
0
def test_stream():
    """
    Stream and file-based equality, both whole-file and Interval by
    Interval
    """
    a = pybedtools.example_bedtool('a.bed')
    b = pybedtools.example_bedtool('b.bed')
    c = a.intersect(b)

    # make an unwriteable dir...
    orig_tempdir = pybedtools.get_tempdir()
    if os.path.exists('unwriteable'):
        os.system('rm -rf unwriteable')
    os.system('mkdir unwriteable')
    os.system('chmod -w unwriteable')

    # The tempdir is switched back and forth below; the finally clause
    # restores it and removes the directory even when an assertion fails, so
    # later tests are not left pointing at the unwriteable directory.
    try:
        # ...set that to the new tempdir
        pybedtools.set_tempdir('unwriteable')

        # this should really not be written anywhere
        d = a.intersect(b, stream=True)

        assert_raises(NotImplementedError, c.__eq__, d)
        d_contents = d.fn.read()
        c_contents = open(c.fn).read()
        assert d_contents == c_contents

        # reconstruct d and check Interval-by-Interval equality
        pybedtools.set_tempdir('unwriteable')
        d = a.intersect(b, stream=True)

        for i, j in zip(c, d):
            assert str(i) == str(j)

        # Now do something similar with GFF files.
        a = pybedtools.example_bedtool('a.bed')
        f = pybedtools.example_bedtool('d.gff')

        # file-based
        pybedtools.set_tempdir(orig_tempdir)
        g1 = f.intersect(a)

        # streaming
        pybedtools.set_tempdir('unwriteable')
        g2 = f.intersect(a, stream=True)

        for i, j in zip(g1, g2):
            assert str(i) == str(j)

        # this was segfaulting at one point, just run to make sure
        g3 = f.intersect(a, stream=True)
        for i in iter(g3):
            # Function-call form prints the same thing on Python 2 and 3.
            print(i)

        for row in f.cut(range(3), stream=True):
            row[0], row[1], row[2]
            assert_raises(IndexError, row.__getitem__, 3)
    finally:
        pybedtools.set_tempdir(orig_tempdir)
        os.system('rm -fr unwriteable')
Esempio n. 11
0
def batch_callable_bed(bam_files, output_bed_file, work_dir, genome_fasta_file, min_depth,
                       parall_view=None):
    """ Build a BED of the regions reported by most of the given BAM files.

        ``_calculate`` is run once per BAM file, through ``parall_view`` when
        one is supplied and otherwise through a view opened here. The
        resulting BED files are multi-intersected and only regions listed for
        at least 80% of them (and at least one) are kept.

        Returns ``output_bed_file``; nothing is recomputed when ``can_reuse``
        accepts the existing output.
    """
    if can_reuse(output_bed_file, bam_files):
        return output_bed_file

    work_dir = safe_mkdir(join(work_dir, 'callable_work'))
    # NOTE: random sub-sampling of the BAM files (3 samples, fixed seed) is
    # currently disabled; every BAM file is processed.

    # One argument list per BAM file, shared by both execution paths.
    jobs = [[bam, work_dir, genome_fasta_file, min_depth] for bam in bam_files]

    if parall_view:
        callable_beds = parall_view.run(_calculate, jobs)
    else:
        n_bams = len(bam_files)
        with parallel_view(n_bams, ParallelCfg(threads=n_bams), work_dir) as view:
            callable_beds = view.run(_calculate, jobs)

    # we want to pick those regions that have coverage at 80% of samples
    good_overlap_sample_fraction = 0.8
    good_overlap_count = max(1, good_overlap_sample_fraction * len(callable_beds))
    info(f'Intersecting callable regions and picking good overlaps with >={good_overlap_count} '
         f'samples ({100 * good_overlap_sample_fraction}% of {len(callable_beds)})')

    def _in_enough_samples(r):
        # Field 4 is a comma-separated list (presumably the inputs overlapping
        # the region -- confirm against multi_intersect output).
        return len(r[4].split(',')) >= good_overlap_count

    with file_transaction(work_dir, output_bed_file) as tx:
        pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))
        merged = pybedtools.BedTool().multi_intersect(i=callable_beds)
        merged.filter(_in_enough_samples).saveas(tx)
    info(f'Saved to {output_bed_file}')
    return output_bed_file
Esempio n. 12
0
def determine_sex(work_dir, bam_fpath, avg_depth, genome, target_bed=None):
    """Guess the sample sex from coverage of chrY key regions.

    The mean coverage of the chrY key regions for ``genome`` (restricted to
    ``target_bed`` when given) is compared with the sample's average depth.

    :param work_dir: directory for intermediate files and pybedtools temp data.
    :param bam_fpath: BAM file to measure coverage on.
    :param avg_depth: sample average depth (converted with ``float``).
    :param genome: genome name; matched by substring against the keys of
        ``chry_key_regions_by_genome``.
    :param target_bed: optional capture target BED.
    :return: ``'F'``, ``'M'``, or ``None`` when sex cannot be determined (no
        key regions for the genome, no overlap with the target, or average
        depth below ``AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX``).
    """
    debug()
    debug('Determining sex')
    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))

    male_bed = None
    for k in chry_key_regions_by_genome:
        if k in genome:
            male_bed = BedTool(chry_key_regions_by_genome.get(k))
            break
    if not male_bed:
        warn('Warning: no male key regions for ' + genome + ', cannot identify sex')
        return None

    male_area_size = get_total_bed_size(male_bed)
    debug('Male region total size: ' + str(male_area_size))

    if target_bed:
        # Targeted sequencing: only the key regions inside the target count.
        target_male_bed = join(work_dir, 'male.bed')
        with file_transaction(work_dir, target_male_bed) as tx:
            BedTool(target_bed).intersect(male_bed).merge().saveas(tx)
        target_male_area_size = get_total_bed_size(target_male_bed)
        if target_male_area_size == 0:
            debug('The male non-PAR region does not overlap with the capture target - cannot determine sex.')
            return None
        male_bed = target_male_bed
    else:
        debug('WGS, determining sex based on chrY key regions coverage.')

    info('Detecting sex by comparing the Y chromosome key regions coverage and average coverage depth.')
    if not bam_fpath:
        critical('BAM file is required.')
    index_bam(bam_fpath)

    chry_mean_coverage = _calc_mean_coverage(work_dir, male_bed, bam_fpath, 1)
    debug('Y key regions average depth: ' + str(chry_mean_coverage))
    avg_depth = float(avg_depth)
    debug('Sample average depth: ' + str(avg_depth))
    if avg_depth < AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX:
        debug('Sample average depth is too low (less than ' + str(AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX) +
             ') - cannot determine sex')
        return None

    # The messages below used the invalid escape "\s"; they now read "it's".
    if chry_mean_coverage == 0:
        debug("Y depth is 0 - it's female")
        sex = 'F'
    else:
        factor = avg_depth / chry_mean_coverage
        debug('Sample depth / Y depth = ' + str(factor))
        if factor > FEMALE_Y_COVERAGE_FACTOR:  # if mean target coverage much higher than chrY coverage
            debug('Sample depth is more than ' + str(FEMALE_Y_COVERAGE_FACTOR) + " times higher than Y depth - it's female")
            sex = 'F'
        else:
            debug('Sample depth is not more than ' + str(FEMALE_Y_COVERAGE_FACTOR) + " times higher than Y depth - it's male")
            sex = 'M'
    debug('Sex is ' + sex)
    debug()
    return sex
Esempio n. 13
0
def run_spades_parallel(bam=None, spades=None, bed=None, work=None, pad=SPADES_PAD, nthreads=1, chrs=[],
                        max_interval_size=SPADES_MAX_INTERVAL_SIZE,
                        timeout=SPADES_TIMEOUT, isize_min=ISIZE_MIN, isize_max=ISIZE_MAX,
                        svs_to_assemble=SVS_ASSEMBLY_SUPPORTED,
                        stop_on_fail=False, max_read_pairs=EXTRACTION_MAX_READ_PAIRS):
    """Assemble the intervals of ``bed`` with SPAdes using ``nthreads`` workers.

    Intervals (optionally restricted to ``chrs``) are split into those
    accepted by ``should_be_assembled`` and those accepted by
    ``shouldnt_be_assembled``. The first group is dealt round-robin to
    ``run_spades_single`` workers, each using ``<work>/<i>``; their FASTA
    outputs are concatenated into ``<work>/spades_assembled.fa``. The second
    group is written to ``<work>/ignored.bed``.

    :return: ``(assembled_fasta, ignored_bed)``; either may be ``None``, and
        both are ``None`` when no BED file is given.
    """
    pybedtools.set_tempdir(work)

    logger.info("Running SPAdes on the intervals in %s" % bed)
    if not bed:
        logger.info("No BED file specified")
        return None, None

    bedtool = pybedtools.BedTool(bed)

    # `chrs` is only read, never mutated, so the shared default list is safe.
    chrs = set(chrs)
    all_intervals = [interval for interval in bedtool if not chrs or interval.chrom in chrs]

    # Real lists are required: selected_intervals is re-scanned once per
    # worker and ignored_intervals is truth-tested, neither of which works
    # with the lazy iterator filter() returns on Python 3.
    wanted = partial(should_be_assembled, max_interval_size=max_interval_size, svs_to_assemble=svs_to_assemble)
    unwanted = partial(shouldnt_be_assembled, max_interval_size=max_interval_size, svs_to_assemble=svs_to_assemble)
    selected_intervals = [interval for interval in all_intervals if wanted(interval)]
    ignored_intervals = [interval for interval in all_intervals if unwanted(interval)]

    pool = multiprocessing.Pool(nthreads)
    assembly_fastas = []
    for i in range(nthreads):
        # Round-robin: worker i gets every nthreads-th interval.
        intervals = [interval for (j, interval) in enumerate(selected_intervals) if (j % nthreads) == i]
        kwargs_dict = {"intervals": intervals, "bam": bam, "spades": spades, "work": "%s/%d" % (work, i), "pad": pad,
                       "timeout": timeout, "isize_min": isize_min, "isize_max": isize_max, "stop_on_fail": stop_on_fail,
                       "max_read_pairs": max_read_pairs}
        pool.apply_async(run_spades_single, kwds=kwargs_dict,
                         callback=partial(run_spades_single_callback, result_list=assembly_fastas))

    pool.close()
    pool.join()

    logger.info("Merging the contigs from %s" % (str(assembly_fastas)))
    assembled_fasta = os.path.join(work, "spades_assembled.fa")
    with open(assembled_fasta, "w") as assembled_fd:
        # Each file is opened explicitly: fileinput.input() falls back to
        # stdin when given an empty list, which would block when no worker
        # produced an assembly.
        for fasta in assembly_fastas:
            with open(fasta) as fasta_fd:
                for line in fasta_fd:
                    assembled_fd.write("%s\n" % (line.strip()))

    if os.path.getsize(assembled_fasta) > 0:
        logger.info("Indexing the assemblies")
        pysam.faidx(assembled_fasta)
    else:
        logger.error("No assembly generated")
        assembled_fasta = None

    ignored_bed = None
    if ignored_intervals:
        ignored_bed = os.path.join(work, "ignored.bed")
        pybedtools.BedTool(ignored_intervals).each(add_breakpoints).saveas(ignored_bed)

    pybedtools.cleanup(remove_all=True)

    return assembled_fasta, ignored_bed
Esempio n. 14
0
def bedtools_tmpdir(data):
    """Point pybedtools at a transactional temp directory around one yield.

    A directory is obtained from ``tx_tmpdir(data)`` and installed as the
    pybedtools tempdir. Once the caller resumes or closes the generator, or
    throws an exception into it, the previous temp directory is restored if
    it still exists; otherwise ``tempfile.tempdir`` is reset to ``None``.
    """
    with tx_tmpdir(data) as tmpdir:
        orig_tmpdir = tempfile.gettempdir()
        pybedtools.set_tempdir(tmpdir)
        try:
            yield
        finally:
            # Restore in all cases, so a failure in the caller's block does
            # not leave pybedtools pointing at a directory about to vanish.
            if orig_tmpdir and os.path.exists(orig_tmpdir):
                pybedtools.set_tempdir(orig_tmpdir)
            else:
                tempfile.tempdir = None
Esempio n. 15
0
def bedtools_tmpdir(data):
    """Point pybedtools at a transactional temp directory around one yield.

    A directory is obtained from ``tx_tmpdir(data)`` and installed as the
    pybedtools tempdir. Once the caller resumes or closes the generator, or
    throws an exception into it, the previous temp directory is restored if
    it still exists; otherwise ``tempfile.tempdir`` is reset to ``None``.
    """
    with tx_tmpdir(data) as tmpdir:
        orig_tmpdir = tempfile.gettempdir()
        pybedtools.set_tempdir(tmpdir)
        try:
            yield
        finally:
            # Restore in all cases, so a failure in the caller's block does
            # not leave pybedtools pointing at a directory about to vanish.
            if orig_tmpdir and os.path.exists(orig_tmpdir):
                pybedtools.set_tempdir(orig_tmpdir)
            else:
                tempfile.tempdir = None
Esempio n. 16
0
    def do_intersection(self, query_bed, base_bed):
        """Intersect ``base_bed`` with ``query_bed`` and return the result.

        The two inputs are the supplied peaks and the circle coordinates.
        The returned intersection carries the overlap counts.
        """
        # pybedtools scratch files go to the directory given on the CLI
        tmp_dir = self.cli_params.tmp_directory
        pybedtools.set_tempdir(tmp_dir)

        # c=True makes the counts part of the results directly
        return base_bed.intersect(query_bed, c=True)
Esempio n. 17
0
    def set_tempdir(self, dirpath):
        '''Set the temp directory used by pybedtools objects.

        Thin wrapper that forwards ``dirpath`` unchanged to
        ``pybedtools.set_tempdir``.

        :param dirpath: Path to temp directory.
        :type dirpath: str

        :return: Nothing is returned.
        :rtype: None
        '''
        pybedtools.set_tempdir(dirpath)
Esempio n. 18
0
def parallel_genotype_intervals(intervals_file=None, bam=None, workdir=None, nthreads=1, chromosomes=[],
                                window=GT_WINDOW, isize_mean=ISIZE_MEAN, isize_sd=ISIZE_SD,
                                normal_frac_threshold=GT_NORMAL_FRAC):
    """Genotype the intervals of a BED file using several worker processes.

    The selected intervals are cut into contiguous chunks, one per worker.
    Each chunk is saved as ``<workdir>/<i>/ungenotyped.bed`` and processed by
    ``genotype_intervals``. The per-worker results are concatenated, sorted
    and moved to ``<workdir>/genotyped.bed``.

    :param intervals_file: BED file of intervals; ``None`` returns ``None``.
    :param bam: BAM file forwarded to each worker.
    :param workdir: output directory (created if missing).
    :param nthreads: maximum number of worker processes.
    :param chromosomes: if non-empty, only intervals on these chromosomes.
    :return: path of the merged BED file, or ``None`` when there is nothing
        to genotype or no worker produced output.
    """
    func_logger = logging.getLogger("%s-%s" % (parallel_genotype_intervals.__name__, multiprocessing.current_process()))

    if not intervals_file:
        func_logger.warning("No intervals file specified. Perhaps no intervals to process")
        return None

    if workdir and not os.path.isdir(workdir):
        os.makedirs(workdir)

    # `chromosomes` is only read, so the shared default list is never mutated.
    chromosomes = set(chromosomes)

    start_time = time.time()

    bedtool = pybedtools.BedTool(intervals_file)
    selected_intervals = [interval for interval in bedtool if not chromosomes or interval.chrom in chromosomes]
    if not selected_intervals:
        # With zero intervals the worker count below would be 0, which used
        # to end in a ZeroDivisionError.
        func_logger.warning("No intervals selected for genotyping")
        return None

    nthreads = max(1, min(len(selected_intervals), nthreads))
    # Ceiling division; "//" keeps the result an int on Python 3 so it can be
    # used as a slice bound.
    intervals_per_process = (len(selected_intervals) + nthreads - 1) // nthreads

    pool = multiprocessing.Pool(nthreads)
    genotyped_beds = []
    for i in range(nthreads):
        process_workdir = os.path.join(workdir, str(i))
        if not os.path.isdir(process_workdir):
            os.makedirs(process_workdir)
        process_intervals = pybedtools.BedTool(
            selected_intervals[i * intervals_per_process: (i + 1) * intervals_per_process]).saveas(
            os.path.join(process_workdir, "ungenotyped.bed"))
        kwargs_dict = {"intervals_file": process_intervals.fn, "bam": bam, "workdir": process_workdir, "window": window,
                       "isize_mean": isize_mean, "isize_sd": isize_sd, "normal_frac_threshold": normal_frac_threshold}
        pool.apply_async(genotype_intervals, kwds=kwargs_dict,
                         callback=partial(genotype_intervals_callback, result_list=genotyped_beds))

    pool.close()
    pool.join()

    func_logger.info("Following BED files will be merged: %s" % (str(genotyped_beds)))

    if not genotyped_beds:
        func_logger.warning("No intervals generated")
        return None

    pybedtools.set_tempdir(workdir)
    bedtool = pybedtools.BedTool(genotyped_beds[0])

    for bed_file in genotyped_beds[1:]:
        bedtool = bedtool.cat(pybedtools.BedTool(bed_file), postmerge=False)
    bedtool = bedtool.sort().moveto(os.path.join(workdir, "genotyped.bed"))

    func_logger.info("Finished parallel genotyping of %d intervals in %g minutes" % (
        len(selected_intervals), (time.time() - start_time) / 60.0))

    return bedtool.fn
Esempio n. 19
0
def make_unwriteable():
    """
    Make a directory that cannot be written to and set the pybedtools tempdir
    to it. This is used to isolate "streaming" tests to ensure they do not
    write to disk.

    The directory is the one named by the global ``unwriteable``; any
    existing path of that name is removed first.
    """
    import subprocess

    # Argument lists rather than shell strings, so a path containing spaces
    # or shell metacharacters reaches each command verbatim.
    if os.path.exists(unwriteable):
        subprocess.call(['rm', '-rf', unwriteable])
    subprocess.call(['mkdir', '-p', unwriteable])
    subprocess.call(['chmod', '-w', unwriteable])
    pybedtools.set_tempdir(unwriteable)
Esempio n. 20
0
def make_unwriteable():
    """
    Make a directory that cannot be written to and set the pybedtools tempdir
    to it. This is used to isolate "streaming" tests to ensure they do not
    write to disk.

    The directory is the one named by the global ``unwriteable``; any
    existing path of that name is removed first.
    """
    import subprocess

    # Argument lists rather than shell strings, so a path containing spaces
    # or shell metacharacters reaches each command verbatim.
    if os.path.exists(unwriteable):
        subprocess.call(['rm', '-rf', unwriteable])
    subprocess.call(['mkdir', '-p', unwriteable])
    subprocess.call(['chmod', '-w', unwriteable])
    pybedtools.set_tempdir(unwriteable)
Esempio n. 21
0
    def shuffle_peaks_through_genome(self, iteration, bed_file, genome_file):
        """Shuffle the contents of a (virtual) BED file across the genome.

        Only the supplied annotation is used for features (in our case only
        transcript regions).

        ``iteration`` is the zero-based index of this shuffle; it is logged
        one-based. Returns the shuffled BED object.
        """
        # set temporary directory for pybedtools
        pybedtools.set_tempdir(self.cli_params.tmp_directory)

        thread_number = iteration + 1
        self.log_entry("Processing shuffling thread %d" % thread_number)

        return bed_file.shuffle(g=genome_file)
Esempio n. 22
0
def main():
    """Run ``analyze_file`` on the hard-coded BED files with 12 processes.

    Chromosomes 1-22 plus X and Y are passed as the regular chromosome list.
    The reference FASTA path is built from the ``REF`` environment variable.
    """
    bed_path = '/stor/work/Lambowitz/cdw2854/plasmaDNA/bedFiles'
    set_tempdir(bed_path)
    ref_fasta = os.environ['REF'] + '/GRCh38/hg38_rDNA/genome_rDNA.fa'
    filenames = ['P1203-SQ2_S3.bed','SRR2130052.bed']
    # Build a real list: on Python 3 map() returns an iterator with no
    # .extend().
    regular_chrom = [str(chrom) for chrom in np.arange(1, 23)]
    regular_chrom.extend(['X', 'Y'])
    func = partial(analyze_file, bed_path, regular_chrom, ref_fasta)
    p = Pool(12)
    try:
        p.map(func, filenames)
    finally:
        # Shut the workers down even when one of the jobs raised.
        p.close()
        p.join()
Esempio n. 23
0
    def __init__(self, outfolder, sample, bedfile, tmp_folder, platform, cpus):
        """Store the run settings and redirect temp files to ``tmp_folder``.

        ``outfolder`` and ``sample`` are concatenated as-is to form the
        sample folder, and the result file is that same prefix plus
        ``.skipped_exons.txt``.
        """
        sample_prefix = outfolder + sample

        self.folder = sample_prefix
        self.sample = sample
        self.outfile = sample_prefix + ".skipped_exons.txt"
        self.bedfile, self.tmp_folder = bedfile, tmp_folder
        self.platform, self.cpus = platform, cpus

        # Both the stdlib tempfile module and pybedtools use tmp_folder.
        tempfile.tempdir = tmp_folder
        pybedtools.set_tempdir(tmp_folder)
Esempio n. 24
0
def overlap_target_counts(bam_file, target_file, config):
    """Overlap BAM alignment file with shRNA targets.

    Writes ``<counts dir>/<bam basename>.bed`` unless it already exists and
    returns its path. The counts directory is ``config["dir"]["counts"]``.
    """
    out_dir = safe_makedir(config["dir"]["counts"])
    bam_stem = os.path.splitext(os.path.basename(bam_file))[0]
    out_file = os.path.join(out_dir, "{0}.bed".format(bam_stem))

    # Reuse an existing result.
    if file_exists(out_file):
        return out_file

    pybedtools.set_tempdir(out_dir)
    reads_as_bed = pybedtools.BedTool(bam_file).bam_to_bed()
    # c=True asks intersect for counts.
    pybedtools.BedTool(target_file).intersect(reads_as_bed, c=True).saveas(out_file)
    return out_file
Esempio n. 25
0
def call_events_inner(filtered_bam, event_type, fasta_filename, events_gff,
                      events_bam, filter_event_overlap, tmp_dir):
    """Call events on ``filtered_bam`` and write them out as GTF and BAM.

    All reads are loaded, events are called with ``call_events``, and events
    without soft-clip support are dropped. For DUP and INV event types,
    events with scattered soft-clip regions are dropped as well. The
    surviving events are printed to ``events_gff`` (closed at the end) and
    their terminus reads are written, sorted and indexed into ``events_bam``.

    :param filtered_bam: input BAM path.
    :param event_type: compared against ``SvType.DUP.value`` and
        ``SvType.INV.value``.
    :param fasta_filename: forwarded to ``call_events``.
    :param events_gff: open, writable file object for the GTF lines.
    :param events_bam: output path of the sorted BAM.
    :param filter_event_overlap: forwarded to ``call_events``.
    :param tmp_dir: directory for pybedtools temp files and the intermediate
        BAM.
    """
    logging.info("Calling events on file {}:".format(filtered_bam))

    samfile = pysam.AlignmentFile(filtered_bam, "rb")
    try:
        filtered_reads = list(samfile)

        # Call events:
        # Necessary to control temporary folder usage during event calling
        pybedtools.set_tempdir(tmp_dir)
        logging.info("Calling initial events...")
        events = list(
            call_events(filtered_reads, fasta_filename, filter_event_overlap))

        # Filter on soft-clipping support:
        logging.info("Filtering on soft-clip support...")
        filtered_events = [
            event for event in events if event.has_soft_clip_support()
        ]

        # Optionally filter on presence of soft-clipped regions scattered
        # throughout the reads, depending on the event type:
        logging.info("Filtering on scattered soft-clip regions...")
        if event_type == SvType.DUP.value or event_type == SvType.INV.value:
            filtered_events = [
                event for event in filtered_events
                if not event.has_scattered_soft_clip_regions()
            ]

        # Print them out:
        logging.info("Printing final events...")
        for event in filtered_events:
            print(event.get_gtf(), file=events_gff)

        # Write to a temporary bam file, to facilitate subsequent sorting
        # with pysam. NOTE: Could do sorting in memory since the read count
        # should be low, but it seems less bug-prone to use pysam's sort
        # functionality:
        unique_id = uuid.uuid4()
        tmp_bam_filename = "{}/penultimate_bamfile_{}.bam".format(
            tmp_dir, unique_id)
        with pysam.AlignmentFile(tmp_bam_filename, "wb",
                                 header=samfile.header) as outf:
            for event in filtered_events:
                for read in event._terminus1_reads + event._terminus2_reads:
                    outf.write(read)
    finally:
        # The input is only needed up to here (reads and header); release it
        # whether or not event calling succeeded.
        samfile.close()

    # Sort the intermediate bam file with samtools to produce the final
    # output bam file, then index it:
    pysam.sort("-o", events_bam, tmp_bam_filename)
    pysam.index(str(events_bam))

    remove_bam_and_bai(tmp_bam_filename)

    events_gff.close()
Esempio n. 26
0
def subtract_bed_sd(bed_name, bed_filter):
    """REMOVES regions of annotation of interest that overlap with
    segmental duplications

    Reads ``<bed_name>.noRmsk.bed``, subtracts ``bed_filter`` and writes
    ``<bed_name>.noRmsk.noSD.bed``. Only a message is printed when that
    output already exists.
    """
    pybedtools.set_tempdir('/sc/orga/scratch/richtf01')
    bed_out = bed_name + '.noRmsk.noSD.bed'
    if not os.path.isfile(bed_out):
        bed = BedTool(bed_name + '.noRmsk.bed')
        # print() with one argument behaves the same on Python 2 and 3.
        print("Removing calls in seg dup from " + bed_name + "...")
        bed_no_overlap = bed.subtract(bed_filter)
        bed_no_overlap.saveas(bed_out)
        print(bed_name + " done!")
    else:
        print(bed_name + " Seg dup calls already removed")
Esempio n. 27
0
def intersect_bed(bed_name, bed_filter):
    """KEEPS regions of annotation of interest that overlap with
    repeat-masked regions

    Reads ``<bed_name>.merged.sorted.bed``, intersects it with ``bed_filter``
    and writes ``<bed_name>.Rmsk.bed``. Only a message is printed when that
    output already exists.
    """
    pybedtools.set_tempdir('/sc/orga/scratch/richtf01')
    bed_out = bed_name + '.Rmsk.bed'
    if not os.path.isfile(bed_out):
        bed = BedTool(bed_name + '.merged.sorted.bed')
        # print() with one argument behaves the same on Python 2 and 3.
        print("Keeping calls in rmsk from " + bed_name + "...")
        bed_overlap = bed.intersect(bed_filter)
        bed_overlap.saveas(bed_out)
        print(bed_name + " done!")
    else:
        print(bed_name + " rmsk calls already isolated")
Esempio n. 28
0
def intersect_bed(bed_name, bed_filter):
    """KEEPS regions of annotation of interest that overlap with
    repeat-masked regions

    Reads ``<bed_name>.merged.sorted.bed``, intersects it with ``bed_filter``
    and writes ``<bed_name>.Rmsk.bed``. Only a message is printed when that
    output already exists.
    """
    pybedtools.set_tempdir('/sc/orga/scratch/richtf01')
    bed_out = bed_name + '.Rmsk.bed'
    if not os.path.isfile(bed_out):
        bed = BedTool(bed_name + '.merged.sorted.bed')
        # print() with one argument behaves the same on Python 2 and 3.
        print("Keeping calls in rmsk from " + bed_name + "...")
        bed_overlap = bed.intersect(bed_filter)
        bed_overlap.saveas(bed_out)
        print(bed_name + " done!")
    else:
        print(bed_name + " rmsk calls already isolated")
Esempio n. 29
0
def subtract_bed_rmsk(bed_name, bed_filter):
    """REMOVES regions of annotation of interest that overlap with
    repeat-masked regions

    Reads ``<bed_name>.bed``, subtracts ``bed_filter`` and writes
    ``<bed_name>.noRmsk.bed``. Only a message is printed when that output
    already exists.
    """
    pybedtools.set_tempdir('/sc/orga/scratch/richtf01')
    bed_out = bed_name + '.noRmsk.bed'
    if not os.path.isfile(bed_out):
        bed = BedTool(bed_name + '.bed')  # .merged.sorted
        # print() with one argument behaves the same on Python 2 and 3.
        print("Removing calls in rmsk from " + bed_name + "...")
        bed_no_overlap = bed.subtract(bed_filter)
        bed_no_overlap.saveas(bed_out)
        print(bed_name + " done!")
    else:
        print(bed_name + " rmsk calls already removed")
Esempio n. 30
0
def identify_targets(bam_files, config, out_base="shrna_targets"):
    """Create BED file of target regions based on input BAM alignments

    The output is ``<annotation dir>/<out_base>.bed``, where the annotation
    directory is ``config["dir"]["annotation"]``. An existing output is
    reused. Returns the output path.
    """
    work_dir = safe_makedir(config["dir"]["annotation"])
    pybedtools.set_tempdir(work_dir)
    out_file = os.path.join(work_dir, "{0}.bed".format(out_base))

    # Nothing to do when the targets were already computed.
    if file_exists(out_file):
        return out_file

    # Wrap every BAM first, then convert each one to BED.
    wrapped_bams = [pybedtools.BedTool(bam) for bam in bam_files]
    read_beds = [wrapped.bam_to_bed() for wrapped in wrapped_bams]

    # Concatenate all BEDs pairwise, then merge using the configured
    # "merge_distance" (default 0).
    combined = reduce(lambda acc, nxt: acc.cat(nxt), read_beds)
    combined.merge(
        d=config["algorithm"].get("merge_distance", 0)).saveas(out_file)
    return out_file
Esempio n. 31
0
def subtract_bed_rmsk(bed_name, bed_filter):
    """REMOVES regions of annotation of interest that overlap with
    repeat-masked regions

    Reads ``<bed_name>.bed``, subtracts ``bed_filter`` and writes
    ``<bed_name>.noRmsk.bed``. Only a message is printed when that output
    already exists.
    """
    pybedtools.set_tempdir('/sc/orga/scratch/richtf01')
    bed_out = bed_name + '.noRmsk.bed'
    if not os.path.isfile(bed_out):
        bed = BedTool(bed_name + '.bed')  # .merged.sorted
        # print() with one argument behaves the same on Python 2 and 3.
        print("Removing calls in rmsk from " + bed_name + "...")
        bed_no_overlap = bed.subtract(bed_filter)
        bed_no_overlap.saveas(bed_out)
        print(bed_name + " done!")
    else:
        print(bed_name + " rmsk calls already removed")
Esempio n. 32
0
def subtract_bed_sd(bed_name, bed_filter):
    """REMOVES regions of annotation of interest that overlap with
    segmental duplications

    Reads ``<bed_name>.noRmsk.bed``, subtracts ``bed_filter`` and writes
    ``<bed_name>.noRmsk.noSD.bed``. Only a message is printed when that
    output already exists.
    """
    pybedtools.set_tempdir('/sc/orga/scratch/richtf01')
    bed_out = bed_name + '.noRmsk.noSD.bed'
    if not os.path.isfile(bed_out):
        bed = BedTool(bed_name + '.noRmsk.bed')
        # print() with one argument behaves the same on Python 2 and 3.
        print("Removing calls in seg dup from " + bed_name + "...")
        bed_no_overlap = bed.subtract(bed_filter)
        bed_no_overlap.saveas(bed_out)
        print(bed_name + " done!")
    else:
        print(bed_name + " Seg dup calls already removed")
Esempio n. 33
0
def genotype_intervals(intervals_file=None,
                       bams=[],
                       workdir=None,
                       window=GT_WINDOW,
                       isize_mean=ISIZE_MEAN,
                       isize_sd=ISIZE_SD,
                       normal_frac_threshold=GT_NORMAL_FRAC):
    """Genotype every interval of ``intervals_file`` against a set of BAMs.

    Each interval is passed through ``parse_interval`` and
    ``genotype_interval``; the genotype is appended as an extra field and the
    result is written to ``<workdir>/genotyped.bed``.

    :param intervals_file: BED file of intervals to genotype.
    :param bams: paths of the BAM files to open (only read, never mutated).
    :param workdir: output and pybedtools temp directory (created if missing).
    :param window: forwarded to ``genotype_interval``.
    :param isize_mean: mean insert size.
    :param isize_sd: insert size standard deviation; the accepted insert-size
        range is mean +/- 3 sd, floored at 0.
    :param normal_frac_threshold: forwarded to ``genotype_interval``.
    :return: path of the genotyped BED file.
    :raises Exception: any failure is logged, printed and re-raised.
    """
    func_logger = logging.getLogger(
        "%s-%s" %
        (genotype_intervals.__name__, multiprocessing.current_process()))

    if workdir and not os.path.isdir(workdir):
        os.makedirs(workdir)

    pybedtools.set_tempdir(workdir)

    genotyped_intervals = []
    start_time = time.time()

    isize_min = max(0, isize_mean - 3 * isize_sd)
    isize_max = isize_mean + 3 * isize_sd

    # Filled one by one so that handles opened before a failure still get
    # closed by the finally clause.
    bam_handles = []
    try:
        for bam in bams:
            bam_handles.append(pysam.Samfile(bam, "rb"))
        for interval in pybedtools.BedTool(intervals_file):
            chrom, start, end, sv_type, svlen = parse_interval(interval)
            genotype = genotype_interval(str(chrom), start, end, sv_type,
                                         svlen, bam_handles, isize_min,
                                         isize_max, window,
                                         normal_frac_threshold)
            fields = interval.fields + [genotype]
            genotyped_intervals.append(
                pybedtools.create_interval_from_list(fields))
        bedtool = pybedtools.BedTool(genotyped_intervals).moveto(
            os.path.join(workdir, "genotyped.bed"))
    except Exception:
        func_logger.error('Caught exception in worker thread')

        # This prints the type, value, and stack trace of the
        # current exception being handled.
        traceback.print_exc()

        print()
        # Bare raise keeps the original traceback on both Python 2 and 3.
        raise
    finally:
        for bam_handle in bam_handles:
            bam_handle.close()
    func_logger.info("Genotyped %d intervals in %g minutes" %
                     (len(genotyped_intervals),
                      (time.time() - start_time) / 60.0))

    return bedtool.fn
Esempio n. 34
0
def merge_bed(bed_name):
    """ MERGES a bed file after removing rmsk, sd

    Reads ``<bed_name>.sorted.noRmsk.noSD.bed`` and writes
    ``<bed_name>.merged.sorted.noRmsk.noSD.bed``. Only a message is printed
    when that output already exists.
    """
    pybedtools.set_tempdir('/sc/orga/scratch/richtf01')
    bed_in = bed_name + '.sorted.noRmsk.noSD.bed'
    bed_out = bed_name + '.merged.sorted.noRmsk.noSD.bed'
    if not os.path.isfile(bed_out):
        bed = BedTool(bed_in)
        # print() with one argument behaves the same on Python 2 and 3.
        print("Merging " + bed_in + "...")
        bed_merged = bed.merge()
        bed_merged.saveas(bed_out)
        print(bed_name + " done!")
    else:
        print(bed_out + " already merged")
Esempio n. 35
0
def add_annotations(target_file, annotations):
    """Associate annotations with a BED file of targets.

    Based on the annotate.py example from pybedtools.

    The output path is ``target_file`` with ``-annotated`` inserted before
    its extension. The file is only (re)built when ``file_exists`` reports
    it missing. Returns the output path.
    """
    # str.format with unpacking replaces apply(), which Python 3 removed.
    out_file = "{0}-annotated{1}".format(*os.path.splitext(target_file))
    # dirname is "" for a bare file name, and "" is not an existing path,
    # so fall back to the current directory.
    pybedtools.set_tempdir(os.path.dirname(out_file) or ".")
    if not file_exists(out_file):
        all_ann = pybedtools.BedTool(_merge_gff(annotations))
        with_ann = pybedtools.BedTool(target_file).intersect(all_ann, wao=True)
        with open(with_ann.fn) as in_handle:
            with open(out_file, "w") as out_handle:
                _write_combined_features(in_handle, out_handle)
    return out_file
Esempio n. 36
0
def merge_bed(bed_name):
    """Merge the intervals of a sorted, repeat/SD-filtered BED file.

    Reads ``<bed_name>.sorted.noRmsk.noSD.bed`` and writes the merged
    intervals to ``<bed_name>.merged.sorted.noRmsk.noSD.bed``. Nothing is
    recomputed when the output file already exists.
    """
    pybedtools.set_tempdir('/sc/orga/scratch/richtf01')
    bed_in = bed_name + '.sorted.noRmsk.noSD.bed'
    bed_out = bed_name + '.merged.sorted.noRmsk.noSD.bed'
    # Single-argument print(...) behaves identically on Python 2 and 3.
    if not os.path.isfile(bed_out):
        bed = BedTool(bed_in)
        print("Merging " + bed_in + "...")
        bed_merged = bed.merge()
        bed_merged.saveas(bed_out)
        print(bed_name + " done!")
    else:
        print(bed_out + " already merged")
Esempio n. 37
0
def sort_bed(bed_name):
    """Sort a repeat/SD-filtered BED file with the external ``sort`` tool.

    Reads ``<bed_name>.noRmsk.noSD.bed`` and writes the result to
    ``<bed_name>.sorted.noRmsk.noSD.bed``. Sorting is skipped when the
    output file already exists.
    """
    pybedtools.set_tempdir('/sc/orga/scratch/richtf01')
    bed_in = bed_name + '.noRmsk.noSD.bed'
    bed_out = bed_name + '.sorted.noRmsk.noSD.bed'
    # Single-argument print(...) behaves identically on Python 2 and 3.
    if not os.path.isfile(bed_out):
        print("Sorting " + bed_in + "... ")
        # NOTE(review): the paths are interpolated into a shell command
        # unquoted, so names with spaces or shell metacharacters will break.
        sort_cmd = ("sort -V -k1,1 -k2,2 %s > %s" % (bed_in, bed_out))
        print(sort_cmd)
        subprocess.call(sort_cmd, shell=True)
        print(bed_name + " sorted!")
    else:
        print(bed_out + " already sorted")
Esempio n. 38
0
def clean_bed(bed_fpath, work_dir):
    """Write a cleaned copy of ``bed_fpath`` under ``work_dir`` and return its path.

    Rows whose chromosome field is empty or starts with '#', ' ', 'track'
    or 'browser' are dropped, then ``remove_invalid`` is applied. The
    existing cleaned file is returned untouched when ``can_reuse`` says so.
    """
    clean_fpath = intermediate_fname(work_dir, bed_fpath, 'clean')
    if can_reuse(clean_fpath, bed_fpath):
        return clean_fpath

    skipped_prefixes = ('#', ' ', 'track', 'browser')

    def _is_data_row(feature):
        # Keep only rows with a chromosome that is not a header/comment marker.
        return feature.chrom and not feature.chrom.startswith(skipped_prefixes)

    tmp_dir = safe_mkdir(join(work_dir, 'pybedtools_tmp'))
    pybedtools.set_tempdir(tmp_dir)
    cleaned = BedTool(bed_fpath).filter(_is_data_row).remove_invalid()
    with file_transaction(work_dir, clean_fpath) as tx_out_file:
        cleaned.saveas(tx_out_file)
    verify_bed(clean_fpath, is_critical=True)
    debug('Saved clean BED file into ' + clean_fpath)
    return clean_fpath
Esempio n. 39
0
def initBedTool(tempPrefix=""):
    """Create a randomly tagged temp directory in the cwd for pybedtools.

    The directory is named ``<tempPrefix>TempBedTool_<5 random chars>``,
    is registered with ``pybedtools.set_tempdir`` and its path is returned.
    """
    # keep temporary files in current directory, to make it a little harder to
    # lose track of them and clog up the system....
    alphabet = string.ascii_uppercase + string.digits
    tag = ''.join(random.choice(alphabet) for _ in range(5))
    tempPath = os.path.join(os.getcwd(), "%sTempBedTool_%s" % (tempPrefix, tag))
    logger.info("Temporary directory for BedTools (you may need to manually"
                " erase in event of crash): %s" % tempPath)
    try:
        os.makedirs(tempPath)
    except OSError:
        # An already existing directory is fine; any other failure must not
        # be swallowed, because the directory is required below.
        if not os.path.isdir(tempPath):
            raise
    pybedtools.set_tempdir(tempPath)
    return tempPath
Esempio n. 40
0
def overlap_target_counts(bam_file, target_file, config):
    """Count BAM alignments overlapping each shRNA target.

    The result is saved as ``<bam basename without extension>.bed`` in the
    directory configured under ``config["dir"]["counts"]``. An existing
    output file is reused. Returns the output path.
    """
    counts_dir = safe_makedir(config["dir"]["counts"])
    sample_name = os.path.splitext(os.path.basename(bam_file))[0]
    out_file = os.path.join(counts_dir, "{0}.bed".format(sample_name))
    if file_exists(out_file):
        return out_file

    pybedtools.set_tempdir(counts_dir)
    reads_as_bed = pybedtools.BedTool(bam_file).bam_to_bed()
    targets = pybedtools.BedTool(target_file)
    # c=True is forwarded to intersect (per-target overlap count).
    targets.intersect(reads_as_bed, c=True).saveas(out_file)
    return out_file
Esempio n. 41
0
def launch_coverage(dicoInit):
    """Build one coverage job per BAM in ``dicoInit['dicoBam']`` and launch them.

    Every job is keyed ``"coverage <bam path>"`` and points at the
    ``target_genes.bed`` file inside ``dicoInit['tmp']``.
    """
    printcolor("  • Compute Depth", "0", "222;220;184", dicoInit["color"])
    set_tempdir(dicoInit['tmp'])
    target_bed = dicoInit["tmp"] + "/target_genes.bed"
    dicoThread = {
        "coverage " + bam_path: {
            "bed": target_bed,
            "bam": bam_path,
            "bam_num": bam_num,
            "returnstatut": None,
            "returnlines": [],
        }
        for bam_num, bam_path in dicoInit['dicoBam'].items()
    }
    launch_threads(dicoInit, dicoThread, "pybedtoolcoverage",
                   pybedtoolcoverage, 1)
Esempio n. 42
0
def add_annotations(target_file, annotations):
    """Associate annotations with a BED file of targets.

    Based on the annotate.py example from pybedtools.

    The output path is ``target_file`` with ``-annotated`` inserted before
    its extension. The file is only (re)built when ``file_exists`` reports
    it missing. Returns the output path.
    """
    # str.format with unpacking replaces apply(), which Python 3 removed.
    out_file = "{0}-annotated{1}".format(*os.path.splitext(target_file))
    # dirname is "" for a bare file name, and "" is not an existing path,
    # so fall back to the current directory.
    pybedtools.set_tempdir(os.path.dirname(out_file) or ".")
    if not file_exists(out_file):
        all_ann = pybedtools.BedTool(_merge_gff(annotations))
        with_ann = pybedtools.BedTool(target_file).intersect(all_ann, wao=True)
        with open(with_ann.fn) as in_handle:
            with open(out_file, "w") as out_handle:
                _write_combined_features(in_handle, out_handle)
    return out_file
Esempio n. 43
0
def clean_bed(bed_fpath, work_dir):
    """Write a cleaned copy of ``bed_fpath`` under ``work_dir`` and return its path.

    Rows whose chromosome field is empty or starts with '#', ' ', 'track'
    or 'browser' are dropped, then ``remove_invalid`` is applied. The
    existing cleaned file is returned untouched when ``can_reuse`` says so.
    """
    clean_fpath = intermediate_fname(work_dir, bed_fpath, 'clean')
    if can_reuse(clean_fpath, bed_fpath):
        return clean_fpath

    skipped_prefixes = ('#', ' ', 'track', 'browser')

    def _is_data_row(feature):
        # Keep only rows with a chromosome that is not a header/comment marker.
        return feature.chrom and not feature.chrom.startswith(skipped_prefixes)

    tmp_dir = safe_mkdir(join(work_dir, 'pybedtools_tmp'))
    pybedtools.set_tempdir(tmp_dir)
    cleaned = BedTool(bed_fpath).filter(_is_data_row).remove_invalid()
    with file_transaction(work_dir, clean_fpath) as tx_out_file:
        cleaned.saveas(tx_out_file)
    verify_bed(clean_fpath, is_critical=True)
    debug('Saved clean BED file into ' + clean_fpath)
    return clean_fpath
Esempio n. 44
0
def identify_targets(bam_files, config, out_base="shrna_targets"):
    """Create a BED file of target regions based on input BAM alignments.

    Every BAM is converted to BED, the results are concatenated with
    ``cat`` and merged using ``config["algorithm"]["merge_distance"]``
    (default 0). The result is written to ``<out_base>.bed`` in the
    directory configured under ``config["dir"]["annotation"]``; an existing
    file is reused. Returns the output path.
    """
    # reduce is only a builtin on Python 2; functools.reduce works on both.
    from functools import reduce

    work_dir = safe_makedir(config["dir"]["annotation"])
    pybedtools.set_tempdir(work_dir)
    out_file = os.path.join(work_dir, "{0}.bed".format(out_base))
    if not file_exists(out_file):
        pybed_files = [pybedtools.BedTool(x) for x in bam_files]
        bed_files = [x.bam_to_bed() for x in pybed_files]
        combined_bed = reduce(lambda x, y: x.cat(y), bed_files)
        merged = combined_bed.merge(
            d=config["algorithm"].get("merge_distance", 0))
        merged.saveas(out_file)
    return out_file
Esempio n. 45
0
def multi_gene_sets_to_dict_of_beds(df_multi_gene_set, df_gene_coord,
                                    windowsize, tmp_bed_dir, out_dir,
                                    out_prefix):
    """Write one sorted, merged BED file per annotation of a multi gene set.

    INPUT
        df_multi_gene_set: DataFrame with the columns "annotation", "gene"
            and "annotation_value". Gene is human Ensembl gene names.
        df_gene_coord: DataFrame with the columns "GENE", "CHR", "START"
            and "END"; joined to the gene set on GENE == gene.
        windowsize: amount subtracted from START (clipped at 0) and added
            to END of every gene.
        tmp_bed_dir: directory for pybedtools temporary files; created when
            missing.
        out_dir, out_prefix: each annotation is saved to
            "<out_dir>/<out_prefix>.<annotation>.bed".
    OUTPUT
        None. The BED files are written as a side effect.
    """
    print('Making gene set bed files')
    DIR_TMP_PYBEDTOOLS = tmp_bed_dir
    try:
        os.makedirs(DIR_TMP_PYBEDTOOLS, exist_ok=True)
        # You'll need write permissions to this directory.
        pybedtools.set_tempdir(DIR_TMP_PYBEDTOOLS)
    except Exception as e:
        print("Caught exception: {}".format(e))
        # Report the fallback only when setting the tempdir actually failed.
        print(
            "Failed setting pybedtools tempdir to {}. Will use standard tempdir /tmp"
            .format(DIR_TMP_PYBEDTOOLS))
    # TODO: warn about genes that are absent from df_gene_coord; the inner
    # merge below drops them silently.
    for name_annotation, df_group in df_multi_gene_set.groupby("annotation"):
        print(
            "Merging input multi gene set with gene coordinates for annotation = {}"
            .format(name_annotation))
        df = pd.merge(df_gene_coord,
                      df_group,
                      left_on="GENE",
                      right_on="gene",
                      how="inner")
        df['START'] = np.maximum(0, df['START'] - windowsize)
        df['END'] = df['END'] + windowsize
        list_of_lists = []
        for (chrom, start, end, name, score) in np.array(
                df[['CHR', 'START', 'END', 'GENE', 'annotation_value']]):
            chrom = str(chrom)
            # Add the prefix only when missing. str.lstrip('chr') strips
            # *characters*, which mangles names such as "hs37d5".
            if not chrom.startswith('chr'):
                chrom = 'chr' + chrom
            list_of_lists.append(
                [chrom, str(start), str(end), str(name), str(score)])
        bed_for_annot = pybedtools.BedTool(list_of_lists).sort().merge(
            c=[4, 5], o=["distinct", "max"])
        out_file_name = '{}/{}.{}.bed'.format(out_dir, out_prefix,
                                              name_annotation)
        bed_for_annot.saveas(out_file_name)
    return None
Esempio n. 46
0
def initBedTool(tempPrefix=""):
    """Create a randomly tagged temp directory in the cwd for pybedtools.

    The directory is named ``<tempPrefix>TempBedTool_<5 random chars>``,
    is registered with ``pybedtools.set_tempdir`` and its path is returned.
    """
    # keep temporary files in current directory, to make it a little harder to
    # lose track of them and clog up the system....
    alphabet = string.ascii_uppercase + string.digits
    tag = ''.join(random.choice(alphabet) for _ in range(5))
    tempPath = os.path.join(os.getcwd(),
                            "%sTempBedTool_%s" % (tempPrefix, tag))
    logger.info("Temporary directory for BedTools (you may need to manually"
                " erase in event of crash): %s" % tempPath)
    try:
        os.makedirs(tempPath)
    except OSError:
        # An already existing directory is fine; any other failure must not
        # be swallowed, because the directory is required below.
        if not os.path.isdir(tempPath):
            raise
    pybedtools.set_tempdir(tempPath)
    return tempPath
Esempio n. 47
0
def sort_bed(bed_name):
    """Sort a repeat/SD-filtered BED file with the external ``sort`` tool.

    Reads ``<bed_name>.noRmsk.noSD.bed`` and writes the result to
    ``<bed_name>.sorted.noRmsk.noSD.bed``. Sorting is skipped when the
    output file already exists.
    """
    pybedtools.set_tempdir('/sc/orga/scratch/richtf01')
    bed_in = bed_name + '.noRmsk.noSD.bed'
    bed_out = bed_name + '.sorted.noRmsk.noSD.bed'
    # Single-argument print(...) behaves identically on Python 2 and 3.
    if not os.path.isfile(bed_out):
        print("Sorting " + bed_in + "... ")
        # NOTE(review): the paths are interpolated into a shell command
        # unquoted, so names with spaces or shell metacharacters will break.
        sort_cmd = ("sort -V -k1,1 -k2,2 %s > %s"
            % (bed_in, bed_out))
        print(sort_cmd)
        subprocess.call(sort_cmd, shell=True)
        print(bed_name + " sorted!")
    else:
        print(bed_out + " already sorted")
Esempio n. 48
0
def main():
    """Command-line entry point: build the exon coverage BED and filter it.

    Expects exactly three arguments: the input bed/tab file, the output
    prefix and the spliced-exon BED file. Exits with a usage message
    otherwise.
    """
    if len(sys.argv) != 4:
        sys.exit('[usage] python %s <bed_file> <out_prefix> <spliced_exon.bed>' % sys.argv[0])

    exons = REF_PATH + '/hg19_ref/genes/exons_all.bed_temp'
    tab_file, out_prefix, spliced_exons = sys.argv[1:4]
    cov_exon = out_prefix + '_exons.bed'

    # dirname is "" when the prefix has no directory part; use the current
    # directory in that case instead of passing an empty path.
    set_tempdir(os.path.dirname(out_prefix) or '.')
    make_exons(tab_file, cov_exon, exons)
    filter_bed(tab_file, out_prefix, cov_exon, spliced_exons)
Esempio n. 49
0
    def read_circ_rna_file(self, circ_rna_input, annotation_bed, has_header):
        """Read a CircCoordinates file produced by DCC.

        Columns 1-4 and 6 of every row (chromosome, start, stop, gene name,
        strand) are turned into a six-column BED record with score "0".
        The records are then intersected strand-specifically with
        ``annotation_bed``.

        Halts the program if the file cannot be opened due to missing
        permissions.

        :param circ_rna_input: path of the tab-separated input file
        :param annotation_bed: object providing ``intersect`` (a BedTool)
        :param has_header: skip the first line when true
        :return: result of ``annotation_bed.intersect(...)``
        """
        self.log_entry("Parsing circular RNA input file...")

        # set temporary directory for pybedtools
        pybedtools.set_tempdir(self.cli_params.tmp_directory)

        try:
            file_handle = open(circ_rna_input)
        except PermissionError:
            message = ("Input file " + str(circ_rna_input) + " cannot be read, exiting.")
            logging.info(message)
            sys.exit(message)

        bed_lines = []
        bed_peak_sizes = 0
        with file_handle:
            line_iterator = iter(file_handle)
            # skip first line with the header
            # we assume it's there (DCC default); the default argument keeps
            # an empty file from raising StopIteration
            if has_header:
                next(line_iterator, None)
            for line in line_iterator:
                columns = line.split('\t')

                # extract chromosome, start, stop, gene name, and strand
                entry = [self.strip_chr_name(columns[0]), columns[1], columns[2], columns[3], "0", columns[5]]

                # collected here and joined once below
                bed_lines.append('\t'.join(entry) + "\n")
                bed_peak_sizes += (int(columns[2]) - int(columns[1]))

        bed_entries = len(bed_lines)

        # create a "virtual" BED file
        virtual_bed_file = pybedtools.BedTool("".join(bed_lines), from_string=True)
        # Todo: figure out what this code was supposed to do
        test = annotation_bed.intersect(virtual_bed_file, s=True)

        # Guard the average so an input without data rows does not divide by zero.
        average_length = round(bed_peak_sizes / bed_entries) if bed_entries else 0

        self.log_entry("Done parsing circular RNA input file:")
        self.log_entry("=> %s circular RNAs, %s nt average (theoretical unspliced) length" %
                       (bed_entries, average_length))

        return test
Esempio n. 50
0
def test_getting_example_beds():
    """Example files resolve to the test data dir; unknown paths raise."""
    assert 'a.bed' in pybedtools.list_example_files()

    example_path = pybedtools.example_filename('a.bed')
    assert example_path == os.path.join(testdir, 'data', 'a.bed')

    example_tool = pybedtools.example_bedtool('a.bed')
    assert example_tool.fn == os.path.join(testdir, 'data', 'a.bed')

    # complain appropriately if nonexistent paths are asked for
    expected_error = FileNotFoundError if six.PY3 else ValueError
    path_consumers = (
        pybedtools.example_filename,
        pybedtools.example_bedtool,
        pybedtools.set_tempdir,
    )
    for consumer in path_consumers:
        with pytest.raises(expected_error):
            consumer('nonexistent')
Esempio n. 51
0
def run_spades_parallel(bam=None, spades=None, bed=None, work=None, pad=SPADES_PAD, nthreads=1, chrs=[], max_interval_size=50000,
                        timeout=SPADES_TIMEOUT, isize_min=ISIZE_MIN, isize_max=ISIZE_MAX, disable_deletion_assembly=False, stop_on_fail=False):
    """Assemble the intervals of ``bed`` with SPAdes using a process pool.

    Intervals (optionally restricted to the chromosomes in ``chrs``) are
    split into those accepted by ``should_be_assembled`` and those accepted
    by ``shouldnt_be_assembled``. The former are dealt round-robin to
    ``nthreads`` ``run_spades_single`` jobs working in ``<work>/<i>``. The
    FASTA files gathered by the job callback are concatenated into
    ``<work>/spades_assembled.fa`` and indexed. The latter are written to
    ``<work>/ignored.bed``.

    Returns ``(assembled_fasta, ignored_bed)``; ``ignored_bed`` is None when
    no interval was ignored. ``max_interval_size`` is accepted but not used
    in this function.
    """
    pybedtools.set_tempdir(work)

    bedtool = pybedtools.BedTool(bed)

    chrs = set(chrs)
    all_intervals = [interval for interval in bedtool if not chrs or interval.chrom in chrs]
    # Real lists (not filter objects) so they can be enumerated and
    # truth-tested the same way on Python 2 and Python 3.
    selected_intervals = [interval for interval in all_intervals
                          if should_be_assembled(interval, disable_deletion_assembly=disable_deletion_assembly)]
    ignored_intervals = [interval for interval in all_intervals
                         if shouldnt_be_assembled(interval, disable_deletion_assembly=disable_deletion_assembly)]

    pool = multiprocessing.Pool(nthreads)
    assembly_fastas = []
    for i in range(nthreads):
        # Every nthreads-th interval starting at i: round-robin distribution.
        intervals = selected_intervals[i::nthreads]
        kwargs_dict = {"intervals": intervals, "bam": bam, "spades": spades, "work": "%s/%d" % (work, i), "pad": pad,
                       "timeout": timeout, "isize_min": isize_min, "isize_max": isize_max, "stop_on_fail": stop_on_fail}
        pool.apply_async(run_spades_single, kwds=kwargs_dict,
                         callback=partial(run_spades_single_callback, result_list=assembly_fastas))

    pool.close()
    pool.join()

    logger.info("Merging the contigs from %s" % (str(assembly_fastas)))
    assembled_fasta = os.path.join(work, "spades_assembled.fa")
    with open(assembled_fasta, "w") as assembled_fd:
        # Open each file explicitly: fileinput.input([]) would silently fall
        # back to reading stdin when no assembly was produced.
        for assembly_fasta in assembly_fastas:
            with open(assembly_fasta) as assembly_fd:
                for line in assembly_fd:
                    assembled_fd.write("%s\n" % (line.strip()))

    logger.info("Indexing the assemblies")
    pysam.faidx(assembled_fasta)

    ignored_bed = None
    if ignored_intervals:
        ignored_bed = os.path.join(work, "ignored.bed")
        pybedtools.BedTool(ignored_intervals).each(add_breakpoints).saveas(ignored_bed)

    pybedtools.cleanup(remove_all=True)

    return assembled_fasta, ignored_bed
Esempio n. 52
0
def annotate(bed, input, bedout, rnazout):
    """Intersect RNAz records with a BED annotation and write the results.

    Returns 1 on success. On any exception the formatted traceback is
    printed to stderr and None is returned implicitly.
    """
    try:
        # Make sure we do not write somewhere we are not supposed to
        pybedtools.set_tempdir('.')
        annotation = pybedtools.BedTool(bed)
        rnaz_records = readrnaz(input)
        rnaz_bed = pybedtools.BedTool(rnaztobed(rnaz_records), from_string=True)

        # intersect strand specific, keep all info on a and b files
        overlaps = rnaz_bed.intersect(annotation, wa=True, wb=True, s=True)

        bedtornaz(overlaps, rnaz_records, bedout, rnazout)
    except Exception as err:
        report = tb.TracebackException(*sys.exc_info())
        print(''.join(report.format()), file=sys.stderr)
    else:
        return 1
Esempio n. 53
0
def overlap_with_observed(bed_name, observed_name, observed_denovo):
    """Count the observed de novos that overlap with the bed file.

    The intersection of ``<bed_name>.merged.sorted.noRmsk.noSD.bed`` with
    the observed de novo calls is saved under the ``anno_obs_intersect``
    directory for ``observed_name``. An existing intersection file is
    loaded instead of recomputed. Returns the number of intervals in it.
    """
    pybedtools.set_tempdir('/sc/orga/scratch/richtf01')
    bed_intersect_dir = ("/sc/orga/projects/chdiTrios/Felix/wgs/" +
        "anno_obs_intersect/" + observed_name + "/")
    bed_out = (bed_intersect_dir + bed_name +
        '.merged.sorted.noRmsk.noSD.' + observed_name + '.bed')
    denovo_bed = BedTool('/hpc/users/richtf01/whole_genome/' +
        'variant_calls/' + observed_denovo)
    # create or load intersection file
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print(bed_name + " overlap with " + observed_name)
    if not os.path.isfile(bed_out):
        bed = BedTool(bed_name + '.merged.sorted.noRmsk.noSD.bed')
        print("intersecting.. ")
        denovo_anno = bed.intersect(denovo_bed)
        denovo_anno.saveas(bed_out)
    else:
        print("already intersected")
        denovo_anno = BedTool(bed_out)
    return sum(1 for _ in denovo_anno)
Esempio n. 54
0
import sys, os
from subprocess import call
import pybedtools
from pybedtools import BedTool
import tabix
from pandas import *
from functools import reduce
import xlwt
import tempfile
# Read GWAVA_DIR from the environment; when it is unset, fall back to the
# hard-coded installation path below (not a path relative to this script).
GWAVA_DIR = os.getenv('GWAVA_DIR', '/public/home/chendenghui/run/work/non_soft/GWAVA')

# Set the pybedtools temp directory to the "tmp" folder inside GWAVA_DIR.
pybedtools.set_tempdir(GWAVA_DIR+'/tmp/')

#['ATF3', 'BATF', 'BCL11A', 'BCL3', 'BCLAF1', 'BDP1', 'BHLHE40', 'BRCA1', 'BRF1', 'BRF2', 'CCNT2', 'CEBPB', 'CHD2', 'CTBP2', 'CTCF', 'CTCFL', 'DNase', 'E2F1', 'E2F4', 'E2F6', 'EBF1', 'EGR1', 'ELF1', 'ELK4', 'EP300', 'ERALPHAA', 'ESRRA', 'ETS1', 'Eralphaa', 'FAIRE', 'FAM48A', 'FOS', 'FOSL1', 'FOSL2', 'FOXA1', 'FOXA2', 'GABPA', 'GATA1', 'GATA2', 'GATA3', 'GTF2B', 'GTF2F1', 'GTF3C2', 'H2AFZ', 'H3K27ac', 'H3K27me3', 'H3K36me3', 'H3K4me1', 'H3K4me2', 'H3K4me3', 'H3K79me2', 'H3K9ac', 'H3K9me1', 'H3K9me3', 'H4K20me1', 'HDAC2', 'HDAC8', 'HEY1', 'HMGN3', 'HNF4A', 'HNF4G', 'HSF1', 'IRF1', 'IRF3', 'IRF4', 'JUN', 'JUNB', 'JUND', 'KAT2A', 'MAFF', 'MAFK', 'MAX', 'MEF2_complex', 'MEF2A', 'MXI1', 'MYC', 'NANOG', 'NFE2', 'NFKB1', 'NFYA', 'NFYB', 'NR2C2', 'NR3C1', 'NR4A1', 'NRF1', 'PAX5', 'PBX3', 'POLR2A', 'POLR2A_elongating', 'POLR3A', 'POU2F2', 'POU5F1', 'PPARGC1A', 'PRDM1', 'RAD21', 'RDBP', 'REST', 'RFX5', 'RXRA', 'SETDB1', 'SIN3A', 'SIRT6', 'SIX5', 'SLC22A2', 'SMARCA4', 'SMARCB1', 'SMARCC1', 'SMARCC2', 'SMC3', 'SP1', 'SP2', 'SPI1', 'SREBF1', 'SREBF2', 'SRF', 'STAT1', 'STAT2', 'STAT3', 'SUZ12', 'TAF1', 'TAF7', 'TAL1', 'TBP', 'TCF12', 'TCF7L2', 'TFAP2A', 'TFAP2C', 'THAP1', 'TRIM28', 'USF1', 'USF2', 'WRNIP1', 'XRCC4', 'YY1', 'ZBTB33', 'ZBTB7A', 'ZEB1', 'ZNF143', 'ZNF263', 'ZNF274', 'ZZZ3']

def encode_feats(vf, af):
    results = {}
    cols = open(af+'.cols', 'r').readline().strip().split(',')
    #intersection = vs.intersect(feats, wb=True)#TRUE
    tempfile1 = tempfile.mktemp()
    sort_cmd1 = 'bedtools intersect -wb -a %s -b %s > %s' % (vf, af, tempfile1)
    call(sort_cmd1, shell=True)
    tempfile2 = tempfile.mktemp()
    sort_cmd2 = 'awk -F \'\t\' \'{print $1"\t"$2"\t"$3"\t"$4"\t"$10"\t"$5"_"$6"_"$7"_"$8"_"$9"_"$10"_"$11"_"$12}\' %s > %s' % (tempfile1, tempfile2)
    call(sort_cmd2, shell=True)
    intersection = BedTool(tempfile2)
    annots = intersection.groupby(g=[1,2,3,4,5], c=6, ops='collapse')
    for entry in annots:
        #fs = entry[5].strip(',').split(',')
Esempio n. 55
0
def get_total_bed_size(bed_fpath, work_dir=None):
    """Return the summed length of the merged intervals of ``bed_fpath``.

    When ``work_dir`` is given, pybedtools temp files are placed in its
    ``pybedtools_tmp`` subdirectory.
    """
    if work_dir:
        tmp_dir = safe_mkdir(join(work_dir, 'pybedtools_tmp'))
        pybedtools.set_tempdir(tmp_dir)
    total_size = 0
    for region in BedTool(bed_fpath).merge():
        total_size += len(region)
    return total_size
Esempio n. 56
0
    fields.append(str(max_cov / average_coverage if average_coverage > 0 else 1))
    fields.append(";".join([str(i) for i in very_good_coverages]))
    fields.append(str(average_very_good_coverage))
    fields.append(str(min(very_good_coverages)))
    fields.append(str(max(very_good_coverages)))
    fields.append(str(min(very_good_coverages) / average_very_good_coverage if (average_very_good_coverage > 0) else 1))
    fields.append(str(max(very_good_coverages) / average_very_good_coverage if (average_very_good_coverage > 0) else 1))

    return pybedtools.create_interval_from_list(fields)


def add_coverage_information(in_bed, bam):
    """Apply ``annotate_coverage`` (bound to ``bam``) to every feature of ``in_bed``."""
    annotate_with_bam = partial(annotate_coverage, bam=bam)
    return in_bed.each(annotate_with_bam)


# Create the temp directory *before* handing it to pybedtools: set_tempdir
# rejects a path that does not exist, so creating it afterwards is too late.
if not os.path.isdir(args.tmpdir):
    os.makedirs(args.tmpdir)

pybedtools.set_tempdir(args.tmpdir)

in_bed = pybedtools.BedTool(args.in_bed)

out_bed = in_bed
logger.info("Initial feature count %d" % (out_bed.count()))

bed_fields = ["#CHROM", "START", "END"]
bed_fields += ["NUM_CONTIGS_USED", "TOTAL_CONTIGS_COUNT"]
bed_fields += map(lambda x: "VERYGOOD_%s" % (x),
                  ["NUM_ASMS", "NUM_UNIQUE_ASMS", "HAS_ASM", "IS_CONSISTENT", "INSERTION_LENGTH", "ASM_START",
                   "ASM_END", "EXCISION_REF", "EXCISION_ASM"])
bed_fields += map(lambda x: "GOOD_%s" % (x),
Esempio n. 57
0
def run_age_parallel(intervals_bed=None, reference=None, assembly=None, pad=AGE_PAD, age=None, age_workdir=None,
                     timeout=AGE_TIMEOUT, keep_temp=False, assembly_tool="spades", chrs=[], nthreads=1,
                     min_contig_len=AGE_MIN_CONTIG_LENGTH,
                     max_region_len=AGE_MAX_REGION_LENGTH, sv_types=[],
                     min_del_subalign_len=MIN_DEL_SUBALIGN_LENGTH, min_inv_subalign_len=MIN_INV_SUBALIGN_LENGTH,
                     age_window=AGE_WINDOW_SIZE):
    """Run AGE over the assembled contigs in parallel and merge the results.

    Contig headers are read from the ``assembly`` FASTA (indexed first when
    no ``.fai`` exists) and grouped by SV region. Contigs are kept only if
    they pass the ``chrs``, ``sv_types``, ``min_contig_len`` and
    ``max_region_len`` filters. The sorted regions are dealt round-robin to
    ``run_age_single`` jobs in a process pool. The breakpoint BED files
    gathered by the job callback are concatenated, sorted and saved as
    ``<age_workdir>/breakpoints.bed``.

    Returns the path of the merged BED, or None when there are no regions
    to process or no job produced a breakpoints file.

    Raises ValueError when ``assembly`` is given and ``assembly_tool`` is
    neither "spades" nor "tigra".
    """
    func_logger = logging.getLogger("%s-%s" % (run_age_parallel.__name__, multiprocessing.current_process()))

    if not os.path.isdir(age_workdir):
        func_logger.info("Creating %s" % age_workdir)
        os.makedirs(age_workdir)

    if assembly:
        # Resolve the header parser up front; previously an unknown tool left
        # `contigs` unbound and failed later with an UnboundLocalError.
        if assembly_tool == "spades":
            contig_class = SpadesContig
        elif assembly_tool == "tigra":
            contig_class = TigraContig
        else:
            raise ValueError("Unsupported assembly_tool %r, expected 'spades' or 'tigra'" % (assembly_tool,))

        if not os.path.isfile("%s.fai" % assembly):
            func_logger.info("Assembly FASTA wasn't indexed. Will attempt to index now.")
            pysam.faidx(assembly)

        func_logger.info("Loading assembly contigs from %s" % assembly)
        with open(assembly) as assembly_fd:
            # Only the FASTA header lines are parsed (without the leading '>').
            contigs = [contig_class(line[1:]) for line in assembly_fd if line[0] == '>']
    else:
        contigs = []

    chrs = set(chrs)
    sv_types = set(sv_types)
    # One (initially empty) bucket per region that has at least one contig
    # passing every filter.
    contig_dict = {contig.sv_region.to_tuple(): [] for contig in contigs if (len(
        chrs) == 0 or contig.sv_region.chrom1 in chrs) and contig.sequence_len >= min_contig_len and contig.sv_region.length() <= max_region_len and (
                       len(sv_types) == 0 or contig.sv_type in sv_types)}

    func_logger.info("Generating the contig dictionary for parallel execution")
    small_contigs_count = 0
    for contig in contigs:
        if contig.sv_region.length() > max_region_len:
            func_logger.info("Too large SV region length: %d > %d" % (contig.sv_region.length(), max_region_len))
            continue
        if (len(chrs) == 0 or contig.sv_region.chrom1 in chrs) and (len(sv_types) == 0 or contig.sv_type in sv_types):
            if contig.sequence_len >= min_contig_len:
                contig_dict[contig.sv_region.to_tuple()].append(contig)
            else:
                small_contigs_count += 1

    region_list = sorted(contig_dict.keys())
    # Never start more workers than there are regions.
    nthreads = min(nthreads, len(region_list))

    if nthreads == 0:
        func_logger.warning("AGE not run since no contigs found")
        return None

    func_logger.info("Will process %d regions with %d contigs (%d small contigs ignored) using %d threads" % (
        len(region_list), sum([len(value) for value in contig_dict.values()]), small_contigs_count, nthreads))

    pybedtools.set_tempdir(age_workdir)
    pool = multiprocessing.Pool(nthreads)

    breakpoints_beds = []
    # range works on Python 2 and 3 (xrange is Python 2 only).
    for i in range(nthreads):
        # Every nthreads-th region starting at i: round-robin distribution.
        region_sublist = region_list[i::nthreads]
        kwargs_dict = {"intervals_bed": intervals_bed, "region_list": region_sublist, "contig_dict": contig_dict,
                       "reference": reference, "assembly": assembly, "pad": pad, "age": age, "age_workdir": age_workdir,
                       "timeout": timeout, "keep_temp": keep_temp, "myid": i,
                       "min_del_subalign_len": min_del_subalign_len, "min_inv_subalign_len": min_inv_subalign_len,
                       "age_window": age_window}
        pool.apply_async(run_age_single, args=[], kwds=kwargs_dict,
                         callback=partial(run_age_single_callback, result_list=breakpoints_beds))

    pool.close()
    pool.join()

    func_logger.info("Finished parallel execution")

    func_logger.info("Will merge the following breakpoints beds %s" % (str(breakpoints_beds)))

    pybedtools.cleanup(remove_all=True)

    if not breakpoints_beds:
        return None

    bedtool = pybedtools.BedTool(breakpoints_beds[0])
    for bed_file in breakpoints_beds[1:]:
        bedtool = bedtool.cat(pybedtools.BedTool(bed_file), postmerge=False)

    bedtool = bedtool.moveto(os.path.join(age_workdir, "breakpoints_unsorted.bed"))
    merged_bed = os.path.join(age_workdir, "breakpoints.bed")
    bedtool.sort().saveas(merged_bed)

    return merged_bed
Esempio n. 58
0
import pybedtools
import os, difflib, sys
from nose.tools import assert_raises, raises
from pybedtools.helpers import BEDToolsError

# Directory that contains this test module.
testdir = os.path.dirname(__file__)

# Point the pybedtools temp directory at the current working directory.
pybedtools.set_tempdir(".")


def fix(x):
    """
    Normalize whitespace-separated text into tab-delimited BED lines.

    Leading whitespace is removed from each line, blank lines are dropped,
    and runs of whitespace between fields become single tabs. A line that
    ends with a tab keeps one trailing tab. Every emitted line ends with a
    newline. Handy for writing BED content inline in tests.
    """
    rows = []
    for raw_line in x.splitlines():
        stripped = raw_line.lstrip()
        if not stripped:
            continue
        trailing_tab = "\t" if stripped.endswith("\t") else ""
        rows.append("\t".join(stripped.split()) + trailing_tab + "\n")
    return "".join(rows)

Esempio n. 59
0
import pybedtools
import sys
import os, difflib
from nose.tools import assert_raises

# Directory that contains this test module.
testdir = os.path.dirname(__file__)

# Point the pybedtools temp directory at the current working directory.
pybedtools.set_tempdir('.')

def fix(x):
    """
    Normalize whitespace-separated text into tab-delimited BED lines.

    Each line is stripped on both sides, blank lines are dropped, and runs
    of whitespace between fields become single tabs. Every emitted line
    ends with a newline. Handy for writing BED content inline in tests.
    """
    rows = []
    for raw_line in x.splitlines():
        fields = raw_line.strip().split()
        if not fields:
            continue
        rows.append('\t'.join(fields) + '\n')
    return ''.join(rows)


def test_isBAM():
    bam = pybedtools.example_filename('x.bam')
    notabam = pybedtools.example_filename('a.bed')
    open('tiny.txt', 'w').close()
    assert pybedtools.helpers.isBAM(bam)
Esempio n. 60
0
def setup():
    """Create the test temp directory if needed and make pybedtools use it."""
    if not os.path.exists(test_tempdir):
        # os.makedirs creates missing parents like `mkdir -p`, without
        # passing an unquoted path through a shell.
        os.makedirs(test_tempdir)
    pybedtools.set_tempdir(test_tempdir)