Python set_tempdir Examples, pybedtools.set_tempdir Python Examples

Example #1

0

Show file

File: cnv.py Project: guo-cheng/jcvi

def gcn(args):
    """
    %prog gcn gencode.v26.exonunion.bed data/*.vcf.gz

    Compile gene copy number based on CANVAS results.
    """
    # NOTE: the docstring above is handed to OptionParser below, so it is
    # user-facing usage text and must stay in "%prog ..." form.
    p = OptionParser(gcn.__doc__)
    p.set_cpus()
    p.set_tmpdir(tmpdir="tmp")
    p.set_outfile()
    opts, args = p.parse_args(args)

    # Requires the exon BED plus at least one VCF; otherwise print the help
    # text and exit.
    if len(args) < 2:
        sys.exit(not p.print_help())

    exonbed = args[0]
    canvasvcfs = args[1:]
    tsvfile = opts.outfile
    tmpdir = opts.tmpdir

    # Create the scratch directory and register it via set_tempdir before
    # any work is done.
    mkdir(tmpdir)
    set_tempdir(tmpdir)

    df = vcf_to_df(canvasvcfs, exonbed, opts.cpus)
    # The same data frame is written once per suffix.
    for suffix in (".avgcn", ".medcn"):
        df_to_tsv(df, tsvfile, suffix)

Example #2

0

Show file

File: test1.py Project: ml4wc/pybedtools

def cleanup_unwriteable():
    """
    Remove the unwriteable directory (if it exists) and point the
    pybedtools tempdir back at the regular test tempdir.
    """
    leftover_present = os.path.exists(unwriteable)
    if leftover_present:
        removal_command = 'rm -rf %s' % unwriteable
        os.system(removal_command)
    pybedtools.set_tempdir(test_tempdir)

Example #3

0

Show file

File: Tools.py Project: FischbachLab/ninjaMap

def calculate_coverage(bamfile_name, output_dir):
    """Return per-position depth for a BAM file as a DataFrame.

    Creates ``<output_dir>/tmp`` (if needed), registers it as the pybedtools
    temp directory, runs ``genome_coverage(dz=True)`` on the BAM and loads
    the result into a DataFrame with columns ``contig``, ``pos``, ``depth``.

    :param bamfile_name: path handed to ``pybedtools.BedTool``.
    :param output_dir: directory under which the ``tmp`` scratch dir lives.
    :return: the DataFrame produced by ``to_dataframe``.
    """
    tmp_dir = f'{output_dir}/tmp'
    os.makedirs(tmp_dir, exist_ok=True)
    pybedtools.set_tempdir(tmp_dir)
    try:
        bed = pybedtools.BedTool(bamfile_name)
        # The DataFrame is fully built before ``finally`` runs, so cleaning
        # up the temp files afterwards does not affect the returned data.
        return bed.genome_coverage(dz=True).to_dataframe(
            names=['contig', 'pos', 'depth'])
    finally:
        # Always remove pybedtools temp files, even when coverage fails;
        # previously an exception left them behind.
        pybedtools.cleanup()

Example #4

0

Show file

File: test1.py Project: fsroque/pybedtools

def test_stream():
    """
    Stream and file-based equality, both whole-file and Interval by
    Interval
    """
    a = pybedtools.example_bedtool('a.bed')
    b = pybedtools.example_bedtool('b.bed')
    c = a.intersect(b)

    # make an unwriteable dir...
    orig_tempdir = pybedtools.get_tempdir()
    if os.path.exists('unwriteable'):
        os.system('rm -rf unwriteable')
    os.system('mkdir unwriteable')
    os.system('chmod -w unwriteable')

    # try/finally so a failing assertion cannot leave the process-wide
    # tempdir pointing at the unwriteable directory for later tests.
    try:
        # ...set that to the new tempdir
        pybedtools.set_tempdir('unwriteable')

        # this should really not be written anywhere
        d = a.intersect(b, stream=True)

        assert_raises(NotImplementedError, c.__eq__, d)
        d_contents = d.fn.read()
        # Close the file-based result promptly instead of leaking the handle.
        with open(c.fn) as c_handle:
            c_contents = c_handle.read()
        assert d_contents == c_contents

        # reconstruct d and check Interval-by-Interval equality
        pybedtools.set_tempdir('unwriteable')
        d = a.intersect(b, stream=True)

        for i, j in zip(c, d):
            assert str(i) == str(j)

        # Now do something similar with GFF files.
        a = pybedtools.example_bedtool('a.bed')
        f = pybedtools.example_bedtool('d.gff')

        # file-based
        pybedtools.set_tempdir(orig_tempdir)
        g1 = f.intersect(a)

        # streaming
        pybedtools.set_tempdir('unwriteable')
        g2 = f.intersect(a, stream=True)

        for i, j in zip(g1, g2):
            assert str(i) == str(j)

        # this was segfaulting at one point, just run to make sure
        g3 = f.intersect(a, stream=True)
        for i in iter(g3):
            # Parenthesised single-argument form prints the same text on
            # Python 2 and is valid syntax on Python 3.
            print(i)

        for row in f.cut(range(3), stream=True):
            row[0], row[1], row[2]
            assert_raises(IndexError, row.__getitem__, 3)
    finally:
        pybedtools.set_tempdir(orig_tempdir)
        os.system('rm -fr unwriteable')

Example #5

0

Show file

File: callable_class.py Project: frichter/embryo_rnaseq

 def __init__(self, id, home_dir):
     """Set up the file paths and bookkeeping for one callable-locus sample.

     Creates the per-sample sub-directory when it does not exist yet and
     registers ``home_dir + '/tmp_dir/'`` as the pybedtools temp directory.
     """
     self.aligner_ls = ['star', 'hisat2']
     self.id = id
     self.subdir = '{}callable_comparison/{}/'.format(home_dir, id)
     subdir_missing = not os.path.exists(self.subdir)
     if subdir_missing:
         os.mkdir(self.subdir)

     # original input file (output from GATK callableloci), one per aligner
     self.call_loci_ls = []
     for aligner in self.aligner_ls:
         self.call_loci_ls.append(
             '{}FASTQ/{}/{}_{}_callable.bed'.format(home_dir, id, id, aligner))

     # intermediate files: one per aligner, then intersection and union
     self.call_only_ls = []
     for aligner in self.aligner_ls:
         self.call_only_ls.append(
             '{}callable_{}.bed'.format(self.subdir, aligner))
     self.call_inter = '{}callable_{}.bed'.format(self.subdir, 'inter')
     self.call_union = '{}callable_{}.bed'.format(self.subdir, 'union')
     self.callable_ls = list(self.call_only_ls)
     self.callable_ls.extend([self.call_inter, self.call_union])

     # label -> path lookup, and a per-path length table starting at zero
     callable_fs = ['star', 'hisat2', 'intersect', 'union']
     self.callable_dict = {
         label: path for label, path in zip(callable_fs, self.callable_ls)
     }
     self.len_dict = dict.fromkeys(self.callable_ls, 0)
     # drop the last character of subdir before appending the suffix
     self.len_loc = self.subdir[:-1] + '_lengths.txt'
     pybedtools.set_tempdir(home_dir + '/tmp_dir/')

Example #6

0

Show file

File: test1.py Project: dnewkirk/pybedtools

def cleanup_unwriteable():
    """
    Remove the unwriteable directory (if it exists) and point the
    pybedtools tempdir back at the regular test tempdir.
    """
    leftover_present = os.path.exists(unwriteable)
    if leftover_present:
        removal_command = 'rm -rf %s' % unwriteable
        os.system(removal_command)
    pybedtools.set_tempdir(test_tempdir)

Example #7

0

Show file

File: genotype.py Project: joelmartin/metasv

def genotype_intervals(intervals_file=None, bam=None, workdir=None, window=GT_WINDOW, isize_mean=ISIZE_MEAN, isize_sd=ISIZE_SD, normal_frac_threshold=GT_NORMAL_FRAC):
    """Genotype every interval in a BED file against one BAM.

    Each interval's fields are extended with the genotype returned by
    ``genotype_interval`` and the result is moved to
    ``<workdir>/genotyped.bed``.

    :return: path of the genotyped BED file.
    :raises Exception: any error is logged, its traceback printed, and then
        re-raised unchanged.
    """
    func_logger = logging.getLogger("%s-%s" % (genotype_intervals.__name__, multiprocessing.current_process()))

    if workdir and not os.path.isdir(workdir):
        os.makedirs(workdir)

    pybedtools.set_tempdir(workdir)

    genotyped_intervals = []
    start_time = time.time()

    # Insert-size bounds: mean +/- 3 standard deviations, floored at 0.
    isize_min = max(0, isize_mean - 3 * isize_sd)
    isize_max = isize_mean + 3 * isize_sd

    bam_handle = None
    try:
        bam_handle = pysam.Samfile(bam, "rb")
        for interval in pybedtools.BedTool(intervals_file):
            chrom, start, end, sv_type, svlen = parse_interval(interval)
            genotype = genotype_interval(chrom, start, end, sv_type, svlen, bam_handle, isize_min, isize_max, window, normal_frac_threshold)
            fields = interval.fields + [genotype]
            genotyped_intervals.append(pybedtools.create_interval_from_list(fields))
        bedtool = pybedtools.BedTool(genotyped_intervals).moveto(os.path.join(workdir, "genotyped.bed"))
    except Exception:
        func_logger.error('Caught exception in worker thread')

        # This prints the type, value, and stack trace of the
        # current exception being handled.
        traceback.print_exc()

        print()
        # Bare raise re-raises the active exception with its original
        # traceback intact.
        raise
    finally:
        # The BAM handle was previously never closed.
        if bam_handle is not None:
            bam_handle.close()
    func_logger.info("Genotyped %d intervals in %g minutes" % (len(genotyped_intervals), (time.time() - start_time)/60.0))

    return bedtool.fn

Example #8

0

Show file

def annotate(bed, input, bedout, rnazout):
    """Intersect RNAz records with an annotation BED and write the results.

    Returns 1 on success. On any exception the formatted traceback is
    printed to stderr and the function falls through (returning None).
    """
    try:
        # Make sure we do not write somewhere we are not supposed to
        pybedtools.set_tempdir('.')
        annotation = pybedtools.BedTool(bed)
        rnaz_records = readrnaz(input)
        rnaz_bed = pybedtools.BedTool(rnaztobed(rnaz_records),
                                      from_string=True)

        # intersect strand specific, keep all info on a and b files
        overlaps = rnaz_bed.intersect(annotation, wa=True, wb=True, s=True)

        bedtornaz(overlaps, rnaz_records, bedout, rnazout)
    except Exception:
        failure = tb.TracebackException(*sys.exc_info())
        print(''.join(failure.format()), file=sys.stderr)
    else:
        return 1

Example #9

0

Show file

File: cnv.py Project: xuanblo/jcvi

def gcn(args):
    """
    %prog gcn gencode.v26.exonunion.bed data/*.vcf.gz

    Compile gene copy number based on CANVAS results.
    """
    # NOTE: the docstring above is handed to OptionParser below, so it is
    # user-facing usage text and must stay in "%prog ..." form.
    p = OptionParser(gcn.__doc__)
    p.set_cpus()
    p.set_tmpdir(tmpdir="tmp")
    p.set_outfile()
    opts, args = p.parse_args(args)

    # Requires the exon BED plus at least one VCF; otherwise print the help
    # text and exit.
    if len(args) < 2:
        sys.exit(not p.print_help())

    exonbed = args[0]
    canvasvcfs = args[1:]
    tsvfile = opts.outfile
    tmpdir = opts.tmpdir

    # Create the scratch directory and register it via set_tempdir before
    # any work is done.
    mkdir(tmpdir)
    set_tempdir(tmpdir)

    df = vcf_to_df(canvasvcfs, exonbed, opts.cpus)
    # The same data frame is written once per suffix.
    for suffix in (".avgcn", ".medcn"):
        df_to_tsv(df, tsvfile, suffix)

Example #10

0

Show file

def test_stream():
    """
    Stream and file-based equality, both whole-file and Interval by
    Interval
    """
    a = pybedtools.example_bedtool('a.bed')
    b = pybedtools.example_bedtool('b.bed')
    c = a.intersect(b)

    # make an unwriteable dir...
    orig_tempdir = pybedtools.get_tempdir()
    if os.path.exists('unwriteable'):
        os.system('rm -rf unwriteable')
    os.system('mkdir unwriteable')
    os.system('chmod -w unwriteable')

    # try/finally so a failing assertion cannot leave the process-wide
    # tempdir pointing at the unwriteable directory for later tests.
    try:
        # ...set that to the new tempdir
        pybedtools.set_tempdir('unwriteable')

        # this should really not be written anywhere
        d = a.intersect(b, stream=True)

        assert_raises(NotImplementedError, c.__eq__, d)
        d_contents = d.fn.read()
        # Close the file-based result promptly instead of leaking the handle.
        with open(c.fn) as c_handle:
            c_contents = c_handle.read()
        assert d_contents == c_contents

        # reconstruct d and check Interval-by-Interval equality
        pybedtools.set_tempdir('unwriteable')
        d = a.intersect(b, stream=True)

        for i, j in zip(c, d):
            assert str(i) == str(j)

        # Now do something similar with GFF files.
        a = pybedtools.example_bedtool('a.bed')
        f = pybedtools.example_bedtool('d.gff')

        # file-based
        pybedtools.set_tempdir(orig_tempdir)
        g1 = f.intersect(a)

        # streaming
        pybedtools.set_tempdir('unwriteable')
        g2 = f.intersect(a, stream=True)

        for i, j in zip(g1, g2):
            assert str(i) == str(j)

        # this was segfaulting at one point, just run to make sure
        g3 = f.intersect(a, stream=True)
        for i in iter(g3):
            # Parenthesised single-argument form prints the same text on
            # Python 2 and is valid syntax on Python 3.
            print(i)

        for row in f.cut(range(3), stream=True):
            row[0], row[1], row[2]
            assert_raises(IndexError, row.__getitem__, 3)
    finally:
        pybedtools.set_tempdir(orig_tempdir)
        os.system('rm -fr unwriteable')

Example #11

0

Show file

def batch_callable_bed(bam_files, output_bed_file, work_dir, genome_fasta_file, min_depth,
                       parall_view=None):
    """ Build a BED of regions that are callable in most of the given BAMs.

        Runs ``_calculate`` for every BAM (through the supplied parallel
        view, or a freshly created one), multi-intersects the resulting
        BEDs and keeps regions reported by at least 80% of them (never
        fewer than one). Returns ``output_bed_file``; when ``can_reuse``
        says the existing output is usable, nothing is recomputed.
    """
    if can_reuse(output_bed_file, bam_files):
        return output_bed_file

    work_dir = safe_mkdir(join(work_dir, 'callable_work'))

    # Same argument list for either execution path.
    job_args = [[bam, work_dir, genome_fasta_file, min_depth] for bam in bam_files]
    if not parall_view:
        n_bams = len(bam_files)
        with parallel_view(n_bams, ParallelCfg(threads=n_bams), work_dir) as parall_view:
            callable_beds = parall_view.run(_calculate, job_args)
    else:
        callable_beds = parall_view.run(_calculate, job_args)

    # we want to pick those regions that have coverage at 80% of samples
    good_overlap_sample_fraction = 0.8
    good_overlap_count = max(1, good_overlap_sample_fraction * len(callable_beds))
    info(f'Intersecting callable regions and picking good overlaps with >={good_overlap_count} '
         f'samples ({100 * good_overlap_sample_fraction}% of {len(callable_beds)})')

    def _seen_in_enough_samples(region):
        # Column 5 holds a comma-separated list; count its entries.
        return len(region[4].split(',')) >= good_overlap_count

    with file_transaction(work_dir, output_bed_file) as tx:
        pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))
        merged = pybedtools.BedTool().multi_intersect(i=callable_beds)
        merged.filter(_seen_in_enough_samples).saveas(tx)
    info(f'Saved to {output_bed_file}')
    return output_bed_file

Example #12

0

Show file

File: sex.py Project: vladsaveliev/Utils

def determine_sex(work_dir, bam_fpath, avg_depth, genome, target_bed=None):
    """Guess sample sex from chrY key-region coverage versus average depth.

    :param work_dir: working directory; also hosts the pybedtools temp dir.
    :param bam_fpath: BAM file to measure Y coverage in.
    :param avg_depth: sample average depth (converted with ``float``).
    :param genome: genome name, matched against the keys of
        ``chry_key_regions_by_genome`` by substring.
    :param target_bed: optional capture target; when given, the key regions
        are restricted to their overlap with it.
    :return: ``'M'``, ``'F'``, or ``None`` when sex cannot be determined
        (no key regions for the genome, no overlap with the target, or
        average depth below the threshold).
    """
    debug()
    debug('Determining sex')
    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))

    male_bed = None
    for k in chry_key_regions_by_genome:
        if k in genome:
            male_bed = BedTool(chry_key_regions_by_genome.get(k))
            break
    if not male_bed:
        warn('Warning: no male key regions for ' + genome + ', cannot identify sex')
        return None

    male_area_size = get_total_bed_size(male_bed)
    debug('Male region total size: ' + str(male_area_size))

    if target_bed:
        target_male_bed = join(work_dir, 'male.bed')
        with file_transaction(work_dir, target_male_bed) as tx:
            BedTool(target_bed).intersect(male_bed).merge().saveas(tx)
        target_male_area_size = get_total_bed_size(target_male_bed)
        if target_male_area_size == 0:
            debug('The male non-PAR region does not overlap with the capture target - cannot determine sex.')
            return None
        male_bed = target_male_bed
    else:
        debug('WGS, determining sex based on chrY key regions coverage.')

    info('Detecting sex by comparing the Y chromosome key regions coverage and average coverage depth.')
    if not bam_fpath:
        critical('BAM file is required.')
    index_bam(bam_fpath)

    chry_mean_coverage = _calc_mean_coverage(work_dir, male_bed, bam_fpath, 1)
    debug('Y key regions average depth: ' + str(chry_mean_coverage))
    avg_depth = float(avg_depth)
    debug('Sample average depth: ' + str(avg_depth))
    if avg_depth < AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX:
        debug('Sample average depth is too low (less than ' + str(AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX) +
             ') - cannot determine sex')
        return None

    # The messages below previously used '\s' (an invalid escape), which
    # logged a literal backslash instead of an apostrophe.
    if chry_mean_coverage == 0:
        # Zero Y coverage: avoid dividing by zero and call female directly.
        debug("Y depth is 0 - it's female")
        sex = 'F'
    else:
        factor = avg_depth / chry_mean_coverage
        debug('Sample depth / Y depth = ' + str(factor))
        if factor > FEMALE_Y_COVERAGE_FACTOR:  # if mean target coverage much higher than chrY coverage
            debug('Sample depth is more than ' + str(FEMALE_Y_COVERAGE_FACTOR) + " times higher than Y depth - it's female")
            sex = 'F'
        else:
            debug('Sample depth is not more than ' + str(FEMALE_Y_COVERAGE_FACTOR) + " times higher than Y depth - it's male")
            sex = 'M'
    debug('Sex is ' + sex)
    debug()
    return sex

Example #13

0

Show file

File: run_spades.py Project: thongnt2/metasv

def run_spades_parallel(bam=None, spades=None, bed=None, work=None, pad=SPADES_PAD, nthreads=1, chrs=[],
                        max_interval_size=SPADES_MAX_INTERVAL_SIZE,
                        timeout=SPADES_TIMEOUT, isize_min=ISIZE_MIN, isize_max=ISIZE_MAX,
                        svs_to_assemble=SVS_ASSEMBLY_SUPPORTED,
                        stop_on_fail=False, max_read_pairs=EXTRACTION_MAX_READ_PAIRS):
    """Assemble the intervals of a BED file with SPAdes across a process pool.

    Intervals are split round-robin over ``nthreads`` workers; the per-worker
    FASTA files collected by the callback are concatenated into
    ``<work>/spades_assembled.fa``. Intervals selected by
    ``shouldnt_be_assembled`` are written to ``<work>/ignored.bed``.

    :return: ``(assembled_fasta, ignored_bed)``. Both are ``None`` when no
        BED file is given; ``assembled_fasta`` is ``None`` when the merged
        FASTA is empty; ``ignored_bed`` is ``None`` when nothing was ignored.
    """
    pybedtools.set_tempdir(work)

    logger.info("Running SPAdes on the intervals in %s" % bed)
    if not bed:
        logger.info("No BED file specified")
        return None, None

    bedtool = pybedtools.BedTool(bed)

    # chrs is only read (copied into a set), never mutated.
    chrs = set(chrs)
    all_intervals = [interval for interval in bedtool if not chrs or interval.chrom in chrs]
    # Materialise as lists: selected_intervals is iterated once per worker
    # and ignored_intervals is truth-tested below, neither of which works
    # with a lazy filter object.
    selected_intervals = [
        interval for interval in all_intervals
        if should_be_assembled(interval, max_interval_size=max_interval_size, svs_to_assemble=svs_to_assemble)]
    ignored_intervals = [
        interval for interval in all_intervals
        if shouldnt_be_assembled(interval, max_interval_size=max_interval_size, svs_to_assemble=svs_to_assemble)]

    pool = multiprocessing.Pool(nthreads)
    assembly_fastas = []
    for i in range(nthreads):
        # Round-robin assignment of intervals to worker i.
        intervals = [interval for (j, interval) in enumerate(selected_intervals) if (j % nthreads) == i]
        kwargs_dict = {"intervals": intervals, "bam": bam, "spades": spades, "work": "%s/%d" % (work, i), "pad": pad,
                       "timeout": timeout, "isize_min": isize_min, "isize_max": isize_max, "stop_on_fail": stop_on_fail,
                       "max_read_pairs": max_read_pairs}
        pool.apply_async(run_spades_single, kwds=kwargs_dict,
                         callback=partial(run_spades_single_callback, result_list=assembly_fastas))

    pool.close()
    pool.join()

    logger.info("Merging the contigs from %s" % (str(assembly_fastas)))
    assembled_fasta = os.path.join(work, "spades_assembled.fa")
    with open(assembled_fasta, "w") as assembled_fd:
        for line in fileinput.input(assembly_fastas):
            assembled_fd.write("%s\n" % (line.strip()))

    if os.path.getsize(assembled_fasta) > 0:
        logger.info("Indexing the assemblies")
        pysam.faidx(assembled_fasta)
    else:
        logger.error("No assembly generated")
        assembled_fasta = None

    ignored_bed = None
    if ignored_intervals:
        ignored_bed = os.path.join(work, "ignored.bed")
        pybedtools.BedTool(ignored_intervals).each(add_breakpoints).saveas(ignored_bed)

    pybedtools.cleanup(remove_all=True)

    return assembled_fasta, ignored_bed

Example #14

0

Show file

File: shared.py Project: jielovedata/bcbio-nextgen

def bedtools_tmpdir(data):
    """Point pybedtools at a transactional temp dir for the wrapped block.

    Generator with a single ``yield``; presumably wrapped by a
    context-manager decorator outside this snippet (TODO confirm).
    The previous temp directory is restored afterwards, or
    ``tempfile.tempdir`` is reset to ``None`` when that directory no longer
    exists.
    """
    with tx_tmpdir(data) as tmpdir:
        orig_tmpdir = tempfile.gettempdir()
        pybedtools.set_tempdir(tmpdir)
        try:
            yield
        finally:
            # Restore even when the wrapped block raises; otherwise the
            # global tempdir keeps pointing at a directory that is going away.
            if orig_tmpdir and os.path.exists(orig_tmpdir):
                pybedtools.set_tempdir(orig_tmpdir)
            else:
                tempfile.tempdir = None

Example #15

0

Show file

File: shared.py Project: Cyberbio-Lab/bcbio-nextgen

def bedtools_tmpdir(data):
    """Point pybedtools at a transactional temp dir for the wrapped block.

    Generator with a single ``yield``; presumably wrapped by a
    context-manager decorator outside this snippet (TODO confirm).
    The previous temp directory is restored afterwards, or
    ``tempfile.tempdir`` is reset to ``None`` when that directory no longer
    exists.
    """
    with tx_tmpdir(data) as tmpdir:
        orig_tmpdir = tempfile.gettempdir()
        pybedtools.set_tempdir(tmpdir)
        try:
            yield
        finally:
            # Restore even when the wrapped block raises; otherwise the
            # global tempdir keeps pointing at a directory that is going away.
            if orig_tmpdir and os.path.exists(orig_tmpdir):
                pybedtools.set_tempdir(orig_tmpdir)
            else:
                tempfile.tempdir = None

Example #16

0

Show file

File: enrichment_check.py Project: venkan/circtools

    def do_intersection(self, query_bed, base_bed):
        """Intersect ``base_bed`` with ``query_bed`` and return the result.

        The pybedtools temp directory is set from the CLI parameters first.
        """
        tmp_directory = self.cli_params.tmp_directory
        pybedtools.set_tempdir(tmp_directory)

        # c=True makes the counts part of the returned records
        return base_bed.intersect(query_bed, c=True)

Example #17

0

Show file

    def set_tempdir(self, dirpath):
        """Set the temp directory used by pybedtools.

        :param dirpath: Path to the temp directory.
        :type dirpath: str

        :return: Nothing.
        :rtype: None
        """
        # Thin pass-through to the module-level pybedtools setting.
        pybedtools.set_tempdir(dirpath)

Example #18

0

Show file

File: genotype.py Project: vswilliamson/metasv

def parallel_genotype_intervals(intervals_file=None, bam=None, workdir=None, nthreads=1, chromosomes=[],
                                window=GT_WINDOW, isize_mean=ISIZE_MEAN, isize_sd=ISIZE_SD,
                                normal_frac_threshold=GT_NORMAL_FRAC):
    """Genotype the intervals of a BED file in parallel and merge the results.

    The selected intervals are split into contiguous chunks, one per worker
    process; each chunk is written to ``<workdir>/<i>/ungenotyped.bed`` and
    handed to ``genotype_intervals``. The per-worker outputs are then
    concatenated, sorted and moved to ``<workdir>/genotyped.bed``.

    :param chromosomes: when non-empty, only intervals on these chromosomes
        are genotyped. The default list is only read, never mutated.
    :return: path of the merged BED file, or ``None`` when there is no
        intervals file, no interval is selected, or no worker produced
        output.
    """
    func_logger = logging.getLogger("%s-%s" % (parallel_genotype_intervals.__name__, multiprocessing.current_process()))

    if not intervals_file:
        func_logger.warning("No intervals file specified. Perhaps no intervals to process")
        return None

    if workdir and not os.path.isdir(workdir):
        os.makedirs(workdir)

    chromosomes = set(chromosomes)

    start_time = time.time()

    bedtool = pybedtools.BedTool(intervals_file)
    selected_intervals = [interval for interval in bedtool if not chromosomes or interval.chrom in chromosomes]
    if not selected_intervals:
        # Without this guard nthreads would become 0 and the chunk-size
        # computation below would divide by zero.
        func_logger.warning("No intervals selected for genotyping")
        return None

    nthreads = max(1, min(len(selected_intervals), nthreads))
    # Ceiling division; '//' keeps the result an int so it can be used in
    # the slice below regardless of Python version.
    intervals_per_process = (len(selected_intervals) + nthreads - 1) // nthreads

    pool = multiprocessing.Pool(nthreads)
    genotyped_beds = []
    for i in range(nthreads):
        process_workdir = os.path.join(workdir, str(i))
        if not os.path.isdir(process_workdir):
            os.makedirs(process_workdir)
        process_intervals = pybedtools.BedTool(
            selected_intervals[i * intervals_per_process: (i + 1) * intervals_per_process]).saveas(
            os.path.join(process_workdir, "ungenotyped.bed"))
        kwargs_dict = {"intervals_file": process_intervals.fn, "bam": bam, "workdir": process_workdir, "window": window,
                       "isize_mean": isize_mean, "isize_sd": isize_sd, "normal_frac_threshold": normal_frac_threshold}
        pool.apply_async(genotype_intervals, kwds=kwargs_dict,
                         callback=partial(genotype_intervals_callback, result_list=genotyped_beds))

    pool.close()
    pool.join()

    func_logger.info("Following BED files will be merged: %s" % (str(genotyped_beds)))

    if not genotyped_beds:
        func_logger.warning("No intervals generated")
        return None

    pybedtools.set_tempdir(workdir)
    bedtool = pybedtools.BedTool(genotyped_beds[0])

    for bed_file in genotyped_beds[1:]:
        bedtool = bedtool.cat(pybedtools.BedTool(bed_file), postmerge=False)
    bedtool = bedtool.sort().moveto(os.path.join(workdir, "genotyped.bed"))

    func_logger.info("Finished parallel genotyping of %d intervals in %g minutes" % (
        len(selected_intervals), (time.time() - start_time) / 60.0))

    return bedtool.fn

Example #19

0

Show file

File: test1.py Project: dnewkirk/pybedtools

def make_unwriteable():
    """
    Recreate the unwriteable directory with write permission removed and
    make it the pybedtools tempdir, so that "streaming" tests can be
    checked for not writing to disk.
    """
    if os.path.exists(unwriteable):
        os.system('rm -rf %s' % unwriteable)
    # create the directory, then strip its write permission
    for command_template in ('mkdir -p %s', 'chmod -w %s'):
        os.system(command_template % unwriteable)
    pybedtools.set_tempdir(unwriteable)

Example #20

0

Show file

File: test1.py Project: ml4wc/pybedtools

def make_unwriteable():
    """
    Recreate the unwriteable directory with write permission removed and
    make it the pybedtools tempdir, so that "streaming" tests can be
    checked for not writing to disk.
    """
    if os.path.exists(unwriteable):
        os.system('rm -rf %s' % unwriteable)
    # create the directory, then strip its write permission
    for command_template in ('mkdir -p %s', 'chmod -w %s'):
        os.system(command_template % unwriteable)
    pybedtools.set_tempdir(unwriteable)

Example #21

0

Show file

File: enrichment_check.py Project: venkan/circtools

    def shuffle_peaks_through_genome(self, iteration, bed_file, genome_file):
        """Shuffle the records of ``bed_file`` across ``genome_file``.

        Sets the pybedtools temp directory from the CLI parameters, logs the
        1-based iteration number and returns the shuffled BED object.
        """
        pybedtools.set_tempdir(self.cli_params.tmp_directory)

        thread_number = iteration + 1
        self.log_entry("Processing shuffling thread %d" % thread_number)

        return bed_file.shuffle(g=genome_file)

Example #22

0

Show file

def main():
    """Run ``analyze_file`` over the hard-coded BED files in a process pool.

    Reads the reference location from the ``REF`` environment variable
    (raises ``KeyError`` when it is unset).
    """
    bed_path = '/stor/work/Lambowitz/cdw2854/plasmaDNA/bedFiles'
    set_tempdir(bed_path)
    ref_fasta = os.environ['REF'] + '/GRCh38/hg38_rDNA/genome_rDNA.fa'
    filenames = ['P1203-SQ2_S3.bed','SRR2130052.bed']
    # Chromosome names '1'..'22' plus 'X' and 'Y'. Built as a real list so
    # that .extend works even where map() returns a lazy iterator.
    regular_chrom = [str(number) for number in range(1, 23)]
    regular_chrom.extend(['X','Y'])
    func = partial(analyze_file, bed_path, regular_chrom, ref_fasta)
    p = Pool(12)
    try:
        p.map(func, filenames)
    finally:
        # Shut the pool down even if a worker raised.
        p.close()
        p.join()

Example #23

0

Show file

File: detect_skipped_exons.py Project: daaaaande/FUCHS

    def __init__(self, outfolder, sample, bedfile, tmp_folder, platform, cpus):
        """Store the run parameters and route temp files to ``tmp_folder``.

        ``folder`` and ``outfile`` are built by plain string concatenation
        of ``outfolder`` and ``sample``.
        """
        sample_prefix = outfolder + sample

        self.folder = sample_prefix
        self.sample = sample
        self.outfile = sample_prefix + ".skipped_exons.txt"
        self.bedfile, self.tmp_folder = bedfile, tmp_folder
        self.platform, self.cpus = platform, cpus

        # both the stdlib tempfile module and pybedtools use the same folder
        tempfile.tempdir = tmp_folder
        pybedtools.set_tempdir(tmp_folder)

Example #24

0

Show file

File: count.py Project: Xiuying/projects

def overlap_target_counts(bam_file, target_file, config):
    """Count BAM reads overlapping each target and save them as a BED file.

    The output is ``<counts dir>/<BAM base name>.bed``; when that file
    already exists nothing is recomputed. Returns the output path.
    """
    out_dir = safe_makedir(config["dir"]["counts"])
    bam_stem = os.path.splitext(os.path.basename(bam_file))[0]
    out_file = os.path.join(out_dir, "{0}.bed".format(bam_stem))
    if file_exists(out_file):
        return out_file

    pybedtools.set_tempdir(out_dir)
    reads_as_bed = pybedtools.BedTool(bam_file).bam_to_bed()
    targets = pybedtools.BedTool(target_file)
    targets.intersect(reads_as_bed, c=True).saveas(out_file)
    return out_file

Example #25

0

Show file

File: call.py Project: drvenki/svcaller

def call_events_inner(filtered_bam, event_type, fasta_filename, events_gff,
                      events_bam, filter_event_overlap, tmp_dir):
    """Call events on a filtered BAM and write GTF and BAM outputs.

    Events are called from all reads in ``filtered_bam``, filtered on
    soft-clip support (and, for DUP/INV event types, on scattered soft-clip
    regions), printed to ``events_gff`` and their reads written to a sorted,
    indexed ``events_bam``.

    ``events_gff`` is closed by this function on successful completion.
    """
    logging.info("Calling events on file {}:".format(filtered_bam))

    samfile = pysam.AlignmentFile(filtered_bam, "rb")
    try:
        filtered_reads = list(samfile)

        # Call events:
        pybedtools.set_tempdir(
            tmp_dir
        )  # Necessary to control temporary folder usage during event calling
        logging.info("Calling initial events...")
        events = list(
            call_events(filtered_reads, fasta_filename, filter_event_overlap))

        # Filter on soft-clipping support:
        logging.info("Filtering on soft-clip support...")
        filtered_events = [
            event for event in events if event.has_soft_clip_support()]

        # Optionally filter on presence of soft-clipped regions scattered throughout the reads, depending on
        # the event type:
        logging.info("Filtering on scattered soft-clip regions...")
        if event_type == SvType.DUP.value or event_type == SvType.INV.value:
            filtered_events = [
                event for event in filtered_events
                if not event.has_scattered_soft_clip_regions()]

        # Print them out:
        logging.info("Printing final events...")
        for event in filtered_events:
            print(event.get_gtf(), file=events_gff)

        # Write to a temporary bam file, to facilitate subsequent sorting with pysam. NOTE:
        # Could do sorting in memory since the read count should be low, but it seems less
        # bug-prone to use pysam's sort functionality:
        unique_id = uuid.uuid4()
        # The template has two placeholders; the stray third argument
        # (event_type) was ignored by format() and has been dropped.
        tmp_bam_filename = "{}/penultimate_bamfile_{}.bam".format(
            tmp_dir, unique_id)
        with pysam.AlignmentFile(tmp_bam_filename, "wb",
                                 header=samfile.header) as outf:
            for event in filtered_events:
                for read in event._terminus1_reads + event._terminus2_reads:
                    outf.write(read)
    finally:
        # The input BAM was previously never closed.
        samfile.close()

    # Sort the intermediate bam file with samtools to produce the final output bam file, then index it:
    pysam.sort("-o", events_bam, tmp_bam_filename)
    pysam.index(str(events_bam))

    remove_bam_and_bai(tmp_bam_filename)

    events_gff.close()

Example #26

0

Show file

def subtract_bed_sd(bed_name, bed_filter):
    """REMOVES regions of annotation of interest that overlap with
    segmental duplications

    Reads ``<bed_name>.noRmsk.bed`` and writes ``<bed_name>.noRmsk.noSD.bed``;
    does nothing but report when the output file already exists.
    """
    pybedtools.set_tempdir('/sc/orga/scratch/richtf01')
    # print(...) with one argument emits the same text on Python 2 and is
    # valid syntax on Python 3, unlike the print statement.
    if not os.path.isfile(bed_name + '.noRmsk.noSD.bed'):
        bed = BedTool(bed_name + '.noRmsk.bed')
        print("Removing calls in seg dup from " + bed_name + "...")
        bed_no_overlap = bed.subtract(bed_filter)
        bed_no_overlap.saveas(bed_name + '.noRmsk.noSD.bed')
        print(bed_name + " done!")
    else:
        print(bed_name + " Seg dup calls already removed")

Example #27

0

Show file

def intersect_bed(bed_name, bed_filter):
    """KEEPS regions of annotation of interest that overlap with
    repeat-masked regions

    Reads ``<bed_name>.merged.sorted.bed`` and writes ``<bed_name>.Rmsk.bed``;
    does nothing but report when the output file already exists.
    """
    pybedtools.set_tempdir('/sc/orga/scratch/richtf01')
    # print(...) with one argument emits the same text on Python 2 and is
    # valid syntax on Python 3, unlike the print statement.
    if not os.path.isfile(bed_name + '.Rmsk.bed'):
        bed = BedTool(bed_name + '.merged.sorted.bed')
        print("Keeping calls in rmsk from " + bed_name + "...")
        bed_overlap = bed.intersect(bed_filter)
        bed_overlap.saveas(bed_name + '.Rmsk.bed')
        print(bed_name + " done!")
    else:
        print(bed_name + " rmsk calls already isolated")

Example #28

0

Show file

File: funcs_large_bed.py Project: frichter/BedPyMP

def intersect_bed(bed_name, bed_filter):
    """KEEPS regions of annotation of interest that overlap with
    repeat-masked regions

    Reads ``<bed_name>.merged.sorted.bed`` and writes ``<bed_name>.Rmsk.bed``;
    does nothing but report when the output file already exists.
    """
    pybedtools.set_tempdir('/sc/orga/scratch/richtf01')
    # print(...) with one argument emits the same text on Python 2 and is
    # valid syntax on Python 3, unlike the print statement.
    if not os.path.isfile(bed_name + '.Rmsk.bed'):
        bed = BedTool(bed_name + '.merged.sorted.bed')
        print("Keeping calls in rmsk from " + bed_name + "...")
        bed_overlap = bed.intersect(bed_filter)
        bed_overlap.saveas(bed_name + '.Rmsk.bed')
        print(bed_name + " done!")
    else:
        print(bed_name + " rmsk calls already isolated")

Example #29

0

Show file

File: funcs_large_bed.py Project: frichter/BedPyMP

def subtract_bed_rmsk(bed_name, bed_filter):
    """REMOVES regions of annotation of interest that overlap with
    repeat-masked regions

    Reads ``<bed_name>.bed`` and writes ``<bed_name>.noRmsk.bed``; does
    nothing but report when the output file already exists.
    """
    pybedtools.set_tempdir('/sc/orga/scratch/richtf01')
    # print(...) with one argument emits the same text on Python 2 and is
    # valid syntax on Python 3, unlike the print statement.
    if not os.path.isfile(bed_name + '.noRmsk.bed'):
        bed = BedTool(bed_name + '.bed') # .merged.sorted
        print("Removing calls in rmsk from " + bed_name + "...")
        bed_no_overlap = bed.subtract(bed_filter)
        bed_no_overlap.saveas(bed_name + '.noRmsk.bed')
        print(bed_name + " done!")
    else:
        print(bed_name + " rmsk calls already removed")

Example #30

0

Show file

File: target.py Project: bachu-mahe/projects

def identify_targets(bam_files, config, out_base="shrna_targets"):
    """Create BED file of target regions based on input BAM alignments

    Each BAM is converted to BED, all BEDs are concatenated with ``cat`` and
    the result is merged using ``config["algorithm"]["merge_distance"]``
    (default 0). Returns the output path; an existing output is reused.
    """
    # reduce is a builtin only on Python 2; functools provides it on both
    # Python 2.6+ and Python 3.
    from functools import reduce

    work_dir = safe_makedir(config["dir"]["annotation"])
    pybedtools.set_tempdir(work_dir)
    out_file = os.path.join(work_dir, "{0}.bed".format(out_base))
    if not file_exists(out_file):
        pybed_files = [pybedtools.BedTool(x) for x in bam_files]
        bed_files = [x.bam_to_bed() for x in pybed_files]
        combined_bed = reduce(lambda x, y: x.cat(y), bed_files)
        merge_bed = combined_bed.merge(d=config["algorithm"].get("merge_distance", 0))
        merge_bed.saveas(out_file)
    return out_file

Example #31

0

Show file

def subtract_bed_rmsk(bed_name, bed_filter):
    """REMOVES regions of annotation of interest that overlap with
    repeat-masked regions

    Reads ``<bed_name>.bed`` and writes ``<bed_name>.noRmsk.bed``; does
    nothing but report when the output file already exists.
    """
    pybedtools.set_tempdir('/sc/orga/scratch/richtf01')
    # print(...) with one argument emits the same text on Python 2 and is
    # valid syntax on Python 3, unlike the print statement.
    if not os.path.isfile(bed_name + '.noRmsk.bed'):
        bed = BedTool(bed_name + '.bed')  # .merged.sorted
        print("Removing calls in rmsk from " + bed_name + "...")
        bed_no_overlap = bed.subtract(bed_filter)
        bed_no_overlap.saveas(bed_name + '.noRmsk.bed')
        print(bed_name + " done!")
    else:
        print(bed_name + " rmsk calls already removed")

Example #32

0

Show file

File: funcs_large_bed.py Project: frichter/BedPyMP

def subtract_bed_sd(bed_name, bed_filter):
    """REMOVES regions of annotation of interest that overlap with
    segmental duplications

    Reads ``<bed_name>.noRmsk.bed`` and writes ``<bed_name>.noRmsk.noSD.bed``;
    does nothing but report when the output file already exists.
    """
    pybedtools.set_tempdir('/sc/orga/scratch/richtf01')
    # print(...) with one argument emits the same text on Python 2 and is
    # valid syntax on Python 3, unlike the print statement.
    if not os.path.isfile(bed_name + '.noRmsk.noSD.bed'):
        bed = BedTool(bed_name + '.noRmsk.bed')
        print("Removing calls in seg dup from " + bed_name + "...")
        bed_no_overlap = bed.subtract(bed_filter)
        bed_no_overlap.saveas(bed_name + '.noRmsk.noSD.bed')
        print(bed_name + " done!")
    else:
        print(bed_name + " Seg dup calls already removed")

Example #33

0

Show file

def genotype_intervals(intervals_file=None,
                       bams=[],
                       workdir=None,
                       window=GT_WINDOW,
                       isize_mean=ISIZE_MEAN,
                       isize_sd=ISIZE_SD,
                       normal_frac_threshold=GT_NORMAL_FRAC):
    """Genotype every interval of a BED file and write the annotated result.

    Each interval is parsed with ``parse_interval``, genotyped with
    ``genotype_interval`` against all opened BAM files, and re-emitted with
    the genotype appended as an extra field.  The result is moved to
    ``<workdir>/genotyped.bed``.

    Args:
        intervals_file: BED file with the intervals to genotype.
        bams: BAM file paths opened with ``pysam.Samfile``.  The default list
            is never mutated here.
        workdir: working directory; created when missing, used as the
            pybedtools temp directory and as the output directory.
        window: passed through to ``genotype_interval``.
        isize_mean: mean insert size; with ``isize_sd`` it defines the
            accepted insert-size range (mean +/- 3 sd, floored at 0).
        isize_sd: insert-size standard deviation.
        normal_frac_threshold: passed through to ``genotype_interval``.

    Returns:
        File name of the genotyped BED file.

    Raises:
        Exception: any error raised while genotyping is logged, its
            traceback printed, and then re-raised unchanged.
    """
    func_logger = logging.getLogger(
        "%s-%s" %
        (genotype_intervals.__name__, multiprocessing.current_process()))

    if workdir and not os.path.isdir(workdir):
        os.makedirs(workdir)

    pybedtools.set_tempdir(workdir)

    genotyped_intervals = []
    start_time = time.time()

    isize_min = max(0, isize_mean - 3 * isize_sd)
    isize_max = isize_mean + 3 * isize_sd

    # Filled one by one so that handles opened before a failure are still
    # closed by the ``finally`` clause below.
    bam_handles = []
    try:
        for bam in bams:
            bam_handles.append(pysam.Samfile(bam, "rb"))
        for interval in pybedtools.BedTool(intervals_file):
            chrom, start, end, sv_type, svlen = parse_interval(interval)
            genotype = genotype_interval(str(chrom), start, end, sv_type,
                                         svlen, bam_handles, isize_min,
                                         isize_max, window,
                                         normal_frac_threshold)
            fields = interval.fields + [genotype]
            genotyped_intervals.append(
                pybedtools.create_interval_from_list(fields))
        bedtool = pybedtools.BedTool(genotyped_intervals).moveto(
            os.path.join(workdir, "genotyped.bed"))
    except Exception:
        func_logger.error('Caught exception in worker thread')

        # This prints the type, value, and stack trace of the
        # current exception being handled.
        traceback.print_exc()

        print()
        # Bare ``raise`` re-raises with the original traceback intact.
        raise
    finally:
        # Close the BAM files on success and on failure alike.
        for bam_handle in bam_handles:
            bam_handle.close()
    func_logger.info("Genotyped %d intervals in %g minutes" %
                     (len(genotyped_intervals),
                      (time.time() - start_time) / 60.0))

    return bedtool.fn

Example #34

0

Show file

File: funcs_large_bed.py Project: frichter/BedPyMP

def merge_bed(bed_name):
    """Merge a sorted BED file after rmsk and SD regions were removed.

    Reads ``<bed_name>.sorted.noRmsk.noSD.bed`` and writes
    ``<bed_name>.merged.sorted.noRmsk.noSD.bed``.  When the output already
    exists nothing is recomputed.

    Args:
        bed_name: path prefix shared by the input and output BED files.
    """
    pybedtools.set_tempdir('/sc/orga/scratch/richtf01')
    bed_in = bed_name + '.sorted.noRmsk.noSD.bed'
    bed_out = bed_name + '.merged.sorted.noRmsk.noSD.bed'
    if not os.path.isfile(bed_out):
        bed = BedTool(bed_in)
        # print(...) with one argument: identical output on Python 2, valid
        # syntax on Python 3.
        print("Merging " + bed_in + "...")
        bed_merged = bed.merge()
        bed_merged.saveas(bed_out)
        print(bed_name + " done!")
    else:
        print(bed_out + " already merged")

Example #35

0

Show file

File: target.py Project: bachu-mahe/projects

def add_annotations(target_file, annotations):
    """Associate annotations with a BED file of targets.

    Based on the annotate.py example from pybedtools.  The annotations are
    combined by ``_merge_gff``, intersected with the targets (``wao=True``)
    and the raw intersection is rewritten by ``_write_combined_features``.
    Nothing is recomputed when the output file already exists.

    Args:
        target_file: BED file of targets.
        annotations: annotation inputs handed to ``_merge_gff``.

    Returns:
        Path of the annotated file: ``target_file`` with ``-annotated``
        inserted before its extension.
    """
    # Argument unpacking replaces ``apply``, which no longer exists on
    # Python 3.
    out_file = "{0}-annotated{1}".format(*os.path.splitext(target_file))
    # A bare file name has an empty dirname, which set_tempdir would reject
    # as a nonexistent path; fall back to the current directory.
    pybedtools.set_tempdir(os.path.dirname(out_file) or os.curdir)
    if not file_exists(out_file):
        all_ann = pybedtools.BedTool(_merge_gff(annotations))
        with_ann = pybedtools.BedTool(target_file).intersect(all_ann, wao=True)
        with open(with_ann.fn) as in_handle:
            with open(out_file, "w") as out_handle:
                _write_combined_features(in_handle, out_handle)
    return out_file

Example #36

0

Show file

def merge_bed(bed_name):
    """Merge the intervals of a sorted, rmsk/SD-filtered BED file.

    Input is ``<bed_name>.sorted.noRmsk.noSD.bed``; output is
    ``<bed_name>.merged.sorted.noRmsk.noSD.bed``.  An existing output file is
    left untouched.

    Args:
        bed_name: path prefix shared by the input and output BED files.
    """
    pybedtools.set_tempdir('/sc/orga/scratch/richtf01')
    bed_in = bed_name + '.sorted.noRmsk.noSD.bed'
    bed_out = bed_name + '.merged.sorted.noRmsk.noSD.bed'
    if not os.path.isfile(bed_out):
        bed = BedTool(bed_in)
        # Single-argument print(...) calls run unchanged on Python 2 and 3.
        print("Merging " + bed_in + "...")
        bed_merged = bed.merge()
        bed_merged.saveas(bed_out)
        print(bed_name + " done!")
    else:
        print(bed_out + " already merged")

Example #37

0

Show file

def sort_bed(bed_name):
    """Sort a BED file after rmsk and SD regions were removed.

    Runs the external ``sort -V -k1,1 -k2,2`` on
    ``<bed_name>.noRmsk.noSD.bed`` and writes
    ``<bed_name>.sorted.noRmsk.noSD.bed``.  When the output already exists
    nothing is recomputed.

    Args:
        bed_name: path prefix shared by the input and output BED files.
    """
    pybedtools.set_tempdir('/sc/orga/scratch/richtf01')
    bed_in = bed_name + '.noRmsk.noSD.bed'
    bed_out = bed_name + '.sorted.noRmsk.noSD.bed'
    if not os.path.isfile(bed_out):
        print("Sorting " + bed_in + "... ")
        # Kept only for the log line; the command itself is run without a
        # shell so paths with spaces or shell metacharacters are safe.
        sort_cmd = ("sort -V -k1,1 -k2,2 %s > %s" % (bed_in, bed_out))
        print(sort_cmd)
        with open(bed_out, 'wb') as out_handle:
            subprocess.call(['sort', '-V', '-k1,1', '-k2,2', bed_in],
                            stdout=out_handle)
        print(bed_name + " sorted!")
    else:
        print(bed_out + " already sorted")

Example #38

0

Show file

File: bed_utils.py Project: vladsaveliev/TargQC

def clean_bed(bed_fpath, work_dir):
    """Write a cleaned copy of a BED file and return its path.

    Records with an empty chromosome field, or whose chromosome field starts
    with ``#``, a space, ``track`` or ``browser``, are dropped and the rest
    is passed through ``remove_invalid``.  The work is skipped when
    ``can_reuse`` accepts the existing output.
    """
    clean_fpath = intermediate_fname(work_dir, bed_fpath, 'clean')
    if can_reuse(clean_fpath, bed_fpath):
        return clean_fpath

    tmp_dir = safe_mkdir(join(work_dir, 'pybedtools_tmp'))
    pybedtools.set_tempdir(tmp_dir)

    skipped_prefixes = ('#', ' ', 'track', 'browser')

    def _is_feature(interval):
        # Keep only records with a chromosome that is not a header marker.
        return interval.chrom and not interval.chrom.startswith(skipped_prefixes)

    cleaned = BedTool(bed_fpath).filter(_is_feature).remove_invalid()
    with file_transaction(work_dir, clean_fpath) as tx_out_file:
        cleaned.saveas(tx_out_file)
    verify_bed(clean_fpath, is_critical=True)
    debug('Saved clean BED file into ' + clean_fpath)
    return clean_fpath

Example #39

0

Show file

File: common.py Project: glennhickey/teHmm

def initBedTool(tempPrefix=""):
    """Create a randomly tagged temp directory and point pybedtools at it.

    The directory is created inside the current working directory (to make
    it harder to lose track of leftover files) and is named
    ``<tempPrefix>TempBedTool_<5 random uppercase letters/digits>``.

    Args:
        tempPrefix: text prepended to the directory name.

    Returns:
        Absolute path of the temporary directory.
    """
    # keep temporary files in current directory, to make it a little harder to
    # lose track of them and clog up the system....
    S = string.ascii_uppercase + string.digits
    tag = ''.join(random.choice(S) for _ in range(5))
    tempPath = os.path.join(os.getcwd(), "%sTempBedTool_%s" % (tempPrefix, tag))
    logger.info("Temporary directory for BedTools (you may need to manually"
                 " erase in event of crash): %s" % tempPath)
    try:
        os.makedirs(tempPath)
    except OSError:
        # Best effort (e.g. the directory already exists).  Only filesystem
        # errors are ignored, so KeyboardInterrupt and programming errors are
        # no longer swallowed as they were by the bare ``except``.
        pass
    pybedtools.set_tempdir(tempPath)
    return tempPath

Example #40

0

Show file

File: count.py Project: pamonlan/projects

def overlap_target_counts(bam_file, target_file, config):
    """Count BAM alignments overlapping each shRNA target.

    The counts are written to ``<counts dir>/<BAM base name>.bed``; an
    existing output file is reused as is.  Returns the output path.
    """
    out_dir = safe_makedir(config["dir"]["counts"])
    sample_name = os.path.splitext(os.path.basename(bam_file))[0]
    out_file = os.path.join(out_dir, "{0}.bed".format(sample_name))
    if file_exists(out_file):
        return out_file

    pybedtools.set_tempdir(out_dir)
    reads_as_bed = pybedtools.BedTool(bam_file).bam_to_bed()
    targets = pybedtools.BedTool(target_file)
    # c=True: one count column per target interval.
    targets.intersect(reads_as_bed, c=True).saveas(out_file)
    return out_file

Example #41

0

Show file

def launch_coverage(dicoInit):
    """Queue one ``pybedtoolcoverage`` job per BAM file and launch them.

    Reads ``dicoInit['tmp']`` (temp directory, also holding
    ``target_genes.bed``), ``dicoInit['dicoBam']`` (BAM number -> BAM path)
    and ``dicoInit['color']``.
    """
    printcolor("  • Compute Depth", "0", "222;220;184", dicoInit["color"])
    set_tempdir(dicoInit['tmp'])
    # One job description per BAM, keyed by "coverage <bam path>".
    dicoThread = {
        "coverage " + bam_path: {
            "bed": dicoInit["tmp"] + "/target_genes.bed",
            "bam": bam_path,
            "bam_num": bam_num,
            "returnstatut": None,
            "returnlines": [],
        }
        for bam_num, bam_path in dicoInit['dicoBam'].items()
    }
    launch_threads(dicoInit, dicoThread, "pybedtoolcoverage",
                   pybedtoolcoverage, 1)

Example #42

0

Show file

File: target.py Project: pamonlan/projects

def add_annotations(target_file, annotations):
    """Associate annotations with a BED file of targets.

    Based on the annotate.py example from pybedtools.  ``_merge_gff``
    combines the annotations, the targets are intersected with them
    (``wao=True``) and ``_write_combined_features`` turns the raw
    intersection into the final file.  An existing output file is reused.

    Args:
        target_file: BED file of targets.
        annotations: annotation inputs handed to ``_merge_gff``.

    Returns:
        Path of the annotated file (``-annotated`` inserted before the
        extension of ``target_file``).
    """
    # ``apply`` was removed in Python 3; plain unpacking is equivalent.
    base, ext = os.path.splitext(target_file)
    out_file = "{0}-annotated{1}".format(base, ext)
    # os.path.dirname() is '' for a bare file name; set_tempdir needs an
    # existing directory, so use the current directory in that case.
    pybedtools.set_tempdir(os.path.dirname(out_file) or os.curdir)
    if not file_exists(out_file):
        all_ann = pybedtools.BedTool(_merge_gff(annotations))
        with_ann = pybedtools.BedTool(target_file).intersect(all_ann, wao=True)
        with open(with_ann.fn) as in_handle:
            with open(out_file, "w") as out_handle:
                _write_combined_features(in_handle, out_handle)
    return out_file

Example #43

0

Show file

File: bed_utils.py Project: vladsaveliev/Utils

def clean_bed(bed_fpath, work_dir):
    """Produce a cleaned version of ``bed_fpath`` and return its path.

    Header-like records (empty chromosome, or chromosome starting with
    ``#``, a space, ``track`` or ``browser``) are filtered out before
    ``remove_invalid`` is applied.  An output accepted by ``can_reuse`` is
    returned without being rebuilt.
    """
    clean_fpath = intermediate_fname(work_dir, bed_fpath, 'clean')

    if not can_reuse(clean_fpath, bed_fpath):
        pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))
        header_markers = ['#', ' ', 'track', 'browser']

        def keep(feature):
            # Reject records without a chromosome or that look like headers.
            if not feature.chrom:
                return False
            for marker in header_markers:
                if feature.chrom.startswith(marker):
                    return False
            return True

        filtered = BedTool(bed_fpath).filter(keep)
        valid = filtered.remove_invalid()
        with file_transaction(work_dir, clean_fpath) as tx_out_file:
            valid.saveas(tx_out_file)
        verify_bed(clean_fpath, is_critical=True)
        debug('Saved clean BED file into ' + clean_fpath)
    return clean_fpath

Example #44

0

Show file

File: target.py Project: pamonlan/projects

def identify_targets(bam_files, config, out_base="shrna_targets"):
    """Create a BED file of target regions based on input BAM alignments.

    The BAM files are converted to BED, concatenated pairwise with
    ``BedTool.cat`` and merged with the ``merge_distance`` configured under
    ``config["algorithm"]`` (default 0).  An existing output file is reused.

    Args:
        bam_files: paths of the BAM alignment files.  ``reduce`` raises
            ``TypeError`` for an empty sequence, so at least one file is
            needed when the output has to be built.
        config: configuration mapping providing
            ``config["dir"]["annotation"]`` and ``config["algorithm"]``.
        out_base: base name (without extension) of the output BED file.

    Returns:
        Path of the BED file in the annotation directory.
    """
    # On Python 3 ``reduce`` lives in functools only; this import is also
    # valid on Python 2.6+.
    from functools import reduce

    work_dir = safe_makedir(config["dir"]["annotation"])
    pybedtools.set_tempdir(work_dir)
    out_file = os.path.join(work_dir, "{0}.bed".format(out_base))
    if not file_exists(out_file):
        pybed_files = [pybedtools.BedTool(x) for x in bam_files]
        bed_files = [x.bam_to_bed() for x in pybed_files]
        combined_bed = reduce(lambda x, y: x.cat(y), bed_files)
        merge_bed = combined_bed.merge(
            d=config["algorithm"].get("merge_distance", 0))
        merge_bed.saveas(out_file)
    return out_file

Example #45

0

Show file

def multi_gene_sets_to_dict_of_beds(df_multi_gene_set, df_gene_coord,
                                    windowsize, tmp_bed_dir, out_dir,
                                    out_prefix):
    """Write one sorted, merged BED file per annotation of a multi gene set.

    For every annotation the gene set is inner-joined with the gene
    coordinates, each gene is extended by ``windowsize`` on both sides
    (start floored at 0) and the intervals are sorted and merged (names
    collapsed with ``distinct``, scores with ``max``).

    Args:
        df_multi_gene_set: data frame with the columns "annotation", "gene"
            and "annotation_value".  Gene is human Ensembl gene names.
        df_gene_coord: data frame with the columns "GENE", "CHR", "START"
            and "END".
        windowsize: number of bases added on each side of every gene.
        tmp_bed_dir: directory for pybedtools temporary files; created when
            missing.
        out_dir: directory receiving ``<out_prefix>.<annotation>.bed``.
        out_prefix: file name prefix of the BED files.

    Returns:
        None.  Despite the function name no dict is built; the BED files on
        disk are the only output.
    """
    print('Making gene set bed files')
    DIR_TMP_PYBEDTOOLS = tmp_bed_dir
    try:
        os.makedirs(DIR_TMP_PYBEDTOOLS, exist_ok=True)
        # You'll need write permissions to this directory, and it needs to
        # already exist.
        pybedtools.set_tempdir(DIR_TMP_PYBEDTOOLS)
    except Exception as e:
        print("Caught exception: {}".format(e))
        # Report the fallback only when setting the temp dir really failed;
        # previously this message was printed on every call.
        print(
            "Failed setting pybedtools tempdir to {}. Will use standard tempdir /tmp"
            .format(DIR_TMP_PYBEDTOOLS))
    # Genes of the gene set that are missing from df_gene_coord are silently
    # discarded by the inner join below.
    for name_annotation, df_group in df_multi_gene_set.groupby("annotation"):
        print(
            "Merging input multi gene set with gene coordinates for annotation = {}"
            .format(name_annotation))
        df = pd.merge(df_gene_coord,
                      df_group,
                      left_on="GENE",
                      right_on="gene",
                      how="inner")
        df['START'] = np.maximum(0, df['START'] - windowsize)
        df['END'] = df['END'] + windowsize
        list_of_lists = []
        for (chrom, start, end, name, score) in np.array(df[
                ['CHR', 'START', 'END', 'GENE', 'annotation_value']]):
            chrom = str(chrom)
            # Drop an existing "chr" prefix before adding exactly one.
            # str.lstrip('chr') would strip any leading c/h/r characters
            # rather than the prefix.
            if chrom.startswith('chr'):
                chrom = chrom[len('chr'):]
            list_of_lists.append(
                ['chr' + chrom, str(start), str(end), str(name), str(score)])
        bed_for_annot = pybedtools.BedTool(list_of_lists).sort().merge(
            c=[4, 5], o=["distinct", "max"])
        out_file_name = '{}/{}.{}.bed'.format(out_dir, out_prefix,
                                              name_annotation)
        bed_for_annot.saveas(out_file_name)
    return None

Example #46

0

Show file

def initBedTool(tempPrefix=""):
    """Set up a fresh pybedtools temp directory under the working directory.

    The directory name is ``<tempPrefix>TempBedTool_<tag>`` where ``tag`` is
    five random uppercase letters/digits.

    Args:
        tempPrefix: text prepended to the directory name.

    Returns:
        Absolute path of the temporary directory handed to pybedtools.
    """
    # keep temporary files in current directory, to make it a little harder to
    # lose track of them and clog up the system....
    S = string.ascii_uppercase + string.digits
    tag = ''.join(random.choice(S) for _ in range(5))
    tempPath = os.path.join(os.getcwd(),
                            "%sTempBedTool_%s" % (tempPrefix, tag))
    logger.info("Temporary directory for BedTools (you may need to manually"
                " erase in event of crash): %s" % tempPath)
    try:
        os.makedirs(tempPath)
    except OSError:
        # Creation stays best effort, but only filesystem errors are
        # tolerated; a bare ``except`` would also hide KeyboardInterrupt
        # and genuine bugs.
        pass
    pybedtools.set_tempdir(tempPath)
    return tempPath

Example #47

0

Show file

File: funcs_large_bed.py Project: frichter/BedPyMP

def sort_bed(bed_name):
    """Sort a BED file after rmsk and SD regions were removed.

    ``<bed_name>.noRmsk.noSD.bed`` is sorted with the external
    ``sort -V -k1,1 -k2,2`` into ``<bed_name>.sorted.noRmsk.noSD.bed``.
    An existing output file is left untouched.

    Args:
        bed_name: path prefix shared by the input and output BED files.
    """
    pybedtools.set_tempdir('/sc/orga/scratch/richtf01')
    bed_in = bed_name + '.noRmsk.noSD.bed'
    bed_out = bed_name + '.sorted.noRmsk.noSD.bed'
    if not os.path.isfile(bed_out):
        print("Sorting " + bed_in + "... ")
        # The string form is only logged.  Execution uses an argument list
        # and a Python-side redirect, so no shell parses the file names.
        sort_cmd = ("sort -V -k1,1 -k2,2 %s > %s"
            % (bed_in, bed_out))
        print(sort_cmd)
        with open(bed_out, 'wb') as sorted_handle:
            subprocess.call(['sort', '-V', '-k1,1', '-k2,2', bed_in],
                            stdout=sorted_handle)
        print(bed_name + " sorted!")
    else:
        print(bed_out + " already sorted")

Example #48

0

Show file

File: process_bed.py Project: wckdouglas/cfNA

def main():
    """Command-line entry point: build exon coverage and filter the BED file.

    Expects exactly three arguments: ``<bed_file> <out_prefix>
    <spliced_exon.bed>``.  Exits with a usage message otherwise.
    """
    if len(sys.argv) != 4:
        sys.exit('[usage] python %s <bed_file> <out_prefix> <spliced_exon.bed>' %sys.argv[0])

    exons = REF_PATH + '/hg19_ref/genes/exons_all.bed_temp'
    tab_file, out_prefix, spliced_exons = sys.argv[1:4]
    cov_exon = out_prefix + '_exons.bed'

    # An out_prefix without a directory part has an empty dirname, which
    # set_tempdir would reject; use the current directory instead.
    set_tempdir(os.path.dirname(out_prefix) or '.')
    make_exons(tab_file, cov_exon, exons)
    filter_bed(tab_file, out_prefix, cov_exon, spliced_exons)

Example #49

0

Show file

File: enrichment_check.py Project: venkan/circtools

    def read_circ_rna_file(self, circ_rna_input, annotation_bed, has_header):
        """Read a CircCoordinates file produced by DCC.

        Each record is reduced to six BED columns (chromosome via
        ``strip_chr_name``, start, stop, gene name, a constant score "0" and
        strand) and the resulting in-memory BED is intersected
        strand-specifically with ``annotation_bed``.

        Args:
            circ_rna_input: path of the tab-separated CircCoordinates file.
            annotation_bed: object providing ``intersect`` (a BedTool).
            has_header: when true the first line is skipped (DCC default).

        Returns:
            The BedTool returned by ``annotation_bed.intersect``.

        Halts the program (``sys.exit``) if the file cannot be opened.
        """
        self.log_entry("Parsing circular RNA input file...")

        # set temporary directory for pybedtools
        pybedtools.set_tempdir(self.cli_params.tmp_directory)

        try:
            file_handle = open(circ_rna_input)
        except OSError:
            # OSError covers missing files and directories as well as
            # permission problems, matching the documented "halt if the
            # file is not accessible".
            message = ("Input file " + str(circ_rna_input) + " cannot be read, exiting.")
            logging.info(message)
            sys.exit(message)

        bed_lines = []
        bed_peak_sizes = 0
        with file_handle:
            line_iterator = iter(file_handle)
            # skip first line with the header
            # we assume it's there (DCC default)
            if has_header:
                # The default keeps an empty file from raising StopIteration.
                next(line_iterator, None)
            for line in line_iterator:
                # Drop the line terminator so a record whose strand is the
                # last column does not inject a blank line into the BED.
                columns = line.rstrip('\n').split('\t')

                # extract chromosome, start, stop, gene name, and strand
                entry = [self.strip_chr_name(columns[0]), columns[1], columns[2], columns[3], "0", columns[5]]

                # collected in a list and joined once below
                bed_lines.append('\t'.join(entry) + "\n")
                bed_peak_sizes += (int(columns[2]) - int(columns[1]))

        bed_entries = len(bed_lines)

        # create a "virtual" BED file
        virtual_bed_file = pybedtools.BedTool("".join(bed_lines), from_string=True)
        # Todo: figure out what this code was supposed to do
        test = annotation_bed.intersect(virtual_bed_file, s=True)

        # Guard the average against a file without any records.
        average_length = round(bed_peak_sizes / bed_entries) if bed_entries else 0

        self.log_entry("Done parsing circular RNA input file:")
        self.log_entry("=> %s circular RNAs, %s nt average (theoretical unspliced) length" %
                       (bed_entries, average_length))

        return test

Example #50

0

Show file

File: test_helpers.py Project: daler/pybedtools

def test_getting_example_beds():
    """Example files resolve to the test data dir; bad paths raise."""
    assert 'a.bed' in pybedtools.list_example_files()

    expected_path = os.path.join(testdir, 'data', 'a.bed')
    assert pybedtools.example_filename('a.bed') == expected_path
    assert pybedtools.example_bedtool('a.bed').fn == expected_path

    # complain appropriately if nonexistent paths are asked for
    if six.PY3:
        missing_error = FileNotFoundError
    else:
        missing_error = ValueError
    lookups = (
        pybedtools.example_filename,
        pybedtools.example_bedtool,
        pybedtools.set_tempdir,
    )
    for lookup in lookups:
        with pytest.raises(missing_error):
            lookup('nonexistent')

Example #51

0

Show file

File: run_spades.py Project: BioinformaticsArchive/metasv

def run_spades_parallel(bam=None, spades=None, bed=None, work=None, pad=SPADES_PAD, nthreads=1, chrs=[], max_interval_size=50000,
                        timeout=SPADES_TIMEOUT, isize_min=ISIZE_MIN, isize_max=ISIZE_MAX, disable_deletion_assembly=False, stop_on_fail=False):
    """Assemble the intervals of a BED file in parallel worker processes.

    Intervals (optionally restricted to ``chrs``) are split into those
    accepted by ``should_be_assembled`` and those accepted by
    ``shouldnt_be_assembled``.  The former are distributed round-robin over
    ``nthreads`` ``run_spades_single`` workers; the FASTA files collected by
    the callback are concatenated into ``<work>/spades_assembled.fa`` and
    indexed.  The latter are written to ``<work>/ignored.bed``.

    Args:
        bam, spades, pad, timeout, isize_min, isize_max, stop_on_fail:
            forwarded unchanged to ``run_spades_single``.
        bed: BED file with the candidate intervals.
        work: working directory, also used as the pybedtools temp directory.
        nthreads: number of worker processes.
        chrs: chromosomes to keep; empty means all.  Never mutated.
        max_interval_size: accepted for compatibility; not used here.
        disable_deletion_assembly: forwarded to the interval predicates.

    Returns:
        Tuple ``(assembled_fasta, ignored_bed)``; ``ignored_bed`` is None
        when no interval was ignored.
    """
    pybedtools.set_tempdir(work)

    bedtool = pybedtools.BedTool(bed)

    chrs = set(chrs)
    all_intervals = [interval for interval in bedtool if not chrs or interval.chrom in chrs]
    # Real lists instead of filter(): on Python 3 filter() is a one-shot
    # iterator, which would be exhausted after the first worker and would
    # always be truthy in the ``if ignored_intervals`` check below.
    selected_intervals = [interval for interval in all_intervals
                          if should_be_assembled(interval, disable_deletion_assembly=disable_deletion_assembly)]
    ignored_intervals = [interval for interval in all_intervals
                         if shouldnt_be_assembled(interval, disable_deletion_assembly=disable_deletion_assembly)]

    pool = multiprocessing.Pool(nthreads)
    assembly_fastas = []
    for i in range(nthreads):
        # Round-robin split: worker i gets intervals i, i+nthreads, ...
        intervals = selected_intervals[i::nthreads]
        kwargs_dict = {"intervals": intervals, "bam": bam, "spades": spades, "work": "%s/%d" % (work, i), "pad": pad,
                       "timeout": timeout, "isize_min": isize_min, "isize_max": isize_max, "stop_on_fail": stop_on_fail}
        pool.apply_async(run_spades_single, kwds=kwargs_dict,
                         callback=partial(run_spades_single_callback, result_list=assembly_fastas))

    pool.close()
    pool.join()

    logger.info("Merging the contigs from %s" % (str(assembly_fastas)))
    assembled_fasta = os.path.join(work, "spades_assembled.fa")
    with open(assembled_fasta, "w") as assembled_fd:
        # Explicit loop instead of fileinput.input(): with an empty list
        # fileinput silently falls back to reading standard input.
        for assembly_fasta in assembly_fastas:
            with open(assembly_fasta) as assembly_fd:
                for line in assembly_fd:
                    assembled_fd.write("%s\n" % (line.strip()))

    logger.info("Indexing the assemblies")
    pysam.faidx(assembled_fasta)

    ignored_bed = None
    if ignored_intervals:
        ignored_bed = os.path.join(work, "ignored.bed")
        pybedtools.BedTool(ignored_intervals).each(add_breakpoints).saveas(ignored_bed)

    pybedtools.cleanup(remove_all=True)

    return assembled_fasta, ignored_bed

Example #52

0

Show file

File: AnnotateRNAz.py Project: bgruening/galaxytools

def annotate(bed, input, bedout, rnazout):
    """Intersect RNAz hits with an annotation BED and write both outputs.

    Returns 1 on success.  Any exception is reported on stderr as a
    formatted traceback and the function then returns None.
    """
    try:
        # Make sure we do not write somewhere we are not supposed to
        pybedtools.set_tempdir('.')
        anno = pybedtools.BedTool(bed)
        rnaz = readrnaz(input)
        tmpbed = pybedtools.BedTool(rnaztobed(rnaz), from_string=True)

        # intersect strand specific, keep all info on a and b files
        intersection = tmpbed.intersect(anno, wa=True, wb=True, s=True)

        bedtornaz(intersection, rnaz, bedout, rnazout)
    except Exception:
        failure = tb.TracebackException(*sys.exc_info())
        print(''.join(failure.format()), file=sys.stderr)
    else:
        return 1

Example #53

0

Show file

File: funcs_large_bed.py Project: frichter/BedPyMP

def overlap_with_observed(bed_name, observed_name, observed_denovo):
    """Count the observed de novos that overlap with the BED file.

    The intersection of ``<bed_name>.merged.sorted.noRmsk.noSD.bed`` with
    the de novo calls is cached on disk and reloaded on later calls.

    Args:
        bed_name: path prefix of the annotation BED file.
        observed_name: label of the observed call set; used in the cache
            directory and file name.
        observed_denovo: file name of the de novo calls inside the
            hard-coded variant_calls directory.

    Returns:
        Number of records in the intersection.
    """
    pybedtools.set_tempdir('/sc/orga/scratch/richtf01')
    bed_intersect_dir = ("/sc/orga/projects/chdiTrios/Felix/wgs/" +
        "anno_obs_intersect/" + observed_name + "/")
    bed_out = (bed_intersect_dir + bed_name +
        '.merged.sorted.noRmsk.noSD.' + observed_name + '.bed')
    denovo_bed = BedTool('/hpc/users/richtf01/whole_genome/' +
        'variant_calls/' + observed_denovo)
    # create or load intersection file
    # (single-argument print(...) runs unchanged on Python 2 and 3)
    print(bed_name + " overlap with " + observed_name)
    if not os.path.isfile(bed_out):
        bed = BedTool(bed_name + '.merged.sorted.noRmsk.noSD.bed')
        print("intersecting.. ")
        denovo_anno = bed.intersect(denovo_bed)
        denovo_anno.saveas(bed_out)
    else:
        print("already intersected")
        denovo_anno = BedTool(bed_out)
    # Count by iterating, as before.
    return sum(1 for _ in denovo_anno)

Example #54

0

Show file

File: anno_info2.py Project: tianxiahuihui/bioinformatics

import sys, os
from subprocess import call
import pybedtools
from pybedtools import BedTool
import tabix
from pandas import *
from functools import reduce
import xlwt
import tempfile
# Read GWAVA_DIR from the environment; when the variable is unset, fall back
# to the hard-coded installation path below (not the script's parent
# directory).
GWAVA_DIR = os.getenv('GWAVA_DIR', '/public/home/chendenghui/run/work/non_soft/GWAVA')

# Point pybedtools at the tmp/ subdirectory of GWAVA_DIR for its temporary
# files.  NOTE(review): nothing here creates that directory -- confirm it
# exists before this module is imported.
pybedtools.set_tempdir(GWAVA_DIR+'/tmp/')

#['ATF3', 'BATF', 'BCL11A', 'BCL3', 'BCLAF1', 'BDP1', 'BHLHE40', 'BRCA1', 'BRF1', 'BRF2', 'CCNT2', 'CEBPB', 'CHD2', 'CTBP2', 'CTCF', 'CTCFL', 'DNase', 'E2F1', 'E2F4', 'E2F6', 'EBF1', 'EGR1', 'ELF1', 'ELK4', 'EP300', 'ERALPHAA', 'ESRRA', 'ETS1', 'Eralphaa', 'FAIRE', 'FAM48A', 'FOS', 'FOSL1', 'FOSL2', 'FOXA1', 'FOXA2', 'GABPA', 'GATA1', 'GATA2', 'GATA3', 'GTF2B', 'GTF2F1', 'GTF3C2', 'H2AFZ', 'H3K27ac', 'H3K27me3', 'H3K36me3', 'H3K4me1', 'H3K4me2', 'H3K4me3', 'H3K79me2', 'H3K9ac', 'H3K9me1', 'H3K9me3', 'H4K20me1', 'HDAC2', 'HDAC8', 'HEY1', 'HMGN3', 'HNF4A', 'HNF4G', 'HSF1', 'IRF1', 'IRF3', 'IRF4', 'JUN', 'JUNB', 'JUND', 'KAT2A', 'MAFF', 'MAFK', 'MAX', 'MEF2_complex', 'MEF2A', 'MXI1', 'MYC', 'NANOG', 'NFE2', 'NFKB1', 'NFYA', 'NFYB', 'NR2C2', 'NR3C1', 'NR4A1', 'NRF1', 'PAX5', 'PBX3', 'POLR2A', 'POLR2A_elongating', 'POLR3A', 'POU2F2', 'POU5F1', 'PPARGC1A', 'PRDM1', 'RAD21', 'RDBP', 'REST', 'RFX5', 'RXRA', 'SETDB1', 'SIN3A', 'SIRT6', 'SIX5', 'SLC22A2', 'SMARCA4', 'SMARCB1', 'SMARCC1', 'SMARCC2', 'SMC3', 'SP1', 'SP2', 'SPI1', 'SREBF1', 'SREBF2', 'SRF', 'STAT1', 'STAT2', 'STAT3', 'SUZ12', 'TAF1', 'TAF7', 'TAL1', 'TBP', 'TCF12', 'TCF7L2', 'TFAP2A', 'TFAP2C', 'THAP1', 'TRIM28', 'USF1', 'USF2', 'WRNIP1', 'XRCC4', 'YY1', 'ZBTB33', 'ZBTB7A', 'ZEB1', 'ZNF143', 'ZNF263', 'ZNF274', 'ZZZ3']

def encode_feats(vf, af):
    results = {}
    cols = open(af+'.cols', 'r').readline().strip().split(',')
    #intersection = vs.intersect(feats, wb=True)#TRUE
    tempfile1 = tempfile.mktemp()
    sort_cmd1 = 'bedtools intersect -wb -a %s -b %s > %s' % (vf, af, tempfile1)
    call(sort_cmd1, shell=True)
    tempfile2 = tempfile.mktemp()
    sort_cmd2 = 'awk -F \'\t\' \'{print $1"\t"$2"\t"$3"\t"$4"\t"$10"\t"$5"_"$6"_"$7"_"$8"_"$9"_"$10"_"$11"_"$12}\' %s > %s' % (tempfile1, tempfile2)
    call(sort_cmd2, shell=True)
    intersection = BedTool(tempfile2)
    annots = intersection.groupby(g=[1,2,3,4,5], c=6, ops='collapse')
    for entry in annots:
        #fs = entry[5].strip(',').split(',')

Example #55

0

Show file

File: bed_utils.py Project: vladsaveliev/Utils

def get_total_bed_size(bed_fpath, work_dir=None):
    """Return the summed ``len()`` of the merged intervals of a BED file.

    When ``work_dir`` is given, pybedtools temporary files go to its
    ``pybedtools_tmp`` subdirectory.
    """
    if work_dir:
        tmp_dir = safe_mkdir(join(work_dir, 'pybedtools_tmp'))
        pybedtools.set_tempdir(tmp_dir)

    total_size = 0
    for region in BedTool(bed_fpath).merge():
        total_size += len(region)
    return total_size

Example #56

0

Show file

File: annotate_bed_for_ml.py Project: MMesbahU/metasv

    fields.append(str(max_cov / average_coverage if average_coverage > 0 else 1))
    fields.append(";".join([str(i) for i in very_good_coverages]))
    fields.append(str(average_very_good_coverage))
    fields.append(str(min(very_good_coverages)))
    fields.append(str(max(very_good_coverages)))
    fields.append(str(min(very_good_coverages) / average_very_good_coverage if (average_very_good_coverage > 0) else 1))
    fields.append(str(max(very_good_coverages) / average_very_good_coverage if (average_very_good_coverage > 0) else 1))

    return pybedtools.create_interval_from_list(fields)


def add_coverage_information(in_bed, bam):
    """Apply ``annotate_coverage`` (bound to ``bam``) to every interval of ``in_bed``."""
    annotator = partial(annotate_coverage, bam=bam)
    return in_bed.each(annotator)


# Create the temp directory BEFORE handing it to pybedtools: set_tempdir
# rejects a directory that does not exist yet.
if not os.path.isdir(args.tmpdir):
    os.makedirs(args.tmpdir)

pybedtools.set_tempdir(args.tmpdir)

in_bed = pybedtools.BedTool(args.in_bed)

out_bed = in_bed
logger.info("Initial feature count %d" % (out_bed.count()))

bed_fields = ["#CHROM", "START", "END"]
bed_fields += ["NUM_CONTIGS_USED", "TOTAL_CONTIGS_COUNT"]
bed_fields += map(lambda x: "VERYGOOD_%s" % (x),
                  ["NUM_ASMS", "NUM_UNIQUE_ASMS", "HAS_ASM", "IS_CONSISTENT", "INSERTION_LENGTH", "ASM_START",
                   "ASM_END", "EXCISION_REF", "EXCISION_ASM"])
bed_fields += map(lambda x: "GOOD_%s" % (x),

Example #57

0

Show file

File: run_age.py Project: sbandara/metasv

def run_age_parallel(intervals_bed=None, reference=None, assembly=None, pad=AGE_PAD, age=None, age_workdir=None,
                     timeout=AGE_TIMEOUT, keep_temp=False, assembly_tool="spades", chrs=[], nthreads=1,
                     min_contig_len=AGE_MIN_CONTIG_LENGTH,
                     max_region_len=AGE_MAX_REGION_LENGTH, sv_types=[], 
                     min_del_subalign_len=MIN_DEL_SUBALIGN_LENGTH, min_inv_subalign_len=MIN_INV_SUBALIGN_LENGTH,
                     age_window = AGE_WINDOW_SIZE):
    """Dispatch ``run_age_single`` over the assembled contigs in parallel.

    Contigs are read from the headers of the assembly FASTA, filtered by
    chromosome, SV type, contig length and SV-region length, and grouped by
    SV region.  The regions are split round-robin over ``nthreads`` worker
    processes.  The breakpoint BED files collected by the callback are
    concatenated, sorted and saved as ``<age_workdir>/breakpoints.bed``.

    Args:
        intervals_bed, reference, pad, age, timeout, keep_temp,
        min_del_subalign_len, min_inv_subalign_len, age_window:
            forwarded unchanged to ``run_age_single``.
        assembly: assembly FASTA; indexed first when no ``.fai`` exists.
            When falsy, no contigs are loaded.
        age_workdir: working directory (created when missing); also the
            pybedtools temp directory and the output directory.
        assembly_tool: "spades" or "tigra"; selects the contig header parser.
        chrs: chromosomes to keep; empty means all.  Never mutated.
        nthreads: upper bound on the number of worker processes.
        min_contig_len: contigs shorter than this are counted and ignored.
        max_region_len: contigs with a longer SV region are skipped.
        sv_types: SV types to keep; empty means all.  Never mutated.

    Returns:
        Path of the merged, sorted breakpoints BED file, or None when there
        is nothing to process or no worker produced a result.

    Raises:
        ValueError: if ``assembly`` is given and ``assembly_tool`` is
            neither "spades" nor "tigra" (previously this surfaced later as
            an unbound-variable error).
    """
    func_logger = logging.getLogger("%s-%s" % (run_age_parallel.__name__, multiprocessing.current_process()))

    if not os.path.isdir(age_workdir):
        func_logger.info("Creating %s" % age_workdir)
        os.makedirs(age_workdir)

    contigs = []
    if assembly:
        if assembly_tool == "spades":
            contig_class = SpadesContig
        elif assembly_tool == "tigra":
            contig_class = TigraContig
        else:
            raise ValueError("Unsupported assembly_tool: %r" % (assembly_tool,))

        if not os.path.isfile("%s.fai" % assembly):
            func_logger.info("Assembly FASTA wasn't indexed. Will attempt to index now.")
            pysam.faidx(assembly)

        func_logger.info("Loading assembly contigs from %s" % assembly)
        with open(assembly) as assembly_fd:
            # Only the FASTA header lines describe contigs.
            contigs = [contig_class(line[1:]) for line in assembly_fd if line[0] == '>']

    chrs = set(chrs)
    sv_types = set(sv_types)

    func_logger.info("Generating the contig dictionary for parallel execution")
    # SV region tuple -> contigs that passed every filter.  Keys are created
    # only for regions that have at least one such contig.
    contig_dict = {}
    small_contigs_count = 0
    for contig in contigs:
        if contig.sv_region.length() > max_region_len: 
            func_logger.info("Too large SV region length: %d > %d" % (contig.sv_region.length(),max_region_len))
            continue
        if (len(chrs) == 0 or contig.sv_region.chrom1 in chrs) and (len(sv_types) == 0 or contig.sv_type in sv_types):
            if contig.sequence_len >= min_contig_len:
                contig_dict.setdefault(contig.sv_region.to_tuple(), []).append(contig)
            else:
                small_contigs_count += 1

    region_list = sorted(contig_dict.keys())
    nthreads = min(nthreads, len(region_list))

    if nthreads == 0:
        func_logger.warning("AGE not run since no contigs found")
        return None

    func_logger.info("Will process %d regions with %d contigs (%d small contigs ignored) using %d threads" % (
        len(region_list), sum([len(value) for value in contig_dict.values()]), small_contigs_count, nthreads))

    pybedtools.set_tempdir(age_workdir)
    pool = multiprocessing.Pool(nthreads)

    breakpoints_beds = []
    # ``range`` instead of the Python 2-only ``xrange``.
    for i in range(nthreads):
        # Round-robin split: worker i gets regions i, i+nthreads, ...
        region_sublist = region_list[i::nthreads]
        kwargs_dict = {"intervals_bed": intervals_bed, "region_list": region_sublist, "contig_dict": contig_dict,
                       "reference": reference, "assembly": assembly, "pad": pad, "age": age, "age_workdir": age_workdir,
                       "timeout": timeout, "keep_temp": keep_temp, "myid": i, 
                       "min_del_subalign_len": min_del_subalign_len, "min_inv_subalign_len": min_inv_subalign_len,
                       "age_window" : age_window}
        pool.apply_async(run_age_single, args=[], kwds=kwargs_dict,
                         callback=partial(run_age_single_callback, result_list=breakpoints_beds))

    pool.close()
    pool.join()

    func_logger.info("Finished parallel execution")

    func_logger.info("Will merge the following breakpoints beds %s" % (str(breakpoints_beds)))

    pybedtools.cleanup(remove_all=True)

    if not breakpoints_beds:
        return None

    bedtool = pybedtools.BedTool(breakpoints_beds[0])
    for bed_file in breakpoints_beds[1:]:
        bedtool = bedtool.cat(pybedtools.BedTool(bed_file), postmerge=False)

    bedtool = bedtool.moveto(os.path.join(age_workdir, "breakpoints_unsorted.bed"))
    merged_bed = os.path.join(age_workdir, "breakpoints.bed")
    bedtool.sort().saveas(merged_bed)

    return merged_bed

Example #58

0

Show file

File: test1.py Project: Fabrices/pybedtools

import pybedtools
import os, difflib, sys
from nose.tools import assert_raises, raises
from pybedtools.helpers import BEDToolsError

# Directory containing this test module.
testdir = os.path.dirname(__file__)

# Have pybedtools write its temporary files to the current working
# directory for the duration of the test run.
pybedtools.set_tempdir(".")


def fix(x):
    """
    Turn loosely formatted text into tab-delimited lines.

    Each line is lstrip()ped, empty lines are dropped and runs of whitespace
    become single tabs.  A line that ends with a tab keeps one trailing tab.
    Makes it really easy to create BED files on the fly for testing and
    checking.
    """
    fixed_lines = []
    for raw_line in x.splitlines():
        line = raw_line.lstrip()
        if not line:
            continue
        trailing_tab = "\t" if line.endswith("\t") else ""
        fixed_lines.append("\t".join(line.split()) + trailing_tab + "\n")
    return "".join(fixed_lines)

Example #59

0

Show file

File: test_helpers.py Project: Fabrices/pybedtools

import pybedtools
import sys
import os, difflib
from nose.tools import assert_raises

# Directory containing this test module.
testdir = os.path.dirname(__file__)

# Temporary files created by pybedtools go to the current working directory.
pybedtools.set_tempdir('.')

def fix(x):
    """
    Normalise free-form text into tab-delimited lines.

    Every line is strip()ped, blank lines are discarded and the remaining
    whitespace-separated fields are re-joined with tabs, one line per input
    line.  Handy for writing BED content inline in tests.
    """
    stripped_lines = (line.strip() for line in x.splitlines())
    return "".join(
        '\t'.join(line.split()) + '\n' for line in stripped_lines if line
    )


def test_isBAM():
    """The example BAM file is recognised as BAM by ``helpers.isBAM``."""
    bam = pybedtools.example_filename('x.bam')
    notabam = pybedtools.example_filename('a.bed')
    # Create (or truncate) an empty helper file in the working directory.
    with open('tiny.txt', 'w'):
        pass
    assert pybedtools.helpers.isBAM(bam)

Example #60

0

Show file

File: tfuncs.py Project: dnewkirk/pybedtools

def setup():
    """Create the test temp directory if needed and point pybedtools at it."""
    if not os.path.exists(test_tempdir):
        # os.makedirs replaces the former ``mkdir -p`` shell call: no
        # subprocess, and paths containing spaces are handled correctly.
        os.makedirs(test_tempdir)
    pybedtools.set_tempdir(test_tempdir)