Example #1
# Assumed imports for this snippet (not shown in the original excerpt):
import numpy as np
import pybedtools as pb
import matplotlib.pyplot as plt
from scipy.stats import spearmanr, pearsonr


def run(parser):
    args = parser.parse_args()
    reffile = args.reference
    if args.reference == 'hg19':
        reffile = './hg19.tss.bed'
    elif args.reference == 'mm10':
        reffile = './mm10.tss.bed'
    ref = pb.BedTool(reffile)
    meth = pb.BedTool(args.bedfile)
    methtss = ref.window(meth, l=args.upstream,
                         r=args.downstream).groupby(g=[1, 2, 3, 4, 5, 6],
                                                    c=9,
                                                    o=['mean'])
    with open(args.RNAseq) as f:
        lines = f.readlines()
    dic = {}
    for line in lines:
        t = line.strip().split()
        dic[t[0]] = [float(t[1])]
    for m in methtss:
        if m[3] in dic:  # position of the gene name
            dic[m[3]].append(float(m[-1]))
    #print(dic)
    plt.figure()
    rexp = []
    mlevel = []
    for d in dic:
        if len(dic[d]) != 2: continue
        rexp.append(dic[d][0])
        mlevel.append(dic[d][1])
    rexp = np.array(rexp)
    mlevel = np.array(mlevel)
    pos = np.where(rexp > -100)
    mlevel = mlevel[pos]
    rexp = rexp[pos]
    #rexp[np.where(rexp==0)]=0.01
    #low=np.sort(rexp)[len(rexp)//10]
    ##log transform the gene expression values
    rexp = np.log(rexp + 1) / np.log(10)
    max_exp = np.max(rexp)
    min_exp = np.nanmin(rexp)
    max_mlevel = np.max(mlevel)
    min_mlevel = np.min(mlevel)
    #print(np.where(rexp>low)[0])
    #    plt.plot(rexp,mlevel,'b.',alpha=0.2)
    #t = np.arange(len(rexp))
    plt.scatter(mlevel, rexp, c='b', alpha=0.1)
    #plt.colorbar()
    #plt.xlim(0,15)
    plt.ylim(min_exp * 1.03, max_exp * 1.03)
    #plt.ylim(-1,1)
    plt.xlim(min_mlevel * 1.05, max_mlevel * 1.05)
    #plt.xlabel('Gene Expression Level (Log10)')
    #plt.ylabel('Methylation Ratio')
    plt.xlabel(args.xaxislabel)
    plt.ylabel(args.yaxislabel)
    #    plt.plot([0,np.max(rexp)],[1,0],'r-')
    spearman, p1 = spearmanr(rexp, mlevel)
    pearson, p2 = pearsonr(rexp, mlevel)
    geneNum = len(rexp)
    #print geneNum

    s1 = 'Spearman correlation coefficient: %.3f p-value: %.3f' % (spearman, p1)
    s2 = 'Pearson correlation coefficient: %.3f p-value: %.3f' % (pearson, p2)
    s3 = 'Total genes: %d' % geneNum
    #plt.text()
    #plt.text(0,1.1,s2)
    #plt.text(0,1.05,s1)
    plt.text(min_mlevel, max_exp * 0.9, s3)
    plt.text(min_mlevel, max_exp * 1.1, s2)
    plt.text(min_mlevel, max_exp * 1.05, s1)
    plt.savefig(args.output + '.pdf')
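A minimal sketch of the window-then-groupby pattern used above, with invented file contents. The column passed to groupby depends on the field counts of both files; here the TSS file has 6 fields and the methylation file 4, so the ratio is overall column 10 (the original uses c=9 because its input layout differs):

import pybedtools as pb

tss = pb.BedTool("chr1 1000 1001 geneA 0 +", from_string=True)
meth = pb.BedTool("""
chr1 900 901 0.8
chr1 1100 1101 0.4
""", from_string=True)

# pair each TSS with methylation calls within 500 bp on either side,
# then average the ratio column per TSS
methtss = tss.window(meth, l=500, r=500).groupby(g=[1, 2, 3, 4, 5, 6],
                                                 c=10, o=['mean'])
print(methtss)  # chr1  1000  1001  geneA  0  +  0.6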
Example #2
def run_age_parallel(intervals_bed=None,
                     reference=None,
                     assembly=None,
                     pad=AGE_PAD,
                     age=None,
                     age_workdir=None,
                     timeout=AGE_TIMEOUT,
                     keep_temp=False,
                     assembly_tool="spades",
                     chrs=[],
                     nthreads=1,
                     min_contig_len=AGE_MIN_CONTIG_LENGTH,
                     max_region_len=AGE_MAX_REGION_LENGTH,
                     sv_types=[]):
    func_logger = logging.getLogger(
        "%s-%s" %
        (run_age_parallel.__name__, multiprocessing.current_process()))

    if not os.path.isdir(age_workdir):
        func_logger.info("Creating %s" % age_workdir)
        os.makedirs(age_workdir)

    if not os.path.isfile("%s.fai" % assembly):
        func_logger.info(
            "Assembly FASTA wasn't indexed. Will attempt to index now.")
        pysam.faidx(assembly)

    func_logger.info("Loading assembly contigs from %s" % assembly)
    with open(assembly) as assembly_fd:
        if assembly_tool == "spades":
            contigs = [
                SpadesContig(line[1:]) for line in assembly_fd
                if line[0] == '>'
            ]
        elif assembly_tool == "tigra":
            contigs = [
                TigraContig(line[1:]) for line in assembly_fd if line[0] == '>'
            ]

    chrs = set(chrs)
    sv_types = set(sv_types)
    contig_dict = {
        contig.sv_region.to_tuple(): []
        for contig in contigs
        if (len(chrs) == 0 or contig.sv_region.chrom1 in chrs)
        and contig.sequence_len >= min_contig_len
        and contig.sv_region.length() <= max_region_len and (
            len(sv_types) == 0 or contig.sv_type in sv_types)
    }

    func_logger.info("Generating the contig dictionary for parallel execution")
    small_contigs_count = 0
    for contig in contigs:
        if contig.sv_region.length() > max_region_len: continue
        if (len(chrs) == 0 or contig.sv_region.chrom1 in chrs) and (
                len(sv_types) == 0 or contig.sv_type in sv_types):
            if contig.sequence_len >= min_contig_len:
                contig_dict[contig.sv_region.to_tuple()].append(contig)
            else:
                small_contigs_count += 1

    region_list = sorted(contig_dict.keys())
    nthreads = min(nthreads, len(region_list))

    func_logger.info(
        "Will process %d regions with %d contigs (%d small contigs ignored) using %d threads"
        %
        (len(region_list), sum([len(value) for value in contig_dict.values()
                                ]), small_contigs_count, nthreads))

    pybedtools.set_tempdir(age_workdir)
    pool = multiprocessing.Pool(nthreads)

    breakpoints_beds = []
    for i in range(nthreads):
        region_sublist = [
            region for (j, region) in enumerate(region_list)
            if (j % nthreads) == i
        ]
        kwargs_dict = {
            "intervals_bed": intervals_bed,
            "region_list": region_sublist,
            "contig_dict": contig_dict,
            "reference": reference,
            "assembly": assembly,
            "pad": pad,
            "age": age,
            "age_workdir": age_workdir,
            "timeout": timeout,
            "keep_temp": keep_temp,
            "myid": i
        }
        pool.apply_async(run_age_single,
                         args=[],
                         kwds=kwargs_dict,
                         callback=partial(run_age_single_callback,
                                          result_list=breakpoints_beds))

    pool.close()
    pool.join()

    func_logger.info("Finished parallel execution")

    func_logger.info("Will merge the following breakpoints beds %s" %
                     (str(breakpoints_beds)))

    pybedtools.cleanup(remove_all=True)

    if not breakpoints_beds:
        return None

    bedtool = pybedtools.BedTool(breakpoints_beds[0])
    for bed_file in breakpoints_beds[1:]:
        bedtool = bedtool.cat(pybedtools.BedTool(bed_file), postmerge=False)

    merged_bed = os.path.join(age_workdir, "breakpoints.bed")
    bedtool.sort().saveas(merged_bed)

    return merged_bed
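The per-process BED files above are combined with cat(postmerge=False), which concatenates records instead of merging overlapping ones; a small self-contained sketch of that merge step:

import pybedtools

part0 = pybedtools.BedTool("chr1 10 20 bp1", from_string=True)
part1 = pybedtools.BedTool("chr1 5 15 bp2", from_string=True)

combined = part0
for bt in [part1]:
    # postmerge=False keeps every record, even overlapping ones
    combined = combined.cat(bt, postmerge=False)
print(combined.sort())  # both intervals survive, ordered by position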
Example #3
def _save_dict(bed, out_fname, val_index=None):
    """Save data from dict to BED file."""
    sites = pybedtools.BedTool(_iter_bed_dict(bed,
                                              val_index=val_index)).saveas()
    sites1 = sites.sort().saveas(out_fname)
    return sites1
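_iter_bed_dict is not shown; the pattern of building a BedTool from a generator and persisting it with saveas() can be sketched with a hypothetical iterator (the dict layout here is our invention):

import pybedtools

def iter_bed_dict(bed, val_index=None):
    # hypothetical stand-in: yield one interval per (chrom, start, end) key
    for (chrom, start, end), score in sorted(bed.items()):
        yield pybedtools.create_interval_from_list(
            [chrom, str(start), str(end), ".", str(score), "+"])

bed = {("chr1", 10, 20): 3, ("chr1", 5, 8): 7}
sites = pybedtools.BedTool(iter_bed_dict(bed)).saveas()  # materialize to a tmp file
sites.sort().saveas("sites.bed")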
Example #4
import random
import numpy
import pybedtools
import operator
from operator import itemgetter
from pybedtools import BedTool

count_list = []
total_list = []
average_list = []
sorted_list = []
token_list = []

print('Loading variants.\n')
b = pybedtools.BedTool('cosmicchr1.bed').sort()
print('Loaded ' + str(b.count()))

num_trials = 10
max_rand_shift = 1000
pre_test = ""

print('Loading matches.\n')
with open("matchestest.txt", "r") as m:
    matches = m.readlines()
    for line in matches:
        tokens = line.split('\t')
        #         chromo = tokens[0]
        #         start = int(tokens[1])
        #         end=int(tokens[2])
        #         name=tokens[3]
        #pre_test = pre_test + str(token_list)
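The num_trials and max_rand_shift variables suggest a shift-based permutation test that the excerpt never reaches. A sketch of one way the thought could continue, assuming matchestest.txt is BED-like (chrom/start/end/name, as the tab-split parsing suggests); b, num_trials and max_rand_shift come from the code above:

import random
import pybedtools

def random_shift(feature, max_shift=1000):
    # move an interval by a random offset, preserving its length
    offset = random.randint(-max_shift, max_shift)
    length = feature.end - feature.start
    feature.start = max(0, feature.start + offset)
    feature.end = feature.start + length
    return feature

matches_bt = pybedtools.BedTool("matchestest.txt")
observed = b.intersect(matches_bt, u=True).count()
null_counts = []
for _ in range(num_trials):
    shifted = b.each(random_shift, max_shift=max_rand_shift).saveas()
    null_counts.append(shifted.intersect(matches_bt, u=True).count())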
Example #5
def main():
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('-v', '--variants', required=True, help='Default VCF')
    parser.add_argument('-r', '--RDtest')
    parser.add_argument('-b', '--BAFtest')
    parser.add_argument('-s', '--SRtest')
    parser.add_argument('-p', '--PEtest')
    parser.add_argument('--batch-list', type=argparse.FileType('r'))
    parser.add_argument('--segdups', required=True)
    parser.add_argument('--rmsk', required=True)
    parser.add_argument('--fam')
    parser.add_argument('-d', '--bed', action='store_true', default=False)
    parser.add_argument('fout')
    args = parser.parse_args()

    if args.bed:
        # argparse defines batch_list even when the flag is absent, so test
        # its value instead of using hasattr()
        if args.batch_list is None:
            raise Exception('batch list must be specified when passing a bed')
        variants = open(args.variants)
        dtypes = 'RD BAF'.split()
    else:
        variants = pysam.VariantFile(args.variants)
        dtypes = 'PE SR RD BAF'.split()

    metadata = process_metadata(variants, args.bed, args.batch_list)

    # Calculate segdup coverage
    bt = pbt.BedTool.from_dataframe(metadata['chrom start end'.split()])
    segdups = pbt.BedTool(args.segdups)
    cov = bt.coverage(segdups).to_dataframe()
    metadata['poor_region_cov'] = cov.thickStart

    # Check if endpoints are in repeat-masked sequence
    starts = metadata['chrom start end name'.split()].copy()
    starts['end'] = starts['start'] + 1
    ends = metadata['chrom start end name'.split()].copy()
    ends['start'] = ends['end'] - 1
    endpoints = pd.concat([starts, ends])
    bt = pbt.BedTool.from_dataframe(endpoints)
    rmsk = pbt.BedTool(args.rmsk)
    sect = bt.intersect(rmsk, u=True)
    rmasked_names = [i.fields[3] for i in sect.intervals]
    metadata['rmsk'] = metadata.name.isin(rmasked_names)

    metadata = metadata.set_index('name')

    evidence = deque()

    for dtype in dtypes:
        dtable = getattr(args, dtype + 'test')
        if dtable is None:
            continue

        df = pd.read_table(dtable)

        df = preprocess(df, dtype)
        df = df.rename(columns=lambda c: dtype + '_' + c if c != 'name' else c)
        df = df.set_index('name')
        evidence.append(df)

    evidence = list(evidence)
    evidence = metadata.join(evidence, how='outer', sort=True)
    evidence = evidence.reset_index().rename(columns={'index': 'name'})

    has_petest = (getattr(args, 'PEtest') is not None)
    has_srtest = (getattr(args, 'SRtest') is not None)
    if not args.bed and has_petest and has_srtest:
        evidence = add_pesr(evidence)

    # Replace infinite log-pvals
    LOG_CEIL = 300
    evidence = evidence.replace(np.inf, LOG_CEIL)

    evidence = evidence.reindex(columns=make_columns())
    evidence.to_csv(args.fout, index=False, sep='\t', na_rep='NA')
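The poor_region_cov line relies on how coverage() lays out its output: with a 3-column A file, bedtools coverage appends hit count, covered bases, interval length and covered fraction, and to_dataframe() assigns default BED column names, so the fraction lands in the column pandas calls thickStart. A toy check with made-up coordinates:

import pybedtools as pbt

bt = pbt.BedTool("chr1 0 100", from_string=True)
segdups = pbt.BedTool("chr1 0 50", from_string=True)
cov = bt.coverage(segdups).to_dataframe()
# columns: chrom, start, end, name (count), score (bases), strand (length),
# thickStart (fraction covered)
print(cov.thickStart.iloc[0])  # 0.5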
Example #6
def get_aligned_reads_from_multi_mp(obj, nproc, passed_cells):

    global my_read_dict
    global my_intersect_all
    global my_ex_coord
    global my_uniq_r_bclist

    my_intersect_all = None
    my_read_dict = None
    my_ex_coord = None
    my_uniq_r_bclist = obj.uniq_r_bclist.copy()

    samfile = pysam.AlignmentFile(obj.in_bam_multi, "rc")
    try:
        r_iterator = samfile.fetch(obj.chrom, int(obj.start), int(obj.end))
    except:
        return obj

    rcds = np.array([[r_idx, x.to_dict(), x.get_blocks()]
                     for r_idx, x in enumerate(r_iterator)
                     if x.flag in obj.strand_flags[obj.strand]
                     and list(filter(regx1.match,
                                     x.to_dict()['tags']))[0].replace(
                                         'BC:Z:', '') in passed_cells])
    pool = mp.Pool(processes=nproc)
    func = partial(_make_dict2_mp, obj.chrom, obj.strand, obj.gene)

    read_dict_list = pool.map(func, rcds, chunksize=1)
    pool.close()

    my_read_dict = {}
    # merge the per-read dicts returned by the worker processes
    for elem in read_dict_list:
        if elem is not None:
            my_read_dict.update(elem)
    df = [my_read_dict[r_idx]['r_blocks'] for r_idx in my_read_dict.keys()]
    samfile.close()

    if len(df) == 0: return obj
    pd.concat(df, axis=0).to_csv('%s/.tempDir/_%s_multi_reads_blocks.bed' %
                                 (obj.outdir, obj.gene),
                                 index=False,
                                 sep="\t",
                                 header=False)
    read_bed = pybedtools.BedTool('%s/.tempDir/_%s_multi_reads_blocks.bed' %
                                  (obj.outdir, obj.gene))

    tmp = obj.ex_bed.intersect(read_bed, wa=True, wb=True)
    if os.stat(tmp.fn).st_size == 0:
        return obj

    my_intersect_all = tmp.to_dataframe()
    read_idx_list = list(set(my_intersect_all.iloc[:, 9].values))

    my_ex_coord = ','.join(
        obj.exons.apply(lambda x: '%s-%s' % (x[1], x[2]), axis=1).values)

    pool = mp.Pool(processes=nproc, initializer=_initialize_make_list_aligned)
    aligned_reads = pool.map(_make_list_aligned_reads_mp,
                             read_idx_list,
                             chunksize=1)
    pool.close()

    colnames = [
        'name', 'flag', 'ref_name', 'ref_pos', 'map_quality', 'cigar',
        'next_ref_name', 'next_ref_pos', 'length', 'seq', 'qual', 'tags',
        'read_mapped_position', 'geneid', 'Exon_Index', 'Category', 'BC', 'UB',
        'exon_coordinates'
    ]

    obj.multi_aligned_reads = pd.DataFrame(aligned_reads,
                                           columns=colnames).drop_duplicates()
    obj.multi_aligned_reads.insert(19, 'MapFlag', 'multi')

    return obj
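The iloc[:, 9] lookup above depends on the column layout of intersect(wa=True, wb=True): all A fields first, then all B fields. A toy sketch showing where a B-file field ends up, assuming a 6-column exon record and a 4-column read record:

import pybedtools

exons = pybedtools.BedTool("chr1 100 200 exon1 0 +", from_string=True)  # 6 A fields
reads = pybedtools.BedTool("chr1 150 160 7", from_string=True)          # 4 B fields
for iv in exons.intersect(reads, wa=True, wb=True):
    # fields 0-5 come from A, fields 6-9 from B
    print(iv.fields[9])  # '7', the read index carried in B's 4th column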
Example #7
def get_bedtools_features(strFileName):
    btFeatures = pbt.BedTool(strFileName)
    return btFeatures
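Usage of this thin wrapper is one line; note that constructing a BedTool is lazy, so errors in the file only surface once it is iterated ("peaks.bed" is a placeholder name):

btFeatures = get_bedtools_features("peaks.bed")
for interval in btFeatures:
    print(interval.chrom, interval.start, interval.end)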
Example #8
def parallel_genotype_intervals(intervals_file=None,
                                bam=None,
                                workdir=None,
                                nthreads=1,
                                chromosomes=[],
                                window=DEFAULT_GT_WINDOW,
                                isize_mean=DEFAULT_GT_ISIZE_MEAN,
                                isize_sd=DEFAULT_GT_ISIZE_SD,
                                normal_frac_threshold=DEFAULT_GT_NORMAL_FRAC):
    func_logger = logging.getLogger("%s-%s" %
                                    (parallel_genotype_intervals.__name__,
                                     multiprocessing.current_process()))
    if workdir and not os.path.isdir(workdir):
        os.makedirs(workdir)

    chromosomes = set(chromosomes)

    start_time = time.time()

    bedtool = pybedtools.BedTool(intervals_file)
    selected_intervals = [
        interval for interval in bedtool
        if not chromosomes or interval.chrom in chromosomes
    ]
    nthreads = min(len(selected_intervals), nthreads)
    # ceil division; must stay an integer for use in the slice bounds below
    intervals_per_process = (len(selected_intervals) + nthreads - 1) // nthreads

    pool = multiprocessing.Pool(nthreads)
    genotyped_beds = []
    for i in range(nthreads):
        process_workdir = os.path.join(workdir, str(i))
        if not os.path.isdir(process_workdir):
            os.makedirs(process_workdir)
        process_intervals = pybedtools.BedTool(
            selected_intervals[i * intervals_per_process:(i + 1) *
                               intervals_per_process]).saveas(
                                   os.path.join(process_workdir,
                                                "ungenotyped.bed"))
        kwargs_dict = {
            "intervals_file": process_intervals.fn,
            "bam": bam,
            "workdir": process_workdir,
            "window": window,
            "isize_mean": isize_mean,
            "isize_sd": isize_sd,
            "normal_frac_threshold": normal_frac_threshold
        }
        pool.apply_async(genotype_intervals,
                         kwds=kwargs_dict,
                         callback=partial(genotype_intervals_callback,
                                          result_list=genotyped_beds))

    pool.close()
    pool.join()

    func_logger.info("Following BED files will be merged: %s" %
                     (str(genotyped_beds)))

    if not genotyped_beds:
        func_logger.warning("No intervals generated")
        return None

    pybedtools.set_tempdir(workdir)
    bedtool = pybedtools.BedTool(genotyped_beds[0])

    for bed_file in genotyped_beds[1:]:
        bedtool = bedtool.cat(pybedtools.BedTool(bed_file), postmerge=False)
    bedtool = bedtool.sort().moveto(os.path.join(workdir, "genotyped.bed"))

    func_logger.info(
        "Finished parallel genotyping of %d intervals in %g minutes" %
        (len(selected_intervals), (time.time() - start_time) / 60.0))

    return bedtool.fn
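This example slices the interval list into contiguous chunks (unlike the round-robin split in Example #2). The ceil-division arithmetic, isolated into a sketch:

def chunk(items, nthreads):
    # ceil-divide so the last chunk absorbs the remainder
    per = (len(items) + nthreads - 1) // nthreads
    return [items[i * per:(i + 1) * per] for i in range(nthreads)]

print(chunk(list(range(7)), 3))  # [[0, 1, 2], [3, 4, 5], [6]]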
Example #9
    def binned_stats(self, in_fname, nbins, split=False, **args):
        rpkm = args.get("rpkm", False)
        readlength = self.read_length()
        fragmentsize = self.fragmentsize
        if not fragmentsize:
            fragmentsize = readlength
        total_reads = 1
        if rpkm:
            total_reads = self.count() / 1000000.0
        ret = []
        count = 1
        # Only use a BedTool if really necessary, as BedTools does not close open files
        # on object deletion
        if self.ftype == "bam":
            in_track = SimpleBed(in_fname)
        else:
            in_track = pybedtools.BedTool(in_fname)

        #extend = fragmentsize - readlength
        for feature, min_strand, plus_strand in self.fetch_to_counts(in_track):
            binsize = (feature.end - feature.start) / float(nbins)
            row = []
            min_strand = [x - (fragmentsize - readlength) for x in min_strand]
            bin_start = feature.start
            while int(bin_start + 0.5) < feature.end:
                num_reads = 0
                i = 0
                c = 0
                while i < len(min_strand) and min_strand[i] <= int(
                        bin_start + binsize + 0.5):
                    if min_strand[i] + fragmentsize <= int(bin_start +
                                                           binsize + 0.5):
                        c += 1
                    num_reads += 1
                    i += 1
                min_strand = min_strand[c:]

                i = 0
                c = 0
                while i < len(plus_strand) and plus_strand[i] <= int(
                        bin_start + binsize + 0.5):
                    if plus_strand[i] + fragmentsize <= int(bin_start +
                                                            binsize + 0.5):
                        c += 1
                    num_reads += 1
                    i += 1
                plus_strand = plus_strand[c:]

                if rpkm:
                    per_kb = num_reads * (1000.0 / binsize)
                    row.append(per_kb / total_reads)
                else:
                    row.append(num_reads)
                bin_start += binsize
            if feature.strand == "-":
                row = row[::-1]
            ret.append([feature.chrom, feature.start, feature.end] + row)
            count += 1

        del in_track
        if split:
            return ret
        else:
            return ["\t".join([str(x) for x in r]) for r in ret]
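The rpkm branch in binned_stats scales each bin count by bin width and library size; a worked sketch of the same arithmetic with round numbers:

num_reads = 50                       # reads falling in one bin
binsize = 500.0                      # bin width in bp
total_reads = 20000000 / 1000000.0   # library size in millions of reads

per_kb = num_reads * (1000.0 / binsize)  # 100.0 reads per kilobase
rpkm = per_kb / total_reads              # 100.0 / 20.0 = 5.0
print(rpkm)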


###################################### MAIN ######################################

snake_log_obj = snakemake.log  # snakemake.log is a snakemake.io.Log object
sys.stdout = open(str(snake_log_obj), "w")  # str() yields the log filepath

out_dir = snakemake.params['out_dir']
out_prefix = snakemake.params['run_prefix']
annot_per_geneset = False
chromosome = snakemake.params['chromosome']
annotations = snakemake.params['annotations']
all_genes = snakemake.params['all_genes']
bimfile = '{}.{}.bim'.format(snakemake.params['bfile'], chromosome)
dict_of_beds = {}

for name_annot in annotations:
    dict_of_beds[name_annot] = pybedtools.BedTool('{}/{}.{}.bed'.format(
        out_dir + '/bed', out_prefix, name_annot))

make_annot_file_per_chromosome(chromosome, dict_of_beds, out_dir, out_prefix,
                               annot_per_geneset, bimfile, all_genes)

print("Make annot script is done!")
def make_annot_file_per_chromosome(chromosome, dict_of_beds, out_dir,
                                   out_prefix, annot_per_geneset, bimfile,
                                   all_genes):
    """ 
	Input
		chromosome: integer (1..22)
	
	*OBS* this function RELIES on MANY GLOBAL scope VARIABLES
	"""
    # TODO: parse variables to function

    ### make annot file
    print('making annot files for chromosome {}'.format(chromosome))
    df_bim = pd.read_csv(bimfile,
                         delim_whitespace=True,
                         usecols=[0, 1, 2, 3],
                         names=['CHR', 'SNP', 'CM', 'BP'])
    # (Pdb) df_bim.head()
    #    CHR          SNP        CM       BP
    # 0   21  rs146134162 -0.908263  9412099
    # 1   21  rs578050168 -0.908090  9412377
    # 2   21  rs527616997 -0.907297  9413645
    # 3   21  rs544748596 -0.906578  9414796
    # 4   21  rs528236937 -0.906500  9414921
    # iter_bim = [['chr'+str(x1), x2, x2] for (x1, x2) in np.array(df_bim[['CHR', 'BP']])]
    # ^ Python3 (but not Python2.7) gives the following error when calling "bimbed = BedTool(iter_bim)" in make_annot_file_per_chromosome()
    # /tools/anaconda/3-4.4.0/lib/python3.6/site-packages/pybedtools/cbedtools.pyx in pybedtools.cbedtools.IntervalIterator.__next__()
    # /tools/anaconda/3-4.4.0/lib/python3.6/site-packages/pybedtools/cbedtools.pyx in pybedtools.cbedtools.create_interval_from_list()
    # /tools/anaconda/3-4.4.0/lib/python3.6/site-packages/pybedtools/cbedtools.pyx in pybedtools.cbedtools.isdigit()
    # AttributeError: 'numpy.int64' object has no attribute 'isdigit'
    # SOLUTION: convert everything to strings --> ['chr'+str(x1), str(x2), str(x2)]
    # print(df_bim.head) --> useful for debugging
    # print(bimfile)
    iter_bim = [['chr' + str(x1), str(x2), str(x2)]
                for (x1, x2) in np.array(df_bim[['CHR', 'BP']])]
    bimbed = pybedtools.BedTool(iter_bim)
    counter = 1  # just to print status message
    list_df_annot = []
    for name_annotation in sorted(
            dict_of_beds):  # we sort to make output more consistent.
        print("CHR={} | annotation={}, #{}/#{}".format(chromosome,
                                                       name_annotation,
                                                       counter,
                                                       len(dict_of_beds)))
        bed_for_annot = dict_of_beds[name_annotation]  # get bed
        # (Pdb)len(bed_for_annot)
        # 66
        #  (Pdb) bed_for_annot.head()
        # chr1    51619935        52185000
        #  chr1   70410488        70871303
        #  chr1   85584164        86243933
        #  chr1   202948059       203355877
        #  chr10  43851792        44270066
        #  chr10  75681524        76110821
        #  chr10  76769912        77191206
        #  chr10  120663598       121138345
        #  chr11  118030300       118469926
        #  chr12  21454715        21871342

        annotbed = bimbed.intersect(
            bed_for_annot, wb=True
        )  # PT NOTE: this finds SNPs in bim file that OVERLAP with the annotation bed (gene)
        # chr22  24008141    24008141    chr22   24008021    24210630    ENSG00000250479 0.03038823367
        # chr22  24008403    24008403    chr22   24008021    24210630    ENSG00000250479 0.03038823367
        # chr22  24008409    24008409    chr22   24008021    24210630    ENSG00000250479 0.03038823367
        # chr22  24008465    24008465    chr22   24008021    24210630    ENSG00000250479 0.03038823367
        # chr22  24008495    24008495    chr22   24008021    24210630    ENSG00000250479 0.03038823367
        # chr22  24008497    24008497    chr22   24008021    24210630    ENSG00000250479 0.03038823367
        # chr22  24008503    24008503    chr22   24008021    24210630    ENSG00000250479 0.03038823367
        # chr22  24008699    24008699    chr22   24008021    24210630    ENSG00000250479 0.03038823367
        # chr22  24008773    24008773    chr22   24008021    24210630    ENSG00000250479 0.03038823367

        # annotbed = bed_for_annot.intersect(bimbed, wb=True) # PT NOTE: this finds the positions/intervals in the annotation bed (gene) that OVERLAP with the bim file. Only the part of each record where intersections occurred is reported.
        # *IMPORTANT*: bimbed.intersect(bed_for_annot) and bed_for_annot.intersect(bimbed) DO NOT return the same positions. However, they do return the same number of 'intersected features'. That is, the returned BedTool objects have the same length.
        # bed_for_annot.intersect(bimbed) returns features that span two bp (e.g. start=24008140, end=24008142), whereas bimbed.intersect(bed_for_annot) returns features that span a single bp (start=24008141, end=24008141)
        # use bed_for_annot.intersect(bimbed, wb=True) to understand this behavior better.
        # chr22  24008140    24008142    ENSG00000250479 0.03038823367   chr22   24008141    24008141
        # chr22  24008402    24008404    ENSG00000250479 0.03038823367   chr22   24008403    24008403
        # chr22  24008408    24008410    ENSG00000250479 0.03038823367   chr22   24008409    24008409
        # chr22  24008464    24008466    ENSG00000250479 0.03038823367   chr22   24008465    24008465
        # chr22  24008494    24008496    ENSG00000250479 0.03038823367   chr22   24008495    24008495
        # chr22  24008496    24008498    ENSG00000250479 0.03038823367   chr22   24008497    24008497
        # chr22  24008502    24008504    ENSG00000250479 0.03038823367   chr22   24008503    24008503
        # chr22  24008698    24008700    ENSG00000250479 0.03038823367   chr22   24008699    24008699
        # chr22  24008772    24008774    ENSG00000250479 0.03038823367   chr22   24008773    24008773

        ### DOCS .intersect()
        # the intervals reported are NOT the original gene intervals, but rather refined intervals reflecting solely the portion of each original gene interval that overlapped with the SNPs
        # The -wa (write A) and -wb (write B) options let one see the original records from the A and B files that overlapped. So instead of only showing where the intersections occurred, they show what intersected.
        # SEE MORE HERE: http://quinlanlab.org/tutorials/bedtools/bedtools.html
        # SEE https://daler.github.io/pybedtools/intersections.html
        # SEE https://daler.github.io/pybedtools/autodocs/pybedtools.bedtool.BedTool.intersect.html#pybedtools.bedtool.BedTool.intersect

        ### Extract data from annotbed before deleting annotbed.fn
        ### These iterations REQUIRE the tmp bed file (annotbed.fn) to exist. SEE cbedtools.IntervalFile or cbedtools.IntervalIterator.
        ### Since the iterables in annotbed (x.start or x.fields[7]) are all strings (or integers?), they are immutable objects, and Python will insert the value of the object/string, not a reference, in the list comprehension.
        ### CONCLUSION: pass-by-value of immutable objects means that we can SAFELY DELETE annotbed.fn after extracting this data.
        ### REF Parameter Passing for Mutable & Immutable Objects: https://medium.com/@tyastropheus/tricky-python-ii-parameter-passing-for-mutable-immutable-objects-10e968cbda35
        bp = [
            x.start for x in annotbed
        ]  # PT NOTE: make list of all bp positions for the overlapping SNPs | All features, no matter what the file type, have chrom, start, stop, name, score, and strand attributes.
        annotation_value = [
            x.fields[7] for x in annotbed
        ]  # returns list of strings. Extract the 'score' column. This is column 7 in the 0-based column indexing. *OBS*: x.fields[7] is a string.

        ### pybedtools cleanup V1: deletes all pybedtools session files [does not work - see below]
        ### KEEP THIS AS A WIKI/EXPLANATION
        ### REF 1 Pybedtools Design principles: https://daler.github.io/pybedtools/topical-design-principles.html
        ### REF 2 https://daler.github.io/pybedtools/autodocs/pybedtools.helpers.cleanup.html#pybedtools.helpers.cleanup
        # Using BedTool instances typically has the side effect of creating temporary files on disk: every BedTools operation results in a new temporary file.
        # Temporary files may be created in order to run BEDTools programs, so BedTool objects must always point to a file on disk.
        # Temporary files are stored in /tmp by default, and have the form /tmp/pybedtools.*.tmp.
        # By default, at exit all temp files created during the session will be deleted.
        # However, if Python does not exit cleanly (e.g., from a bug in client code), then the temp files will not be deleted.
        # print("CHR={} | annotation={}, #{}/#{}. Doing pybedtools cleanup...".format(chromosome, name_annotation, counter, len(dict_of_beds)))
        # pybedtools.cleanup(verbose=True) # force deletion of all temp files from the current session.
        #  ---> YOU CANNOT CLEAN UP FILES at this point because it REMOVES ALL tmp files in dict_of_beds.
        #  ---> e.g. you get the exception: pybedtools.cbedtools.BedToolsFileError: /tmp/pybedtools.izu3ifzg.tmp does not exist

        ### pybedtools cleanup V2: deletes current annotbed (specific to a chromosome and annotation)
        # we need to cleanup files because a lot of tmp bed files are written to pybedtools.get_tempdir().
        # tmp bed files can take up to 200 MB per file. The file size is dependent on the number of genes in the annotation.
        # So inputs with "raw SEMs" annotations where each annotation contains all genes in the dataset (all genes have a non-zero SEM value) will generate large tmp bed files.
        # >>900 GB storage is used if running 2-4 parallel processes of make_annot_file_per_chromosome() and ~1500 annotations
        # Summary of storage use for this function if not doing forced clean-up : N_files = N_annotations * N_parallel_procs. e.g. 1500 annotations * 4 proc * 0.2 GB per file = 1200 GB
        # By default, tmp bed files would only be cleaned up after completing this function.
        # OUR SOLUTION: after doing the BEDTools intersect operation, we no longer need the tmp file (the annotbed object lives in Python memory). Force removal of the tmp bed file specific to a chromosome and annotation.
        # NOTE: deleting a tmp file will not cause any problems later on for pybedtools' automatic cleanup. I checked the source code.
        os.remove(annotbed.fn)

        ### Create data frame
        df_annot_overlap_bp = pd.DataFrame({
            'BP': bp,
            name_annotation: annotation_value
        })  # FINUCANE ORIG: df_int = pd.DataFrame({'BP': bp, 'ANNOT':1})
        #             BP  blue
        # 0     34605531     1
        # 1     34605604     1
        # 2     34605644     1
        # 3     34605778     1
        # 4     34606634     1
        # 5     34606840     1
        # 6     34607223     1
        df_annot = pd.merge(
            df_bim, df_annot_overlap_bp, how='left', on='BP'
        )  # *IMPORTANT*: how='left' --> resulting data frame will include ALL snps from the bim file.
        # ^ how="left": use only keys from left frame, PRESERVE KEY ORDER
        # (Pdb) df_annot.head()
        #    CHR          SNP        CM       BP  blue
        # 0   21  rs146134162 -0.908263  9412099   NaN
        # 1   21  rs578050168 -0.908090  9412377   NaN
        # 2   21  rs527616997 -0.907297  9413645   NaN
        # 3   21  rs544748596 -0.906578  9414796   NaN
        # 4   21  rs528236937 -0.906500  9414921   NaN
        df_annot = df_annot[[
            name_annotation
        ]]  # get rid of all columns but name_annotation. Important: this returns a 1-column data frame (and not a series, which would lose the column name)
        # df[[name_annotation]] or df.loc[:, [name_annotation]] --> returns dataframe
        # df[name_annotation] or df.loc[:, name_annotation] --> returns series
        df_annot.fillna(
            0, inplace=True
        )  # SNPs not in df_annot_overlap_bp will have NA values in name_annotation
        # Do data type conversion AFTER .fillna() to avoid problems with NA (float) that cannot be converted to int.
        df_annot[name_annotation] = df_annot[name_annotation].astype(float)
        list_df_annot.append(df_annot)
        if annot_per_geneset:  # write annot file per annotation per chromosome
            file_out_annot = "{}/{}.{}.{}.annot.gz".format(
                out_dir, out_prefix, name_annotation,
                chromosome)  # set output filename. ${prefix}.${chr}.annot.gz
            df_annot.to_csv(file_out_annot,
                            sep="\t",
                            index=False,
                            compression="gzip")
        counter += 1
        # if counter == 4: break
    print("CHR={} | Concatenating annotations...".format(chromosome))
    df_annot_combined = pd.concat(
        list_df_annot, axis='columns'
    )  # stack horizontally (there is no joining on indexes, just stacking)
    # *IMPORTANT*: since we did pd.merge(df_bim, df_annot_overlap_bp) with how='left', we know that ALL dfs in list_df_annot have ALL SNPs in df_bim and the order of the SNPs is preserved.
    # ALTERNATIVELY if you don't want 'thin-annot' use this (i.e. adding 'CHR','SNP','CM','BP' columns): df_annot_combined = pd.concat([df_bim]+list_df_annot, axis='columns') # stack horizontally

    # print("CHR={} | Calculating standard deviation for annotations...".format(chromosome))
    # df_annot_sd = pd.DataFrame(df_annot_combined.drop(columns=["CHR", "SNP", "CM", "BP"]).std(), columns=["sd"])
    # df_annot_sd.index.name = "annotation"
    # df_annot_sd["n"] = df_annot.shape[1] # number of SNPs in the data frame. This makes it easier to calculate the combined standard deviation across chromosomes later.
    # file_out_annot_combined_sd = "{}/{}.{}.{}.annot_sd".format(args.out_dir, args.out_prefix, "COMBINED_ANNOT", chromosome)
    # df_annot_sd.to_csv(file_out_annot_combined_sd, sep="\t", index=True)
    ### Output file
    ### annotation      sd      n
    ### antiquewhite3   0.16847050545855485     5
    ### blue1   0.1197907423131066      5
    ### chocolate       0.0     5

    print("CHR={} | Writing annotations...".format(chromosome))
    if all_genes:
        file_out_annot_combined = "{}/all_genes_in_{}.{}.annot.gz".format(
            out_dir, out_prefix, chromosome)
    else:
        file_out_annot_combined = "{}/{}.{}.{}.annot.gz".format(
            out_dir, out_prefix, "COMBINED_ANNOT", chromosome)

    df_annot_combined.to_csv(file_out_annot_combined,
                             sep="\t",
                             index=False,
                             compression="gzip")

    return None
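The asymmetry described in the comments above (A.intersect(B) and B.intersect(A) report different coordinates but the same number of hits) is easy to demonstrate on a toy pair of files; names here are illustrative:

import pybedtools

snp = pybedtools.BedTool("chr22 24008141 24008141 rs_x", from_string=True)
gene = pybedtools.BedTool("chr22 24008021 24210630 ENSG_x", from_string=True)

print(snp.intersect(gene, wb=True))   # SNP-anchored coordinates
print(gene.intersect(snp, wb=True))   # gene-anchored (clipped) coordinates
# both report exactly one intersection; only the reported interval differs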
Example #12
def test_cleaned_intersect():
    x = pybedtools.BedTool(
        """
    chr1 1 10      1
    chr1 20 30     2
    chr1 100 120   3
    """,
        from_string=True,
    )
    y = pybedtools.BedTool(
        """
    chr1 2 7       4
    chr1 110 120   5
    chr1 200 210   6
    """,
        from_string=True,
    )
    z = pybedtools.BedTool(
        """
    chr1 25 40     7
    chr1 190 205   8
    chr1 1000 1001 9
    """,
        from_string=True,
    )

    # Two-way test
    #
    x2, y2 = pybedtools.contrib.venn_maker.cleaned_intersect([x, y])

    # x should be the same -- 1, 2, 3
    # y should have 1, 3, 6

    assert x2 == fix("""
    chr1 1 10
    chr1 20 30
    chr1 100 120
    """)

    assert y2 == fix("""
    chr1 1 10
    chr1 100 120
    chr1 200 210""")

    # Three-way test
    #
    x3, y3, z3 = pybedtools.contrib.venn_maker.cleaned_intersect([x, y, z])

    # x should be the same -- 1, 2, 3
    # y should have 1, 3, 6
    # z should have 2, 6

    assert x3 == fix("""
    chr1 1 10
    chr1 20 30
    chr1 100 120
    """)

    assert y3 == fix("""
    chr1 1 10
    chr1 100 120
    chr1 200 210""")

    assert z3 == fix("""
    chr1 20 30
    chr1 200 210
    chr1 1000 1001""")

    try:
        pybedtools.helpers._check_for_R()
        print(
            pybedtools.contrib.venn_maker.venn_maker(
                beds=[x, y, z],
                names=["x", "y", "z"],
                figure_filename="out.tiff",
                additional_args=[
                    "euler.d=TRUE",
                    "scaled=TRUE",
                    'fill=c("red","blue", "orange")',
                ],
                run=True,
            ))
    except ValueError:
        sys.stderr.write("R installation not found; skipping test\n")

    if os.path.exists("out.tiff"):
        os.unlink("out.tiff")
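fix() is referenced but never defined in this excerpt; in the pybedtools test suite it normalizes an indented triple-quoted block into tab-separated BED text so that a BedTool can be compared against it. A plausible stand-in (our guess, not the library's exact code):

def fix(s):
    # drop blank lines and indentation, retab whitespace-separated fields
    lines = [line.split() for line in s.splitlines() if line.strip()]
    return "\n".join("\t".join(fields) for fields in lines) + "\n"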
Example #13
    def get_feature_locations(self, limit_genes=False, flush_cashe=False):
        """
        Gets locations of genic features (five prime sites, three prime sites,
        poly(A) sites, stop codons, start codons and TSS) based on the
        annotated GTF _db file.

        _db - _db handle generated by gtf utils

        Returns a dict of bedtools: { five_prime_ends: bedtool,
                                      three_prime_ends,
                                      poly_a_sites,
                                      stop_codons,
                                      start_codons,
                                      transcription_start_sites }
        """

        transcriptome = { "five_prime_ends" : [],
                    "three_prime_ends" : [],
                    "poly_a_sites" : [],
                    "stop_codons" :  [],
                    "start_codons" :  [],
                    "transcription_start_sites" : []}
        
        region_and_species = os.path.join(self._regions_dir, self._species)
        try:
            if flush_cashe:
                raise ValueError
         
            return {region : pybedtools.BedTool("%s_%s.bed" % (region_and_species, 
                                                               region)) for region in transcriptome}
    
        except ValueError:
            pass

        for i, gene_id in enumerate(self._db.features_of_type('gene')):
            if i % 2000 == 0:
                print "processed %d genes" % (i)
                if i == 2000 and limit_genes:
                    break
                
            gene = { "five_prime_ends": [],
                    "three_prime_ends": [],
                    "poly_a_sites": [],
                    "stop_codons":  [],
                    "start_codons":  [],
                    "transcription_start_sites": []}
            try:
                for exon in self._db.children(gene_id, featuretype='exon'):
                    exon_start = copy.deepcopy(exon)
                    exon_start.start = exon.start + 1
   
                    exon_stop = copy.deepcopy(exon)
                    exon_stop.start = exon_stop.stop
                    exon_stop.stop += 1
                    
                    if exon.strand == "-":
                        exon_start, exon_stop = exon_stop, exon_start 
                        
                    gene['five_prime_ends'].append(exon_start)
                    gene['three_prime_ends'].append(exon_stop)
                
                # transcript vs mRNA: need to look at the difference
                for transcript in self._db.children(gene_id, featuretype=self._feature_names['transcript']):
                    transcript_start = copy.deepcopy(transcript)
                    transcript_start.stop = transcript.start + 1
                    
                    transcript_stop = copy.deepcopy(transcript)
                    transcript_stop.start = transcript_stop.stop
                    transcript_stop.stop += 1
                 
                    if transcript.strand == "-":
                        transcript_start, transcript_stop = transcript_stop, transcript_start
                        
                    gene['poly_a_sites'].append(transcript_stop)
                    gene['transcription_start_sites'].append(transcript_start)
                if self._species == "ce10": #need to generalize later
                    for transcript in self._db.children(gene_id, featuretype=self._feature_names['transcript']):
                        try:
                            cds = list(self._db.children(transcript, 
                                                         featuretype='CDS'))
                            
                            first_cds, last_cds = cds[0], cds[-1]

                            if first_cds.strand == '-':
                                first_cds, last_cds = last_cds, first_cds
                                
                            start_codon = first_cds
                            start_codon.stop = first_cds.start + 1
                            gene['start_codons'].append(start_codon)
                                
                            stop_codon = last_cds
                            stop_codon.start = stop_codon.stop
                            stop_codon.stop  = stop_codon.stop + 1
                            gene['stop_codons'].append(stop_codon)

                        except:
                            pass
                else: #for hg19 and mm9 gencode 
                    for start_codon in self._db.children(gene_id, featuretype='start_codon'):
                        start_codon.stop = start_codon.start + 1
                        gene['start_codons'].append(start_codon)
                        
                    for stop_codon in self._db.children(gene_id, featuretype='stop_codon'):
                        stop_codon.start = stop_codon.stop
                        stop_codon.stop  = stop_codon.stop + 1
                        gene['stop_codons'].append(stop_codon)
                    
            except IndexError:
                pass
            gene_id = gene_id.attributes[self._feature_names['gene_id']]
            for region in gene:
                transcriptome[region] += self._merge_and_rename_regions(gene[region], gene_id)

        for name, intervals in transcriptome.items():
            transcriptome[name] = pybedtools.BedTool(map(self._to_bed, intervals)).\
                remove_invalid().sort().each(self._fix_chrom).saveas("%s_%s.bed" % (region_and_species, name))

        return transcriptome
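_merge_and_rename_regions is project-internal and not shown. Judging from its call site (collapse one gene's intervals, then stamp them with the gene id), a hypothetical sketch of its contract, using plain (chrom, start, end) tuples instead of gffutils features:

import pybedtools

def merge_and_rename_regions(intervals, gene_id):
    # hypothetical: merge overlapping intervals, then name each after the gene
    if not intervals:
        return []
    bt = pybedtools.BedTool(
        "\n".join("\t".join(map(str, iv)) for iv in intervals) + "\n",
        from_string=True)
    return [(iv.chrom, iv.start, iv.end, gene_id) for iv in bt.sort().merge()]

print(merge_and_rename_regions([("chr1", 10, 30), ("chr1", 20, 40)], "geneA"))
# [('chr1', 10, 40, 'geneA')]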
Example #14
    def get_genomic_regions(self, prox_size=500, limit_genes=False, flush_cashe=False):
        
        """
        
        returns bedtool of all non-overlapping regions in the genome, exons, cds, 3' utrs and 5' utrs
        _species - string of the _species to analyze
        _db - _db handle generated by gtf utils
        
        Potental off by one bug here, need to examine more closely
        """
        region_and_species = os.path.join(self._regions_dir, self._species)
        regions = ["genes", "five_prime_utrs", "three_prime_utrs", "cds", 
                   "exons", "introns", "proxintron", "distintron",
                   ]
        try:
            if flush_cashe:
                raise ValueError
            results = {}
            for region in regions:
                if region in ["proxintron", "distintron"]:
                    results[region] = pybedtools.BedTool("%s_%s%d.bed" % (region_and_species, 
                                                                           region, prox_size))
                else:
                    results[region] = pybedtools.BedTool("%s_%s.bed" % (region_and_species, 
                                                                         region))
            return results
        except ValueError as e:
            print(e)
        
        three_prime_utrs = []
        five_prime_utrs = []
        cds = []
        exons = []
        dist_introns = []
        prox_introns = []
        gene_list = []
        introns = []
        for i, gene in enumerate(self._feature_hash.keys()):
            gene_list.append(self._feature_hash[gene]['gene'])
            if i % 2000 == 0:
                print "processed %d genes" % (i)
                if i == 2000 and limit_genes:
                    break


            gene_cds, gene_dist_introns, gene_exons, gene_five_prime_utrs, gene_introns, gene_prox_introns, gene_three_prime_utrs = self._gene_regions(gene)
            three_prime_utrs += gene_three_prime_utrs
            five_prime_utrs += gene_five_prime_utrs
            cds += gene_cds
            exons += gene_exons
            dist_introns += gene_dist_introns
            prox_introns += gene_prox_introns
            introns += gene_introns

        #make exons and introns
        results = {"genes": gene_list,
                   "five_prime_utrs": five_prime_utrs,
                   "three_prime_utrs": three_prime_utrs,
                   "cds": cds,
                   "proxintron": prox_introns,
                   "distintron": dist_introns,
                   "exons": exons,
                   "introns": introns}
        
        for name, intervals in results.items():
            intervals = pybedtools.BedTool(map(self._to_bed, intervals)).remove_invalid().sort().each(self._fix_chrom)
            
            if name in ["proxintron", "distintron"]:
                results[name] = intervals.saveas(region_and_species + "_%s%d.bed" % (name, 
                                                                                     prox_size))
            else:
                results[name] = intervals.saveas(region_and_species + "_%s.bed" % (name))

        return results
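_gene_regions is not shown; judging from the prox_size parameter and the proxintron/distintron names, each intron is presumably carved into the prox_size bp nearest each splice site plus a distal remainder. A hypothetical sketch of that carving:

def split_intron(chrom, start, end, prox_size=500):
    # return (proximal, distal) interval lists for one intron
    if end - start <= 2 * prox_size:
        return [(chrom, start, end)], []            # entirely proximal
    prox = [(chrom, start, start + prox_size),      # near the 5' splice site
            (chrom, end - prox_size, end)]          # near the 3' splice site
    dist = [(chrom, start + prox_size, end - prox_size)]
    return prox, dist

print(split_intron("chr1", 1000, 3000))
# ([('chr1', 1000, 1500), ('chr1', 2500, 3000)], [('chr1', 1500, 2500)])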
Example #15
def make_normalization(segmentation, normalization):
    """
    Make normalization file for RNAmaps (for given segmentation).

    Parameters
    ----------
    segmentation : str
        Segmentation file.
    normalization : str
        Output txt file with normalization.

    Returns
    -------
    str
        Path to file with normalizations.

    """
    iCount.logger.log_inputs(LOGGER)

    data = {}  # Container for normalization data

    def add_entry(start_type, stop_type, start_len, stop_len, strand):
        """Add normalization entry in ``data``."""
        if strand == '-':
            start_type, stop_type = stop_type, start_type
            start_len, stop_len = stop_len, start_len

        # Cut long segments to some manageable size:
        start_len = min(start_len, RNA_WINDOW_SIZE)
        stop_len = min(stop_len, RNA_WINDOW_SIZE)

        rna_map_type = '{}-{}'.format(start_type, stop_type)
        # Left side:
        segments = data.setdefault(rna_map_type, {}).setdefault(-start_len, 0)
        data[rna_map_type][-start_len] = segments + 1
        # Right side:
        segments = data.setdefault(rna_map_type,
                                   {}).setdefault(stop_len - 1, 0)
        data[rna_map_type][stop_len - 1] = segments + 1

    LOGGER.info('Reading segmentation to internal format...')

    # pylint: disable=protected-access
    chroms = set()
    for segment in pybedtools.BedTool(segmentation):
        chroms.add(segment.chrom)
    chroms_strands = [(chrom, strand) for chrom in chroms
                      for strand in ('+', '-')]

    for (chrom, strand) in chroms_strands:
        LOGGER.debug("Processing chromosome %s...", chrom)
        last_intergenic = None  # Store last intergenic segment.
        last_segments = []  # Store segments with the highest stop coordinate (there can be more than one).

        chrom_content = iCount.genomes.segment._prepare_segmentation(
            segmentation, chrom, strand=strand)

        # Iter through all genes in given chromosome/strand sorted by start position:
        for gene_content in sorted(chrom_content.values(),
                                   key=lambda x: x['gene_segment'].start):
            gene_segment = gene_content.pop('gene_segment')

            # In case an intergenic region is found, add entries for all
            # segments that stop where the intergenic region starts.
            if gene_segment[2] == 'intergenic':
                last_intergenic = gene_segment
                for seg in last_segments:
                    add_entry(seg[2], 'intergenic', len(seg),
                              len(gene_segment), strand)

            else:
                # Iterate by ascending transcript coordinate:
                for transcript_content in sorted(gene_content.values(),
                                                 key=lambda x: x[0].start):
                    transcript_segment = transcript_content.pop(0)

                    # Update list "last_segments", if necessary:
                    if not last_segments or last_segments[
                            0].stop < transcript_segment.stop:
                        last_segments = [transcript_content[-1]]
                    elif last_segments[0].stop == transcript_segment.stop:
                        last_segments.append(transcript_content[-1])

                    # If transcript starts where intergenic ends, add also entry for this:
                    if last_intergenic.stop == transcript_content[0].start:
                        add_entry('intergenic', transcript_content[0][2],
                                  len(last_intergenic),
                                  len(transcript_content[0]), strand)

                    # This is the "normal" case - add entries for all segments in transcript:
                    for seg1, seg2 in zip(transcript_content,
                                          transcript_content[1:]):
                        add_entry(seg1[2], seg2[2], len(seg1), len(seg2),
                                  strand)

                    # Consider also exon-exon junctions:
                    exons = [
                        seg for seg in transcript_content
                        if seg[2] in EXON_TYPES
                    ]
                    if len(exons) > 1:
                        for exon1, exon2 in zip(exons, exons[1:]):
                            add_entry(exon1[2], exon2[2], len(exon1),
                                      len(exon2), strand)

    # Data must be transformed: Consider all segment length for normalization, not just the last
    # nucleotide. Example:
    # data_before = {-10: 1, -5: 1, 10: 2}
    # data_after = {-10: 1, -9: 1 ... -6: 1, -5: 2, -4: 2 ... -1: 2, 0: 2, 1: 2 ... 9: 2, 10: 2}

    LOGGER.info('Flattening normalization data...')
    for rna_map_type, distances in data.items():
        cumulative = 0
        for i in range(min(distances.keys()), 0):
            cumulative += data[rna_map_type].get(i, 0)
            data[rna_map_type][i] = cumulative

        cumulative = 0
        for i in range(max(distances.keys()) + 1)[::-1]:
            cumulative += data[rna_map_type].get(i, 0)
            data[rna_map_type][i] = cumulative

    # Write to file:
    LOGGER.info('Writing normalization to file')
    with open(normalization, 'wt') as nfile:
        print('\t'.join(['RNAmap_type', 'distance', 'segments']), file=nfile)
        for rna_map_type, distances in sorted(data.items()):
            for distance, segments in sorted(distances.items()):
                print('\t'.join(map(str, [rna_map_type, distance, segments])),
                      file=nfile)
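The flattening transform described in the comments (each segment contributes to every distance it spans, not just its endpoint) can be checked directly; this sketch runs the same two cumulative loops on the data_before example from the code:

data = {-10: 1, -5: 1, 10: 2}

cumulative = 0
for i in range(min(data), 0):            # left side: -10 .. -1
    cumulative += data.get(i, 0)
    data[i] = cumulative

cumulative = 0
for i in range(max(data) + 1)[::-1]:     # right side: 10 .. 0
    cumulative += data.get(i, 0)
    data[i] = cumulative

print(data[-10], data[-5], data[0], data[10])  # 1 2 2 2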
output_file = sys.argv[3].strip()

# sample name
sample = sys.argv[4].strip()

bands = []
with open(cyto_bed, "r") as f:
	for line in f:
		line_arr = line.replace("chr", "").strip().split()
		bands.append(line_arr[0]+line_arr[3])

df = pd.DataFrame(bands, columns=["cytoBand"])
df = df.set_index("cytoBand")
df[sample] = 0.0

a = pybedtools.BedTool(cyto_bed)
b = pybedtools.BedTool(seg_file)
a.intersect(b, wao=True).saveas("intersected_seg_file.cns")

data = dict()
with open("intersected_seg_file.cns", "r") as f:
	for line in f:
		line_arr = line.strip().split()

		# Get the cytoband and seg part
		key = "\t".join(line_arr[:5]).replace("chr", "")
		value = "\t".join(line_arr[5:])

		# If a cytoband has more than one segment, we can still access all of
		# them because they are stored as a list, which works even if there is only one
		if key in data:
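The excerpt breaks off inside the per-cytoband bookkeeping, but the structure being set up (one list of segment rows per cytoband key) is the usual setdefault accumulation; an illustration only, not the missing original code:

data = {}
rows = [("1p36", "segA"), ("1p36", "segB"), ("1q21", "segC")]
for key, value in rows:
    # a cytoband may intersect several segments; keep every one in a list
    data.setdefault(key, []).append(value)
print(data["1p36"])  # ['segA', 'segB']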
Example #17
def tophat_map(gtf,
               out_dir,
               prefix,
               fastq,
               thread,
               bw=False,
               scale=False,
               gtf_flag=1):
    '''
    1. Map reads with TopHat2
    2. Extract unmapped reads
    3. Create BigWig file if needed
    '''
    # tophat2 mapping
    print('Map reads with TopHat2...')
    tophat_cmd = 'tophat2 -g 1 --microexon-search -m 2 '
    if gtf_flag:
        tophat_cmd += '-G %s ' % gtf
    tophat_cmd += '-p %s -o %s ' % (thread, out_dir + '/tophat')
    tophat_cmd += '%s/bowtie2_index/%s ' % (out_dir, prefix) + ','.join(fastq)
    tophat_cmd += ' 2> %s/tophat.log' % out_dir
    print('TopHat2 mapping command:')
    print(tophat_cmd)
    return_code = os.system(tophat_cmd) >> 8
    if return_code:
        sys.exit('Error: cannot map reads with TopHat2!')
    # extract unmapped reads
    print('Extract unmapped reads...')
    unmapped_bam = pybedtools.BedTool('%s/tophat/unmapped.bam' % out_dir)
    unmapped_bam.bam_to_fastq(fq='%s/tophat/unmapped.fastq' % out_dir)
    # create Bigwig file if needed
    if bw and which('bedGraphToBigWig') is not None:
        print('Create BigWig file...')
        map_bam_fname = '%s/tophat/accepted_hits.bam' % out_dir
        # index bam if not exist
        if not os.path.isfile(map_bam_fname + '.bai'):
            pysam.index(map_bam_fname)
        map_bam = pysam.AlignmentFile(map_bam_fname, 'rb')
        # extract chrom size file
        chrom_size_fname = '%s/tophat/chrom.size' % out_dir
        with open(chrom_size_fname, 'w') as chrom_size_f:
            for seq in map_bam.header['SQ']:
                chrom_size_f.write('%s\t%s\n' % (seq['SN'], seq['LN']))
        if scale:  # scale to HPB
            mapped_reads = map_bam.mapped
            for read in map_bam:
                read_length = read.query_length
                break
            s = 1000000000.0 / mapped_reads / read_length
        else:
            s = 1
        map_bam = pybedtools.BedTool(map_bam_fname)
        bedgraph_fname = '%s/tophat/accepted_hits.bg' % out_dir
        with open(bedgraph_fname, 'w') as bedgraph_f:
            for line in map_bam.genome_coverage(bg=True,
                                                g=chrom_size_fname,
                                                scale=s,
                                                split=True):
                value = str(int(float(line[3]) + 0.5))
                bedgraph_f.write('\t'.join(line[:3]) + '\t%s\n' % value)
        bigwig_fname = '%s/tophat/accepted_hits.bw' % out_dir
        return_code = os.system(
            'bedGraphToBigWig %s %s %s' %
            (bedgraph_fname, chrom_size_fname, bigwig_fname)) >> 8
        if return_code:
            sys.exit('Error: cannot convert bedGraph to BigWig!')
    else:
        print('Could not find bedGraphToBigWig, so skipping this step!')
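The HPB scale factor above combines library size and read length; reading the constant as "per billion mapped bases" is our interpretation, not stated in the code. A worked instance:

mapped_reads = 20000000   # 2e7 mapped reads
read_length = 100         # bp per read, so 2e9 mapped bases in total
s = 1000000000.0 / mapped_reads / read_length
print(s)  # 0.5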
Example #18
def _merge_target_information(samples):
    metrics_dir = utils.safe_makedir("metrics")
    out_file = os.path.abspath(os.path.join(metrics_dir, "target_info.yaml"))
    if utils.file_exists(out_file):
        return samples

    genomes = set(dd.get_genome_build(data) for data in samples)
    coverage_beds = set(dd.get_coverage(data) for data in samples)
    original_variant_regions = set(
        dd.get_variant_regions_orig(data) for data in samples)

    data = samples[0]
    info = {}

    # Reporting in MultiQC only if the genome is the same across all samples
    if len(genomes) == 1:
        info["genome_info"] = {
            "name":
            dd.get_genome_build(data),
            "size":
            sum([
                c.size for c in ref.file_contigs(dd.get_ref_file(data),
                                                 data["config"])
            ]),
        }

    # Reporting in MultiQC only if the target is the same across all samples
    vcr_orig = None
    if len(original_variant_regions) == 1 and list(
            original_variant_regions)[0] is not None:
        vcr_orig = list(original_variant_regions)[0]
        vcr_clean = bedutils.clean_file(vcr_orig, data)
        info["variants_regions_info"] = {
            "bed":
            vcr_orig,
            "size":
            sum(
                len(x) for x in pybedtools.BedTool(
                    dd.get_variant_regions_merged(data))),
            "regions":
            pybedtools.BedTool(vcr_clean).count(),
        }
        gene_num = annotate.count_genes(vcr_clean, data)
        if gene_num is not None:
            info["variants_regions_info"]["genes"] = gene_num
    else:
        info["variants_regions_info"] = {
            "bed": "callable regions",
        }
    # Reporting in MultiQC only if the target is the same across samples
    if len(coverage_beds) == 1:
        cov_bed = list(coverage_beds)[0]
        if cov_bed not in [None, "None"]:
            if vcr_orig and vcr_orig == cov_bed:
                info["coverage_bed_info"] = info["variants_regions_info"]
            else:
                clean_bed = bedutils.clean_file(cov_bed,
                                                data,
                                                prefix="cov-",
                                                simple=True)
                info["coverage_bed_info"] = {
                    "bed": cov_bed,
                    "size": pybedtools.BedTool(cov_bed).total_coverage(),
                    "regions": pybedtools.BedTool(clean_bed).count(),
                }
                gene_num = annotate.count_genes(clean_bed, data)
                if gene_num is not None:
                    info["coverage_bed_info"]["genes"] = gene_num
        else:
            info["coverage_bed_info"] = info["variants_regions_info"]

    coverage_intervals = set(data["config"]["algorithm"]["coverage_interval"]
                             for data in samples)
    if len(coverage_intervals) == 1:
        info["coverage_interval"] = list(coverage_intervals)[0]

    if info:
        with open(out_file, "w") as out_handle:
            yaml.safe_dump(info, out_handle)

    return samples
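
Note the two different "size" notions used above: summing interval lengths counts bases in overlapping intervals more than once, while total_coverage() merges overlaps first. A small sketch, assuming a hypothetical 'targets.bed':

import pybedtools

bed = pybedtools.BedTool('targets.bed')
interval_bases = sum(len(iv) for iv in bed)  # overlapping bases counted twice
covered_bases = bed.total_coverage()         # overlaps merged before counting
region_count = bed.count()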
Ejemplo n.º 19
0
    def get_aligned_reads(self, n_read_limit, passed_cells):

        samfile = pysam.AlignmentFile(self.in_bam_uniq, "rb")  # "rb": BAM input
        try:
            r_iterator = samfile.fetch(self.chrom, int(self.start),
                                       int(self.end))
        except ValueError:  # region not present or BAM index missing
            return None

        nreads = sum(1 for x in r_iterator
                     if x.flag in self.strand_flags[self.strand])
        if nreads > n_read_limit: return self.gene

        r_iterator = samfile.fetch(self.chrom, int(self.start), int(self.end))
        read_dict = {
            r_idx: _make_dict(x, self.chrom, self.strand, self.gene, r_idx)
            for r_idx, x in enumerate(r_iterator)
            if x.flag in self.strand_flags[self.strand]
            and list(filter(regx1.match,
                            x.to_dict()['tags']))[0].replace(
                                'BC:Z:', '') in passed_cells
        }
        samfile.close()

        df = [read_dict[r_idx]['r_blocks'] for r_idx in read_dict.keys()]

        if len(df) == 0: return None
        pd.concat(df, axis=0).to_csv('%s/.tempDir/_%s_reads_blocks.bed' %
                                     (self.outdir, self.gene),
                                     index=False,
                                     sep="\t",
                                     header=False)
        read_bed = pybedtools.BedTool('%s/.tempDir/_%s_reads_blocks.bed' %
                                      (self.outdir, self.gene))

        tmp = self.ex_bed.intersect(read_bed, wa=True, wb=True)
        if os.stat(tmp.fn).st_size == 0:
            return None

        intersect_all = tmp.to_dataframe()
        read_idx_list = list(set(intersect_all.iloc[:, 9].values))

        ex_coord = ','.join(
            self.exons.apply(lambda x: '%s-%s' % (x[1], x[2]), axis=1).values)

        aligned_reads = [
            _make_list_aligned_reads2(r_idx, read_dict, intersect_all,
                                      ex_coord) for r_idx in read_idx_list
        ]

        colnames = [
            'name', 'flag', 'ref_name', 'ref_pos', 'map_quality', 'cigar',
            'next_ref_name', 'next_ref_pos', 'length', 'seq', 'qual', 'tags',
            'read_mapped_position', 'geneid', 'Exon_Index', 'Category', 'BC',
            'UB', 'exon_coordinates'
        ]
        self.uniq_aligned_reads = pd.DataFrame(
            aligned_reads, columns=colnames).drop_duplicates()
        self.uniq_r_bclist = list(
            set(
                self.uniq_aligned_reads.apply(lambda x: '%s+%s' %
                                              (x['BC'], x['UB']),
                                              axis=1).values))
        self.uniq_aligned_reads.insert(19, 'MapFlag', 'unique')

        return None
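
The intersect(wa=True, wb=True) call above reports both records for every overlap, so each output row pairs one exon with one read block. A minimal sketch of that pattern, with hypothetical file names:

import pybedtools

exons = pybedtools.BedTool('exons.bed')
blocks = pybedtools.BedTool('read_blocks.bed')
hits = exons.intersect(blocks, wa=True, wb=True)
# Columns: the exon's fields first, then the overlapping block's fields.
df = hits.to_dataframe()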
Ejemplo n.º 20
0
def postprocess(work, reference, pred_vcf_file, output_vcf, candidates_vcf,
                ensemble_tsv, tumor_bam, min_len, postprocess_max_dist,
                long_read, lr_pad, lr_chunck_size, lr_chunck_scale,
                lr_snp_min_af, lr_ins_min_af, lr_del_min_af, lr_match_score,
                lr_mismatch_penalty, lr_gap_open_penalty, lr_gap_ext_penalty,
                pass_threshold, lowqual_threshold, msa_binary, num_threads):
    logger = logging.getLogger(postprocess.__name__)

    logger.info("----------------------Postprocessing-----------------------")
    if not os.path.exists(work):
        os.mkdir(work)

    candidates_preds = os.path.join(work, "candidates_preds.vcf")
    ensembled_preds = os.path.join(work, "ensembled_preds.vcf")
    pred_vcf = pybedtools.BedTool(pred_vcf_file)
    pred_vcf.window(candidates_vcf, w=5, v=True).saveas(ensembled_preds)
    pred_vcf.window(candidates_vcf, w=5, u=True).saveas(candidates_preds)

    logger.info("Extract targets")
    postprocess_pad = 1 if not long_read else 10
    extract_postprocess_targets(candidates_preds, min_len,
                                postprocess_max_dist, postprocess_pad)

    no_resolve = os.path.join(work, "candidates_preds.no_resolve.vcf")
    target_vcf = os.path.join(work, "candidates_preds.resolve_target.vcf")
    target_bed = os.path.join(work, "candidates_preds.resolve_target.bed")
    resolved_vcf = os.path.join(work, "candidates_preds.resolved.vcf")

    logger.info("Resolve targets")
    if not long_read:
        resolve_variants(tumor_bam, resolved_vcf, reference, target_vcf,
                         target_bed, num_threads)
    else:
        work_lr_indel_realign = os.path.join(work, "work_lr_indel_realign")
        if os.path.exists(work_lr_indel_realign):
            shutil.rmtree(work_lr_indel_realign)
        os.mkdir(work_lr_indel_realign)
        ra_resolved_vcf = os.path.join(work,
                                       "candidates_preds.ra_resolved.vcf")
        long_read_indelrealign(work_lr_indel_realign, tumor_bam, None,
                               ra_resolved_vcf, target_bed, reference,
                               num_threads, lr_pad, lr_chunck_size,
                               lr_chunck_scale, lr_snp_min_af, lr_del_min_af,
                               lr_ins_min_af, lr_match_score,
                               lr_mismatch_penalty, lr_gap_open_penalty,
                               lr_gap_ext_penalty, msa_binary)
        resolve_scores(tumor_bam, ra_resolved_vcf, target_vcf, resolved_vcf)

    all_no_resolve = concatenate_files([no_resolve, ensembled_preds],
                                       os.path.join(work, "no_resolve.vcf"))

    logger.info("Merge vcfs")
    merged_vcf = os.path.join(work, "merged_preds.vcf")
    merge_post_vcfs(reference, resolved_vcf, all_no_resolve, merged_vcf,
                    pass_threshold, lowqual_threshold)
    add_vcf_info(work, reference, merged_vcf, candidates_vcf, ensemble_tsv,
                 output_vcf, pass_threshold, lowqual_threshold)

    logger.info("Output NeuSomatic prediction at {}".format(output_vcf))
    logger.info("Postprocessing is Done.")
    return output_vcf
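
The two window() calls at the start of this function split predictions by whether any candidate lies within 5 bp: u=True reports a prediction once if it has at least one hit, v=True reports predictions with none. A sketch with hypothetical file names:

import pybedtools

preds = pybedtools.BedTool('preds.vcf')
near = preds.window('candidates.vcf', w=5, u=True)  # has a candidate within 5 bp
far = preds.window('candidates.vcf', w=5, v=True)   # no candidate within 5 bp
near.saveas('candidates_preds.vcf')
far.saveas('ensembled_preds.vcf')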
Ejemplo n.º 21
0
def get_intersect_bed_ix(reference_bed,
                         query_bed,
                         just_names=True,
                         araport11_file=None):
    ## here query_bed is either a file or a pandas dataframe
    ## we can rely on bedtools -- very fast and efficient
    # https://www.biostars.org/p/319840/
    if isinstance(query_bed, str):
        if os.path.isfile(query_bed):
            queryBed = pybed.BedTool(query_bed)
        else:
            raise FileNotFoundError("query bed file not found: %s" % query_bed)
    elif isinstance(query_bed, pd.DataFrame):
        # query_bed.iloc[:,0] = query_bed.iloc[:,0].astype()
        query_bed.iloc[:, 1] = query_bed.iloc[:, 1].astype(int)
        query_bed.iloc[:, 2] = query_bed.iloc[:, 2].astype(int)
        queryBed = pybed.BedTool.from_dataframe(query_bed.iloc[:, [0, 1, 2]])
    elif isinstance(query_bed, pybed.bedtool.BedTool):
        queryBed = query_bed
    else:
        raise NotImplementedError(
            "either input a bed file or pandas dataframe for query")
    if isinstance(reference_bed, str):
        if os.path.isfile(reference_bed):
            refBed = pybed.BedTool(reference_bed)
        else:
            raise FileNotFoundError("reference bed file not found: %s" %
                                    reference_bed)
    elif just_names:
        reference_bed_df = identify_positions_given_names(
            reference_bed, araport11_file)
        refBed = pybed.BedTool.from_dataframe(reference_bed_df.iloc[:,
                                                                    [0, 1, 2]])
    elif isinstance(reference_bed, pd.DataFrame):
        reference_bed.iloc[:, 1] = reference_bed.iloc[:, 1].astype(int)
        reference_bed.iloc[:, 2] = reference_bed.iloc[:, 2].astype(int)
        refBed = pybed.BedTool.from_dataframe(reference_bed.iloc[:, [0, 1, 2]])
    elif isinstance(reference_bed, pybed.bedtool.BedTool):
        refBed = reference_bed
    else:
        raise NotImplementedError(
            "either input a bed file or pandas dataframe for reference")
    f_newrefBed = open(refBed.fn + ".new.tmp", 'w')
    cmd_out = Popen(''' awk '{ print $0 "\t" NR-1 }' ''' + refBed.fn,
                    shell=True,
                    stdout=f_newrefBed)
    cmd_out.wait()
    f_newrefBed.close()
    newRefBed = pybed.BedTool(refBed.fn + ".new.tmp")
    f_newqueryBed = open(queryBed.fn + ".new.tmp", 'w')
    cmd_out = Popen(''' awk '{ print $0 "\t" NR-1 }' ''' + queryBed.fn,
                    shell=True,
                    stdout=f_newqueryBed)
    cmd_out.wait()
    f_newqueryBed.close()
    newqueryBed = pybed.BedTool(queryBed.fn + ".new.tmp")
    ## Just taking first three columns for bedtools
    unionBed = newRefBed.intersect(newqueryBed, wa=True, wb=True)
    if unionBed.count() == 0:  ## Return if there are no matching lines.
        return None
    unionBed = unionBed.to_dataframe()
    unionBed.columns = np.array([
        'ref_chr', 'ref_start', 'ref_end', 'ref_ix', 'query_chr',
        'query_start', 'query_end', 'query_ix'
    ])
    return unionBed  ## ref_ix / query_ix hold the 0-based row indices added above
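
The awk subprocesses above only append a 0-based row number to each interval so that intersect() output can be mapped back to row indices. Under the same BED3 assumption, this can also be done in-process with pandas; a sketch:

import pybedtools

def bedtool_with_row_index(bed):
    df = bed.to_dataframe().iloc[:, [0, 1, 2]]
    df['ix'] = range(len(df))  # the equivalent of awk's NR-1 column
    return pybedtools.BedTool.from_dataframe(df)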
Ejemplo n.º 22
0
def add_vcf_info(work, reference, merged_vcf, candidates_vcf, ensemble_tsv,
                 output_vcf, pass_threshold, lowqual_threshold):
    merged_vcf = pybedtools.BedTool(merged_vcf)
    candidates_vcf = pybedtools.BedTool(candidates_vcf)
    ensemble_candids_vcf = []
    if ensemble_tsv:
        ensemble_candids_vcf = os.path.join(work, "ensemble_candids.vcf")
        with open(ensemble_tsv) as e_f:
            with open(ensemble_candids_vcf, "w") as c_f:
                c_f.write("##fileformat=VCFv4.2\n")
                c_f.write(
                    "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n"
                )
                for line in e_f:
                    if "T_REF_FOR" in line:
                        header = line.strip().split()
                        chrom_id = header.index("CHROM")
                        pos_id = header.index("POS")
                        ref_id = header.index("REF")
                        alt_id = header.index("ALT")
                        dp_id = header.index("T_DP")
                        ref_fw_id = header.index("T_REF_FOR")
                        ref_rv_id = header.index("T_REF_REV")
                        alt_fw_id = header.index("T_ALT_FOR")
                        alt_rv_id = header.index("T_ALT_REV")
                        continue
                    fields = line.strip().split()
                    chrom = fields[chrom_id]
                    pos = fields[pos_id]
                    ref = fields[ref_id]
                    alt = fields[alt_id]
                    dp = int(fields[dp_id])
                    ro_fw = int(fields[ref_fw_id])
                    ro_rv = int(fields[ref_rv_id])
                    ao_fw = int(fields[alt_fw_id])
                    ao_rv = int(fields[alt_rv_id])
                    ro = ro_fw + ro_rv
                    ao = ao_fw + ao_rv
                    af = np.round(ao / float(ao + ro + 0.0001), 4)
                    c_f.write("\t".join(
                        map(str, [
                            chrom, pos, ".", ref, alt, ".", ".", ".",
                            "GT:DP:RO:AO:AF", ":".join(
                                map(str, ["0/1", dp, ro, ao, af]))
                        ])) + "\n")

    ensemble_candids_vcf = pybedtools.BedTool(ensemble_candids_vcf)
    in_candidates = merged_vcf.window(candidates_vcf, w=5)
    notin_candidates = merged_vcf.window(candidates_vcf, w=5, v=True)
    in_ensemble = merged_vcf.window(ensemble_candids_vcf, w=5)
    notin_any = notin_candidates.window(ensemble_candids_vcf, w=5, v=True)
    chroms_order = get_chromosomes_order(reference=reference)
    with pysam.FastaFile(reference) as rf:
        chroms = rf.references

    scores = {}
    tags_info = {}
    for s_e, dd in [(0, in_candidates), (1, in_ensemble)]:
        for x in dd:
            tag = "-".join([str(chroms_order[x[0]]), x[1], x[3], x[4]])
            scores[tag] = [x[5], x[6], x[7], x[9]]
            if tag not in tags_info:
                tags_info[tag] = []
            info = x[19].split(":")
            dp, ro, ao = map(int, info[1:4])
            af = float(info[4])
            is_same = x[1] == x[11] and x[3] == x[13] and x[4] == x[14]
            is_same_type = np.sign(len(x[3]) - len(x[13])) == np.sign(
                len(x[4]) - len(x[14]))
            dist = abs(int(x[1]) - int(x[11]))
            len_diff = abs((len(x[3]) - len(x[13])) - (len(x[4]) - len(x[14])))
            tags_info[tag].append(
                [not is_same, not is_same_type, dist, len_diff, s_e, dp, ro,
                 ao, af])
    fina_info_tag = {}
    for tag, hits in tags_info.items():
        hits = sorted(hits, key=lambda x: x[0:5])
        fina_info_tag[tag] = hits[0][5:]

    for x in notin_any:
        tag = "-".join([str(chroms_order[x[0]]), x[1], x[3], x[4]])
        fina_info_tag[tag] = [0, 0, 0, 0]
        scores[tag] = [x[5], x[6], x[7], x[9]]

    tags = sorted(fina_info_tag.keys(),
                  key=lambda x: tuple(int(v) for v in x.split("-")[0:2]))
    with open(output_vcf, "w") as o_f:
        o_f.write("##fileformat=VCFv4.2\n")
        o_f.write("##NeuSomatic Version={}\n".format(__version__))
        o_f.write(
            "##FORMAT=<ID=SCORE,Number=1,Type=Float,Description=\"Prediction probability score\">\n"
        )
        o_f.write(
            "##FILTER=<ID=PASS,Description=\"Accept as a higher confidence somatic mutation calls with probability score value at least {}\">\n"
            .format(pass_threshold))
        o_f.write(
            "##FILTER=<ID=LowQual,Description=\"Less confident somatic mutation calls with probability score value at least {}\">\n"
            .format(lowqual_threshold))
        o_f.write(
            "##FILTER=<ID=REJECT,Description=\"Rejected as a confident somatic mutation with probability score value below {}\">\n"
            .format(lowqual_threshold))
        o_f.write(
            "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n")
        o_f.write(
            "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read Depth in the tumor\">\n"
        )
        o_f.write(
            "##FORMAT=<ID=RO,Number=1,Type=Integer,Description=\"Reference allele observation count in the tumor\">\n"
        )
        o_f.write(
            "##FORMAT=<ID=AO,Number=A,Type=Integer,Description=\"Alternate allele observation count in the tumor\">\n"
        )
        o_f.write(
            "##FORMAT=<ID=AF,Number=1,Type=Float,Description=\"Allele fractions of alternate alleles in the tumor\">\n"
        )
        o_f.write(
            "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n")
        for tag in tags:
            chrom_id, pos, ref, alt = tag.split("-")
            qual, filter_, score, gt = scores[tag]
            dp, ro, ao, af = fina_info_tag[tag]
            info_field = "{};DP={};RO={};AO={};AF={};".format(
                score, dp, ro, ao, af)
            gt_field = "{}:{}:{}:{}:{}".format(gt, dp, ro, ao, af)
            o_f.write("\t".join(
                map(str, [
                    chroms[int(chrom_id)],
                    str(pos), ".", ref, alt, qual, filter_, info_field,
                    "GT:DP:RO:AO:AF", gt_field
                ])) + "\n")
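
For reference, the "tag" key used throughout this function encodes (chromosome rank, position, REF, ALT), which lets hits from the different window() passes be reconciled and the final records be emitted in genomic order. A small sketch, assuming chroms_order maps chromosome names to integer ranks:

def variant_tag(chrom, pos, ref, alt, chroms_order):
    return "-".join([str(chroms_order[chrom]), str(pos), ref, alt])

def genomic_sort(tags):
    # Sort by (chromosome rank, position), both encoded in the tag.
    return sorted(tags, key=lambda t: tuple(int(v) for v in t.split("-")[:2]))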
Ejemplo n.º 23
0
def assign_to_regions(tool,
                      clusters=None,
                      assigned_dir=".",
                      species="hg19",
                      nrand=3,
                      data_dir=DATA_DIR):
    """
    
    Assigns each cluster to a genic region
    finally saves all generated bed and fasta files for future analysis...

    tool - a bed tool (each line represnting a cluster)
    clusters - name of cluster file (optional)
    assigned_dir - location to save files in
    species - str species to segment
    nrand - int number offsets times to shuffle for null hypothesis


    """
    if clusters is None:
        clusters, ext = os.path.splitext(os.path.basename(tool.fn))
    bedtracks = {}

    regions, assigned_regions = regions_generator()
    short_species = species.split("_")[0]
    if short_species == "GRCh38":
        short_species = "hg38"

    for region in regions:
        bedtracks[region] = pybedtools.BedTool(
            os.path.join(data_dir, "regions", "%s_%s.bed" % (species, region)))
    #creates the basics of bed dict
    bed_dict = {'all': {'rand': {}}}

    genes = pybedtools.BedTool(
        os.path.join(data_dir, "regions", "%s_genes.bed" % (species)))

    offsets = get_offsets_bed12(tool)
    if tool.field_count() <= 5:
        tool = tool.sort().merge().saveas()
    elif 6 <= tool.field_count() < 8:
        #Hack to get around not having gene name assigned by peak caller, due to overlapping genes this won't be perfect
        #move_name_real = functools.partial(move_name, original_length=len(tool[0].fields))
        #tool = tool.intersect(genes, wo=True, s=True).each(move_name_real).saveas()
        #fix_strand_ok = functools.partial(fix_strand, warn=False)
        tool = tool.sort().merge(
            s=True, c="4,5,6",
            o="collapse,collapse,collapse").each(fix_strand_v26).saveas()
    #elif not tool[0][7].isdigit():
    #    tool = tool.sort().merge(s=True, c="4,5,6", o="collapse,collapse,collapse").each(fix_strand).each(fix_name).saveas()
    else:  #Clipper, this is ideal we like this technique
        tool = tool.sort().merge(s=True,
                                 c="4,5,6,7,8",
                                 o="collapse,collapse,collapse,min,min").each(
                                     fix_strand_v26).saveas()

    remaining_clusters = adjust_offsets(tool, offsets)

    # print "There are a total %d clusters I'll examine" % (len(tool))
    for region in regions:
        remaining_clusters, overlapping = intersection(remaining_clusters,
                                                       b=bedtracks[region])

        #if for some reason there isn't a peak in the region skip it
        if len(overlapping) == 0:
            # print "ignoring %s " % region
            continue

        #sets up bed dict for this region
        bed_dict[region] = {
            'real': overlapping.sort(stream=True).saveas(),
            'rand': {}
        }

        no_overlapping_count = len(remaining_clusters)
        overlapping_count = len(bed_dict[region]['real'])
        # print "For region: %s found %d that overlap and %d that don't" % (region,
        #                                                                   overlapping_count,
        #                                                                   no_overlapping_count)

        if 'real' not in bed_dict['all']:
            bed_dict['all']['real'] = bed_dict[region]['real']
        else:
            bed_dict['all']['real'] = bed_dict['all']['real'].cat(
                bed_dict[region]['real'], stream=True,
                postmerge=False).saveas()

        #saves offsets so after shuffling the offsets can be readjusted
        offset_dict = get_offsets_bed12(bed_dict[region]['real'])
        for i in range(nrand):
            random_intervals = bed_dict[region]['real'].shuffle(
                genome=short_species, incl=bedtracks[region].fn).sort()
            random_intervals = fix_shuffled_strand(random_intervals,
                                                   bedtracks[region].fn)
            random_intervals = adjust_offsets(random_intervals, offset_dict)
            bed_dict[region]['rand'][i] = random_intervals.saveas()

            if i not in bed_dict['all']['rand']:
                bed_dict['all']['rand'][i] = bed_dict[region]['rand'][i]
            else:
                bed_dict['all']['rand'][i] = bed_dict['all']['rand'][i].cat(
                    bed_dict[region]['rand'][i], stream=True, postmerge=False)

        #if there are no more clusters to assign stop trying
        if no_overlapping_count == 0:
            break

    # print "After assigning %d un-categorized regions" % len(remaining_clusters)

    if len(remaining_clusters) > 0:
        bed_dict['uncategorized'] = {
            'real': remaining_clusters.sort(stream=True).saveas()
        }

    bed_dict = save_bedtools(bed_dict, clusters, assigned_dir)
    return bed_dict
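
The null model above hinges on shuffle(incl=...): each interval is re-placed at random but kept inside the same annotation track. A minimal sketch with hypothetical files; the genome label must be known to pybedtools' genome registry:

import pybedtools

peaks = pybedtools.BedTool('clusters.bed')
track = pybedtools.BedTool('hg19_exons.bed')
null_peaks = peaks.shuffle(genome='hg19', incl=track.fn).sort()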
Ejemplo n.º 24
0
def generate_fastafile_frombed(ref, bed):
    bedfile = pybedtools.BedTool(bed)
    fasta = pybedtools.BedTool(ref)
    bedfile = bedfile.sequence(fi=fasta, s=True, name=True)
    return bedfile.seqfn
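
A hypothetical call to the helper above: s=True makes extraction strand-aware (minus-strand intervals are reverse-complemented) and name=True uses the BED name column for the FASTA headers. The returned path points at pybedtools' temporary FASTA file:

seq_path = generate_fastafile_frombed('genome.fa', 'sites.bed')
with open(seq_path) as fh:
    first_header = fh.readline().rstrip()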
Ejemplo n.º 25
0
def _has_larger_regions(f):
    return any(r.stop - r.start > max_size for r in pybedtools.BedTool(f))
Ejemplo n.º 26
0
def main():
    args = parser_args(sys.argv[1:])

    if args.temp:
        pybedtools.helpers.set_tempdir(args.temp)

    if not args.vcf and not args.zippedvcf:
        print("ERROR. VCF required, please use -v or -vz")
        sys.exit(2)
    if args.zippedvcf and not args.zippedvcf.endswith(".gz"):
        print("ERROR. --vz used with non-gzipped file (must end with '.gz')")
        sys.exit(2)
    if args.vcf and args.vcf.endswith(".gz"):
        print("ERROR. --vz option should be used, gzipped file detected")
        sys.exit(2)
    if not args.reference and not args.zippedreference:
        print("ERROR. Reference fasta required, please use -r or -rz")
        sys.exit(2)

    if not os.path.exists(args.output):
        os.makedirs(args.output)
    if not os.path.exists(args.intermediate):
        os.makedirs(args.intermediate)

    # Go through the exon-boundaries bed file, and generate a new file
    #  called $INTERMED/splice-site.bed. This bed file contains the
    #  organism's canonical splice site coordinates.
    canonical_splicesites_bedfile = os.path.join(args.intermediate,
                                                 "splice-site.bed")
    if os.path.exists(canonical_splicesites_bedfile):
        print(
            "Bed file containing splice sites has already been generated. Moving on!"
        )
    else:
        print("Generating a splicing bed file using exon boundaries...")
        start = time.time()
        generate_splicingbed_withexonbound(canonical_splicesites_bedfile,
                                           args.chrlens, args.bed)
        end = time.time()
        print("Finished generating. Time took %s" % (end - start))

    # Find the subset of the VCF which contains non-coding variants
    #    use this VCF for NovaSplice calculations.
    print("Subsetting VCF to noncoding variants and SNPs")
    start = time.time()
    pybedtools.cleanup()
    vcfheader_file = os.path.join(args.output, "vcf-header")
    if args.vcf:
        writevcfheader(vcfheader_file, args.vcf, False)
        vcf = pybedtools.BedTool(args.vcf)
    else:
        writevcfheader(vcfheader_file, args.zippedvcf, True)
        vcf = pybedtools.BedTool(args.zippedvcf)
    exon_bounds = pybedtools.BedTool(args.bed).sort()
    subset_vcf_location = os.path.join(args.output,
                                       "coding-excludedvariants.vcf.gz")
    subsetvcf = vcf.intersect(exon_bounds, v=True,
                              sorted=True).filter(filter_snps_only)
    merge_two_files(subsetvcf, vcfheader_file, subset_vcf_location)
    end = time.time()
    print("Finished subsetting. Time took %s" % (end - start))

    # We now find the closest upstream/downstream canonical splice sites
    #    non-coding variant in the subsetted VCF
    print("Finding closest canonical splice sites to each non-coding variant")
    start = time.time()
    pybedtools.cleanup()
    subset_vcf = pybedtools.BedTool(subset_vcf_location)
    canon_bed = pybedtools.BedTool(canonical_splicesites_bedfile)
    subset_vcf.closest(canon_bed,
                       D="b",
                       id=True,
                       io=True,
                       output=os.path.join(args.output, "close-up.bed"))
    subset_vcf.closest(canon_bed,
                       D="b",
                       iu=True,
                       io=True,
                       output=os.path.join(args.output, "close-down.bed"))
    end = time.time()
    print("Finished generating. Time took %s" % (end - start))

    # DONOR SPECIFIC
    # For every variant, compute the set of 9 possible donor sites with that variant. The
    #    output is a bed file that has 9 entries for every variant.
    variantsite_location = os.path.join(args.output,
                                        "variant-site-donorsites.bed")
    print("Generating a variant bed file from vcf...")
    start = time.time()
    #generate_variantbedfile_fromvcf(subset_vcf_location, variantsite_location, True, True)
    generate_variantbedfile_fromclosest(
        os.path.join(args.output, "close-down.bed"), variantsite_location,
        True)
    end = time.time()
    print("Finished generating. Time took %s" % (end - start))

    # Extract the sequences of every variant and store them in a file. Note that this
    #    file is not saved anywhere except $TMP and is solely based on the reference
    print("Generating a fasta file from variant bed file...")
    start = time.time()
    if args.reference:
        fastaref = generate_fastafile_frombed(args.reference,
                                              variantsite_location)
    else:
        fastaref = generate_fastafile_frombed(args.zippedreference,
                                              variantsite_location)
    end = time.time()
    print("Finished generating. Time took %s" % (end - start))

    # Mutate the fasta file with the variants found in the vcf file
    print("Mutating fasta file per SNPs in VCF...")
    start = time.time()
    generate_variantfastafile(
        fastaref, os.path.join(args.output, "variant-site-donorsites.fa"))
    end = time.time()
    print("Finished generating. Time took %s" % (end - start))

    # Create a dictionary that relates a given variant to the closest upstream canonical
    #   splice site's score
    print("Scoring canonical and novel splice-sites...")
    start = time.time()
    if args.reference:
        nametoscore = extract_canonical_score(
            os.path.join(args.output, 'close-down.bed'), args.output,
            args.reference, True)
    else:
        nametoscore = extract_canonical_score(
            os.path.join(args.output, 'close-down.bed'), args.output,
            args.zippedreference, True)
    with open(os.path.join(args.output, args.libraryname), 'w') as out:
        out.write(
            "Novel SS\tNovel Score\tScore before variant\tClosest Canonical Score\tLocation of closest canonical ss\n"
        )
    compare_scores(os.path.join(args.output,
                                "variant-site-donorsites.fa"), nametoscore,
                   args.percent, args.output, True, args.libraryname)
    end = time.time()
    print("Finished generating. Time took %s" % (end - start))
    print(
        "Finished analysis for donor sites. Now doing same for acceptor sites."
    )

    #ACCEPTOR SITES

    variantsite_location = os.path.join(args.output,
                                        "variant-site-acceptorsites.bed")
    print("Generating a variant bed file from vcf...")
    start = time.time()
    #generate_variantbedfile_fromvcf(subset_vcf_location, variantsite_location, False, True)
    generate_variantbedfile_fromclosest(
        os.path.join(args.output, "close-up.bed"), variantsite_location, False)
    end = time.time()
    print("Finished generating. Time took %s" % (end - start))

    print("Generating a fasta file from variant bed file...")
    start = time.time()
    if args.reference:
        fastaref = generate_fastafile_frombed(args.reference,
                                              variantsite_location)
    else:
        fastaref = generate_fastafile_frombed(args.zippedreference,
                                              variantsite_location)
    end = time.time()
    print("Finished generating. Time took %s" % (end - start))

    print("Mutating fasta file per SNPs in VCF...")
    start = time.time()
    generate_variantfastafile(
        fastaref, os.path.join(args.output, "variant-site-acceptorsites.fa"))
    end = time.time()
    print("Finished generating. Time took %s" % (end - start))

    print("Scoring canonical and novel splice-sites...")
    start = time.time()
    if args.reference:
        nametoscore = extract_canonical_score(
            os.path.join(args.output, 'close-up.bed'), args.output,
            args.reference, False)
    else:
        nametoscore = extract_canonical_score(
            os.path.join(args.output, 'close-up.bed'), args.output,
            args.zippedreference, False)
    compare_scores(os.path.join(args.output,
                                "variant-site-acceptorsites.fa"), nametoscore,
                   args.percent, args.output, False, args.libraryname)
    end = time.time()
    print("Finished generating. Time took %s" % (end - start))

    print("Finished! Removing intermediate files now...")

    toRem = []
    toRem += (glob.glob(os.path.join(args.output, '*.bed')))
    toRem += (glob.glob(os.path.join(args.output, '*.vcf.gz')))
    toRem += (glob.glob(os.path.join(args.output, '*.fa')))
    for i in toRem:
        os.remove(i)
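
The closest() calls earlier in this function do the heavy lifting: D="b" signs distances relative to the splice site's strand, io=True skips overlapping features, and id=True / iu=True ignore downstream or upstream features respectively. A sketch with hypothetical inputs (bedtools closest expects sorted input):

import pybedtools

variants = pybedtools.BedTool('noncoding-variants.vcf.gz').sort()
sites = pybedtools.BedTool('splice-site.bed').sort()
close_up = variants.closest(sites, D="b", id=True, io=True)
close_down = variants.closest(sites, D="b", iu=True, io=True)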
Ejemplo n.º 27
0
def run_age_single(intervals_bed=None,
                   region_list=[],
                   contig_dict={},
                   reference=None,
                   assembly=None,
                   pad=AGE_PAD,
                   age=None,
                   age_workdir=None,
                   timeout=AGE_TIMEOUT,
                   keep_temp=False,
                   myid=0):
    thread_logger = logging.getLogger(
        "%s-%s" % (run_age_single.__name__, multiprocessing.current_process()))

    bedtools_intervals = []
    intervals_bedtool = pybedtools.BedTool(intervals_bed)

    assembly_fasta = pysam.Fastafile(assembly)
    reference_fasta = pysam.Fastafile(reference)

    breakpoints_bed = None

    thread_logger.info("Will process %d intervals" % (len(region_list)))

    try:
        for region in region_list:
            bedtools_interval = pybedtools.Interval(region[0], region[1],
                                                    region[3])
            matching_intervals = [
                interval for interval in intervals_bedtool
                if (interval.start == bedtools_interval.start
                    and interval.end == bedtools_interval.end
                    and interval.chrom == bedtools_interval.chrom)
            ]
            if not matching_intervals:
                thread_logger.info("Matching interval not found for %s" %
                                   (str(bedtools_interval)))
                matching_interval = bedtools_interval
            else:
                matching_interval = matching_intervals[0]
            thread_logger.info("Matching interval %s" %
                               (str(matching_interval)))

            if region not in contig_dict:
                continue
            if not contig_dict[region]:
                continue

            region_object = SVRegion(region[0], region[1], region[2],
                                     region[3])
            if region_object.pos1 - pad < 0:
                thread_logger.error(
                    "Region too close to start of chromosome. Skipping.")
                continue

            reference_sequence = reference_fasta.fetch(
                reference=region_object.chrom1,
                start=region_object.pos1 - pad,
                end=region_object.pos2 + pad)
            region_name = "%s.%d.%d" % (region_object.chrom1,
                                        region_object.pos1, region_object.pos2)
            ref_name = os.path.join(age_workdir, "%s.ref.fa" % region_name)

            thread_logger.info("Writing the ref sequence for region %s" %
                               region_name)
            with open(ref_name, "w") as file_handle:
                file_handle.write(">{}.ref\n{}".format(region_name,
                                                       reference_sequence))

            age_records = []
            thread_logger.info("Processing %d contigs for region %s" %
                               (len(contig_dict[region]), str(region_object)))
            for contig in contig_dict[region]:
                thread_logger.info(
                    "Writing the assembled sequence %s of length %s" %
                    (contig.raw_name, contig.sequence_len))
                if contig.sequence_len * region_object.length() >= 100000000:
                    thread_logger.info(
                        "Skipping contig because AGE problem is large")
                    continue

                contig_sequence = assembly_fasta.fetch(contig.raw_name)

                prefix = get_age_file_prefix(contig)
                asm_name = os.path.join(age_workdir, "%s.as.fa" % prefix)
                out = os.path.join(age_workdir, "%s.age.out" % prefix)
                err = os.path.join(age_workdir, "%s.age.err" % prefix)

                with open(asm_name, "w") as file_handle:
                    file_handle.write(">{}.as\n{}".format(
                        region_name, contig_sequence))

                age_cmd = "%s %s -both -go=-6 %s %s >%s 2>%s" % (
                    age, "-inv" if contig.sv_type == "INV" else "-indel",
                    ref_name, asm_name, out, err)
                execute_cmd = "timeout %ds %s" % (timeout, age_cmd)

                retcode = run_cmd(execute_cmd, thread_logger, None, None)

                if retcode == 0:
                    age_record = AgeRecord(out)
                    if len(age_record.inputs) == 2:
                        age_record.contig = contig
                        age_records.append(age_record)
                    else:
                        thread_logger.error(
                            "Number of inputs != 2 in age output file %s. Skipping."
                            % out)

                if not keep_temp:
                    os.remove(asm_name)
                    os.remove(err)

            unique_age_records = get_unique_age_records(age_records)

            thread_logger.info("Unique %d AGE records for region %s" %
                               (len(unique_age_records), str(region_object)))
            for age_record in unique_age_records:
                thread_logger.info(str(age_record))

            sv_types = list(
                set([
                    age_record.contig.sv_type
                    for age_record in unique_age_records
                ]))
            if len(sv_types) != 1:
                thread_logger.error(
                    "Mixed SV types for this interval, cannot process: %s" %
                    (str(sv_types)))
            else:
                sv_type = sv_types[0]
                thread_logger.info("Processing region of type %s" % sv_type)
                breakpoints, info_dict = process_age_records(
                    unique_age_records, sv_type=sv_type, pad=pad)
                bedtools_fields = matching_interval.fields
                if len(breakpoints) == 1 and sv_type == "INS":
                    bedtools_fields += map(str, [
                        breakpoints[0][0], breakpoints[0][0] + 1,
                        breakpoints[0][1]
                    ])
                elif len(breakpoints) == 2 and (sv_type == "DEL"
                                                or sv_type == "INV"):
                    bedtools_fields += map(
                        str, breakpoints + [breakpoints[1] - breakpoints[0]])
                else:
                    bedtools_fields += map(
                        str, [bedtools_fields[1], bedtools_fields[2], -1])
                bedtools_fields.append(
                    base64.b64encode(json.dumps(info_dict).encode()).decode())
                thread_logger.info("Writing out fields %s" %
                                   (str(bedtools_fields)))
                bedtools_intervals.append(
                    pybedtools.create_interval_from_list(bedtools_fields))

            if not keep_temp:
                os.remove(ref_name)
    except Exception as e:
        thread_logger.error('Caught exception in worker thread')

        # This prints the type, value, and stack trace of the
        # current exception being handled.
        traceback.print_exc()

        print()
        raise  # re-raise preserving the original traceback

    assembly_fasta.close()
    reference_fasta.close()

    thread_logger.info("Writing %d intervals" % (len(bedtools_intervals)))
    if bedtools_intervals:
        breakpoints_bed = os.path.join(age_workdir,
                                       "%d_breakpoints.bed" % myid)
        pybedtools.BedTool(bedtools_intervals).saveas(breakpoints_bed)

    return breakpoints_bed
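
The final write above builds a BedTool directly from in-memory Interval objects rather than from a file. A minimal sketch of that pattern, with a hypothetical output path:

import pybedtools

fields = ['chr1', '100', '200', 'region_1']
interval = pybedtools.create_interval_from_list(fields)
pybedtools.BedTool([interval]).saveas('breakpoints.bed')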
Ejemplo n.º 28
0
def venn_mpl(a, b, c, colors=None, outfn='out.png', labels=None):
    """
    *a*, *b*, and *c* are filenames to BED-like files.

    *colors* is a list of matplotlib colors for the Venn diagram circles.

    *outfn* is the resulting output file.  This is passed directly to
    fig.savefig(), so you can supply extensions of .png, .pdf, or whatever your
    matplotlib installation supports.

    *labels* is a list of labels to use for each of the files; by default the
    labels are ['a','b','c']
    """
    try:
        import matplotlib.pyplot as plt
        from matplotlib.patches import Circle
    except ImportError:
        sys.stderr.write(
            'matplotlib is required to make a Venn diagram with %s\n' %
            os.path.basename(sys.argv[0]))
        sys.exit(1)

    a = pybedtools.BedTool(a)
    b = pybedtools.BedTool(b)
    c = pybedtools.BedTool(c)

    if colors is None:
        colors = ['r', 'b', 'g']

    radius = 6.0
    center = 0.0
    offset = radius / 2

    if labels is None:
        labels = ['a', 'b', 'c']

    circle_a = Circle(xy=(center - offset, center + offset),
                      radius=radius,
                      edgecolor=colors[0],
                      label=labels[0])
    circle_b = Circle(xy=(center + offset, center + offset),
                      radius=radius,
                      edgecolor=colors[1],
                      label=labels[1])
    circle_c = Circle(xy=(center, center - offset),
                      radius=radius,
                      edgecolor=colors[2],
                      label=labels[2])

    fig = plt.figure(facecolor='w')
    ax = fig.add_subplot(111)

    for circle in (circle_a, circle_b, circle_c):
        circle.set_facecolor('none')
        circle.set_linewidth(3)
        ax.add_patch(circle)

    ax.axis('tight')
    ax.axis('equal')
    ax.set_axis_off()

    kwargs = dict(horizontalalignment='center')

    # Unique to A
    ax.text(center - 2 * offset, center + offset, str((a - b - c).count()),
            **kwargs)

    # Unique to B
    ax.text(center + 2 * offset, center + offset, str((b - a - c).count()),
            **kwargs)

    # Unique to C
    ax.text(center, center - 2 * offset, str((c - a - b).count()), **kwargs)

    # A and B not C
    ax.text(center, center + 2 * offset - 0.5 * offset, str(
        (a + b - c).count()), **kwargs)

    # A and C not B
    ax.text(center - 1.2 * offset, center - 0.5 * offset,
            str((a + c - b).count()), **kwargs)

    # B and C not A
    ax.text(center + 1.2 * offset, center - 0.5 * offset,
            str((b + c - a).count()), **kwargs)

    # all
    ax.text(center, center, str((a + b + c).count()), **kwargs)

    ax.legend(loc='best')

    fig.savefig(outfn)

    plt.close(fig)
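
A hypothetical invocation of venn_mpl on three peak files:

venn_mpl('tf_a_peaks.bed', 'tf_b_peaks.bed', 'tf_c_peaks.bed',
         labels=['TF-A', 'TF-B', 'TF-C'], outfn='peak_overlap.png')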
Ejemplo n.º 29
0
print(r'\begin{tabular}{ | l | l | l | p{5cm} |}')
print(r'\hline')
print(r'Sample name & Num. peaks repl1 & Num. peaks repl2 & Num. peaks merged \\ \hline  \hline')
print(r'\multicolumn{4}{|c|}{Narrow peaks}  \\ \hline')

for sample_name in samples:

    # files for replicates
    repl1 = np + sample_name + '_repl1_FE.bw'
    repl2 = np + sample_name + '_repl2_FE.bw'

    # Optional, for corellation on BED files of peaks, two files merge in one
    bed_repl1 = np + sample_name + '_repl1_summits.bed'
    bed_repl2 = np + sample_name + '_repl2_summits.bed'
    bed_merged = np + sample_name + '_merged.bed'
    bd_rp1 = pb.BedTool(bed_repl1)
    bd_rp2 = pb.BedTool(bed_repl2)
    bd_mrg = bd_rp1.cat(bd_rp2)
    bd_mrg.slop(l=1000, r=1000, genome='mm10').merge().saveas(bed_merged)
    msg = r"\verb|{}| & {} & {} & {} \\ \hline".format(
        sample_name, str(bd_rp1.count()), str(bd_rp2.count()),
        str(bd_mrg.count()))
    print(msg)

print(r'\hline \multicolumn{4}{|c|}{Broad peaks}  \\ \hline')

for sample_name in samples:

    # files for replicates
    repl1 = bp + sample_name + '_repl1_FE.bw'
    repl2 = bp + sample_name + '_repl2_FE.bw'
Ejemplo n.º 30
0
                average_very_good_coverage > 0) else 1))

    return pybedtools.create_interval_from_list(fields)


def add_coverage_information(in_bed, bam):
    return in_bed.each(partial(annotate_coverage, bam=bam))


if not os.path.isdir(args.tmpdir):
    os.makedirs(args.tmpdir)
pybedtools.set_tempdir(args.tmpdir)  # requires the directory to exist

with open(args.in_bed, 'r') as f:
    header = f.readline()
header = header.strip()

in_bed = pybedtools.BedTool(args.in_bed)

out_bed = in_bed
logger.info("Initial feature count %d" % (out_bed.count()))


bed_fields = header.split('\t')

if args.rmask_bed:
    out_bed = annotate_bed(out_bed, pybedtools.BedTool(args.rmask_bed))
    logger.info("Feature count after rmask %d" % (out_bed.count()))
    bed_fields += ["OVERLAPS_RMASK"]

if args.segdups_bed: