Example 1
def read_exons(gtf):
    transcripts = defaultdict(pyinter.IntervalSet)
    totlen = 0
    names = []
    trs, ids = [], []
    for toks in (x.rstrip('\r\n').split("\t") for x in ts.nopen(gtf) if x[0] != "#"):
        if toks[2] not in ("CDS", "stop_codon") or toks[1] != "protein_coding": continue
        #if toks[0] != "1": break
        start, end = map(int, toks[3:5])
        assert start <= end, toks
        transcript = toks[8].split('transcript_id "')[1].split('"', 1)[0]
        transcripts[transcript].add(pyinter.closedopen(start-1, end))

        names.append(toks[8].split('transcript_name "')[1].split('"', 1)[0].rsplit("-", 1)[0])
        ids.append(toks[8].split('gene_id "')[1].split('"', 1)[0])
        trs.append(toks[8].split('transcript_id "')[1].split('"', 1)[0])

    # sort by start so we can do binary search.
    # TODO: need to remove overlapping exons so we don't double-count
    transcripts = dict((k, sorted(v)) for k, v in transcripts.iteritems())
    #ends = dict((k, sorted(v)) for k, v in ends.iteritems())
    ints = {}
    lens = pyinter.IntervalSet()
    for tr, ivset in transcripts.iteritems():
        sends = sorted(list(ivset))
        iset = pyinter.IntervalSet(
            pyinter.closedopen(x.lower_value, x.upper_value) for x in sends)
        lens = lens.union(iset)
        ss, es = [x.lower_value for x in sends], [x.upper_value for x in sends]
        ints[tr] = (ss, es)
    totlen = sum(x.upper_value-x.lower_value for x in lens)
    return ints, set(names), set(ids), set(trs), totlen
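
Note: the totlen computed above relies on pyinter.IntervalSet merging overlapping
intervals on union, so exonic bases shared between transcripts are counted once.
A minimal standalone sketch of that behavior, with made-up coordinates:

import pyinter

# Two transcripts sharing part of an exon: [100, 200) and [150, 250) overlap.
a = pyinter.IntervalSet([pyinter.closedopen(100, 200)])
b = pyinter.IntervalSet([pyinter.closedopen(150, 250)])

merged = a.union(b)  # overlapping pieces collapse into [100, 250)
assert sum(iv.upper_value - iv.lower_value for iv in merged) == 150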
Example 2
def get_gap_overlap_positions(path, blocks, read_len, min_mappable=20):
    blocks_gaps = genome_blocks_gaps(blocks, path)
    m = min_mappable

    gap_ref = pyinter.IntervalSet()
    ref = pyinter.IntervalSet()
    pos = 0
    for b in blocks_gaps:
        if len(b) == 0:
            continue
        if not b.is_insertion():
            gap_ref.add(pyinter.closedopen(pos, pos + len(b)))
            if not b.is_gap:
                ref.add(pyinter.closedopen(pos, pos + len(b)))
        pos += len(b)
    # print('gap_ref: {0}\nref: {1}\n'.format(gap_ref, ref))

    A1 = pyinter.IntervalSet()  # i: [i, i+m) contained in gap_ref
    A2 = pyinter.IntervalSet()  # i: [i, i+m) overlaps ref
    for iv in gap_ref:
        if iv.lower_value <= iv.upper_value - m:
            A1.add(pyinter.closed(iv.lower_value, iv.upper_value - m))
    for iv in ref:
        # print(iv)
        A2.add(pyinter.closed(iv.lower_value - m + 1, iv.upper_value - 1))
        # print(A2)

    A3 = A1.intersection(A2)

    A4 = pyinter.IntervalSet()
    A5 = pyinter.IntervalSet()
    for iv in A1:
        A4.add(pyinter.closed(iv.lower_value - read_len + m, iv.upper_value))
    for iv in A3:
        A5.add(pyinter.closed(iv.lower_value - read_len + m, iv.upper_value))

    result = A4.difference(A5)

    # print('A1: {0}\nA2: {1}\nA3: {2}\nA4: {3}\nA5: {4}\n'.format(A1, A2, A3, A4, A5))
    # print('result: {0}'.format(result))
    # print('')

    # normalize endpoints to open intervals and drop any empty ones
    out = pyinter.IntervalSet()
    for iv in result:
        a = iv.lower_value - 1 if iv.lower_value in iv else iv.lower_value
        b = iv.upper_value + 1 if iv.upper_value in iv else iv.upper_value
        # if iv.lower_value in iv or iv.upper_value in iv: # not open
        #     print('A1: {0}\nA2: {1}\nA3: {2}\nA4: {3}\nA5: {4}\n'.format(A1, A2, A3, A4, A5))
        #     print('result: {0}'.format(result))
        #     print(iv)
        #     raise Warning('non-open interval in get_gap_positions')
        if a < b - 1:
            out.add(pyinter.open(a, b))
    return out
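
Note: the endpoint arithmetic above encodes window predicates as interval shifts:
for a region [a, b), the start positions i with [i, i+m) fully contained form
closed(a, b - m), while those merely overlapping it form closed(a - m + 1, b - 1).
A quick standalone check with small numbers, unrelated to any real alignment:

import pyinter

a, b, m = 10, 20, 4
contained = pyinter.closed(a, b - m)            # i in [10, 16]
overlapping = pyinter.closed(a - m + 1, b - 1)  # i in [7, 19]

assert 16 in contained and 17 not in contained    # [17, 21) spills past 20
assert 7 in overlapping and 6 not in overlapping  # [6, 10) misses [10, 20)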
Example 3
def generate_gbk_feature_index(genbank_path, feature_index_output_path):
    """
    Create a pickled pyinterval index of genbank features so we can pull
    them quickly.
    """

    gbk_feature_list = []
    with open(genbank_path, 'r') as fh:
        for seq_record in SeqIO.parse(fh, 'genbank'):

            for f in seq_record.features:

                if f.type not in GBK_FEATURES_TO_EXTRACT:
                    continue

                f_ivl = pyinter.closedopen(
                        f.location.start, f.location.end)
                f_ivl.type = f.type

                if 'gene' in f.qualifiers:
                    f_ivl.name = f.qualifiers['gene'][0]
                elif 'mobile_element_type' in f.qualifiers:
                    f_ivl.name = f.qualifiers['mobile_element_type'][0]
                # For now, if the gene has no '.name' or '.mobile_element_type', ignore
                gbk_feature_list.append(f_ivl)

    with open(feature_index_output_path, 'w') as fh:
        pickle.dump(gbk_feature_list, fh)
Example 4
def get_features_at_locations(ref_genome, intervals, chromosome=None):
    """
    Use the genbank index dataset and return the gene or mobile element
    feature intervals that overlap these intervals.
    """
    feature_index_path = get_dataset_with_type(ref_genome,
            Dataset.TYPE.FEATURE_INDEX).get_absolute_location()

    with open(feature_index_path, 'r') as fh:

        gbk_feature_list = pickle.load(fh)

        # Dictionary of features to return, for each interval.
        return_features = {}

        # For each input interval, return a list of feature names that
        # overlap.
        for interval in intervals:
            q_ivl = pyinter.closedopen(*interval)
            features = [f_ivl for f_ivl in gbk_feature_list if
                    q_ivl.intersect(f_ivl)]

            return_features[interval] = features

        return return_features
Example 5
def get_features_at_locations(ref_genome, intervals, chromosome=None):
    """
    Use the genbank index dataset and return the gene or mobile element
    feature intervals that overlap these intervals.
    """
    feature_index_path = get_dataset_with_type(
        ref_genome, Dataset.TYPE.FEATURE_INDEX).get_absolute_location()

    with open(feature_index_path, 'r') as fh:

        gbk_feature_list = pickle.load(fh)

        # Dictionary of features to return, for each interval.
        return_features = {}

        # For each input interval, return a list of feature names that
        # overlap.
        for interval in intervals:
            q_ivl = pyinter.closedopen(*interval)
            features = [
                f_ivl for f_ivl in gbk_feature_list if q_ivl.intersect(f_ivl)
            ]

            return_features[interval] = features

        return return_features
Example 6
def generate_gbk_feature_index(genbank_path, feature_index_output_path):
    """
    Create a pickled pyinterval index of genbank features so we can pull
    them quickly.
    """

    gbk_feature_list = []
    with open(genbank_path, 'r') as fh:
        for seq_record in SeqIO.parse(fh, 'genbank'):

            for f in seq_record.features:

                if f.type not in GBK_FEATURES_TO_EXTRACT:
                    continue

                f_ivl = pyinter.closedopen(f.location.start, f.location.end)
                f_ivl.type = f.type

                if 'gene' in f.qualifiers:
                    f_ivl.name = f.qualifiers['gene'][0]
                elif 'mobile_element_type' in f.qualifiers:
                    f_ivl.name = f.qualifiers['mobile_element_type'][0]
                # For now, if the gene has no '.name' or '.mobile_element_type', ignore
                gbk_feature_list.append(f_ivl)

    with open(feature_index_output_path, 'w') as fh:
        pickle.dump(gbk_feature_list, fh)
Example 7
def load_genome_gaps(gapsfile, chrom_name):
    gaps = pyinter.IntervalSet()
    with open(gapsfile, 'r') as fh:
        lines = [l for l in fh.readlines() if l.split('\t')[0] == chrom_name]
        for line in lines:
            toks = line.split('\t')
            a, b = int(toks[1]), int(toks[2])
            gaps.add(pyinter.closedopen(a, b))
    return gaps
Example 8
def read_repeats(path, keyname):
    tracks = defaultdict(pyinter.IntervalSet)
    for toks in (x.rstrip('\r\n').split() for x in ts.nopen(path) if x[0] != "#"):
        start, end = map(int, toks[1:3])
        assert start <= end, toks
        tracks[keyname].add(pyinter.closedopen(start, end))
    ints = {}
    for pid, ivset in tracks.iteritems():
        sends = sorted(list(ivset))
        ss, es = [x.lower_value for x in sends], [x.upper_value for x in sends]
        ints[pid] = (ss, es)

    return ints
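
Note: the sorted (ss, es) arrays returned here (and by read_exons above, per its
"binary search" comment) can be queried with the stdlib bisect module. The
overlaps() helper below is an illustrative sketch, not part of this codebase; it
assumes the parallel start/end arrays describe disjoint sorted intervals, which
an IntervalSet guarantees:

from bisect import bisect_right

def overlaps(ss, es, start, end):
    # last stored interval starting at or before `start`
    i = bisect_right(ss, start) - 1
    if i >= 0 and es[i] > start:
        return True
    # otherwise, any overlap must come from an interval starting inside [start, end)
    j = bisect_right(ss, start)
    return j < len(ss) and ss[j] < end

assert overlaps([0, 20], [10, 30], 5, 8)        # hits [0, 10)
assert not overlaps([0, 20], [10, 30], 12, 18)  # falls in the gap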
Example 9
def compute_null_dist(opts, discordant_pairs, dtype, insert_mu, insert_sigma,
                      gap_file, lib_idx, lr_cond):
    nreps = opts['pecluster_null_reps']
    chrom_name = opts['chromosome']
    start, end = opts['region_start'], opts['region_end']
    gaps_inter = load_genome_gaps(gap_file, chrom_name)
    chrom_inter = pyinter.IntervalSet()
    chrom_inter.add(pyinter.closedopen(start, end))
    non_gaps_inter = chrom_inter.difference(gaps_inter)
    non_gaps = [(i.lower_value, i.upper_value) for i in non_gaps_inter]
    total_len = sum([i[1] - i[0] for i in non_gaps])

    # For deletion null clusters, don't use pairs that are obviously too large.
    # (for normal data the discordant read cutoff for deletion supports
    #  is like mu + 3 sigma ~ mu + .3mu, and we're excluding stuff bigger than 3mu)
    if dtype == 'Del':
        max_null_insert = insert_mu * opts['insert_max_mu_multiple']
    else:
        max_null_insert = np.Inf

    null_clusters = []
    lr_null_clusters = np.array([], float)
    for _ in range(nreps):
        shuffled = shuffle_discordant_pairs(discordant_pairs,
                                            total_len,
                                            max_insert_size=max_null_insert)
        clusters_tmp, _ = cluster_pairs(opts, shuffled, dtype, lib_idx,
                                        insert_mu, insert_sigma)
        null_clusters.extend(clusters_tmp)
        lr_tmp = np.fromiter((lr_fun[dtype](c, insert_mu, insert_sigma,
                                            opts['insert_cutoff'], lr_cond)
                              for c in clusters_tmp), float)
        lr_null_clusters = np.append(lr_null_clusters, lr_tmp)
    if opts['verbosity'] > 1:
        print('[compute_null_dist] {0}'.format(dtype))
        print('shuffled lr:')
        print(lr_null_clusters)
        print('')

    outname = ('{0}_{1}_null_cluster_{2}reps.txt'.format(
        opts['library_names'][lib_idx], dtype, nreps))
    fname = os.path.join(opts['outdir'], 'logging', outname)
    write_clustering_results(fname,
                             list(zip(lr_null_clusters, null_clusters)),
                             first_reject=0)

    # print('there were {0} {1} clusters after shuffling'.format(len(clusters),
    #                                                            dtype))

    lr_null_clusters.sort()
    return lr_null_clusters
Example 10
def read_pfam(path):
    tracks = defaultdict(pyinter.IntervalSet)
    pids, trs, ids = [], [], []
    for toks in (x.rstrip('\r\n').split() for x in ts.nopen(path) if x[0] != "#"):
        start, end = map(int, toks[1:3])
        assert start <= end, toks
        pid = toks[10].split(';', 1)[0].strip('"')  # pfamA_id
        tracks[pid].add(pyinter.closedopen(start, end))
        ids.append(toks[12].split(';', 1)[0].strip('"'))  # gene_name
        trs.append(toks[14].split(';', 1)[0].strip('"'))  # transcript_id
    ints = {}
    for pid, ivset in tracks.iteritems():
        sends = sorted(list(ivset))
        ss, es = [x.lower_value for x in sends], [x.upper_value for x in sends]
        ints[pid] = (ss, es)

    return ints
Example 11
def svelter_convert(svelterfile, outdir, reffile, filter_gaps=False, refgapfile=None,
                    flank_size=1000, verbosity=0):
    os.system('mkdir -p %s' % outdir)
    # collect all bps
    # all_bp = []
    # with open(svelterfile, 'r') as svelter:
    #     for line in svelter:
    #         if is_svelter_header(line):
    #             continue
    #         bp_str = line.split('\t')[3].split(':')[1:]
    #         all_bp.extend(int(x) for x in bp_str)
    # all_bp.sort()

    log_name = 'convert_{0}.log'.format(os.path.basename(svelterfile))
    log = open(os.path.join(outdir, log_name), 'w')
    data = []

    # it seems some sv can be repeated in svelter output with different scores
    seen_svstring = set()
    seen_id = {}
    skipped_seen = 0
    skipped_refgap = 0

    with open(svelterfile, 'r') as svelter:
        toks_list = [line.rstrip().split('\t') for line in svelter]

    if filter_gaps:
        chroms = set(toks[0] for toks in toks_list)
        chrom_gaps = {chrom: load_genome_gaps(refgapfile, chrom) for chrom in chroms}
    else:
        chrom_gaps = None

    for toks in toks_list:
        # check if header
        if toks[0] == 'chr' and toks[1] == 'start':
            continue
        # check if passing score
        if float(toks[6]) == 0:
            continue
        # check if sv is duplicate
        svstring = ' '.join(toks[:6])
        if svstring in seen_svstring:
            skipped_seen += 1
            continue
        else:
            seen_svstring.add(svstring)
        # adjust id if we've seen it before
        id = toks[3]
        num_id_seen = seen_id.get(id, 0)
        seen_id[id] = num_id_seen + 1
        if num_id_seen > 0:
            print('saw {0} again'.format(id))
            id_extra = ';' + str(num_id_seen + 1)
        else:
            id_extra = ''
        chrom = toks[0]
        bp_str = toks[3].split(':')[1:]
        bp = [int(x) for x in bp_str]

        if filter_gaps:
            sv_interval = pyinter.closedopen(bp[0], bp[-1])
            sv_gap_intersection = chrom_gaps[chrom].intersection([sv_interval])
            if len(sv_gap_intersection) > 0:
                skipped_refgap += 1
                continue

        breakpoints = {(x, x): Breakpoint((x, x)) for x in bp}
        # il = bisect_left(all_bp, bp[0])
        # if il > 0:
        #     slop_left = min(all_bp[il] - all_bp[il-1], flank_size)
        # else:
        #     slop_left = flank_size
        # ir = bisect_right(all_bp, bp[-1])
        # if ir < len(all_bp):
        #     slop_right = min(all_bp[ir] - all_bp[ir-1], flank_size)
        # else:
        #     slop_right = flank_size
        slop_left, slop_right = flank_size, flank_size
        start = bp[0] - slop_left
        end = bp[-1] + slop_right
        cbout = create_blocks(breakpoints, pyinter.IntervalSet(), chrom, start, end, verbosity)
        blocks, _, left_bp, right_bp = cbout
        svelter_strings = toks[5].split('/')
        paths = [svelter_string_to_path(x, len(blocks)) for x in svelter_strings]
        score = float(toks[6])

        this_data = (paths, blocks, left_bp, right_bp, score, 'PASS',
                     id_extra, None, None)  # no extra INFO/FORMAT tags as in the VCF case
        data.append(this_data)
    log.write('skipped_seen\t{0}\n'.format(skipped_seen))
    log.write('skipped_refgap\t{0}\n'.format(skipped_refgap))

    do_sv_processing(data, outdir, reffile, log, verbosity)

    log.close()
Example 12
def generic_vcf_convert(vcffile, outdir, reffile, filter_gaps=False, refgapfile=None,
                        caller=None, flank_size=1000, verbosity=0):
    os.system('mkdir -p %s' % outdir)

    log_name = 'convert_{0}.log'.format(os.path.basename(vcffile))
    log = open(os.path.join(outdir, log_name), 'w')
    data = []
    svtype_skipped = {}
    seen_coords_count = {}
    skipped_refgap = 0
    write_extra = False         # need to write FORMAT or INFO to file?

    with open(vcffile, 'r') as vcf:
        toks_list = [line.rstrip().split('\t') for line in vcf if line[0] != '#']

    if filter_gaps:
        chroms = set(toks[0] for toks in toks_list)
        chrom_gaps = {chrom: load_genome_gaps(refgapfile, chrom) for chrom in chroms}
    else:
        chrom_gaps = None

    for toks in toks_list:
        # NOTE not parsing qual; do filtering beforehand for DELLY
        chrom, pos, id, ref, alt, qual, filterstring, info, format, sample1 = toks

        # VCF is 1-indexed, but specifies pos/end positions
        # which are to the left of breakpoints, so no adjustment
        pos = int(pos)

        tags = info.split(';')
        if 'PRECISE' in tags:
            filterstring += ':PRECISE'
        elif 'IMPRECISE' in tags:
            filterstring += ':IMPRECISE'
        elif caller == 'lumpy':  # only includes tags for imprecise events
            filterstring += ':PRECISE'
        tags = [t for t in tags if '=' in t]
        tagd = {t.split('=')[0]: t.split('=')[1] for t in tags}
        end = int(tagd.get('END', -99999))
        svtype = tagd['SVTYPE']
        if caller == 'pindel' and svtype == 'INS':
            inslen = int(tagd['SVLEN'])
        else:
            inslen = int(tagd.get('INSLEN', 0))

        if caller == 'pindel':
            homlen = int(tagd['HOMLEN'])
            if pos + homlen > end or svtype == 'INS':
                print('pos + homlen > end: positions {0}'.format((pos, end)))
                cipos = (0, 0)
                ciend = (0, 0)
            else:
                cipos = (0, homlen)
                ciend = (0, homlen)
        else:
            if 'CIPOS95' in tagd:   # LUMPY
                tmp = tagd['CIPOS95'].split(',')
                cipos = (int(tmp[0]), int(tmp[1]))
            elif 'CIPOS' in tagd:
                tmp = tagd['CIPOS'].split(',')
                cipos = (int(tmp[0]), int(tmp[1]))
            else:
                cipos = (0, 0)
            if 'CIEND95' in tagd:   # LUMPY
                tmp = tagd['CIEND95'].split(',')
                ciend = (int(tmp[0]), int(tmp[1]))
            elif 'CIEND' in tagd:
                tmp = tagd['CIEND'].split(',')
                ciend = (int(tmp[0]), int(tmp[1]))
            else:
                ciend = (0, 0)
        split_support = int(tagd.get('SR', 0))
        pe_support = int(tagd.get('PE', 0))
        # lumpy STRANDS only relevant for inversions
        if caller == 'lumpy' and svtype == 'INV':
            tmp = tagd['STRANDS'].split(',')
            tmpd = {a: b for (a, b) in (p.split(':') for p in tmp)}
            tagd['INV_PLUS'] = tmpd['++']
            tagd['INV_MINUS'] = tmpd['--']
        tagd_used = ('SR', 'PE', 'SVTYPE', 'SVMETHOD', 'END', 'STRANDS',
                     'SVLEN', 'HOMSEQ', 'CONSENSUS', 'CHR2')
        tagd_extra = {k: v for (k, v) in tagd.items() if k not in tagd_used}

        tags2 = {k: v for (k, v) in zip(format.split(':'), sample1.split(':'))}
        if 'AD' in tags2:       # pindel
            split_support = int(tags2['AD'].split(',')[1])

        gt = tags2['GT']

        if gt == './.' or gt == '.|.':
            is_het = False
            filterstring += ':NOGT'
        elif gt in ('0/0', '0|0'):
            is_het = False
            filterstring += ':ZEROGT'
        elif gt in ('0/1', '1/0', '0|1', '1|0'):
            is_het = True
        else:
            assert(gt in ('1/1', '1|1'))
            is_het = False

        tags2_used = ('AD', 'SR', 'PE', 'SU')
        tags2_extra = {k: v for (k, v) in tags2.items() if k not in tags2_used}
        if len(tagd_extra) + len(tags2_extra) > 0:
            write_extra = True

        # cases
        if svtype == 'DEL':
            path = (0, 1, 4, 5)
            refpath = (0, 1, 2, 3, 4, 5)
            supptype = 'Del'
        elif svtype == 'INV':
            path = (0, 1, 3, 2, 4, 5)
            refpath = (0, 1, 2, 3, 4, 5)
            supptype = 'InvL'
        elif svtype == 'DUP' or svtype == 'DUP:TANDEM':
            path = (0, 1, 2, 3, 2, 3, 4, 5)
            refpath = (0, 1, 2, 3, 4, 5)
            supptype = 'Dup'
        elif svtype == 'INS':
            # INSERTIONS parse inslen, add insertion block to blocks
            path = (0, 1, 4, 5, 2, 3)
            refpath = (0, 1, 2, 3)
            supptype = 'Ins'
        else:
            # skipping delly TRA
            # skipping BND events as they may be ambiguous, in terms of the path
            svtype_skipped[svtype] = svtype_skipped.get(svtype, 0) + 1
            continue

        # check ref gap overlap
        if filter_gaps and end > pos:
            sv_interval = pyinter.closedopen(pos, end)
            sv_gap_intersection = chrom_gaps[chrom].intersection([sv_interval])
            if len(sv_gap_intersection) > 0:
                skipped_refgap += 1
                continue

        # create breakpoints and blocks, keeping in mind uncertainty and possible insertion
        if caller == 'lumpy' and svtype != 'INS':
            # lumpy intervals are not symmetric. POS and END are each the "best guess" for
            # the breakpoints
            bp = [(pos, pos), (end, end)]
        elif svtype != 'INS':
            # if (cipos[1] != -cipos[0] or ciend[1] != -ciend[0]) and \
            #    (pos + cipos[1] < end + ciend[0]):
            if (pos + cipos[1] < end + ciend[0]):
                bp = [(pos + cipos[0], pos + cipos[1]),
                      (end + ciend[0], end + ciend[1])]
            else:
                bp = [(pos, pos), (end, end)]
                filterstring += ':BPOVERLAP'
        else:
            # if cipos[1] != -cipos[0]:
            if cipos[1] > cipos[0]:
                bp = [(pos + cipos[0], pos + cipos[1])]
            else:
                bp = [(pos, pos)]
        pe = [(x, supptype) for x in range(pe_support)]
        # TODO SupportingSplit
        splits = []
        for i in range(split_support):
            aln_tmp = pysam.AlignedSegment()
            aln_tmp.qname = str(i)  # pysam query names must be strings
            aln_tmp.is_read1 = True
            split_type = supptype + '+'
            splits.append(SupportingSplit(aln_tmp, None, None, None, None, split_type))
        breakpoints = {x: Breakpoint(x, pe=pe, splits=splits) for x in bp}
        slop_left, slop_right = flank_size, flank_size
        start = bp[0][0] - slop_left
        end = bp[-1][1] + slop_right
        cbout = create_blocks(breakpoints, pyinter.IntervalSet(), chrom, start, end, verbosity)
        blocks, _, left_bp, right_bp = cbout

        if svtype == 'INS':
            blocks.append(GenomeInterval(chrom, 0, inslen, is_de_novo=True))

        paths = [path, refpath] if is_het else [path, path]
        score = 0

        coords = (start, end)
        scc = seen_coords_count.get(coords, 0)
        if scc > 0:
            id_extra = chr(ord('a') + scc)
        else:
            id_extra = ''
        seen_coords_count[coords] = scc + 1

        this_data = (paths, blocks, left_bp, right_bp, score, filterstring,
                     id_extra, tagd_extra, tags2_extra)
        data.append(this_data)
    for svtype, count in svtype_skipped.items():
        log.write('skipped_svtype\t{0}\t{1}\n'.format(svtype, count))
    log.write('skipped_refgap\t{0}\n'.format(skipped_refgap))
    do_sv_processing(data, outdir, reffile, log, verbosity, write_extra)

    log.close()
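
Note: the CIPOS/CIEND branch above only keeps the uncertainty intervals when they
do not cross; otherwise it falls back to point breakpoints and appends ':BPOVERLAP'
to the filter string. A standalone walk-through with hypothetical coordinates:

pos, end = 100, 105
cipos = ciend = (0, 8)

if pos + cipos[1] < end + ciend[0]:
    bp = [(pos + cipos[0], pos + cipos[1]), (end + ciend[0], end + ciend[1])]
else:
    # 108 >= 105: the confidence intervals cross, so collapse to points
    bp = [(pos, pos), (end, end)]

assert bp == [(100, 100), (105, 105)]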
Example 13
def add_variants_to_set_from_bed(sample_alignment, bed_dataset):
    """
    Given a bed with feature names and a corresponding sample alignment,
    create new variant sets for every unique feature name and assign the
    variants that fall within these features to the new sets.

    E.g. BED:

    ...
    NC_000913 223514 223534 POOR_MAPPING_QUALITY
    NC_000913 223542 223734 NO_COVERAGE
    NC_000913 223751 224756 POOR_MAPPING_QUALITY
    ...

    Add variants in 223542-223734 to NO_COVERAGE
    Add variants in 223751-224756 and 223514-223534 to POOR_MAPPING_QUALITY
    """

    # Read in the bed file
    bed_dataset_fn = bed_dataset.get_absolute_location()
    reference_genome = sample_alignment.alignment_group.reference_genome
    experiment_sample = sample_alignment.experiment_sample

    # 1. Create a dictionary of disjoint intervals, recursive defaultdict
    feature_disj_intervals = defaultdict(
            lambda: defaultdict(pyinter.IntervalSet))
    variants_to_add = defaultdict(list)

    with open(bed_dataset_fn) as bed_dataset_fh:

        for i, line in enumerate(bed_dataset_fh):
            try:
                chrom, start, end, feature = line.strip().split('\t')
                # make a new interval from start to end
                new_ivl = pyinter.closedopen(int(start), int(end))

                # add new ivl to old ivls
                feature_disj_intervals[feature][chrom].add(new_ivl)
            except Exception:
                print('WARNING: Callable Loci line '
                      '%d: (%s) could not be parsed.' % (i, line))

    # 2. Associate variants with these intervals
    variants = Variant.objects.filter(
            variantcallercommondata__alignment_group=\
                    sample_alignment.alignment_group)

    for v in variants:
        for feat, chrom_ivls in feature_disj_intervals.items():

            # Skip if there is no interval in this chromosome
            if v.chromosome.label not in chrom_ivls: continue
            if not chrom_ivls[v.chromosome.label]: continue

            if v.position in chrom_ivls[v.chromosome.label]:
                variants_to_add[feat].append(v)

    # 3. Make new variant sets for any features with variants,
    # and add the variants to them.

    variant_set_to_variant_map = {}

    for feat, variants in variants_to_add.items():

        (feat_variant_set, created) = VariantSet.objects.get_or_create(
                reference_genome=reference_genome,
                label=feat)

        grouped_uid_dict_list = [{
                'sample_uid': experiment_sample.uid,
                'variant_uid': v.uid} for v in variants]

        variant_uid_to_obj_map = dict([(v.uid,v) for v in variants])
        sample_uid_to_obj_map = {experiment_sample.uid: experiment_sample}

        _perform_add(grouped_uid_dict_list, feat_variant_set,
                variant_uid_to_obj_map, sample_uid_to_obj_map)

        variant_set_to_variant_map[feat_variant_set] = variants

    return variant_set_to_variant_map
Example 14
def add_variants_to_set_from_bed(sample_alignment, bed_dataset):
    """
    Given a bed with feature names and a corresponding sample alignment,
    create new variant sets for every unique feature name and assign the
    variants that fall within these features to the new sets.

    E.g. BED:

    ...
    NC_000913 223514 223534 POOR_MAPPING_QUALITY
    NC_000913 223542 223734 NO_COVERAGE
    NC_000913 223751 224756 POOR_MAPPING_QUALITY
    ...

    Add variants in 223542-223734 to NO_COVERAGE
    Add variants in 223751-224756 and 223514-223534 to POOR_MAPPING_QUALITY
    """

    # Read in the bed file
    bed_dataset_fn = bed_dataset.get_absolute_location()
    reference_genome = sample_alignment.alignment_group.reference_genome
    experiment_sample = sample_alignment.experiment_sample

    # 1. Create a dictionary of disjoint intervals, recursive defaultdict
    feature_disj_intervals = defaultdict(
        lambda: defaultdict(pyinter.IntervalSet))
    variants_to_add = defaultdict(list)

    with open(bed_dataset_fn) as bed_dataset_fh:

        for i, line in enumerate(bed_dataset_fh):
            try:
                chrom, start, end, feature = line.strip().split('\t')
                # make a new interval from start to end
                new_ivl = pyinter.closedopen(int(start), int(end))

                # add new ivl to old ivls
                feature_disj_intervals[feature][chrom].add(new_ivl)
            except Exception:
                print('WARNING: Callable Loci line '
                      '%d: (%s) could not be parsed.' % (i, line))

    # 2. Associate variants with these intervals
    variants = Variant.objects.filter(
            variantcallercommondata__alignment_group=\
                    sample_alignment.alignment_group)

    for v in variants:
        for feat, chrom_ivls in feature_disj_intervals.items():

            # Skip if there is no interval in this chromosome
            if v.chromosome.label not in chrom_ivls: continue
            if not chrom_ivls[v.chromosome.label]: continue

            if v.position in chrom_ivls[v.chromosome.label]:
                variants_to_add[feat].append(v)

    # 3. Make new variant sets for any features with variants,
    # and add the variants to them.

    variant_set_to_variant_map = {}

    for feat, variants in variants_to_add.items():

        (feat_variant_set, created) = VariantSet.objects.get_or_create(
            reference_genome=reference_genome, label=feat)

        grouped_uid_dict_list = [{
            'sample_uid': experiment_sample.uid,
            'variant_uid': v.uid
        } for v in variants]

        variant_uid_to_obj_map = dict([(v.uid, v) for v in variants])
        sample_uid_to_obj_map = {experiment_sample.uid: experiment_sample}

        _perform_add(grouped_uid_dict_list, feat_variant_set,
                     variant_uid_to_obj_map, sample_uid_to_obj_map)

        variant_set_to_variant_map[feat_variant_set] = variants

    return variant_set_to_variant_map
Example 15
def create_blocks(breakpoints, gaps, chrom_name, start, end, verbosity):
    # create list of blocks between breakpoints
    # while adjusting for genome gaps
    gap_indices = set()
    gap_indices.add(0)
    blocks = []
    left_breakpoints = []
    right_breakpoints = []

    breakpoints[(end, end)] = Breakpoint((end, end))

    bploc = list(breakpoints.keys())
    bploc.sort()

    last_end = start
    last_breakpoint = Breakpoint((start, start))

    for bpl in bploc:
        breakpoint = breakpoints[bpl]

        if bpl[0] <= start or bpl[1] > end:
            continue
        iset = pyinter.IntervalSet()
        blockinterval = pyinter.closedopen(last_end, bpl[0])

        iset.add(blockinterval)
        adjusted_blocks = iset.difference(gaps)
        adjusted_blocks = sorted(list(adjusted_blocks))

        if verbosity > 1:
            print('bploc {0}'.format(bpl))
            print('bp {0}'.format(breakpoint))
            print('blockinterval {0}'.format(blockinterval))
            print('adjusted {0}'.format(adjusted_blocks))

        for ab in adjusted_blocks:
            if ab.lower_value == ab.upper_value:  # block completely within a gap
                gap_indices.add(len(blocks))
                break
            else:
                if ab.lower_value != blockinterval.lower_value:
                    gap_indices.add(len(blocks))
                    left_breakpoint = Breakpoint(
                        (ab.lower_value, ab.lower_value))
                else:
                    left_breakpoint = last_breakpoint
                if ab.upper_value != blockinterval.upper_value:
                    gap_indices.add(len(blocks) + 1)
                    right_breakpoint = Breakpoint(
                        (ab.upper_value, ab.upper_value))
                else:
                    right_breakpoint = breakpoint
                if verbosity > 1:
                    print('adding {0}'.format(
                        GenomeInterval(chrom_name, ab.lower_value,
                                       ab.upper_value)))
                    print('\tleft {0}'.format(left_breakpoint))
                    print('\tright {0}'.format(right_breakpoint))
                blocks.append(
                    GenomeInterval(chrom_name, ab.lower_value, ab.upper_value))
                left_breakpoints.append(left_breakpoint)
                right_breakpoints.append(right_breakpoint)
        last_end = bpl[1]
        last_breakpoint = breakpoints[bpl]
    gap_indices.add(len(blocks))
    gap_indices = sorted(list(gap_indices))
    if verbosity > 1:
        print('--creating blocks--')
        print(breakpoints)
        print(blocks)
        print(gap_indices)
        print(left_breakpoints)
        print(right_breakpoints)
    return blocks, gap_indices, left_breakpoints, right_breakpoints
Example 16
    def test_add_variants_to_set_from_bed(self):

        common_entities = create_common_entities()
        project = common_entities['project']
        self.ref_genome_1 = common_entities['reference_genome']

        alignment_group = AlignmentGroup.objects.create(
            label='Alignment 1',
            reference_genome=self.ref_genome_1,
            aligner=AlignmentGroup.ALIGNER.BWA)

        (self.sample_1, created) = ExperimentSample.objects.get_or_create(
            project=project, label=SAMPLE_1_LABEL)

        sample_alignment = ExperimentSampleToAlignment.objects.create(
            alignment_group=alignment_group, experiment_sample=self.sample_1)

        # Create variants in the bed regions from best_test.bed
        for var_poor_map in range(20):
            variant = Variant.objects.create(
                type=Variant.TYPE.TRANSITION,
                reference_genome=self.ref_genome_1,
                chromosome=Chromosome.objects.get(
                    reference_genome=self.ref_genome_1),
                position=random.randint(101, 200),
                ref_value='A')

            vccd = VariantCallerCommonData.objects.create(
                variant=variant,
                source_dataset_id=1,
                alignment_group=alignment_group,
                data={})

        for var_no_cov in range(20):
            variant = Variant.objects.create(
                type=Variant.TYPE.TRANSITION,
                reference_genome=self.ref_genome_1,
                chromosome=Chromosome.objects.get(
                    reference_genome=self.ref_genome_1),
                position=random.randint(301, 400),
                ref_value='A')

            vccd = VariantCallerCommonData.objects.create(
                variant=variant,
                source_dataset_id=1,
                alignment_group=alignment_group,
                data={})

            variant = Variant.objects.create(
                type=Variant.TYPE.TRANSITION,
                reference_genome=self.ref_genome_1,
                chromosome=Chromosome.objects.get(
                    reference_genome=self.ref_genome_1),
                position=random.randint(501, 600),
                ref_value='A')

            vccd = VariantCallerCommonData.objects.create(
                variant=variant,
                source_dataset_id=1,
                alignment_group=alignment_group,
                data={})

        new_bed_path = copy_dataset_to_entity_data_dir(
            entity=sample_alignment, original_source_location=TEST_BED)

        bed_dataset = add_dataset_to_entity(
            sample_alignment,
            dataset_label=Dataset.TYPE.BED_CALLABLE_LOCI,
            dataset_type=Dataset.TYPE.BED_CALLABLE_LOCI,
            filesystem_location=new_bed_path)

        vs_to_v_map = add_variants_to_set_from_bed(sample_alignment,
                                                   bed_dataset)

        variant_set_labels = set([vs.label for vs in vs_to_v_map.keys()])
        self.assertEqual(set(['POOR_MAPPING_QUALITY', 'NO_COVERAGE']),
                         variant_set_labels)

        for variant_set, variants in vs_to_v_map.items():
            for v in variants:
                # POOR MAPPING QUAL should be from 101 to 200
                if variant_set.label == 'POOR_MAPPING_QUALITY':
                    self.assertTrue(v.position in pyinter.closedopen(101, 200))
                # NO COVERAGE should be from 301 to 400, 501 to 600
                elif variant_set.label == 'NO_COVERAGE':
                    self.assertTrue(v.position in pyinter.IntervalSet([
                        pyinter.closedopen(301, 400),
                        pyinter.closedopen(501, 600)
                    ]))
                else:
                    raise AssertionError('bad variant set %s made.' %
                                         variant_set.label)
Example 17
def len_without_gaps(chrom_name, start, end, gapsfile):
    gaps = load_genome_gaps(gapsfile, chrom_name)
    region = pyinter.IntervalSet()
    region.add(pyinter.closedopen(start, end))
    diff = region.difference(gaps)
    return sum(x.upper_value - x.lower_value for x in diff)
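
Note: for intuition, the same difference arithmetic with a made-up region and gap,
mirroring exactly the sum len_without_gaps itself computes:

import pyinter

region = pyinter.IntervalSet()
region.add(pyinter.closedopen(0, 100))

gaps = pyinter.IntervalSet()
gaps.add(pyinter.closedopen(40, 60))

diff = region.difference(gaps)  # leaves [0, 40) and [60, 100)
assert sum(x.upper_value - x.lower_value for x in diff) == 80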
Example 18
    def test_add_variants_to_set_from_bed(self):

        common_entities = create_common_entities()
        project = common_entities['project']
        self.ref_genome_1 = common_entities['reference_genome']

        alignment_group = AlignmentGroup.objects.create(
            label='Alignment 1',
            reference_genome=self.ref_genome_1,
            aligner=AlignmentGroup.ALIGNER.BWA)

        (self.sample_1, created) = ExperimentSample.objects.get_or_create(
                project=project,
                label=SAMPLE_1_LABEL)

        sample_alignment = ExperimentSampleToAlignment.objects.create(
                alignment_group=alignment_group,
                experiment_sample=self.sample_1)

        # Create variants in the bed regions from best_test.bed
        for var_poor_map in range(20):
            variant = Variant.objects.create(
                    type=Variant.TYPE.TRANSITION,
                    reference_genome=self.ref_genome_1,
                    chromosome=Chromosome.objects.get(reference_genome=self.ref_genome_1),
                    position=random.randint(101, 200),
                    ref_value='A')

            vccd = VariantCallerCommonData.objects.create(
                variant=variant,
                source_dataset_id=1,
                alignment_group=alignment_group,
                data={}
            )

        for var_no_cov in range(20):
            variant = Variant.objects.create(
                    type=Variant.TYPE.TRANSITION,
                    reference_genome=self.ref_genome_1,
                    chromosome=Chromosome.objects.get(reference_genome=self.ref_genome_1),
                    position=random.randint(301, 400),
                    ref_value='A')

            vccd = VariantCallerCommonData.objects.create(
                variant=variant,
                source_dataset_id=1,
                alignment_group=alignment_group,
                data={}
            )

            variant = Variant.objects.create(
                    type=Variant.TYPE.TRANSITION,
                    reference_genome=self.ref_genome_1,
                    chromosome=Chromosome.objects.get(reference_genome=self.ref_genome_1),
                    position=random.randint(501, 600),
                    ref_value='A')

            vccd = VariantCallerCommonData.objects.create(
                variant=variant,
                source_dataset_id=1,
                alignment_group=alignment_group,
                data={}
            )

        new_bed_path = copy_dataset_to_entity_data_dir(
                entity=sample_alignment,
                original_source_location=TEST_BED)

        bed_dataset = add_dataset_to_entity(sample_alignment,
                dataset_label=Dataset.TYPE.BED_CALLABLE_LOCI,
                dataset_type=Dataset.TYPE.BED_CALLABLE_LOCI,
                filesystem_location=new_bed_path)

        vs_to_v_map = add_variants_to_set_from_bed(
                sample_alignment, bed_dataset)

        variant_set_labels = set([vs.label for vs in vs_to_v_map.keys()])
        self.assertEqual(set(['POOR_MAPPING_QUALITY', 'NO_COVERAGE']),
                variant_set_labels)

        for variant_set, variants in vs_to_v_map.items():
            for v in variants:
                # POOR MAPPING QUAL should be from 101 to 200
                if variant_set.label == 'POOR_MAPPING_QUALITY':
                    self.assertTrue(v.position in pyinter.closedopen(
                            101, 200))
                # NO COVERAGE should be from 301 to 400, 501 to 600
                elif variant_set.label == 'NO_COVERAGE':
                    self.assertTrue(v.position in pyinter.IntervalSet([
                                    pyinter.closedopen(301, 400),
                                    pyinter.closedopen(501, 600)]))
                else:
                    raise AssertionError(
                            'bad variant set %s made.' % variant_set.label)