Example #1
def read_table(filename, optfile=None):
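    # Parse VCF-style rows (CHROM, POS, ..., ALT, ..., INFO): pull AC (allele
    # count) and AN (allele number) out of the INFO column and store the
    # alternate-allele frequency AC/AN per chromosome, keyed by (pos, first ALT).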
    data = defaultdict(dict)
    with maybe_gzip_open(filename) as ifp:
        for line in ifp:
            line = line.strip()
            if not line or line.startswith('#'): continue
            tokens = line.split('\t')
            chrom, pos = tokens[:2]
            alt = tokens[4].split(',')[0]
            info = tokens[7]
            # Split AC and AN from INFO field
            fields = info.split(';')
            fields.sort()
            ac = an = None
            for field in fields:
                if field.startswith('AC='):
                    ac = int(field.split('=')[1])
                    if an is not None:
                        break
                elif field.startswith('AN='):
                    an = int(field.split('=')[1])
                    if ac is not None:
                        break

            assert ac is not None and an is not None, \
                   "Error: entry missing AC or AN: %s" % line
            
            data[chrom.lstrip('chr')][(int(pos), alt)] = float(ac) / an

    if optfile and not os.path.isfile(optfile):
        print >>sys.stderr, "Saving optimized table to:", optfile
        with maybe_gzip_open(optfile, 'wb') as ofp:
            cPickle.dump(data, ofp, cPickle.HIGHEST_PROTOCOL)

    return data
Example #2
File: gerp.py Project: buske/silva
def read_table(filename, optfile=None):
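    # Load a whitespace-delimited score table (one "value chrom pos" row per
    # line) into a nested dict: chrom -> {pos: score}.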
    data = defaultdict(dict)
    with maybe_gzip_open(filename) as ifp:
        for line in ifp:
            line = line.strip()
            if not line or line.startswith('#'): continue
            value, chrom, pos = line.split()
            data[chrom.lstrip('chr')][int(pos)] = float(value)

    if optfile and not os.path.isfile(optfile):
        print >>sys.stderr, "Saving optimized table to:", optfile
        with maybe_gzip_open(optfile, 'wb') as ofp:
            cPickle.dump(data, ofp, cPickle.HIGHEST_PROTOCOL)

    return data
Example #3
def script(filename, quiet=False, verbose=False, **kwargs):
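    # For every input sequence, collect its candidate 3' and 5' splice-site
    # sequences, score each side in a single score_sites() batch, and print
    # the field-wise maximum of the 3' and 5' score rows per sequence.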
    fields = ['MES', 'dMES', 'MES+', 'MES-', 'MEC-MC?', 'MEC-CS?', 'MES-KM?']
    print '#%s' % '\t'.join(fields)
    NULL = [None] * len(fields)

    seqs = []
    with maybe_gzip_open(filename) as ifp:
        for line in ifp:
            line = line.strip()
            if line:
                seqs.append(Seq(line))

    sites = {}
    scores = {}
    # Accumulate sites for each side
    for side in [3, 5]:
        sites[side] = set()
        for s in seqs:
            sites[side].update(s.iter_seqs(side))

        # Score ALL sites at once!
        scores[side] = score_sites(side, sites[side])

    # Print stats for each object, given queried scores
    for s in seqs:
        # Compute field-wise max of rows for 3' and 5'
        max_row = imap(max, s.score(3, scores[3]), s.score(5, scores[5]))
        print_row(max_row)
Example #4
def get_genes(gene_filename=None,
              cache_filename=None,
              genome_filename=None,
              **kwargs):
    """Loads (potentially cached) dict: gene_name -> set(genes)

    If not cached, genome_filename FASTA expected to provide sequence data
    """
    assert (gene_filename and genome_filename) or cache_filename
    if cache_filename is not None and os.path.isfile(cache_filename):
        print >> sys.stderr, "Loading genes from pickled file: %s" % cache_filename
        with maybe_gzip_open(cache_filename) as ifp:
            genes = cPickle.load(ifp)
    else:
        genome = Genome(genome_filename)

        genes = defaultdict(set)
        missed_chroms = set()
        n_zero_len = 0
        for entry in iter_ucsc_genes(gene_filename):
            chrom = entry['chrom']
            if not chrom.startswith('chr'):
                chrom = 'chr%s' % chrom

            if chrom not in genome:
                if chrom not in missed_chroms:
                    print >>sys.stderr, "Could not find sequence for %s" \
                        " in: %s" % (chrom, genome_filename)
                    missed_chroms.add(chrom)

                continue

            # Substitute id with gene name
            entry['seq'] = genome[chrom]
            try:
                t = Transcript(**entry)
            except AssertionError, e:
                if "Zero-length CDS" in str(e):
                    n_zero_len += 1
                else:
                    print >>sys.stderr, "Skipping transcript: %s: %s" \
                        % (entry['gene'], e)
                continue

            if t.valid():
                genes[entry['gene']].add(t)

        if n_zero_len:
            print >>sys.stderr, "Skipped %d transcripts with zero-length CDS" \
                " annotations" % n_zero_len

        if missed_chroms:
            print >>sys.stderr, "Missing sequences with gene annotations: %s" \
                % ', '.join(sorted(missed_chroms))

        genes = dict(genes)  # remove defaultdict
        if cache_filename:
            print >> sys.stderr, "Saving genes to pickled file: %s" % cache_filename
            with open(cache_filename, 'wb') as ofp:
                cPickle.dump(genes, ofp, cPickle.HIGHEST_PROTOCOL)

    return genes
Example #5
def read_examples(filename):
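    # Read a numeric matrix with a single '#'-prefixed header line; every data
    # row must have the same number of whitespace-delimited columns.  If the
    # first header column is named 'class', it is reported as the class column.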
    lines = []
    header = []
    ncols = None
    with maybe_gzip_open(filename) as ifp:
        header = ifp.readline().strip()
        assert header.startswith('#')
        header = header.replace('#', '')
        for line in ifp:
            line = line.strip()
            if not line: continue

            assert not line.startswith('#')
            tokens = [float(val) for val in line.split()]
            if ncols is None:
                ncols = len(tokens)
            else:
                assert ncols == len(tokens), \
                    "Found row in %s with %d columns (%d expected)" % (filename, len(tokens), ncols)

            lines.append(tokens)

    try:
        data = array(lines, dtype=float)
    except ValueError:
        print >> sys.stderr, "Error: could not convert rows of %s to a float array" % filename
        raise

    # Find class column
    cols = header.split()
    if cols[0] == 'class':
        class_col = 0
    else:
        class_col = None

    return header, class_col, data
Example #6
def iter_sequences(filename, domain=None, **kwargs):
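    # Each line encodes a mutation as pre[OLD/NEW]post, with '|' marking
    # exon boundaries.  Yield ((ref, mut) pre-mRNA, (ref, mut) mRNA) sequence
    # pairs, trimmed to `domain` nucleotides of flank on each side, shifting
    # the window toward the other side near sequence ends.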
    def get_mut_seqs(seq):
        pre, post = seq.split('/')
        pre, old = pre.split('[')
        new, post = post.split(']')

        if domain:
            pre_len = min(len(pre), domain)
            post_len = min(len(post), domain)
            # If too close to one end of the sequence, extend the other side to compensate
            if pre_len < domain:
                post_len = min(len(post), 2 * domain - pre_len)
            if post_len < domain:
                pre_len = min(len(pre), 2 * domain - post_len)

            pre = pre[-pre_len:]
            post = post[:post_len]
            assert len(pre) + len(post) == 2 * domain

        return pre + old + post, pre + new + post

    with maybe_gzip_open(filename) as ifp:
        for line in ifp:
            seq = line.strip().upper()
            try:
                premrna = seq.replace('|', '')
                postmrna = ''.join(seq.split('|')[::2])
                yield get_mut_seqs(premrna), get_mut_seqs(postmrna)
            except (ValueError, AssertionError):
                print >> sys.stderr, "Error, invalid sequence: %s" % seq
                yield None
Example #7
def annotate_variants(genes, filename):
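    # For each variant row, look up its transcript and print the variant
    # annotated with gene, transcript, strand, the affected codon, the frame
    # (position within the codon), and a pre-mRNA mutation string.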
    fields = ['chrom', 'pos', 'id', 'ref', 'alt', 'gene', 'tx', 'strand',
              'codon', 'frame', 'premrna']
    print '#%s' % '\t'.join(fields)
    with maybe_gzip_open(filename) as ifp:
        for line in ifp:
            line = line.rstrip()
            if not line or line.startswith('#'): continue

            tokens = line.split()
            chrom, pos, id, ref, alt, gene_id, tx_id = tokens[:7]
            chrom = chrom[3:] if chrom.startswith('chr') else chrom
            pos = int(pos)

            tx = get_transcript(genes, pos, ref, alt, gene_id, tx_id)
            if not tx:
                logging.warning('Transcript not found for variant: %s' % line)
                continue

            # Get codon, frame, and mrna
            cds_offset = tx.project_to_cds(pos)
            aa_pos = int(cds_offset / 3) + 1
            codon = tx.get_codon(aa_pos)
            frame = cds_offset % 3

            mut_str = tx.mutation_str(pos, ref, alt)
            print '\t'.join([chrom, str(pos), id, ref, alt, tx.gene(), tx.tx(),
                              tx.strand(), codon, str(frame), mut_str] + tokens[7:])
Example #8
def annotate_variants(genes, filename):
    fields = ['chrom', 'pos', 'id', 'ref', 'alt', 'gene', 'tx', 'strand',
              'codon', 'frame', 'premrna']
    print '#%s' % '\t'.join(fields)
    with maybe_gzip_open(filename) as ifp:
        for line in ifp:
            line = line.rstrip()
            if not line or line.startswith('#'): continue

            tokens = line.split()
            chrom, pos, id, ref, alt, gene_id, tx_id = tokens[:7]
            chrom = chrom[3:] if chrom.startswith('chr') else chrom
            pos = int(pos)

            tx = get_transcript(genes, pos, ref, alt, gene_id, tx_id)
            if not tx:
                continue

            # Get codon, frame, and mrna
            cds_offset = tx.project_to_cds(pos)
            aa_pos = int(cds_offset / 3) + 1
            codon = tx.get_codon(aa_pos)
            frame = cds_offset % 3

            mut_str = tx.mutation_str(pos, ref, alt)
            print '\t'.join([chrom, str(pos), id, ref, alt, tx.gene(), tx.tx(),
                              tx.strand(), codon, str(frame), mut_str] + tokens[7:])
Example #9
def script(filename, quiet=False, verbose=False, **kwargs):
    fields = ['f_premrna', 'f_mrna']  #, 'splice_dist']
    print '#%s' % '\t'.join(fields)
    with maybe_gzip_open(filename) as ifp:
        for line in ifp:
            line = line.strip().upper()
            assert line.count('/') == 1

            pre, post = line.split('/')
            # Trim off mutation nucs and brackets
            pre = pre[:-2]  # e.g. '[A'
            post = post[2:]  # e.g. 'C]'

            pre_chunks = pre.split('|')
            post_chunks = post.split('|')
            # Assume mutation is in exon

            # float() guards against Python 2 integer division truncating
            # these fractions to zero
            premrna_f = float(min(len(pre), len(post))) \
                           / (len(pre) + len(post) + 1)
            pre_cds = ''.join(pre_chunks[::2])
            post_cds = ''.join(post_chunks[::2])
            mrna_f = float(min(len(pre_cds), len(post_cds))) \
                        / (len(pre_cds) + len(post_cds) + 1)
            #splice_dist = min(len(pre_chunks[-1]), len(post_chunks[0]))

            print '%.4f\t%.4f' % (premrna_f, mrna_f)  #, splice_dist)
Example #10
def load_data(vector_filename, log=sys.stderr):
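    # Load a whitespace-delimited matrix; column 0 holds the solutions
    # (labels) and the remaining columns are features.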
    print >>log, "Loading vector data from file: %s" % vector_filename
    with maybe_gzip_open(vector_filename) as ifp:
        data = loadtxt(ifp, dtype=float)

    # Pop solution column
    solutions = data[:, 0]
    data = data[:,1:]
    print >>log, "Loaded data with %d examples and %d features" % data.shape

    return solutions, data
Example #11
File: pesx.py Project: buske/silva
def iter_sequences(filename):
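    # Each line holds one mutated exon written as ...[REF/ALT]...; yield the
    # (upstream, ref, alt, downstream) pieces, or None if the line is malformed.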
    seq_re = re.compile(r'([ACGT]*)\[([ACGT])/([ACGT])\]([ACGT]*)')
    with maybe_gzip_open(filename) as ifp:
        for line in ifp:
            seq = line.strip().upper()
            mut_exons = [chunk for chunk in seq.split('|') if '/' in chunk]
            assert len(mut_exons) == 1
            exon = mut_exons[0]
            m = seq_re.search(exon)
            if m:
                pre, old, new, post = m.groups()
                yield pre, old, new, post
            else:
                print >>sys.stderr, "Error, invalid sequence: %s" % seq
                yield None
Example #12
def iter_lines(filename):
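    # Each row supplies ref, alt, strand, codon, and the 0-based offset of the
    # variant within that codon; alleles on the '-' strand are complemented.
    # Yield (reference codon, mutated codon) pairs.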
    with maybe_gzip_open(filename) as ifp:
        for line in ifp:
            line = line.strip()
            if not line or line.startswith('#'): continue
            ref, alt, strand, codon, offset = line.split()[:5]
            assert strand in set(['+', '-', '.'])
            assert len(ref) == len(alt) == 1
            assert len(codon) == 3
            offset = int(offset)
            if strand == '-':
                ref = COMPLEMENT[ref]
                alt = COMPLEMENT[alt]
                
            assert codon[offset] == ref
            new_codon = codon[:offset] + alt + codon[offset+1:]

            yield codon, new_codon
Example #13
def random_controls(genes, filename, match_cpg=False, avoid_splice=False):
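    # For each input variant, pick a random synonymous site in the same
    # transcript (optionally matching CpG context and avoiding splice-adjacent
    # sites) and print it as a matched control variant.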
    fields = ['chrom', 'pos', 'id', 'ref', 'alt', 'gene', 'tx']
    print '#%s' % '\t'.join(fields)
    with maybe_gzip_open(filename) as ifp:
        for line in ifp:
            line = line.rstrip()
            if not line or line.startswith('#'): continue

            tokens = line.split()
            chrom, pos, id, ref, alt, gene_id, tx_id = tokens[:7]
            chrom = chrom[3:] if chrom.startswith('chr') else chrom
            pos = int(pos)
            tx = get_transcript(genes, pos, ref, alt, gene_id, tx_id)
            if not tx:
                continue

            if match_cpg:
                offset = tx.project_to_premrna(pos)
                pre = tx.premrna()[offset - 1:offset]
                post = tx.premrna()[offset + 1:offset + 2]
                tx_ref = ref
                tx_alt = alt
                if tx.strand() == '-':
                    tx_ref = ref.translate(COMPLEMENT_TAB)
                    tx_alt = alt.translate(COMPLEMENT_TAB)
                assert tx_ref == tx.premrna()[offset]
                cpg = bool((pre and pre[0] == 'C' and
                            (tx_ref == 'G' or tx_alt == 'G'))
                           or (post and post[0] == 'G' and
                               (tx_ref == 'C' or tx_alt == 'C')))
            else:
                cpg = None

            cds_offset, new_ref, new_alt = \
                random_synonymous_site(tx, cpg=cpg, avoid_splice=avoid_splice)
            new_pos = tx.project_from_cds(cds_offset)
            if tx.strand() == '-':
                new_ref = COMPLEMENT[new_ref]
                new_alt = COMPLEMENT[new_alt]

            print '\t'.join(
                [chrom,
                 str(new_pos), id, new_ref, new_alt, gene_id,
                 tx.tx()] + tokens[7:])
Example #14
def iter_ucsc_genes(filename):
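    # Parse UCSC-style gene annotation rows (refGene-like column layout) and
    # yield one dict of transcript coordinates and exon boundaries per line.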
    with maybe_gzip_open(filename) as ifp:
        for line in ifp:
            if line.startswith('#'): continue
            tokens = line.strip().split()
            bin, name, chrom, strand, txStart, txEnd, cdsStart, cdsEnd, \
                 exonCount, exonStarts, exonEnds, score, name2 = tokens[:13]
                 
            chrom = chrom[3:] if chrom.startswith('chr') else chrom
            exonStarts = exonStarts.strip(',').split(',')
            exonEnds = exonEnds.strip(',').split(',')

            yield {'chrom': chrom,
                   'tx_start': txStart,
                   'tx_end': txEnd,
                   'strand': strand,
                   'cds_start': cdsStart,
                   'cds_end': cdsEnd,
                   'exon_starts': exonStarts,
                   'exon_ends': exonEnds,
                   'gene': name2,
                   'tx': name}
Example #15
File: gerp.py Project: buske/silva
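# Fragment: the tail of read_table from gerp.py, followed by the module-level
# driver that loads the (optionally pickled) table and looks up a score for
# each chrom/pos read from stdin under a '#GERP++' header.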
            line = line.strip()
            if not line or line.startswith('#'): continue
            value, chrom, pos = line.split()
            data[chrom.lstrip('chr')][int(pos)] = float(value)

    if optfile and not os.path.isfile(optfile):
        print >> sys.stderr, "Saving optimized table to:", optfile
        with maybe_gzip_open(optfile, 'wb') as ofp:
            cPickle.dump(data, ofp, cPickle.HIGHEST_PROTOCOL)

    return data


if optfile and os.path.isfile(optfile):
    print >> sys.stderr, "Loading optimized table from:", optfile
    with maybe_gzip_open(optfile, 'rb') as ifp:
        table = cPickle.load(ifp)
else:
    print >> sys.stderr, "Loading table from:", tablefile
    table = read_table(tablefile, optfile)

print '#GERP++'
for line in sys.stdin:
    line = line.strip()
    if not line or line.startswith('#'): continue
    chrom, pos = line.split(None)
    try:
        value = '%.4f' % table[chrom.lstrip('chr')][int(pos)]
    except (IndexError, KeyError):
        value = 'na'
Example #16
def filter_variants(genes, filename, protein_coords=False):
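    # Keep only single-nucleotide variants that are synonymous in at least one
    # transcript; the longest matching transcript is reported for each variant.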
    # Do chromosomal binning to efficiently look up overlapping transcripts
    n_bins = 2048
    tx_locations = defaultdict(lambda: defaultdict(list))
    for gene, txs in genes.iteritems():
        for tx in txs:
            assert tx.gene() == gene
            start = tx._cds_start + 1
            end = tx._cds_end
            i = int(start / n_bins)
            j = int(end / n_bins)
            for bin in xrange(i, j+1):
                tx_locations[tx.chrom()][bin].append((start, end, tx))

    def find_overlapping_transcripts(chrom, pos):
        bin = int(pos / n_bins)
        txs = tx_locations[chrom][bin]
        return [tx for (start, end, tx) in txs if start <= pos <= end]
            
    fields = ['chrom', 'pos', 'id', 'ref', 'alt', 'gene', 'tx']
    print '#%s' % '\t'.join(fields)
    n_total = 0
    n_kept = 0
    with maybe_gzip_open(filename) as ifp:
        for line in ifp:
            line = line.rstrip()
            if not line or line.startswith('#'): continue
            n_total += 1

            tokens = line.split()
            if protein_coords:
                gene, codon, aa, mut = tokens[:4]
                rest = tokens[1:]
                match = get_transcript_from_protein(genes, gene, codon, 
                                                    aa, mut)
                if match is None:
                    tx = None
                else:
                    (tx, chrom, pos, ref, alt) = match
                    id = '.'
            else:
                chrom, pos, id, ref, alts = tokens[:5]
                rest = tokens[5:]
                chrom = chrom[3:] if chrom.startswith('chr') else chrom
                alt = alts.split(',')[0]
                # Only process SNVs
                if len(ref) != 1 or len(alt) != 1:
                    continue

                pos = int(pos)
                txs = []
                for tx in find_overlapping_transcripts(chrom, pos):
                    try:
                        if tx.is_synonymous(pos, ref, alt):
                            txs.append(tx)
                    except AssertionError:
                        continue

                tx = max(txs) if txs else None  # Take longest valid transcript

            if not tx:
                continue

            n_kept += 1
            print '\t'.join([chrom, str(pos), id, ref, alt, tx.gene(), tx.tx()] + rest)

        print >>sys.stderr, "Found %d synonymous variants (%d dropped)" % \
              (n_kept, n_total - n_kept)
Example #17
def filter_variants(genes, filename, protein_coords=False):
    # Do chromosomal binning to efficiently look up overlapping transcripts
    n_bins = 2048
    tx_locations = defaultdict(lambda: defaultdict(list))
    for gene, txs in genes.iteritems():
        for tx in txs:
            assert tx.gene() == gene
            start = tx._cds_start + 1
            end = tx._cds_end
            i = int(start / n_bins)
            j = int(end / n_bins)
            for bin in xrange(i, j+1):
                tx_locations[tx.chrom()][bin].append((start, end, tx))

    def find_overlapping_transcripts(chrom, pos):
        bin = int(pos / n_bins)
        txs = tx_locations[chrom][bin]
        return [tx for (start, end, tx) in txs if start <= pos <= end]
            
    fields = ['chrom', 'pos', 'id', 'ref', 'alt', 'gene', 'tx']
    print '#%s' % '\t'.join(fields)
    n_total = 0
    n_kept = 0
    with maybe_gzip_open(filename) as ifp:
        for line in ifp:
            line = line.rstrip()
            if not line or line.startswith('#'): continue
            n_total += 1

            tokens = line.split()
            if protein_coords:
                gene, codon, aa, mut = tokens[:4]
                rest = tokens[1:]
                match = get_transcript_from_protein(genes, gene, codon, 
                                                    aa, mut)
                if match is None:
                    tx = None
                else:
                    (tx, chrom, pos, ref, alt) = match
                    id = '.'
            else:
                chrom, pos, id, ref, alts = tokens[:5]
                rest = tokens[5:]
                chrom = chrom[3:] if chrom.startswith('chr') else chrom
                alt = alts.split(',')[0]
                # Only process SNVs
                ref = ref.strip()
                alt = alt.strip()
                if len(ref) != 1 or len(alt) != 1:
                    logging.debug('Dropping non-SNP line: %s' % line)
                    continue

                pos = int(pos)
                txs = []
                for tx in find_overlapping_transcripts(chrom, pos):
                    try:
                        if tx.is_synonymous(pos, ref, alt):
                            txs.append(tx)
                    except AssertionError:
                        continue

                tx = max(txs) if txs else None  # Take longest valid transcript

            if not tx:
                logging.debug('Variant is not synonymous on any transcript: %s' % line)
                continue

            n_kept += 1
            print '\t'.join([chrom, str(pos), id, ref, alt, tx.gene(), tx.tx()] + rest)

        logging.info("Found %d synonymous variants (%d dropped)" % \
              (n_kept, n_total - n_kept))