def check_gene_proximity(record, dist, direction):
    '''(rec, int, str) -> bool
    Returns True if any site fetched from the window next to `record`
    is genic, False otherwise.

    direction = 'u' for upstream: fetches (record.pos, record.pos + dist)
    direction = 'd' for downstream: fetches (record.pos - dist, record.pos)
    Any other value prints a usage message and returns False.

    The annotation table path is read from the module-level name `table`.
    A failure while fetching or iterating (e.g. the window runs off the end
    of the chromosome) is treated as 'no gene nearby' and returns False.
    '''
    if direction == 'u':
        left, right = record.pos, record.pos + dist
    elif direction == 'd':
        left, right = record.pos - dist, record.pos
    else:
        print('Invalid argument provided to direction.')
        print('Valid arguments are: u (upstream) and d (downstream)')
        return False

    p = antr.Reader(table)
    try:  # in case site hits end of chrom
        # any() stops at the first genic site, like the original break
        return any(
            site.is_genic for site in p.fetch(record.chrom, left, right))
    except Exception:
        # deliberate best-effort: an unreadable window counts as 'not near'
        return False
def gc_content_calc(consensus, annotation, windowsize, chrom, outfile):
    ''' (str, str, int, str, str) -> None
    walks along `chrom` in windows of `windowsize` and, for each window,
    writes one space-separated line to `outfile` containing:

    start end GC GC4 N 4D_sites total_sites rho_total rho_count

    GC / total_sites count every site whose consensus base is G/C or
    A/C/G/T respectively; GC4 / 4D_sites are the same counts restricted to
    records flagged is_fold4; N counts consensus 'N' bases.
    rho_total / rho_count sum and count the ld_rho values that are not 'NA'.

    the consensus base for a record is consensus[record.pos - 1].
    '''
    print('Selected chromosome {chrom}'.format(chrom=chrom))
    chrom_length = antr.chlamy_lengths()[chrom]
    count_keys = [
        'GC_count', 'GC4_count', 'N_count', '4D_sites', 'total_sites'
    ]
    with open(outfile, 'w') as f:
        f.write(
            'start end GC GC4 N 4D_sites total_sites rho_total rho_count\n')
        for window_start in tqdm(range(0, chrom_length, windowsize)):
            # last window is clipped to the chromosome length
            window_end = min(window_start + windowsize, chrom_length)
            counter = OrderedDict.fromkeys(count_keys, 0)
            # both rho fields start as floats, so rho_count prints as e.g. 3.0
            rho = OrderedDict.fromkeys(['rho_total', 'rho_count'], 0.0)

            reader = antr.Reader(annotation)
            for site in reader.fetch(chrom, window_start, window_end):
                base = consensus[site.pos - 1]
                if site.ld_rho != 'NA':
                    rho['rho_total'] += site.ld_rho
                    rho['rho_count'] += 1

                is_gc = base in ('G', 'C')
                if is_gc or base in ('A', 'T'):
                    counter['total_sites'] += 1
                    if is_gc:
                        counter['GC_count'] += 1
                    if site.is_fold4:
                        counter['4D_sites'] += 1
                        if is_gc:
                            counter['GC4_count'] += 1
                elif base == 'N':
                    counter['N_count'] += 1

            fields = [window_start, window_end]
            fields.extend(counter.values())
            fields.extend(rho.values())
            f.write(' '.join(str(field) for field in fields) + '\n')
def gc_calc(chromosome, window, table):
    '''Returns GC content in given window as proportion.

    window is indexable as (start, end). The reference bases of all records
    fetched for the window are concatenated, and the number of G + C
    characters is divided by the length of that sequence, so a window with
    no records raises ZeroDivisionError.
    '''
    reader = antr.Reader(table)
    sequence = ''.join(
        site.ref for site in reader.fetch(chromosome, window[0], window[1]))
    gc_sites = sum(sequence.count(base) for base in ('G', 'C'))
    return gc_sites / len(sequence)
def _write_tract_row(handle, chrom, start, end, rho_vals, rho_count):
    '''Write one tab-separated intergenic tract row to the open file `handle`.'''
    # tract_size formula kept exactly as in the original genic-boundary branch
    tract_size = 0 if start == end else end - start - 1
    # tracts without any rho estimate get a window value of 0.0
    rho_window = rho_vals / rho_count if rho_count else 0.0
    row = [chrom, start, end, rho_vals, rho_count, tract_size, rho_window]
    handle.write('\t'.join(str(item) for item in row) + '\n')


def get_lengths(table, out):
    ''' (str, str) -> None
    iterates through the annotation table and writes one tab-separated row
    per run of consecutive intergenic records ('tract') to `out`, with
    columns:

    chrom start end rho_vals rho_count tract_size rho_window

    rho_vals / rho_count are the sum and number of non-'NA' ld_rho values in
    the tract; rho_window is their ratio (0.0 if the tract has no estimates).

    a tract is closed when:
    - a genic record is reached (end = that record's pos - 1)
    - the chromosome changes (end = last intergenic pos seen on the old
      chromosome; the row is written with the old chromosome's name)
    - the table ends while a tract is still open
    '''
    colnames = [
        'chrom', 'start', 'end', 'rho_vals', 'rho_count', 'tract_size',
        'rho_window'
    ]
    with open(out, 'w') as f:
        f.write('\t'.join(colnames) + '\n')
        p = antr.Reader(table)
        in_tract = False
        current_chrom, start, last_pos = None, None, None
        rho_vals, rho_count = 0.0, 0
        for record in tqdm(p):
            # hit end of chrom with a tract still open: close it using the
            # coordinates of the chromosome it actually belongs to
            if in_tract and record.chrom != current_chrom:
                _write_tract_row(f, current_chrom, start, last_pos, rho_vals,
                                 rho_count)
                in_tract = False

            if record.is_intergenic:
                if not in_tract:
                    in_tract = True
                    current_chrom, start = record.chrom, record.pos
                    rho_vals, rho_count = 0.0, 0  # reset
                if record.ld_rho != 'NA':
                    rho_vals += record.ld_rho
                    rho_count += 1
                last_pos = record.pos
            elif record.is_genic and in_tract:
                _write_tract_row(f, current_chrom, start, record.pos - 1,
                                 rho_vals, rho_count)
                in_tract = False

        # table ended inside a tract
        if in_tract:
            _write_tract_row(f, current_chrom, start, last_pos, rho_vals,
                             rho_count)
def get_tract_rho(table, chrom, start, end):
    ''' (str, str, int, int) -> (float, int)
    helper function that returns cumulative sum of rho + count of sites
    with a rho estimate in input window

    records whose ld_rho is 'NA' are skipped, matching the other rho
    accumulators in this file, so they contribute to neither value.
    '''
    p = antr.Reader(table)
    rho_vals, rho_count = 0.0, 0
    for record in p.fetch(chrom, start, end):
        if record.ld_rho == 'NA':
            continue
        rho_vals += record.ld_rho
        rho_count += 1
    return rho_vals, rho_count
# Example #6
# 0
def create_lookup(table, context_size, chrom):
    '''(str, int, str) -> (str, str)
    uses annotation table to create lookup strings for all sites on chrom

    returns (genic_lookup, proximity_lookup).

    genic_lookup codes:
    g: genic
    i: intergenic
    N: unknown / padding

    proximity_lookup codes:
    g: genic
    0: intergenic
    1: site upstream of gene (a genic site within the next context_size
       characters, starting at the site itself)
    2: site downstream of gene (a genic site within the previous
       context_size characters)
    3: site upstream and downstream of gene
    N: unknown / padding

    both strings are left-padded with N so that string[pos] describes the
    record at position pos, which is how proximity_lookup[record.pos] is
    indexed elsewhere in this file.
    NOTE(review): this assumes the fetched records are contiguous in pos -
    confirm against the annotation table.
    '''
    p = antr.Reader(table)
    start = next(p.fetch(chrom)).pos

    # always pad up to the first position so the index convention does not
    # depend on where the chromosome's first record happens to sit
    genic_codes = ['N'] * start
    for rec in p.fetch(chrom):
        if rec.is_genic:
            genic_codes.append('g')
        elif rec.is_intergenic:
            genic_codes.append('i')
        else:
            genic_codes.append('N')  # keep one character per record
    genic_lookup = ''.join(genic_codes)

    proximity_code = {
        (True, False): '1',
        (False, True): '2',
        (True, True): '3',
        (False, False): '0',
    }
    proximity_codes = ['N'] * start
    for pos in tqdm(range(start, len(genic_lookup))):
        state = genic_lookup[pos]
        if state != 'i':
            proximity_codes.append(state)  # 'g' or 'N' carried over as-is
            continue
        upstream = 'g' in genic_lookup[pos:pos + context_size]
        # clamp at 0: a negative slice start would wrap to the string's end
        downstream = 'g' in genic_lookup[max(0, pos - context_size):pos]
        proximity_codes.append(proximity_code[(upstream, downstream)])

    return genic_lookup, ''.join(proximity_codes)
# Example #7
# 0
def parse_utrs(bed, lookup, start_arrays, table, out):
    ''' (str, dict, dict, str, str) -> None
    uses lookups made above + annotation to both get rho in
    3' UTRs and 'assign' them their nearest intergenic tracts

    bed: headerless tab-separated file with columns chrom, start, end, info
    lookup: lookup[chrom][tract_start] gives the tract end
    start_arrays: start_arrays[chrom] holds tract starts; it is indexed with
        a boolean mask and queried with .size / .min()
        (presumably a numpy array - confirm against caller)
    table: annotation table path
    out: tab-separated output file

    for each UTR, the assigned tract is the one with the smallest start
    greater than the UTR end; UTRs without such a tract are skipped.
    utr3_rho_window is 0 when the UTR has no non-'NA' rho estimates.
    exits with status 1 if a fetched record is not flagged is_utr3.
    '''
    fieldnames_out = [
        'chrom', 'utr3_start', 'utr3_end', 'start', 'end',
        'utr3_rho_vals', 'utr3_rho_count', 'utr3_rho_window'
        ]
    with open(out, 'w', newline='') as f_out:
        writer = csv.DictWriter(f_out, delimiter='\t', fieldnames=fieldnames_out)
        writer.writeheader()
        p = antr.Reader(table)  # one reader serves every fetch below
        with open(bed, 'r', newline='') as f_in:
            fieldnames = ['chrom', 'start', 'end', 'info']
            reader = csv.DictReader(f_in, delimiter='\t', fieldnames=fieldnames)
            print('Parsing UTRs...')
            for utr in tqdm(reader):
                chrom = utr['chrom']
                start = int(utr['start'])
                end = int(utr['end'])
                chrom_starts = start_arrays[chrom]
                possible_starts = chrom_starts[chrom_starts > end]
                if not possible_starts.size:
                    continue # UTR is next to 'end of chromosome' tract
                tract_start = possible_starts.min()

                utr3_rho_vals = 0.0
                utr3_rho_count = 0
                for record in p.fetch(chrom, start, end):
                    # explicit check: an assert would vanish under python -O
                    if not record.is_utr3:
                        print('Error - not UTR?')
                        print(record.chrom, record.pos)
                        sys.exit(1)
                    if record.ld_rho != 'NA':
                        utr3_rho_vals += record.ld_rho
                        utr3_rho_count += 1

                if utr3_rho_count:
                    utr3_rho_window = utr3_rho_vals / utr3_rho_count
                else:
                    utr3_rho_window = 0
                writer.writerow({
                    'chrom': chrom, 'utr3_start': start, 'utr3_end': end,
                    'start': tract_start, 'end': lookup[chrom][tract_start],
                    'utr3_rho_vals': utr3_rho_vals,
                    'utr3_rho_count': utr3_rho_count,
                    'utr3_rho_window': utr3_rho_window})
# Example #8
# 0
    def singlecalc(feature_type, strand, distance, table, chromosome, region,
                   start, end):
        '''Prints 'feature_type dist rho' for every record in region.

        region is indexable as (left, right) and is fetched from `table` on
        `chromosome`. dist is the record position relative to the feature:

        TSS, +: record.pos - start
        TSS, -: end - record.pos
        TES, +: record.pos - end
        TES, -: start - record.pos

        `distance` is accepted for interface compatibility but not used.
        Raises ValueError for any other feature_type / strand combination.
        '''
        offsets = {
            ('TSS', '+'): lambda pos: pos - start,
            ('TSS', '-'): lambda pos: end - pos,
            ('TES', '+'): lambda pos: pos - end,
            ('TES', '-'): lambda pos: start - pos,
        }
        # fail early instead of hitting an unbound (or stale) dist mid-loop
        if (feature_type, strand) not in offsets:
            raise ValueError(
                'Unsupported feature_type/strand combination: {} {}'.format(
                    feature_type, strand))
        offset = offsets[(feature_type, strand)]

        p = antr.Reader(table)

        for record in p.fetch(chromosome, region[0], region[1]):
            rho = record.ld_rho
            dist = offset(record.pos)

            out = ' '.join([feature_type, str(dist), str(rho)])
            print(out)
# Example #9
# 0
def parse_tracts(fname, table, outname):
    ''' (str, str, str) -> None
    for each intergenic tract in the tab-separated file `fname`, scans the
    annotation table from tract end + 1 onwards and sums rho over the run of
    UTR sites (is_utr5 or is_utr3) that follows the tract.

    writes the input tract columns plus utr_start, utr_end, utr_rho_vals,
    utr_rho_count and utr_rho_window to `outname` (tab-separated).

    - tracts with start > end are skipped
    - if the first site after the tract is not a UTR site, a diagnostic is
      printed and the tract is skipped
    - a row is written when the first non-UTR site after the UTR run is
      reached; nothing is written if the fetch ends before that
    - ld_rho values of 'NA' are not added; utr_rho_window is 0 when the UTR
      run has no rho estimates
    '''
    with open(outname, 'w', newline='') as f_out:
        fieldnames = [
            'chrom', 'start', 'end', 'tract_size', 'rho_vals', 'rho_count',
            'rho_window', 'utr_start', 'utr_end', 'utr_rho_vals',
            'utr_rho_count', 'utr_rho_window'
        ]
        f_out.write('\t'.join(fieldnames) + '\n')
        writer = csv.DictWriter(f_out, fieldnames=fieldnames, delimiter='\t')
        lengths = antr.chlamy_lengths()
        p = antr.Reader(table)  # one reader serves every fetch below
        with open(fname, 'r', newline='') as f_in:
            reader = csv.DictReader(f_in, delimiter='\t')
            for tract in tqdm(reader):
                if int(tract['start']) > int(tract['end']):
                    continue
                chrom, utr_start = tract['chrom'], int(tract['end']) + 1
                utr_rho_vals = 0.0
                utr_rho_count = 0
                seen_utr = False
                for record in p.fetch(chrom, utr_start, lengths[chrom]):
                    if record.is_utr5 or record.is_utr3:
                        seen_utr = True
                        if record.ld_rho != 'NA':
                            utr_rho_vals += record.ld_rho
                            utr_rho_count += 1
                        continue
                    if not seen_utr:
                        # tract is not directly followed by a UTR site
                        print('wtf')
                        print(record.chrom, record.pos)
                        break
                    # first non-UTR site after the UTR run: emit and stop
                    out_dict = dict(tract)  # copy; leave the input row alone
                    out_dict['utr_start'] = utr_start
                    out_dict['utr_end'] = record.pos - 1
                    out_dict['utr_rho_vals'] = utr_rho_vals
                    out_dict['utr_rho_count'] = utr_rho_count
                    out_dict['utr_rho_window'] = (
                        utr_rho_vals / utr_rho_count if utr_rho_count else 0)
                    writer.writerow(out_dict)
                    break
def gene_proximal_per_tract(line, table, windowsize, split):
    ''' (dict, str, int, str) -> (float, int, float, int, int)
    takes in a single input tract 'line' from tsv
    and calculates rho at gene proximal sites

    returns (left_vals, left_count, right_vals, right_count, tract_size),
    where vals / count are the sum and number of non-'NA' ld_rho values in
    the left and right windows and tract_size = end - start.

    window layout:
    - tract_size > 2 * windowsize: windowsize bp at each end of the tract
    - windowsize < tract_size <= 2 * windowsize, depending on split:
        falsy:   tract is cut in half
        'left':  left window is windowsize bp, right gets the remainder
        'right': right window is windowsize bp, left gets the remainder
      any other split value raises ValueError
    - tract_size <= windowsize (ie entire tract is gene proximal): tract is
      cut in half and the halves are called 'left' and 'right'
      (despite both being < windowsize) to maintain structure of outfile

    halves are cut at start + tract_size // 2 so coordinates stay integers.
    '''
    chrom, start, end = line['chrom'], int(line['start']), int(line['end'])
    tract_size = end - start
    midpoint = start + tract_size // 2

    if tract_size > (2 * windowsize):
        left_start, left_end = start, start + windowsize
        right_start, right_end = end - windowsize, end
    elif tract_size > windowsize:
        if not split:
            # split into half for 'left' + 'right'
            left_start, left_end = start, midpoint
            right_start, right_end = midpoint, end
        elif split == 'left':
            left_start, left_end = start, start + windowsize
            right_start, right_end = left_end, end
        elif split == 'right':
            right_start, right_end = end - windowsize, end
            left_start, left_end = start, right_start
        else:
            raise ValueError(
                "split must be falsy, 'left' or 'right', got {!r}".format(
                    split))
    else:  # do split thing again
        # downstream script can add rho vals and count for full tract
        left_start, left_end = start, midpoint
        right_start, right_end = midpoint, end

    left_vals, right_vals = 0.0, 0.0
    left_count, right_count = 0, 0
    p = antr.Reader(table)
    for record in p.fetch(chrom, left_start, left_end):
        if record.ld_rho != 'NA':
            left_vals += record.ld_rho
            left_count += 1
    for record in p.fetch(chrom, right_start, right_end):
        if record.ld_rho != 'NA':
            right_vals += record.ld_rho
            right_count += 1
    return left_vals, left_count, right_vals, right_count, tract_size
# Example #11
# 0
    def windowcalc(feature_type, strand, distance, windowsize, table,
                   chromosome, region, start, end):
        '''Prints mean rho in consecutive windows across region.

        Where region is a tuple of size 2, indicating the start and the end of the region
        i.e. windowcalc('TES', '+', 20, 10, 'table.txt.gz', 'chromosome_2', (900, 1000), 16200, 17300)

        Window edges are range(region[0], region[1] + 1, windowsize), so a
        trailing partial window is not reported. For each window one line is
        printed:

        feature_type left right rho_mean rho_total count

        where left / right are the window edges relative to the feature:

        TSS, +: edge - start
        TSS, -: end - edge
        TES, +: edge - end
        TES, -: start - edge

        Records with ld_rho == 'NA' are skipped; rho_mean is 0 for a window
        without estimates. `distance` is accepted for interface
        compatibility but not used.
        Raises ValueError for any other feature_type / strand combination.
        '''
        relative = {
            ('TSS', '+'): lambda edge: edge - start,
            ('TSS', '-'): lambda edge: end - edge,
            ('TES', '+'): lambda edge: edge - end,
            ('TES', '-'): lambda edge: start - edge,
        }
        # fail early instead of printing stale/unbound window coordinates
        if (feature_type, strand) not in relative:
            raise ValueError(
                'Unsupported feature_type/strand combination: {} {}'.format(
                    feature_type, strand))
        to_relative = relative[(feature_type, strand)]

        windowlist = list(range(region[0], region[1] + 1, windowsize))

        p = antr.Reader(table)

        for windowleft, windowright in zip(windowlist, windowlist[1:]):
            rho_cumulative = 0.0
            count = 0

            for record in p.fetch(chromosome, windowleft, windowright):
                if record.ld_rho == 'NA':
                    continue
                rho_cumulative += record.ld_rho
                count += 1

            rho_out = rho_cumulative / count if count else 0

            windowout = ' '.join([
                str(feature_type), str(to_relative(windowleft)),
                str(to_relative(windowright)), str(rho_out),
                str(rho_cumulative), str(count)
            ])
            print(windowout)
def create_lookup(table, chrom):
    ''' (str, str) -> str
    builds a one-character-per-record annotation string for chrom, writes
    it to the file '<chrom>_temp_lookup' in the working directory and
    returns it

    modified from rcmb_correlates.py

    codes (first matching flag wins, checked in this order):
    i: intergenic
    c: cds
    n: intron
    5: utr5
    3: utr3
    N: unknown

    if the first record's pos is > 1, the string is prefixed with that many
    N characters.
    NOTE(review): the prefix is `pos` long only when pos > 1, so the
    index-to-position offset differs between chromosomes whose first record
    is at 1 and all others - confirm which convention callers expect.
    '''
    flag_codes = (
        ('is_intergenic', 'i'),
        ('is_in_CDS', 'c'),
        ('is_intronic', 'n'),
        ('is_utr5', '5'),
        ('is_utr3', '3'),
    )

    p = antr.Reader(table)
    first_pos = next(p.fetch(chrom)).pos
    codes = ['N'] * first_pos if first_pos > 1 else []

    for rec in tqdm(p.fetch(chrom)):
        for flag, code in flag_codes:
            if getattr(rec, flag):
                codes.append(code)
                break
        else:
            codes.append('N')

    lookup_string = ''.join(codes)
    with open(chrom + '_temp_lookup', 'w') as f:
        f.write(lookup_string)

    return lookup_string
# Example #13
# 0
def SFS_from_antr(table,
                  chromosome,
                  start,
                  end,
                  min_alleles=None,
                  neutral_only=False,
                  counter=False):
    '''Returns site-weighted diversity for the records in a region.

    Each fetched record's quebec_alleles is passed to MAF_from_allele_count;
    records for which that call raises TypeError are skipped, as are records
    with fewer than min_alleles called alleles (when min_alleles is set).
    With neutral_only, records that are none of intergenic / intronic /
    fold4 are skipped as well.

    One SFS is kept per number of called alleles. The result is
    sum(theta_pi * sites) / sum(sites) over those SFSs, so a region with no
    usable records raises ZeroDivisionError.

    Returns diversity, or (diversity, record_count) if counter is truthy,
    where record_count is the number of records added to an SFS.
    '''
    spectra = {}
    record_count = 0
    reader = antr.Reader(table)
    for site in reader.fetch(chromosome, start, end):
        # diversity calc
        allele_counts = site.quebec_alleles
        if neutral_only:
            neutral_flags = (site.is_intergenic, site.is_intronic,
                             site.is_fold4)
            if True not in neutral_flags:
                continue
        try:
            MAF, n_called = MAF_from_allele_count(
                allele_counts, min_alleles=min_alleles)
        except TypeError:
            continue
        # filter sites that don't have enough called alleles
        if min_alleles and n_called < min_alleles:
            continue
        if n_called not in spectra:
            spectra[n_called] = SFS([0] * (n_called + 1))
        spectra[n_called].add(MAF, n_called)
        record_count += 1

    weighted_pi = sum([sfs.theta_pi() * sfs.sites() for sfs in spectra.values()])
    total_sites = sum([sfs.sites() for sfs in spectra.values()])
    diversity = weighted_pi / total_sites
    return (diversity, record_count) if counter else diversity
def parse_crossovers(filename, table, out):
    ''' (str, str, str) -> None
    reads crossovers from the tab-separated file `filename` and, for each
    one, sums rho over the records fetched between its left_bound and
    right_bound on its chromosome.

    writes the input columns plus rho_total, rho_count and rho_window
    (rho_total / rho_count) to `out`, using csv's default comma delimiter.

    ld_rho values of 'NA' are left out of the sum and the count; rho_window
    is 0 when an interval has no rho estimates.
    '''
    # newline='' is what the csv module expects for files it reads/writes
    with open(filename, 'r', newline='') as f:
        crossovers = list(csv.DictReader(f, delimiter='\t'))
    with open(out, 'w', newline='') as f_out:
        fieldnames = [
            'cross', 'tetrad', 'individual', 'chromosome', 'left_bound',
            'right_bound', 'mid_point', 'length', 'rho_total', 'rho_count',
            'rho_window'
        ]
        writer = csv.DictWriter(f_out, fieldnames=fieldnames)
        writer.writeheader()

        p = antr.Reader(table)  # one reader serves every fetch below
        for co in tqdm(crossovers):
            chrom = str(co['chromosome'])
            start, end = int(co['left_bound']), int(co['right_bound'])
            rho_vals = [
                record.ld_rho for record in p.fetch(chrom, start, end)
                if record.ld_rho != 'NA'
            ]
            rho_total = sum(rho_vals)
            rho_count = len(rho_vals)

            out_dict = dict(co)  # row values are strings; a flat copy is enough
            out_dict['rho_total'] = rho_total
            out_dict['rho_count'] = rho_count
            out_dict['rho_window'] = rho_total / rho_count if rho_count else 0

            writer.writerow(out_dict)
import antr
from tqdm import tqdm
import sys

# Script: print the annotation table given as the last command-line argument
# with an extra 'methylation' column, for every site listed in the beta
# value bed file. Everything is written to stdout.
table = sys.argv[-1]

# echo the existing header lines
p = antr.Reader(table)
for header_line in p.header:
    print(header_line.strip())

# header line describing the new column
print(
    '##methylation=beta value at CpG sites in three clones of CC2937. [FLOAT]')

# column-name line, with methylation added to the end of the colnames
p.cols.append('methylation')
p.cols = [colname.strip() for colname in p.cols]
print('#' + '\t'.join(p.cols))

# write records: columns 1, 2 and 4 of the bed file are chrom, pos and beta
with open('data/methylation/beta_vals_no_context.bed', 'r') as f:
    for line in tqdm(f):
        fields = [field.rstrip() for field in line.split('\t')]
        chrom = str(fields[0])
        c_pos = int(fields[1])
        beta = float(fields[3])

        # raw=True yields the table line as text, so the value is appended
        for record in antr.Reader(table).fetch(chrom, c_pos, c_pos + 1,
                                               raw=True):
            print(record + '\t' + str(beta))
                    ]]
                except IndexError:
                    continue
            else:
                continue

# Script fragment: per-exon rho summary, written to stdout.
# NOTE(review): eprint, table, first_exons, other_exons and last_exons are
# defined outside this excerpt, and the loop body below stops after the
# per-exon setup - the code that fills and prints the accumulators is not
# visible here.

# col headers for file
print('chromosome', 'start', 'end', 'length', 'type', 'order', 'rho',
      'total_rho', 'count')

# iterate through chromosomes (chromosome_1 .. chromosome_17)
for chrom in range(1, 18):
    eprint('starting', chrom)
    current_chrom = 'chromosome_{}'.format(str(chrom))
    eprint('current_chrom = ', current_chrom)
    p = antr.Reader(table)

    # exons
    exon_order = 0  # keep track of which dict for the 'order' column
    # 0 - first, 1 - other, 2 - last
    # NOTE(review): exon_order is not incremented in the visible code
    for coord_dict in [first_exons, other_exons, last_exons]:
        # each dict maps a chromosome name to an iterable of exon
        # coordinate pairs
        for exon in tqdm(coord_dict[current_chrom]):

            # a fresh reader is opened for every exon
            p = antr.Reader(table)

            # coordinates are converted to int before unpacking
            exon = [int(v) for v in exon]

            exon_start, exon_end = exon  # unpack
            exon_length = exon_end - exon_start
            # per-exon accumulators
            exon_total_rho = 0.0
            count = 0
def create_full_lookup(table, context_size, chrom):
    '''(str, int, str) -> (str, str)
    uses annotation table to create lookup strings
    for all sites (modified from rcmb_correlates.py)

    returns (genic_lookup, proximity_lookup).

    genic_lookup codes:
    g: genic
    i: intergenic
    N: unknown / padding

    proximity_lookup codes:
    c: CDS
    i: intronic
    f: utr5
    t: utr3
    0: intergenic (non-gene-proximate)
    1: site upstream of gene (a genic site within the next context_size
       characters, starting at the site itself)
    2: site downstream of gene (a genic site within the previous
       context_size characters)
    3: site upstream and downstream of gene
    N: unknown / padding

    if the first record's pos is > 1, both strings are prefixed with that
    many N characters, so string[x] then describes the record at position x.
    NOTE(review): an earlier version of this docstring stated that string[x]
    represents position x + 1, which only holds when the first record is at
    position 1 - confirm which convention callers expect.
    '''
    p = antr.Reader(table)
    start = next(p.fetch(chrom)).pos
    pad = start if start > 1 else 0
    initial_codes = ['N'] * pad
    genic_codes = ['N'] * pad

    lookup_codes = {
        'is_in_CDS': 'c',
        'is_intronic': 'i',
        'is_utr5': 'f',
        'is_utr3': 't',
        'is_intergenic': '0'
    }

    print('Initial run through...')
    for rec in tqdm(p.fetch(chrom)):
        if rec.is_genic:
            genic_codes.append('g')
        elif rec.is_intergenic:
            genic_codes.append('i')
        else:
            genic_codes.append('N')  # keep one character per record
        for annotation, code in lookup_codes.items():
            if getattr(rec, annotation):
                initial_codes.append(code)
                break
        else:
            initial_codes.append('N')  # keep one character per record
    genic_lookup = ''.join(genic_codes)

    print('Done.')
    print('Generating full string...')
    proximity_code = {
        (True, False): '1',
        (False, True): '2',
        (True, True): '3',
        (False, False): '0',
    }
    proximity_codes = ['N'] * pad
    for pos in tqdm(range(pad, len(genic_lookup))):
        state = genic_lookup[pos]
        if state == 'g':
            proximity_codes.append(initial_codes[pos])
        elif state == 'i':
            upstream = 'g' in genic_lookup[pos:pos + context_size]
            # clamp at 0: a negative slice start would wrap to the end
            downstream = 'g' in genic_lookup[max(0, pos - context_size):pos]
            proximity_codes.append(proximity_code[(upstream, downstream)])
        else:
            proximity_codes.append('N')

    return genic_lookup, ''.join(proximity_codes)
def main(table, windowsize, correlates, gc_content, gene_context):
    '''Prints windowed rho summaries for chromosome_1 .. chromosome_17.

    table: annotation table path
    windowsize: window size in bp
    correlates: annotation names passed to attr_fetch (may be empty/None)
    gc_content: if truthy, also report GC content per window
    gene_context: if truthy, intergenic sites within this many bp of a gene
        are counted as 'upstream', 'downstream' or 'both' instead of
        'intergenic'

    output (space-separated, to stdout):
    - correlates only: chromosome start end <totals> <counts> count
    - correlates + gc: chromosome start end <totals> <counts> GC count
    - gc only:         chromosome start end GC rho rho_total count

    sites whose ld_rho is 'NA' are ignored. chromosome lengths are read from
    the module-level name `lengths`.
    Raises ValueError if neither correlates nor gc_content is selected.
    '''
    # use the arguments that were passed in, not same-named globals
    gc = gc_content
    context_size = gene_context
    if not correlates and not gc:
        raise ValueError(
            'Nothing to compute: provide correlates and/or gc_content.')

    proximity_keys = ['upstream', 'downstream', 'both']

    # print column headers
    if correlates:
        correlates = list(correlates)  # don't extend the caller's list
        if context_size:  # ie upstream/downstream of genes
            correlates.extend(proximity_keys)
        title1 = ' '.join([item + '_total' for item in correlates])
        title2 = ' '.join([item + '_count' for item in correlates])
        if gc:
            print('chromosome', 'start', 'end', title1, title2, 'GC', 'count')
        else:
            print('chromosome', 'start', 'end', title1, title2, 'count')
    else:
        print('chromosome', 'start', 'end', 'GC', 'rho', 'rho_total', 'count')

    # iterate through chromosomes
    for chrom in range(1, 18):
        current_chrom = 'chromosome_{}'.format(str(chrom))
        windows = list(range(0, lengths[current_chrom],
                             windowsize)) + [lengths[current_chrom]]

        p = antr.Reader(table)

        for i in range(len(windows) - 1):
            window = (windows[i], windows[i + 1])

            if correlates:
                rho = OrderedDict.fromkeys(correlates, 0.0)
                count = OrderedDict.fromkeys(correlates, 0)
                total_counter = 0

                # iterate through records in window
                for record in tqdm(p.fetch(current_chrom, window[0],
                                           window[1])):
                    if record.ld_rho == 'NA':
                        continue
                    for key in rho:
                        # proximity classes are filled via 'intergenic'
                        if key in proximity_keys:
                            continue
                        if not attr_fetch(record, key):
                            continue

                        target = key
                        if key == 'intergenic' and context_size:
                            upstream = check_gene_proximity(
                                record, context_size, 'u')
                            downstream = check_gene_proximity(
                                record, context_size, 'd')
                            # gene-proximal sites are not classed as
                            # intergenic
                            if upstream and downstream:
                                target = 'both'
                            elif upstream:
                                target = 'upstream'
                            elif downstream:
                                target = 'downstream'

                        rho[target] += record.ld_rho
                        count[target] += 1
                        total_counter += 1

                totals = ' '.join([str(v) for v in rho.values()])
                counts = ' '.join([str(v) for v in count.values()])

            if gc:  # gc content option selected
                gc_rho = 0.0
                gc_counter = 0

                for record in tqdm(p.fetch(current_chrom, window[0],
                                           window[1])):
                    if record.ld_rho == 'NA':
                        continue
                    gc_rho += record.ld_rho
                    gc_counter += 1

                gc_window = gc_calc(current_chrom, window, table)
                # windows without any rho estimate report 0.0 per bp
                gc_rho_perbp = gc_rho / gc_counter if gc_counter else 0.0

                if correlates:
                    print(current_chrom, window[0], window[1], totals, counts,
                          gc_window, total_counter)
                else:
                    print(current_chrom, window[0], window[1], gc_window,
                          gc_rho_perbp, gc_rho, gc_counter)
            else:
                print(current_chrom, window[0], window[1], totals, counts,
                      total_counter)
# Example #19
# 0
def rho_annotations(table, windowsize, gene_context, chrom, out):
    '''(str, int, int, str, str) -> None
    iterates through annotation table and collects rho values for each annotation

    will first create a lookup string to speed up upstream/downstream/both calc

    for every window of `windowsize` bp on `chrom`, writes one
    space-separated line to `out`:

    chrom start end <annotation>_total... <annotation>_count... total_count

    sites whose ld_rho is 'NA' are ignored. an intergenic site is assigned
    to upstream / downstream / both / is_intergenic according to
    proximity_lookup[record.pos] ('1' / '2' / '3' / '0'); intergenic sites
    with any other lookup code are not counted. total_count is the number of
    values added across all categories in the window.

    chromosome lengths are read from the module-level name `lengths`.
    '''

    print('Chromosome {chrom} selected.'.format(chrom=chrom))
    print('Creating intergenic lookup...')
    genic_lookup, proximity_lookup = create_lookup(table, gene_context, chrom)
    print('Done.')
    proximity_keys = ['upstream', 'downstream', 'both']
    correlates = ['is_intergenic', 'is_utr5', 'is_in_CDS', 'is_intronic',
                  'is_utr3'] + proximity_keys
    # proximity lookup code -> output category for intergenic sites
    intergenic_targets = {
        '0': 'is_intergenic',
        '1': 'upstream',
        '2': 'downstream',
        '3': 'both',
    }

    windows = list(range(0, lengths[chrom], windowsize)) + [lengths[chrom]]
    p = antr.Reader(table)

    print('Starting windowed correlate calc...')
    with open(out, 'w') as f:

        # prep header
        title1 = ' '.join([item + '_total' for item in correlates])
        title2 = ' '.join([item + '_count' for item in correlates])
        header = ' '.join(['chrom', 'start', 'end', title1, title2, 'total_count'])
        f.write(header + '\n')

        # iterate through chromosome
        for i in tqdm(range(len(windows) - 1)):
            rho = OrderedDict.fromkeys(correlates, 0.0)
            count = OrderedDict.fromkeys(correlates, 0)
            total_count = 0

            for record in p.fetch(chrom, windows[i], windows[i+1]):
                if record.ld_rho == 'NA':
                    continue
                for key in rho:
                    # proximity classes are filled via 'is_intergenic'
                    if key in proximity_keys:
                        continue
                    if not getattr(record, key):
                        continue

                    if key == 'is_intergenic':
                        target = intergenic_targets.get(
                            proximity_lookup[record.pos])
                        if target is None:
                            continue
                    else:
                        target = key

                    rho[target] += record.ld_rho
                    count[target] += 1
                    # counted for every category, intergenic ones included
                    total_count += 1

            # prep output
            totals = ' '.join([str(v) for v in rho.values()])
            counts = ' '.join([str(v) for v in count.values()])

            line_out = ' '.join([chrom, str(windows[i]), str(windows[i+1]),
                                 totals, counts, str(total_count)])
            f.write(line_out + '\n')
    print('Complete.')
    print('File written to {out}.'.format(out=out))
    print('Good job!')
    print('Good job!')