Example #1
0
def gc_content_calc(consensus, annotation, windowsize, region, outfile):
    ''' (str, str, int, str, str) -> None
    Tallies GC content of the consensus sequence in consecutive windows
    across region, looking only at positions whose annotation record is
    flagged 4-fold degenerate (record.is_fold4).

    consensus  -- sequence for the region; indexed with record.pos - start,
                  so index 0 lines up with a record whose pos equals the
                  region start
    annotation -- annotation table handed to ant.Reader
    windowsize -- window length; the final window is clipped to the region end
    region     -- region string understood by parse_region
    outfile    -- path to write; an existing file is overwritten

    Writes a space-separated table: a header line, then one row per window
    with columns start end GC N total_sites. total_sites counts G/C/A/T
    sites only; Ns are tallied separately and any other character is ignored.
    '''
    chromosome, start, end = parse_region(region)
    with open(outfile, 'w') as f:
        f.write('start end GC N total_sites\n')
        # one reader serves every window; no need to reopen the table per window
        reader = ant.Reader(annotation)
        for window_start in tqdm(range(start, end, windowsize)):
            window_end = min(window_start + windowsize, end)
            gc_count = n_count = total_sites = 0
            for record in reader.fetch(chromosome, window_start, window_end):
                if not record.is_fold4:
                    continue
                # shift the table position to an offset into the region string
                base = consensus[record.pos - start]
                if base in ('G', 'C'):
                    gc_count += 1
                    total_sites += 1
                elif base in ('A', 'T'):
                    total_sites += 1
                elif base == 'N':  # Ns are reported but kept out of total_sites
                    n_count += 1
            row = (window_start, window_end, gc_count, n_count, total_sites)
            f.write(' '.join(str(value) for value in row) + '\n')
Example #2
0
def prep_header(table, outfile):
    ''' Writes the header of an ld_rho-extended annotation table to outfile.

    table   -- annotation table handed to ant.Reader
    outfile -- path to write; an existing file is overwritten

    Output is, in order: every header line of the table (stripped), a
    '##ld_rho=' description line, and the '#'-prefixed tab-separated column
    line with 'ld_rho' appended as the last column.
    '''
    with open(outfile, 'w') as f_out:
        reader = ant.Reader(table)

        # existing header lines first, each normalised to end in one newline
        f_out.writelines(entry.strip() + '\n' for entry in reader.header)
        f_out.write(
            '##ld_rho=LDhelmet recombination rate on Quebec dataset. p/bp [FLOAT]\n')

        # column-name line, extended with the new column
        reader.cols.append('ld_rho')
        reader.cols = [name.strip() for name in reader.cols]
        f_out.write('#{}\n'.format('\t'.join(reader.cols)))
Example #3
0
def write_lines(table, ldhelmet_dir, outfile):
    ''' Appends annotation-table records, each tagged with its rho, to outfile.

    table        -- annotation table handed to ant.Reader
    ldhelmet_dir -- location of the per-chromosome LDhelmet files; it is
                    concatenated directly with 'chromosome_<i>.txt', so it
                    must already end with a path separator
    outfile      -- path opened in append mode (existing content is kept)

    For chromosome_1 .. chromosome_17, reads the matching LDhelmet file,
    skipping lines that start with '#' or 'ver'. The first three
    space-separated fields of every other line are taken as start, end and
    rho. Each raw table record fetched for (start - 1, end - 1) on that
    chromosome is written out followed by a tab, the rho value and a newline.
    '''
    with open(outfile, 'a') as f_out:
        # a single reader is reused for every lookup rather than reopening
        # the table once per LDhelmet line
        reader = ant.Reader(table)
        for i in range(1, 18):
            chrom = 'chromosome_{}'.format(i)
            with open(ldhelmet_dir + chrom + '.txt') as f:
                for line in tqdm(f):
                    if line.startswith(('#', 'ver')):
                        continue
                    fields = line.split(' ')
                    start, end = int(fields[0]), int(fields[1])
                    rho = float(fields[2])
                    suffix = '\t' + str(rho) + '\n'
                    for record in reader.fetch(chrom, start - 1, end - 1,
                                               raw=True):
                        f_out.write(record + suffix)
Example #4
0
'''
add_ld_rho.py but it takes into account methylation

the methylation bed file has been split by chromosome in data/methylation/bed_split - should make it quicker to parse them

python3.5 add_rho_meth.py > [output table]

AH - 02/2018
'''

import ant
from tqdm import tqdm

p = ant.Reader('data/annotation_table.txt.gz')

# echo the table's existing header lines, stripped of surrounding whitespace
for item in p.header:
    print(item.strip())

# describe the two columns this script adds
print('##ld_rho=rate of recombination measured by LDhelmet on Quebec dataset. p/bp [FLOAT]')
print('##methylation=beta value at CpG sites in three clones of CC2937 [FLOAT]')

# register the new column names, then emit the '#'-prefixed column line
p.cols.extend(['ld_rho', 'methylation'])
p.cols = [name.strip() for name in p.cols]
print('#' + '\t'.join(p.cols))

def makelookup(chrom):
    filename = 'data/methylation/bed_split/{}.bed'.format(chrom)
    lookup = {}
    with open(filename, 'r') as m:
        for line in tqdm(m):

def checkrho(record, ld_dict):
    ''' Returns the rho value of the ld_dict entry whose key contains record.pos.

    record  -- object with a pos attribute
    ld_dict -- mapping from containers of positions (anything supporting
               `in`, presumably ranges) to rho values

    Every key is checked; if several contain record.pos, the value of the
    last one in iteration order is returned. Returns 0.0 when none match.
    '''
    rho = 0.0
    for interval, interval_rho in ld_dict.items():
        if record.pos in interval:
            rho = interval_rho
    return rho


# Report the mean rho and the record count for each site category of one
# chromosome, as space-separated lines under the header printed below.
# NOTE(review): `annotation`, `chrom` and `ldh_dict` are not bound anywhere in
# the visible part of this script - confirm they are defined before this point.
print('chrom type avgrho numrecords')

# Exonic sites. Each category builds a fresh reader/fetch and filters it on a
# different record flag.
p = ant.Reader(annotation).fetch(chrom)
exonic_rho = [checkrho(r, ldh_dict) for r in p if r.is_exonic]
were_exonic = len(exonic_rho)
# Rebinding the name to the mean drops the per-record list. Raises
# ZeroDivisionError when no record on the chromosome is exonic.
exonic_rho = sum(exonic_rho) / were_exonic  # overwrites massive list
print(chrom, 'exonic', exonic_rho, were_exonic)

# Intronic sites - same pattern, same ZeroDivisionError caveat.
p = ant.Reader(annotation).fetch(chrom)
intronic_rho = [checkrho(r, ldh_dict) for r in p if r.is_intronic]
were_intronic = len(intronic_rho)
intronic_rho = sum(intronic_rho) / were_intronic
print(chrom, 'intronic', intronic_rho, were_intronic)

# Genic sites - same pattern, same ZeroDivisionError caveat.
# NOTE(review): no matching print for the genic result is visible here.
p = ant.Reader(annotation).fetch(chrom)
genic_rho = [checkrho(r, ldh_dict) for r in p if r.is_genic]
were_genic = len(genic_rho)
genic_rho = sum(genic_rho) / were_genic