def gc_content_calc(consensus, annotation, windowsize, region, outfile):
    '''
    (str, str, int, str, str) -> None
    Tally GC content of the consensus sequence in fixed-size windows.

    region is split into chromosome, start and end by parse_region. The
    span start..end is walked in steps of windowsize, with the last window
    clipped to end. For every window the annotation records are fetched,
    and for records whose is_fold4 flag is set (the '4D' sites) the
    consensus base at that position is classified:
      G or C -> counted as GC and as a site
      A or T -> counted as a site only
      N      -> counted as N only (Ns never add to total_sites)
    Any other character is ignored.

    One space-separated line per window is written to outfile, under the
    header 'start end GC N total_sites'. outfile is overwritten.
    '''
    chromosome, start, end = parse_region(region)
    with open(outfile, 'w') as out:
        out.write('start end GC N total_sites\n')
        for window_start in tqdm(range(start, end, windowsize)):
            # the final window may be shorter than windowsize
            window_end = min(window_start + windowsize, end)
            gc_sites = n_sites = total_sites = 0
            reader = ant.Reader(annotation)
            for record in reader.fetch(chromosome, window_start, window_end):
                # consensus is indexed relative to the start of the region
                offset = record.pos - start
                if not record.is_fold4:
                    continue
                base = consensus[offset]
                if base in ('G', 'C'):
                    gc_sites += 1
                    total_sites += 1
                elif base in ('A', 'T'):
                    total_sites += 1
                elif base == 'N':
                    n_sites += 1
            fields = (window_start, window_end, gc_sites, n_sites, total_sites)
            out.write(' '.join(str(value) for value in fields) + '\n')
def prep_header(table, outfile):
    '''
    Start outfile with the annotation table header extended by ld_rho.

    table is handed to ant.Reader; outfile is opened in write mode, so any
    existing contents are replaced. Written, in order:
      1. every header line of the table, stripped of surrounding whitespace
      2. a '##ld_rho=...' line describing the new column
      3. the '#'-prefixed, tab-separated column names, with 'ld_rho'
         appended as the last column
    '''
    with open(outfile, 'w') as f_out:
        reader = ant.Reader(table)
        f_out.writelines(item.strip() + '\n' for item in reader.header)
        description = '##ld_rho=LDhelmet recombination rate on Quebec dataset. p/bp [FLOAT]'
        f_out.write(description + '\n')
        # register the new column, then strip stray whitespace from all names
        reader.cols.append('ld_rho')
        reader.cols = [name.strip() for name in reader.cols]
        f_out.write(''.join(('#', '\t'.join(reader.cols), '\n')))
def write_lines(table, ldhelmet_dir, outfile):
    '''
    Append annotation table records, each tagged with its rho, to outfile.

    For chromosomes 1 to 17 the file ldhelmet_dir + 'chromosome_N.txt' is
    read. ldhelmet_dir is concatenated as-is, so it has to carry its own
    trailing path separator. Lines starting with '#' or 'ver' are skipped.
    Every other line is split on single spaces into start (int), end (int)
    and rho (float). The raw table records fetched for chromosome_N over
    (start - 1, end - 1) are written to outfile, each followed by a tab,
    the rho value and a newline.

    outfile is opened in append mode and is never truncated here.
    '''
    for number in range(1, 18):
        chrom_name = 'chromosome_{}'.format(number)
        with open(outfile, 'a') as f_out, \
                open(ldhelmet_dir + chrom_name + '.txt') as f_in:
            for line in tqdm(f_in):
                if line.startswith(('#', 'ver')):
                    continue
                fields = line.split(' ')
                start = int(fields[0])
                end = int(fields[1])
                rho = float(fields[2])
                records = ant.Reader(table).fetch(
                    chrom_name, start - 1, end - 1, raw=True)
                # every record fetched for this line gets the same rho column
                suffix = '\t' + str(rho) + '\n'
                f_out.writelines(record + suffix for record in records)
'''
add_ld_rho.py but it takes into account methylation

the methylation bed file has been split by chromosome in
data/methylation/bed_split - should make it quicker to parse them

python3.5 add_rho_meth.py > [output table]

AH - 02/2018
'''

import ant
from tqdm import tqdm

# echo the existing header lines of the annotation table to stdout
p = ant.Reader('data/annotation_table.txt.gz')
for item in p.header:
    print(item.strip())

# describe the two columns this script adds
print('##ld_rho=rate of recombination measured by LDhelmet on Quebec dataset. p/bp [FLOAT]')
print('##methylation=beta value at CpG sites in three clones of CC2937 [FLOAT]')

# add column names
p.cols.append('ld_rho')
p.cols.append('methylation')
p.cols = [item.strip() for item in p.cols]
# sep = '' keeps the leading '#' attached to the first column name
print('#', '\t'.join(p.cols), sep = '')

# add new cols
# NOTE(review): only the opening of makelookup is visible here; it opens the
# per-chromosome bed file and iterates its lines, presumably to fill `lookup`
# - confirm against the full definition.
def makelookup(chrom):
    filename = 'data/methylation/bed_split/{}.bed'.format(chrom)
    lookup = {}
    with open(filename, 'r') as m:
        for line in tqdm(m):
def checkrho(record, ld_dict):
    '''
    Return the ld_dict value whose key contains record.pos.

    Each key is tested with `record.pos in key`, so keys must be containers
    (presumably ranges of positions - confirm against the code building
    ld_dict). Every key is examined: if more than one contains the
    position, the value of the last such key in iteration order is
    returned. If no key contains it, 0.0 is returned.
    '''
    matched_rho = 0.0
    for span, rho in ld_dict.items():
        if record.pos in span:
            matched_rho = rho
    return matched_rho


print('chrom type avgrho numrecords')

# exonic sites: mean rho and number of records
p = ant.Reader(annotation).fetch(chrom)
exonic_rho = [checkrho(site, ldh_dict) for site in p if site.is_exonic]
were_exonic = len(exonic_rho)
# rebinding the name lets the (potentially huge) list be freed
exonic_rho = sum(exonic_rho) / were_exonic
print(chrom, 'exonic', exonic_rho, were_exonic)

# intronic sites: a fresh fetch is made for each category
p = ant.Reader(annotation).fetch(chrom)
intronic_rho = [checkrho(site, ldh_dict) for site in p if site.is_intronic]
were_intronic = len(intronic_rho)
intronic_rho = sum(intronic_rho) / were_intronic
print(chrom, 'intronic', intronic_rho, were_intronic)

# genic sites
p = ant.Reader(annotation).fetch(chrom)
genic_rho = [checkrho(site, ldh_dict) for site in p if site.is_genic]
were_genic = len(genic_rho)
genic_rho = sum(genic_rho) / were_genic