def check_gene_proximity(record, dist, direction):
    '''(rec, int, str) -> bool

    Return True if any site within `dist` of `record` is genic.

    direction = 'u' searches record.pos .. record.pos + dist,
    direction = 'd' searches record.pos - dist .. record.pos.
    Any other value prints a usage message and returns False.

    The annotation table is read from the module-level name `table`.
    '''
    p = antr.Reader(table)
    if direction == 'u':
        start, end = record.pos, record.pos + dist
    elif direction == 'd':
        start, end = record.pos - dist, record.pos
    else:
        print('Invalid argument provided to direction.')
        print('Valid arguments are: u (upstream) and d (downstream)')
        return False
    try:
        # in case site hits end of chrom - the exception type raised by
        # antr for an out-of-range fetch is not visible from here, so any
        # ordinary error is treated as 'no gene nearby'.
        return any(site.is_genic for site in p.fetch(record.chrom, start, end))
    except Exception:
        return False
def gc_content_calc(consensus, annotation, windowsize, chrom, outfile):
    ''' (str, str, int, str, str) -> None

    Walks `chrom` in windows of `windowsize` and, for every window, writes
    one space-separated line to `outfile` holding:
        start end GC GC4 N 4D_sites total_sites rho_total rho_count

    GC / total_sites count A/C/G/T consensus bases, GC4 / 4D_sites are the
    same counts restricted to records flagged is_fold4, N counts 'N'
    consensus bases, and rho_total / rho_count accumulate ld_rho over
    records whose ld_rho is not 'NA'.
    '''
    print('Selected chromosome {chrom}'.format(chrom=chrom))
    chrom_length = antr.chlamy_lengths()[chrom]
    strong_bases = ('G', 'C')
    weak_bases = ('A', 'T')

    with open(outfile, 'w') as f:
        f.write(
            'start end GC GC4 N 4D_sites total_sites rho_total rho_count\n')
        for window_start in tqdm(range(0, chrom_length, windowsize)):
            # the last window is truncated at the chromosome end
            window_end = min(window_start + windowsize, chrom_length)

            gc_sites = gc4_sites = n_sites = fold4_sites = called_sites = 0
            # both rho tallies start as floats, so rho_count prints as e.g. 3.0
            rho_total = rho_count = 0.0

            reader = antr.Reader(annotation)
            for site in reader.fetch(chrom, window_start, window_end):
                # consensus is 0-indexed, record positions are 1-indexed
                base = consensus[site.pos - 1]
                if site.ld_rho != 'NA':
                    rho_total += site.ld_rho
                    rho_count += 1

                fold4 = site.is_fold4
                if base in strong_bases:
                    gc_sites += 1
                    called_sites += 1
                    if fold4:
                        gc4_sites += 1
                        fold4_sites += 1
                elif base in weak_bases:
                    called_sites += 1
                    if fold4:
                        fold4_sites += 1
                elif base == 'N':
                    n_sites += 1

            fields = [
                window_start, window_end, gc_sites, gc4_sites, n_sites,
                fold4_sites, called_sites, rho_total, rho_count
            ]
            f.write(' '.join(str(field) for field in fields) + '\n')
def gc_calc(chromosome, window, table):
    '''Returns GC content in given window as proportion.

    `window` is a (start, end) pair passed to antr.Reader(table).fetch.
    The proportion is the number of records whose `ref` is 'G' or 'C'
    divided by the total length of all `ref` strings in the window.
    Returns 0.0 when the window contains no records instead of raising
    ZeroDivisionError.
    '''
    total = 0
    GC = 0
    for record in antr.Reader(table).fetch(chromosome, window[0], window[1]):
        ref = record.ref
        total += len(ref)
        GC += ref.count('G') + ref.count('C')
    if total == 0:
        return 0.0
    return GC / total
def _tract_row(chrom, start, end, rho_vals, rho_count):
    '''Format one intergenic tract as a tab-separated output line.'''
    # NOTE(review): size formula kept from the original genic-boundary
    # branch (end - start - 1, or 0 for a single-site tract) - confirm it
    # is the intended definition of tract size.
    tract_size = 0 if start == end else end - start - 1
    # tracts without any rho estimate get a window value of 0.0 rather
    # than raising ZeroDivisionError
    rho_window = rho_vals / rho_count if rho_count else 0.0
    fields = [chrom, start, end, rho_vals, rho_count, tract_size, rho_window]
    return '\t'.join(str(item) for item in fields) + '\n'


def get_lengths(table, out):
    '''(str, str) -> None

    Scans every record of the annotation table and writes one
    tab-separated row per intergenic tract to `out` with columns
        chrom start end rho_vals rho_count tract_size rho_window

    A tract opens at the first intergenic record and closes
      - at the record before the next genic record, or
      - at the last intergenic record seen when the chromosome changes
        or the table ends.
    rho_vals / rho_count accumulate ld_rho over intergenic records whose
    ld_rho is not 'NA'.
    '''
    colnames = [
        'chrom', 'start', 'end', 'rho_vals', 'rho_count', 'tract_size',
        'rho_window'
    ]
    with open(out, 'w') as f:
        f.write('\t'.join(colnames) + '\n')
        p = antr.Reader(table)
        in_tract = False
        current_chrom = start = last_pos = None
        rho_vals = 0.0
        rho_count = 0
        for record in tqdm(p):
            if in_tract and record.chrom != current_chrom:
                # hit end of chrom: close the tract on the chromosome it
                # belongs to, using the last intergenic position seen there
                f.write(_tract_row(
                    current_chrom, start, last_pos, rho_vals, rho_count))
                in_tract = False
                rho_vals = 0.0
                rho_count = 0
            if record.is_intergenic:
                if not in_tract:
                    in_tract = True
                    current_chrom = record.chrom
                    start = record.pos
                if record.ld_rho != 'NA':
                    rho_vals += record.ld_rho
                    rho_count += 1
                last_pos = record.pos
            elif record.is_genic and in_tract:
                f.write(_tract_row(
                    current_chrom, start, record.pos - 1, rho_vals,
                    rho_count))
                in_tract = False
                rho_vals = 0.0
                rho_count = 0
        if in_tract:
            # table ended inside a tract - write it instead of dropping it
            f.write(_tract_row(
                current_chrom, start, last_pos, rho_vals, rho_count))
def get_tract_rho(table, chrom, start, end):
    ''' (str, str, int, int) -> (float, int)

    helper function that returns cumulative sum of rho + count of sites
    with a rho estimate in input window

    Records whose ld_rho is 'NA' are skipped (as the other rho summaries
    in this file do) instead of raising TypeError on the addition.
    '''
    p = antr.Reader(table)
    rho_vals, rho_count = 0.0, 0
    for record in p.fetch(chrom, start, end):
        if record.ld_rho == 'NA':
            continue
        rho_vals += record.ld_rho
        rho_count += 1
    return rho_vals, rho_count
def create_lookup(table, context_size, chrom):
    '''(str, int, str) -> (str, str)

    uses annotation table to create lookup strings for all sites on chrom

    Returns (genic_lookup, proximity_lookup).

    genic_lookup codes:
        g: genic
        i: intergenic
        N: unknown (padding, or record that is neither genic nor intergenic)

    proximity_lookup codes:
        g: genic
        0: intergenic
        1: site upstream of gene (gene within context_size at higher index)
        2: site downstream of gene (gene within context_size at lower index)
        3: site upstream and downstream of gene
        N: unknown

    NOTE(review): when the first record is at position > 1 the strings are
    padded with `start` Ns, so index == position; when it is at position 1
    no padding is added and index == position - 1. Kept as in the original;
    confirm which convention callers expect.
    '''
    p = antr.Reader(table)
    start = next(p.fetch(chrom)).pos
    padding = ['N'] * start if start > 1 else []

    # build with lists + join instead of repeated string concatenation
    genic = list(padding)
    for rec in p.fetch(chrom):
        if rec.is_genic:
            genic.append('g')
        elif rec.is_intergenic:
            genic.append('i')
        else:
            # keep one character per record so later indices stay aligned
            genic.append('N')
    genic_lookup = ''.join(genic)

    proximity = list(padding)
    for pos in tqdm(range(len(padding), len(genic_lookup))):
        state = genic_lookup[pos]
        if state != 'i':
            proximity.append(state)  # 'g' or 'N' carried over unchanged
            continue
        upstream = 'g' in genic_lookup[pos:pos + context_size]
        # clamp at 0: a negative slice start would wrap to the end of the
        # string and silently miss genes near the start of the chromosome
        downstream = 'g' in genic_lookup[max(0, pos - context_size):pos]
        if upstream and downstream:
            proximity.append('3')
        elif upstream:
            proximity.append('1')
        elif downstream:
            proximity.append('2')
        else:
            proximity.append('0')
    return genic_lookup, ''.join(proximity)
def parse_utrs(bed, lookup, start_arrays, table, out):
    ''' (str, dict, dict, str, str) -> None

    uses lookups made above + annotation to both get rho in 3' UTRs
    and 'assign' them their nearest intergenic tracts

    bed: headerless tab-separated file with columns chrom, start, end, info
    lookup: lookup[chrom][tract_start] gives the tract end
    start_arrays: start_arrays[chrom] is an array of tract starts that
        supports boolean-mask indexing, .size and .min()
    table: annotation table path passed to antr.Reader
    out: path of the tab-separated output file

    UTRs with no tract start beyond their end are skipped. If any record
    inside a UTR interval is not flagged is_utr3 the offending site is
    printed and the program exits with a non-zero status.
    '''
    fieldnames_out = [
        'chrom', 'utr3_start', 'utr3_end', 'start', 'end', 'utr3_rho_vals',
        'utr3_rho_count', 'utr3_rho_window'
    ]
    fieldnames = ['chrom', 'start', 'end', 'info']
    with open(out, 'w', newline='') as f_out:
        writer = csv.DictWriter(
            f_out, delimiter='\t', fieldnames=fieldnames_out)
        writer.writeheader()
        with open(bed, 'r', newline='') as f_in:
            reader = csv.DictReader(
                f_in, delimiter='\t', fieldnames=fieldnames)
            print('Parsing UTRs...')
            for utr in tqdm(reader):
                chrom = utr['chrom']
                start = int(utr['start'])
                end = int(utr['end'])
                chrom_starts = start_arrays[chrom]
                possible_starts = chrom_starts[chrom_starts > end]
                if not possible_starts.size:
                    continue  # UTR is next to 'end of chromosome' tract
                tract_start = possible_starts.min()
                utr3_rho_vals = 0.0
                utr3_rho_count = 0
                p = antr.Reader(table)
                for record in p.fetch(chrom, start, end):
                    # explicit check: an assert would be stripped under -O
                    if not record.is_utr3:
                        print('Error - not UTR?')
                        print(record.chrom, record.pos)
                        sys.exit(1)
                    if record.ld_rho != 'NA':
                        utr3_rho_vals += record.ld_rho
                        utr3_rho_count += 1
                if utr3_rho_count:
                    utr3_rho_window = utr3_rho_vals / utr3_rho_count
                else:
                    utr3_rho_window = 0
                writer.writerow({
                    'chrom': chrom,
                    'utr3_start': start,
                    'utr3_end': end,
                    'start': tract_start,
                    'end': lookup[chrom][tract_start],
                    'utr3_rho_vals': utr3_rho_vals,
                    'utr3_rho_count': utr3_rho_count,
                    'utr3_rho_window': utr3_rho_window,
                })
def singlecalc(feature_type, strand, distance, table, chromosome, region,
               start, end):
    '''Print "<feature_type> <dist> <rho>" for every record in region.

    region is a (start, end) pair passed to antr.Reader(table).fetch.
    dist is the record position relative to the feature anchor:
        TSS +: pos - start      TSS -: end - pos
        TES +: pos - end        TES -: start - pos
    `distance` is accepted but not used here.
    '''
    reader = antr.Reader(table)
    region_start, region_end = region[0], region[1]
    for site in reader.fetch(chromosome, region_start, region_end):
        rho = site.ld_rho
        if feature_type == 'TSS':
            if strand == '+':
                dist = site.pos - start
            elif strand == '-':
                dist = end - site.pos
        elif feature_type == 'TES':
            if strand == '+':
                dist = site.pos - end
            elif strand == '-':
                dist = start - site.pos
        print(' '.join([feature_type, str(dist), str(rho)]))
def parse_tracts(fname, table, outname):
    '''(str, str, str) -> None

    For every tract row in the tab-separated file `fname` (needs at least
    chrom, start, end columns), scans the annotation table from the site
    after the tract end and sums ld_rho over the run of UTR records
    (is_utr5 or is_utr3) that immediately follows it. The input row plus
    utr_start, utr_end, utr_rho_vals, utr_rho_count, utr_rho_window is
    written to `outname`.

    Rows with start > end are skipped. If the first record after the tract
    is not UTR, a diagnostic is printed and nothing is written for that
    tract. Records whose ld_rho is 'NA' are not added to the totals, and a
    UTR with no rho estimates gets utr_rho_window 0.
    '''
    fieldnames = [
        'chrom', 'start', 'end', 'tract_size', 'rho_vals', 'rho_count',
        'rho_window', 'utr_start', 'utr_end', 'utr_rho_vals',
        'utr_rho_count', 'utr_rho_window'
    ]
    with open(outname, 'w', newline='') as f_out:
        f_out.write('\t'.join(fieldnames) + '\n')
        writer = csv.DictWriter(f_out, fieldnames=fieldnames, delimiter='\t')
        lengths = antr.chlamy_lengths()
        with open(fname, 'r', newline='') as f_in:
            reader = csv.DictReader(f_in, delimiter='\t')
            for tract in tqdm(reader):
                if int(tract['start']) > int(tract['end']):
                    continue
                utr_rho_vals = 0.0
                utr_rho_count = 0
                chrom, utr_start = tract['chrom'], int(tract['end']) + 1
                chrom_length = lengths[chrom]
                p = antr.Reader(table)
                first_iter = True
                for record in p.fetch(chrom, utr_start, chrom_length):
                    in_utr = record.is_utr5 or record.is_utr3
                    if first_iter and not in_utr:
                        # tract is not followed by a UTR at all
                        print('wtf')
                        print(record.chrom, record.pos)
                        break
                    first_iter = False
                    if in_utr:
                        # skip sites without a rho estimate rather than
                        # failing on float + 'NA'
                        if record.ld_rho != 'NA':
                            utr_rho_vals += record.ld_rho
                            utr_rho_count += 1
                        continue
                    # first non-UTR record: the UTR ended one site earlier
                    out_dict = tract
                    out_dict['utr_start'] = utr_start
                    out_dict['utr_end'] = record.pos - 1
                    out_dict['utr_rho_vals'] = utr_rho_vals
                    out_dict['utr_rho_count'] = utr_rho_count
                    if utr_rho_count:
                        out_dict['utr_rho_window'] = (
                            utr_rho_vals / utr_rho_count)
                    else:
                        out_dict['utr_rho_window'] = 0
                    writer.writerow(out_dict)
                    break
def gene_proximal_per_tract(line, table, windowsize, split):
    ''' (dict, str, int, str) -> (float, int, float, int, int)

    takes in a single input tract 'line' from tsv (needs chrom, start, end
    keys) and calculates rho at gene proximal sites

    Returns (left_vals, left_count, right_vals, right_count, tract_size).

    tract_size > 2 * windowsize:
        left = first windowsize bp, right = last windowsize bp
    windowsize < tract_size <= 2 * windowsize:
        split falsy   -> tract is halved
        split 'left'  -> left = first 2000 bp, right = remainder
        split 'right' -> right = last 2000 bp, left = remainder
    tract_size <= windowsize (ie entire tract is gene proximal):
        tract is halved and the halves are called 'left' and 'right'
        (despite both being < windowsize) to maintain structure of outfile;
        downstream script can add rho vals and count for full tract

    Halving uses floor division so that coordinates passed to fetch stay
    integers. Raises ValueError for an unrecognised `split` value.
    '''
    chrom, start, end = line['chrom'], int(line['start']), int(line['end'])
    tract_size = end - start
    midpoint = start + tract_size // 2

    if tract_size > 2 * windowsize:
        left_start, left_end = start, start + windowsize
        right_start, right_end = end - windowsize, end
    elif tract_size > windowsize:
        if not split:  # split into half for 'left' + 'right'
            left_start, left_end = start, midpoint
            right_start, right_end = midpoint, end
        elif split == 'left':
            # NOTE(review): 2000 is hard-coded here rather than windowsize;
            # kept as in the original - confirm this is intended.
            left_start, left_end = start, start + 2000
            right_start, right_end = left_end, end
        elif split == 'right':
            right_start, right_end = end - 2000, end
            left_start, left_end = start, right_start
        else:
            raise ValueError(
                "split must be falsy, 'left' or 'right'; got {!r}".format(
                    split))
    else:
        left_start, left_end = start, midpoint
        right_start, right_end = midpoint, end

    left_vals, right_vals = 0.0, 0.0
    left_count, right_count = 0, 0
    p = antr.Reader(table)
    for record in p.fetch(chrom, left_start, left_end):
        if record.ld_rho != 'NA':
            left_vals += record.ld_rho
            left_count += 1
    for record in p.fetch(chrom, right_start, right_end):
        if record.ld_rho != 'NA':
            right_vals += record.ld_rho
            right_count += 1
    return left_vals, left_count, right_vals, right_count, tract_size
def windowcalc(feature_type, strand, distance, windowsize, table, chromosome,
               region, start, end):
    '''Print windowed mean rho across region, relative to a feature anchor.

    Where region is a tuple of size 2, indicating the start and the end of
    the region, i.e.
    windowcalc('TES', '+', <distance>, 20, 'table.txt.gz', 'chromosome_2',
               (900, 1000), 16200, 17300)

    The region is cut into consecutive windows of `windowsize`; a trailing
    partial window is not reported. For each window one space-separated
    line is printed:
        feature_type left right rho_mean rho_total count
    with left/right expressed relative to the anchor:
        TSS +: coord - start      TSS -: end - coord
        TES +: coord - end        TES -: start - coord

    Records whose ld_rho is 'NA' are skipped; a window without estimates
    reports a mean of 0. `distance` is accepted but not used here.
    Raises ValueError for an unsupported feature_type/strand combination.
    '''
    windowlist = list(range(region[0], region[1] + 1, windowsize))
    p = antr.Reader(table)
    for windowleft, windowright in zip(windowlist, windowlist[1:]):
        rho_cumulative = 0.0
        count = 0
        for record in p.fetch(chromosome, windowleft, windowright):
            if record.ld_rho == 'NA':
                continue
            rho_cumulative += record.ld_rho
            count += 1
        rho_out = rho_cumulative / count if count else 0

        if feature_type == 'TSS' and strand == '+':
            windowleft_out = windowleft - start
            windowright_out = windowright - start
        elif feature_type == 'TSS' and strand == '-':
            windowleft_out = end - windowleft
            windowright_out = end - windowright
        elif feature_type == 'TES' and strand == '+':
            windowleft_out = windowleft - end
            windowright_out = windowright - end
        elif feature_type == 'TES' and strand == '-':
            windowleft_out = start - windowleft
            windowright_out = start - windowright
        else:
            raise ValueError(
                'unsupported feature_type/strand: {!r}, {!r}'.format(
                    feature_type, strand))

        windowout = ' '.join([
            str(feature_type), str(windowleft_out), str(windowright_out),
            str(rho_out), str(rho_cumulative), str(count)
        ])
        print(windowout)
def create_lookup(table, chrom):
    ''' (str, str) -> str

    Builds a one-character-per-record annotation string for chrom
    (modified from rcmb_correlates.py), saves it to the file
    '<chrom>_temp_lookup' in the working directory and returns it.

    i: intergenic
    c: cds
    n: intron
    5: utr5
    3: utr3
    N: unknown

    If the first record sits at a position > 1, that many 'N's are
    prepended before the per-record codes.
    '''
    reader = antr.Reader(table)
    first_pos = next(reader.fetch(chrom)).pos
    codes = []
    if first_pos > 1:
        codes.extend('N' for _ in range(0, first_pos))

    for site in tqdm(reader.fetch(chrom)):
        # first matching annotation wins, in this fixed order
        if site.is_intergenic:
            code = 'i'
        elif site.is_in_CDS:
            code = 'c'
        elif site.is_intronic:
            code = 'n'
        elif site.is_utr5:
            code = '5'
        elif site.is_utr3:
            code = '3'
        else:
            code = 'N'
        codes.append(code)

    lookup_string = ''.join(codes)
    with open(chrom + '_temp_lookup', 'w') as f:
        f.write(lookup_string)
    return lookup_string
def SFS_from_antr(table, chromosome, start, end, min_alleles=None,
                  neutral_only=False, counter=False):
    '''Compute diversity for a region from per-site allele counts.

    For every record in chromosome:start-end, record.quebec_alleles is
    turned into a minor allele frequency via MAF_from_allele_count and
    added to an SFS object keyed by the number of alleles called at that
    site. Diversity is the sites-weighted mean of theta_pi over those SFSs.

    min_alleles: sites with fewer called alleles are skipped (when truthy)
    neutral_only: keep only intergenic, intronic or fold4 sites
    counter: when truthy, also return the number of sites that were added

    Sites for which MAF_from_allele_count raises TypeError are skipped.
    Returns diversity, or (diversity, record_count) when counter is truthy.
    '''
    spectra = {}
    reader = antr.Reader(table)
    if counter:
        record_count = 0

    for site in reader.fetch(chromosome, start, end):
        allele_counts = site.quebec_alleles
        if neutral_only:
            neutral_flags = (
                site.is_intergenic, site.is_intronic, site.is_fold4)
            if True not in neutral_flags:
                continue
        try:
            MAF, n_called = MAF_from_allele_count(
                allele_counts, min_alleles=min_alleles)
        except TypeError:
            continue
        # filter sites that don't have enough called alleles
        if min_alleles and n_called < min_alleles:
            continue
        if n_called not in spectra:
            spectra[n_called] = SFS([0] * (n_called + 1))
        spectra[n_called].add(MAF, n_called)
        if counter:
            record_count += 1

    weighted_pi = sum([s.theta_pi() * s.sites() for s in spectra.values()])
    total_sites = sum([s.sites() for s in spectra.values()])
    diversity = weighted_pi / total_sites

    if counter:
        return diversity, record_count
    return diversity
def parse_crossovers(filename, table, out):
    '''(str, str, str) -> None

    Reads crossover intervals from the tab-separated file `filename`
    (uses the chromosome, left_bound and right_bound columns), sums ld_rho
    over the annotation-table records inside each interval and writes the
    input row plus rho_total, rho_count and rho_window (mean rho) to `out`.

    Records whose ld_rho is 'NA' are skipped; an interval without any rho
    estimate gets rho_window 0 instead of raising ZeroDivisionError.

    NOTE(review): the output writer uses the csv default (comma) delimiter
    while the input is tab-separated - kept as in the original.
    '''
    fieldnames = [
        'cross', 'tetrad', 'individual', 'chromosome', 'left_bound',
        'right_bound', 'mid_point', 'length', 'rho_total', 'rho_count',
        'rho_window'
    ]
    # newline='' is what the csv module expects for files it reads/writes
    with open(filename, 'r', newline='') as f:
        crossovers = list(csv.DictReader(f, delimiter='\t'))
    with open(out, 'w', newline='') as f_out:
        writer = csv.DictWriter(f_out, fieldnames=fieldnames)
        writer.writeheader()
        for co in tqdm(crossovers):
            chrom = str(co['chromosome'])
            start, end = int(co['left_bound']), int(co['right_bound'])
            p = antr.Reader(table)
            rho_vals = [
                record.ld_rho for record in p.fetch(chrom, start, end)
                if record.ld_rho != 'NA'
            ]
            rho_total = sum(rho_vals)
            # rows hold only strings, so a shallow copy is sufficient
            out_dict = dict(co)
            out_dict['rho_total'] = rho_total
            out_dict['rho_count'] = len(rho_vals)
            out_dict['rho_window'] = (
                rho_total / len(rho_vals) if rho_vals else 0)
            writer.writerow(out_dict)
import sys

import antr
from tqdm import tqdm

# Usage: the annotation table path is the last command-line argument.
# Prints a copy of the table to stdout with a trailing 'methylation'
# column, restricted to sites listed in the methylation bed file.
table = sys.argv[-1]
p = antr.Reader(table)

# existing header lines, then the description of the new column
for header_line in p.header:
    print(header_line.strip())
print(
    '##methylation=beta value at CpG sites in three clones of CC2937. [FLOAT]')

# column names, with methylation added to the end
p.cols.append('methylation')
p.cols = [name.strip() for name in p.cols]
print('#' + '\t'.join(p.cols))

# write records: one fetch per bed line, beta value appended to each raw row
with open('data/methylation/beta_vals_no_context.bed', 'r') as f:
    for line in tqdm(f):
        split = [field.rstrip() for field in line.split('\t')]
        chrom = str(split[0])
        c_pos = int(split[1])
        beta = float(split[3])
        # note: rebinds the module-level name p to the fetch iterator
        p = antr.Reader(table).fetch(chrom, c_pos, c_pos + 1, raw=True)
        for record in p:
            record = record + '\t' + str(beta)
            print(record)
]] except IndexError: continue else: continue # col headers for file print('chromosome', 'start', 'end', 'length', 'type', 'order', 'rho', 'total_rho', 'count') # iterate through chromosomes for chrom in range(1, 18): eprint('starting', chrom) current_chrom = 'chromosome_{}'.format(str(chrom)) eprint('current_chrom = ', current_chrom) p = antr.Reader(table) # exons exon_order = 0 # keep track of which dict for the 'order' column # 0 - first, 1 - other, 2 - last for coord_dict in [first_exons, other_exons, last_exons]: for exon in tqdm(coord_dict[current_chrom]): p = antr.Reader(table) exon = [int(v) for v in exon] exon_start, exon_end = exon # unpack exon_length = exon_end - exon_start exon_total_rho = 0.0 count = 0
def create_full_lookup(table, context_size, chrom):
    '''(str, int, str) -> (str, str)

    uses annotation table to create a lookup string for all sites
    (modified from rcmb_correlates.py)

    Returns (genic_lookup, proximity_lookup).

    genic_lookup codes:
        g: genic
        i: intergenic
        N: unknown

    proximity_lookup codes:
        c: CDS
        i: intronic
        f: utr5
        t: utr3
        0: intergenic (non-gene-proximate)
        1: site upstream of gene (gene within context_size at higher index)
        2: site downstream of gene (gene within context_size at lower index)
        3: site upstream and downstream of gene
        N: unknown

    NOTE(review): when the first record is at position 1 no padding is
    added and string[x] represents position x + 1; when it is at a later
    position the strings are padded with `start` Ns, so string[x]
    represents position x. Kept as in the original - confirm which
    convention callers expect.
    '''
    p = antr.Reader(table)
    start = next(p.fetch(chrom)).pos
    padding = ['N'] * start if start > 1 else []

    lookup_codes = {
        'is_in_CDS': 'c',
        'is_intronic': 'i',
        'is_utr5': 'f',
        'is_utr3': 't',
        'is_intergenic': '0'
    }

    # lists + join instead of repeated string concatenation
    initial = list(padding)
    genic = list(padding)
    print('Initial run through...')
    for rec in tqdm(p.fetch(chrom)):
        if rec.is_genic:
            genic.append('g')
        elif rec.is_intergenic:
            genic.append('i')
        else:
            # keep one character per record so indices stay aligned
            genic.append('N')
        for annotation, code in lookup_codes.items():
            if getattr(rec, annotation):
                initial.append(code)
                break
        else:
            # no annotation matched: still emit a character so that
            # initial and genic never drift apart
            initial.append('N')
    genic_lookup = ''.join(genic)
    print('Done.')

    print('Generating full string...')
    proximity = list(padding)
    for pos in tqdm(range(len(padding), len(genic_lookup))):
        state = genic_lookup[pos]
        if state == 'g':
            proximity.append(initial[pos])
        elif state == 'i':
            upstream = 'g' in genic_lookup[pos:pos + context_size]
            # clamp at 0: a negative slice start would wrap to the end of
            # the string and miss genes near the start of the chromosome
            downstream = 'g' in genic_lookup[max(0, pos - context_size):pos]
            if upstream and downstream:
                proximity.append('3')
            elif upstream:
                proximity.append('1')
            elif downstream:
                proximity.append('2')
            else:
                proximity.append('0')
        else:
            proximity.append('N')
    return genic_lookup, ''.join(proximity)
def main(table, windowsize, correlates, gc_content, gene_context):
    '''Print windowed rho summaries for chromosome_1 .. chromosome_17.

    table: annotation table path passed to antr.Reader
    windowsize: window step used to cut each chromosome
    correlates: list of annotation keys; when truthy, per-key rho totals
        and counts are printed for every window. The list is extended in
        place with 'upstream', 'downstream', 'both' when context_size is
        truthy.

    NOTE(review): the body never reads the gc_content or gene_context
    parameters. It reads `gc`, `context_size`, `lengths` and `attr_fetch`
    instead, none of which is a parameter or local, so they must come from
    an enclosing scope. Presumably gc/context_size were meant to be
    gc_content/gene_context - confirm against the caller.

    NOTE(review): if both correlates and gc are falsy, the final print
    uses totals/counts/total_counter, which are only assigned when
    correlates is truthy.
    '''
    # print column headers
    if correlates:
        if context_size:  # ie upstream/downstream of genes
            correlates.extend(['upstream', 'downstream', 'both'])
        title1 = ' '.join([item + '_total' for item in correlates])
        title2 = ' '.join([item + '_count' for item in correlates])
        if gc:
            print('chromosome', 'start', 'end', title1, title2, 'GC', 'count')
        elif not gc:
            print('chromosome', 'start', 'end', title1, title2, 'count')
    elif gc and not correlates:
        print('chromosome', 'start', 'end', 'GC', 'rho', 'rho_total', 'count')
    # iterate through chromosomes
    for chrom in range(1, 18):
        current_chrom = 'chromosome_{}'.format(str(chrom))
        # window edges: 0, windowsize, ... plus the chromosome length itself
        windows = list(range(0, lengths[current_chrom], windowsize)) + [lengths[current_chrom]]
        p = antr.Reader(table)
        for i in range(len(windows) - 1):
            window = (windows[i], windows[i + 1])
            if correlates:
                rho = OrderedDict.fromkeys(correlates, 0.0)
                count = OrderedDict.fromkeys(correlates, 0)
                total_counter = 0
                # iterate through records in window
                for record in tqdm(p.fetch(current_chrom, window[0], window[1])):
                    for key in rho.keys():
                        # the three proximity keys are only filled from
                        # within the 'intergenic' branch below
                        if key in ['upstream', 'downstream', 'both']:
                            continue
                        elif attr_fetch(record, key) and not record.ld_rho == 'NA':
                            if key == 'intergenic' and attr_fetch(
                                    record, 'intergenic') and context_size:
                                neither = True
                                upstream = False
                                downstream = False
                                if check_gene_proximity(
                                        record, context_size, 'u'):
                                    neither = False
                                    upstream = True
                                if check_gene_proximity(
                                        record, context_size, 'd'):
                                    neither = False
                                    downstream = True
                                if upstream and downstream:
                                    rho['both'] += record.ld_rho
                                    count['both'] += 1
                                    total_counter += 1
                                    continue  # don't class as intergenic
                                elif upstream and not downstream:
                                    rho['upstream'] += record.ld_rho
                                    count['upstream'] += 1
                                    total_counter += 1
                                    continue
                                elif downstream and not upstream:
                                    rho['downstream'] += record.ld_rho
                                    count['downstream'] += 1
                                    total_counter += 1
                                    continue
                                elif neither:  # continue to code below
                                    pass
                            # plain annotation hit (or intergenic site with
                            # no gene within context_size)
                            rho[key] += record.ld_rho
                            count[key] += 1
                            total_counter += 1
                        else:
                            continue
                rhovals = list(rho.values())
                countvals = list(count.values())
                totals = ' '.join([str(v) for v in rhovals])
                counts = ' '.join([str(v) for v in countvals])
            if gc:  # gc content option selected
                gc_rho = 0.0
                gc_counter = 0
                # second pass over the same window; ld_rho is added without
                # an 'NA' check here
                for record in tqdm(p.fetch(current_chrom, window[0], window[1])):
                    gc_rho += record.ld_rho
                    gc_counter += 1
                gc_window = gc_calc(current_chrom, window, table)
                # raises ZeroDivisionError if the window had no records
                gc_rho_perbp = gc_rho / gc_counter
                if correlates:
                    print(current_chrom, window[0], window[1], totals, counts,
                          gc_window, total_counter)
                elif not correlates:
                    print(current_chrom, window[0], window[1], gc_window,
                          gc_rho_perbp, gc_rho, gc_counter)
            elif not gc:
                print(current_chrom, window[0], window[1], totals, counts,
                      total_counter)
def rho_annotations(table, windowsize, gene_context, chrom, out):
    '''(str, int, int, str, str) -> None

    Sums ld_rho per annotation class in consecutive windows along chrom
    and writes one space-separated line per window to `out`:
        chrom start end <class>_total... <class>_count... total_count

    A proximity lookup string is built first (create_lookup) so that
    intergenic sites can be binned as upstream / downstream / both /
    plain intergenic without re-querying the table. Records whose ld_rho
    is 'NA' are ignored. Chromosome lengths come from the module-level
    `lengths` mapping.
    '''
    print('Chromosome {chrom} selected.'.format(chrom=chrom))
    print('Creating intergenic lookup...')
    genic_lookup, proximity_lookup = create_lookup(table, gene_context, chrom)
    print('Done.')

    annotation_keys = [
        'is_intergenic', 'is_utr5', 'is_in_CDS', 'is_intronic', 'is_utr3'
    ]
    correlates = annotation_keys + ['upstream', 'downstream', 'both']
    # proximity code of an intergenic site -> tally it is added to
    intergenic_bins = {
        '1': 'upstream',
        '2': 'downstream',
        '3': 'both',
        '0': 'is_intergenic',
    }
    chrom_length = lengths[chrom]
    windows = list(range(0, chrom_length, windowsize)) + [chrom_length]
    p = antr.Reader(table)
    print('Starting windowed correlate calc...')

    with open(out, 'w') as f:
        # prep header
        columns = ['chrom', 'start', 'end']
        columns += [name + '_total' for name in correlates]
        columns += [name + '_count' for name in correlates]
        columns.append('total_count')
        f.write(' '.join(columns) + '\n')

        # iterate through chromosome
        for i in tqdm(range(len(windows) - 1)):
            left, right = windows[i], windows[i + 1]
            rho = OrderedDict.fromkeys(correlates, 0.0)
            count = OrderedDict.fromkeys(correlates, 0)
            total_count = 0
            for record in p.fetch(chrom, left, right):
                for key in annotation_keys:
                    if not getattr(record, key) or record.ld_rho == 'NA':
                        continue
                    if key == 'is_intergenic':
                        # sites with any other lookup code add to no tally
                        # but are still included in total_count
                        target = intergenic_bins.get(
                            proximity_lookup[record.pos])
                    else:
                        target = key
                    if target is not None:
                        rho[target] += record.ld_rho
                        count[target] += 1
                    total_count += 1

            # prep output
            fields = [chrom, str(left), str(right)]
            fields.append(' '.join(str(v) for v in rho.values()))
            fields.append(' '.join(str(v) for v in count.values()))
            fields.append(str(total_count))
            f.write(' '.join(fields) + '\n')

    print('Complete.')
    print('File written to {out}.'.format(out=out))
    print('Good job!')