def read_score(score_tab, ref):
    """Build a per-position score list for every record in a score table.

    The table is read as tab-separated text. Blank lines and lines starting
    with '@' are ignored, as are rows whose second column is '-'. For the
    remaining rows, column 0 is the record ID, column 2 the 1-based position
    and column 7 the score, which is kept as the string read from the file.

    Args:
        score_tab: path to the tab-separated score table.
        ref: path to the reference FASTA; only the length of each record is
            used. Headers are keyed by the text before their first tab.

    Returns:
        dict mapping record ID -> {'reactivity_ls': [...]}, with one entry
        per position 1..len(sequence). Positions with no row in the table,
        or with a score of '-1', are reported as 'NULL'.

    Raises:
        KeyError: if a record ID in the table is absent from the reference.
        IndexError: if a data row has fewer than eight columns.
    """
    # `Fasta` is provided elsewhere in the file; it is assumed to be
    # dict-like with sized records -- confirm against the import.
    fa_dict = {name.split('\t')[0]: record for name, record in Fasta(ref).items()}

    score_dict = {}
    with open(score_tab, 'r') as TXT:
        for line in TXT:
            line = line.strip()
            if not line or line.startswith('@'):
                continue
            arr = line.split('\t')
            if arr[1] == '-':
                continue
            score_dict.setdefault(arr[0], {})[int(arr[2])] = arr[7]

    reactivity_dict = {}
    for tx_id, scores in score_dict.items():
        reactivity_ls = []
        # `range` rather than `xrange` so this runs on Python 2 and 3.
        for pos in range(1, len(fa_dict[tx_id]) + 1):
            score = scores.get(pos, 'NULL')
            reactivity_ls.append('NULL' if score == '-1' else score)
        # A zero-length reference record yields no entry at all, as before.
        if reactivity_ls:
            reactivity_dict[tx_id] = {'reactivity_ls': reactivity_ls}
    return reactivity_dict
def read_fa(
    fa='/Share/home/zhangqf7/gongjing/zebrafish/data/reference/transcriptome/danRer10.refSeq.transcriptome.fa'
):
    """Load a FASTA file into a plain dict keyed by record ID.

    The record ID is the first whitespace-delimited token of the header
    (after the header has already been cut at its first tab).

    Args:
        fa: path to the FASTA file; defaults to a site-specific zebrafish
            transcriptome path.

    Returns:
        dict mapping record ID -> the full-length slice (`record[0:]`) of
        each record. The slice's type depends on the `Fasta` implementation
        in use -- presumably the sequence text; confirm against the import.
    """
    fa_dict1 = Fasta(fa, key_fn=lambda key: key.split("\t")[0])
    fa_dict = {i.split()[0]: j[0:] for i, j in fa_dict1.items()}
    # Preview the first few IDs. The call form with an explicit list prints
    # the same text on Python 2 and also works with Python 3 dict views.
    print(list(fa_dict.keys())[0:3])
    return fa_dict
def calc_nuc_counts(fasta_filename, region_size_min, region_size_max, verbose):
    '''
    Count every subsequence of each requested length in a FASTA file.

    For each record, a window of every size from region_size_min to
    region_size_max (inclusive) is taken at each start index; windows cut
    short by the end of the record are not counted.

    `verbose` is accepted but not used here.

    Returns: defaultdict mapping region size -> Counter mapping
    subsequence -> number of occurrences.
    '''
    counts_by_size = defaultdict(Counter)
    records = Fasta(fasta_filename)
    for _name, sequence in records.items():
        for start, _ in enumerate(sequence):
            for size in range(region_size_min, region_size_max + 1):
                window = sequence[start:start + size]
                # Only full-length windows contribute to the tally.
                if len(window) >= size:
                    counts_by_size[size][window] += 1
    return counts_by_size
def read_fa(fa='/Share/home/zhangqf7/gongjing/zebrafish/data/reference/transcriptome/danRer10.refSeq.transcriptome.fa'):
    """Load a FASTA file into a plain dict keyed by record ID.

    The record ID is the first whitespace-delimited token of the header
    (after the header has already been cut at its first tab).

    Args:
        fa: path to the FASTA file; defaults to a site-specific zebrafish
            transcriptome path.

    Returns:
        dict mapping record ID -> the full-length slice (`record[0:]`) of
        each record. The slice's type depends on the `Fasta` implementation
        in use -- presumably the sequence text; confirm against the import.
    """
    # `gj` helpers come from outside this block and are called at entry and
    # exit -- presumably run logging. They stay directly in this function
    # body in case printFuncArgs inspects its caller (TODO confirm).
    gj.printFuncRun('read_fa')
    gj.printFuncArgs()
    fa_dict1 = Fasta(fa, key_fn=lambda key: key.split("\t")[0])
    fa_dict = {i.split()[0]: j[0:] for i, j in fa_dict1.items()}
    # Preview the first few IDs. The call form with an explicit list prints
    # the same text on Python 2 and also works with Python 3 dict views.
    print(list(fa_dict.keys())[0:3])
    gj.printFuncRun('read_fa')
    return fa_dict
def read_fa(
    fa='/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/data/mm10/transcriptome/mm10_transcriptome.fa'
):
    """Load a FASTA file into a plain dict keyed by a shortened record ID.

    The key is the first whitespace-delimited token of the header, cut at
    its first '.' (presumably dropping a version suffix -- confirm against
    the headers in use). Records whose shortened IDs collide overwrite one
    another; the last one read wins.

    Args:
        fa: path to the FASTA file; defaults to a site-specific mm10
            transcriptome path.

    Returns:
        dict mapping shortened record ID -> the full-length slice
        (`record[0:]`) of each record. The slice's type depends on the
        `Fasta` implementation in use -- confirm against the import.
    """
    # `gj` helpers come from outside this block and are called at entry and
    # exit -- presumably run logging. They stay directly in this function
    # body in case printFuncArgs inspects its caller (TODO confirm).
    gj.printFuncRun('read_fa')
    gj.printFuncArgs()
    fa_dict1 = Fasta(fa, key_fn=lambda key: key.split("\t")[0])
    fa_dict = {i.split()[0].split('.')[0]: j[0:] for i, j in fa_dict1.items()}
    # Preview the first few IDs. The call form with an explicit list prints
    # the same text on Python 2 and also works with Python 3 dict views.
    print(list(fa_dict.keys())[0:3])
    gj.printFuncRun('read_fa')
    return fa_dict
def read_fa(fa=None, species='human', pureID=1):
    """Load a FASTA file into a plain dict keyed by record ID.

    Args:
        fa: path to the FASTA file. When None, the path is chosen from a
            built-in table using `species`.
        species: key into the built-in path table; only consulted when `fa`
            is None. One of 'mouse', 'human', 'human_rRNA', 'human_rfam',
            'mouse(CIRSseq)'.
        pureID: when truthy, the record ID (first whitespace-delimited token
            of the header) is additionally cut at its first '.'; when falsy
            the token is used unchanged.

    Returns:
        dict mapping record ID -> the full-length slice (`record[0:]`) of
        each record. The slice's type depends on the `Fasta` implementation
        in use -- confirm against the import.

    Raises:
        ValueError: if `fa` is None and `species` is not a known key.
            Without this check, None would be handed to `Fasta`.
    """
    if fa is None:
        default_fa = {
            'mouse': '/home/gongjing/project/shape_imputation/data/ref/mm10/mm10_transcriptome.fa',
            'human': '/home/gongjing/project/shape_imputation/data/ref/hg38/hg38_transcriptome.fa',
            'human_rRNA': '/Share2/home/zhangqf5/gongjing/RNA-structure-profile-imputation/data/ref/rRNA_human/ribosomalRNA_4.fa',
            'human_rfam': '/Share2/home/zhangqf5/gongjing/RNA-structure-profile-imputation/data/ref/Rfam_human/human.dot.dedup.fa',
            'mouse(CIRSseq)': '/home/gongjing/project/shape_imputation/data/CIRSseq/cirs_txid.fa',
        }
        if species not in default_fa:
            raise ValueError(
                'unknown species %r; expected one of: %s'
                % (species, ', '.join(sorted(default_fa)))
            )
        fa = default_fa[species]

    fa_dict1 = Fasta(fa, key_fn=lambda key: key.split("\t")[0])
    if pureID:
        fa_dict = {
            i.split()[0].split('.')[0]: j[0:]
            for i, j in fa_dict1.items()
        }
    else:
        fa_dict = {i.split()[0]: j[0:] for i, j in fa_dict1.items()}
    # Preview the first few IDs as a quick sanity check.
    print(list(fa_dict.keys())[0:3])
    return fa_dict
def search(self, ref_base, pos, alt_base="X"):
    """Genotype a single variant from its generated probe set.

    The variant name is `ref_base + str(pos) + alt_base`. The probe FASTA
    from `self.create_variant_probe_set` is written to a temporary file and
    read back through `Fasta`. Records whose name contains "ref" are
    collected as reference alleles and all others as alternate alleles.

    Args:
        ref_base: reference base(s), used as the start of the variant name.
        pos: variant position; converted with `str()`.
        alt_base: alternate base(s); defaults to "X".

    Returns:
        dict with "query" (the variant name) and "results" (whatever
        `self.genotype_alleles(refs, alts)` returns).
    """
    var_name = "".join([ref_base, str(pos), alt_base])
    fasta_string = self.create_variant_probe_set(var_name=var_name)
    # NamedTemporaryFile() opens in binary mode by default, so text must be
    # encoded first on Python 3. Byte strings pass through untouched.
    if isinstance(fasta_string, str) and not isinstance(fasta_string, bytes):
        fasta_string = fasta_string.encode()

    refs = []
    alts = []
    with tempfile.NamedTemporaryFile() as fp:
        fp.write(fasta_string)
        # Push the data to disk before the file is reopened by name.
        fp.flush()
        # NOTE(review): if `Fasta` writes index files next to its input,
        # they are not removed with the temporary file -- confirm.
        fasta = Fasta(fp.name)
        # Read every record while the temporary file still exists.
        for k, v in fasta.items():
            if "ref" in k:
                refs.append(str(v))
            else:
                alts.append(str(v))
    return {"query": var_name, "results": self.genotype_alleles(refs, alts)}
def wig_to_out(wig=None, fa=None):
    """Convert a WIG reactivity file into a tab-separated per-record table.

    One line is written per record that is present in both the WIG data and
    the FASTA: `record_id, length, '0'`, then one value for each position
    1..length. Positions missing from the WIG data are written as 'NULL'.
    Records found only in the WIG data are skipped.

    Args:
        wig: path to the WIG file; defaults to the CIRS-seq data set path
            this function was originally hard-wired to.
        fa: path to the matching FASTA; defaults to the CIRS-seq FASTA.
            Records are keyed by the header text before the first '|'.

    Returns:
        None. Output goes next to the input, with a trailing '.wig'
        replaced by '.out' ('.out' is appended if the name does not end in
        '.wig', so the input is never overwritten).
    """
    if wig is None:
        wig = '/home/gongjing/project/shape_imputation/data/CIRSseq/GSE54106_CIRS-seq_Reactivity_combined.wig'
    if fa is None:
        fa = '/home/gongjing/project/shape_imputation/data/CIRSseq/cirs.fa'

    # `loadWigHash` is defined elsewhere. Its result is used as a mapping
    # record_id -> {1-based position: value} -- confirm against the helper.
    returnval = loadWigHash(wig)

    fa_dict = {}
    for header, record in Fasta(fa).items():
        fa_dict[header.split('|')[0]] = record[0:]

    if wig.endswith('.wig'):
        savefn = wig[:-len('.wig')] + '.out'
    else:
        savefn = wig + '.out'

    with open(savefn, 'w') as SAVEFN:
        for tx_id in returnval:
            if tx_id not in fa_dict:
                continue
            tx_len = len(fa_dict[tx_id])
            positions = returnval[tx_id]
            reactivity_ls = [
                positions[i] if i in positions else 'NULL'
                for i in range(1, tx_len + 1)
            ]
            SAVEFN.write('\t'.join(map(str, [tx_id, tx_len, '0'] + reactivity_ls)) + '\n')
def calc_nuc_counts(fasta_filename, region_size_min, region_size_max, verbose):
    '''
    Count every subsequence of each requested length in a FASTA file.

    For each record, a window of every size from region_size_min to
    region_size_max (inclusive) is taken at each start index; windows cut
    short by the end of the record are not counted.

    `verbose` is accepted but not used here.

    Returns: defaultdict mapping region size -> Counter mapping
    subsequence -> number of occurrences.
    '''
    tallies = defaultdict(Counter)
    for _chrom, record in Fasta(fasta_filename).items():
        for offset, _ in enumerate(record):
            for width in range(region_size_min, region_size_max + 1):
                end = offset + width
                fragment = record[offset:end]
                # Skip windows that run past the end of the record.
                if not len(fragment) < width:
                    tallies[width][fragment] += 1
    return tallies
icshape_panpan_1870 = icshape_panpan_1869[1:][0:18] + [ 'NULL' ] + icshape_panpan_1869[1:][18:] + ['NULL'] icshape_panpan_1870 = [ np.nan if i == 'NULL' else float(i) for i in icshape_panpan_1870 ] fromPan1870_df = pd.DataFrame({'reactivity': icshape_panpan_1870}) mouse_acces_ls = read_access() mouse_dot_18S_new, mouse_ct_18S_new = read_mouse_dot() fa_dict1 = Fasta('../data/rRNA/mouse_ribosomalRNA_4.fa', key_fn=lambda key: key.split("\t")[0]) fa_dict_rRNA_mouse = { i.split()[0].split('.')[0]: j[0:] for i, j in fa_dict1.items() } # fa_dict_rRNA_mouse.keys() df_reactivity_df = pd.DataFrame({ 'base': list(fa_dict_rRNA_mouse['18S']), 'dot': list(mouse_dot_18S_new), 'keth-seq': kethoxal_notreat_df['reactivity'], 'icSHAPE': fromPan1870_df['reactivity'] }) df_reactivity_df.head() ### plot diff of single stranded bases df_reactivity_df = df_reactivity_df[df_reactivity_df['base'] == 'G'] df_reactivity_df = df_reactivity_df[df_reactivity_df['dot'].isin([ '.',