def read_score(score_tab, ref):
    """Turn a tab-separated score table into per-transcript score lists.

    Args:
        score_tab: Path to the tab-separated table. Blank lines, lines that
            start with '@' and rows whose second column is '-' are skipped.
            The first column is used as the transcript ID, the third as the
            integer position and the eighth as the score (kept as a string).
        ref: Path to the reference FASTA. The length of each record decides
            how many positions (1..length) are emitted for that transcript.

    Returns:
        A plain dict ``{transcript_id: {'reactivity_ls': [...]}}``. Each list
        holds one entry per position; positions absent from the table, or
        whose score is the string '-1', are reported as 'NULL'.

    Raises:
        KeyError: If a transcript in the table has no record in ``ref``.
        IndexError: If a kept row has fewer than eight columns.
    """
    fasta = Fasta(ref)
    # Key the records by the part of the FASTA header before the first tab.
    fa_dict = {name.split('\t')[0]: record for name, record in fasta.items()}

    score_dict = nested_dict()
    with open(score_tab, 'r') as table:
        for line in table:
            line = line.strip()
            if not line or line.startswith('@'):
                continue
            arr = line.split('\t')
            if arr[1] == '-':
                continue
            score_dict[arr[0]][int(arr[2])] = arr[7]
    score_dict = score_dict.to_dict()

    reactivity_dict = nested_dict(2, list)
    for tx_id, scores in score_dict.items():
        # range() instead of xrange() so the function also runs on Python 3.
        for pos in range(1, len(fa_dict[tx_id]) + 1):
            score = scores.get(pos, 'NULL')
            if score == '-1':
                score = 'NULL'
            reactivity_dict[tx_id]['reactivity_ls'].append(score)

    return reactivity_dict.to_dict()
Esempio n. 2
0
def read_fa(
    fa='/Share/home/zhangqf7/gongjing/zebrafish/data/reference/transcriptome/danRer10.refSeq.transcriptome.fa'
):
    """Load a FASTA file into a dict keyed by the first token of each header.

    Args:
        fa: Path to the FASTA file; defaults to the danRer10 RefSeq
            transcriptome path hard-coded in the signature.

    Returns:
        dict mapping the first whitespace-delimited token of each header to
        the full-length slice (``record[0:]``) of that record.

    Side effects:
        Prints the first three keys of the result.
    """
    records = Fasta(fa, key_fn=lambda key: key.split("\t")[0])
    fa_dict = {name.split()[0]: record[0:] for name, record in records.items()}
    # Parenthesised print plus list() works on Python 2 and 3 alike; the
    # former `print d.keys()[0:3]` is a syntax error on Python 3.
    print(list(fa_dict.keys())[0:3])
    return fa_dict
Esempio n. 3
0
def calc_nuc_counts(fasta_filename, region_size_min,
                    region_size_max, verbose):
    """Count every full-length window of each allowed size in a FASTA file.

    For each record, each start offset produced by enumerating the record,
    and each size in ``region_size_min..region_size_max`` (inclusive), the
    slice ``seq[start:start + size]`` is tallied unless it is shorter than
    ``size`` (i.e. it ran off the end of the record).

    Args:
        fasta_filename: Path handed to ``Fasta``.
        region_size_min: Smallest window size to count.
        region_size_max: Largest window size to count (inclusive).
        verbose: Accepted but not used by this function.

    Returns:
        ``defaultdict(Counter)`` mapping window size -> Counter of
        window -> number of occurrences (raw counts, not normalised).
    """
    counts_by_size = defaultdict(Counter)
    fasta = Fasta(fasta_filename)

    for _name, sequence in fasta.items():
        for start, _item in enumerate(sequence):
            for size in range(region_size_min, region_size_max + 1):
                window = sequence[start:start + size]
                # Only windows that fit entirely inside the record count.
                if len(window) >= size:
                    counts_by_size[size][window] += 1

    return counts_by_size
Esempio n. 4
0
def read_fa(fa='/Share/home/zhangqf7/gongjing/zebrafish/data/reference/transcriptome/danRer10.refSeq.transcriptome.fa'):
    """Load a FASTA file into a dict keyed by the first token of each header.

    Args:
        fa: Path to the FASTA file; defaults to the danRer10 RefSeq
            transcriptome path hard-coded in the signature.

    Returns:
        dict mapping the first whitespace-delimited token of each header to
        the full-length slice (``record[0:]``) of that record.

    Side effects:
        Calls ``gj.printFuncRun('read_fa')`` on entry and again before
        returning, calls ``gj.printFuncArgs()`` once, and prints the first
        three keys of the result.
    """
    gj.printFuncRun('read_fa')
    gj.printFuncArgs()
    records = Fasta(fa, key_fn=lambda key: key.split("\t")[0])
    fa_dict = {name.split()[0]: record[0:] for name, record in records.items()}
    # Parenthesised print plus list() works on Python 2 and 3 alike; the
    # former `print d.keys()[0:3]` is a syntax error on Python 3.
    print(list(fa_dict.keys())[0:3])
    gj.printFuncRun('read_fa')
    return fa_dict
Esempio n. 5
0
def read_fa(
    fa='/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/data/mm10/transcriptome/mm10_transcriptome.fa'
):
    """Load a FASTA file into a dict keyed by dot-stripped sequence IDs.

    Args:
        fa: Path to the FASTA file; defaults to the mm10 transcriptome path
            hard-coded in the signature.

    Returns:
        dict whose keys are the first whitespace-delimited token of each
        header, cut at the first '.', and whose values are the full-length
        slice (``record[0:]``) of the matching record. Headers that collapse
        to the same key overwrite one another.

    Side effects:
        Calls ``gj.printFuncRun('read_fa')`` on entry and again before
        returning, calls ``gj.printFuncArgs()`` once, and prints the first
        three keys of the result.
    """
    gj.printFuncRun('read_fa')
    gj.printFuncArgs()
    records = Fasta(fa, key_fn=lambda key: key.split("\t")[0])
    fa_dict = {
        name.split()[0].split('.')[0]: record[0:]
        for name, record in records.items()
    }
    # Parenthesised print plus list() works on Python 2 and 3 alike; the
    # former `print d.keys()[0:3]` is a syntax error on Python 3.
    print(list(fa_dict.keys())[0:3])
    gj.printFuncRun('read_fa')
    return fa_dict
Esempio n. 6
0
def read_fa(fa=None, species='human', pureID=1):
    """Load a reference FASTA into a dict keyed by sequence ID.

    Args:
        fa: Path to the FASTA file. When None, a built-in path is picked
            according to ``species``.
        species: Selector for the built-in path; recognised values are
            'mouse', 'human', 'human_rRNA', 'human_rfam' and
            'mouse(CIRSseq)'. Any other value leaves ``fa`` as None, which
            is then passed to ``Fasta`` as-is.
        pureID: When truthy, keys are additionally cut at the first '.';
            otherwise the first whitespace-delimited header token is kept
            whole.

    Returns:
        dict mapping the derived ID to the full-length slice
        (``record[0:]``) of each record.

    Side effects:
        Prints the first three keys of the result.
    """
    default_paths = (
        ('mouse',
         '/home/gongjing/project/shape_imputation/data/ref/mm10/mm10_transcriptome.fa'),
        ('human',
         '/home/gongjing/project/shape_imputation/data/ref/hg38/hg38_transcriptome.fa'),
        ('human_rRNA',
         '/Share2/home/zhangqf5/gongjing/RNA-structure-profile-imputation/data/ref/rRNA_human/ribosomalRNA_4.fa'),
        ('human_rfam',
         '/Share2/home/zhangqf5/gongjing/RNA-structure-profile-imputation/data/ref/Rfam_human/human.dot.dedup.fa'),
        ('mouse(CIRSseq)',
         '/home/gongjing/project/shape_imputation/data/CIRSseq/cirs_txid.fa'),
    )
    if fa is None:
        for known_species, path in default_paths:
            if species == known_species:
                fa = path

    records = Fasta(fa, key_fn=lambda key: key.split("\t")[0])

    def to_key(header):
        first_token = header.split()[0]
        return first_token.split('.')[0] if pureID else first_token

    fa_dict = {
        to_key(header): record[0:]
        for header, record in records.items()
    }
    print(list(fa_dict.keys())[0:3])
    return fa_dict
Esempio n. 7
0
 def search(self, ref_base, pos, alt_base="X"):
     """Genotype the probe set generated for a single variant.

     Args:
         ref_base: Reference base; first component of the variant name.
         pos: Position; converted with ``str`` for the variant name.
         alt_base: Alternate base; last component of the variant name
             (defaults to "X").

     Returns:
         dict with "query" (the variant name ``ref_base + pos + alt_base``)
         and "results" (whatever ``self.genotype_alleles(refs, alts)``
         returns). Records whose name contains "ref" go to ``refs``; all
         others go to ``alts``.
     """
     var_name = "".join([ref_base, str(pos), alt_base])
     fasta_string = self.create_variant_probe_set(var_name=var_name)
     refs = []
     alts = []
     with tempfile.NamedTemporaryFile() as fp:
         fp.write(fasta_string)
         # Rewinding flushes the buffered write so the file can be re-read
         # through its name.
         fp.seek(0)
         fasta = Fasta(fp.name)
         # Read the records while the temporary file still exists; it is
         # deleted as soon as this `with` block exits.
         for k, v in fasta.items():
             if "ref" in k:
                 refs.append(str(v))
             else:
                 alts.append(str(v))
     return {"query": var_name, "results": self.genotype_alleles(refs, alts)}
def wig_to_out():
    """Convert the CIRS-seq reactivity .wig file into a tab-separated .out file.

    Reads the hard-coded .wig through ``loadWigHash`` and the hard-coded
    FASTA through ``Fasta`` (records keyed by the header part before the
    first '|'). For every transcript present in both, one line is written to
    the .wig path with its extension replaced by '.out':
    transcript ID, transcript length, the literal '0', then one value per
    position 1..length, using 'NULL' where the .wig data has no entry.
    Transcripts missing from the FASTA are skipped. Returns nothing.
    """
    wig = '/home/gongjing/project/shape_imputation/data/CIRSseq/GSE54106_CIRS-seq_Reactivity_combined.wig'
    fa = '/home/gongjing/project/shape_imputation/data/CIRSseq/cirs.fa'

    returnval = loadWigHash(wig)
    fa_dict = {
        header.split('|')[0]: record[0:]
        for header, record in Fasta(fa).items()
    }

    savefn = wig.replace('.wig', '.out')
    with open(savefn, 'w') as out:
        for tx_id in returnval:
            if tx_id not in fa_dict:
                continue
            tx_len = len(fa_dict[tx_id][0:])
            reactivity_ls = [
                returnval[tx_id][pos] if pos in returnval[tx_id] else 'NULL'
                for pos in range(1, tx_len + 1)
            ]
            fields = [tx_id, tx_len, '0'] + reactivity_ls
            out.write('\t'.join(str(field) for field in fields) + '\n')
Esempio n. 9
0
def calc_nuc_counts(fasta_filename, region_size_min, region_size_max, verbose):
    """Count every full-length window of each allowed size in a FASTA file.

    For each record, each start offset produced by enumerating the record,
    and each size in ``region_size_min..region_size_max`` (inclusive), the
    slice ``seq[start:start + size]`` is tallied unless it is shorter than
    ``size`` (i.e. it ran off the end of the record).

    Args:
        fasta_filename: Path handed to ``Fasta``.
        region_size_min: Smallest window size to count.
        region_size_max: Largest window size to count (inclusive).
        verbose: Accepted but not used by this function.

    Returns:
        ``defaultdict(Counter)`` mapping window size -> Counter of
        window -> number of occurrences (raw counts, not normalised).
    """
    counts_by_size = defaultdict(Counter)
    fasta = Fasta(fasta_filename)

    for _name, sequence in fasta.items():
        for start, _item in enumerate(sequence):
            for size in range(region_size_min, region_size_max + 1):
                window = sequence[start:start + size]
                # Only windows that fit entirely inside the record count.
                if len(window) >= size:
                    counts_by_size[size][window] += 1

    return counts_by_size
Esempio n. 10
0
# Build the icSHAPE list from icshape_panpan_1869: drop its first element,
# insert a 'NULL' placeholder after the 18th remaining value, and append one
# more 'NULL' at the end.
# NOTE(review): presumably this pads the profile to line up with the 18S
# sequence used below -- confirm against where icshape_panpan_1869 is built.
icshape_panpan_1870 = icshape_panpan_1869[1:][0:18] + [
    'NULL'
] + icshape_panpan_1869[1:][18:] + ['NULL']
# Convert to numbers: 'NULL' becomes NaN, everything else goes through float().
icshape_panpan_1870 = [
    np.nan if i == 'NULL' else float(i) for i in icshape_panpan_1870
]
fromPan1870_df = pd.DataFrame({'reactivity': icshape_panpan_1870})

# read_access() and read_mouse_dot() are defined outside this section; their
# return values are only stored here.
mouse_acces_ls = read_access()
mouse_dot_18S_new, mouse_ct_18S_new = read_mouse_dot()

# Load the mouse rRNA FASTA; keys are the first whitespace-delimited token of
# each header, cut at the first '.', and values are full-length slices.
fa_dict1 = Fasta('../data/rRNA/mouse_ribosomalRNA_4.fa',
                 key_fn=lambda key: key.split("\t")[0])
fa_dict_rRNA_mouse = {
    i.split()[0].split('.')[0]: j[0:]
    for i, j in fa_dict1.items()
}
# fa_dict_rRNA_mouse.keys()

# One row per 18S position: base, dot value from read_mouse_dot(), and the
# two reactivity columns. kethoxal_notreat_df is defined outside this section.
df_reactivity_df = pd.DataFrame({
    'base': list(fa_dict_rRNA_mouse['18S']),
    'dot': list(mouse_dot_18S_new),
    'keth-seq': kethoxal_notreat_df['reactivity'],
    'icSHAPE': fromPan1870_df['reactivity']
})
# Result is discarded when run as a script (presumably a notebook preview).
df_reactivity_df.head()

### plot diff of single stranded bases
# Keep only the rows whose base is 'G'.
df_reactivity_df = df_reactivity_df[df_reactivity_df['base'] == 'G']
df_reactivity_df = df_reactivity_df[df_reactivity_df['dot'].isin([
    '.',