Beispiel #1
0
def visualize(prim_str, pred_string, true_string, save_imgs=False, suffix_img=''):
    """Plot the predicted and the true secondary structure of one sequence.

    The predicted structure is first passed through ``balance_op_tmp``
    (defined elsewhere in the project) and the result is what gets drawn.
    Both structures are written to scratch files in the current working
    directory (``tmp_vis_pred.txt`` and ``tmp_vis_true.txt``) because they
    are loaded back through ``forgi.load_rna`` from a path.

    Args:
        prim_str: Primary sequence, written as the first line of each file.
        pred_string: Predicted structure string.
        true_string: Reference structure string.
        save_imgs: When true, save both figures as ``pred_<suffix>.jpg`` and
            ``true_<suffix>.jpg`` in the current working directory.
        suffix_img: Suffix inserted into the saved image file names.
    """
    corr_string = balance_op_tmp(pred_string)

    print(f'pred: {pred_string}')
    print(f"true: {true_string}")
    print(f'corr: {corr_string}')

    # Text-mode files translate '\n' to the platform line ending themselves,
    # so '\n' (not os.linesep) is the correct terminator here.
    with open('tmp_vis_pred.txt', 'w') as f:
        f.write(prim_str + '\n')
        f.write(corr_string + '\n')
    with open('tmp_vis_true.txt', 'w') as f:
        f.write(prim_str + '\n')
        f.write(true_string + '\n')

    panels = (
        ('predicted', 'tmp_vis_pred.txt', f'pred_{suffix_img}.jpg'),
        ('original', 'tmp_vis_true.txt', f'true_{suffix_img}.jpg'),
    )
    for title, struct_path, img_path in panels:
        plt.figure(figsize=(20, 20))
        plt.title(title)
        cg = forgi.load_rna(struct_path, allow_many=False)
        fvm.plot_rna(cg, text_kwargs={"fontweight": "black"}, lighten=0.7,
                     backbone_kwargs={"linewidth": 3})
        # Both images honour save_imgs; previously the "true" image was
        # written even when the caller asked for no files.
        if save_imgs:
            plt.savefig(img_path)
        plt.show()
Beispiel #2
0
def draw_2nd_structure(srna, num_rows=3):
    """Fold, dump and plot the first rows of an sRNA table.

    For each of the rows labelled ``0 .. num_rows - 1`` the sequence is
    folded with ``fold_rna`` (defined elsewhere; its first return element is
    used as the structure), printed, and written to
    ``./resources/<name>.fx`` as a three-line record (``>name``, sequence,
    structure). Afterwards each written file is loaded with forgi and shown
    in its own figure.

    Args:
        srna: Table indexed with ``.loc[i, column]`` that provides the
            columns ``"name"`` and ``"sequence"``.
            NOTE(review): presumably a pandas DataFrame with a default
            integer index -- confirm against the caller.
        num_rows: Number of leading rows to process. Defaults to 3, the
            value that used to be hard-coded.

    Returns:
        The ``srna`` argument, unchanged.
    """
    for i in range(num_rows):
        name = srna.loc[i, "name"]
        sequence = srna.loc[i, "sequence"]
        folding = fold_rna(sequence)[0]
        print("##############", name, "###############")
        print(sequence)
        print(folding)
        print()

        fx_test = ">{0}\n{1}\n{2}\n".format(name, sequence, folding)
        # Context manager closes the file even if the write fails.
        with open('./resources/' + name + '.fx', "w") as textfile:
            textfile.write(fx_test)
    for i in range(num_rows):
        # Draw the structure that was just written for this row.
        plt.figure(figsize=(20, 20))
        cg = forgi.load_rna('./resources/' + srna.loc[i, "name"] + '.fx',
                            allow_many=False)
        fvm.plot_rna(cg,
                     text_kwargs={"fontweight": "black"},
                     lighten=0.7,
                     backbone_kwargs={"linewidth": 3})
        plt.show()
    return srna
Beispiel #3
0
def create_pairing_matrix(seq, num_backtrack=10):
    """Average row-normalised pairing matrices over sampled structures.

    ``num_backtrack`` structures are drawn with ``pbacktrack`` from a fold
    compound built for ``seq`` (using the module-level model details ``md``).
    Each structure becomes a ``len(seq) x len(seq)`` matrix that starts as
    the identity and gets +1 at ``[i, partner(i)]`` for every paired
    position; every row is then divided by its sum. The element-wise mean
    over all sampled matrices is returned.
    """
    fold_compound = RNA.fold_compound(seq, md)
    # The MFE is only needed to rescale the Boltzmann factors before the
    # partition function fills the DP matrices used for sampling.
    _, min_free_energy = fold_compound.mfe()
    fold_compound.exp_params_rescale(min_free_energy)
    fold_compound.pf()

    pair_tables = []
    for dotbracket in fold_compound.pbacktrack(num_backtrack):
        graph, = forgi.load_rna(dotbracket)
        pair_tables.append(graph.to_pair_table())

    seq_len = len(seq)
    normalised = []
    for table in pair_tables:
        # Drop the leading table entry and shift partners to 0-based;
        # an unpaired position then shows up as -1.
        partners = np.array(table[1:]) - 1
        counts = np.eye(seq_len)
        for pos, partner in enumerate(partners):
            if partner != -1:
                counts[pos, partner] += 1
        row_totals = np.sum(counts, axis=-1, keepdims=True)
        normalised.append(counts / row_totals)
    return np.mean(normalised, axis=0)
Beispiel #4
0
 def load(self, pattern):
     """Load every file matching ``pattern`` and cache its structure and PDD.

     Each matching file is loaded with ``forgi.load_rna`` unless it is
     already in ``self.cgs``. For files not yet in ``self.pdds`` the pair
     distance distribution is read from ``<file>.pdd.csv`` when that can be
     read, otherwise it is computed from the virtual residues and written
     to that csv file.

     Args:
         pattern: A glob pattern or a list of glob patterns.

     Returns:
         A tuple ``(num_selected, num_loaded, minsc, maxsc)``: the number
         of matched files, the number newly loaded by this call, and the
         first and last element of ``self.get_scores(pattern)`` (both
         ``None`` when that result is empty).

     Raises:
         ValueError: If a file does not contain exactly one RNA component.
     """
     num_loaded = 0
     num_selected = 0
     if not isinstance(pattern, list):
         pattern = [pattern]
     fns = []
     for pat in pattern:
         fns.extend(glob(pat))
     # Loading many files is slow, so announce it more prominently.
     lev = logging.WARNING if len(fns) > 1000 else logging.INFO
     log.log(lev, "Loading %s files", len(fns))
     with _LoggingContext(logging.getLogger(), logging.CRITICAL):
         for fn in fns:
             num_selected += 1
             if fn not in self.cgs:
                 cgs = forgi.load_rna(fn)
                 if len(cgs) != 1:
                     # All three values belong inside format(); previously
                     # the misplaced parenthesis made format() itself fail.
                     raise ValueError(
                         "Expected 1 RNA component in file {}, found {}:{}".
                         format(fn, len(cgs), [c.name for c in cgs]))
                 cg, = cgs
                 cg.infos["filename"] = fn
                 self.cgs[fn] = cg
                 num_loaded += 1
             if fn not in self.pdds:
                 # Always take the structure from the cache: when the file
                 # was loaded by an earlier call, no local `cg` has been
                 # bound for it in this iteration.
                 cg = self.cgs[fn]
                 try:
                     pd_pdd = pd.read_csv(fn + ".pdd.csv")
                 except Exception:
                     # Best effort: any failure to read the cached csv
                     # (missing, empty, unparsable) triggers recomputation.
                     points = [
                         cg.get_virtual_residue(i,
                                                allow_single_stranded=True)
                         for i in range(1, cg.seq_length + 1)
                     ]
                     x, y = ftuv.pair_distance_distribution(
                         points, stepsize=self.stepsize)
                     df = pd.DataFrame({"step": x, "count": y})
                     df.to_csv(fn + ".pdd.csv")
                 else:
                     x = pd_pdd["step"]
                     y = pd_pdd["count"]
                 self.pdds[fn] = (x, y)
     scores = self.get_scores(pattern)
     if scores:
         minsc = scores[0]
         maxsc = scores[-1]
     else:
         minsc = None
         maxsc = None
     return num_selected, num_loaded, minsc, maxsc
Beispiel #5
0
def structure(sequence, dot_bracket, mirna_name, index):
    """Write a miRNA structure file and render it as a PNG.

    The sequence and dot-bracket string are written as two lines to
    ``<index>/mirna/<mirna_name>.db``; that file is loaded with forgi,
    plotted, and the current figure is saved to
    ``gui/src/assets/<mirna_name>.png``.
    """
    db_path = index + "/mirna" + "/" + mirna_name + ".db"

    with open(db_path, "w") as db_file:
        db_file.write(sequence + '\n')
        db_file.write(dot_bracket)

    plot_options = {
        "text_kwargs": {"fontweight": "black"},
        "lighten": 0.7,
        "backbone_kwargs": {"linewidth": 3},
    }
    rna_graph = forgi.load_rna(db_path, allow_many=False)
    fvm.plot_rna(rna_graph, **plot_options)

    png_path = "gui/src/assets/" + mirna_name + ".png"
    plt.savefig(png_path)
Beispiel #6
0
def create_multiclass_vector(seq, num_backtrack=10):
    """Average per-position structure-element encodings over samples.

    ``num_backtrack`` structures are sampled with ``pbacktrack`` from a fold
    compound built for ``seq`` (using the module-level model details ``md``).
    Each sample's forgi element string is relabelled through a fixed
    letter map, encoded by ``one_hot_encode_struct(..., method='signed')``
    (defined elsewhere) and summed into a ``(len(seq), 5)`` array, which is
    finally divided by ``num_backtrack``.
    """
    fold_compound = RNA.fold_compound(seq, md)
    # The MFE is only needed to rescale the Boltzmann factors before the
    # partition function fills the DP matrices used for sampling.
    _, min_free_energy = fold_compound.mfe()
    fold_compound.exp_params_rescale(min_free_energy)
    fold_compound.pf()

    # forgi element letter -> label expected by the encoder.
    relabel = {'s': 'P', 'i': 'I', 'h': 'H', 'm': 'M', 'f': 'E', 't': 'E'}
    accumulated = np.zeros((len(seq), 5))
    for dotbracket in fold_compound.pbacktrack(num_backtrack):
        graph, = forgi.load_rna(dotbracket)
        labels = ''.join(relabel[ch] for ch in graph.to_element_string())
        accumulated += one_hot_encode_struct(labels, method='signed')
    accumulated /= 1.0 * num_backtrack
    return accumulated
Beispiel #7
0
def plot_rna_struct(seq,
                    struct,
                    ax=None,
                    offset=(0, 0),
                    text_kwargs=None,
                    backbone_kwargs=None,
                    basepair_kwargs=None,
                    highlight_bp_idx=None,
                    highlight_nt_idx=None,
                    lighten=0.7,
                    saveto='tmp.png'):
    """Draw an RNA secondary structure and save it to an image file.

    The sequence and structure are written to ``tmp.fa`` in the current
    working directory and loaded back through forgi. Coordinates come from
    ``RNA.get_xy_coordinates`` after setting the global
    ``RNA.cvar.rna_plot_type`` to 1. The backbone, the base pairs, one
    circle per nucleotide, the nucleotide letters and a position label
    every 10th nucleotide are drawn.

    Args:
        seq: Nucleotide sequence.
        struct: Structure string written below the sequence in ``tmp.fa``.
        ax: Axes to draw on. When ``None`` a new 30x30 figure is created,
            drawn on, saved and closed. When given, the figure owning
            ``ax`` is saved and left open.
        offset: ``(x, y)`` shift added to every coordinate. ``None`` means
            no shift; ``True`` uses the current upper axis limits.
        text_kwargs: Extra keyword arguments for the text annotations.
            ``fontweight`` defaults to ``"bold"``. Not modified.
        backbone_kwargs: Overrides for the backbone line style.
        basepair_kwargs: Overrides for the base-pair line style.
        highlight_bp_idx: 0-based ``(i, j)`` tuples of base pairs drawn in
            thick red instead of thin black.
        highlight_nt_idx: 0-based nucleotide indices whose circle is filled
            with a lightened green instead of white.
        lighten: Lightness adjustment for highlighted circles; positive
            values move towards white, other values towards black.
        saveto: Path of the output image (saved with ``dpi=350``).
    """
    # Work on private copies/fresh objects: the defaults must not be shared
    # between calls and the caller's text_kwargs must not be modified.
    text_kwargs = dict(text_kwargs) if text_kwargs is not None else {}
    backbone_kwargs = backbone_kwargs if backbone_kwargs is not None else {}
    basepair_kwargs = basepair_kwargs if basepair_kwargs is not None else {}
    if highlight_bp_idx is None:
        highlight_bp_idx = []
    if highlight_nt_idx is None:
        highlight_nt_idx = []

    with open('tmp.fa', 'w') as file:
        file.write('>tmp\n%s\n%s' % (seq, struct))
    cg = forgi.load_rna('tmp.fa', allow_many=False)

    RNA.cvar.rna_plot_type = 1

    # Only create (and later close) a figure when the caller did not hand
    # in an axes; otherwise an empty figure would be created and saved.
    own_fig = None
    if ax is None:
        own_fig = plt.figure(figsize=(30, 30))
        ax = own_fig.gca()

    bp_string = cg.to_dotbracket_string()

    if offset is None:
        offset = (0, 0)
    elif offset is True:
        offset = (ax.get_xlim()[1], ax.get_ylim()[1])

    vrna_coords = RNA.get_xy_coordinates(bp_string)
    # TODO Add option to rotate the plot
    coords = []
    for i, _ in enumerate(bp_string):
        coord = (offset[0] + vrna_coords.get(i).X,
                 offset[1] + vrna_coords.get(i).Y)
        coords.append(coord)
    coords = np.array(coords)

    # First plot backbone
    bkwargs = {"color": "grey", "zorder": 0, "linewidth": 0.5}
    bkwargs.update(backbone_kwargs)
    ax.plot(coords[:, 0], coords[:, 1], **bkwargs)

    # Now plot basepairs, split into highlighted and regular ones.
    # stem_bp_iterator yields 1-based positions; coords is 0-based.
    basepairs_hl, basepairs_nonhl = [], []
    for s in cg.stem_iterator():
        for p1, p2 in cg.stem_bp_iterator(s):
            segment = [coords[p1 - 1], coords[p2 - 1]]
            if (p1 - 1, p2 - 1) in highlight_bp_idx:
                basepairs_hl.append(segment)
            else:
                basepairs_nonhl.append(segment)

    if len(basepairs_hl) > 0:
        basepairs_hl = np.array(basepairs_hl)
        bpkwargs_hl = {"color": 'red', "zorder": 0, "linewidth": 3}
        bpkwargs_hl.update(basepair_kwargs)
        ax.plot(basepairs_hl[:, :, 0].T, basepairs_hl[:, :, 1].T,
                **bpkwargs_hl)

    if len(basepairs_nonhl) > 0:
        basepairs_nonhl = np.array(basepairs_nonhl)
        bpkwargs_nonhl = {"color": 'black', "zorder": 0, "linewidth": 0.5}
        bpkwargs_nonhl.update(basepair_kwargs)
        ax.plot(basepairs_nonhl[:, :, 0].T, basepairs_nonhl[:, :, 1].T,
                **bpkwargs_nonhl)

    # Now plot circles
    for i, coord in enumerate(coords):
        if i in highlight_nt_idx:
            # Highlighted nucleotides get a green fill whose lightness is
            # shifted by `lighten`.
            h, l, s = colorsys.rgb_to_hls(*mc.to_rgb('green'))
            if lighten > 0:
                l += (1 - l) * min(1, lighten)
            else:
                l += l * max(-1, lighten)
            facecolor = colorsys.hls_to_rgb(h, l, s)
        else:
            facecolor = "white"
        circle = plt.Circle((coord[0], coord[1]),
                            edgecolor="black",
                            facecolor=facecolor)

        ax.add_artist(circle)
        if cg.seq:
            text_kwargs.setdefault("fontweight", "bold")
            # cg.seq is indexed 1-based here, coords 0-based.
            ax.annotate(cg.seq[i + 1],
                        xy=coord,
                        ha="center",
                        va="center",
                        **text_kwargs)

    # Label every 10th nucleotide; positions already used (nucleotides and
    # earlier labels) are passed on so that new labels can avoid them.
    all_coords = list(coords)
    ntnum_kwargs = {"color": "gray"}
    ntnum_kwargs.update(text_kwargs)
    for nt in range(10, cg.seq_length, 10):
        # We try different angles
        annot_pos = _find_annot_pos_on_circle(nt, all_coords, cg)
        if annot_pos is not None:
            ax.annotate(str(nt),
                        xy=coords[nt - 1],
                        xytext=annot_pos,
                        arrowprops={
                            "width": 1,
                            "headwidth": 1,
                            "color": "gray"
                        },
                        ha="center",
                        va="center",
                        zorder=0,
                        **ntnum_kwargs)
            all_coords.append(annot_pos)

    # Grow the data limits so that they contain both the structure and
    # whatever the axes already showed.
    datalim = ((min(list(coords[:, 0]) + [ax.get_xlim()[0]]),
                min(list(coords[:, 1]) + [ax.get_ylim()[0]])),
               (max(list(coords[:, 0]) + [ax.get_xlim()[1]]),
                max(list(coords[:, 1]) + [ax.get_ylim()[1]])))

    ax.set_aspect('equal', 'datalim')
    ax.update_datalim(datalim)
    ax.autoscale_view()
    ax.set_axis_off()

    # Save the figure that actually holds the drawing.
    ax.figure.savefig(saveto, dpi=350)
    if own_fig is not None:
        plt.close(own_fig)
Beispiel #8
0
import RNA
import forgi
import numpy as np
import pandas as pd
import pickle

# Build one edge list per dot-bracket structure in the input file.
# Every structure yields a (2, num_edges) array [source; target] holding,
# for each position, an edge to the next position, an edge to the previous
# position and, for paired positions, an edge to the pairing partner.
count = 0
to_pickle = []
# The context manager closes the input file once all lines are processed.
with open('pos_dotbracket_FXR1.txt', 'r') as pos_dotbracket:
    for dotbracket in pos_dotbracket:
        source = []
        target = []
        bg, = forgi.load_rna(dotbracket.rstrip())
        # Drop the leading pair-table entry; the remaining entries hold the
        # 1-based partner of each position, with 0 meaning "unpaired".
        pt = bg.to_pair_table()[1:]
        last = len(pt) - 1
        for i, partner in enumerate(pt):
            if i < last:
                source.append(i)
                target.append(i + 1)
            if i > 0:
                source.append(i)
                target.append(i - 1)
            if partner != 0:
                source.append(i)
                target.append(partner - 1)
        source = np.array(source)
        target = np.array(target)
        to_pickle.append(np.stack((source, target)))
        # Progress report every 1000 structures.
        if count % 1000 == 0:
            print(count)
        count += 1
Beispiel #9
0
            # print(row[0])
            seq = row[2]
            # print('seq')
            # print(seq)
            dotbracket = row[14]
            # print("dotbracket:")
            # print(dotbracket)
            if len(seq) != len(dotbracket):
                print('Different Lengths, failed:')
                print(row)
                continue
            tmp_file = open("tmp.txt", "w")
            tmp_file.write(seq + '\n')
            tmp_file.write(dotbracket + '\n')
            tmp_file.close()
            cg = forgi.load_rna('tmp.txt', allow_many=False)
            result = fgb.BulgeGraph.to_element_string(cg, with_numbers=True)
            result = result.splitlines()
            row.append(str(result[0]))
            row.append(str(result[1]))
            writeCSV.writerow(row)
            # all_col.append(row)
        # writeCSV.writerows(all_col)

# bg = fgb.BulgeGraph.from_dotbracket('((..))..((..))')

# rna_seq_dotb = ['CGCUUCAUAUAAUCCUAAUGAUAUGGUUUGGGAGUUUCUACCAAGAGCCUUAAACUCUUGAUUAUGAAGUU', '((((((((((..(((((((.......)))))))......).(((.((.......))))))..)))))))).']
# tmp_file = open("tmp.txt","w")
# tmp_file.write(rna_seq_dotb[0]+ '\n')
# tmp_file.write(rna_seq_dotb[1]+ '\n')
# tmp_file.writelines(rna_seq_dotb)
def search_dot_pair_stem(args):
    """Find stem runs and their partner strands and validate masking them.

    Runs of at least ``args.min_len`` consecutive bracket characters are
    searched in a structure string that is hard-coded below. For every run
    the pairing partners of its first and last position are looked up with
    forgi. Runs are skipped when a partner is missing, when the span of run
    plus partner exceeds 100, when the 100-wide window centred on that span
    leaves the range ``0 .. len(dot_dict['dotbracket'])``, when the partner
    strand does not have the same length as the run, or when the stretch
    between partner strand and run does not consist of exactly one
    distinct character. For the remaining runs
    ``generate_mask_region_validate`` is called once for the run and once
    for its partner strand, and the coordinates are collected into a
    tab-separated table written to ``stem_pair.txt``.

    Args:
        args: Namespace providing ``dot``, ``min_len``, ``shape_out``,
            ``tx`` and ``species``.
    """
    dot_dict = util.read_dot(dot=args.dot)
    import forgi
    # Only the length of the structure read from file is used; looked up
    # once here because it does not change inside the loop.
    dotbracket_len = len(dot_dict['dotbracket'])
    # NOTE(review): the structure actually searched is this hard-coded
    # string, not the one read from args.dot -- confirm this is intended.
    s = '...(((((...[[[.))))).((((((((((.(((((((((.....(((.(((..((...(((....((..........))...)))))......(((......((((..((..((....(((..................((((....(((((((.....))))))).....)))).......((((...((((((....))))))...))))....((((((.......(((((.((((...((((.((((((((....))))))))..)))).)))).....)))))......))))))...........((((.((((......))))))))....)))...))))..))))(((..(.(((....((((((((.......))))))))))).....))))...((((((((....))))...))))))).((((((..........)))))).((((....))))...)))))).).....(.(((...(((((...))))).)))).)).))))))....(((((((((((((....))).))))))).)))......(((.(((.......)))).)).........(((((((((....[[[[.....[[.)))]].......]]]])))))).))))))))))..........(((((.....((((...(((.......(((.(((((((((((((.((((....))))....))))))))..)))))))).......((((.(((((...(((((((......)))))))....)))))))))................................................................................................................................(((((((((..(((((((((..((((((((...(((......)))......))))))))..))....(..((....)))))))))).))))).))))...)))...))))....((((((...((...((((.........))))...))))))))..........[[[[[[.(((..((((((((.(((((....)))))))))))))..)))...[[..))]]...]]]....]]].)))..(((.....((((....))))....)))...]]]..(((((.(((((((..((..(((((((((((((((((....((((........))))........(((((((....(((((........((((((........))))))......)))))...((.((((..(((((((((...(((((((((....)))..((((......))))..)))))).....((((.(((.((((..((((....(((..((((....)))).)))....))))..)))))))..((((((((.....))))))))....))))...)))).)))...).))))))).....)))))))...)).))))))))))...(((((((.....(((.......((..((((....))))..)).....))).....)))))))......(...((((((((........))))))))...).....))))).....((((((((.......))))))))......))...)))))))))).))....((.((.(.((((((((.((.((((((((((((..(((((((((((((((.((((((((((((.....))))))))))))...)))))))))))))))..))))))))))))).)))))))))..).))..))....((((((((((....))))))))))........'
    bg, = forgi.load_rna(s)

    pair_dict = nested_dict(2, list)

    out_dir = '/home/gongjing/project/shape_imputation/data/hek_wc_vivo_rRNA/3.shape/mask_specific_regions/ss_pair'

    # Raw string: "\(" and "\)" are not valid escapes in a normal literal.
    reg_str = r"[\(\)]{" + str(args.min_len) + ",}"
    for n, match in enumerate(re.finditer(reg_str, s)):
        start = match.span()[0]  # 0-based
        end = match.span()[1]  # 0-based, not included

        start_pair = bg.pairing_partner(start + 1)  # 1-based
        end_pair = bg.pairing_partner(end)  # 1-based

        pos_ls = [start, end, start_pair, end_pair]
        if None in pos_ls: continue
        middle = int((max(pos_ls) + min(pos_ls)) / 2)
        mask_len = max(pos_ls) - min(pos_ls)
        if mask_len > 100: continue

        # 100-wide window centred on the run and its partner strand.
        fragment_start = middle - 50
        fragment_end = middle + 50

        if fragment_start < 0: continue
        if fragment_end > dotbracket_len: continue

        print(n, match.start(), match.span(), match.group(), start_pair,
              end_pair, middle, fragment_start, fragment_end)

        # The partner strand must be exactly as long as the run.
        if (end - start) != (abs(start_pair - end_pair) + 1): continue
        # Partner strand as a 0-based, end-exclusive interval.
        pair_start = min(start_pair, end_pair) - 1
        pair_end = max(start_pair, end_pair)

        if end < pair_start:
            gap_start = end
            gap_end = pair_start
        else:
            gap_start = pair_end
            gap_end = start

        # NOTE(review): the gap is always sliced as s[pair_end:start], even
        # when the branch above selected (end, pair_start). In that case the
        # slice is empty and the run is skipped, so only runs located after
        # their partner strand are recorded. Confirm whether this is the
        # intended way to report each stem once.
        gap_dot = s[pair_end:start]
        if len(set(gap_dot)) != 1: continue

        pair_dict[n]['gap_start'] = gap_start
        pair_dict[n]['gap_end'] = gap_end

        pair_dict[n]['stem_start'] = start
        pair_dict[n]['stem_end'] = end
        pair_dict[n]['pair_start'] = pair_start
        pair_dict[n]['pair_end'] = pair_end
        pair_dict[n]['fragment_start'] = fragment_start
        pair_dict[n]['fragment_end'] = fragment_end

        # Validate masking the run itself ...
        generate_mask_region_validate(
            shape_out=args.shape_out,
            tx=args.tx,
            species=args.species,
            mask_start=start,
            mask_end=end,
            fragment_start=fragment_start,
            fragment_end=fragment_end,
            savefn_dir=out_dir,
            plot_gradient=1)
        # ... and masking its partner strand.
        generate_mask_region_validate(
            shape_out=args.shape_out,
            tx=args.tx,
            species=args.species,
            mask_start=pair_start,
            mask_end=pair_end,
            fragment_start=fragment_start,
            fragment_end=fragment_end,
            savefn_dir=out_dir,
            plot_gradient=1)

    pair_df = pd.DataFrame.from_dict(pair_dict, orient='index')
    savefn = out_dir + '/stem_pair.txt'
    pair_df.to_csv(savefn, header=True, index=True, sep='\t')