Ejemplo n.º 1
0
def gene_bin(para, data='YeastSyn'):
    show("""
    Check gene distributions among the bins in heatmap
    """, True)
    map1 = ContactMap()
    assert map1.load(data, False)
    GE = read_gene(para['DataPath']+'/SGD/SGD_features.tab')
    bin_idx = []
    for gid in GE:
        ch, st, ed = GE[gid]
        try:
            idx = map1.choose_map_loc([int(ch)], [int(st)-1])
            for i in idx:
                if i >= 0:
                    bin_idx.append(i)
        except:
            print 'Skip', ch, st, gid
    n = map1.contact_map.shape[0]
    cout = histogram(bin_idx, range(n), False)
    for i in xrange(n):
        show(map1.idx2chr[map1.frag_chr[i]])
        show(map1.frag_sta[i])
        show(map1.frag_end[i])
        show(cout[i])
        show()
Ejemplo n.º 2
0
def run1(name='demo'):
    show('''
    Compare decomposation results during iterations
    ''', True)
    map1 = ContactMap(name)
    assert map1.load()
    map1.decompose_auto()
    n, r = map1.contact_group.shape
    print n, r
    map1.decompose('NND', dim_num=r, A=map1.get_null_map())
    map2 = map1.duplicate()
    map2.decompose(dim_num=r)
    ref = np.array(map2.contact_group)
    show('Iter\tObj.\t# out of %s' % r)
    show('Corr. Mean\tCorr. STD\tMetric\n')
    from contact_map import gini_impurity
    for i in [1, 5, 10, 50] + range(100, 1201, 100):
        show(i)
        map2 = map1.duplicate()
        show(map2.decompose(dim_num=r, max_iter=i, stop_thrd=0))
        ## match dims
        s = []
        v = []
        for j in xrange(r):
            srt, val, pval = map2.test_enrichment(ref[:, j], 'PCC')
            s.append(srt[0])
            v.append(val[srt[0]])
        show(len(set(s)))
        show(mean_std(v))
        show(gini_impurity(np.diag(map2.group_map)))
        show()
Ejemplo n.º 3
0
def bin_count(infile, pdf):
    """Test the locations listed in ``infile`` for cluster enrichment.

    The 'YeastHiC' map is loaded, the locations in ``infile`` are mapped to
    bins and tested with the 'AUC' method.  Summary values are emitted
    through ``show``.  For telomere, tRNA and paralog inputs the membership
    of every location in the significant clusters (p < 0.01) is also written
    to ``<infile>_val.csv``.  ``infile`` is deleted at the end.

    Args:
        infile: Path of the location file; nothing happens if it is missing.
        pdf: Plot target forwarded to ``test_enrichment`` as ``plot``.
    """
    if not os.path.exists(infile): return
    from contact_map import ContactMap
    map1 = ContactMap()
    map1.load('YeastHiC')
    idx, name = map1.get_locations(infile)
    srt, val, pval = map1.test_enrichment(idx, 'AUC', title=infile, plot=pdf, pages=9)
    show(infile)
    show(len(idx))
    show(len(set(idx)))
    show(pval[srt[0]])
    sign = [i for i in srt if pval[i] < 0.01]
    show(sign)
    show()
    if any(tag in infile for tag in ('telomere', 'tRNA', 'paralogs')):
        # ``with`` guarantees the CSV is flushed and closed even if a write
        # or a membership lookup raises part-way through.
        with open(infile+'_val.csv', 'w') as outfile:
            outfile.write('Name,Bin Idx,Membership\n')
            for Name, Idx in zip(name, idx):
                outfile.write('%s,%s'%(Name, Idx))
                for i in sign:
                    outfile.write(',%s'%map1.contact_group[Idx,i])
                outfile.write('\n')
    os.remove(infile)
Ejemplo n.º 4
0
 def __init__(self, exepath, name='DataName', enzyme='Enzyme', sparse=False):
     ContactMap.__init__(self, name, enzyme, sparse)
     path = os.path.abspath(exepath)
     if not os.path.exists(path):
         print 'Please install domaincall to', path
         print 'The project is at https://github.com/kingsfordgroup/armatus'
         exit(0)
     self.exepath = path
Ejemplo n.º 5
0
def one_cell(path, pdf, cell, genome):
    """Run ``one_chr`` once for every chromosome of ``genome``.

    Chromosomes come from ``<path>/<genome>_chr_len.txt`` and are visited
    in ascending order of their index in ``idx2chr``.  The value returned
    by ``one_chr`` is not used.
    """
    map1 = ContactMap()
    map1.genome_info(path + '/%s_chr_len.txt' % genome)
    chr_indices = sorted(map1.idx2chr.keys())
    for chr_index in chr_indices:
        chr_name = map1.idx2chr[chr_index]
        one_chr(path=path, cell=cell, genome=genome, ch=chr_name, pdf=pdf)
Ejemplo n.º 6
0
def run4(name='demo'):
    """Compare clusterings across bin sizes and iteration budgets.

    First part: the map is re-binned at 1x, 2x and 3x its bin size and
    ``decompose_auto`` is run for every candidate dimension; the second
    element of each returned pair is emitted through ``show``.
    Second part: ``decompose_auto`` is repeated on the original map with
    ``max_iter`` of 0, 100, ..., 900 and reported the same way.

    Args:
        name: Name of the stored contact map to load.
    """
    show(
        '''
    Compare the change of clusters under different resolutions and iterations
    ''', True)
    base = ContactMap(name)
    assert base.load()
    dims = [10, 15, 20, 25, 30, 35, 40, 45, 50]
    base.decompose_auto(dim_num=dims)
    # Computed but never reported; kept so the same work is carried out.
    base_membership = np.array(base.contact_group * base.group_map)
    show('')
    show(dims, True)
    for ratio in (1, 2, 3):
        coarse = base.duplicate()
        coarse.get_interactions()
        coarse.create_binnedmap(binsize=base.get_binsize() * ratio)
        coarse.mask_diag()
        pairs = coarse.decompose_auto(dim_num=dims)
        _, vals = zip(*pairs)
        # These results are not reported either; the calls are kept because
        # output_groups()/get_locations() may have effects of their own.
        located_idx, located_val = base.get_locations(coarse.output_groups(),
                                                      st=0, ch=0, po=1)
        coarse_membership = np.array(coarse.contact_group * coarse.group_map)
        show(coarse.get_binsize())
        show(vals, True)

    show(base.contact_group.shape, True)
    show('\n# of Iter.')
    show(dims, True)
    for it in xrange(0, 1000, 100):
        pairs = base.decompose_auto(max_iter=it, update=True, dim_num=dims)
        _, vals = zip(*pairs)
        show(it)
        show(vals, True)
Ejemplo n.º 7
0
def bin_count(infile, pdf):
    """Test the locations listed in ``infile`` for cluster enrichment.

    The 'YeastHiC' map is loaded, the locations in ``infile`` are mapped to
    bins and tested with the 'AUC' method.  Summary values are emitted
    through ``show``.  For telomere, tRNA and paralog inputs the membership
    of every location in the significant clusters (p < 0.01) is also written
    to ``<infile>_val.csv``.  ``infile`` is deleted at the end.

    Args:
        infile: Path of the location file; nothing happens if it is missing.
        pdf: Plot target forwarded to ``test_enrichment`` as ``plot``.
    """
    if not os.path.exists(infile): return
    from contact_map import ContactMap
    map1 = ContactMap()
    map1.load('YeastHiC')
    idx, name = map1.get_locations(infile)
    srt, val, pval = map1.test_enrichment(idx,
                                          'AUC',
                                          title=infile,
                                          plot=pdf,
                                          pages=9)
    show(infile)
    show(len(idx))
    show(len(set(idx)))
    show(pval[srt[0]])
    sign = [i for i in srt if pval[i] < 0.01]
    show(sign)
    show()
    if any(tag in infile for tag in ('telomere', 'tRNA', 'paralogs')):
        # ``with`` guarantees the CSV is flushed and closed even if a write
        # or a membership lookup raises part-way through.
        with open(infile + '_val.csv', 'w') as outfile:
            outfile.write('Name,Bin Idx,Membership\n')
            for Name, Idx in zip(name, idx):
                outfile.write('%s,%s' % (Name, Idx))
                for i in sign:
                    outfile.write(',%s' % map1.contact_group[Idx, i])
                outfile.write('\n')
    os.remove(infile)
Ejemplo n.º 8
0
def run2(name='demo'):
    """Show how the objective value varies over 100 restarts.

    The map is decomposed 100 times (50 iterations each) after
    ``reset_solution``.  A histogram of the objectives plus their mean and
    standard deviation are emitted through ``show``.  Finally the objective
    reached after a 'Null' initialisation is reported together with its
    z-score against the restart distribution.

    Args:
        name: Name of the stored contact map to load.
    """
    show('''
    Show the distribution of objective values
    ''', True)
    map1 = ContactMap(name)
    assert map1.load()
    map1.decompose_auto(max_iter=50)
    n, r = map1.contact_group.shape
    map2 = map1.duplicate()
    objs = []
    for i in xrange(100):
        map2.reset_solution()
        objs.append(map2.decompose(dim_num=r, max_iter=50))
    max_obj = int(max(objs)) + 1
    min_obj = int(min(objs)) - 1
    # About ten bins.  The step must be at least 1: with a spread below 10
    # the integer division gives 0 and range() would raise ValueError.
    step = max(1, (max_obj - min_obj) // 10)
    bins = range(min_obj, max_obj, step)
    show('')
    show(bins, True)
    show('Frequency')
    show(histogram(objs, bins), True)
    show('Mean\tSTD\n')
    m, s = mean_std(objs)
    show([m, s], True)

    map2.decompose('Null', dim_num=r)
    obj1 = map2.decompose(dim_num=r)
    show('Use Null init has obj.')
    show(obj1)
    show('and Z-score is')
    show((obj1 - m) / s, True)
Ejemplo n.º 9
0
def map_contacts(fname):
    """Plot the residue contact map of the first frame of a trajectory.

    The figure is saved next to ``fname`` with the extension replaced by
    ``.png`` and then displayed.

    Args:
        fname: Trajectory or structure file readable by ``md.load``.

    Returns:
        The ``ContactMap`` built from the first frame.
    """
    trajectory = md.load(fname)
    first_frame = trajectory[0]
    contacts = ContactMap(first_frame)

    fig, ax = contacts.residue_contacts.plot(cmap='seismic', vmin=-1, vmax=1)
    plt.xlabel("Residue")
    plt.ylabel("Residue")

    stem, _extension = os.path.splitext(fname)
    plt.savefig(stem + '.png', bbox_inches='tight')
    plt.show()

    return contacts
Ejemplo n.º 10
0
def go_test(para, data='YeastHiC'):
    """Test the gene set of every GO term for cluster enrichment.

    For each term in the merged slim/complex mappings the genes are written
    to a temporary file, mapped to bins and tested with 'AvgCCD'.  Terms with
    at least one cluster at p < 0.01 are reported through ``show``; the total
    number of tested terms is reported last.

    Args:
        para: Mapping that provides ``'DataPath'`` (parent of the SGD tables).
        data: Name of the stored contact map to load.
    """
    show("""
    Check gene groups relating to the same GO term
    """, True)
    sgd_prefix = para['DataPath'] + '/SGD/'
    MP = read_map(sgd_prefix + 'go_slim_mapping.tab')
    CO = read_complex(sgd_prefix + 'go_protein_complex_slim.tab')
    GO = read_go(sgd_prefix + 'go_terms.tab')
    GE = read_gene(sgd_prefix + 'SGD_features.tab')

    # Entries from the complex table override slim entries with the same key.
    go2gene = MP.copy()
    go2gene.update(CO)

    map1 = ContactMap()
    assert map1.load(data)
    map1.output_groups()

    tested = 0
    for raw_term in go2gene:
        go = raw_term.strip()
        fname = save_gogene(go, go2gene, GE)
        idx, names = map1.get_locations(fname, st=1, ch=1, po=2, nm=0)
        os.remove(fname)
        if len(idx) == 0:
            continue
        srt, val, pval = map1.test_enrichment(idx, method='AvgCCD')
        tested += 1
        sign = [i for i in srt if pval[i] < 0.01]
        if not sign:
            continue
        show(go)
        show(len(idx))
        show(GO[int(go.split(':')[1])])
        show(pval[sign[0]])
        show(pval[sign[-1]])
        show(sign)
        show()
    show('We tested %s GO terms for %s.\n'%(tested,data))
Ejemplo n.º 11
0
def chr_detail(path, cell, genome, ch, loci, st=0, ed=None, pdf=None):
    """Decompose one chromosome (region) and plot memberships next to TADs.

    A cached map named ``tad-detail-<ch>-in-<cell>`` is loaded if available;
    otherwise it is built from ``<path>/<cell>/nij/nij.<ch>`` and saved.
    The map is then decomposed, and when ``pdf`` is given several pages are
    written to it.

    Args:
        path: Data directory with ``<genome>_chr_len.txt`` and per-cell data.
        cell: Cell-type sub-directory name.
        genome: Genome name used to locate the chromosome length file.
        ch: Chromosome name.
        loci: Locations passed to ``choose_map_loc``; ``None`` means none.
        st, ed: Region bounds forwarded to ``focus_chromosome``; ``st`` also
            offsets the tick labels.
        pdf: Object with ``savefig()``; plotting is skipped when ``None``.

    Returns:
        The decomposed ``ContactMap``.

    Note:
        Reads the module-level name ``domain`` (not defined in this block)
        as the TAD file name under ``<path>/<cell>/``.
    """
    map1 = ContactMap('tad-detail-%s-in-%s'%(ch,cell))
#    map1.clear()
    if not map1.load():
        # No cached map: build it from the raw matrix on disk.
        map1.genome_info(path+'/%s_chr_len.txt'%genome)
        map1.focus_chromosome(ch)
        map1.create_binnedmap(40e3, lazy=True)
#        map1.contact_map = np.asmatrix(np.loadtxt(path+'/'+cell+'/uij.'+ch))
        map1.contact_map = np.asmatrix(np.loadtxt(path+'/'+cell+'/nij/nij.'+ch))
        print cell, ch, map1.frag_sta.shape[0], map1.contact_map.shape[0]
        # The loaded matrix must have one row per bin of the binned map.
        assert map1.frag_sta.shape[0] == map1.contact_map.shape[0]
        map1.get_sparse_interactions()
        # Narrow to the requested region before rebuilding the contact map.
        map1.focus_chromosome(ch, st=st, ed=ed)
        map1.create_contactmap(throw=0)
        map1.save()
    show(cell)
    show(ch)
    show(map1.contact_map.shape)
    map1.mask_diag()
    map1.mask_short()
    map1.decompose_auto(par_lam=1, beta=3, update=False)
    map1.sort_groups()
#    map1.add_bias_back()
    show(map1.contact_group.shape)
    show()
    if pdf is not None:
        # One page each: raw map, reconstructed map, sub-map view.
        map1.plot_map()
        pdf.savefig(); plt.clf()
        map1.plot_map(map1.contact_group*map1.group_map*map1.contact_group.T, vmin=0.01, title='H*S*H.T')
        pdf.savefig(); plt.clf()
        map1.plot_submap()
        pdf.savefig(); plt.clf()
    # TAD starts and ends are read from the same file (columns 1 and 2);
    # NOTE(review): ``add=-1`` presumably makes the end inclusive - confirm.
    TAD_st, _ = map1.get_locations(path+'/'+cell+'/'+domain, st=0, ch=0, po=1, add=0)
    TAD_ed, _ = map1.get_locations(path+'/'+cell+'/'+domain, st=0, ch=0, po=2, add=-1)
    TAD = zip(TAD_st, TAD_ed)

    # Membership weights, scaled by the mean row sum over non-empty rows.
    W = np.asarray(map1.contact_group * map1.group_map)
    Wsum = W.sum(1)
    W /= Wsum[Wsum>0].mean()
    # Gini impurity per bin; bins with no weight are set to 0.
    gini = 1-np.power(W,2).sum(1)
    gini[Wsum==0] = 0

    if loci is not None:
        loc = map1.choose_map_loc(loci)
    else:
        loc = []
    # Clusters with non-zero weight in at least one of the chosen bins.
    grps = W[loc,:].sum(0) > 0
    map1.output_groups()
    show(loc, True)
    # Only the first 40 bins are drawn in the detail plots below.
    sel = np.arange(0, 40)
#    if pdf is not None:
#        plt.plot(sel, gini[sel], 'k.--')
    for i in xrange(W.shape[1]):
        if grps[i]:
            if pdf is not None:
                plt.plot(sel, W[sel,i], label='C%s'%i)
    # ``tad`` collects TAD lengths in bins; it is not used after this loop.
    tad = []
    for i,j in TAD:
        tad.append(j-i)
        if i in sel and j in sel:
            if pdf is not None:
                # Horizontal bar marking a TAD that lies inside the window.
                plt.plot([i,j], [1.1,1.1], 'k-', linewidth=2)
    if pdf is not None:
        plt.plot(loc, [1]*len(loc), 'r.')
        # Five evenly spaced ticks (integer division under Python 2).
        xt = sel[::(len(sel)/5)]
        plt.xticks(xt, ['%sM'%(X*0.04+st*1e-6) for X in xt])
        plt.ylim([0,1.2])
        plt.xlim([sel.min(), sel.max()])
        pdf.savefig(); plt.clf()
    if pdf is not None:
        # Raw and reconstructed maps restricted to the same window.
        map1.plot_map(map1.contact_map[sel,:][:,sel])
        pdf.savefig(); plt.clf()
        map1.plot_map(map1.contact_group[sel,:]*\
                      map1.group_map*\
                      map1.contact_group[sel,:].T,
                      vmin=0.01, title='H*S*H.T')
        pdf.savefig(); plt.clf()
    return map1
Ejemplo n.º 12
0
def run1(para):
    """Decompose the Duan2010N heatmap and mark three clusters in a PDB model.

    For each of the first three clusters, the model points whose bin has a
    membership above 1 are handed to ``output_pdb``, which is given the
    output name ``Yeast3D-C<i>.pdb``.

    Args:
        para: Mapping that provides ``'DataPath'`` (parent of ``Duan2010N``).
    """
    show('''
    Decompose headmap and show clusters in PDB format
    ''', True)
    path = para['DataPath']+'/Duan2010N'
    map1 = ContactMap('PDBMAP')
    map1.genome_info(path+'/restriction_fragments_mappable_HindIII.txt', i2=3, i3=0)
    for suffix in ('inter', 'intra'):
        map1.add_interactions(path+'/interactions_HindIII_fdr0.01_%s.txt' % suffix)
    map1.create_binnedmap(10000)
    map1.decompose_auto(dims=range(5,51,5))
    map_idx, pdb_idx = map1.get_locations(
        path+'/3d_model_of_yeast_with_genomic_positions.txt',
        st=1, ch=0, po=1, nm=-1)
    H = map1.contact_group
    n_bins, _ = H.shape
    model_file = path+'/3d_model_of_yeast_genome.pdb'
    for cluster in xrange(3):
        # Bins whose membership in this cluster exceeds 1.
        members = set(b for b in xrange(n_bins) if H[b,cluster] > 1)
        mark_idx = [ip for im,ip in zip(map_idx, pdb_idx) if im in members]
        output_pdb('Yeast3D-C%s.pdb'%cluster, model_file, mark_idx)
Ejemplo n.º 13
0

import pandas as pd
import matplotlib.pyplot as plt
import mdtraj as md
from contact_map import ContactMap, ContactFrequency, ContactDifference

# Script: contact map between a toxin segment and a channel cavity for the
# first frame of a PDB pose, plotted with the axes cropped to the residues
# involved.
pdb = 'poses/snx_chanel'
traj = md.load_pdb(pdb + '.pdb')
print(traj)
topology = traj.topology

# Selections by segment name: the toxin and the four cavity segments.
tox = topology.select("segname TOX")
cav = topology.select("segname R1 R2 R3 R4")

# Contacts of the first frame only, restricted to toxin-vs-cavity pairs.
# NOTE(review): cutoff is presumably in nm (mdtraj convention) - confirm.
frame_contacts = ContactMap(traj[0], query=tox, haystack=cav, cutoff=0.35)
#print (frame_contacts.residue_contacts.df)
df = frame_contacts.residue_contacts.df

(fig, ax) = frame_contacts.residue_contacts.plot(cmap='seismic',
                                                 vmin=-1,
                                                 vmax=1)

# The residue_for_atoms_* helpers are defined outside this excerpt.
tox_residues_id = residue_for_atoms_id(tox, traj.topology)
cav_residues_id = residue_for_atoms_id(cav, traj.topology)
tox_residues = residue_for_atoms_name(tox_residues_id, traj.topology)
cav_residues = residue_for_atoms_name(cav_residues_id, traj.topology)
cav_residues_ori = residue_for_atoms_original(cav_residues_id, traj.topology)
# Crop the plot: cavity residue ids on x, toxin residue ids on y.
ax.set_xlim(min(cav_residues_id), max(cav_residues_id) + 1)
ax.set_ylim(min(tox_residues_id), max(tox_residues_id) + 1)
#segment_for_residue(topology)
Ejemplo n.º 14
0
def decompose_dist(pdf, curve, r=None):
    'Decompose the Euc distance matrix on curve'
    from contact_map import ContactMap, EIG, NMF_main
    map1 = ContactMap()
    curve_show(curve)
    pdf.savefig()
    plt.clf()
    verts = np.array(curve)
    map1.plot_map(verts, title="Verteces", log=False)
    pdf.savefig()
    plt.clf()
    from scipy.spatial.distance import pdist, squareform
    ds = squareform(pdist(verts, 'euclidean'))

    ## transform
    #    V = ds.max() - ds
    #    V = ds.max() / (ds + 1)
    V = ds.max() * ((ds + 1)**-2)
    map1.plot_map(V, title="Synthetic Heatmap", log=True)
    pdf.savefig()
    plt.clf()

    #plt.hist(np.reshape(V,(-1)), bins=100, normed=1, facecolor='blue')
    #plt.title('Distribution of map values')
    #pdf.savefig(); plt.clf()
    plt.loglog([(i + 1.0) / V.shape[0] for i in range(V.shape[0])],
               trace_sum(V),
               linestyle='-.')
    plt.title('Distribution of interactions along 1D')
    plt.xlabel('Ratio of linked locations to the total length')
    plt.ylabel('Number of observed links')
    pdf.savefig()
    plt.clf()

    if r == None:
        r = choose_size(pdf, V, 9)
        show('Best number of dimentions is %s\n' % r)
        r = 4
    if False:  ## try PCA
        U = (V - np.mean(V.T, axis=1)).T
        Q, M = EIG(np.cov(U), r)
    else:
        Q, M = EIG(V, r)
    map1.plot_map(Q, title='Eig. Decomp. - Q Matrix', log=False)
    pdf.savefig()
    plt.clf()
    map1.plot_map(M, title='Eig. Decomp. - M Matrix', log=False)
    pdf.savefig()
    plt.clf()
    map1.plot_map(Q * M * Q.T, title='Eig. Decomp. - Recovered', log=False)
    pdf.savefig()
    plt.clf()
    sep_map_show(pdf, verts, Q)

    H, S, obj = NMF_main(V, J='NMF-PoissonManifoldEqual', H=Q, S=M, r=r)
    map1.plot_map(H * S * H.T, title='NMF Decomp. - Recovered', log=False)
    pdf.savefig()
    plt.clf()
    map1.plot_map(H, title='NMF Decomp. - H Matrix', log=False)
    pdf.savefig()
    plt.clf()
    map1.plot_map(S, title='NMF Decomp. - S Matrix', log=False)
    pdf.savefig()
    plt.clf()
    maxp = np.argmax(np.asarray(H), 0)
    srt = np.argsort(maxp)
    sep_map_show(pdf, verts, H[:, srt])

    try:
        from sklearn.cluster import KMeans
        km = KMeans(n_clusters=r)
        H = -np.matrix(km.fit_transform(V))
        S = np.matrix(np.eye(r))
        maxp = np.argmax(np.asarray(H), 0)
        srt = np.argsort(maxp)
        map1.plot_map(H, title='K-means Decomp. - H Matrix', log=False)
        pdf.savefig()
        plt.clf()
        sep_map_show(pdf, verts, H[:, srt])
    except:
        print 'Please install SK-kit to run K-means'
        pass
Ejemplo n.º 15
0
def get_syn_map(para, bin_size=3200, with_bias=True):
    pdf = PdfPages(para['ExeFile'] + 'plot1.pdf')
    ## prepare
    map1 = ContactMap('Syn3D')
    map1.genome_info(para['DataPath'] + '/Tjong2012GR/yeast_chr_len-Tjong.txt')
    map1.create_binnedmap(3200)  ## fixed
    map2 = map1.duplicate()
    map3 = map1.duplicate()

    ## obtain links from PDB
    link_map = np.load('syn_link.npy')
    if with_bias:  ## add random bias
        np.random.seed(0)
        bias = np.random.random(link_map.shape[0])
        link_map *= np.outer(bias, bias)
        print link_map.min(), link_map.max(), link_map.mean()
        link_map = np.floor(link_map)  ## sampling bias
    map1.contact_map = np.matrix(link_map, dtype='float')

    output = open(
        'syn_yeast_map_bin%s%s.txt' % (bin_size, 'bias' if with_bias else ''),
        'w')
    ch = map1.frag_chr
    po = (map1.frag_sta + map1.frag_end) / 2
    for i in xrange(link_map.shape[0]):
        for j in xrange(link_map.shape[1]):
            if link_map[i, j] > 0:
                output.write('%s\t%s\t%s\t%s\t0\t%s\t1e-10\t1e-10\n' %
                             (ch[i], po[i], ch[j], po[j], link_map[i, j]))
        output.write('\n')
    output.close()

    map1.get_interactions()
    map1.create_binnedmap(bin_size)
    map1.mask_diag()
    map1.plot_map(title='Heatmap for the number of links')
    pdf.savefig()
    plt.clf()
    map1.decompose('NND')
    idx, names = map2.get_locations(map1.output_groups(),
                                    st=0,
                                    ch=0,
                                    po=1,
                                    nm=0,
                                    add=0)
    dist_map = np.load('syn_dist.npy')
    dist = dist_map[idx, :][:, idx]
    map1.plot_map(dist, title='Heatmap for the average distances', log=False)
    pdf.savefig()
    plt.clf()
    pdf.close()
    return map1, dist
import matplotlib.pyplot as plt
import mdtraj as md
from contact_map import ContactMap

# Structures to compare; the difference plot below uses the first two.
pdb_list = [ "../pdb_dir_1_500ns/frame0.pdb",
            "../pdb_dir_5001_6000ns/frame4164.pdb"]

# Building the contact maps is slow: expect several minutes in total.
# Each map is kept in ``contacts`` so the difference can be computed
# afterwards (the list was previously never filled, so the subtraction
# below raised NameError).
contacts = []
for i, pdb_path in enumerate(pdb_list):
    pdb = md.load_pdb(pdb_path)
    frame_contacts = ContactMap(pdb[0], cutoff=1.5)
    contacts.append(frame_contacts)
    (fig, ax) = frame_contacts.residue_contacts.plot(cmap='seismic', vmin=-1, vmax=1)
    plt.xlabel("Residue")
    plt.ylabel("Residue")
    fig.savefig(f'cont-map-{i}.pdf', format='pdf', dpi=500)
    plt.close()

# Calculate the difference between two contact maps
diff = contacts[1] - contacts[0]
(fig, ax) = diff.residue_contacts.plot(cmap='seismic', vmin=-1, vmax=1)
plt.xlabel("Residue")
plt.ylabel("Residue")
fig.savefig('cont-map-diff.pdf', format='pdf', dpi=500)
plt.close()
Ejemplo n.º 17
0
def one_region(path, cell, genome, ch, bi, loci, st=0, ed=None, pdf=None):
    if bi.endswith('kb'):
        reso = int(bi.replace('kb', '')) * 1000
    elif bi.endswith('mb'):
        reso = int(bi.replace('mb', '')) * 1000000
    else:
        raise ValueError('Unknow unit %s' % bi)
    map1 = ContactMap('loop-%s-in-%s' % (ch, cell))
    map1.clear()
    if not map1.load():
        map1.genome_info(path + '/%s_chr_len.txt' % genome)
        map1.create_binnedmap(reso, lazy=True)
        map1.focus_chromosome(ch, st=st, ed=ed)
        if True:  ## read files
            norm = []
            infile = open(path + '/' + cell + '/' + bi +
                          '_resolution_intrachromosomal/' + ch + '/MAPQGE30/' +
                          ch + '_' + bi + '.KRnorm')
            for line in infile:
                norm.append(float(line))
            infile.close()
            expect = []
            infile = open(path + '/' + cell + '/' + bi +
                          '_resolution_intrachromosomal/' + ch + '/MAPQGE30/' +
                          ch + '_' + bi + '.KRexpected')
            for line in infile:
                expect.append(float(line))
            infile.close()
            expect.append(1.0)
            print len(norm), len(expect)
            infile = open(
                path + '/' + cell + '/' + bi +
                '_resolution_intrachromosomal/' + ch + '/MAPQGE30/' + ch +
                '_' + bi + '.RAWobserved', 'r', 2 << 9)
            p1 = []
            p2 = []
            val = []
            for line in infile:
                P1, P2, Val = line.split()
                pos1 = int(P1)
                pos2 = int(P2)
                if pos1 < st or (ed is not None and pos1 >= ed):
                    continue
                if pos2 < st or (ed is not None and pos2 >= ed):
                    continue
                p1.append(pos1)
                p2.append(pos2)
                I = pos1 / reso
                J = pos2 / reso
                IJ = abs(pos1 - pos2) / reso
                #                val.append(float(Val))
                val.append(float(Val) / (norm[I] * norm[J]))
#                val.append(float(Val)/(norm[I]*norm[J]*expect[IJ]))
            map1.inter_loc1 = np.array(p1, dtype='int')
            map1.inter_loc2 = np.array(p2, dtype='int')
            map1.inter_freq = np.array(val, dtype='float')
            chidx = map1.chr2idx[ch]
            map1.inter_chr1 = chidx * np.ones(len(p1), dtype='int')
            map1.inter_chr2 = chidx * np.ones(len(p2), dtype='int')
            infile.close()
        map1.create_binnedmap(reso)
        map1.save()
    show(cell)
    show(ch)
    if pdf is not None:
        map1.plot_map()
        pdf.savefig()
        plt.clf()
    map1.decompose_auto()
    map1.sort_groups()
    show(map1.contact_group.shape)
    show()
    bins = map1.choose_map_loc(loci)

    W = np.asarray(map1.contact_group * map1.group_map)
    n, r = W.shape
    wm = W.sum(1)
    W /= np.mean(wm[wm > 0])
    gini = 1 - np.power(W, 2).sum(1)
    gini[wm == 0] = 0

    outfile = open('loop-%s-in-%s_groups.wig' % (ch, cell), 'w')
    #    outfile.write('track type=wiggle_0 name="Overall" description="BNMF" visibility=full autoScale=off viewLimits=800:1000 color=0,0,0 maxHeightPixels=100:50:20 graphType=bar priority=20\nfixedStep chrom='+ch+' start=%d'%st+' step=%d'%reso+' span=%d\n'%reso)
    #    for i in xrange(n):
    #        outfile.write('%d\n'%int(1000*gini[i]))
    jj = []
    ww = 0
    #    for j in xrange(r):
    #        if W[bins,j].max() < 0.1:
    #            continue
    for j in W[bins, :].argmax(1):
        ww += W[:, j]
        outfile.write(
            'track type=wiggle_0 name="C%s' % (j + 1) +
            '" description="BNMF" visibility=full autoScale=off viewLimits=0:200 color=0,0,0 maxHeightPixels=100:50:20 graphType=bar priority=20\nfixedStep chrom='
            + ch + ' start=%d' % st + ' step=%d' % reso + ' span=%d\n' % reso)
        for i in xrange(n):
            outfile.write('%d\n' % int(1000 * W[i, j]))
        jj.append(j)
#    outfile.write('track type=wiggle_0 name="Overall" description="BNMF" visibility=full autoScale=off viewLimits=0:200 color=0,0,0 maxHeightPixels=100:50:20 graphType=bar priority=20\nfixedStep chrom='+ch+' start=%d'%st+' step=%d'%reso+' span=%d\n'%reso)
#    for i in xrange(n):
#        outfile.write('%d\n'%int(1000*ww[i]))
#    outfile.close()

    sel = range(n)
    lab = ['%dk' % ((i * reso + st) / 1000) for i in sel]
    five = np.arange(0, len(sel), len(sel) / 5)
    if pdf is not None:
        map1.plot_map(map1.contact_group * map1.group_map *
                      map1.contact_group.T,
                      log=False)
        pdf.savefig()
        plt.clf()
        map1.plot_map(map1.contact_group[:, jj] *
                      map1.group_map[jj, :][:, jj] *
                      map1.contact_group[:, jj].T,
                      log=False)
        pdf.savefig()
        plt.clf()
        map1.plot_submap()
        pdf.savefig()
        plt.clf()
        plt.plot(sel, gini[sel], '--k')
        for j in jj:
            plt.plot(sel, W[sel, j], '-', label='C%s' % (j + 1))


#        plt.plot(sel, ww[sel], '-', label='Combined')
        plt.plot(bins, [1.1] * len(bins), 'ro')
        plt.legend()
        plt.xticks([sel[j] for j in five], [lab[j] for j in five])
        pdf.savefig()
        plt.clf()
    return
Ejemplo n.º 18
0
def get_syn_map(para, bin_size=3200, with_bias=True):
    pdf = PdfPages(para['ExeFile']+'plot1.pdf')
    ## prepare
    map1 = ContactMap('Syn3D')
    map1.genome_info(para['DataPath']+'/Tjong2012GR/yeast_chr_len-Tjong.txt')
    map1.create_binnedmap(3200) ## fixed
    map2 = map1.duplicate()
    map3 = map1.duplicate()

    ## obtain links from PDB
    link_map = np.load('syn_link.npy')
    if with_bias: ## add random bias
        np.random.seed(0)
        bias = np.random.random(link_map.shape[0])
        link_map *= np.outer(bias, bias)
        print link_map.min(), link_map.max(), link_map.mean()
        link_map = np.floor(link_map) ## sampling bias
    map1.contact_map = np.matrix(link_map, dtype='float')

    output = open('syn_yeast_map_bin%s%s.txt'%(bin_size, 'bias' if with_bias else ''), 'w')
    ch = map1.frag_chr
    po = (map1.frag_sta+map1.frag_end)/2
    for i in xrange(link_map.shape[0]):
        for j in xrange(link_map.shape[1]):
            if link_map[i,j] > 0:
                output.write('%s\t%s\t%s\t%s\t0\t%s\t1e-10\t1e-10\n'%(ch[i], po[i], ch[j], po[j], link_map[i,j]))
        output.write('\n')
    output.close()

    map1.get_interactions()
    map1.create_binnedmap(bin_size)
    map1.mask_diag()
    map1.plot_map(title='Heatmap for the number of links')
    pdf.savefig(); plt.clf();
    map1.decompose('NND')
    idx, names = map2.get_locations(map1.output_groups(), st=0, ch=0, po=1, nm=0, add=0)
    dist_map = np.load('syn_dist.npy')
    dist = dist_map[idx,:][:,idx]
    map1.plot_map(dist, title='Heatmap for the average distances', log=False)
    pdf.savefig(); plt.clf();
    pdf.close()
    return map1, dist
Ejemplo n.º 19
0
def plot2(para):
    """Plot the Duan2010N heatmap and its reconstruction for several lambdas.

    Pages written to ``<ExeFile>plot2.pdf``: the full map, the window of
    bins 200-399, and one reconstruction of that window for each ``par_lam``
    in 0, 0.1, 1 and 10 (55 dimensions each).

    Args:
        para: Mapping that provides ``'ExeFile'`` (prefix of the PDF name).
    """
    pdf = PdfPages(para['ExeFile'] + 'plot2.pdf')

    def finish_page():
        # Write the current figure as a page and start a clean one.
        pdf.savefig()
        plt.clf()

    ## initalization
    map1 = ContactMap('plot2')
    map1.genome_info('../data/yeast_chr_len.txt')
    for kind in ('inter', 'intra'):
        map1.add_interactions(
            '../data/Duan2010N/interactions_HindIII_fdr0.01_%s.txt' % kind)
    map1.create_binnedmap(binsize=10e3)
    map1.mask_diag()
    map1.mask_short()
    map1.mask_low()

    map1.plot_map(map1.contact_map, log=True, vmin=1, vmax=100)
    finish_page()

    window = np.arange(200, 400)
    map1.plot_map(map1.contact_map[window, :][:, window],
                  log=True, vmin=1, vmax=100)
    finish_page()

    for lam in (0, 0.1, 1, 10):
        map1.reset_solution()
        map1.decompose('NMF-PoissonManifoldEqual', dim_num=55, par_lam=lam)
        recovered = map1.contact_group * map1.group_map * map1.contact_group.T
        map1.plot_map(recovered[window, :][:, window],
                      vmin=1, vmax=100, title=str(lam))
        finish_page()
    pdf.close()
Ejemplo n.º 20
0
def run5(name='demo'):
    """Follow clusters while the total number of clusters changes.

    A 30-dimension reference decomposition is computed once.  For every
    dimension count from 5 to 30 a second map is decomposed and matched to
    the reference with ``best_cor``; for each reference cluster the matched
    partner (or an empty string) is emitted through ``show``.

    Args:
        name: Name of the stored contact map to load.
    """
    show(
        '''
    Mapping clusters by changing the number of total clusters 
    ''', True)
    reference = ContactMap(name)
    candidate = ContactMap(name)
    assert reference.load()
    assert candidate.load()
    dims = range(5, 31)
    show('Bin Size\tMetric')
    reference.decompose_auto(dim_num=30)
    full = np.arange(reference.contact_group.shape[1])
    show(full.tolist(), True)
    from contact_map import gini_impurity
    for r in dims:
        show(r)
        candidate.decompose_auto(dim_num=r)
        show(gini_impurity(np.diag(candidate.group_map)))
        pairs = reference.best_cor(candidate, dims=True)
        # First element of each pair -> its partner; later pairs win.
        partner = {}
        for left, right in pairs:
            partner[left] = right
        for cluster in full:
            show(partner.get(cluster, ''))
        show()
Ejemplo n.º 21
0
def one_cell(path, pdf, cell, genome):
    """Aggregate the per-chromosome TAD statistics of one cell type.

    ``one_chr`` is run on every chromosome and its nine results are summed
    or concatenated.  Pages added to ``pdf``: TAD sizes, covered clusters,
    covered TADs, the average score around TAD boundaries, and the Gini and
    entropy distributions.  Summary values are emitted through ``show``.

    Reads the module-level names ``resolution``, ``plot_left`` and
    ``plot_right`` (not defined in this block).
    """
    map1 = ContactMap()
    map1.genome_info(path+'/%s_chr_len.txt'%genome)

    def finish_page():
        # Write the current figure as a page and start a clean one.
        pdf.savefig()
        plt.clf()

    def report_counts(values, title):
        # Show bins and counts, then plot the same distribution as a page.
        bins = range(max(values)+1)
        count = histogram(values, bins, False)
        show(bins, True)
        show(count, True)
        as_array = np.array(values)
        plt.hist(as_array, np.arange(as_array.max()+1))
        plt.title(title)
        finish_page()

    start_sum = 0
    end_sum = 0
    start_num = 0
    end_num = 0
    tadlen = []
    tadtype = []
    grptype = []
    gini = []
    entropy = []
    for chr_index in sorted(map1.idx2chr.keys()):
        info = one_chr(path=path, cell=cell, genome=genome,
                       ch=map1.idx2chr[chr_index], pdf=pdf)
        CCI, CCJ, NI, NJ, TADLEN, TADTYPE, GRPTYPE, GINI, ENTRO = info
        start_sum += CCI
        end_sum += CCJ
        start_num += NI
        end_num += NJ
        tadlen += TADLEN
        tadtype += TADTYPE
        grptype += GRPTYPE
        # Only strictly positive scores enter the distributions.
        gini += GINI[GINI>0].tolist()
        entropy += ENTRO[ENTRO>0].tolist()

    tadlen = np.array(tadlen)
    plt.hist(tadlen*resolution, np.arange(tadlen.max())*resolution)
    plt.title('Distribution of TAD sizes in %s'%cell)
    finish_page()

    report_counts(tadtype, 'Distribution of covered clusters in %s'%cell)
    report_counts(grptype, 'Distribution of covered TADs in %s'%cell)

    # Start and end boundaries pooled into one average profile.
    offsets = np.arange(plot_left, plot_right)
    plt.plot(offsets, (start_sum+end_sum)/(start_num+end_num), '.-k',
             label='TAD boundary')
    plt.xlabel('Genomic distances (kb)')
    plt.ylabel('Gini impurity score')
    plt.xticks(offsets, offsets*resolution)
    plt.xlim([plot_left, plot_right])
    plt.title('Average scores around TAD in %s'%cell)
    plt.legend()
    finish_page()

    show(mean_std(gini))
    plt.hist(gini, np.arange(0,1.001,0.05))
    plt.title('Distribution of Gini impurity scores')
    plt.xlabel('Gini impurity scores')
    plt.ylabel('Frequency')
    finish_page()
    show(mean_std(entropy))

    plt.hist(entropy, np.arange(0,6,0.2))
    plt.title('Distribution of entropy at TAD boundaries')
    plt.xlabel('Entropy')
    plt.ylabel('Frequency')
    finish_page()
Ejemplo n.º 22
0
def one_chr(path, cell, genome, ch, pdf=None):
    """Compare decomposition clusters with annotated TADs on one chromosome.

    Loads the cached contact map of chromosome `ch` in `cell` (building and
    saving it from the `uij.<ch>` matrix when no cache exists), decomposes
    it, and scores every bin by how its weight is spread over the clusters
    (Gini impurity and entropy).  Counts are reported through show().

    Besides its arguments the function reads the names `resolution`,
    `domain`, `plot_left` and `plot_right`, which are defined outside this
    block.

    Args:
        path: Data directory holding `<genome>_chr_len.txt` and one
            sub-directory per cell type.
        cell: Cell type; used as sub-directory name and in the map name.
        genome: Prefix of the chromosome-length file.
        ch: Chromosome name; also the suffix of the `uij.<ch>` matrix file.
        pdf: Optional output document.  Its savefig() is called and it is
            passed to Figure.savefig(); presumably a matplotlib PdfPages.
            When None nothing is plotted.

    Returns:
        Tuple (cci, ccj, ni, nj, tadlen, tadtype, grptype, gini, entropy):
        cci/ccj are the summed score windows around TAD starts/ends (the
        int 0 when no window fitted), ni/nj the number of windows summed,
        tadlen the list of end-start per TAD, tadtype the number of distinct
        group labels inside each TAD, grptype the number of distinct TAD
        labels inside each group, gini the per-bin Gini impurity with -1 on
        masked bins, and entropy the per-bin entropy with 0 on masked bins.
    """
    map1 = ContactMap('tad-%s-in-%s' % (ch, cell))
    if not map1.load():
        ## nothing cached yet: build the map from the raw matrix and save it
        map1.genome_info(path + '/%s_chr_len.txt' % genome)
        map1.focus_chromosome(ch)
        map1.create_binnedmap(40e3, lazy=True)
        map1.contact_map = np.loadtxt(path + '/' + cell + '/uij.' + ch)
        print('%s %s %s %s' % (cell, ch, map1.frag_sta.shape[0],
                               map1.contact_map.shape[0]))
        assert map1.frag_sta.shape[0] == map1.contact_map.shape[0]
        map1.get_sparse_interactions()
        map1.create_binnedmap(resolution * 1000)
        map1.mask_diag()
        map1.mask_short()
        map1.mask_low()
        map1.save()
    show(cell)
    show(ch)
    if pdf is not None:
        map1.plot_map()
        pdf.savefig()
        plt.clf()
    map1.decompose_auto(update=False)
    map1.sort_groups()
    show(map1.contact_group.shape)
    if pdf is not None:
        map1.plot_submap()
        pdf.savefig()
        plt.clf()
    domain_file = path + '/' + cell + '/' + domain
    TAD_st, _ = map1.get_locations(domain_file, st=0, ch=0, po=1, add=0)
    TAD_ed, _ = map1.get_locations(domain_file, st=0, ch=0, po=2, add=-1)
    ## materialised because the pairs are walked several times below
    TAD = list(zip(TAD_st, TAD_ed))

    W = np.asarray(map1.contact_group * map1.group_map)
    wm = W.sum(1)
    masked = wm == 0  ## bins without any weight
    W /= np.mean(wm[wm > 0])
    group = np.argmax(W, 1) + 1
    group[masked] = -1  ## masked regions

    gini = 1 - np.power(W, 2).sum(1)
    gini[masked] = -1  ## masked regions
    with np.errstate(divide='ignore'):
        ## zero weights are expected; their -inf is replaced just below
        log2W = np.log2(W)
    log2W[W == 0] = 0
    entropy = (-W * log2W).sum(1)
    entropy[masked] = 0

    ## Clip a *copy* for plotting.  Clipping `gini` itself would erase the
    ## -1 marker of masked bins, which the counts below and the returned
    ## array rely on.
    score = gini.copy()
    score[score < 0] = 0

    if pdf is not None:
        from matplotlib.colors import LogNorm
        n_bin = W.shape[0]
        ## two 50-bin windows: the second one and the one with the
        ## highest-entropy bin
        for win in [1, np.argmax(entropy) // 50]:
            sel = np.arange(win * 50, min(n_bin, (win + 1) * 50))
            if len(sel) == 0:
                continue  ## chromosome ends before this window
            pos = ['%.fM' % (b * resolution * 1e-3) for b in sel]
            fig = plt.figure()
            axis = fig.add_subplot(211)
            for c in range(W.shape[1]):
                axis.plot(sel, W[sel, c], label='C%s' % c)
            for st, ed in TAD:
                if st in sel and ed in sel:
                    axis.plot([st, ed], [1, 1], 'k-', linewidth=2)
            plt.ylim([0, 1.2])
            step = max(1, len(sel) // 5)  ## never a zero slice step
            plt.xticks(sel[::step], pos[::step])
            axis = fig.add_subplot(212)
            axis.imshow(map1.contact_map[sel, :][:, sel],
                        interpolation='none', norm=LogNorm(),
                        aspect='equal', cmap='OrRd')
            ## NOTE(review): the legend is requested on the heat-map axes,
            ## which hold no labelled artists; the labelled cluster curves
            ## are on the upper axes.  Left as it was - confirm intent.
            axis.legend()
            fig.savefig(pdf, format='pdf')
            plt.clf()

    tad = np.zeros_like(gini)
    tadlen = []
    for st, ed in TAD:
        for k in range(st + 1, ed - 1):
            tad[k] = (st + ed + 1) // 2  ## regions in the domain
        tadlen.append(ed - st)

    tadtype = [len(np.unique(group[tad == t]))
               for t in np.unique(tad) if t > 0]
    grptype = [len(np.unique(tad[group == g]))
               for g in np.unique(group) if g > 0]

    ## bins outside every domain whose Gini score is non-negative
    show(np.sum(np.logical_and(tad == 0, gini >= 0)))  ## TADs
    for cut in [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
        show(np.sum(gini >= cut))  ## clusters
        show(np.sum(np.logical_and(tad == 0, gini >= cut)))  ## TADs % clusters

    cci = 0; ccj = 0  ## scores around the TAD boundaries
    ni = 0; nj = 0
    for st, ed in TAD:
        if st + plot_left >= 0 and st + plot_right < len(score):
            cci += score[(st + plot_left):(st + plot_right)]
            ni += 1
        if ed + plot_left >= 0 and ed + plot_right < len(score):
            ccj += score[(ed + plot_left):(ed + plot_right)]
            nj += 1
    show()
    return cci, ccj, ni, nj, tadlen, tadtype, grptype, gini, entropy
Ejemplo n.º 23
0
def decompose_dist(pdf, curve, r=None):
    """Decompose a synthetic heat map built from distances along `curve`.

    The pairwise Euclidean distances `ds` between the vertices are turned
    into a contact-like matrix V = max(ds) * (ds + 1)**-2.  V is then
    factorised by eigen-decomposition (EIG), by NMF started from the EIG
    factors, and - when scikit-learn can be imported - by K-means.  Every
    figure is appended to `pdf`.

    Args:
        pdf: Output document; its savefig() is called here and it is also
            handed to choose_size() and sep_map_show().
        curve: Vertex coordinates; converted with np.array() and passed to
            scipy's pdist().
        r: Number of components.  When None, choose_size() is run and its
            answer reported, but the decomposition then uses r = 4.
    """
    from contact_map import ContactMap, EIG, NMF_main
    map1 = ContactMap()
    curve_show(curve)
    pdf.savefig(); plt.clf()
    verts = np.array(curve)
    map1.plot_map(verts, title = "Verteces", log=False)
    pdf.savefig(); plt.clf()
    from scipy.spatial.distance import pdist, squareform
    ds = squareform(pdist(verts, 'euclidean'))

    ## transform distances into interaction-like values
    V = ds.max() * ((ds+1)**-2)
    map1.plot_map(V, title="Synthetic Heatmap", log=True)
    pdf.savefig(); plt.clf()

    n = V.shape[0]
    plt.loglog((np.arange(n) + 1.0) / n, trace_sum(V), linestyle='-.')
    plt.title('Distribution of interactions along 1D')
    plt.xlabel('Ratio of linked locations to the total length')
    plt.ylabel('Number of observed links')
    pdf.savefig(); plt.clf()

    if r is None:
        r = choose_size(pdf, V, 9)
        show('Best number of dimentions is %s\n'%r)
        ## NOTE(review): the estimate is only reported; the rank is then
        ## forced to 4.  Kept as found - confirm this override is wanted.
        r = 4
    ## A PCA variant (EIG on the covariance of the centred V) used to sit
    ## behind an `if False` here; only the direct decomposition ever ran.
    Q, M = EIG(V, r)
    map1.plot_map(Q, title = 'Eig. Decomp. - Q Matrix', log=False)
    pdf.savefig(); plt.clf()
    map1.plot_map(M, title = 'Eig. Decomp. - M Matrix', log=False)
    pdf.savefig(); plt.clf()
    map1.plot_map(Q*M*Q.T, title = 'Eig. Decomp. - Recovered', log=False)
    pdf.savefig(); plt.clf()
    sep_map_show(pdf, verts, Q)

    ## NMF, initialised with the eigen factors
    H, S, obj = NMF_main(V, J='NMF-PoissonManifoldEqual', H=Q, S=M, r=r)
    map1.plot_map(H*S*H.T, title = 'NMF Decomp. - Recovered', log=False)
    pdf.savefig(); plt.clf()
    map1.plot_map(H, title = 'NMF Decomp. - H Matrix', log=False)
    pdf.savefig(); plt.clf()
    map1.plot_map(S, title = 'NMF Decomp. - S Matrix', log=False)
    pdf.savefig(); plt.clf()
    ## order the components by the position of their largest entry
    maxp = np.argmax(np.asarray(H),0)
    srt = np.argsort(maxp)
    sep_map_show(pdf, verts, H[:,srt])

    ## K-means is optional.  Only a missing scikit-learn is tolerated; any
    ## other failure propagates instead of being reported as a missing
    ## package.
    try:
        from sklearn.cluster import KMeans
    except ImportError:
        print('Please install SK-kit to run K-means')
        return
    km = KMeans(n_clusters=r)
    ## negated distances to the cluster centres, so the nearest is largest
    H = -np.matrix(km.fit_transform(V))
    maxp = np.argmax(np.asarray(H),0)
    srt = np.argsort(maxp)
    map1.plot_map(H, title = 'K-means Decomp. - H Matrix', log=False)
    pdf.savefig(); plt.clf()
    sep_map_show(pdf, verts, H[:,srt])
Ejemplo n.º 24
0
def plot2(para):
    """Write plot2.pdf: the binned map and its reconstructions per lambda.

    Builds a 10kb binned map from the two Duan2010N interaction files,
    plots the whole map and bins 200-399, then decomposes the map once
    for each `par_lam` value and plots the reconstruction of the same
    window.  The file name is para['ExeFile'] followed by 'plot2.pdf'.
    """
    book = PdfPages(para['ExeFile']+'plot2.pdf')

    def next_page():
        ## store the current figure as a page and wipe it
        book.savefig()
        plt.clf()

    ## build the map from scratch on every call
    hic = ContactMap('plot2')
    hic.genome_info('../data/yeast_chr_len.txt')
    for source in ('../data/Duan2010N/interactions_HindIII_fdr0.01_inter.txt',
                   '../data/Duan2010N/interactions_HindIII_fdr0.01_intra.txt'):
        hic.add_interactions(source)
    hic.create_binnedmap(binsize=10e3)
    hic.mask_diag()
    hic.mask_short()
    hic.mask_low()

    hic.plot_map(hic.contact_map, log=True, vmin=1, vmax=100)
    next_page()
    window = np.arange(200,400)
    hic.plot_map(hic.contact_map[window,:][:,window], log=True, vmin=1, vmax=100)
    next_page()

    for lam in [0, 0.1, 1, 10]:
        hic.reset_solution()
        hic.decompose('NMF-PoissonManifoldEqual', dim_num=55, par_lam=lam)
        rebuilt = hic.contact_group * hic.group_map * hic.contact_group.T
        hic.plot_map(rebuilt[window,:][:,window], vmin=1, vmax=100, title=str(lam))
        next_page()
    book.close()
Ejemplo n.º 25
0
def plot1(para):
    """Write plot1.pdf: the yeast map, its factors and the reconstruction.

    The map named 'plot1' is cleared, loaded or rebuilt at 20kb bins from
    the two Duan2010N interaction files, decomposed and saved.  The pages
    then show X, the bias B, the factors H and S, W = S H^T, R = H S H^T,
    three single-cluster maps, the bias outer product, and R once more
    after add_bias_back().  The file name is para['ExeFile'] followed by
    'plot1.pdf'.
    """
    report = PdfPages(para['ExeFile']+'plot1.pdf')

    def next_page():
        ## store the current figure as a page and wipe it
        report.savefig()
        plt.clf()

    hic = ContactMap('plot1')
    hic.clear()
    if not hic.load():
        ## read chromosome sizes, then bin and mask the interactions
        hic.genome_info('../data/yeast_chr_len.txt')
        for source in ['../data/Duan2010N/interactions_HindIII_fdr0.01_inter.txt',
                       '../data/Duan2010N/interactions_HindIII_fdr0.01_intra.txt']:
            hic.add_interactions(source)
        hic.create_binnedmap(binsize=20e3)
        hic.mask_diag()
        hic.mask_short()
        hic.mask_low()

    hic.decompose_auto(plot=report)
    hic.sort_groups()
    hic.save()

    hic.plot_map(vmin=1, vmax=1000, title='$X$')
    next_page()
    hic.plot_map(np.diag(hic.bias_vector), log=False, title='$B$')
    next_page()
    hic.plot_map(hic.contact_group, log=False, title='$H$')
    next_page()
    hic.plot_map(hic.group_map, log=False, title='$S$')
    next_page()
    weights = hic.group_map * hic.contact_group.T
    hic.plot_map(weights, log=False, title='$W=SH^T$')
    next_page()
    rebuilt = hic.contact_group * hic.group_map * hic.contact_group.T
    hic.plot_map(rebuilt, vmin=1, vmax=1000, title='$R=HSH^T$')
    next_page()

    hic.label_groups(plot=report)
    r = hic.contact_group.shape[1]
    ## first and the last two clusters, each on its own page
    for k in [0,r-2,r-1]:
        single = hic.contact_group[:,k] * hic.contact_group[:,k].T
        hic.plot_map(single, vmin=1, title=str(k+1))
        next_page()

    hic.plot_map(np.outer(hic.bias_vector, hic.bias_vector), log=False)
    next_page()
    hic.add_bias_back()
    rebuilt = hic.contact_group * hic.group_map * hic.contact_group.T
    hic.plot_map(rebuilt, vmin=1, vmax=1000, title='$R=HSH^T$')
    next_page()
    report.close()
Ejemplo n.º 26
0
def run3(para, name='demo'):
    """Track the NMF objective against intra-cluster distances.

    A map is built from 'syn_link.npy', its cluster number chosen with
    decompose_auto(), and two copies are refined in 100 rounds of 20
    iterations: one started from an 'NND' decomposition, one only reset.
    After each round the pairwise distances (from 'syn_dist.npy') among
    the above-average members of every cluster are averaged.  Objective
    curves, the objective/distance scatter, their correlations and both
    sub-maps go to para['ExeFile'] + 'plot.pdf'.  `name` is not used.
    """
    show(
        '''
    Compare objective values in NMF and average distances in PDB.
    ''', True)
    book = PdfPages(para['ExeFile'] + 'plot.pdf')
    nnd_map = ContactMap('Syn3D')
    nnd_map.genome_info(para['DataPath'] + '/Tjong2012GR/yeast_chr_len-Tjong.txt')
    nnd_map.create_binnedmap(3200)
    fine_map = nnd_map.duplicate()  ## keeps the 3200 binning for lookups
    nnd_map.contact_map = np.matrix(np.load('syn_link.npy'), dtype='float')
    nnd_map.get_interactions()
    nnd_map.create_binnedmap(32000)
    nnd_map.mask_diag()
    r = nnd_map.decompose_auto()[-1][0]
    plain_map = nnd_map.duplicate()
    show(r)
    show('is the selected cluster number\n')
    print(nnd_map.contact_map.shape)
    idx, _ = fine_map.get_locations(nnd_map.output_groups(),
                                    st=0, ch=0, po=1, nm=0, add=0)
    print(len(idx))
    dist_map = np.load('syn_dist.npy')
    show(dist_map.mean())
    show('is the average of all bins\n')
    dist = dist_map[idx, :][:, idx]  ## distance among bins
    show(dist.mean())
    show('is the average of selected bins\n')

    def cluster_pairs(model, col):
        ## pairwise distances among the bins whose weight in cluster
        ## `col` exceeds the column mean
        members = np.asarray(model.contact_group)[:, col] > float(
            model.contact_group[:, col].mean())
        sub = dist[members, :][:, members]
        return sub[np.triu_indices(sub.shape[0], k=1)].tolist()

    iterations = []
    nnd_objs, nnd_avgs = [], []
    ## collected for the un-initialised copy as well, though not plotted
    plain_objs, plain_avgs = [], []
    show(
        '\tObjective function values\tAverage intra-cluster distances\tcase2\n'
    )
    nnd_map.reset_solution()
    nnd_map.decompose('NND', dim_num=r)
    plain_map.reset_solution()
    step = 20
    for done in range(step, 100 * step + 1, step):
        show(done)
        iterations.append(done)
        nnd_obj = nnd_map.decompose(dim_num=r, par_lam=1, max_iter=step,
                                    stop_thrd=0)
        plain_obj = plain_map.decompose(dim_num=r, par_lam=1, max_iter=step,
                                        stop_thrd=0)
        nnd_map.sort_groups('diagnal')
        show(nnd_obj)
        nnd_d = []
        plain_d = []
        for col in range(r):
            nnd_d += cluster_pairs(nnd_map, col)
            plain_d += cluster_pairs(plain_map, col)
        show(mean_std(nnd_d)[0])
        show(mean_std(plain_d)[0])
        show()
        nnd_objs.append(nnd_obj)
        plain_objs.append(plain_obj)
        nnd_avgs.append(mean_std(nnd_d)[0])
        plain_avgs.append(mean_std(plain_d)[0])

    plt.plot(iterations, nnd_objs, 'r-', label='NNDSVD Initialization')
    plt.legend()
    plt.xlabel('Number of iterations')
    plt.ylabel('Objective function values for NMF')
    book.savefig()
    plt.clf()

    plt.plot(nnd_objs, nnd_avgs, 'r.', label='NNDSVD Initialization')
    plt.legend()
    plt.xlabel('Objective function values for NMF')
    plt.ylabel('Average intra cluster distances (nm)')
    book.savefig()
    plt.clf()

    show('\nCorrelation of objective with the average distances\n')
    show('Pearson Coef.')
    show(correlation(nnd_objs, nnd_avgs), True)
    show('Spearman Rank Coef.')
    show(correlation(nnd_objs, nnd_avgs, rank=True), True)
    for model in (nnd_map, plain_map):
        model.plot_submap()
        book.savefig()
        plt.clf()
    book.close()
Ejemplo n.º 27
0
def plot1(para):
    """Render the 'plot1' yeast map and its decomposition into plot1.pdf.

    Clears the map, loads it or rebuilds it at 20kb bins from the two
    Duan2010N interaction files, runs decompose_auto(), sorts the groups
    and saves.  One page each is written for X, B, H, S, W = S H^T and
    R = H S H^T, then for three single clusters, the bias outer product,
    and R again after add_bias_back().  The output path is
    para['ExeFile'] followed by 'plot1.pdf'.
    """
    pages = PdfPages(para['ExeFile'] + 'plot1.pdf')
    ymap = ContactMap('plot1')

    def page(*args, **kwargs):
        ## draw through plot_map(), store the figure as a page, wipe it
        ymap.plot_map(*args, **kwargs)
        pages.savefig()
        plt.clf()

    ymap.clear()
    if not ymap.load():
        ## read chromosome sizes, then bin and mask the interactions
        ymap.genome_info('../data/yeast_chr_len.txt')
        ymap.add_interactions(
            '../data/Duan2010N/interactions_HindIII_fdr0.01_inter.txt')
        ymap.add_interactions(
            '../data/Duan2010N/interactions_HindIII_fdr0.01_intra.txt')
        ymap.create_binnedmap(binsize=20e3)
        ymap.mask_diag()
        ymap.mask_short()
        ymap.mask_low()

    ymap.decompose_auto(plot=pages)
    ymap.sort_groups()
    ymap.save()

    page(vmin=1, vmax=1000, title='$X$')
    page(np.diag(ymap.bias_vector), log=False, title='$B$')
    page(ymap.contact_group, log=False, title='$H$')
    page(ymap.group_map, log=False, title='$S$')
    page(ymap.group_map * ymap.contact_group.T, log=False, title='$W=SH^T$')
    page(ymap.contact_group * ymap.group_map * ymap.contact_group.T,
         vmin=1, vmax=1000, title='$R=HSH^T$')

    ymap.label_groups(plot=pages)
    r = ymap.contact_group.shape[1]
    ## first and the last two clusters
    for k in (0, r - 2, r - 1):
        page(ymap.contact_group[:, k] * ymap.contact_group[:, k].T,
             vmin=1, title=str(k + 1))

    page(np.outer(ymap.bias_vector, ymap.bias_vector), log=False)
    ymap.add_bias_back()
    page(ymap.contact_group * ymap.group_map * ymap.contact_group.T,
         vmin=1, vmax=1000, title='$R=HSH^T$')
    pages.close()