Example #1
0
def full_mds(path1,
             path2,
             alpha=4,
             penalty=0.05,
             num_threads=3,
             weight=0.05,
             prefix=""):
    """Jointly infer two structures without partitioning and write results.

    A structure and a contact matrix are loaded from each BED path, the two
    structures are made compatible, and ``infer_structures`` is run on the
    pair. Three files are then written, each name starting with ``prefix``:
    one ``<name>_structure.tsv`` per input and one
    ``<name1>_<name2>_relocalization.bed`` with one line per genomic
    coordinate of the first structure (chromosome name, start, start + res,
    value from ``la.calculate_distances``). Finally the result of
    ``calculate_compartment_fraction`` is printed.

    Args:
        path1: path of the first input BED file.
        path2: path of the second input BED file.
        alpha, penalty, num_threads, weight: forwarded unchanged to
            ``infer_structures``.
        prefix: string prepended to every output file name.

    Returns:
        The tuple ``(structure1, structure2)``.
    """
    structure1 = dt.structureFromBed(path1)
    structure2 = dt.structureFromBed(path2)
    dt.make_compatible((structure1, structure2))
    contactMat1 = dt.matFromBed(path1, structure1)
    contactMat2 = dt.matFromBed(path2, structure2)
    infer_structures(contactMat1, structure1, contactMat2, structure2, alpha,
                     penalty, num_threads, weight)

    # output names are derived from the input file names without extension
    prefix1 = os.path.splitext(os.path.basename(path1))[0]
    structure1.write("{}{}_structure.tsv".format(prefix, prefix1))
    prefix2 = os.path.splitext(os.path.basename(path2))[0]
    structure2.write("{}{}_structure.tsv".format(prefix, prefix2))

    dists = la.calculate_distances(structure1, structure2)
    chrom_name = structure1.chrom.name
    res = structure1.chrom.res
    # the with-statement closes the file; no explicit close() is needed
    with open("{}{}_{}_relocalization.bed".format(prefix, prefix1, prefix2),
              "w") as out:
        for gen_coord, dist in zip(structure1.getGenCoords(), dists):
            fields = (chrom_name, str(gen_coord), str(gen_coord + res),
                      str(dist))
            out.write("\t".join(fields) + "\n")

    print("Fractional compartment change: ")
    print(calculate_compartment_fraction(structure1, structure2, path1, path2))

    return structure1, structure2
Example #2
0
def fullMDS(path1, path2, alpha, penalty, num_threads, weight):
    """Run joint structure inference on two whole datasets (no partitioning).

    Both structures are loaded, made compatible with each other, and then
    passed together with their contact matrices to ``infer_structures``.
    Returns the pair of structures.
    """
    bed_paths = (path1, path2)
    structures = tuple(dt.structureFromBed(bed) for bed in bed_paths)
    dt.make_compatible(structures)
    first, second = structures
    first_contacts, second_contacts = [
        dt.matFromBed(bed, struct)
        for bed, struct in zip(bed_paths, structures)
    ]
    infer_structures(first_contacts, first, second_contacts, second, alpha,
                     penalty, num_threads, weight)
    return first, second
Example #3
0
def fullMDS(path, classical, alpha, num_threads, weight):
    """Infer a single structure from the whole BED file (no partitioning).

    The structure and its contact matrix are read from ``path`` and handed
    to ``infer_structure``; the structure object is returned.
    """
    whole = dt.structureFromBed(path)
    contacts = dt.matFromBed(path, whole)
    infer_structure(contacts, whole, alpha, num_threads, weight, classical)
    return whole
Example #4
0
def fullMDS(path, classical=False):
    """Infer a single cluster from the whole BED file (no partitioning).

    Reads the cluster and its contact matrix from ``path``, runs
    ``infer_cluster`` on them and returns the cluster.
    """
    whole = dt.clusterFromBed(path, None, None)
    contacts = dt.matFromBed(path, whole)
    # The converted matrix is not used below; the call itself is retained
    # so the sequence of operations on `contacts` is unchanged.
    _dists = at.contactToDist(contacts)
    infer_cluster(contacts, whole, classical)
    return whole
Example #5
0
def calculate_compartment_fraction(structure1, structure2, path1, path2):
    """Return the share of normalised displacement along the third fitted axis.

    Compartment scores are computed for both datasets (the second set is
    negated when the two are negatively correlated). A linear SVR is fitted
    from the pooled coordinates to the pooled scores, and both coordinate
    sets are passed with the fitted coefficients to
    ``la.change_coordinate_system``. For each of the three resulting axes
    the mean absolute difference between the two structures is divided by
    the mean absolute deviation from the centroid (averaged over both
    structures). The return value is ``z / (x + y + z)`` of those ratios.
    """
    # compartment scores
    mat_a = dt.matFromBed(path1, structure1)
    mat_b = dt.matFromBed(path2, structure2)
    scores_a = np.array(ca.get_compartments(mat_a))
    scores_b = np.array(ca.get_compartments(mat_b))

    corr, _pval = st.pearsonr(scores_a, scores_b)
    if corr < 0:
        scores_b = -scores_b

    # support vector regression from coordinates to compartment score
    xyz_a = structure1.getCoords()
    xyz_b = structure2.getCoords()
    pooled_xyz = np.concatenate((xyz_a, xyz_b))
    pooled_scores = np.concatenate((scores_a, scores_b))
    regressor = svm.LinearSVR()
    regressor.fit(pooled_xyz, pooled_scores)
    axis = regressor.coef_

    moved_a = np.array(la.change_coordinate_system(axis, xyz_a))
    moved_b = np.array(la.change_coordinate_system(axis, xyz_b))

    centre_a = np.mean(moved_a, axis=0)
    centre_b = np.mean(moved_b, axis=0)

    ratios = []
    for k in range(3):
        shift = moved_a[:, k] - moved_b[:, k]
        # mean absolute deviation from the centroid along axis k
        spread_a = np.mean([np.abs(pt[k] - centre_a[k]) for pt in moved_a])
        spread_b = np.mean([np.abs(pt[k] - centre_b[k]) for pt in moved_b])
        spread = np.mean((spread_a, spread_b))
        ratios.append(np.mean(np.abs(shift)) / spread)

    x_ratio, y_ratio, z_ratio = ratios
    return z_ratio / (x_ratio + y_ratio + z_ratio)
from matplotlib import pyplot as plt
import sys
sys.path.append("..")
import compartment_analysis as ca
import data_tools as dt
import os

# every command-line argument is an input BED path
paths = sys.argv[1:]
prefixes = list(map(os.path.basename, paths))
structs = list(map(dt.structureFromBed, paths))
mats = list(map(dt.matFromBed, paths, structs))
all_comps = list(map(ca.get_compartments, mats))
all_gen_coords = [struct.getGenCoords() for struct in structs]

# optional manual sign flip of the last track (disabled):
#all_comps[len(all_comps)-1] = -all_comps[len(all_comps)-1]

# one labelled curve per input file
for index, prefix in enumerate(prefixes):
    plt.plot(all_gen_coords[index], all_comps[index], label=prefix)

plt.legend()
plt.show()
Example #7
0
    path2 = "hic_data/{}_{}_{}kb.bed".format(cell_type2, chrom, res_kb)

    # only process the pair when both input BED files exist
    if os.path.isfile(path1) and os.path.isfile(path2):
        # run the external multimds script on the pair through the shell;
        # the structure files loaded below are presumably its output
        # (TODO confirm the output naming against multimds.py)
        os.system("python ~/git/multimds/multimds.py --full {} {}".format(
            path1, path2))
        structure1 = dt.structure_from_file(
            "hic_data/{}_{}_{}kb_structure.tsv".format(cell_type1, chrom,
                                                       res_kb))
        structure2 = dt.structure_from_file(
            "hic_data/{}_{}_{}kb_structure.tsv".format(cell_type2, chrom,
                                                       res_kb))

        #plot.plot_structures_interactive((structure1, structure2))

        #compartments
        contacts1 = dt.matFromBed(path1, structure1)
        contacts2 = dt.matFromBed(path2, structure2)

        at.makeSymmetric(contacts1)
        at.makeSymmetric(contacts2)

        compartments1 = np.array(ca.get_compartments(contacts1))
        compartments2 = np.array(ca.get_compartments(contacts2))

        # flip the second track when the two are negatively correlated
        r, p = st.pearsonr(compartments1, compartments2)
        if r < 0:
            compartments2 = -compartments2

        #SVR
        coords1 = structure1.getCoords()
        coords2 = structure2.getCoords()
Example #8
0
# command line: <cell_type1> <cell_type2> <resolution in kb>
cell_type1 = sys.argv[1]
cell_type2 = sys.argv[2]
res_kb = int(sys.argv[3])

# previously written structure files for the two cell types
struct1 = dt.structure_from_file("{}_21_{}kb_structure.tsv".format(
    cell_type1, res_kb))
struct2 = dt.structure_from_file("{}_21_{}kb_structure.tsv".format(
    cell_type2, res_kb))
gen_coords = np.array(struct1.getGenCoords())
# one distance per pair of corresponding coordinates in the two structures
dists = np.array([
    la.calcDistance(coord1, coord2)
    for coord1, coord2 in zip(struct1.getCoords(), struct2.getCoords())
])

mat1 = dt.matFromBed("hic_data/{}_21_{}kb.bed".format(cell_type1, res_kb),
                     struct1)
comps1 = ca.get_compartments(mat1, struct1)
mat2 = dt.matFromBed("hic_data/{}_21_{}kb.bed".format(cell_type2, res_kb),
                     struct2)
comps2 = ca.get_compartments(mat2, struct2)

# flip the first track when the two are negatively correlated
# NOTE(review): the negation and subtraction below need comps1/comps2 to
# support array arithmetic -- confirm the return type of get_compartments
r, p = st.pearsonr(comps1, comps2)
if r < 0:
    comps1 = -comps1

comp_diffs = np.abs(comps1 - comps2)

plt.subplot2grid((10, 10), (0, 0), 9, 10, frameon=False)
plt.plot(gen_coords,
         dists / max(dists),
         lw=2,
Example #9
0
def partitioned_mds(path1,
                    path2,
                    prefix="",
                    centromere=0,
                    num_partitions=4,
                    maxmemory=32000000,
                    num_threads=3,
                    alpha=4,
                    res_ratio=10,
                    penalty=0.05,
                    weight=0.05):
    """Partitions structure into substructures and performs MDS

    Two low-resolution structures are built from the inputs and split into
    ``num_partitions`` index ranges (half on each side of a midpoint).
    High-resolution substructures covering the same genomic ranges are
    created, joint inference is run on the low-resolution pair, and then on
    each high-resolution substructure pair in parallel. Each high-resolution
    substructure is passed with its low-resolution counterpart to
    ``transform``.

    Args:
        path1, path2: paths of the two input BED files.
        prefix: not referenced in this function body.
        centromere: when 0 the midpoint is ``n / 2``; otherwise it is passed
            to ``chrom.getAbsoluteIndex`` to obtain the midpoint.
        num_partitions: number of partitions; must be even (asserted).
        maxmemory: not referenced in this function body.
        num_threads: requested thread count; capped by the CPU count and the
            number of substructures before the parallel section.
        alpha: passed to ``infer_structures`` for the low-resolution pair
            only; the high-resolution calls use the literal 2.5.
        res_ratio: passed to ``create_low_res_structure`` and ``transform``.
        penalty, weight: forwarded to every ``infer_structures`` call.

    Returns:
        The tuple ``(highstructure1, highstructure2)``.
    """
    #create low-res structures
    lowstructure1 = create_low_res_structure(path1, res_ratio)
    lowstructure2 = create_low_res_structure(path2, res_ratio)
    dt.make_compatible((lowstructure1, lowstructure2))

    #get partitions
    n = len(lowstructure1.getPoints())
    if centromere == 0:
        midpoint = int(n / 2)
    else:
        midpoint = lowstructure1.chrom.getAbsoluteIndex(centromere)

    assert num_partitions % 2 == 0

    # each side of the midpoint is divided into num_partitions / 2 ranges
    partition_size1 = int(np.ceil(float(midpoint) / (num_partitions / 2)))
    partition_size2 = int(np.ceil(float(n - midpoint) / (num_partitions / 2)))

    lowpartitions = [
    ]  #low substructures, defined on absolute indices not relative indices

    # ranges before the midpoint, clipped at the midpoint
    for i in range(int(num_partitions / 2)):
        lowpartitions.append(
            (i * partition_size1, min(((i + 1) * partition_size1), midpoint)))

    # ranges from the midpoint onward, clipped at the last index
    for i in range(int(num_partitions / 2)):
        lowpartitions.append((midpoint + i * partition_size2,
                              min((midpoint + (i + 1) * partition_size2),
                                  n - 1)))

    lowpartitions = np.array(lowpartitions)

    low_contactMat1 = dt.matFromBed(path1, lowstructure1)
    low_contactMat2 = dt.matFromBed(path2, lowstructure2)

    tad.substructuresFromAbsoluteTads(lowstructure1, lowpartitions)
    tad.substructuresFromAbsoluteTads(lowstructure2, lowpartitions)

    #create high-res chroms
    # same position range and name as the low-res chrom, but with the
    # resolution and size read from the input file
    size1, res1 = dt.basicParamsFromBed(path1)
    highChrom1 = dt.ChromParameters(lowstructure1.chrom.minPos,
                                    lowstructure1.chrom.maxPos, res1,
                                    lowstructure1.chrom.name, size1)
    size2, res2 = dt.basicParamsFromBed(path2)
    highChrom2 = dt.ChromParameters(lowstructure2.chrom.minPos,
                                    lowstructure2.chrom.maxPos, res2,
                                    lowstructure2.chrom.name, size2)

    #initialize high-res substructures
    high_substructures1 = []
    high_substructures2 = []
    low_gen_coords = lowstructure1.getGenCoords()
    offset1 = 0  #initialize
    offset2 = 0
    for partition in lowpartitions:
        # genomic coordinates bounding this partition in the low-res structure
        start_gen_coord = low_gen_coords[partition[0]]
        end_gen_coord = low_gen_coords[partition[1]]
        high_substructure1 = dt.structureFromBed(path1, highChrom1,
                                                 start_gen_coord,
                                                 end_gen_coord, offset1)
        high_substructure2 = dt.structureFromBed(path2, highChrom2,
                                                 start_gen_coord,
                                                 end_gen_coord, offset2)
        high_substructures1.append(high_substructure1)
        high_substructures2.append(high_substructure2)
        # advance by one less than the point count
        # (presumably consecutive substructures share a boundary point -- TODO confirm)
        offset1 += (len(high_substructure1.points) - 1)  #update
        offset2 += (len(high_substructure2.points) - 1)  #update

    for high_substructure1, high_substructure2 in zip(high_substructures1,
                                                      high_substructures2):
        dt.make_points_compatible((high_substructure1, high_substructure2))

    highstructure1 = dt.Structure([], high_substructures1, highChrom1, 0)
    highstructure2 = dt.Structure([], high_substructures2, highChrom2, 0)

    infer_structures(low_contactMat1, lowstructure1, low_contactMat2,
                     lowstructure2, alpha, penalty, num_threads, weight)
    print("Low-resolution MDS complete")

    # pymp shared lists so results written inside the parallel section are
    # visible after it ends
    highSubstructures1 = pymp.shared.list(highstructure1.structures)
    highSubstructures2 = pymp.shared.list(highstructure2.structures)
    lowSubstructures1 = pymp.shared.list(lowstructure1.structures)
    lowSubstructures2 = pymp.shared.list(lowstructure2.structures)

    numSubstructures = len(highstructure1.structures)
    num_threads = min(
        (num_threads, mp.cpu_count(), numSubstructures)
    )  #don't exceed number of requested threads, available threads, or structures
    with pymp.Parallel(num_threads) as p:
        for substructurenum in p.range(numSubstructures):
            highSubstructure1 = highSubstructures1[substructurenum]
            highSubstructure2 = highSubstructures2[substructurenum]
            trueLow1 = lowSubstructures1[substructurenum]
            trueLow2 = lowSubstructures2[substructurenum]

            #joint MDS
            structure_contactMat1 = dt.matFromBed(
                path1,
                highSubstructure1)  #contact matrix for this structure only
            structure_contactMat2 = dt.matFromBed(
                path2,
                highSubstructure2)  #contact matrix for this structure only

            # NOTE: the literal 2.5 is used here in place of the alpha argument
            infer_structures(structure_contactMat1, highSubstructure1,
                             structure_contactMat2, highSubstructure2, 2.5,
                             penalty, num_threads, weight)

            transform(trueLow1, highSubstructure1, res_ratio)
            transform(trueLow2, highSubstructure2, res_ratio)

            # write the processed substructures back into the shared lists
            highSubstructures1[substructurenum] = highSubstructure1
            highSubstructures2[substructurenum] = highSubstructure2

            print("MDS performed on structure {} of {}".format(
                substructurenum + 1, numSubstructures))

    highstructure1.setstructures(highSubstructures1)
    highstructure2.setstructures(highSubstructures2)

    highstructure1.set_rel_indices()
    highstructure2.set_rel_indices()

    return highstructure1, highstructure2
Example #10
0
import sys
sys.path.append("..")
import data_tools as dt
import numpy as np
import compartment_analysis as ca
from sklearn import svm
import linear_algebra as la
from mayavi import mlab

path1 = "hic_data/GM12878_combined_21_100kb.bed"
path2 = "hic_data/K562_21_100kb.bed"

struct1 = dt.structure_from_file("GM12878_combined_21_100kb_structure.tsv")
struct2 = dt.structure_from_file("K562_21_100kb_structure.tsv")


def _compartments_with_enrichment(bed_path, struct, coverage_path):
    """Load contacts and coverage for one dataset and compute compartments.

    Returns ``(contacts, enrichments, bin_nums, comps)`` where
    ``enrichments`` is the coverage column restricted to ``bin_nums``.
    """
    contacts = dt.matFromBed(bed_path, struct)
    coverage = np.loadtxt(coverage_path, usecols=6)
    # shift the structure's indices by the bin number of chrom.minPos
    bin_nums = struct.nonzero_abs_indices() + int(
        struct.chrom.minPos / struct.chrom.res)
    enrichments = coverage[bin_nums]
    comps = np.array(ca.get_compartments(contacts, struct, enrichments))
    return contacts, enrichments, bin_nums, comps


contacts1, enrichments1, bin_nums1, comps1 = _compartments_with_enrichment(
    path1, struct1, "binding_data/GM12878_21_100kb_active_coverage.bed")

contacts2, enrichments2, bin_nums2, comps2 = _compartments_with_enrichment(
    path2, struct2, "binding_data/K562_21_100kb_active_coverage.bed")
Example #11
0
import sys
sys.path.append("..")
import data_tools as dt
from matplotlib import pyplot as plt
import numpy as np

# Plot the average non-zero contact frequency as a function of separation.
mat = dt.matFromBed(sys.argv[1])

n = len(mat)

# index s - 1 holds the statistics for separation s (s = 1 .. n - 1)
tots = np.zeros(n - 1)
counts = np.zeros_like(tots)

dense = np.asarray(mat)
for s in range(1, n):
    # offset -s selects the entries mat[i, j] with i - j == s
    diagonal = np.diagonal(dense, -s)
    nonzero = diagonal[diagonal != 0]
    tots[s - 1] = nonzero.sum()
    counts[s - 1] = len(nonzero)

# separations with no non-zero contact keep an average of 0
avgs = np.divide(tots, counts, out=np.zeros_like(tots), where=counts != 0)

# avgs[k] belongs to separation k + 1, so the x axis starts at 1
plt.plot(np.arange(1, n), avgs)
plt.xlabel("Separation (number of bins)")
plt.ylabel("Average contact frequency")
plt.show()
Example #12
0
import numpy as np
import sys
sys.path.append("..")
import data_tools as dt

# command line: <input BED path> <output matrix path>
in_path, out_path = sys.argv[1], sys.argv[2]

# load the contact matrix and dump it as tab-separated text
contactMat = dt.matFromBed(in_path)
np.savetxt(out_path, contactMat, delimiter="\t")
Example #13
0
import numpy as np

all_species = ("Mouse", "Human", "Yeast")
all_res_kb = (100, 100, 32)

# boxes[i] collects the PC1 explained-variance ratios for all_species[i]
boxes = [[] for _ in all_species]

for i, (species, res_kb) in enumerate(zip(all_species, all_res_kb)):
    # each line of <species>_list.txt is a dataset prefix; the with-statement
    # closes the file, so no explicit close() is needed
    with open("{}_list.txt".format(species)) as infile:
        for line in infile:
            prefix = line.strip()
            for chrom in range(1, 23):
                path = "hic_data/{}_{}_{}kb.bed".format(prefix, chrom, res_kb)

                # chromosomes without an input file are skipped
                if not os.path.isfile(path):
                    continue

                mat = dt.matFromBed(path)
                oe_mat = ca.oe(mat)
                cor_mat = ca.cor(oe_mat)
                pca = PCA(n_components=1)
                pca.fit(cor_mat)
                boxes[i].append(pca.explained_variance_ratio_[0])

#start with a frameless plot (extra room on the left)
plt.subplot2grid((10,10), (0,0), 9, 10, frameon=False)

#label axes
plt.ylabel("PC1 explained variance ratio", fontsize=10)

#define offsets
Example #14
0
import numpy as np
import sys
sys.path.append("..")
import data_tools as dt

# command line: <input BED path> <output path>
inpath = sys.argv[1]
outpath = sys.argv[2]

structure = dt.structureFromBed(inpath, None, None)
contactMat = dt.matFromBed(inpath, structure)
n = len(contactMat)
# two leading columns for the locus start/end, then the n contact columns
fullMat = np.zeros((n, n + 2))

#locus IDs
for i, pointNum in enumerate(structure.getPointNums()):
    fullMat[i, 0] = structure.chrom.minPos + structure.chrom.res * pointNum
    fullMat[i,
            1] = structure.chrom.minPos + structure.chrom.res * (pointNum + 1)

fullMat[:, 2:n + 2] = contactMat

# Field width = number of digits of the largest value. Counting the digits of
# the integer directly is correct at exact powers of ten (1000 has 4 digits,
# ceil(log10(1000)) is 3) and also works when the largest value is 0.
maxNumDigits = len(str(int(np.amax(fullMat))))
formatstring = "%" + str(maxNumDigits) + "d"
np.savetxt(outpath, fullMat, formatstring, delimiter="\t")
Example #15
0
import numpy as np
import sys
sys.path.append("..")
import data_tools as dt

# command line: <input BED path> <output matrix path>
in_path, out_path = sys.argv[1], sys.argv[2]

# the contact matrix is built against the cluster read from the same file
cluster = dt.clusterFromBed(in_path, None, None)
contactMat = dt.matFromBed(in_path, cluster)
np.savetxt(out_path, contactMat, delimiter="\t")
Example #16
0
    # each line of <comparison>_design.txt holds two whitespace-separated
    # dataset prefixes
    with open("{}_design.txt".format(comparison)) as infile:
        for line in infile:
            prefix1, prefix2 = line.strip().split()
            for chrom in range(1, 23):
                path1 = "hic_data/{}_{}_100kb.bed".format(prefix1, chrom)
                path2 = "hic_data/{}_{}_100kb.bed".format(prefix2, chrom)

                # skip chromosomes for which either input file is missing
                if os.path.isfile(path1) and os.path.isfile(path2):

                    #load structures
                    structure1 = dt.structureFromBed(path1)
                    structure2 = dt.structureFromBed(path2)

                    dt.make_compatible((structure1, structure2))

                    mat1 = dt.matFromBed(path1, structure1)
                    mat2 = dt.matFromBed(path2, structure2)

                    comps1 = ca.get_compartments(mat1, structure1)
                    comps2 = ca.get_compartments(mat2, structure2)

                    r, p = st.pearsonr(comps1, comps2)

                    # only the magnitude of the correlation is recorded
                    boxes[i].append(np.abs(r))

        # redundant: the enclosing with-statement already closes infile
        infile.close()

plt.subplot2grid((10, 10), (0, 0), 9, 10, frameon=False)

#label axes
plt.ylabel("Compartment correlation", fontsize=10)
Example #17
0
from matplotlib import pyplot as plt
import numpy as np
import sys
sys.path.append("..")
import data_tools as dt
import array_tools as at
import misc

bedpath = "hic_data/GM12878_combined_22_100kb.bed"

# For each coordinate file: build the contact-derived distance matrix against
# that cluster, zero its diagonal, and correlate it with the distance matrix
# computed from the cluster itself.
mmds_cluster = dt.clusterFromFile(
    "hic_data/GM12878_combined_22_100kb_mmds_coords.tsv")
contactMat = dt.matFromBed(bedpath, mmds_cluster)
mmds_true_mat = at.contactToDist(contactMat)
at.makeSymmetric(mmds_true_mat)
for j in range(len(mmds_true_mat)):  #remove diagonal
    mmds_true_mat[j, j] = 0
mmds_distMat = misc.distMat(mmds_cluster)
mmds_r = misc.pearson(mmds_true_mat, mmds_distMat)

# same steps for the cmds coordinates
cmds_cluster = dt.clusterFromFile(
    "hic_data/GM12878_combined_22_100kb_cmds_coords.tsv")
contactMat = dt.matFromBed(bedpath, cmds_cluster)
cmds_true_mat = at.contactToDist(contactMat)
at.makeSymmetric(cmds_true_mat)
for j in range(len(cmds_true_mat)):  #remove diagonal
    cmds_true_mat[j, j] = 0
cmds_distMat = misc.distMat(cmds_cluster)
cmds_r = misc.pearson(cmds_true_mat, cmds_distMat)

minimds_cluster = dt.clusterFromFile(
Example #18
0
def partitionedMDS(path, lowpath, args):
    """Partitions cluster into subclusters and performs MDS

    A low-resolution cluster is inferred as a whole; domains found on it
    define subclusters of the high-resolution cluster. Each high-resolution
    subcluster is inferred separately (in parallel) and then transformed
    using the transformation recovered between its low-resolution
    approximation and the corresponding low-resolution subcluster.

    Args:
        path: path of the high-resolution BED file.
        lowpath: path of the low-resolution BED file.
        args: indexable sequence read positionally as
            [0] domainSmoothingParameter and [1] minSizeFraction (both
            passed to ``tad.getDomains``), [2] maxmemory (not referenced
            again in this function) and [3] num_threads.

    Returns:
        The high-resolution cluster with its subclusters set.
    """
    domainSmoothingParameter = args[0]
    minSizeFraction = args[1]
    maxmemory = args[2]
    num_threads = args[3]

    #create low-res cluster
    lowCluster = dt.clusterFromBed(lowpath, None, None)

    #get TADs
    low_contactMat = dt.matFromBed(lowpath, lowCluster)
    lowTads = tad.getDomains(low_contactMat, lowCluster,
                             domainSmoothingParameter,
                             minSizeFraction)  #low subclusters

    #create high-res chrom
    size, res = dt.basicParamsFromBed(path)
    highChrom = dt.ChromParameters(lowCluster.chrom.minPos,
                                   lowCluster.chrom.maxPos, res,
                                   lowCluster.chrom.name, size)

    #create high-res cluster
    # NOTE(review): "/" is integer division on Python 2 but true division on
    # Python 3 -- confirm which is intended before running on Python 3
    resRatio = lowCluster.chrom.res / highChrom.res
    highTads = lowTads * resRatio
    highCluster = dt.clusterFromBed(path, highChrom, highTads)

    #create compatible subclusters
    tad.subclustersFromTads(highCluster, lowCluster, lowTads)

    infer_cluster(low_contactMat, lowCluster, False)
    # call form of print is valid on both Python 2 and Python 3
    print("Low-resolution MDS complete")

    # pymp shared lists so results written inside the parallel section are
    # visible after it ends
    highSubclusters = pymp.shared.list(highCluster.clusters)
    lowSubclusters = pymp.shared.list(lowCluster.clusters)

    numSubclusters = len(highCluster.clusters)
    num_threads = min(
        (num_threads, mp.cpu_count(), numSubclusters)
    )  #don't exceed number of requested threads, available threads, or clusters
    with pymp.Parallel(num_threads) as p:
        for subclusternum in p.range(numSubclusters):
            highSubcluster = highSubclusters[subclusternum]
            trueLow = lowSubclusters[subclusternum]

            #perform MDS individually
            cluster_contactMat = dt.matFromBed(
                path, highSubcluster)  #contact matrix for this cluster only
            infer_cluster(cluster_contactMat, highSubcluster, False)

            #approximate as low resolution
            inferredLow = dt.highToLow(highSubcluster, resRatio)

            #recover the transformation for inferred from true low cluster
            r, t, reflect = la.getTransformation(inferredLow, trueLow)
            t *= resRatio**(2. / 3)  #rescale

            #transform high cluster
            highSubcluster.transform(r, t, reflect)
            highSubclusters[subclusternum] = highSubcluster

            print("MDS performed on cluster {} of {}".format(
                subclusternum + 1, numSubclusters))

    highCluster.setClusters(highSubclusters)

    return highCluster
Example #19
0
sys.path.append("/home/lur159/git/miniMDS")
import data_tools as dt
import numpy as np
import tools

# command line: <input BED path> <resolution> <output path>
path = sys.argv[1]
res = int(sys.argv[2])
outpath = sys.argv[3]

# override the chrom resolution and snap its bounds to multiples of res
chrom = dt.chromFromBed(path)
chrom.res = res
chrom.minPos = int(np.floor(float(chrom.minPos) / res)) * res  #round
chrom.maxPos = int(np.ceil(float(chrom.maxPos) / res)) * res

struct = dt.structureFromBed(path, chrom)
mat = dt.matFromBed(path, struct)

points = struct.getPoints()

# write every non-zero entry below the diagonal as one record:
# name, start1, end1, name, start2, end2, value
# NOTE(review): no line terminator is written in the visible code --
# confirm whether a newline write follows
with open(outpath, "w") as out:
    for i in range(len(mat)):
        abs_index1 = points[i].absolute_index
        for j in range(i):
            if mat[i, j] != 0:
                abs_index2 = points[j].absolute_index
                out.write("\t".join(
                    (chrom.name, str(chrom.getGenCoord(abs_index1)),
                     str(chrom.getGenCoord(abs_index1) + res), chrom.name,
                     str(chrom.getGenCoord(abs_index2)),
                     str(chrom.getGenCoord(abs_index2) + res), str(mat[i,
                                                                       j]))))
Example #20
0
import data_tools as dt
import array_tools as at
import os
import numpy as np

# command line: <resolution in bp>
res = int(sys.argv[1])
# floor division keeps res_kb an integer on Python 3 as well, so the file
# name below is formatted as e.g. "100kb" rather than "100.0kb"
res_kb = res // 1000

# the output file is opened in append mode further down, so remove any
# file left over from a previous run (os.remove avoids spawning a shell)
if os.path.isfile("A_compartment_{}kb.bed".format(res_kb)):
    os.remove("A_compartment_{}kb.bed".format(res_kb))

for chrom in (1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
              20, 21, 22):
    path = "hic_data/GM12878_combined_{}_100kb.bed".format(chrom)
    structure = dt.structureFromBed(path)
    contacts = dt.matFromBed(path, structure)
    at.makeSymmetric(contacts)
    enrichments = np.array(np.loadtxt(
        "binding_data/Gm12878_{}_100kb_active_coverage.bed".format(chrom),
        dtype=object)[:, 6],
                           dtype=float)
    bin_nums = structure.nonzero_abs_indices(
    ) + structure.chrom.minPos / structure.chrom.res
    enrichments = enrichments[bin_nums]
    compartments = np.array(ca.get_compartments(contacts, enrichments))
    gen_coords = np.array(structure.getGenCoords())
    a_gen_coords = gen_coords[np.where(compartments > 0)]
    with open("A_compartment_{}kb.bed".format(res_kb), "a") as out:
        for a_gen_coord in a_gen_coords:
            for i in range(100 / res_kb):
                out.write("\t".join(
Example #21
0
from matplotlib import pyplot as plt
import numpy as np
import sys
sys.path.append("..")
import data_tools as dt
import array_tools as at
import misc

# reference distance matrix derived from the contact data
cluster = dt.clusterFromBed(bedpath, None, None)
contactMat = dt.matFromBed(bedpath, cluster)
distMat = at.contactToDist(contactMat)
at.makeSymmetric(distMat)
for j in range(len(distMat)):
    distMat[j, j] = 0  # clear the diagonal


def _cluster_dists_and_r(coords_path):
    """Return the distance matrix of a coordinate file and its correlation
    with the reference matrix."""
    dists = dt.clusterFromFile(coords_path).distMat()
    return dists, misc.pearson(distMat, dists)


chromthreed_distMat = misc.distsFromCoords(
    "Chromosome3D/output/chr22_100kb/chr22_100kb_coords.tsv")
chromthreed_r = misc.pearson(distMat, chromthreed_distMat)

mmds_distMat, mmds_r = _cluster_dists_and_r(
    "hic_data/GM12878_combined_22_10kb_mmds_coords.tsv")

cmds_distMat, cmds_r = _cluster_dists_and_r(
    "hic_data/GM12878_combined_22_10kb_cmds_coords.tsv")

minimds_distMat, minimds_r = _cluster_dists_and_r(
    "hic_data/GM12878_combined_22_10kb_minimds_coords.tsv")
Example #22
0
chroms = (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
          20, 21, 22, "X")
n = len(chroms)

# one correlation value per chromosome for each method
mmds_rs = np.zeros(n)
cmds_rs = np.zeros(n)
minimds_rs = np.zeros(n)
mogen_rs = np.zeros(n)

for i, chrom in enumerate(chroms):
    bedpath = "hic_data/GM12878_combined_{}_10kb.bed".format(chrom)

    # contact-derived distances (diagonal zeroed) vs. distances computed
    # from the mmds structure
    mmds_structure = dt.structure_from_file(
        "hic_data/GM12878_combined_{}_10kb_mmds_coords.tsv".format(chrom))
    contactMat = dt.matFromBed(bedpath, mmds_structure)
    mmds_true_mat = at.contactToDist(contactMat)
    at.makeSymmetric(mmds_true_mat)
    for j in range(len(mmds_true_mat)):  #remove diagonal
        mmds_true_mat[j, j] = 0
    mmds_distMat = misc.distMat(mmds_structure)
    mmds_rs[i] = misc.pearson(mmds_true_mat, mmds_distMat)

    # same steps for the cmds structure
    cmds_structure = dt.structure_from_file(
        "hic_data/GM12878_combined_{}_10kb_cmds_coords.tsv".format(chrom))
    contactMat = dt.matFromBed(bedpath, cmds_structure)
    cmds_true_mat = at.contactToDist(contactMat)
    at.makeSymmetric(cmds_true_mat)
    for j in range(len(cmds_true_mat)):  #remove diagonal
        cmds_true_mat[j, j] = 0
    cmds_distMat = misc.distMat(cmds_structure)
Example #23
0
def partitionedMDS(path, args):
    """Partitions structure into substructures and performs MDS

    A low-resolution structure is built from the same file by multiplying
    the chrom resolution by ``res_ratio``; domains found on it define the
    substructures. The low-resolution structure is inferred as a whole, then
    every non-empty high-resolution substructure is inferred separately (in
    parallel), rescaled by the ratio of radii of gyration, and transformed
    using the transformation recovered between its low-resolution
    approximation and the matching low-resolution substructure.

    Args:
        path: path of the input BED file.
        args: indexable sequence read positionally as
            [0] domainSmoothingParameter and [1] minSizeFraction (both
            passed to ``tad.getDomains``), [2] maxmemory (not referenced
            again in this function), [3] num_threads, [4] alpha (used for
            the low-resolution inference), [5] res_ratio and [6] alpha2
            (used for the high-resolution substructures).

    Returns:
        The high-resolution structure with its substructures set.
    """
    domainSmoothingParameter = args[0]
    minSizeFraction = args[1]
    maxmemory = args[2]
    num_threads = args[3]
    alpha = args[4]
    res_ratio = args[5]
    alpha2 = args[6]

    #create low-res structure
    low_chrom = dt.chromFromBed(path)
    low_chrom.res *= res_ratio
    lowstructure = dt.structureFromBed(path, low_chrom)  #low global structure

    #get TADs
    low_contactMat = dt.matFromBed(path, lowstructure)
    low_tad_indices = tad.getDomains(
        low_contactMat, lowstructure, domainSmoothingParameter, minSizeFraction
    )  #low substructures, defined on relative indices not absolute indices
    tad.substructuresFromTads(lowstructure, low_tad_indices)

    #create high-res chrom
    size, res = dt.basicParamsFromBed(path)
    highChrom = dt.ChromParameters(lowstructure.chrom.minPos,
                                   lowstructure.chrom.maxPos, res,
                                   lowstructure.chrom.name, size)

    highstructure = dt.Structure([], [], highChrom, 0)
    high_substructures = []

    low_gen_coords = lowstructure.getGenCoords()
    offset = 0  #initialize
    for td in low_tad_indices:
        # genomic coordinates bounding this domain in the low-res structure
        start_gen_coord = low_gen_coords[td[0]]
        end_gen_coord = low_gen_coords[td[1]]
        high_substructure = dt.structureFromBed(path, highChrom,
                                                start_gen_coord, end_gen_coord,
                                                offset)
        high_substructures.append(high_substructure)
        # advance by one less than the point count
        offset += len(high_substructure.points)  #update
        offset -= 1

    highstructure.setstructures(high_substructures)

    infer_structure(low_contactMat, lowstructure, alpha, num_threads)
    # call form of print is valid on both Python 2 and Python 3
    print("Low-resolution MDS complete")

    # pymp shared lists so results written inside the parallel section are
    # visible after it ends
    highSubstructures = pymp.shared.list(highstructure.structures)
    lowSubstructures = pymp.shared.list(lowstructure.structures)

    numSubstructures = len(highstructure.structures)
    num_threads = min(
        (num_threads, mp.cpu_count(), numSubstructures)
    )  #don't exceed number of requested threads, available threads, or structures
    with pymp.Parallel(num_threads) as p:
        for substructurenum in p.range(numSubstructures):
            highSubstructure = highSubstructures[substructurenum]
            if len(highSubstructure.getPoints()) > 0:  #skip empty
                trueLow = lowSubstructures[substructurenum]

                #perform MDS individually
                structure_contactMat = dt.matFromBed(
                    path,
                    highSubstructure)  #contact matrix for this structure only
                infer_structure(structure_contactMat, highSubstructure, alpha2,
                                num_threads)

                #approximate as low resolution
                inferredLow = dt.highToLow(highSubstructure, res_ratio)

                #rescale
                scaling_factor = la.radius_of_gyration(
                    trueLow) / la.radius_of_gyration(inferredLow)
                for i, point in enumerate(inferredLow.points):
                    # entries equal to 0 are placeholders and are skipped
                    if point != 0:
                        x, y, z = point.pos
                        inferredLow.points[i].pos = (x * scaling_factor,
                                                     y * scaling_factor,
                                                     z * scaling_factor)

                #recover the transformation for inferred from true low structure
                r, t = la.getTransformation(inferredLow, trueLow)
                t /= scaling_factor

                #transform high structure
                highSubstructure.transform(r, t)
                highSubstructures[substructurenum] = highSubstructure

                print("MDS performed on structure {} of {}".format(
                    substructurenum + 1, numSubstructures))

    highstructure.setstructures(highSubstructures)

    return highstructure
Example #24
0
def fullMDS(path, classical, alpha):
    """Infer a single cluster from the whole BED file (no partitioning).

    Reads the cluster and its contact matrix from ``path``, runs
    ``infer_cluster`` with ``alpha`` and ``classical`` and returns the
    cluster.
    """
    whole = dt.clusterFromBed(path, None, None)
    contacts = dt.matFromBed(path, whole)
    infer_cluster(contacts, whole, alpha, classical)
    return whole