def full_mds(path1, path2, alpha=4, penalty=0.05, num_threads=3, weight=0.05, prefix=""):
    """Run joint MDS on two BED files without partitioning and write the results.

    Both structures are loaded, made compatible with each other, and passed
    together with their contact matrices to ``infer_structures``.

    Side effects:
      * writes ``<prefix><name1>_structure.tsv`` and
        ``<prefix><name2>_structure.tsv`` through ``Structure.write``, where
        ``<nameN>`` is the basename of ``pathN`` without its extension;
      * writes ``<prefix><name1>_<name2>_relocalization.bed`` with one
        tab-separated row per locus: chromosome name, start coordinate,
        start + resolution, and the distance reported by
        ``la.calculate_distances`` for that locus;
      * prints the value returned by ``calculate_compartment_fraction``.

    Args:
        path1, path2: paths of the two input BED files.
        alpha, penalty, num_threads, weight: forwarded unchanged to
            ``infer_structures``.
        prefix: string prepended to every output file name.

    Returns:
        The tuple ``(structure1, structure2)``.
    """
    structure1 = dt.structureFromBed(path1)
    structure2 = dt.structureFromBed(path2)
    dt.make_compatible((structure1, structure2))
    contactMat1 = dt.matFromBed(path1, structure1)
    contactMat2 = dt.matFromBed(path2, structure2)
    infer_structures(contactMat1, structure1, contactMat2, structure2, alpha,
                     penalty, num_threads, weight)

    prefix1 = os.path.splitext(os.path.basename(path1))[0]
    structure1.write("{}{}_structure.tsv".format(prefix, prefix1))
    prefix2 = os.path.splitext(os.path.basename(path2))[0]
    structure2.write("{}{}_structure.tsv".format(prefix, prefix2))

    dists = la.calculate_distances(structure1, structure2)
    # The with-statement closes the file on exit; no explicit close() needed.
    with open("{}{}_{}_relocalization.bed".format(prefix, prefix1, prefix2), "w") as out:
        for gen_coord, dist in zip(structure1.getGenCoords(), dists):
            out.write("\t".join(
                (structure1.chrom.name, str(gen_coord),
                 str(gen_coord + structure1.chrom.res), str(dist))))
            out.write("\n")

    print("Fractional compartment change: ")
    print(calculate_compartment_fraction(structure1, structure2, path1, path2))

    return structure1, structure2
def fullMDS(path1, path2, alpha, penalty, num_threads, weight):
    """Jointly infer two structures with MDS, without partitioning.

    Loads a structure from each BED file, makes the pair compatible, builds
    one contact matrix per structure and hands everything to
    ``infer_structures``. Returns the two structures as a tuple.
    """
    first, second = (dt.structureFromBed(path1), dt.structureFromBed(path2))
    dt.make_compatible((first, second))

    # Contact matrices are built only after the structures are compatible.
    first_contacts = dt.matFromBed(path1, first)
    second_contacts = dt.matFromBed(path2, second)

    infer_structures(
        first_contacts, first,
        second_contacts, second,
        alpha, penalty, num_threads, weight,
    )
    return first, second
def fullMDS(path, classical, alpha, num_threads, weight):
    """Infer a single structure with MDS, without partitioning.

    Loads the structure and its contact matrix from the BED file at ``path``
    and passes them to ``infer_structure`` with the remaining arguments
    (note that ``classical`` is passed last). Returns the structure.
    """
    inferred = dt.structureFromBed(path)
    infer_structure(
        dt.matFromBed(path, inferred),
        inferred,
        alpha,
        num_threads,
        weight,
        classical,
    )
    return inferred
def create_low_res_structure(path, res_ratio):
    """Load the BED file at ``path`` as a structure at a coarser resolution.

    The chromosome's resolution is multiplied by ``res_ratio``; its minimum
    position is rounded down and its maximum position rounded up to a
    multiple of the new resolution before the structure is built.
    """
    coarse_chrom = dt.chromFromBed(path)
    coarse_chrom.res *= res_ratio

    # Snap the lower bound down to a multiple of the coarse bin size.
    first_bin = int(np.floor(float(coarse_chrom.minPos) / coarse_chrom.res))
    coarse_chrom.minPos = first_bin * coarse_chrom.res

    # Snap the upper bound up to a multiple of the coarse bin size.
    last_bin = int(np.ceil(float(coarse_chrom.maxPos) / coarse_chrom.res))
    coarse_chrom.maxPos = last_bin * coarse_chrom.res

    return dt.structureFromBed(path, coarse_chrom)
def interMDS(names, prefix, inter_res, intra_res, full, args):
    """Infer high-resolution structures for several chromosomes in one frame.

    For each name in ``names`` the file ``<prefix>_<name>_<intra res>.bed``
    is read twice: once at the coarse ``inter_res`` resolution (used for a
    joint low-resolution inference over all chromosomes) and once at its
    own resolution (full or partitioned MDS, chosen by ``full``). Each
    high-resolution structure is then rotated and translated onto its
    low-resolution counterpart.

    ``args[3]`` and ``args[4]`` are forwarded to ``infer_structures`` and to
    ``mm.fullMDS``; the whole ``args`` sequence goes to
    ``mm.partitionedMDS``.

    Returns the list of transformed high-resolution structures.
    """
    inter_label = tools.get_res_string(inter_res)
    intra_label = tools.get_res_string(intra_res)
    bed_paths = ["{}_{}_{}.bed".format(prefix, name, intra_label) for name in names]

    # Low-resolution structures, read from the intra-chromosomal files.
    low_structures = []
    for bed_path in bed_paths:
        chrom = dt.chromFromBed(bed_path)
        chrom.res = inter_res  # reduce resolution
        # Round the bounds outward to multiples of the new resolution.
        chrom.minPos = int(np.floor(float(chrom.minPos) / chrom.res)) * chrom.res
        chrom.maxPos = int(np.ceil(float(chrom.maxPos) / chrom.res)) * chrom.res
        low_structures.append(dt.structureFromBed(bed_path, chrom))

    # offsets[k] = number of points in all structures before structure k,
    # so each chromosome can be indexed inside the combined matrix.
    num_chroms = len(names)
    offsets = np.zeros(num_chroms, dtype=int)
    for idx in range(1, num_chroms):
        preceding_points = len(low_structures[idx - 1].getPoints())
        offsets[idx] = offsets[idx - 1] + preceding_points

    inter_mat = get_inter_mat(prefix, inter_label, intra_label, low_structures, offsets)

    # MDS at low resolution on all chromosomes together.
    infer_structures(inter_mat, low_structures, offsets, args[3], args[4])

    # MDS at high resolution on each chromosome separately.
    high_structures = []
    inferred_low_structures = []  # collected but not used further here
    translations = []
    for true_low, bed_path in zip(low_structures, bed_paths):
        if full:
            high_structure = mm.fullMDS(bed_path, False, args[4], args[3])
        else:
            high_structure = mm.partitionedMDS(bed_path, args)
        high_structures.append(high_structure)

        res_ratio = true_low.chrom.res / high_structure.chrom.res
        inferred_low = dt.highToLow(high_structure, res_ratio)
        inferred_low_structures.append(inferred_low)

        # Rescale the down-sampled structure to the radius of gyration of
        # the jointly inferred low-resolution structure.
        true_rg = la.radius_of_gyration(true_low)
        inferred_rg = la.radius_of_gyration(inferred_low)
        rescaling_factor = true_rg / inferred_rg
        rescaled_coords = [rescaling_factor * coord for coord in inferred_low.getCoords()]
        for idx, point in enumerate(inferred_low.getPoints()):
            point.pos = rescaled_coords[idx]

        rotation, translation = la.getTransformation(inferred_low, true_low)
        # Rotate only; the translation is applied later once it is rescaled.
        high_structure.transform(rotation, None)
        translations.append(translation)

    # Translate, scaling each translation by the mean high/low size ratio.
    low_rgs = np.array([la.radius_of_gyration(s) for s in low_structures])
    high_rgs = np.array([la.radius_of_gyration(s) for s in high_structures])
    scaling_factor = np.mean(high_rgs / low_rgs)
    for high_structure, translation in zip(high_structures, translations):
        high_structure.transform(None, scaling_factor * translation)

    return high_structures
def partitionedMDS(path, args):
    """Partition a structure into substructures and perform MDS on each.

    A low-resolution version of the structure is inferred first. Domains
    found on the low-resolution contact matrix define the substructures;
    each one is then inferred at the file's own resolution (in parallel via
    ``pymp``) and rotated/translated onto the matching low-resolution
    substructure.

    Args:
        path: path of the input BED file.
        args: indexable sequence read positionally:
            [0] domain smoothing parameter (passed to ``tad.getDomains``),
            [1] minimum size fraction (passed to ``tad.getDomains``),
            [2] not used by this function,
            [3] requested number of threads,
            [4] alpha for the low-resolution inference,
            [5] resolution ratio between low and high resolution,
            [6] alpha for the per-substructure high-resolution inference.

    Returns:
        The high-resolution structure holding the transformed substructures.
    """
    domainSmoothingParameter = args[0]
    minSizeFraction = args[1]
    num_threads = args[3]
    alpha = args[4]
    res_ratio = args[5]
    alpha2 = args[6]

    # create low-res structure
    low_chrom = dt.chromFromBed(path)
    low_chrom.res *= res_ratio
    lowstructure = dt.structureFromBed(path, low_chrom)  # low global structure

    # get TADs
    low_contactMat = dt.matFromBed(path, lowstructure)
    # low substructures, defined on relative indices not absolute indices
    low_tad_indices = tad.getDomains(
        low_contactMat, lowstructure, domainSmoothingParameter, minSizeFraction)
    tad.substructuresFromTads(lowstructure, low_tad_indices)

    # create high-res chrom
    size, res = dt.basicParamsFromBed(path)
    highChrom = dt.ChromParameters(lowstructure.chrom.minPos,
                                   lowstructure.chrom.maxPos, res,
                                   lowstructure.chrom.name, size)

    highstructure = dt.Structure([], [], highChrom, 0)
    high_substructures = []

    low_gen_coords = lowstructure.getGenCoords()
    offset = 0
    for td in low_tad_indices:
        start_gen_coord = low_gen_coords[td[0]]
        end_gen_coord = low_gen_coords[td[1]]
        high_substructure = dt.structureFromBed(
            path, highChrom, start_gen_coord, end_gen_coord, offset)
        high_substructures.append(high_substructure)
        # Advance by one less than the number of points in this substructure.
        offset += len(high_substructure.points) - 1

    highstructure.setstructures(high_substructures)

    infer_structure(low_contactMat, lowstructure, alpha, num_threads)
    # Single-argument call form prints identically on Python 2 and Python 3.
    print("Low-resolution MDS complete")

    highSubstructures = pymp.shared.list(highstructure.structures)
    lowSubstructures = pymp.shared.list(lowstructure.structures)

    numSubstructures = len(highstructure.structures)
    # don't exceed number of requested threads, available threads, or structures
    num_threads = min((num_threads, mp.cpu_count(), numSubstructures))
    with pymp.Parallel(num_threads) as p:
        for substructurenum in p.range(numSubstructures):
            highSubstructure = highSubstructures[substructurenum]
            if len(highSubstructure.getPoints()) > 0:  # skip empty
                trueLow = lowSubstructures[substructurenum]

                # perform MDS individually
                # contact matrix for this structure only
                structure_contactMat = dt.matFromBed(path, highSubstructure)
                infer_structure(structure_contactMat, highSubstructure,
                                alpha2, num_threads)

                # approximate as low resolution
                inferredLow = dt.highToLow(highSubstructure, res_ratio)

                # rescale
                scaling_factor = (la.radius_of_gyration(trueLow) /
                                  la.radius_of_gyration(inferredLow))
                for i, point in enumerate(inferredLow.points):
                    if point != 0:  # entries equal to 0 are skipped
                        x, y, z = point.pos
                        inferredLow.points[i].pos = (x * scaling_factor,
                                                     y * scaling_factor,
                                                     z * scaling_factor)

                # recover the transformation for inferred from true low structure
                r, t = la.getTransformation(inferredLow, trueLow)
                # The translation was computed on rescaled coordinates; undo
                # the scaling before applying it to the high-res structure.
                t /= scaling_factor

                # transform high structure
                highSubstructure.transform(r, t)
                # Write back so the shared list holds the transformed structure.
                highSubstructures[substructurenum] = highSubstructure

                print("MDS performed on structure {} of {}".format(
                    substructurenum + 1, numSubstructures))

    highstructure.setstructures(highSubstructures)

    return highstructure
import data_tools as dt import numpy as np import sys import linear_algebra as la from sklearn.manifold import MDS chrom = sys.argv[1] res_kb = 100 prefix1 = "GM12878_combined" prefix2 = "K562" path1 = "hic_data/{}_{}_{}kb.bed".format(prefix1, chrom, res_kb) path2 = "hic_data/{}_{}_{}kb.bed".format(prefix2, chrom, res_kb) structure1 = dt.structureFromBed(path1, None, None) structure2 = dt.structureFromBed(path2, None, None) #make structures compatible dt.make_compatible((structure1, structure2)) #get distance matrices dists1 = dt.normalized_dist_mat(path1, structure1) dists2 = dt.normalized_dist_mat(path2, structure2) #MDS coords1 = MDS(n_components=3, random_state=np.random.RandomState(), dissimilarity="precomputed", n_jobs=-1).fit_transform(dists1) coords2 = MDS(n_components=3, random_state=np.random.RandomState(),
# Print the number of points shared by the GM12878 and K562 structures of one
# chromosome (given as the first command-line argument) at 100 kb resolution.
import sys
sys.path.append("..")  # data_tools is imported from the parent directory
import data_tools as dt

res_kb = 100
chrom = sys.argv[1]
cell_type1 = "GM12878_combined"
cell_type2 = "K562"

path1 = "hic_data/{}_{}_{}kb.bed".format(cell_type1, chrom, res_kb)
path2 = "hic_data/{}_{}_{}kb.bed".format(cell_type2, chrom, res_kb)

structure1 = dt.structureFromBed(path1)
structure2 = dt.structureFromBed(path2)
dt.make_compatible((structure1, structure2))

# Single-argument call form prints identically on Python 2 and Python 3
# (the original print statement is a SyntaxError on Python 3).
print("size\t" + str(len(structure1.getPoints())))
# Convert a BED contact file (argv[1]) into a tab-separated dense matrix
# (argv[2]). Each output row holds the start and end coordinate of a locus
# followed by that locus's row of the contact matrix.
import numpy as np
import sys
sys.path.append("..")  # data_tools is imported from the parent directory
import data_tools as dt

inpath, outpath = sys.argv[1], sys.argv[2]

structure = dt.structureFromBed(inpath, None, None)
contactMat = dt.matFromBed(inpath, structure)

n = len(contactMat)
fullMat = np.zeros((n, n + 2))

# Columns 0 and 1: genomic start and end of each locus.
for i, pointNum in enumerate(structure.getPointNums()):
    locus_start = structure.chrom.minPos + structure.chrom.res * pointNum
    locus_end = structure.chrom.minPos + structure.chrom.res * (pointNum + 1)
    fullMat[i, 0] = locus_start
    fullMat[i, 1] = locus_end

# Remaining n columns: the contact matrix itself.
fullMat[:, 2:] = contactMat

# Integer field width derived from the largest value in the matrix.
maxNumDigits = int(np.ceil(np.log10(np.amax(fullMat))))
formatstring = "%{}d".format(maxNumDigits)
np.savetxt(outpath, fullMat, fmt=formatstring, delimiter="\t")
def partitioned_mds(path1, path2, prefix="", centromere=0, num_partitions=4, maxmemory=32000000, num_threads=3, alpha=4, res_ratio=10, penalty=0.05, weight=0.05):
    """Jointly infer two structures by partitioning them into substructures.

    Both inputs are first inferred together at low resolution
    (``res_ratio`` times coarser). The low-resolution index range is split
    into ``num_partitions`` pieces, half on each side of a midpoint: the
    middle index when ``centromere`` is 0, otherwise the absolute index of
    ``centromere``. Each piece is then inferred jointly at the files' own
    resolution (in parallel via ``pymp``) and mapped back onto the matching
    low-resolution substructure with ``transform``.

    ``num_partitions`` must be even (checked with ``assert``). ``prefix``
    and ``maxmemory`` are accepted but not used in this function.

    Returns the tuple ``(highstructure1, highstructure2)``.
    """
    # --- low-resolution structures ---
    low1 = create_low_res_structure(path1, res_ratio)
    low2 = create_low_res_structure(path2, res_ratio)
    dt.make_compatible((low1, low2))

    # --- partition boundaries (absolute indices, not relative indices) ---
    num_low_points = len(low1.getPoints())
    if centromere != 0:
        midpoint = low1.chrom.getAbsoluteIndex(centromere)
    else:
        midpoint = int(num_low_points / 2)

    assert num_partitions % 2 == 0
    per_side = num_partitions / 2

    left_size = int(np.ceil(float(midpoint) / per_side))
    right_size = int(np.ceil(float(num_low_points - midpoint) / per_side))

    # Left-hand partitions are clamped at the midpoint, right-hand ones at
    # the last index.
    left_parts = [
        (k * left_size, min((k + 1) * left_size, midpoint))
        for k in range(int(per_side))
    ]
    right_parts = [
        (midpoint + k * right_size,
         min(midpoint + (k + 1) * right_size, num_low_points - 1))
        for k in range(int(per_side))
    ]
    lowpartitions = np.array(left_parts + right_parts)

    low_contacts1 = dt.matFromBed(path1, low1)
    low_contacts2 = dt.matFromBed(path2, low2)

    tad.substructuresFromAbsoluteTads(low1, lowpartitions)
    tad.substructuresFromAbsoluteTads(low2, lowpartitions)

    # --- high-resolution chromosome parameters ---
    size1, res1 = dt.basicParamsFromBed(path1)
    high_chrom1 = dt.ChromParameters(low1.chrom.minPos, low1.chrom.maxPos,
                                     res1, low1.chrom.name, size1)
    size2, res2 = dt.basicParamsFromBed(path2)
    high_chrom2 = dt.ChromParameters(low2.chrom.minPos, low2.chrom.maxPos,
                                     res2, low2.chrom.name, size2)

    # --- high-resolution substructures, one pair per partition ---
    high_parts1 = []
    high_parts2 = []
    low_gen_coords = low1.getGenCoords()
    offset1 = 0
    offset2 = 0
    for first_idx, last_idx in lowpartitions:
        start_coord = low_gen_coords[first_idx]
        end_coord = low_gen_coords[last_idx]
        part1 = dt.structureFromBed(path1, high_chrom1, start_coord, end_coord, offset1)
        part2 = dt.structureFromBed(path2, high_chrom2, start_coord, end_coord, offset2)
        high_parts1.append(part1)
        high_parts2.append(part2)
        # Each offset advances by one less than the substructure's length.
        offset1 += (len(part1.points) - 1)
        offset2 += (len(part2.points) - 1)

    for part1, part2 in zip(high_parts1, high_parts2):
        dt.make_points_compatible((part1, part2))

    highstructure1 = dt.Structure([], high_parts1, high_chrom1, 0)
    highstructure2 = dt.Structure([], high_parts2, high_chrom2, 0)

    # --- joint low-resolution inference ---
    infer_structures(low_contacts1, low1, low_contacts2, low2, alpha,
                     penalty, num_threads, weight)
    print("Low-resolution MDS complete")

    # --- joint high-resolution inference, one partition per iteration ---
    shared_high1 = pymp.shared.list(highstructure1.structures)
    shared_high2 = pymp.shared.list(highstructure2.structures)
    shared_low1 = pymp.shared.list(low1.structures)
    shared_low2 = pymp.shared.list(low2.structures)

    numSubstructures = len(highstructure1.structures)
    # Never use more threads than requested, available, or needed.
    num_threads = min((num_threads, mp.cpu_count(), numSubstructures))
    with pymp.Parallel(num_threads) as p:
        for part_idx in p.range(numSubstructures):
            high_sub1 = shared_high1[part_idx]
            high_sub2 = shared_high2[part_idx]
            true_low1 = shared_low1[part_idx]
            true_low2 = shared_low2[part_idx]

            # Contact matrices restricted to this pair of substructures.
            sub_contacts1 = dt.matFromBed(path1, high_sub1)
            sub_contacts2 = dt.matFromBed(path2, high_sub2)

            # Joint MDS; alpha is fixed at 2.5 for the high-resolution step.
            infer_structures(sub_contacts1, high_sub1, sub_contacts2,
                             high_sub2, 2.5, penalty, num_threads, weight)

            transform(true_low1, high_sub1, res_ratio)
            transform(true_low2, high_sub2, res_ratio)

            # Write back so the shared lists hold the updated substructures.
            shared_high1[part_idx] = high_sub1
            shared_high2[part_idx] = high_sub2

            print("MDS performed on structure {} of {}".format(
                part_idx + 1, numSubstructures))

    highstructure1.setstructures(shared_high1)
    highstructure2.setstructures(shared_high2)

    highstructure1.set_rel_indices()
    highstructure2.set_rel_indices()

    return highstructure1, highstructure2
import compartment_analysis as ca import data_tools as dt import array_tools as at import os import numpy as np res = int(sys.argv[1]) res_kb = res / 1000 if os.path.isfile("A_compartment_{}kb.bed".format(res_kb)): os.system("rm A_compartment_{}kb.bed".format(res_kb)) for chrom in (1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22): path = "hic_data/GM12878_combined_{}_100kb.bed".format(chrom) structure = dt.structureFromBed(path) contacts = dt.matFromBed(path, structure) at.makeSymmetric(contacts) enrichments = np.array(np.loadtxt( "binding_data/Gm12878_{}_100kb_active_coverage.bed".format(chrom), dtype=object)[:, 6], dtype=float) bin_nums = structure.nonzero_abs_indices( ) + structure.chrom.minPos / structure.chrom.res enrichments = enrichments[bin_nums] compartments = np.array(ca.get_compartments(contacts, enrichments)) gen_coords = np.array(structure.getGenCoords()) a_gen_coords = gen_coords[np.where(compartments > 0)] with open("A_compartment_{}kb.bed".format(res_kb), "a") as out: for a_gen_coord in a_gen_coords: for i in range(100 / res_kb):
import sys sys.path.append("/home/lur159/git/miniMDS") import data_tools as dt import numpy as np import tools path = sys.argv[1] res = int(sys.argv[2]) outpath = sys.argv[3] chrom = dt.chromFromBed(path) chrom.res = res chrom.minPos = int(np.floor(float(chrom.minPos) / res)) * res #round chrom.maxPos = int(np.ceil(float(chrom.maxPos) / res)) * res struct = dt.structureFromBed(path, chrom) mat = dt.matFromBed(path, struct) points = struct.getPoints() with open(outpath, "w") as out: for i in range(len(mat)): abs_index1 = points[i].absolute_index for j in range(i): if mat[i, j] != 0: abs_index2 = points[j].absolute_index out.write("\t".join( (chrom.name, str(chrom.getGenCoord(abs_index1)), str(chrom.getGenCoord(abs_index1) + res), chrom.name, str(chrom.getGenCoord(abs_index2)), str(chrom.getGenCoord(abs_index2) + res), str(mat[i,
# Plot the compartment track of every BED file given on the command line,
# one line per file, labelled with the file's basename.
from matplotlib import pyplot as plt
import sys
sys.path.append("..")  # project modules are imported from the parent directory
import compartment_analysis as ca
import data_tools as dt
import os

paths = sys.argv[1:]
prefixes = list(map(os.path.basename, paths))

# Each stage is completed for all files before the next stage starts.
structs = []
for path in paths:
    structs.append(dt.structureFromBed(path))

mats = []
for path, struct in zip(paths, structs):
    mats.append(dt.matFromBed(path, struct))

all_comps = []
for mat in mats:
    all_comps.append(ca.get_compartments(mat))

all_gen_coords = []
for struct in structs:
    all_gen_coords.append(struct.getGenCoords())

# Disabled in the original script (negates the last compartment track):
#all_comps[len(all_comps)-1] = -all_comps[len(all_comps)-1]

for gen_coords, comps, prefix in zip(all_gen_coords, all_comps, prefixes):
    plt.plot(gen_coords, comps, label=prefix)

plt.legend()
plt.show()