from multimds import data_tools as dt
from scipy import stats as st
from matplotlib import pyplot as plt
import numpy as np
from multimds import compartment_analysis as ca
from multimds import linear_algebra as la
from scipy import signal as sg
from multimds import multimds as mm

# Contact files for the two cell types being compared.
path1 = "hic_data/GM12878_combined_19_100kb.bed"
path2 = "hic_data/K562_19_100kb.bed"

# full_mds takes both files and returns one structure per input.
struct1, struct2 = mm.full_mds(path1, path2, prefix="test_")

# Read each contact matrix from the same file its structure was built from,
# so the matrix and the structure always describe the same data.
mat1 = dt.matFromBed(path1, struct1)
comps1 = np.array(ca.get_compartments(mat1, struct1))
mat2 = dt.matFromBed(path2, struct2)
comps2 = np.array(ca.get_compartments(mat2, struct2))

# When the two compartment tracks are anti-correlated, negate the first one
# so that both share a sign convention before they are subtracted.
r, p = st.pearsonr(comps1, comps2)
if r < 0:
    comps1 = -comps1

# Per-point absolute compartment difference.
comp_diffs = np.abs(comps1 - comps2)

# Per-point distance between corresponding coordinates of the two structures.
dists = np.array([
    la.calcDistance(coord1, coord2)
    for coord1, coord2 in zip(struct1.getCoords(), struct2.getCoords())
])

# Indices of peaks in the distance profile, searched with wavelet widths 1..9.
dist_peaks = sg.find_peaks_cwt(dists, np.arange(1, 10))
if all(map(os.path.isfile, (path1, path2))):
    # Infer both structures, bring them to a common scale, then move
    # structure1 with the transformation computed between the two.
    structure1, structure2 = multimds.full_mds(path1, path2, penalty=penalty)
    structure1.rescale()
    structure2.rescale()
    r, t = la.getTransformation(structure1, structure2)
    structure1.transform(r, t)

    # Compartment scores for each data set.
    contacts1 = dt.matFromBed(path1, structure=structure1)
    contacts2 = dt.matFromBed(path2, structure=structure2)
    compartments1 = np.array(ca.get_compartments(contacts1, structure1))
    compartments2 = np.array(ca.get_compartments(contacts2, structure2))

    # Negate the second track when the two are anti-correlated.
    # Note that `r` is rebound here from the transformation to the
    # correlation coefficient.
    r, p = st.pearsonr(compartments1, compartments2)
    if r < 0:
        compartments2 = -compartments2

    # Linear SVR of compartment score on 3D coordinates, fitted on the
    # points of both structures stacked together.
    coords1 = structure1.getCoords()
    coords2 = structure2.getCoords()
    coords = np.concatenate((coords1, coords2))
    compartments = np.concatenate((compartments1, compartments2))
    clf = svm.LinearSVR()
    clf.fit(coords, compartments)
    coef = clf.coef_
from multimds import compartment_analysis as ca
import numpy as np
from sklearn import svm
from multimds import linear_algebra as la
from mayavi import mlab

# Load a previously saved structure and keep only part of it.
# NOTE(review): presumably 15000000 is a genomic position and the call below
# keeps points from new_start up to the third-from-last point; confirm the
# exact bounds against subsamplePoints.
struct = dt.structure_from_file(
    "hic_data/GM12878_combined_21_100kb_structure.tsv")
new_start = struct.chrom.getAbsoluteIndex(15000000)
struct.subsamplePoints(new_start, len(struct.points) - 3)

# Compartment scores for the retained points.
contacts = dt.matFromBed("hic_data/GM12878_combined_21_100kb.bed", struct)
compartments = np.array(ca.get_compartments(contacts, struct))

# Linear SVR of compartment score on 3D coordinates.
coords = struct.getCoords()
clf = svm.LinearSVR()
clf.fit(coords, compartments)
coef = clf.coef_

# Re-express the coordinates in the system derived from the SVR coefficients.
transformed_coords = np.array(la.change_coordinate_system(coef, coords))

# Extents of the first two transformed axes.
xs = transformed_coords[:, 0]
min_x, max_x = min(xs), max(xs)
x_range = max_x - min_x

ys = transformed_coords[:, 1]
min_y, max_y = min(ys), max(ys)
1) multimds_coeffs = np.zeros_like(chroms, dtype=float) unaligned_coeffs = np.zeros_like(multimds_coeffs) for i, chrom in enumerate(chroms): path1 = "hic_data/{}_{}_{}kb.bed".format(cell_type1, chrom, res_kb) path2 = "hic_data/{}_{}_{}kb.bed".format(cell_type2, chrom, res_kb) structure1, structure2 = mm.multimds(path1, path2, penalty=penalty) #compartments contacts1 = dt.matFromBed(path1, structure1) contacts2 = dt.matFromBed(path2, structure2) compartments1 = np.array(ca.get_compartments(contacts1, structure1)) compartments2 = np.array(ca.get_compartments(contacts2, structure2)) r, p = st.pearsonr(compartments1, compartments2) if r < 0: compartments2 = -compartments2 #SVR coords1 = structure1.getCoords() coords2 = structure2.getCoords() coords = np.concatenate((coords1, coords2)) compartments = np.concatenate((compartments1, compartments2)) clf = svm.LinearSVR() clf.fit(coords, compartments) multimds_coeffs[i] = clf.score(coords, compartments)
float(chrom2.minPos) / chrom2.res)) * chrom2.res #round chrom2.maxPos = int(np.ceil(float(chrom2.maxPos) / chrom2.res)) * chrom2.res low_struct1 = dt.structureFromBed(path1, chrom1) low_struct2 = dt.structureFromBed(path2, chrom2) dt.make_compatible((low_struct1, low_struct2)) contacts1 = dt.matFromBed(path1, low_struct1) contacts2 = dt.matFromBed(path2, low_struct2) enrichments = np.loadtxt("binding_data/{}_{}_100kb_active_coverage.bed".format( format_celltype(cell_type1), chrom), usecols=6) bin_nums = low_struct1.nonzero_bins_whole_chrom() enrichments = enrichments[bin_nums] compartments1 = np.array( ca.get_compartments(contacts1, low_struct1, enrichments)) enrichments = np.loadtxt("binding_data/{}_{}_100kb_active_coverage.bed".format( format_celltype(cell_type2), chrom), usecols=6) bin_nums = low_struct2.nonzero_bins_whole_chrom() enrichments = enrichments[bin_nums] compartments2 = np.array( ca.get_compartments(contacts2, low_struct2, enrichments)) gen_coords = structure1.getGenCoords() compartment_diffs = np.abs(compartments1 - compartments2) dist_peaks = sg.find_peaks_cwt(dists, np.arange(1, 10))
chrom2.res = 100000

# Snap each chromosome's bounds outward to multiples of its resolution:
# the start is floored and the end is ceiled.
chrom1.minPos = int(np.floor(float(chrom1.minPos) / chrom1.res)) * chrom1.res
chrom1.maxPos = int(np.ceil(float(chrom1.maxPos) / chrom1.res)) * chrom1.res
chrom2.minPos = int(np.floor(float(chrom2.minPos) / chrom2.res)) * chrom2.res
chrom2.maxPos = int(np.ceil(float(chrom2.maxPos) / chrom2.res)) * chrom2.res

# Build one structure per input file and make the pair compatible.
low_struct1 = dt.structureFromBed(path1, chrom1)
low_struct2 = dt.structureFromBed(path2, chrom2)
dt.make_compatible((low_struct1, low_struct2))

contacts1 = dt.matFromBed(path1, low_struct1)
contacts2 = dt.matFromBed(path2, low_struct2)

# Cell type 1: read column 6 of the coverage file, keep the rows selected by
# the structure's bin numbers, and pass them to the compartment call.
coverage_path = "binding_data/{}_{}_100kb_active_coverage.bed".format(
    format_celltype(cell_type1), chrom)
enrichments = np.loadtxt(coverage_path, usecols=6)
bin_nums = low_struct1.nonzero_bins_whole_chrom()
enrichments = enrichments[bin_nums]
compartments1 = np.array(
    ca.get_compartments(contacts1, low_struct1, enrichments))

# Cell type 2: same steps; `enrichments` and `bin_nums` are rebound.
coverage_path = "binding_data/{}_{}_100kb_active_coverage.bed".format(
    format_celltype(cell_type2), chrom)
enrichments = np.loadtxt(coverage_path, usecols=6)
bin_nums = low_struct2.nonzero_bins_whole_chrom()
enrichments = enrichments[bin_nums]
compartments2 = np.array(
    ca.get_compartments(contacts2, low_struct2, enrichments))

# `structure1` and `dists` are defined before this fragment.
gen_coords = structure1.getGenCoords()

# Per-bin absolute compartment difference.
compartment_diffs = np.abs(compartments1 - compartments2)

# Indices of peaks in `dists`, searched with wavelet widths 1..9.
dist_peaks = sg.find_peaks_cwt(dists, np.arange(1, 10))

# Genomic coordinates of structure1 and of the structure built above.
high_coords = structure1.getGenCoords()
low_coords = low_struct1.getGenCoords()
# One list of SVR scores per species, filled in below.
boxes = [[] for _ in all_species]

for i, (species, res_kb) in enumerate(zip(all_species, all_res_kb)):
    # Each line of "<species>_list.txt" is a file-name prefix. The `with`
    # block closes the file on exit, so no explicit close() is needed.
    with open("{}_list.txt".format(species)) as infile:
        for line in infile:
            prefix = line.strip()
            for chrom in range(1, 23):
                path = "hic_data/{}_{}_{}kb.bed".format(prefix, chrom, res_kb)
                if not os.path.isfile(path):
                    continue

                # Run minimds on the file, then load the structure file
                # expected next to it.
                # NOTE(review): `path` is interpolated into a shell command;
                # a prefix containing spaces or shell metacharacters would
                # break or alter it. Consider subprocess with an argument list.
                os.system("python ../minimds.py {}".format(path))
                structure = dt.structure_from_file(
                    "hic_data/{}_{}_{}kb_structure.tsv".format(
                        prefix, chrom, res_kb))

                # Score a linear SVR of compartment score on 3D coordinates,
                # evaluated on the data it was fitted to.
                mat = dt.matFromBed(path, structure)
                comps = ca.get_compartments(mat, structure)
                coords = structure.getCoords()
                clf = svm.LinearSVR()
                clf.fit(coords, comps)
                boxes[i].append(clf.score(coords, comps))

# Frameless axes spanning 9 rows and 10 columns of a 10x10 grid.
plt.subplot2grid((10, 10), (0, 0), rowspan=9, colspan=10, frameon=False)

# Label axes.
plt.ylabel("SVR R^2", fontsize=10)

# Define offsets.
ys = boxes
n = len(ys)
res_kb = int(res/1000)

# NOTE(review): chromosomes 9 and 22 are absent from this list; confirm that
# the omission is intentional.
for chrom in (1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21):
    path1 = "hic_data/GM12878_combined_{}_100kb.bed".format(chrom)
    structure1 = dt.structureFromBed(path1)
    path2 = "hic_data/K562_{}_100kb.bed".format(chrom)
    structure2 = dt.structureFromBed(path2)
    dt.make_compatible((structure1, structure2))

    # GM12878: column 6 of the coverage file, restricted to the structure's
    # bins. The bin offset must be an integer: a float offset would turn
    # bin_nums into a float array, which NumPy rejects as an index.
    contacts = dt.matFromBed(path1, structure1)
    enrichments = np.array(np.loadtxt("binding_data/GM12878_{}_100kb_active_coverage.bed".format(chrom), dtype=object)[:,6], dtype=float)
    bin_nums = structure1.nonzero_abs_indices() + int(structure1.chrom.minPos / structure1.chrom.res)
    enrichments = enrichments[bin_nums]
    compartments1 = np.array(ca.get_compartments(contacts, structure1, enrichments))

    # K562: same steps; `contacts`, `enrichments` and `bin_nums` are rebound.
    contacts = dt.matFromBed(path2, structure2)
    enrichments = np.array(np.loadtxt("binding_data/K562_{}_100kb_active_coverage.bed".format(chrom), dtype=object)[:,6], dtype=float)
    bin_nums = structure2.nonzero_abs_indices() + int(structure2.chrom.minPos / structure2.chrom.res)
    enrichments = enrichments[bin_nums]
    compartments2 = np.array(ca.get_compartments(contacts, structure2, enrichments))

    gen_coords = structure1.getGenCoords()

    # Append to the output file, so rows accumulate across chromosomes
    # (and across runs).
    with open("A_compartment_{}kb.bed".format(res_kb), "a") as out:
        for gen_coord, compartment1, compartment2 in zip(gen_coords, compartments1, compartments2):
            # Keep bins where both scores are positive and differ by less
            # than 0.2.
            if compartment1 > 0 and compartment2 > 0 and np.abs(compartment1 - compartment2) < 0.2:
                # Split each 100 kb bin into sub-intervals of width `res`.
                for i in range(int(100/res_kb)):
                    out.write("\t".join((structure1.chrom.name, str(gen_coord + i*res), str(gen_coord + (i+1)*res), str(compartment1), str(compartment2))))
                    out.write("\n")
path2 = "hic_data/K562_{}_100kb.bed".format(chrom) structure2 = dt.structureFromBed(path2) dt.make_compatible((structure1, structure2)) contacts = dt.matFromBed(path1, structure1) enrichments = np.array(np.loadtxt( "binding_data/GM12878_{}_100kb_active_coverage.bed".format(chrom), dtype=object)[:, 6], dtype=float) bin_nums = structure1.nonzero_abs_indices( ) + structure1.chrom.minPos / structure1.chrom.res enrichments = enrichments[bin_nums] compartments1 = np.array( ca.get_compartments(contacts, structure1, enrichments)) contacts = dt.matFromBed(path2, structure2) enrichments = np.array(np.loadtxt( "binding_data/K562_{}_100kb_active_coverage.bed".format(chrom), dtype=object)[:, 6], dtype=float) bin_nums = structure2.nonzero_abs_indices( ) + structure2.chrom.minPos / structure2.chrom.res enrichments = enrichments[bin_nums] compartments2 = np.array( ca.get_compartments(contacts, structure2, enrichments)) gen_coords = structure1.getGenCoords() with open("B_compartment_{}kb.bed".format(res_kb), "a") as out:
from multimds import linear_algebra as la
from mayavi import mlab
from multimds import multimds as mm

# Contact files for the two cell types being compared.
path1 = "hic_data/GM12878_combined_21_100kb.bed"
path2 = "hic_data/K562_21_100kb.bed"

# full_mds takes both files and returns one structure per input.
struct1, struct2 = mm.full_mds(path1, path2)

# GM12878: column 6 of the coverage file, restricted to the structure's bins
# (absolute indices shifted by the integer bin offset of the chromosome start).
contacts1 = dt.matFromBed(path1, struct1)
enrichments1 = np.loadtxt(
    "binding_data/GM12878_21_100kb_active_coverage.bed", usecols=6)
bin_offset1 = int(struct1.chrom.minPos / struct1.chrom.res)
bin_nums1 = struct1.nonzero_abs_indices() + bin_offset1
enrichments1 = enrichments1[bin_nums1]
comps1 = np.array(ca.get_compartments(contacts1, struct1, enrichments1))

# K562: same steps for the second structure.
contacts2 = dt.matFromBed(path2, struct2)
enrichments2 = np.loadtxt(
    "binding_data/K562_21_100kb_active_coverage.bed", usecols=6)
bin_offset2 = int(struct2.chrom.minPos / struct2.chrom.res)
bin_nums2 = struct2.nonzero_abs_indices() + bin_offset2
enrichments2 = enrichments2[bin_nums2]
comps2 = np.array(ca.get_compartments(contacts2, struct2, enrichments2))

# Linear SVR of compartment score on 3D coordinates, fitted on the points of
# both structures stacked together.
coords1 = struct1.getCoords()
coords2 = struct2.getCoords()
coords = np.concatenate((coords1, coords2))
compartments = np.concatenate((comps1, comps2))
clf = svm.LinearSVR()
clf.fit(coords, compartments)