Esempio n. 1
0
def fullMDS(path, classical, alpha, num_threads, weight):
    """Run MDS on the whole dataset at once, with no partitioning.

    A structure and a contact matrix are built from the BED file at
    ``path`` and handed to ``infer_structure`` together with the remaining
    arguments; the structure object is then returned.  The return value of
    ``infer_structure`` is ignored, so it presumably updates the structure
    in place (confirm against its definition).
    """
    bed_size = dt.size_from_bed(path)
    full_structure = dt.structureFromBed(path, bed_size)
    contacts = dt.matFromBed(path, bed_size, full_structure)
    infer_structure(contacts, full_structure, alpha, num_threads, weight,
                    classical)
    return full_structure
Esempio n. 2
0
from multimds import compartment_analysis as ca
from multimds import data_tools as dt
from scipy import stats as st
from matplotlib import pyplot as plt
import numpy as np
from multimds import linear_algebra as la
from scipy import signal as sg
from multimds import multimds as mm

path1 = "hic_data/GM12878_combined_19_100kb.bed"
path2 = "hic_data/K562_19_100kb.bed"

# Jointly infer one structure per input file.
struct1, struct2 = mm.full_mds(path1, path2, prefix="test_")

# Compartment values for each dataset, computed on its own structure.
# The contact matrices are read from path1/path2 directly so that they come
# from exactly the files the structures were inferred from; no separate
# chromosome/resolution variables exist in this script to rebuild the names.
mat1 = dt.matFromBed(path1, struct1)
comps1 = ca.get_compartments(mat1, struct1)
mat2 = dt.matFromBed(path2, struct2)
comps2 = ca.get_compartments(mat2, struct2)

# If the two compartment tracks are anti-correlated, flip the sign of the
# first one so that both have the same orientation before comparing them.
r, p = st.pearsonr(comps1, comps2)
if r < 0:
    comps1 = -comps1

# Per-bin absolute difference in compartment value.
comp_diffs = np.abs(comps1 - comps2)

# Per-bin distance between corresponding coordinates of the two structures.
dists = np.array([
    la.calcDistance(coord1, coord2)
    for coord1, coord2 in zip(struct1.getCoords(), struct2.getCoords())
])
Esempio n. 3
0
from multimds import data_tools as dt
from multimds import compartment_analysis as ca
import numpy as np
from sklearn import svm
from multimds import linear_algebra as la
from mayavi import mlab

# Load a previously saved structure from its TSV file.
struct = dt.structure_from_file(
    "hic_data/GM12878_combined_21_100kb_structure.tsv")

# Keep only a sub-range of points: from the index returned for genomic
# coordinate 15000000 up to len(points) - 3.
# NOTE(review): the reason for the "- 3" end bound is not visible here.
new_start = struct.chrom.getAbsoluteIndex(15000000)
struct.subsamplePoints(new_start, len(struct.points) - 3)

#compartments
# Contact matrix read from the BED file for the (subsampled) structure.
contacts = dt.matFromBed("hic_data/GM12878_combined_21_100kb.bed", struct)

compartments = np.array(ca.get_compartments(contacts, struct))

#SVR
# Fit a linear support vector regression with the structure coordinates as
# features and the compartment values as targets; keep its coefficients.
coords = struct.getCoords()
clf = svm.LinearSVR()
clf.fit(coords, compartments)
coef = clf.coef_

# presumably re-expresses the coordinates in a frame derived from the SVR
# coefficient vector -- confirm against la.change_coordinate_system
transformed_coords = np.array(la.change_coordinate_system(coef, coords))
# Extent of the transformed coordinates along column 0 ...
xs = transformed_coords[:, 0]
min_x = min(xs)
max_x = max(xs)
x_range = max_x - min_x
# ... and the start of the same computation for column 1.
ys = transformed_coords[:, 1]
min_y = min(ys)
Esempio n. 4
0
        # Process every chromosome for which both cell types have a BED file.
        for chrom in chroms:
            path1 = "hic_data/{}_{}_{}kb.bed".format(cell_type1, chrom, res_kb)
            path2 = "hic_data/{}_{}_{}kb.bed".format(cell_type2, chrom, res_kb)

            if os.path.isfile(path1) and os.path.isfile(path2):
                # Jointly infer one structure per input file.
                structure1, structure2 = multimds.full_mds(path1,
                                                           path2,
                                                           penalty=penalty)

                # Rescale both structures, then apply to structure1 the
                # transformation computed between the two (presumably a
                # rotation r and translation t aligning structure1 to
                # structure2 -- confirm in linear_algebra).
                structure1.rescale()
                structure2.rescale()
                r, t = la.getTransformation(structure1, structure2)
                structure1.transform(r, t)

                #compartments
                contacts1 = dt.matFromBed(path1, structure=structure1)
                contacts2 = dt.matFromBed(path2, structure=structure2)

                compartments1 = np.array(
                    ca.get_compartments(contacts1, structure1))
                compartments2 = np.array(
                    ca.get_compartments(contacts2, structure2))

                # NOTE: `r` is rebound here to the Pearson correlation; the
                # transformation stored in it above has already been applied.
                r, p = st.pearsonr(compartments1, compartments2)
                if r < 0:
                    # Anti-correlated tracks: flip the second one so both
                    # have the same sign orientation.
                    compartments2 = -compartments2

                #SVR
                # Stack the coordinates of both structures into one array.
                coords1 = structure1.getCoords()
                coords2 = structure2.getCoords()
                coords = np.concatenate((coords1, coords2))
Esempio n. 5
0
#compartments
# Chromosome parameters for both inputs, overridden to a fixed resolution of
# 100000 with the position bounds snapped outwards to multiples of it
# (minPos rounded down, maxPos rounded up).
chrom1 = dt.chromFromBed(path1)
chrom2 = dt.chromFromBed(path2)
chrom1.res = 100000  #reduce res to reduce RAM usage in compartment calculation
chrom2.res = 100000
chrom1.minPos = int(np.floor(
    float(chrom1.minPos) / chrom1.res)) * chrom1.res  #round
chrom1.maxPos = int(np.ceil(float(chrom1.maxPos) / chrom1.res)) * chrom1.res
chrom2.minPos = int(np.floor(
    float(chrom2.minPos) / chrom2.res)) * chrom2.res  #round
chrom2.maxPos = int(np.ceil(float(chrom2.maxPos) / chrom2.res)) * chrom2.res

# Structures at the overridden resolution, made compatible with each other
# (presumably restricted to a common set of bins -- confirm in data_tools),
# and the contact matrices read for them.
low_struct1 = dt.structureFromBed(path1, chrom1)
low_struct2 = dt.structureFromBed(path2, chrom2)
dt.make_compatible((low_struct1, low_struct2))
contacts1 = dt.matFromBed(path1, low_struct1)
contacts2 = dt.matFromBed(path2, low_struct2)

# Column 6 of the coverage file for cell type 1, restricted to the bins
# returned by nonzero_bins_whole_chrom(), is passed to get_compartments as
# its third argument.
enrichments = np.loadtxt("binding_data/{}_{}_100kb_active_coverage.bed".format(
    format_celltype(cell_type1), chrom),
                         usecols=6)
bin_nums = low_struct1.nonzero_bins_whole_chrom()
enrichments = enrichments[bin_nums]
compartments1 = np.array(
    ca.get_compartments(contacts1, low_struct1, enrichments))

# Same selection for cell type 2 (`enrichments` and `bin_nums` are reused).
enrichments = np.loadtxt("binding_data/{}_{}_100kb_active_coverage.bed".format(
    format_celltype(cell_type2), chrom),
                         usecols=6)
bin_nums = low_struct2.nonzero_bins_whole_chrom()
enrichments = enrichments[bin_nums]
Esempio n. 6
0
from multimds import data_tools as dt
import numpy as np
from multimds import tools

# Command-line arguments: input BED path, resolution (integer), output path.
path = sys.argv[1]
res = int(sys.argv[2])
outpath = sys.argv[3]

# Chromosome parameters from the BED file, overridden to the requested
# resolution with the bounds snapped outwards to multiples of it.
size = dt.size_from_bed(path)
chrom = dt.chromFromBed(path)
chrom.res = res
chrom.minPos = int(np.floor(float(chrom.minPos) / res)) * res  #round
chrom.maxPos = int(np.ceil(float(chrom.maxPos) / res)) * res

struct = dt.structureFromBed(path, size, chrom)
mat = dt.matFromBed(path, size, struct)

points = struct.getPoints()

# Write every non-zero entry strictly below the diagonal of `mat` as seven
# tab-separated fields: name, start, start + res for bin i, the same three
# for bin j, and the matrix value.
# NOTE(review): no line terminator is written in the visible part of this
# loop; verify that one is emitted after each record.
with open(outpath, "w") as out:
    for i in range(len(mat)):
        abs_index1 = points[i].absolute_index
        for j in range(i):
            if mat[i, j] != 0:
                abs_index2 = points[j].absolute_index
                out.write("\t".join(
                    (chrom.name, str(chrom.getGenCoord(abs_index1)),
                     str(chrom.getGenCoord(abs_index1) + res), chrom.name,
                     str(chrom.getGenCoord(abs_index2)),
                     str(chrom.getGenCoord(abs_index2) + res), str(mat[i,
                                                                       j]))))
Esempio n. 7
0
import numpy as np

all_species = ("Mouse", "Human", "Yeast")
all_res_kb = (100, 100, 32)

# One list of values per species, in the order of all_species.
boxes = [[] for _ in all_species]

for i, (species, res_kb) in enumerate(zip(all_species, all_res_kb)):
    # "<species>_list.txt" is read one line at a time; each stripped line is
    # used as a dataset prefix.  The with-statement closes the file on exit,
    # so no explicit close() call is needed.
    with open("{}_list.txt".format(species)) as infile:
        for line in infile:
            prefix = line.strip()
            for chrom in range(1, 23):
                path = "hic_data/{}_{}_{}kb.bed".format(prefix, chrom, res_kb)

                # Chromosomes without a data file are skipped silently.
                if os.path.isfile(path):
                    mat = dt.matFromBed(path)
                    oe_mat = ca.oe(mat)
                    cor_mat = ca.cor(oe_mat)
                    # Record the fraction of variance explained by the first
                    # principal component of the matrix returned by ca.cor.
                    pca = PCA(n_components=1)
                    pca.fit(cor_mat)
                    boxes[i].append(pca.explained_variance_ratio_[0])

#start with a frameless plot (extra room on the left)
plt.subplot2grid((10, 10), (0, 0), 9, 10, frameon=False)

#label axes
plt.ylabel("PC1 explained variance ratio", fontsize=10)

#define offsets
Esempio n. 8
0
all_res_kb = (100, 100, 32)
# One list of values per species, in the order of all_species.
boxes = [[] for _ in all_species]

for i, (species, res_kb) in enumerate(zip(all_species, all_res_kb)):
    # "<species>_list.txt" is read one line at a time; each stripped line is
    # used as a dataset prefix.  The with-statement closes the file on exit,
    # so no explicit close() call is needed.
    with open("{}_list.txt".format(species)) as infile:
        for line in infile:
            prefix = line.strip()
            for chrom in range(1, 23):
                path = "hic_data/{}_{}_{}kb.bed".format(prefix, chrom, res_kb)

                # Chromosomes without a data file are skipped silently.
                if os.path.isfile(path):
                    # NOTE(review): `path` is interpolated into a shell
                    # command and the exit status is ignored; a failed run
                    # only surfaces when the structure file is read below.
                    os.system("python ../minimds.py {}".format(path))
                    structure = dt.structure_from_file(
                        "hic_data/{}_{}_{}kb_structure.tsv".format(
                            prefix, chrom, res_kb))
                    mat = dt.matFromBed(path, structure)
                    comps = ca.get_compartments(mat, structure)
                    # Fit a linear SVR from coordinates to compartment values
                    # and record its score on the same data it was fit on.
                    coords = structure.getCoords()
                    clf = svm.LinearSVR()
                    clf.fit(coords, comps)
                    boxes[i].append(clf.score(coords, comps))

plt.subplot2grid((10, 10), (0, 0), 9, 10, frameon=False)

#label axes
plt.ylabel("SVR R^2", fontsize=10)

#define offsets
ys = boxes
Esempio n. 9
0
def partitionedMDS(path, args):
    """Partitions structure into substructures and performs MDS

    path -- BED file the structures and contact matrices are read from
    args -- sequence read by position:
        0 domainSmoothingParameter, 1 minSizeFraction (both passed to
          tad.getDomains)
        2 maxmemory (read but not used in this function)
        3 num_threads
        4 alpha (passed to the low-resolution infer_structure call)
        5 res_ratio (factor applied to the resolution of the low-res chrom)
        6 alpha2 (passed to the per-substructure infer_structure calls)
        7 weight

    Returns the high-resolution structure assembled from the substructures.
    """
    domainSmoothingParameter = args[0]
    minSizeFraction = args[1]
    maxmemory = args[2]  # NOTE(review): not used below
    num_threads = args[3]
    alpha = args[4]
    res_ratio = args[5]
    alpha2 = args[6]
    weight = args[7]

    #create low-res structure
    low_chrom = dt.chromFromBed(path)
    low_chrom.res *= res_ratio
    lowstructure = dt.structureFromBed(path, low_chrom)  #low global structure

    #get TADs
    low_contactMat = dt.matFromBed(path, lowstructure)
    low_tads = tad.getDomains(
        low_contactMat, lowstructure, domainSmoothingParameter, minSizeFraction
    )  #low substructures, defined on relative indices not absolute indices
    tad.substructuresFromTads(lowstructure, low_tads)

    #create high-res chrom
    # Same bounds and name as the low-res chrom, but with the resolution and
    # size reported for the BED file itself.
    size, res = dt.basicParamsFromBed(path)
    highChrom = dt.ChromParameters(lowstructure.chrom.minPos,
                                   lowstructure.chrom.maxPos, res,
                                   lowstructure.chrom.name, size)

    #create high-res structure
    highstructure = dt.Structure([], [], highChrom, 0)

    #initialize high-res substructures
    # One high-res substructure per low-res TAD, spanning the genomic
    # coordinates of the TAD's first and second index.
    high_substructures = []
    low_gen_coords = lowstructure.getGenCoords()
    offset = 0  #initialize
    for i, low_tad in enumerate(low_tads):
        start_gen_coord = low_gen_coords[low_tad[0]]
        if i == len(low_tads) - 1:  #for last tad, avoid rounding error
            end_gen_coord = highstructure.chrom.maxPos
        else:
            end_gen_coord = low_gen_coords[low_tad[1]]
        high_substructure = dt.structureFromBed(path, highChrom,
                                                start_gen_coord, end_gen_coord,
                                                offset)
        high_substructures.append(high_substructure)
        offset += len(high_substructure.points)  #update
        # NOTE(review): net advance is len(points) - 1; presumably adjacent
        # substructures share a boundary point -- confirm.
        offset -= 1

    highstructure.setstructures(high_substructures)

    # MDS on the low-resolution global structure.
    infer_structure(low_contactMat, lowstructure, alpha, num_threads, weight)
    print("Low-resolution MDS complete")

    # Shared lists so the parallel workers below can read the substructures
    # and store their results.
    highSubstructures = pymp.shared.list(highstructure.structures)
    lowSubstructures = pymp.shared.list(lowstructure.structures)

    numSubstructures = len(highstructure.structures)
    num_threads = min(
        (num_threads, mp.cpu_count(), numSubstructures)
    )  #don't exceed number of requested threads, available threads, or structures
    with pymp.Parallel(num_threads) as p:
        for substructurenum in p.range(numSubstructures):
            highSubstructure = highSubstructures[substructurenum]
            if len(highSubstructure.getPoints()) > 0:  #skip empty
                trueLow = lowSubstructures[substructurenum]

                #perform MDS individually
                structure_contactMat = dt.matFromBed(
                    path,
                    highSubstructure)  #contact matrix for this structure only
                infer_structure(structure_contactMat, highSubstructure, alpha2,
                                num_threads, weight)

                #approximate as low resolution
                inferredLow = dt.highToLow(highSubstructure, res_ratio)

                #rescale
                # Scale the low-res approximation so that its radius of
                # gyration matches that of the true low-res substructure.
                scaling_factor = la.radius_of_gyration(
                    trueLow) / la.radius_of_gyration(inferredLow)
                for i, point in enumerate(inferredLow.points):
                    # Entries equal to 0 are skipped (presumably placeholders
                    # for missing points -- confirm in data_tools).
                    if point != 0:
                        x, y, z = point.pos
                        inferredLow.points[i].pos = (x * scaling_factor,
                                                     y * scaling_factor,
                                                     z * scaling_factor)

                #recover the transformation for inferred from true low structure
                r, t = la.getTransformation(inferredLow, trueLow)
                # t was computed on the rescaled copy; it is divided by the
                # same factor before being applied to the unscaled
                # high-res substructure.
                t /= scaling_factor

                #transform high structure
                highSubstructure.transform(r, t)
                # Store the result back into the shared list.
                highSubstructures[substructurenum] = highSubstructure

                print("MDS performed on structure {} of {}".format(
                    substructurenum + 1, numSubstructures))

    # Replace the substructures with the processed ones and recompute
    # relative indices.
    highstructure.setstructures(highSubstructures)
    highstructure.set_rel_indices()

    return highstructure
Esempio n. 10
0
import numpy as np

# Resolution from the command line, and the same value in kb.
res = int(sys.argv[1])
res_kb = int(res / 1000)

for chrom in (1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
              20, 21):
    path1 = "hic_data/GM12878_combined_{}_100kb.bed".format(chrom)
    structure1 = dt.structureFromBed(path1)

    path2 = "hic_data/K562_{}_100kb.bed".format(chrom)
    structure2 = dt.structureFromBed(path2)

    dt.make_compatible((structure1, structure2))

    contacts = dt.matFromBed(path1, structure1)
    # Column 6 of the coverage file, converted to float.
    enrichments = np.array(np.loadtxt(
        "binding_data/GM12878_{}_100kb_active_coverage.bed".format(chrom),
        dtype=object)[:, 6],
                           dtype=float)
    # Shift the structure's indices by the bin number of minPos.  The offset
    # must be an integer: true division would yield floats, which cannot be
    # used to index `enrichments`.
    bin_offset = int(structure1.chrom.minPos // structure1.chrom.res)
    bin_nums = structure1.nonzero_abs_indices() + bin_offset
    enrichments = enrichments[bin_nums]
    compartments1 = np.array(
        ca.get_compartments(contacts, structure1, enrichments))

    # Same inputs for the second cell type (`contacts` and `enrichments` are
    # reused).
    contacts = dt.matFromBed(path2, structure2)
    enrichments = np.array(np.loadtxt(
        "binding_data/K562_{}_100kb_active_coverage.bed".format(chrom),
        dtype=object)[:, 6],
                           dtype=float)