Beispiel #1
0
def full_mds(path1,
             path2,
             alpha=4,
             penalty=0.05,
             num_threads=3,
             weight=0.05,
             prefix=""):
    """Jointly infer two structures with MDS, without partitioning.

    Loads a structure and a contact matrix from each input file, makes the
    two structures compatible, runs ``infer_structures`` on the pair and then
    writes the results to disk:

    * ``<prefix><name1>_structure.tsv`` and ``<prefix><name2>_structure.tsv``
      (one per input, via ``Structure.write``), where ``<nameN>`` is the
      input file's base name without its extension;
    * ``<prefix><name1>_<name2>_relocalization.bed`` with one tab-separated
      line per genomic coordinate of the first structure: chromosome name,
      coordinate, coordinate plus the chromosome resolution, and the
      distance returned by ``la.calculate_distances`` for that position.

    The fractional compartment change is also printed to stdout.

    Args:
        path1: Path of the first input file (read with
            ``dt.structureFromBed`` / ``dt.matFromBed``).
        path2: Path of the second input file.
        alpha: Forwarded unchanged to ``infer_structures``.
        penalty: Forwarded unchanged to ``infer_structures``.
        num_threads: Forwarded unchanged to ``infer_structures``.
        weight: Forwarded unchanged to ``infer_structures``.
        prefix: String prepended to every output file name.

    Returns:
        The tuple ``(structure1, structure2)``.
    """
    structure1 = dt.structureFromBed(path1)
    structure2 = dt.structureFromBed(path2)
    dt.make_compatible((structure1, structure2))
    contactMat1 = dt.matFromBed(path1, structure1)
    contactMat2 = dt.matFromBed(path2, structure2)
    # NOTE(review): the return value is not used, so infer_structures
    # presumably updates both structures in place -- confirm.
    infer_structures(contactMat1, structure1, contactMat2, structure2, alpha,
                     penalty, num_threads, weight)

    prefix1 = os.path.splitext(os.path.basename(path1))[0]
    structure1.write("{}{}_structure.tsv".format(prefix, prefix1))
    prefix2 = os.path.splitext(os.path.basename(path2))[0]
    structure2.write("{}{}_structure.tsv".format(prefix, prefix2))

    dists = la.calculate_distances(structure1, structure2)
    chrom = structure1.chrom
    relocalization_path = "{}{}_{}_relocalization.bed".format(
        prefix, prefix1, prefix2)
    # The context manager closes the file on exit, so no explicit close()
    # is needed (or wanted) inside the block.
    with open(relocalization_path, "w") as out:
        for gen_coord, dist in zip(structure1.getGenCoords(), dists):
            out.write("\t".join((chrom.name, str(gen_coord),
                                 str(gen_coord + chrom.res), str(dist))))
            out.write("\n")

    print("Fractional compartment change: ")
    print(calculate_compartment_fraction(structure1, structure2, path1, path2))

    return structure1, structure2
Beispiel #2
0
def fullMDS(path1, path2, alpha, penalty, num_threads, weight):
    """Run joint MDS on two whole structures (no partitioning).

    Builds a structure and a contact matrix from each input path, makes the
    two structures compatible and hands everything to ``infer_structures``
    together with ``alpha``, ``penalty``, ``num_threads`` and ``weight``.

    Returns the pair ``(structure1, structure2)``.
    """
    paths = (path1, path2)
    structures = tuple(dt.structureFromBed(path) for path in paths)
    dt.make_compatible(structures)

    # Contact matrices are built only after the structures were made
    # compatible, first for path1 and then for path2.
    contact_mats = [
        dt.matFromBed(path, structure)
        for path, structure in zip(paths, structures)
    ]

    first, second = structures
    infer_structures(contact_mats[0], first, contact_mats[1], second, alpha,
                     penalty, num_threads, weight)
    return first, second
Beispiel #3
0
import linear_algebra as la
from sklearn.manifold import MDS

# Chromosome name is taken from the first command-line argument.
chrom = sys.argv[1]
res_kb = 100
prefix1, prefix2 = "GM12878_combined", "K562"

# One input file per dataset, same chromosome and resolution.
path1, path2 = ("hic_data/{}_{}_{}kb.bed".format(name, chrom, res_kb)
                for name in (prefix1, prefix2))

structure1, structure2 = (dt.structureFromBed(path, None, None)
                          for path in (path1, path2))

# Make the two structures compatible before computing distances.
dt.make_compatible((structure1, structure2))

# Normalized distance matrix for each dataset.
dists1 = dt.normalized_dist_mat(path1, structure1)
dists2 = dt.normalized_dist_mat(path2, structure2)

# Independent 3-component MDS embedding of each precomputed distance matrix;
# each run gets its own freshly created RandomState, and dists1 is embedded
# before dists2.
coords1, coords2 = (MDS(n_components=3,
                        random_state=np.random.RandomState(),
                        dissimilarity="precomputed",
                        n_jobs=-1).fit_transform(dist_mat)
                    for dist_mat in (dists1, dists2))
Beispiel #4
0
def partitioned_mds(path1,
                    path2,
                    prefix="",
                    centromere=0,
                    num_partitions=4,
                    maxmemory=32000000,
                    num_threads=3,
                    alpha=4,
                    res_ratio=10,
                    penalty=0.05,
                    weight=0.05):
    """Run joint MDS at low resolution, then per partition at high resolution.

    Steps, in order:

    1. Build a low-resolution structure for each input and make the pair
       compatible.
    2. Split the low-resolution index range into ``num_partitions``
       partitions, half of them before the midpoint and half after it.
    3. Build one high-resolution substructure per partition for each input.
    4. Run ``infer_structures`` on the low-resolution pair.
    5. In parallel (pymp), run ``infer_structures`` on every pair of
       high-resolution substructures and ``transform`` each against its
       low-resolution counterpart.

    Args:
        path1: Path of the first input file.
        path2: Path of the second input file.
        prefix: Not used by this function.
        centromere: Position passed to ``chrom.getAbsoluteIndex`` to obtain
            the midpoint; when 0 the midpoint is half the number of
            low-resolution points.
        num_partitions: Total number of partitions; asserted to be even.
        maxmemory: Not used by this function.
        num_threads: Requested thread count; capped by the CPU count and by
            the number of substructures before the parallel step.
        alpha: Passed to ``infer_structures`` for the low-resolution pass
            only (the high-resolution pass uses a fixed 2.5).
        res_ratio: Passed to ``create_low_res_structure`` and ``transform``.
        penalty: Passed to every ``infer_structures`` call.
        weight: Passed to every ``infer_structures`` call.

    Returns:
        The tuple ``(highstructure1, highstructure2)``.
    """

    def high_res_chrom(path, low_structure):
        # Same extent and name as the low-res chrom, with the size and
        # resolution read from the input file itself.
        size, res = dt.basicParamsFromBed(path)
        low_chrom = low_structure.chrom
        return dt.ChromParameters(low_chrom.minPos, low_chrom.maxPos, res,
                                  low_chrom.name, size)

    # --- low-resolution structures -------------------------------------
    lowstructure1 = create_low_res_structure(path1, res_ratio)
    lowstructure2 = create_low_res_structure(path2, res_ratio)
    dt.make_compatible((lowstructure1, lowstructure2))

    # --- partition boundaries (absolute indices, not relative ones) ----
    n = len(lowstructure1.getPoints())
    midpoint = (int(n / 2) if centromere == 0 else
                lowstructure1.chrom.getAbsoluteIndex(centromere))

    assert num_partitions % 2 == 0

    per_arm = num_partitions / 2
    size_before = int(np.ceil(float(midpoint) / per_arm))
    size_after = int(np.ceil(float(n - midpoint) / per_arm))
    arm_count = int(per_arm)

    before_mid = [(k * size_before, min((k + 1) * size_before, midpoint))
                  for k in range(arm_count)]
    after_mid = [(midpoint + k * size_after,
                  min(midpoint + (k + 1) * size_after, n - 1))
                 for k in range(arm_count)]
    lowpartitions = np.array(before_mid + after_mid)

    low_contactMat1 = dt.matFromBed(path1, lowstructure1)
    low_contactMat2 = dt.matFromBed(path2, lowstructure2)

    for low_structure in (lowstructure1, lowstructure2):
        tad.substructuresFromAbsoluteTads(low_structure, lowpartitions)

    # --- high-resolution chroms and substructures ----------------------
    highChrom1 = high_res_chrom(path1, lowstructure1)
    highChrom2 = high_res_chrom(path2, lowstructure2)

    high_substructures1 = []
    high_substructures2 = []
    low_gen_coords = lowstructure1.getGenCoords()
    offset1 = offset2 = 0
    for first_index, last_index in lowpartitions:
        start_gen_coord = low_gen_coords[first_index]
        end_gen_coord = low_gen_coords[last_index]
        sub1 = dt.structureFromBed(path1, highChrom1, start_gen_coord,
                                   end_gen_coord, offset1)
        sub2 = dt.structureFromBed(path2, highChrom2, start_gen_coord,
                                   end_gen_coord, offset2)
        high_substructures1.append(sub1)
        high_substructures2.append(sub2)
        # Each offset advances by one less than the number of points just
        # loaded for that input.
        offset1 += len(sub1.points) - 1
        offset2 += len(sub2.points) - 1

    for sub_pair in zip(high_substructures1, high_substructures2):
        dt.make_points_compatible(sub_pair)

    highstructure1 = dt.Structure([], high_substructures1, highChrom1, 0)
    highstructure2 = dt.Structure([], high_substructures2, highChrom2, 0)

    # --- low-resolution MDS --------------------------------------------
    infer_structures(low_contactMat1, lowstructure1, low_contactMat2,
                     lowstructure2, alpha, penalty, num_threads, weight)
    print("Low-resolution MDS complete")

    # --- high-resolution MDS, one substructure pair per iteration ------
    shared_high1 = pymp.shared.list(highstructure1.structures)
    shared_high2 = pymp.shared.list(highstructure2.structures)
    shared_low1 = pymp.shared.list(lowstructure1.structures)
    shared_low2 = pymp.shared.list(lowstructure2.structures)

    total = len(highstructure1.structures)
    # Don't exceed the requested threads, the available CPUs, or the number
    # of substructures.
    num_threads = min(num_threads, mp.cpu_count(), total)
    with pymp.Parallel(num_threads) as p:
        for idx in p.range(total):
            high1 = shared_high1[idx]
            high2 = shared_high2[idx]
            low1 = shared_low1[idx]
            low2 = shared_low2[idx]

            # Contact matrices restricted to this substructure only.
            sub_contactMat1 = dt.matFromBed(path1, high1)
            sub_contactMat2 = dt.matFromBed(path2, high2)

            infer_structures(sub_contactMat1, high1, sub_contactMat2, high2,
                             2.5, penalty, num_threads, weight)

            transform(low1, high1, res_ratio)
            transform(low2, high2, res_ratio)

            # Store the updated substructures back into the shared lists.
            shared_high1[idx] = high1
            shared_high2[idx] = high2

            print("MDS performed on structure {} of {}".format(
                idx + 1, total))

    highstructure1.setstructures(shared_high1)
    highstructure2.setstructures(shared_high2)

    highstructure1.set_rel_indices()
    highstructure2.set_rel_indices()

    return highstructure1, highstructure2