Exemple #1
0
def averageContacts(contactIterator, inValues, N, **kwargs):
    """
    Accumulate contacts from many inputs into one contact map, using several
    processes that share a set of shared-memory buckets.

    Args:
        contactIterator:
            an iterator class; it is forwarded to ``init`` untouched
            (see the "filenameContactMap" class for an example)
        inValues:
            sequence of values; ``worker`` is mapped over it, one call per value.
            Would be an array of arrays of filenames or something like that.
        N:
            Size of one side of the resulting contactmap

        **kwargs:
            arrayDtype: ctypes dtype (default c_int32) of the shared buckets
            classInitArgs: forwarded to ``init`` (default [])
            classInitKwargs: forwarded to ``init`` (default {})
            contactProcessing: function f(contacts), should return processed
                contacts (default: identity)
            nproc: int, number of processes (default 4, capped at len(inValues))
            bucketNum: int (default = nproc), number of shared-memory buckets
            contactBlock: int (default 5000000), forwarded to ``init``
            useFmap: if true, run the workers through mirnylib.systemutils.fmap
                instead of multiprocessing.Pool (default False)

    Returns:
        The result of ``triagToNormal`` applied to the concatenated buckets
        (presumably the N x N contact map -- confirm against triagToNormal).
    """
    workerCount = min(kwargs.get("nproc", 4), len(inValues))
    if workerCount == 1:
        # A single process needs no shared memory; use the serial version.
        return averageContactsSimple(contactIterator, inValues, N, **kwargs)

    dtype = kwargs.get("arrayDtype", ctypes.c_int32)
    bucketCount = kwargs.get("bucketNum", workerCount)
    blockSize = kwargs.get("contactBlock", 5000000)
    viaFmap = kwargs.get("useFmap", False)
    initArgs = kwargs.get("classInitArgs", [])
    initKwargs = kwargs.get("classInitKwargs", {})
    process = kwargs.get("contactProcessing", lambda x: x)

    # N * (N + 1) // 2 entries in total, split into bucketCount contiguous pieces.
    edges = np.linspace(0, N * (N + 1) // 2, bucketCount + 1, dtype=int)
    buckets = [
        mp.Array(dtype, int(hi - lo)) for lo, hi in zip(edges[:-1], edges[1:])
    ]
    workerSetup = buckets + [
        process, initArgs, initKwargs, contactIterator, blockSize, N
    ]

    if viaFmap:
        # fmap path: init() is called once in this process before mapping.
        init(*workerSetup)
        from mirnylib.systemutils import fmap
        fmap(worker, inValues, nproc=workerCount)
    else:
        # Pool path: every pool process runs init(*workerSetup) on start-up.
        pool = mp.Pool(processes=workerCount,
                       initializer=init,
                       initargs=workerSetup)
        with closing(pool) as p:
            p.map(worker, inValues)

    flat = np.concatenate([tonumpyarray(bucket) for bucket in buckets])
    return triagToNormal(flat, N)
Exemple #2
0
def makeMoviePymol(fileList,
                   destFolder,
                   fps=15,
                   aviFilename='output.avi',
                   pymolScript=""):
    """
    Convert a list of conformation files to PDB, run PyMOL on them and encode
    the result with ``_mencoder``.

    Parameters
    ----------
    fileList : sequence of str
        Paths of the conformations, one per frame. Nothing is done (and None is
        returned) when it is empty.
    destFolder : str
        Output folder; ``pdb`` and ``img`` subfolders are created inside it.
        An existing ``img`` subfolder is deleted first.
    fps : int
        Passed to ``_mencoder``.
    aviFilename : str
        Passed to ``_mencoder``.
    pymolScript : str
        Extra PyMOL commands inserted between ``smooth mov`` and ``zoom mov``.

    Raises
    ------
    IOError
        If any path in ``fileList`` does not exist.
    """
    import subprocess  # local import: only this function needs it

    numFrames = len(fileList)
    if numFrames == 0:
        # log10(0) below would otherwise end in an OverflowError
        return

    missing = [path for path in fileList if not os.path.exists(path)]
    if missing:
        raise IOError("Some files in fileList do not exist: {0}".format(
            ", ".join(str(path) for path in missing)))

    # zero-padding width of the frame number in the pdb file names
    numDigits = int(np.ceil(np.log10(numFrames)))

    destFolder = os.path.abspath(destFolder)
    pdbFolder = os.path.join(destFolder, 'pdb')
    imgFolder = os.path.join(destFolder, 'img')
    if os.path.exists(imgFolder):
        shutil.rmtree(imgFolder)
    for folder in [destFolder, pdbFolder, imgFolder]:
        if not os.path.isdir(folder):
            # makedirs also creates missing parents of destFolder
            os.makedirs(folder)

    def saveToPdb(item):
        index, dataPath = item
        conformation = polymerutils.load(dataPath)
        pdbFilename = '{0:0{width}}.pdb'.format(index, width=numDigits)
        savePath = os.path.join(pdbFolder, pdbFilename)
        # NOTE(review): colorArray is not defined in this function; it must
        # exist at module level for this call to work -- confirm.
        polymerutils.save(conformation,
                          savePath,
                          mode='pdb',
                          pdbGroups=colorArray)
        return os.path.abspath(savePath)

    pdbPaths = fmap(saveToPdb, enumerate(fileList))

    scriptParts = ['hide all\n']
    scriptParts.extend('load {0}, mov\n'.format(path) for path in pdbPaths)
    scriptParts.append(textwrap.dedent("""
    smooth mov
    """))
    scriptParts.append(pymolScript)
    scriptParts.append("\n")
    scriptParts.append(textwrap.dedent("""
    zoom mov
    """))
    script = "".join(scriptParts)

    tmpScriptPath = os.path.abspath(os.path.join(destFolder, 'movie.pymol'))
    with open(tmpScriptPath, 'w') as tmpScript:
        tmpScript.write(script)

    # Run pymol with imgFolder as working directory. An argument list (no
    # shell) keeps paths with spaces or shell metacharacters intact.
    subprocess.call(["pymol", "-u", tmpScriptPath], cwd=imgFolder)
    _mencoder(imgFolder, fps, aviFilename)
Exemple #3
0
def displayHeatmap():
    """
    Draw a simulated contact heatmap and a Hi-C heatmap in one square image.

    Relies on names defined outside this function (N, low, high, initModel,
    h5dict, completeIC, zoomArray, coarsegrain, fmap, tonumpyarray,
    setExceptionHook, nicePlot). Simulation counts are accumulated into a
    shared N x N array by ``doSim``; entries below the diagonal of the final
    image are then replaced by the Hi-C data.
    """
    plt.figure(figsize=(5, 5))
    sharedCounts = mp.Array(ctypes.c_double, N**2)
    simMap = tonumpyarray(sharedCounts)
    simMap.shape = (N, N)

    def doSim(i):
        # flat view of the same shared buffer
        flatCounts = tonumpyarray(sharedCounts)
        model = initModel(i)

        snapshots = []
        maxSnapshots = 10000
        # random number of snapshots between maxSnapshots // 2 and maxSnapshots
        for _ in range(np.random.randint(maxSnapshots // 2, maxSnapshots)):
            model.steps(150)
            snapshots.append(model.getSMCs())
        pairs = np.concatenate(snapshots, axis=1)
        # row * N + column: index into the flattened N x N array
        flatIndex = pairs[0] * N + pairs[1]
        position, counts = np.unique(flatIndex, return_counts=True)

        with sharedCounts.get_lock():
            flatCounts[position] += counts
        print("Finished!")

        return None

    setExceptionHook()

    binLow = low // 10
    binHigh = high // 10
    store = h5dict(
        "/home/magus/HiC2011/Erez2014/hg19/GM12878_inSitu-all-combined-10k_HighRes.byChr",
        'r')

    hicdata = completeIC(
        store.get_dataset("13 13")[binLow:binHigh, binLow:binHigh])
    newshape = (1000 * (high - low)) // (600 * 20)
    print(hicdata.shape, newshape)
    hicdata = zoomArray(hicdata, (newshape, newshape))
    hicdata = np.clip(hicdata, 0, np.percentile(hicdata, 99.99))
    hicdata /= np.mean(np.sum(hicdata, axis=1))

    fmap(doSim, range(30),
         n=20)  # number of threads to use.  On a 20-core machine I use 20.

    simMap = coarsegrain(simMap, 20)
    simMap = np.clip(simMap, 0, np.percentile(simMap, 99.9))
    simMap /= np.mean(np.sum(simMap, axis=1))

    # True where row index > column index
    index = np.arange(len(simMap))
    belowDiagonal = index[:, None] > index[None, :]

    # NOTE(review): assumes hicdata and simMap have the same shape here -- confirm
    simMap[belowDiagonal] = hicdata[belowDiagonal]

    logMap = np.log(simMap + 0.0001)
    plt.imshow(logMap,
               vmax=np.percentile(logMap, 99.9),
               extent=[low, high, high, low],
               interpolation="none")
    nicePlot()
Exemple #4
0
def averageContacts(contactIterator, inValues, N, **kwargs):
    """
    Average contacts on multiple cores into one shared contact map.

    This function only manages arguments and sets up the shared-memory buckets;
    the work on each input value is done by ``worker`` (set up through ``init``).

    PARAMETERS
    ----------
        contactIterator : iterator
            an iterator. See descriptions of "filenameContactMap" class for
            example and explanations
        inValues : iterable
            an array of values to pass to contactIterator. Would be an array of
            arrays of filenames or something like that. Must support len().
        N : int
            Size of one side of the resulting contactmap

        arrayDtype : ctypes dtype (default c_int32) for the contact map
        classInitArgs : args to pass to the constructor of contact iterator
        classInitKwargs : dict of keyword args to pass to the constructor
        contactProcessing : function f(contacts), should return processed contacts
        nproc : int, number of processors (default 4, capped at len(inValues))
        bucketNum : int (default = nproc) Number of memory buckets to use
        contactBlock : int (default 5000000) Number of contacts to aggregate
            before writing

        useFmap : True, False, or callable
            If True, uses mirnylib.systemutils.fmap
            If False, uses multiprocessing.Pool.map
            Otherwise, uses the provided function, assuming it is of a fork-map
            type (different initializations are needed for forkmap and
            multiprocessing-style map). It is called as
            ``useFmap(worker, inValues, nproc=nproc)``.

    Code that calculates a contactmap from a set of polymer conformations is in
    averageMonomerResolutionContactMap and related methods.

    An example that builds a contactmap from a simulation::

        class simContactMap(object):
            "contactmap 'finder' for a simulation"

            def __init__(self, ind):  # e.g. a random number generator seed
                self.model = initModel(ind)  # build a model object
                self.count = 10000000  # how many times to step the model
                self.model.steps(10000)  # initial equilibration steps

            def next(self):
                if self.count == 0:  # stop after self.count iterations
                    raise StopIteration
                self.count -= 1
                self.model.steps(30)  # advance model by 30 steps
                return np.array(self.model.getSMCs()).T  # current LEF positions

        mymap = polychrom.contactmaps.averageContacts(
            simContactMap, range(20), 30000, nproc=20)
    """
    nproc = min(kwargs.get("nproc", 4), len(inValues))
    if nproc == 1:
        # one process: fall back to the serial implementation
        return averageContactsSimple(contactIterator, inValues, N, **kwargs)

    option = kwargs.get
    arrayDtype = option("arrayDtype", ctypes.c_int32)
    bucketNum = option("bucketNum", nproc)
    mapper = option("useFmap", False)

    # split the N * (N + 1) // 2 map entries into bucketNum contiguous buckets
    bounds = np.linspace(0, N * (N + 1) // 2, bucketNum + 1, dtype=int)
    sharedBuckets = []
    for start, stop in zip(bounds[:-1], bounds[1:]):
        sharedBuckets.append(mp.Array(arrayDtype, int(stop - start)))

    initArguments = list(sharedBuckets)
    initArguments.extend([
        option("contactProcessing", lambda x: x),
        option("classInitArgs", []),
        option("classInitKwargs", {}),
        contactIterator,
        option("contactBlock", 5000000),
        N,
    ])

    if mapper:
        # fork-style map: the buckets become globals created by init() here
        init(*initArguments)
        if callable(mapper):
            fmap = mapper
        else:
            from mirnylib.systemutils import fmap
        fmap(worker, inValues, nproc=nproc)
    else:
        # mp.Pool needs an initializer because shared memory cannot be pickled
        # or passed as an argument in inValues
        with closing(
                mp.Pool(processes=nproc,
                        initializer=init,
                        initargs=initArguments)) as p:
            p.map(worker, inValues)

    merged = np.concatenate([tonumpyarray(bucket) for bucket in sharedBuckets])
    del sharedBuckets  # save memory
    return triagToNormal(merged, N)
def give_slices(base,
                tosave,
                slices,
                sliceParams,
                multipliers,
                mode="chain",
                loadFunction=load,
                integrate=False,
                normalize=False,
                exceptionList=[],
                nproc=4,
                cutoff=1.7,
                binstep=1.15,
                integerSlices=True,
                verbose=False):
    """
    Compute averaged scaling curves for groups ("slices") of conformation files.

    For each value in ``slices`` a list of filenames is generated from ``base``,
    every file is loaded, and three curves are computed per file
    (``giveCpScaling``, ``give_radius_scaling``, ``give_distance``). The curves
    are averaged over the files of the slice.

    Parameters
    ----------
    base : str
        Filename template with "DATA1" and "DATA2" placeholders (and "DATA3"
        when three-element runs are used).
    tosave : str or None
        Path to pickle the result to; nothing is saved when None.
    slices : iterable
        Slice values; each one is substituted for "DATA2".
    sliceParams : tuple, int or list
        Values substituted for "DATA1": ``(first, last)`` inclusive range,
        an int ``b`` meaning ``1..b``, or an explicit list.
    multipliers : list of numbers
        With ``integerSlices``, each slice value is multiplied by every
        multiplier and truncated to int.
    mode : str
        "chain", "ring" or "intring".
        NOTE(review): "parts" never computes the contact curve and "project"
        never defines its bins, so both fail at run time as written.
    loadFunction : callable
        Called as ``loadFunction(filename, False)``; the result is transposed
        if its first dimension is not 3.
    integrate, verbose :
        Passed to ``giveCpScaling``.
    normalize : bool
        If True, the contact curve is divided by its integral.
    exceptionList : list of Exceptions
        Exceptions raised while loading that are reported as "file not found"
        and skipped.
    nproc : int
        Passed to ``fmap`` as ``n``.
    cutoff : float
        Passed to ``giveCpScaling``.
    binstep : float
        Passed to ``logbins``.
    integerSlices : bool
        See ``multipliers``.

    Returns
    -------
    list of ``[means, {"slice": cur_slice}]``, one entry per slice.

    Notes
    -----
    Sets ``np.seterr(invalid='raise')`` for the whole process.
    """
    np.seterr(invalid='raise')

    plotsBySlice = []

    for cur_slice in slices:

        def slice2D(a, b, mult=[1]):
            tm = []
            if isinstance(b, tuple):
                tm = [(i, a) for i in range(b[0], b[1] + 1)]
            elif isinstance(b, int):
                tm = [(i, a) for i in range(1, b + 1)]
            elif isinstance(b, list):
                tm = [(i, a) for i in b]
            if integerSlices:
                tm2 = sorted(
                    {(i[0], int(float(i[1]) * m)) for i in tm for m in mult})
            else:
                tm2 = sorted(tm)
            print(tm2)
            return tm2

        def slice3D(a, b, c, mult=[1]):
            # Currently unused. Keeps the third coordinate so that the "DATA3"
            # placeholder can be filled in.
            tm = []
            for i in range(b[0], b[1] + 1):
                for t in range(c[0], c[1] + 1):
                    tm.append((i, a, t))
            tm2 = sorted(
                {(i[0], int(float(i[1]) * m), i[2]) for i in tm for m in mult})
            print(tm2)
            return tm2

        # slices actually are defined
        runs = slice2D(cur_slice, sliceParams, multipliers)
        # runs = slice3D(cur_slice, (1,14),(1,10),multipliers)

        files = []
        for run in runs:
            # placeholders in the template are replaced by the run values
            name = base.replace("DATA1", str(run[0])).replace(
                "DATA2", str(run[1]))
            if len(run) > 2:
                name = name.replace("DATA3", str(run[2]))
            files.append(name)

        def newload(i):
            # loads a file
            try:
                data = loadFunction(i, False)
                if len(data) != 3:
                    data = data.T
                if len(data) != 3:
                    raise Exception("Wrong shape of data")
                data = np.asarray(data, order="C", dtype=float)
                return data
            except tuple(exceptionList):
                print("file not found", i)
                return None

        # load a sample of at most ~20 files to determine the data length
        datas = [
            x for x in fmap(newload, files[::len(files) // 20 + 1], n=3)
            if x is not None
        ]
        datlen = len(datas[0][0])

        if mode == "chain":
            bins2 = logbins(4, datlen - 100, binstep)
        if mode == "parts":
            bins2 = logbins(4, datlen - 100, binstep)
        if (mode == "ring") or (mode == "intring"):
            b1 = logbins(2, datlen // 4 - 1, binstep)
            bins2 = [2 * i for i in b1]
            print(bins2)
        binsrg = logbins(4, datlen - 100, binstep)

        def give_plots(i):
            data = newload(i)
            if data is None:
                return None
            i = data

            if (mode == "ring") or (mode == "intring"):
                b = give_radius_scaling(i, binsrg, ring=True)
            else:
                b = give_radius_scaling(i, binsrg, ring=False)

            # NOTE(review): no branch below assigns ``a`` for mode "parts"
            if (mode == "chain"):
                a = giveCpScaling(i, bins2, cutoff, integrate, verbose=verbose)
            if (mode == "ring"):
                a = giveCpScaling(i,
                                  bins2,
                                  cutoff,
                                  integrate,
                                  ring=True,
                                  verbose=verbose)
            if (mode == "intring"):
                a = giveCpScaling(i,
                                  bins2,
                                  cutoff,
                                  integrate,
                                  ring=True,
                                  project=False,
                                  intContacts=True,
                                  verbose=verbose)
            if (mode == "project"):
                a = giveCpScaling(i,
                                  bins2,
                                  1.450,
                                  integrate,
                                  project=True,
                                  verbose=verbose)

            if (mode == "ring") or (mode == "intring"):
                c = give_distance(i, bins2, ring=True)
            else:
                c = give_distance(i, bins2, ring=False)

            if (normalize == True):
                a = np.array(a)
                pos = a[0]
                values = a[1]
                # bin edges: midpoints between positions, extrapolated on the
                # left, closed at the last position on the right
                bins = np.r_[1.5 * pos[0] - 0.5 * pos[1],
                             0.5 * (pos[1:] + pos[:-1]), pos[-1]]
                lens = bins[1:] - bins[:-1]
                ints = np.cumsum(lens * values)

                values /= ints[-1]
                ints /= ints[-1]
                a = [pos, values]

            a = np.array(a, dtype=float)
            b = np.array(b, dtype=float)
            c = np.array(c, dtype=float)

            return np.array([a, b, c])

        random.shuffle(files)

        parPlots = fmap(give_plots, files, n=nproc)

        parPlots = [x for x in parPlots if x is not None]

        means = np.mean(parPlots, axis=0)
        plotsBySlice.append([means, {"slice": cur_slice}])

    if tosave is not None:
        # context manager so the pickle file is flushed and closed
        with open(tosave, 'wb') as outFile:
            pickle.dump(plotsBySlice, outFile, -1)
    print("Finished!!!")
    return plotsBySlice
def averagePureContactMap(
        filenames,
        cutoff=1.7,
        n=4,  # Num threads
        loadFunction=load,
        exceptionsToIgnore=[],
        printProbability=0.005):
    """
    Return the sum of the contact maps of a set of conformations.

    Parameters
    ----------
    filenames : list of strings
        Filenames to average map over
    cutoff : float, optional
        Cutoff to calculate contacts
    n : int, optional
        Number of threads to use.
        By default 4 to minimize RAM consumption with pure maps.
    loadFunction : callable, optional
        Called as ``loadFunction(filename)``; the result is transposed when
        its first dimension is 3.
    exceptionsToIgnore : list of Exceptions
        List of exceptions to ignore when finding the contact map.
        Put IOError there if you want it to ignore missing files.
    printProbability : float, optional
        Probability with which each successfully loaded filename is printed.

    Returns
    -------

    An NxN (for pure map) numpy array with the contact map.

    Raises
    ------
    ValueError
        If no worker produced a contact map.
    """
    # The contact map is huge (maybe a gigabyte!), so we can't just add many
    # gigabyte-sized arrays together. Instead each worker creates one map,
    # loads its files one by one and adds the contacts of each file to that
    # map. Maps from different workers are then added together.

    n = min(n, len(filenames))
    subvalues = [filenames[i::n] for i in range(n)]

    def myaction(values):  # our worker receives some filenames
        mysum = None  # future contact map.
        for i in values:
            try:
                data = loadFunction(i)
                if np.random.random() < printProbability:
                    print(i)
            except tuple(exceptionsToIgnore):
                print("file not found", i)
                continue
            except Exception:
                # Exception (not a bare except) so KeyboardInterrupt and
                # SystemExit still propagate
                print("Unexpected error:", sys.exc_info()[0])
                print("File is: ", i)
                return -1  # failure marker, filtered out below

            if data.shape[0] == 3:
                data = data.T
            if mysum is None:  # if it's the first filename,

                if len(data) > 6000:
                    warnings.warn(
                        UserWarning(
                            'very large contact map'
                            ' may cause errors. these may be fixed with n=1 threads.'
                        ))
                if len(data) > 20000:
                    warnings.warn(
                        UserWarning('very large contact map'
                                    ' may be difficult to visualize.'))

                mysum = pureMap(data, cutoff)  # create a map

            else:  # if not
                pureMap(data, cutoff, mysum)
                # use existing map and fill in contacts

        return mysum

    results = fmap(myaction, subvalues)

    # A failed worker returns the int -1. It must not take part in the sum:
    # adding it would silently shift every entry of the map by -1.
    blocks = []
    failed = 0
    for block in results:
        if block is None:
            continue
        if isinstance(block, int) and block == -1:
            failed += 1
            continue
        blocks.append(block)
    if failed:
        warnings.warn(
            UserWarning('{0} worker(s) failed with an unexpected error;'
                        ' their files are not included in the contact map'
                        .format(failed)))
    if not blocks:
        raise ValueError("no valid files found in filenames")

    a = blocks[0]
    for i in blocks[1:]:
        a = a + i
    return a
def averageBinnedContactMap(
        filenames,
        chains=None,
        binSize=None,
        cutoff=1.7,
        n=4,  # Num threads
        loadFunction=load,
        exceptionsToIgnore=None,
        printProbability=1):
    """
    Returns an average contact map of a set of conformations.
    Non-existing files are ignored if exceptionsToIgnore is set to IOError.

    An example:

    .. code-block:: python
        >>> filenames = ["myfolder/blockd%d.dat" % i for i in range(1000)]
        >>> cmap, starts = averageBinnedContactMap(filenames)  #getting cmap
        #either showing a log of a map (+1 for zeros)
        >>> plt.imshow(numpy.log(cmap + 1))
        #or truncating a map
        >>> vmax = np.percentile(cmap, 99.9)
        >>> plt.imshow(cmap, vmax=vmax)
        >>> plt.show()

    Parameters
    ----------
    filenames : list of strings
        Filenames to average map over
    chains : list of tuples or Nx2 array
        (start,end+1) of each chain. Defaults to one chain covering the first
        loadable conformation.
    binSize : int
        size of each bin in monomers. Defaults to 1/500 of the length of the
        first loadable conformation, but at least 1.
    cutoff : float, optional
        Cutoff to calculate contacts
    n : int, optional
        Number of threads to use.
        By default 4 to minimize RAM consumption.
    loadFunction : callable, optional
        Called as ``loadFunction(filename)``; the result is transposed when
        its first dimension is 3.
    exceptionsToIgnore : list of Exceptions
        List of exceptions to ignore when finding the contact map.
        Put IOError there if you want it to ignore missing files.
        None (the default) ignores nothing.
    printProbability : float, optional
        Probability with which each successfully loaded filename is printed.

    Returns
    -------
    tuple of two values:
    (i) MxM numpy array with the contact map binned to binSize resolution.
    (ii) chromosomeStarts a list of start sites for binned map.

    Raises
    ------
    ValueError
        If no file in ``filenames`` can be loaded, or no worker produced a map.
    """
    n = min(n, len(filenames))
    subvalues = [filenames[i::n] for i in range(n)]

    # tuple(None) would raise TypeError inside the except clause of the worker
    ignored = tuple(exceptionsToIgnore or ())

    # Load the first readable file; it supplies the default chains / binSize.
    for candidate in filenames:
        try:
            data = loadFunction(candidate)  # load filename
        except Exception:
            continue
        break
    else:
        print("no valid files found in filenames")
        raise ValueError("no valid files found in filenames")

    # same orientation as in the worker below, so len(data) is the number of
    # monomers
    if data.shape[0] == 3:
        data = data.T

    if chains is None:
        chains = [[0, len(data)]]
    if binSize is None:
        # at least 1: a zero bin size would divide by zero below
        binSize = max(1, int(np.floor(len(data) / 500)))

    bins = []
    chains = np.asarray(chains)
    chainBinNums = (np.ceil((chains[:, 1] - chains[:, 0]) / (0.0 + binSize)))
    for i in range(len(chainBinNums)):
        bins.append(binSize * (np.arange(int(chainBinNums[i]))) + chains[i, 0])
    bins.append(np.array([chains[-1, 1] + 1]))
    bins = np.concatenate(bins)
    bins = bins - .5
    Nbase = len(bins) - 1

    if Nbase > 10000:
        warnings.warn(
            UserWarning('very large contact map'
                        ' may be difficult to visualize'))

    chromosomeStarts = np.cumsum(chainBinNums)
    chromosomeStarts = np.hstack((0, chromosomeStarts))

    def myaction(values):  # our worker receives some filenames
        mysum = None  # future contact map.
        for i in values:
            try:
                data = loadFunction(i)
                if np.random.random() < printProbability:
                    print(i)
            except ignored:
                print("file not found", i)
                continue

            if data.shape[0] == 3:
                data = data.T
            if mysum is None:  # if it's the first filename,

                mysum = rescaledMap(data, bins, cutoff)  # create a map

            else:  # if not
                rescaledMap(data, bins, cutoff, mysum)
                # use existing map and fill in contacts

        return mysum

    blocks = fmap(myaction, subvalues)
    blocks = [i for i in blocks if i is not None]
    if not blocks:
        raise ValueError("no valid files found in filenames")
    a = blocks[0]
    for i in blocks[1:]:
        a = a + i
    a = a + a.T

    return a, chromosomeStarts
Exemple #8
0
def makeMovie(fileList, imgFolder, fps=20, aviFilename='output.avi'):
    """
    Render one PyMOL image per conformation and encode them with ``_mencoder``.

    Parameters
    ----------
    fileList : sequence of str
        Paths of the conformations, one per frame. Nothing is done (and None
        is returned) when it is empty. A pickle file named "coreParticles" is
        read from the folder of each conformation.
    imgFolder : str
        Folder the numbered png files are written to.
    fps : int
        Passed to ``_mencoder``.
    aviFilename : str
        Passed to ``_mencoder``.
    """
    offset = 2

    if not fileList:
        return
    numFrames = len(fileList)
    # zero-padding width of the frame number in the image file names
    numDigits = int(np.ceil(np.log10(numFrames)))

    def smallFunction(x):
        frameIndex, dataPath = x

        savePath = imgFolder + '/{0:0{width}}.png'.format(frameIndex,
                                                          width=numDigits)
        corePath = os.path.join(os.path.split(dataPath)[0], "coreParticles")
        # binary mode and a context manager: pickles are binary data and the
        # handle must not be leaked
        with open(corePath, 'rb') as coreFile:
            coreParticles = cPickle.load(coreFile)
        data = load(dataPath)

        # 1 marks a window of +-offset around each core particle; 2 marks the
        # stretch up to the next core particle for every 20th particle
        colorArray = np.zeros(len(data), int)
        for j, start in enumerate(coreParticles):
            if (j < len(coreParticles) - 1) and (j % 20 == 1):
                colorArray[max(start, 0):min(coreParticles[j + 1] -
                                             1, len(data))] = 2

            colorArray[max(start - offset, 0):min(start + offset +
                                                  1, len(data))] = 1

        regions1 = pymol_show.createRegions(colorArray == 1)
        print(len(regions1))

        regions2 = pymol_show.createRegions(colorArray == 2)
        M = len(regions2)
        print(M)
        allColors = ["br{0}".format(i) for i in range(10)]
        S = len(allColors)
        # spread the regions evenly over the available colors
        colors2 = [
            allColors[int(float(i) * float(S) / float(M))] for i in range(M)
        ]
        transparencies2 = [0 for i in colors2]

        pymol_show.do_coloring(
            data,
            list(regions2),
            list(colors2),
            list(transparencies2),
            chainRadius=.25,
            subchainRadius=.35,
            chainTransparency=0.9,
            #returnScriptName="mov",
            showChain="worm",
            pdbGroups=colorArray,
            showGui=True,
            #saveTo=savePath,
            multiplier=.8,
            support="""
                               create back, chain 1
                               set cartoon_transparency,0.000000,back
                               set cartoon_trace_atoms,1, back
                               set cartoon_tube_radius,0.280000, back
                               cartoon tube, back
                               color brown, back
                               set depth_cue, 0
                               set field_of_view, 10
                            set_view (\
                                 0.362947434,   -0.752908945,    0.548998356,\
                                -0.843832374,   -0.015661031,    0.536378920,\
                                -0.395252228,   -0.657936990,   -0.641013563,\
                                -0.000659317,    0.000253409, -788.783874512,\
                                81.561027527,   81.701515198,  121.653610229,\
                               615.844299316,  961.749450684,  -10.000000000 )
                                png {savepath}
                                quit


                               """.format(savepath=savePath))

    fmap(smallFunction, enumerate(fileList), n=8)
    _mencoder(imgFolder, fps, aviFilename)