# Example 2
)  # a prefix preceding .fastq.gz, which will be used to distinguish side 1 and side 2
genomeName = "hg19"
#genomeName = "/home/ubuntu/data3/HiCdata/DM_DpnII_Oct18/dm3"
threads = 8
bowtieIndex = "/home/ubuntu/rDNA_data/index/hg19/hg19mannual"
#bowtiePath = "/usr/bin/bowtie2"
bowtiePath = "/home/ubuntu/tools/miniconda2/envs/main/bin/bowtie2"
bowtieFlags = "--very-sensitive"

#bowtieFlags = "-D 20 -R 3 -N 0 -L 7 -i S,1,0.10"

bowtieFlags = "--very-sensitive --n-ceil L,0,0.1"

seqSkipStart = 0  # number of bp to skip at the start of each read (e.g. set to 2 to trim the first 2 bp)
#minMapLen = 25  # start mapping at this length
genome_db = getGenome(genomeName)

if not os.path.exists(bowtiePath):
    raise ValueError("bowtie2 not found at {0}".format(bowtiePath))

if mode == "sra":
    iterList = 2 * GEOids
elif mode == "fastq":
    iterList = sorted(os.listdir(inFastqDir))

print iterList

for i in iterList:
    if mode == "sra":
        sraNum = i
        expName = "SRR{0}".format(i)
        i = expName
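
    # Hedged sketch of the likely next step (the original loop body is
    # truncated here): map both read sides with hiclib's iterative mapping.
    # Assumes `from hiclib import mapping` was imported above; the "sams"
    # output directory, the "_side1"/"_side2" fastq naming, and the
    # min_seq_len / len_step values are illustrative, not from this snippet.
    for side in [1, 2]:
        mapping.iterative_mapping(
            bowtie_path=bowtiePath,
            bowtie_index_path=bowtieIndex,
            fastq_path=os.path.join(inFastqDir, "{0}_side{1}.fastq.gz".format(i, side)),
            out_sam_path=os.path.join("sams", "{0}_side{1}.bam".format(i, side)),
            min_seq_len=25,      # map 25 bp truncations first
            len_step=5,          # then extend reads by 5 bp per iteration
            seq_start=seqSkipStart,
            nthreads=threads,
            bowtie_flags=bowtieFlags,
        )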
# Example 3
def refineDataset(filenames, create=True, delete=False, parseInMemory=True):
    """
    Parameters
    ----------

    filenames[0] is a list of filenames of incoming files
    filenames[1] is a prefix for the outgoing (merged/refined) files
    filenames[2] is the working genome, which is also the output directory
    filenames[3] is the enzyme used for the given experiment
    (see the hedged usage sketch after this function)

    create : bool, optional
        If True, parse each file.
        If False, assume that files were already parsed
        (e.g. if you are just playing around with filtering parameters).
    delete : bool, optional
        If True, delete parsed files after merging.
        These files may be huge; if you don't have a 10 TB RAID, this may be useful.
    parseInMemory : bool, optional
        If True, parse input files in memory rather than on disk.

    """
    in_files = filenames[0]
    out_file = filenames[1]

    statFolder = os.path.join("statistics", out_file)

    workingGenome = filenames[2]
    enzyme = filenames[3]

    if create:  # if we need to parse the input files (.hdf5 from mapping)

        def parse_onename(onename):
            np.random.seed()
            #Parsing individual files
            if parseInMemory:
                finalname = onename + "_parsed.frag"
                #if not os.path.exists(finalname):
                if True:  # force re-parsing; restore the check above to skip already-parsed files

                    #create dataset in memory, parse and then save to destination
                    TR = HiCdataset(
                        "bla" + str(np.random.randint(100000000000)),  # throwaway in-memory name
                        genome=getGenome(workingGenome),
                        maximumMoleculeLength=500,
                        enzymeName=enzyme,
                        tmpFolder="tmp",
                        inMemory=True
                    )  # remove inMemory if you don't have enough RAM

                    TR.parseInputData(dictLike=onename)
                    print(onename)
                    TR.save(ensure(finalname))
                    folder, fname = os.path.split(onename)
                    statSubFolder = os.path.join(statFolder, folder)

                    TR.printMetadata(saveTo=ensure(
                        os.path.join(statSubFolder, fname + ".stat")))
                else:
                    print("skipping parsed: ", onename)
            else:
                #Create dataset at destination, parse on HDD, then no need to save.
                TR = HiCdataset(ensure(onename + "_parsed.frag"),
                                genome=getGenome(workingGenome),
                                enzymeName=enzyme,
                                tmpFolder="tmp",
                                maximumMoleculeLength=500,
                                mode='w')
                TR.parseInputData(dictLike=onename, enzymeToFillRsites=enzyme)
                TR.printMetadata(
                    saveTo=ensure(os.path.join(statFolder, onename + ".stat")))

        list(map(parse_onename, in_files))
        "Merging files alltogether, applying filters"
        TR = HiCdataset(ensure(out_file + "_merged.frag"),
                        genome=getGenome(workingGenome),
                        enzymeName=enzyme,
                        tmpFolder="tmp",
                        dictToStoreIDs="h5dict",
                        mode="w")
        TR.merge([i + "_parsed.frag" for i in in_files])
        #Merge in all parsed files from one experiment

        if delete:  # cleaning up parsed files
            for delFile in [i + "_parsed.frag" for i in in_files]:
                os.remove(delFile)

        "Now opening new dataset for refined data, and performing all the filtering "
        TR = HiCdataset(out_file + "_refined.frag",
                        enzymeName=enzyme,
                        genome=getGenome(workingGenome),
                        tmpFolder="tmp",
                        dictToStoreIDs="h5dict",
                        mode='w')
        TR.load(out_file + "_merged.frag")

        #----------------------------Set of filters applied -------------
        TR.filterDuplicates()
        #TR.save(out_file+".dat")
        #TR.filterExtreme(cutH=0.0001, cutL=0)
        #TR.filterRsiteStart()
        #TR.filterLarge()
        TR.writeFilteringStats()
        TR.printMetadata(saveTo=statFolder + ".stat")

        #------------------------End set of filters applied----------

    else:
        #If merging & filters has already been done, just load files
        TR = HiCdataset(out_file + "_working.frag",
                        enzymeName=enzyme,
                        mode='w',
                        genome=getGenome(workingGenome))
        TR.load(out_file + "_refined.frag")
        TR.printMetadata(saveTo=statFolder + ".stat")

    print("----->Building Raw heatmap at different resolutions")
    TR.printStats()
    for res in coolerResolutions:
        TR.saveCooler(out_file + ".{0}.cool".format(res), res)
    pass
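
# Hedged usage sketch for refineDataset; the input names below are
# hypothetical placeholders, not files produced by this snippet:
#
# refineDataset(
#     [
#         ["hg19/mapped/SRR0000001", "hg19/mapped/SRR0000002"],  # parsed .hdf5 inputs from mapping
#         "hg19/myExperiment",  # prefix for the _merged/_refined output files
#         "hg19",               # working genome (also the output directory)
#         "DpnII",              # enzyme used for this experiment
#     ],
#     create=True, delete=False, parseInMemory=True)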


#Now merging different experiments all together
#note that element i[1] (the per-replica name) is excluded, so replicas of the
#same experiment collapse into one key
experiments = set((i[0], i[2], i[3]) for i in combinedExperimentNames)
print(experiments)

for experiment in experiments:
    workingGenome = experiment[1]
    myExperimentNames = [i[1] + "_refined.frag" for i in combinedExperimentNames
                         if (i[0], i[2], i[3]) == experiment]
    assert len(myExperimentNames) > 0
    #Merge all replicas of the same experiment into one dataset.
    TR = HiCdataset(os.path.join(workingGenome, "%s-all-%s_refined.frag" %
                                 (experiment[0], experiment[2])),
                    genome=getGenome(workingGenome),
                    enzymeName=experiment[2],
                    tmpFolder="tmp",
                    dictToStoreIDs="h5dict")
    statSaveName = os.path.join("statistics", workingGenome,
                                "%s-all-%s_refined.stat" % (experiment[0], experiment[2]))

    TR.merge(myExperimentNames)
    TR.printMetadata(saveTo=statSaveName)
    for res in wholeGenomeResolutionsKb:
        TR.saveHeatmap(os.path.join(workingGenome, "%s-all-%s-{0}k.hm" %
                                    (experiment[0], experiment[2])).format(res), res * 1000)
    for res in byChromosomeResolutionsKb:
        TR.saveByChromosomeHeatmap(os.path.join(workingGenome, "%s-all-%s-{0}k.byChr" %
                                                (experiment[0], experiment[2])).format(res), res * 1000)
    for res in HiResWithOverlapResolutionsKb:
        TR.saveHiResHeatmapWithOverlaps(os.path.join(workingGenome, "%s-all-%s-{0}k_HighRes.byChr" %
                                                     (experiment[0], experiment[2])).format(res), res * 1000)
    for res in SuperHiResWithOverlapResolutionsKb:
        TR.saveSuperHighResMapWithOverlaps(os.path.join(workingGenome, "%s-all-%s-{0}k_SuperHighRes.byChr" %
                                                        (experiment[0], experiment[2])).format(res), res * 1000)
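
# Hedged sketch: the per-resolution .cool files written by refineDataset above
# can be inspected with the cooler package (the path below is illustrative,
# following the "{out_file}.{res}.cool" naming used earlier):
#
# import cooler
# c = cooler.Cooler("hg19/myExperiment.1000000.cool")
# mat = c.matrix(balance=False)[:]  # whole-genome raw contact matrix
# print(c.chromnames, mat.shape)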

# Example 5
        except OSError:
            raise ValueError("Cannot create directory")
    return f


path = "/home/ubuntu/RNAdata/Golov/hic/mapped-hg19/35_AGAGGCCT_reads_merged_len_adj/"
#f1 = h5py.File(path + "chunk0001.hdf5", 'r+')  # superseded by the h5dict open below
f1 = mirnylib.h5dict.h5dict(path + "chunk0001.hdf5", 'r+')


print f1["misc"]["genome"]["idx2label"]
chrm_conversion_table = f1["misc"]["genome"]["idx2label"]


genome_db = getGenome("hg19")
workingGenome = "hg19"

for key in genome_db.idx2label:
    print(key, genome_db.idx2label[key], end=" ")
print()

print("Keys: %s" % f1.keys())

# NOTE: positional access below assumes the keys come back in the standard
# hiclib chunk order (chrms1, chrms2, cuts1, cuts2, misc); accessing the keys
# by name would be more robust.
keys = list(f1.keys())

chrms1_key = keys[0]
chrms2_key = keys[1]

cuts1_key = keys[2]
cuts2_key = keys[3]

misc_key = keys[4]
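
# Hedged follow-up sketch: pull the read-pair arrays out of the chunk and
# print a few pairs with chromosome labels. Assumes the standard hiclib chunk
# layout (chrms* hold chromosome indices, cuts* hold cut-site positions).
chrms1 = f1[chrms1_key]
chrms2 = f1[chrms2_key]
cuts1 = f1[cuts1_key]
cuts2 = f1[cuts2_key]
for n in range(5):  # first five read pairs
    print(genome_db.idx2label[int(chrms1[n])], int(cuts1[n]),
          genome_db.idx2label[int(chrms2[n])], int(cuts2[n]))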