def __init__(self, intervals_file, fasta_file, dnase_file, cell_line=None,
             RNAseq_PC_file=None, mappability_file=None, use_linecache=True):
    """Store the input paths and load the per-cell-line metadata features.

    Args:
        intervals_file: handed to ``BedToolLinecache`` (or ``BedTool``) to
            build ``self.bt``.
        fasta_file: stored on the instance; ``self.fasta_extractor`` is left
            as ``None`` here and is meant to be initialized later.
        dnase_file: stored on the instance; ``self.dnase_extractor`` starts
            as ``None``.
        cell_line: name of a pre-defined cell line. When given, the matching
            ``meta.txt`` is downloaded (if it does not exist yet) and any
            ``RNAseq_PC_file`` argument is overridden.
        RNAseq_PC_file: tab-separated file without header; its first column
            becomes ``self.meta_feat``. Required when ``cell_line`` is None.
        mappability_file: bigWig path. When None, the
            ``wgEncodeDukeMapabilityUniqueness35bp.bigWig`` file is fetched
            into the shared download directory next to ``this_dir``.
        use_linecache: when true, the ``linecache`` cache is cleared and
            ``BedToolLinecache`` is used instead of ``BedTool``.

    Raises:
        ValueError: if both ``cell_line`` and ``RNAseq_PC_file`` are None.
        AssertionError: if ``cell_line`` is None and ``RNAseq_PC_file`` does
            not exist (only while assertions are enabled).
    """
    # --- intervals ---------------------------------------------------------
    if use_linecache:
        linecache.clearcache()
    bed_reader = BedToolLinecache if use_linecache else BedTool
    self.bt = bed_reader(intervals_file)

    # --- fasta / DNase: only remember the paths, extractors come later ------
    self.fasta_file = fasta_file
    self.fasta_extractor = None
    self.dnase_file = dnase_file
    self.dnase_extractor = None

    # --- mappability -------------------------------------------------------
    if mappability_file is None:
        # No file supplied: fall back to the shared download location.
        shared_dir = os.path.join(this_dir, "../../template/downloaded/dataloader_files")
        makedir_exist_ok(shared_dir)
        remote = RemoteFile(
            url="http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeMapability/wgEncodeDukeMapabilityUniqueness35bp.bigWig",
            md5="1d15ddafe2c8df51cf08495db96679e7",
        )
        mappability_file = os.path.join(shared_dir, "wgEncodeDukeMapabilityUniqueness35bp.bigWig")
        # Fetch when the file is missing or fails validation; validate() is
        # only consulted when the file exists.
        usable_copy = os.path.exists(mappability_file) and remote.validate(mappability_file)
        if not usable_copy:
            remote.get_file(mappability_file)
    self.mappability_file = mappability_file
    self.mappability_extractor = None

    # --- metadata features -------------------------------------------------
    if cell_line is not None:
        # Pre-defined cell line: locate (and if needed download) its meta.txt.
        rnaseq_root = os.path.join(this_dir, "../../template/downloaded/dataloader_files/RNAseq_features/")
        makedir_exist_ok(rnaseq_root)
        RNAseq_PC_file = os.path.join(rnaseq_root, cell_line, "meta.txt")
        meta_url = ('https://s3.eu-central-1.amazonaws.com/kipoi-models/dataloader_files/'
                    'FactorNet/dataloader_files/RNAseq_features/{}/meta.txt')
        # Only existence is checked here; no checksum is passed along.
        if not os.path.exists(RNAseq_PC_file):
            download_url(meta_url.format(cell_line),
                         os.path.join(rnaseq_root, cell_line),
                         "meta.txt")
    elif RNAseq_PC_file is None:
        raise ValueError("RNAseq_PC_file has to be specified when cell_line=None")
    else:
        # NOTE: this check disappears when Python runs with -O.
        assert os.path.exists(RNAseq_PC_file)

    meta_table = pd.read_csv(RNAseq_PC_file, sep="\t", header=None)
    self.meta_feat = meta_table[0].values
def get_file(self, path):
    """Download ``self.url`` to ``path`` and return the resulting file path.

    Args:
        path: destination file path; its directory part is made absolute
            and its last component is used as the file name.

    Returns:
        The absolute directory of ``path`` joined with its file name.
    """
    # A falsy md5 (None, empty string) is forwarded as None.
    checksum = self.md5 if self.md5 else None

    folder, filename = os.path.split(path)
    folder = os.path.abspath(folder)

    # presumably download_url verifies the file against `checksum` when it
    # is not None -- confirm against its implementation
    download_url(self.url, folder, filename, checksum)
    return os.path.join(folder, filename)
def get_file(self, path):
    """Download ``self.url`` to ``path`` and return the resulting file path.

    Args:
        path: destination file path; its directory part is made absolute
            and its last component is used as the file name.

    Returns:
        The absolute directory of ``path`` joined with its file name.
    """
    # Imported at call time rather than at module level.
    from kipoi.external.torchvision.dataset_utils import download_url

    # A falsy md5 (None, empty string) is forwarded as None.
    checksum = self.md5 if self.md5 else None

    folder, filename = os.path.split(path)
    folder = os.path.abspath(folder)

    # presumably download_url verifies the file against `checksum` when it
    # is not None -- confirm against its implementation
    download_url(self.url, folder, filename, checksum)
    return os.path.join(folder, filename)
def __init__(self, intervals_file, fasta_file, dnase_file, cell_line=None,
             RNAseq_PC_file=None, mappability_file=None, GENCODE_dir=None,
             use_linecache=True):
    """Store the input paths, overlap the intervals with the GENCODE
    annotation beds and load the per-cell-line metadata features.

    Args:
        intervals_file: handed to ``BedToolLinecache`` (or ``BedTool``) to
            build ``self.bt``.
        fasta_file: stored on the instance; ``self.fasta_extractor`` is left
            as ``None`` here and is meant to be initialized later.
        dnase_file: stored on the instance; ``self.dnase_extractor`` starts
            as ``None``.
        cell_line: name of a pre-defined cell line. When given, the matching
            ``meta.txt`` is downloaded (if it does not exist yet) and any
            ``RNAseq_PC_file`` argument is overridden.
        RNAseq_PC_file: tab-separated file without header; its first column
            becomes ``self.meta_feat``. Required when ``cell_line`` is None.
        mappability_file: bigWig path. When None, the
            ``wgEncodeDukeMapabilityUniqueness35bp.bigWig`` file is fetched
            into the shared download directory next to ``this_dir``.
        GENCODE_dir: directory holding the GENCODE/CpG bed files. When None,
            the ``gencode_features`` download directory next to ``this_dir``
            is used. ``download_gencode_dir`` is called on it either way.
        use_linecache: when true, the ``linecache`` cache is cleared and
            ``BedToolLinecache`` is used instead of ``BedTool``.

    Raises:
        ValueError: if both ``cell_line`` and ``RNAseq_PC_file`` are None.
        AssertionError: if the overlap sanity check fails, or if
            ``cell_line`` is None and ``RNAseq_PC_file`` does not exist
            (only while assertions are enabled).
    """
    # --- intervals ---------------------------------------------------------
    if use_linecache:
        linecache.clearcache()
    bed_reader = BedToolLinecache if use_linecache else BedTool
    self.bt = bed_reader(intervals_file)

    # --- fasta / DNase: only remember the paths, extractors come later ------
    self.fasta_file = fasta_file
    self.fasta_extractor = None
    self.dnase_file = dnase_file
    self.dnase_extractor = None

    # --- mappability -------------------------------------------------------
    if mappability_file is None:
        # No file supplied: fall back to the shared download location.
        shared_dir = os.path.join(this_dir, "../../template/downloaded/dataloader_files")
        makedir_exist_ok(shared_dir)
        remote = RemoteFile(
            url="http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeMapability/wgEncodeDukeMapabilityUniqueness35bp.bigWig",
            md5="1d15ddafe2c8df51cf08495db96679e7",
        )
        mappability_file = os.path.join(shared_dir, "wgEncodeDukeMapabilityUniqueness35bp.bigWig")
        # Fetch when the file is missing or fails validation; validate() is
        # only consulted when the file exists.
        usable_copy = os.path.exists(mappability_file) and remote.validate(mappability_file)
        if not usable_copy:
            remote.get_file(mappability_file)
    self.mappability_file = mappability_file
    self.mappability_extractor = None

    # --- GENCODE features --------------------------------------------------
    gencode_dir = (
        os.path.join(this_dir, "../../template/downloaded/dataloader_files/gencode_features/")
        if GENCODE_dir is None
        else GENCODE_dir
    )
    download_gencode_dir(gencode_dir)  # download files

    # (feature label, file suffix appended to the directory), in fixed order
    gencode_sources = (
        ("cpg", '/cpgisland.bed.gz'),
        ("cds", '/wgEncodeGencodeBasicV19.cds.merged.bed.gz'),
        ("intron", '/wgEncodeGencodeBasicV19.intron.merged.bed.gz'),
        ("promoter", '/wgEncodeGencodeBasicV19.promoter.merged.bed.gz'),
        ("utr5", '/wgEncodeGencodeBasicV19.utr5.merged.bed.gz'),
        ("utr3", '/wgEncodeGencodeBasicV19.utr3.merged.bed.gz'),
    )
    self.gencode_beds = [(label, BedTool(gencode_dir + suffix))
                         for label, suffix in gencode_sources]

    # Overlap every annotation with the intervals (could be done incrementally).
    print("Overlapping all the bed-files")
    overlaps = []
    for label, annotation in self.gencode_beds:
        counted = self.bt.intersect(annotation, wa=True, c=True)
        # Re-open the result file through bed_reader so that
        # BedToolLinecache is leveraged when it was selected above.
        overlaps.append((label, bed_reader(counted.fn)))
    self.overlap_beds = overlaps

    print("Assesing the file")
    # Sanity check on the second entry ("cds") only: the overlap result must
    # have as many records as the input intervals. Stripped under -O.
    assert len(self.overlap_beds[1][1]) == len(self.bt)

    # --- metadata features -------------------------------------------------
    if cell_line is not None:
        # Pre-defined cell line: locate (and if needed download) its meta.txt.
        rnaseq_root = os.path.join(this_dir, "../../template/downloaded/dataloader_files/RNAseq_features/")
        makedir_exist_ok(rnaseq_root)
        RNAseq_PC_file = os.path.join(rnaseq_root, cell_line, "meta.txt")
        meta_url = ('https://s3.eu-central-1.amazonaws.com/kipoi-models/dataloader_files/'
                    'FactorNet/dataloader_files/RNAseq_features/{}/meta.txt')
        # Only existence is checked here; no checksum is passed along.
        if not os.path.exists(RNAseq_PC_file):
            download_url(meta_url.format(cell_line),
                         os.path.join(rnaseq_root, cell_line),
                         "meta.txt")
    elif RNAseq_PC_file is None:
        raise ValueError("RNAseq_PC_file has to be specified when cell_line=None")
    else:
        # NOTE: this check disappears when Python runs with -O.
        assert os.path.exists(RNAseq_PC_file)

    meta_table = pd.read_csv(RNAseq_PC_file, sep="\t", header=None)
    self.meta_feat = meta_table[0].values