def singularity_command(kipoi_cmd, model, dataloader_kwargs, output_files=None, source='kipoi', dry_run=False):
    """Run a kipoi command inside the Singularity container of `model`.

    The container image is looked up with `container_remote_url`, fetched
    with `download_url` and the command is then handed to
    `singularity_exec`. If no container is registered for the model, a
    warning is logged and nothing is executed.

    Args:
        kipoi_cmd: command as a list of strings; the first element has to
            be 'kipoi'.
        model: model name used to look up the container.
        dataloader_kwargs: dataloader arguments; forwarded to
            `involved_directories` to work out which directories to bind.
        output_files: output files whose directories are bound as well.
            Defaults to no output files.
        source: model source. Only 'kipoi' is supported.
        dry_run: forwarded to `singularity_exec`.

    Raises:
        NotImplementedError: if `source` is not 'kipoi'.
        AssertionError: if a container is available and `kipoi_cmd` does
            not start with 'kipoi'.
    """
    if source != 'kipoi':
        raise NotImplementedError("Containers for sources other than Kipoi are not yet implemented")

    # A fresh list per call: a shared `[]` default would leak state between
    # calls if any callee ever mutated it.
    if output_files is None:
        output_files = []

    singularity_container_dict = container_remote_url(model, source)
    if not singularity_container_dict:
        logger.warning(f"Singularity container for {model} either is not available yet or {model} is not in Kipoi.")
        return

    # Validate the command before downloading, so a malformed command does
    # not cost a (potentially large) container download first.
    assert kipoi_cmd[0] == 'kipoi'

    remote_path = singularity_container_dict['url']
    container_name = singularity_container_dict['name']
    local_path = container_local_path(remote_path, container_name)

    from kipoi_utils.external.torchvision.dataset_utils import download_url
    download_url(url=remote_path,
                 root=local_path,
                 filename=f"{container_name}.sif",
                 md5=singularity_container_dict['md5'])

    # remove all spaces, newlines and tabs within each command element
    strip_table = str.maketrans('', '', ' \n\t')
    kipoi_cmd = [x.translate(strip_table) for x in kipoi_cmd]

    singularity_exec(f"{local_path}/{container_name}.sif",
                     kipoi_cmd,
                     bind_directories=involved_directories(dataloader_kwargs,
                                                           output_files,
                                                           exclude_dirs=['/tmp', '~']),
                     dry_run=dry_run)
def test_download_url_valid_link(tmp_path, capfd):
    """A reachable URL is downloaded without any retry message being printed."""
    target_name = 'downloaded.fa'
    download_url(
        "https://zenodo.org/record/1466088/files/example_files-hg19.chr22.fa?download=1",
        tmp_path,
        target_name,
        '936544855b253835442a0f253dd4b083',
    )
    captured_out, _captured_err = capfd.readouterr()

    # None of the retry delays may show up on stdout.
    retry_notices = [
        "Waiting " + delay + " seconds before retrying"
        for delay in ('0.1', '0.2', '0.4', '0.8', '1.6', '3.2')
    ]
    for notice in retry_notices:
        assert notice not in captured_out

    assert (tmp_path / target_name).exists()
def get_file(self, path):
    """Download `self.url` to `path` and return the absolute file path.

    The file is written into the (absolute) directory of `path` under the
    base name of `path`. `self.md5`, when truthy, is passed to
    `download_url` as the expected hash; otherwise `None` is passed.
    """
    # Any falsy md5 (None, empty string) is handed over as None.
    expected_md5 = self.md5 or None

    target_dir = os.path.abspath(os.path.dirname(path))
    target_name = os.path.basename(path)

    download_url(self.url, target_dir, target_name, expected_md5)
    return os.path.join(target_dir, target_name)
def test_download_url_retry(tmp_path, capfd):
    """An unreachable URL is retried with every delay and finally raises."""
    with pytest.raises(Exception) as raised:
        download_url("http://invalid.url", tmp_path, 'downloaded.h5')

    captured_out, _captured_err = capfd.readouterr()

    # Each retry delay has to be announced on stdout.
    retry_notices = [
        "Waiting " + delay + " seconds before retrying"
        for delay in ('0.1', '0.2', '0.4', '0.8', '1.6', '3.2')
    ]
    for notice in retry_notices:
        assert notice in captured_out

    message = str(raised.value)
    assert 'Can not download http://invalid.url' in message

    # Two accepted wordings of the name-resolution failure.
    resolver_errors = (
        'Name or service not known',
        'nodename nor servname provided, or not known',
    )
    assert any(text in message for text in resolver_errors)
def __init__(self, intervals_file, fasta_file, dnase_file,
             cell_line=None, RNAseq_PC_file=None,
             mappability_file=None, GENCODE_dir=None,
             use_linecache=True):
    """Set up interval, annotation and metadata inputs.

    Args:
        intervals_file: bed file opened with `BedToolLinecache` or `BedTool`.
        fasta_file: stored; the extractor is created later.
        dnase_file: stored; the extractor is created later.
        cell_line: if given, `meta.txt` for this cell line is used
            (downloaded when the local copy is missing).
        RNAseq_PC_file: tab-separated file read when `cell_line` is None.
        mappability_file: if None, the default bigWig is downloaded.
        GENCODE_dir: directory handed to `download_gencode_dir`; a default
            directory relative to `this_dir` is used when None.
        use_linecache: if True, clear the linecache and use
            `BedToolLinecache` instead of `BedTool`.

    Raises:
        ValueError: if both `cell_line` and `RNAseq_PC_file` are None.
    """
    # --- intervals ---
    if use_linecache:
        linecache.clearcache()
        bed_cls = BedToolLinecache
    else:
        bed_cls = BedTool
    self.bt = bed_cls(intervals_file)

    # --- fasta / DNase: extractors are initialized later ---
    self.fasta_file = fasta_file
    self.fasta_extractor = None
    self.dnase_file = dnase_file
    self.dnase_extractor = None

    # --- mappability ---
    if mappability_file is None:
        # fall back to the shared download directory
        download_dir = os.path.join(this_dir, "../../template/downloaded/dataloader_files")
        makedir_exist_ok(download_dir)
        remote = RemoteFile(
            url="http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeMapability/wgEncodeDukeMapabilityUniqueness35bp.bigWig",
            md5="1d15ddafe2c8df51cf08495db96679e7",
        )
        mappability_file = os.path.join(download_dir, "wgEncodeDukeMapabilityUniqueness35bp.bigWig")
        # (re-)download unless a local copy exists and validates
        if not (os.path.exists(mappability_file) and remote.validate(mappability_file)):
            remote.get_file(mappability_file)
    self.mappability_file = mappability_file
    self.mappability_extractor = None

    # --- Gencode features ---
    gencode_path = (
        os.path.join(this_dir, "../../template/downloaded/dataloader_files/gencode_features/")
        if GENCODE_dir is None
        else GENCODE_dir
    )
    download_gencode_dir(gencode_path)  # download files
    gencode_sources = (
        ("cpg", '/cpgisland.bed.gz'),
        ("cds", '/wgEncodeGencodeBasicV19.cds.merged.bed.gz'),
        ("intron", '/wgEncodeGencodeBasicV19.intron.merged.bed.gz'),
        ("promoter", '/wgEncodeGencodeBasicV19.promoter.merged.bed.gz'),
        ("utr5", '/wgEncodeGencodeBasicV19.utr5.merged.bed.gz'),
        ("utr3", '/wgEncodeGencodeBasicV19.utr3.merged.bed.gz'),
    )
    self.gencode_beds = [
        (label, BedTool(gencode_path + suffix)) for label, suffix in gencode_sources
    ]

    # --- overlaps: could be done incrementally ---
    print("Overlapping all the bed-files")
    overlaps = []
    for label, feature_bed in self.gencode_beds:
        hits = self.bt.intersect(feature_bed, wa=True, c=True)
        # Re-open via its file name in order to leverage BedToolLinecache
        overlaps.append((label, bed_cls(hits.fn)))
    self.overlap_beds = overlaps
    print("Assesing the file")
    assert len(self.overlap_beds[1][1]) == len(self.bt)

    # --- metadata features ---
    if cell_line is not None:
        # Using the pre-defined cell-line
        features_dir = os.path.join(this_dir, "../../template/downloaded/dataloader_files/RNAseq_features/")
        makedir_exist_ok(features_dir)
        cell_line_dir = os.path.join(features_dir, cell_line)
        RNAseq_PC_file = os.path.join(cell_line_dir, "meta.txt")
        url_template = ('https://s3.eu-central-1.amazonaws.com/kipoi-models/dataloader_files/'
                        'FactorNet/dataloader_files/RNAseq_features/{}/meta.txt')
        if not os.path.exists(RNAseq_PC_file):
            # fetch the file only when no local copy exists
            download_url(url_template.format(cell_line), cell_line_dir, "meta.txt")
    else:
        if RNAseq_PC_file is None:
            raise ValueError("RNAseq_PC_file has to be specified when cell_line=None")
        assert os.path.exists(RNAseq_PC_file)

    # first column of the tab-separated metadata file
    meta_table = pd.read_csv(RNAseq_PC_file, sep="\t", header=None)
    self.meta_feat = meta_table[0].values