Example #1
def singularity_command(kipoi_cmd, model, dataloader_kwargs, output_files=[], source='kipoi', dry_run=False):
    if source != 'kipoi':
        raise NotImplementedError("Containers for sources other than Kipoi are not yet implemented")
    singularity_container_dict = container_remote_url(model, source)
    if singularity_container_dict:
        remote_path = singularity_container_dict['url']
        container_name = singularity_container_dict['name']
        local_path = container_local_path(remote_path, container_name)
        from kipoi_utils.external.torchvision.dataset_utils import download_url
        download_url(url=remote_path, root=local_path, filename=f"{container_name}.sif", md5=singularity_container_dict['md5'])

        assert kipoi_cmd[0] == 'kipoi'

        # strip whitespace (spaces, newlines, tabs) from each command token
        kipoi_cmd = [x.replace(" ", "").replace("\n", "").replace("\t", "") for x in kipoi_cmd]

        singularity_exec(
            f"{local_path}/{container_name}.sif",
            kipoi_cmd,
            # kipoi_cmd_conda,
            bind_directories=involved_directories(
                dataloader_kwargs, output_files, exclude_dirs=['/tmp', '~']),
            dry_run=dry_run,
        )
    else:
        logger.warning(f"Singularity container for {model} is either not yet available or {model} is not in Kipoi.")
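The call to download_url above follows a (url, root, filename, md5) signature: fetch url into the directory root, save it there under filename, and verify the result against the md5 checksum. A minimal, self-contained sketch of that call, reusing the URL and checksum from the test below; the target directory is a placeholder, not a path from the examples:

# Minimal usage sketch, assuming the (url, root, filename, md5) signature shown above.
from kipoi_utils.external.torchvision.dataset_utils import download_url

download_url(
    url="https://zenodo.org/record/1466088/files/example_files-hg19.chr22.fa?download=1",
    root="/tmp/kipoi_example",               # placeholder target directory
    filename="downloaded.fa",                # name of the file on disk
    md5="936544855b253835442a0f253dd4b083",  # checksum, as in the test below
)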
def test_download_url_valid_link(tmp_path, capfd):
    download_url(
        "https://zenodo.org/record/1466088/files/example_files-hg19.chr22.fa?download=1",
        tmp_path, 'downloaded.fa', '936544855b253835442a0f253dd4b083')
    out, err = capfd.readouterr()
    for second in ['0.1', '0.2', '0.4', '0.8', '1.6', '3.2']:
        assert "Waiting " + second + " seconds before retrying" not in out

    assert (tmp_path / 'downloaded.fa').exists()
Example #3
    def get_file(self, path):
        """Download the remote file to cache_dir and return
        the file path to it.
        """
        file_hash = self.md5 if self.md5 else None
        root, filename = os.path.dirname(path), os.path.basename(path)
        root = os.path.abspath(root)
        download_url(self.url, root, filename, file_hash)
        return os.path.join(root, filename)
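Taken together with Example #5 below, the typical use of this method is: construct the remote-file object with a url and an md5, then call get_file with the desired local path. A short sketch using the URL and checksum from Example #5; the destination path is a placeholder, and RemoteFile is assumed to be importable as in Examples #3 and #5:

# Usage sketch for RemoteFile.get_file under the assumptions stated above.
rf = RemoteFile(
    url="http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeMapability/wgEncodeDukeMapabilityUniqueness35bp.bigWig",
    md5="1d15ddafe2c8df51cf08495db96679e7",
)
# downloads into /tmp/dataloader_files (placeholder) and returns the absolute file path
local_path = rf.get_file("/tmp/dataloader_files/wgEncodeDukeMapabilityUniqueness35bp.bigWig")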
def test_download_url_retry(tmp_path, capfd):
    with pytest.raises(Exception) as exc_info:
        download_url("http://invalid.url", tmp_path, 'downloaded.h5')
    out, err = capfd.readouterr()
    for second in ['0.1', '0.2', '0.4', '0.8', '1.6', '3.2']:
        assert "Waiting " + second + " seconds before retrying" in out
    output = str(exc_info.value)
    assert 'Can not download http://invalid.url' in output
    error_msgs = [
        'Name or service not known',
        'nodename nor servname provided, or not known'
    ]
    assert any(msg in output for msg in error_msgs)
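The retry test above expects exactly six "Waiting ... seconds before retrying" messages with doubling delays (0.1 s up to 3.2 s), i.e. exponential backoff before the final failure. The sketch below reproduces that pattern as an illustration only; it is not the actual kipoi_utils implementation, and fetch_with_backoff is a hypothetical name:

import time
import urllib.request

def fetch_with_backoff(url, attempts=7, initial_delay=0.1):
    """Retry a download, doubling the wait (0.1, 0.2, ..., 3.2 s) between attempts."""
    delay = initial_delay
    for attempt in range(attempts):
        try:
            return urllib.request.urlopen(url).read()
        except OSError as exc:
            if attempt == attempts - 1:
                # after six retries, give up with the message the test checks for
                raise Exception(f"Can not download {url}") from exc
            print(f"Waiting {delay} seconds before retrying")
            time.sleep(delay)
            delay *= 2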
Example #5
    def __init__(self,
                 intervals_file,
                 fasta_file,
                 dnase_file,
                 cell_line=None,
                 RNAseq_PC_file=None,
                 mappability_file=None,
                 GENCODE_dir=None,
                 use_linecache=True):

        # intervals
        if use_linecache:
            linecache.clearcache()
            BT = BedToolLinecache
        else:
            BT = BedTool

        self.bt = BT(intervals_file)

        # Fasta
        self.fasta_file = fasta_file
        self.fasta_extractor = None  # initialize later
        # DNase
        self.dnase_file = dnase_file
        self.dnase_extractor = None
        # mappability
        if mappability_file is None:
            # download the mappability file if it is not already present
            common_dl_dir = os.path.join(this_dir, "../../template/downloaded/dataloader_files")
            makedir_exist_ok(common_dl_dir)
            rf = RemoteFile(url="http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeMapability/wgEncodeDukeMapabilityUniqueness35bp.bigWig",
                            md5="1d15ddafe2c8df51cf08495db96679e7")
            mappability_file = os.path.join(common_dl_dir, "wgEncodeDukeMapabilityUniqueness35bp.bigWig")
            if not os.path.exists(mappability_file) or not rf.validate(mappability_file):
                # download the file
                rf.get_file(mappability_file)
        self.mappability_file = mappability_file
        self.mappability_extractor = None
        # Gencode features
        if GENCODE_dir is None:
            gp = os.path.join(this_dir, "../../template/downloaded/dataloader_files/gencode_features/")
        else:
            gp = GENCODE_dir

        download_gencode_dir(gp)  # download files
        self.gencode_beds = [
            ("cpg", BedTool(gp + '/cpgisland.bed.gz')),
            ("cds", BedTool(gp + '/wgEncodeGencodeBasicV19.cds.merged.bed.gz')),
            ("intron", BedTool(gp + '/wgEncodeGencodeBasicV19.intron.merged.bed.gz')),
            ("promoter", BedTool(gp + '/wgEncodeGencodeBasicV19.promoter.merged.bed.gz')),
            ("utr5", BedTool(gp + '/wgEncodeGencodeBasicV19.utr5.merged.bed.gz')),
            ("utr3", BedTool(gp + '/wgEncodeGencodeBasicV19.utr3.merged.bed.gz')),
        ]
        # Overlap beds - could be done incrementally
        print("Overlapping all the bed-files")
        # The BT() and .fn are there in order to leverage BedToolLinecache
        self.overlap_beds = [(b, BT(self.bt.intersect(v, wa=True, c=True).fn))
                             for b, v in self.gencode_beds]
        print("Assesing the file")
        assert len(self.overlap_beds[1][1]) == len(self.bt)
        # Get the metadata features
        if cell_line is None:
            if RNAseq_PC_file is None:
                raise ValueError("RNAseq_PC_file has to be specified when cell_line=None")
            assert os.path.exists(RNAseq_PC_file)
        else:
            # Using the pre-defined cell-line
            output_dir = os.path.join(this_dir, "../../template/downloaded/dataloader_files/RNAseq_features/")
            makedir_exist_ok(output_dir)
            RNAseq_PC_file = os.path.join(output_dir, cell_line, "meta.txt")
            url_template = ('https://s3.eu-central-1.amazonaws.com/kipoi-models/dataloader_files/'
                            'FactorNet/dataloader_files/RNAseq_features/{}/meta.txt')
            # rf = RemoteFile(url=url_template.format(cell_line))
            if not os.path.exists(RNAseq_PC_file):  # or not rf.validate(mappability_file):
                # download the file
                download_url(url_template.format(cell_line), os.path.join(output_dir, cell_line), "meta.txt")
                # rf.get_file(RNAseq_PC_file)

        self.meta_feat = pd.read_csv(RNAseq_PC_file,
                                     sep="\t", header=None)[0].values