Example no. 1
    def __init__(
            self,
            protocol,
            annotation_type="eyes-center",
            fixed_positions=None,
            dataset_original_directory=rc.get("bob.db.mobio.directory", ""),
            dataset_original_extension=rc.get("bob.db.mobio.extension",
                                              ".png"),
    ):

        # Download the protocol definitions if not available locally
        urls = MobioDatabase.urls()
        filename = get_file("mobio.tar.gz",
                            urls,
                            file_hash="4a7f99b33a54b2dd337ddcaecb09edb8")

        super().__init__(
            name="mobio",
            dataset_protocol_path=filename,
            protocol=protocol,
            csv_to_sample_loader=make_pipeline(
                CSVToSampleLoaderBiometrics(
                    data_loader=bob.io.base.load,
                    dataset_original_directory=dataset_original_directory,
                    extension=dataset_original_extension,
                ),
                EyesAnnotations(),
            ),
            annotation_type=annotation_type,
            fixed_positions=fixed_positions,
        )
Example no. 2
    def __init__(
        self,
        protocol,
        dataset_original_directory=rc.get("bob.bio.face.vgg2.directory", ""),
        dataset_original_extension=rc.get("bob.bio.face.vgg2.extension",
                                          ".jpg"),
        annotation_type="eyes-center",
        fixed_positions=None,
    ):

        # Download the protocol definitions if not available locally
        urls = VGG2Database.urls()
        filename = get_file("vgg2.tar.gz",
                            urls,
                            file_hash="4a05d797a326374a6b52bcd8d5a89d48")

        super().__init__(
            name="vgg2",
            dataset_protocol_path=filename,
            protocol=protocol,
            csv_to_sample_loader=make_pipeline(
                CSVToSampleLoaderBiometrics(
                    data_loader=bob.io.base.load,
                    dataset_original_directory=dataset_original_directory,
                    extension=dataset_original_extension,
                ),
                VGG2Annotations(),
            ),
            annotation_type=annotation_type,
            fixed_positions=fixed_positions,
        )
Example no. 3
    def __init__(self, *args, **kwargs):
        super(SchedulerResourceRestriction, self).__init__(
            idle_timeout=3600
            if rc.get("bob.pipelines.sge.idle_timeout") is None
            else rc.get("bob.pipelines.sge.idle_timeout"),
            allowed_failures=100
            if rc.get("bob.pipelines.sge.allowed_failures") is None
            else rc.get("bob.pipelines.sge.allowed_failures"),
            synchronize_worker_interval="10s",
            *args,
            **kwargs,
        )
        self.handlers[
            "get_no_worker_tasks_resource_restrictions"
        ] = self.get_no_worker_tasks_resource_restrictions
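The pattern above falls back to a hard-coded default when a configuration key is unset. Since rc.get also accepts a default value as its second argument (as used in most of the other examples here), an equivalent and more compact form is possible; a minimal sketch, assuming the keys are simply absent rather than explicitly set to None:

from bob.extension import rc

# Sketch only: equivalent defaults expressed through rc.get's second argument,
# assuming the "bob.pipelines.sge.*" keys are unset rather than set to None.
idle_timeout = rc.get("bob.pipelines.sge.idle_timeout", 3600)
allowed_failures = rc.get("bob.pipelines.sge.allowed_failures", 100)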
Example no. 4
    def __init__(
        self,
        protocol,
        annotation_type="eyes-center",
        fixed_positions=None,
        dataset_original_directory=rc.get("bob.db.morph.directory", ""),
        dataset_original_extension=".JPG",
    ):

        # Download the protocol definitions if not available locally
        urls = MorphDatabase.urls()
        filename = get_file("morph.tar.gz",
                            urls,
                            file_hash="9efa1ff13ef6984ebfcf86f1b1f58873")

        super().__init__(
            name="morph",
            dataset_protocol_path=filename,
            protocol=protocol,
            csv_to_sample_loader=make_pipeline(
                CSVToSampleLoaderBiometrics(
                    data_loader=bob.io.base.load,
                    dataset_original_directory=dataset_original_directory
                    if dataset_original_directory else "",
                    extension=dataset_original_extension,
                ),
                EyesAnnotations(),
            ),
            annotation_type=annotation_type,
            fixed_positions=fixed_positions,
        )
Example no. 5
    def __init__(self,
                 protocol,
                 annotation_type="eyes-center",
                 fixed_positions=None):

        # Download the protocol definitions if not available locally
        urls = FRGCDatabase.urls()
        filename = get_file(
            "frgc.tar.gz",
            urls,
            file_hash="242168e993fe0f6f29bd59fccf3c79a0",
        )

        super().__init__(
            name="frgc",
            dataset_protocol_path=filename,
            protocol=protocol,
            csv_to_sample_loader=make_pipeline(
                CSVToSampleLoaderBiometrics(
                    data_loader=bob.io.base.load,
                    dataset_original_directory=rc.get(
                        "bob.bio.face.frgc.directory", ""),
                    extension="",
                    reference_id_equal_subject_id=False,
                ),
                EyesAnnotations(),
            ),
            annotation_type=annotation_type,
            fixed_positions=fixed_positions,
            score_all_vs_all=True,
            group_probes_by_reference_id=True,
            memory_demanding=True,
        )

        self.hash_fn = hash_string
Example no. 6
    def __init__(
        self,
        protocol,
        annotation_type="eyes-center",
        fixed_positions=None,
        dataset_original_directory=rc.get("bob.db.meds.directory", ""),
        dataset_original_extension=".jpg",
    ):

        # Download the protocol definitions if not available locally
        urls = MEDSDatabase.urls()
        filename = get_file("meds.tar.gz",
                            urls,
                            file_hash="3b01354d4c170672ac14120b80dace75")

        super().__init__(
            name="meds",
            dataset_protocol_path=filename,
            protocol=protocol,
            csv_to_sample_loader=make_pipeline(
                CSVToSampleLoaderBiometrics(
                    data_loader=bob.io.base.load,
                    dataset_original_directory=dataset_original_directory
                    if dataset_original_directory else "",
                    extension=dataset_original_extension,
                ),
                EyesAnnotations(),
            ),
            annotation_type=annotation_type,
            fixed_positions=fixed_positions,
        )
Example no. 7
def test_msceleb():

    database_path = rc.get("bob.bio.face.msceleb.directory")

    # WITH UNKNOWN DEMOGRAPHICS
    dataset = MSCelebTorchDataset(database_path,
                                  include_unknow_demographics=True)
    assert dataset.n_classes == 89735
    assert len(dataset.demographic_keys) == 18

    dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

    batch = next(iter(dataloader))
    batch["data"].shape == (64, 3, 112, 112)

    # WITHOUT UNKNOWN DEMOGRAPHICS

    dataset = MSCelebTorchDataset(database_path,
                                  include_unknow_demographics=False)
    assert dataset.n_classes == 81279
    assert len(dataset.demographic_keys) == 15

    dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

    batch = next(iter(dataloader))
    batch["data"].shape == (64, 3, 112, 112)

    weights = dataset.get_demographic_class_weights()

    assert np.allclose(sum(weights), 1, atol=0.001)
Example no. 8
    def __init__(
            self,
            protocol,
            original_directory=rc.get("bob.bio.face.ijbc.directory"),
            **kwargs,
    ):

        if original_directory is None or not os.path.exists(
                original_directory):
            raise ValueError(
                f"Invalid or non existent `original_directory`: {original_directory}"
            )

        self._check_protocol(protocol)
        super().__init__(
            name="ijbc",
            protocol=protocol,
            score_all_vs_all=False,
            annotation_type="bounding-box",
            fixed_positions=None,
            memory_demanding=True,
        )

        self.image_directory = os.path.join(original_directory, "images")
        self.protocol_directory = os.path.join(original_directory, "protocols")
        self._cached_probes = None
        self._cached_references = None
        self.hash_fn = hash_string

        self._load_metadata(protocol)

        # For the test4 protocols
        if "test4" in protocol:
            self.score_all_vs_all = True
Example no. 9
def test_replaymobile():
    database = bob.bio.base.load_resource(
        "replaymobile-img", "database", preferred_package="bob.bio.face"
    )
    sample = database.probes()[0][0]
    assert (
        sample.path
        == "devel/real/client005_session02_authenticate_mobile_adverse"
    ), sample.path
    assert sample.frame == "12", sample.frame
    assert sample.should_flip, sample
    assert hasattr(sample, "annotations")
    assert "reye" in sample.annotations
    assert "leye" in sample.annotations
    assert hasattr(sample, "path")
    assert hasattr(sample, "frame")
    assert len(database.references()) == 16
    assert len(database.references(group="eval")) == 12
    assert len(database.probes()) == 4160
    assert len(database.probes(group="eval")) == 3020
    assert sample.annotations == {
        "bottomright": [734, 407],
        "topleft": [436, 182],
        "leye": [541, 350],
        "reye": [540, 245],
        "mouthleft": [655, 254],
        "mouthright": [657, 338],
        "nose": [591, 299],
    }

    # test another sample where should_flip is False
    sample2 = [s for s in database.all_samples() if not s.should_flip][0]
    assert (
        sample2.path
        == "enroll/train/client001_session01_enroll_tablet_lightoff"
    ), sample2.path
    assert sample2.frame == "12", sample2.frame
    assert not sample2.should_flip, sample2
    assert sample2.annotations == {
        "reye": [515, 267],
        "leye": [516, 399],
        "nose": [576, 332],
        "mouthleft": [662, 282],
        "mouthright": [664, 384],
        "topleft": [372, 196],
        "bottomright": [761, 480],
    }, dict(sample2.annotations)

    # Only if data is available
    if rc.get("bob.db.replaymobile.directory", None):
        assert sample.data.shape == (3, 1280, 720), sample.data.shape
        assert sample.data[0, 0, 0] == 94, sample.data[0, 0, 0]

        assert sample2.data.shape == (3, 1280, 720), sample2.data.shape
        assert sample2.data[0, 0, 0] == 129, sample2.data[0, 0, 0]
Example no. 10
    def __init__(
        self,
        protocol,
        annotation_type="bounding-box",
        fixed_positions=None,
        original_directory=rc.get("bob.bio.video.youtube.directory", ""),
        extension=".jpg",
        annotation_extension=".labeled_faces.txt",
        frame_selector=None,
    ):

        self._check_protocol(protocol)

        original_directory = original_directory or ""
        if not os.path.exists(original_directory):
            logger.warning(
                "Invalid or non existent `original_directory`: f{original_directory}."
                "Please, do `bob config set bob.bio.video.youtube.directory PATH` to set the Youtube data directory."
            )

        urls = YoutubeDatabase.urls()
        cache_subdir = os.path.join("datasets", "youtube_protocols")
        self.filename = get_file(
            "youtube_protocols-6962cd2e.tar.gz",
            urls,
            file_hash="8a4792872ff30b37eab7f25790b0b10d",
            extract=True,
            cache_subdir=cache_subdir,
        )
        self.protocol_path = os.path.dirname(self.filename)

        self.references_dict = {}
        self.probes_dict = {}

        # Dict that holds a `subject_id` as a key and has
        # filenames as values
        self.subject_id_files = {}
        self.reference_id_to_subject_id = None
        self.reference_id_to_sample = None
        self.load_file_client_id()
        self.original_directory = original_directory
        self.extension = extension
        self.annotation_extension = annotation_extension
        self.frame_selector = frame_selector

        super().__init__(
            name="youtube",
            protocol=protocol,
            score_all_vs_all=False,
            annotation_type=annotation_type,
            fixed_positions=None,
            memory_demanding=True,
        )
Example no. 11
    def __init__(
            self,
            protocol,
            original_directory=rc.get("bob.bio.face.rfw.directory"),
            **kwargs,
    ):

        if original_directory is None or not os.path.exists(
                original_directory):
            raise ValueError(
                "Invalid or non existant `original_directory`: f{original_directory}"
            )

        self._check_protocol(protocol)
        self._races = ["African", "Asian", "Caucasian", "Indian"]
        self.original_directory = original_directory
        self._default_extension = ".jpg"

        super().__init__(
            name="rfw",
            protocol=protocol,
            score_all_vs_all=False,
            annotation_type="eyes-center",
            fixed_positions=None,
            memory_demanding=False,
        )

        self._pairs = dict()
        self._first_reference_of_subject = dict()  # Used with the Idiap protocol
        self._inverted_pairs = dict()
        self._id_race = dict()  # ID -- > RACE
        self._race_ids = dict()  # RACE --> ID
        self._landmarks = dict()
        self._cached_biometric_references = None
        self._cached_probes = None
        self._cached_zprobes = None
        self._cached_treferences = None
        self._discarded_subjects = []  # Some subjects were labeled with both races
        self._load_metadata(target_set="test")
        self._demographics = None
        self._demographics = self._get_demographics_dict()

        # Setting the seed for the IDIAP PROTOCOL,
        # so we have a consistent set of probes
        self._idiap_protocol_seed = 652

        # Number of samples used to Z-Norm and T-Norm (per race)
        self._nzprobes = 25
        self._ntreferences = 25
Example no. 12
    def get_cache_path(self):

        filename = ("vgg2_short_cached_bucket.pickle"
                    if self.bob_dataset.protocol == "vgg2-short" else
                    "vgg2_full_cached_bucket.pickle")

        return os.path.join(
            rc.get(
                "bob_data_folder",
                os.path.join(os.path.expanduser("~"), "bob_data"),
            ),
            "datasets",
            f"{filename}",
        )
Example no. 13
    def get_cache_path(self):

        filename = ("msceleb_cached_bucket_WITH_unknow_demographics.csv"
                    if self.include_unknow_demographics else
                    "msceleb_cached_bucket_WITHOUT_unknow_demographics.csv")

        return os.path.join(
            rc.get(
                "bob_data_folder",
                os.path.join(os.path.expanduser("~"), "bob_data"),
            ),
            "datasets",
            f"{filename}",
        )
Example no. 14
def test_siamese():
    siamese_transforms = get_standard_data_augmentation()

    # database_path = os.path.join(
    #    rc.get("bob.bio.demographics.directory"), "morph", "samplewrapper"
    # )

    database_path = rc.get("bob.bio.face.vgg2-crops.directory")

    # dataset = MobioTorchDataset(
    #    protocol="mobile0-male-female",
    #    database_path=database_path,
    #    transform=siamese_transforms,
    # )
    # dataset = MedsTorchDataset(
    #    protocol="verification_fold1",
    #    database_path=database_path,
    #    transform=siamese_transforms,
    #    take_from_znorm=False,
    # )
    dataset = VGG2TorchDataset(
        protocol="vgg2-short",
        database_path=database_path,
        database_extension=".jpg",
        transform=siamese_transforms,
    )

    # dataset = MorphTorchDataset(
    #    protocol="verification_fold1",
    #    database_path=database_path,
    #    transform=siamese_transforms,
    #    take_from_znorm=False,
    # )

    siamese_dataset = SiameseDemographicWrapper(
        dataset,
        max_positive_pairs_per_subject=5,
        negative_pairs_per_subject=3)

    dataloader = DataLoader(siamese_dataset, batch_size=64, shuffle=True)

    batch = next(iter(dataloader))

    batch["data"][0].shape == (64, 3, 112, 112)
    batch["data"][1].shape == (64, 3, 112, 112)
Example no. 15
def test_morph():

    database_path = os.path.join(rc.get("bob.bio.demographics.directory"),
                                 "morph", "samplewrapper")

    dataset = MorphTorchDataset(
        protocol="verification_fold1",
        database_path=database_path,
        take_from_znorm=False,
    )

    dataloader = DataLoader(dataset, batch_size=64, shuffle=True)
    batch = next(iter(dataloader))
    batch["data"].shape == (64, 3, 112, 112)

    weights = dataset.get_demographic_class_weights()

    assert np.allclose(sum(weights), 1, atol=0.0001)
Example no. 16
def test_mobio():

    database_path = os.path.join(rc.get("bob.bio.demographics.directory"),
                                 "mobio", "samplewrapper")

    dataset = MobioTorchDataset(
        protocol="mobile0-male-female",
        database_path=database_path,
    )

    dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

    batch = next(iter(dataloader))
    batch["data"].shape == (64, 3, 112, 112)

    # Testing class weights
    weights = dataset.get_demographic_class_weights()

    assert np.allclose(sum(weights), 1)
Example no. 17
    def __init__(
        self,
        protocol,
        annotation_type="eyes-center",
        fixed_positions=None,
        original_directory=rc.get("bob.bio.face.gbu.directory"),
        extension=".jpg",
    ):

        # self.filename = "/idiap/user/tpereira/gitlab/bob/bob.nightlies/temp/gbu.tar.gz"
        # Download the protocol definitions if not available locally
        urls = GBUDatabase.urls()
        self.filename = get_file(
            "gbu-xmls.tar.gz",
            urls,
            file_hash="827de43434ee84020c6a949ece5e4a4d",
        )

        self.references_dict = {}
        self.probes_dict = {}

        self.annotations = None
        self.original_directory = original_directory
        self.extension = extension

        self.background_samples = None
        self._background_files = [
            "GBU_Training_Uncontrolledx1.xml",
            "GBU_Training_Uncontrolledx2.xml",
            "GBU_Training_Uncontrolledx4.xml",
            "GBU_Training_Uncontrolledx8.xml",
        ]

        super().__init__(
            name="gbu",
            protocol=protocol,
            score_all_vs_all=True,
            annotation_type="eyes-center",
            fixed_positions=fixed_positions,
            memory_demanding=True,
        )
Example no. 18
def test_meds():

    database_path = os.path.join(rc.get("bob.bio.demographics.directory"),
                                 "meds", "samplewrapper")

    dataset = MedsTorchDataset(
        protocol="verification_fold1",
        database_path=database_path,
    )

    dataloader = DataLoader(dataset,
                            batch_size=64,
                            shuffle=True,
                            pin_memory=True,
                            num_workers=2)

    batch = next(iter(dataloader))
    batch["data"].shape == (64, 3, 112, 112)

    # Testing class weights
    weights = dataset.get_demographic_class_weights()
    assert np.allclose(sum(weights), 1)
Example no. 19
def test_vgg2():

    database_path = rc.get("bob.bio.face.vgg2-crops.directory")

    dataset = VGG2TorchDataset(protocol="vgg2-short",
                               database_path=database_path)

    assert np.allclose(sum(dataset.get_demographic_weights(as_dict=False)),
                       1,
                       atol=1)

    assert dataset.n_classes == 8631
    assert len(dataset.demographic_keys) == 8

    dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

    batch = next(iter(dataloader))
    batch["data"].shape == (64, 3, 112, 112)

    weights = dataset.get_demographic_class_weights()

    assert np.allclose(sum(weights), 1, atol=0.001)

    # Testing dev

    dataset = VGG2TorchDataset(protocol="vgg2-short",
                               database_path=database_path,
                               train=False)

    assert dataset.n_classes == 8631
    assert len(dataset.demographic_keys) == 8

    dataloader = DataLoader(dataset, batch_size=64, shuffle=True)
    batch = next(iter(dataloader))
    batch["data"].shape == (64, 3, 112, 112)

    weights = dataset.get_demographic_class_weights()

    assert np.allclose(sum(weights), 1, atol=0.001)
Example no. 20
    def __init__(
        self,
        database_path=rc.get("bob.bio.face.webface42M.directory", ""),
        transform=None,
    ):
        self.database_path = database_path

        if database_path == "":
            raise ValueError(
                "`database_path` is empty; please do `bob config set bob.bio.face.webface42M.directory` to set the absolute path of the data"
            )

        urls = WebFace42M.urls()
        filename = get_file(
            "webface42M.tar.gz",
            urls,
            file_hash="50c32cbe61de261466e1ea3af2721cea",
        )
        self.file = search_file(filename, "webface42M.csv")

        self._line_offset = 51
        self.transform = transform
Example no. 21
    def __init__(
        self, protocol, annotation_type="eyes-center", fixed_positions=None
    ):

        # Download the protocol definitions if not available locally
        urls = PolaThermalDatabase.urls()
        filename = get_file(
            "polathermal.tar.gz",
            urls,
            file_hash="4693149bc883debe5a9e1441a4f5f4ae",
        )

        directory = rc.get("bob.db.pola-thermal.directory", "")

        def load(path):
            """
            Images in this dataset are stored as 16-bit PNG [0-65535]
            and bob.bio.face assumes images are between 0 and 255,
            so we divide by 257: 65535 / 255 = 257
            """
            return bob.io.base.load(path) / 257

        super().__init__(
            name="polathermal",
            protocol=protocol,
            dataset_protocol_path=filename,
            csv_to_sample_loader=make_pipeline(
                CSVToSampleLoaderBiometrics(
                    data_loader=load,
                    dataset_original_directory=directory,
                    extension=".png",
                ),
                EyesAnnotations(),
            ),
            annotation_type=annotation_type,
            fixed_positions=fixed_positions,
        )
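The loader above rescales the 16-bit pixel range into the 0-255 range expected by bob.bio.face; since 257 * 255 = 65535, dividing by 257 maps the maximum 16-bit value exactly onto 255. A small illustrative check of that arithmetic (not part of the database code):

import numpy as np

# Illustration only: a synthetic 16-bit image divided by 257, as in the
# loader above, ends up within the 0-255 range (65535 / 257 == 255.0).
img16 = np.array([[0, 32768, 65535]], dtype=np.uint16)
img8_range = img16 / 257
assert img8_range.max() == 255.0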
Example no. 22
from bob.bio.face.pytorch.datasets import (
    MedsTorchDataset,
    MobioTorchDataset,
    MorphTorchDataset,
    MSCelebTorchDataset,
    SiameseDemographicWrapper,
    VGG2TorchDataset,
    WebFace42M,
)
from bob.bio.face.pytorch.preprocessing import get_standard_data_augmentation
from bob.extension import rc


@pytest.mark.skipif(
    rc.get("bob.bio.face.webface42M.directory") is None,
    reason="WebFace42M not available. Please do `bob config set bob.bio.face.webface42M.directory [WebFace42M PATH]` to set the WebFace42M data path.",
)
def test_webface42M():

    dataset = WebFace42M()

    sample = dataset[0]
    assert sample["label"] == 0
    assert sample["data"].shape == (3, 112, 112)

    sample = dataset[100000]
    assert sample["label"] == 4960
    assert sample["data"].shape == (3, 112, 112)
Example no. 23
    def __init__(
        self,
        *args,
        queue=None,
        project=rc.get("sge.project"),
        resource_spec=None,
        job_extra=None,
        config_name="sge",
        **kwargs,
    ):

        if queue is None:
            queue = dask.config.get("jobqueue.%s.queue" % config_name)
        if project is None:
            project = dask.config.get("jobqueue.%s.project" % config_name)
        if resource_spec is None:
            resource_spec = dask.config.get("jobqueue.%s.resource-spec" % config_name)
        if job_extra is None:
            job_extra = dask.config.get("jobqueue.%s.job-extra" % config_name)

        super().__init__(*args, config_name=config_name, death_timeout=10000, **kwargs)

        # Amending the --resources in the `distributed.cli.dask_worker` CLI command
        if "resources" in kwargs and kwargs["resources"]:
            resources = kwargs["resources"]

            # Prepare the string to be sent to the `dask-worker` command:
            # comma-separated KEY=VALUE pairs
            def _resource_to_str(resources):
                return ",".join(f"{k}={v}" for k, v in resources.items())

            resources_str = _resource_to_str(resources)

            self._command_template += f" --resources {resources_str}"

        header_lines = []
        if self.job_name is not None:
            header_lines.append("#$ -N %(job-name)s")
        if queue is not None:
            header_lines.append("#$ -q %(queue)s")
        if project is not None:
            header_lines.append("#$ -P %(project)s")
        if resource_spec is not None:
            header_lines.append("#$ -l %(resource_spec)s")

        if self.log_directory is not None:
            header_lines.append("#$ -e %(log_directory)s/")
            header_lines.append("#$ -o %(log_directory)s/")
        header_lines.extend(["#$ -cwd", "#$ -j y"])
        header_lines.extend(["#$ %s" % arg for arg in job_extra])
        header_template = "\n".join(header_lines)

        config = {
            "job-name": self.job_name,
            "queue": queue,
            "project": project,
            "processes": self.worker_processes,
            "resource_spec": resource_spec,
            "log_directory": self.log_directory,
        }
        self.job_header = header_template % config
        logger.debug("Job script: \n %s" % self.job_script())
Example no. 24
    def __init__(
        self,
        log_directory="./logs",
        protocol="tcp://",
        dashboard_address=":8787",
        env_extra=None,
        sge_job_spec=QUEUE_DEFAULT,
        min_jobs=1,
        project=rc.get("sge.project"),
        **kwargs,
    ):

        # Defining the job launcher
        self.job_cls = SGEIdiapJob
        self.sge_job_spec = sge_job_spec

        self.protocol = protocol
        self.log_directory = log_directory
        self.project = project

        silence_logs = "error"
        interface = None
        host = None
        security = None

        if env_extra is None:
            env_extra = []
        elif not isinstance(env_extra, list):
            env_extra = [env_extra]
        self.env_extra = env_extra + ["export PYTHONPATH=" + ":".join(sys.path)]

        scheduler = {
            "cls": SchedulerResourceRestriction,  # Use local scheduler for now
            "options": {
                "protocol": self.protocol,
                "interface": interface,
                "host": host,
                "dashboard_address": dashboard_address,
                "security": security,
            },
        }

        # Spec cluster parameters
        loop = None
        asynchronous = False
        name = None

        # Starting the SpecCluster constructor
        super(JobQueueCluster, self).__init__(
            scheduler=scheduler,
            worker={},
            loop=loop,
            silence_logs=silence_logs,
            asynchronous=asynchronous,
            name=name,
        )

        max_jobs = get_max_jobs(sge_job_spec)
        self.scale(max_jobs)
        # Adapting from a minimum of `min_jobs` to a maximum of `max_jobs` jobs
        # interval: Milliseconds between checks from the scheduler
        # wait_count: Number of consecutive times that a worker should be suggested for
        #             removal before we remove it.

        self.adapt(
            minimum=min_jobs,
            maximum=max_jobs,
            wait_count=5,
            interval=10,
            target_duration="10s",
        )
Example no. 25
    def __init__(
        self,
        protocol,
        annotation_type="eyes-center",
        image_relative_path="all_images",
        fixed_positions=None,
        original_directory=rc.get("bob.bio.face.lfw.directory"),
        extension=".jpg",
        annotation_directory=rc.get("bob.bio.face.lfw.annotation_directory"),
        annotation_issuer="funneled",
    ):

        if original_directory is None or not os.path.exists(
                original_directory):
            raise ValueError(
                f"Invalid or non existent `original_directory`: {original_directory}."
                "Please, do `bob config set bob.bio.face.lfw.directory PATH` to set the LFW data directory."
            )

        if annotation_issuer not in ("funneled", "idiap", "named"):
            raise ValueError(
                f"Invalid annotation issuer: {annotation_issuer}. Possible values are `idiap`, `funneled` or `named`"
            )

        if annotation_directory is None or not os.path.exists(
                annotation_directory):
            # Downloading annotations if not exists
            annotation_urls = LFWDatabase.urls()

            logger.info(
                f"`annotation_directory`: {annotation_directory} not set. "
                f"Fetching it from {annotation_urls[0]}")

            annotation_directory = get_file(
                "lfw_annotations.tar.gz",
                annotation_urls,
                file_hash="c0ce6e090e19d0ed159172fcba2e8252",
                extract=True,
            )

            # Removing extension
            annotation_directory = annotation_directory[:-7]

            # Attaching the issuer sub-directory
            annotation_directory = os.path.join(annotation_directory,
                                                annotation_issuer)

        self.annotation_issuer = annotation_issuer
        # Hard-coding the extension of the annotations
        # I don't think we need this exposed
        # Please, open an issue if otherwise
        self.annotation_extension = (
            ".jpg.pts" if annotation_issuer == "funneled" else ".pos"
        )

        self._check_protocol(protocol)

        self.references_dict = {}
        self.probes_dict = {}
        self.pairs = {}
        self.probe_reference_keys = {}  # Inverted pairs

        self.annotations = None
        self.original_directory = original_directory
        self.annotation_directory = annotation_directory
        self.extension = extension
        self.image_relative_path = image_relative_path

        # Some path manipulation lambdas
        self.subject_id_from_filename = lambda x: "_".join(x.split("_")[0:-1])

        self.make_path_from_filename = lambda x: os.path.join(
            self.subject_id_from_filename(x), x)

        super().__init__(
            name="lfw",
            protocol=protocol,
            score_all_vs_all=protocol[0] == "o",
            annotation_type=annotation_type,
            fixed_positions=fixed_positions,
            memory_demanding=False,
        )

        self.load_pairs()
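For illustration, the two path helpers defined in the constructor above behave as follows on an LFW-style file stem; the concrete name used here is only an assumed example:

import os

# Illustration only: standalone copies of the two lambdas above, applied to
# an assumed LFW-style file stem "<subject_name>_<index>".
subject_id_from_filename = lambda x: "_".join(x.split("_")[0:-1])
make_path_from_filename = lambda x: os.path.join(subject_id_from_filename(x), x)

assert subject_id_from_filename("Aaron_Eckhart_0001") == "Aaron_Eckhart"
assert make_path_from_filename("Aaron_Eckhart_0001") == os.path.join(
    "Aaron_Eckhart", "Aaron_Eckhart_0001"
)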
Example no. 26
def SpearBioDatabase(
    name: str,
    protocol: Optional[str] = None,
    dataset_protocol_path: Optional[str] = None,
    data_path: Optional[str] = None,
    data_ext: str = ".wav",
    annotations_path: Optional[str] = None,
    annotations_ext: str = ".json",
    force_sample_rate: Optional[int] = None,
    force_channel: Optional[int] = None,
    **kwargs,
):
    """Database interface for the bob.bio.spear datasets for speaker recognition.

    This database interface is meant to be used with bob.bio.base pipelines.

    Given a series of CSV files (or downloading them from the bob data server), it
    creates the Sample objects for each role needed by the pipeline (enroll, probe),
    for different groups (train, dev, eval).

    Each sample contains:

        - `data`: the wav audio data,
        - `rate`: the sample rate of `data`,
        - (optional)`annotations`: some annotations loaded from files if
          `annotations_path` is provided.

    `protocol definition` files (CSV files) are not the `data` files (WAV files):

        - `protocol definition` files are a list of paths and corresponding reference
          name. They are available on the bob data server.
        - `data` files are the actual files of the dataset (pointed to by the definition
          files). They are not provided by bob.

    You have to set the bob configuration to the root folder of the data files using
    the following command:

    ``$ bob config set bob.db.<database_name>.directory <your_path_to_data>``

    The final data paths will be constructed with the bob.db.<database_name>.directory
    key, and the paths in the CSV protocol definition files.

    Parameters
    ----------

    name
        name of the database used for retrieving config keys and files.

    protocol
        protocol to use (sub-folder containing the protocol definition files).

    dataset_protocol_path
        Path to an existing protocol definition folder structure.
        If None: will download the definition files to a datasets folder in the path
        pointed by the ``bob_data_folder`` config (see
        :py:func:`bob.extension.download.get_file`).

    data_path
        Path to the data files of the database.
        If None: will use the path in the ``bob.db.<database_name>.directory`` config.

    data_ext
        File extension of the data files.

    annotations_path
        Path to the annotations files of the dataset, if available.
        If None: will not load any annotations (you could then annotate on the fly with
        a transformer).

    annotations_ext
        If annotations_path is provided, will load annotation using this extension.

    force_sample_rate
        If not None, will force the sample rate of the data to a specific value.
        Otherwise the sample rate will be specified by each loaded file.

    force_channel
        If not None, will force to load the nth channel of each file. If None and the
        samples have a ``channel`` attribute, this channel will be loaded, and
        otherwise all channels will be loaded in a 2D array if multiple are present.
    """

    if dataset_protocol_path is None:
        dataset_protocol_path = get_protocol_file(name)

    logger.info(
        f"Database: Will read the CSV protocol definitions in '{dataset_protocol_path}'."
    )

    rc_db_name = known_databases.get(name, {}).get("rc_name", name)

    if data_path is None:
        data_path = rc.get(f"bob.db.{rc_db_name}.directory")
    if data_path is None:
        raise RuntimeError(
            f"No data path was provided! Either set 'bob.db.{rc_db_name}.directory' "
            "with the 'bob config set' command, or provide a 'data_path' to "
            "'SpearBioDatabase'."
        )

    logger.info(f"Database: Will read raw data files in '{data_path}'.")

    # Define the data loading transformers

    # Load a path into the data of the sample
    sample_loader = CSVToSampleLoaderBiometrics(
        data_loader=path_loader,
        dataset_original_directory=data_path,
        extension=data_ext,
        reference_id_equal_subject_id=name not in ["voxceleb"],
    )

    # Read the file at path and set the data and metadata of a sample
    path_to_sample = PathToAudio(
        forced_channel=force_channel, forced_sr=force_sample_rate
    )

    # Build the data loading pipeline
    if annotations_path is None:
        sample_loader = Pipeline(
            [
                ("db:reader_loader", sample_loader),
                ("db:path_to_sample", path_to_sample),
            ]
        )
    else:
        logger.info(
            f"Database: Will read annotation files in '{annotations_path}'."
        )
        annotations_transformer = AnnotationsLoader(
            annotation_directory=annotations_path,
            annotation_extension=annotations_ext,
        )
        sample_loader = Pipeline(
            [
                ("db:reader_loader", sample_loader),
                ("db:path_to_sample", path_to_sample),
                ("db:annotations_loader", annotations_transformer),
            ]
        )

    return CSVDataset(
        name=name,
        protocol=protocol,
        dataset_protocol_path=dataset_protocol_path,
        csv_to_sample_loader=sample_loader,
        score_all_vs_all=name not in ["voxceleb"],
        is_sparse=name in ["voxceleb"],
        **kwargs,
    )
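A hedged usage sketch of the factory above. The data directory is assumed to have been configured as described in the docstring (`bob config set bob.db.<database_name>.directory <your_path_to_data>`); "voxceleb" is used as the name only because it appears in the code above, and the protocol value is a placeholder:

# Sketch only: assumes `bob config set bob.db.voxceleb.directory <path>` has
# been run and that the protocol definition files are reachable on the bob
# data server; the protocol below is a placeholder.
database = SpearBioDatabase(
    name="voxceleb",
    protocol=None,  # or the sub-folder containing the protocol definition files
)
# As with the other bob.bio.base databases in these examples, enrollment
# samples for the chosen protocol would then be available via:
references = database.references()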
Example no. 27
        "mouthright": [664, 384],
        "topleft": [372, 196],
        "bottomright": [761, 480],
    }, dict(sample2.annotations)

    # Only if data is available
    if rc.get("bob.db.replaymobile.directory", None):
        assert sample.data.shape == (3, 1280, 720), sample.data.shape
        assert sample.data[0, 0, 0] == 94, sample.data[0, 0, 0]

        assert sample2.data.shape == (3, 1280, 720), sample2.data.shape
        assert sample2.data[0, 0, 0] == 129, sample2.data[0, 0, 0]


@pytest.mark.skipif(
    rc.get("bob.bio.face.ijbc.directory") is None,
    reason="IJBC original protocols not available. Please do `bob config set bob.bio.face.ijbc.directory [IJBC PATH]` to set the IJBC data path.",
)
@pytest.mark.slow
def test_ijbc():
    from bob.bio.face.database import IJBCDatabase

    # test1 #####

    database = IJBCDatabase(protocol="test1")

    # assert len(database.background_model_samples()) == 140732
    assert len(database.references()) == 3531
    assert len(database.probes()) == 19593
    num_comparisons = sum([len(item.references) for item in database.probes()])
    assert num_comparisons == 19557 + 15638932  # Genuine + Impostor
Example no. 28
File: jman.py Project: noa/gridtk
def main(command_line_options=None):

    from ..config import __version__
    from bob.extension import rc

    formatter = argparse.ArgumentDefaultsHelpFormatter
    parser = argparse.ArgumentParser(description=__doc__,
                                     epilog=__epilog__,
                                     formatter_class=formatter)
    # part of the hack to support aliases in subparsers
    parser.register('action', 'parsers', AliasedSubParsersAction)

    # general options
    parser.add_argument(
        '-v',
        '--verbose',
        action='count',
        default=0,
        help=
        "Increase the verbosity level from 0 (only error messages) to 1 (warnings), 2 (log messages), 3 (debug information) by adding the --verbose option as often as desired (e.g. '-vvv' for debug)."
    )
    parser.add_argument('-V',
                        '--version',
                        action='version',
                        version='GridTk version %s' % __version__)
    parser.add_argument(
        '-d',
        '--database',
        '--db',
        metavar='DATABASE',
        default='submitted.sql3',
        help=
        'replace the default database "submitted.sql3" by one provided by you.'
    )

    parser.add_argument(
        '-l',
        '--local',
        action='store_true',
        help='Uses the local job manager instead of the SGE one.')
    cmdparser = parser.add_subparsers(title='commands',
                                      help='commands accepted by %(prog)s')

    # subcommand 'submit'
    submit_parser = cmdparser.add_parser(
        'submit',
        aliases=['sub'],
        formatter_class=formatter,
        help=
        'Submits jobs to the SGE queue or to the local job scheduler and logs them in a database.'
    )
    submit_parser.add_argument(
        '-q',
        '--queue',
        metavar='QNAME',
        dest='qname',
        default='all.q',
        choices=QUEUES,
        help='the name of the SGE queue to submit the job to')
    submit_parser.add_argument(
        '-e',
        '--sge-extra-args',
        default=rc.get('gridtk.sge.extra.args.default', ''),
        type=str,
        help=
        'Passes extra arguments to qsub. See the documentation of the package for usage and ways of overriding default behavior.'
    )
    submit_parser.add_argument(
        '-m',
        '--memory',
        help='Sets both the h_vmem and the mem_free parameters when submitting '
        'the job to a non-GPU queue, e.g., 8G to set the memory '
        'requirements to 8 gigabytes. Sets gpumem parameter when '
        'submitting the job to a GPU-based queue.')
    submit_parser.add_argument(
        '-p',
        '--parallel',
        '--pe_mth',
        type=int,
        help=
        'Sets the number of slots per job (-pe pe_mth) and multiplies the mem_free parameter. E.g. to get 16 G of memory, use -m 8G -p 2.'
    )
    submit_parser.add_argument('-n',
                               '--name',
                               dest='name',
                               help='Gives the job a name')
    submit_parser.add_argument(
        '-x',
        '--dependencies',
        type=int,
        default=[],
        metavar='ID',
        nargs='*',
        help=
        'Set job dependencies to the list of job identifiers separated by spaces'
    )
    submit_parser.add_argument(
        '-k',
        '--stop-on-failure',
        action='store_true',
        help='Stop depending jobs when this job finished with an error.')
    submit_parser.add_argument(
        '-d',
        '--exec-dir',
        metavar='DIR',
        help=
        'Sets the executing directory, where the script should be executed. If not given, jobs will be executed in the current directory'
    )
    submit_parser.add_argument(
        '-l',
        '--log-dir',
        metavar='DIR',
        help=
        'Sets the log directory. By default, "logs" is selected for the SGE. If the jobs are executed locally, by default the result is written to console.'
    )
    submit_parser.add_argument(
        '-s',
        '--environment',
        metavar='KEY=VALUE',
        dest='env',
        nargs='*',
        default=[],
        help='Passes specific environment variables to the job.')
    submit_parser.add_argument(
        '-t',
        '--array',
        '--parametric',
        metavar='(first-)last(:step)',
        help=
        "Creates a parametric (array) job. You must specify the 'last' value, but 'first' (default=1) and 'step' (default=1) can be specified as well (when specifying 'step', 'first' has to be given, too)."
    )
    submit_parser.add_argument(
        '-z',
        '--dry-run',
        action='store_true',
        help=
        'Do not really submit anything, just print out what would be submitted in this case'
    )
    submit_parser.add_argument(
        '-i',
        '--io-big',
        action='store_true',
        help=
        'Sets "io_big" on the submitted jobs so it limits the machines in which the job is submitted to those that can do high-throughput.'
    )
    submit_parser.add_argument(
        '-r',
        '--repeat',
        type=int,
        metavar='N',
        default=1,
        help='Submits the job N times. Each job will depend on the job before.'
    )
    submit_parser.add_argument(
        '-o',
        '--print-id',
        action='store_true',
        help=
        'Prints the new job id (so that they can be parsed by automatic scripts).'
    )
    submit_parser.add_argument(
        'job',
        metavar='command',
        nargs=argparse.REMAINDER,
        help=
        "The job that should be executed. Sometimes a -- is required to separate the job from other command line options."
    )
    submit_parser.set_defaults(func=submit)

    # subcommand 're-submit'
    resubmit_parser = cmdparser.add_parser('resubmit',
                                           aliases=['reset', 'requeue', 're'],
                                           formatter_class=formatter,
                                           help='Re-submits a list of jobs.')
    resubmit_parser.add_argument(
        '-j',
        '--job-ids',
        metavar='ID',
        nargs='+',
        help=
        'Re-submit only the jobs with the given ids (by default, all finished jobs are re-submitted).'
    )
    resubmit_parser.add_argument(
        '-q',
        '--queue',
        metavar='QNAME',
        dest='qname',
        choices=QUEUES,
        help='Reset the SGE queue to submit the job to')
    resubmit_parser.add_argument(
        '-m',
        '--memory',
        help='Resets both the h_vmem and the mem_free parameters when '
        'submitting the job to a non-GPU queue, e.g., 8G '
        'to set the memory requirements to 8 gigabytes. Resets gpumem '
        'parameter when submitting the job to a GPU-based queue.')
    resubmit_parser.add_argument(
        '-p',
        '--parallel',
        '--pe_mth',
        type=int,
        help=
        'Resets the number of slots per job (-pe pe_mth) and multiplies the mem_free parameter. E.g. to get 16 G of memory, use -m 8G -p 2.'
    )
    resubmit_parser.add_argument(
        '-i',
        '--io-big',
        action='store_true',
        help='Resubmits the job to the "io_big" queue.')
    resubmit_parser.add_argument(
        '-I',
        '--no-io-big',
        action='store_true',
        help='Resubmits the job NOT to the "io_big" queue.')
    resubmit_parser.add_argument(
        '-k',
        '--keep-logs',
        action='store_true',
        help='Do not clean the log files of the old job before re-submitting.')
    resubmit_parser.add_argument(
        '-s',
        '--also-success',
        action='store_true',
        help='Re-submit also jobs that have finished successfully.')
    resubmit_parser.add_argument(
        '-a',
        '--running-jobs',
        action='store_true',
        help=
        'Re-submit even jobs that are running or waiting (use this flag with care).'
    )
    resubmit_parser.add_argument(
        '-o',
        '--overwrite-command',
        nargs=argparse.REMAINDER,
        help=
        "Overwrite the command line (of a single job) that should be executed (useful to keep job dependencies)."
    )
    resubmit_parser.set_defaults(func=resubmit)

    # subcommand 'stop'
    stop_parser = cmdparser.add_parser(
        'stop',
        formatter_class=formatter,
        help='Stops the execution of jobs in the grid.')
    stop_parser.add_argument(
        '-j',
        '--job-ids',
        metavar='ID',
        nargs='+',
        help=
        'Stop only the jobs with the given ids (by default, all jobs are stopped).'
    )
    stop_parser.set_defaults(func=stop)

    # subcommand 'list'
    list_parser = cmdparser.add_parser(
        'list',
        aliases=['ls'],
        formatter_class=formatter,
        help=
        'Lists jobs stored in the database. Use the -vv option to get a long listing.'
    )
    list_parser.add_argument(
        '-j',
        '--job-ids',
        metavar='ID',
        nargs='+',
        help=
        'List only the jobs with the given ids (by default, all jobs are listed)'
    )
    list_parser.add_argument(
        '-n',
        '--names',
        metavar='NAME',
        nargs='+',
        help=
        'List only the jobs with the given names (by default, all jobs are listed)'
    )
    list_parser.add_argument('-a',
                             '--print-array-jobs',
                             action='store_true',
                             help='Also list the array ids.')
    list_parser.add_argument(
        '-l',
        '--long',
        action='store_true',
        help='Prints additional information about the submitted job.')
    list_parser.add_argument(
        '-t',
        '--print-times',
        action='store_true',
        help=
        'Prints timing information on when jobs were submitted, executed and finished'
    )
    list_parser.add_argument(
        '-x',
        '--print-dependencies',
        action='store_true',
        help='Print the dependencies of the jobs as well.')
    list_parser.add_argument(
        '-o',
        '--ids-only',
        action='store_true',
        help=
        'Prints ONLY the job ids (so that they can be parsed by automatic scripts).'
    )
    list_parser.add_argument(
        '-s',
        '--status',
        nargs='+',
        choices=Status,
        default=Status,
        help=
        'List only jobs that have the given statuses; by default all jobs are listed.'
    )
    list_parser.set_defaults(func=list)

    # subcommand 'communicate'
    stop_parser = cmdparser.add_parser(
        'communicate',
        aliases=['com'],
        formatter_class=formatter,
        help=
        'Communicates with the grid to see if there were unexpected errors (e.g. a timeout) during the job execution.'
    )
    stop_parser.add_argument(
        '-j',
        '--job-ids',
        metavar='ID',
        nargs='+',
        help=
        'Check only the jobs with the given ids (by default, all jobs are checked)'
    )
    stop_parser.set_defaults(func=communicate)

    # subcommand 'report'
    report_parser = cmdparser.add_parser(
        'report',
        aliases=['rep', 'r', 'explain', 'why'],
        formatter_class=formatter,
        help=
        'Iterates through the result and error log files and prints out the logs.'
    )
    report_parser.add_argument(
        '-e',
        '--errors-only',
        action='store_true',
        help='Only report the error logs (by default, both logs are reported).'
    )
    report_parser.add_argument(
        '-o',
        '--output-only',
        action='store_true',
        help=
        'Only report the output logs (by default, both logs are reported).')
    report_parser.add_argument(
        '-j',
        '--job-ids',
        metavar='ID',
        nargs='+',
        help=
        'Report only the jobs with the given ids (by default, all finished jobs are reported)'
    )
    report_parser.add_argument(
        '-a',
        '--array-ids',
        metavar='ID',
        nargs='+',
        help=
        'Report only the jobs with the given array ids. If specified, a single job-id must be given as well.'
    )
    report_parser.add_argument(
        '-n',
        '--name',
        help=
        "Report only the jobs with the given name; by default all jobs are reported."
    )
    report_parser.add_argument(
        '-s',
        '--status',
        nargs='+',
        choices=Status,
        default=Status,
        help=
        'Report only jobs that have the given statuses; by default all jobs are reported.'
    )
    report_parser.set_defaults(func=report)

    # subcommand 'delete'
    delete_parser = cmdparser.add_parser(
        'delete',
        aliases=['del', 'rm', 'remove'],
        formatter_class=formatter,
        help=
        'Removes jobs from the database; if jobs are running or are still scheduled in SGE, the jobs are also removed from the SGE queue.'
    )
    delete_parser.add_argument(
        '-j',
        '--job-ids',
        metavar='ID',
        nargs='+',
        help=
        'Delete only the jobs with the given ids (by default, all jobs are deleted).'
    )
    delete_parser.add_argument(
        '-a',
        '--array-ids',
        metavar='ID',
        nargs='+',
        help=
        'Delete only the jobs with the given array ids. If specified, a single job-id must be given as well. Note that the whole job including all array jobs will be removed from the SGE queue.'
    )
    delete_parser.add_argument(
        '-r',
        '--keep-logs',
        action='store_true',
        help='If set, the log files will NOT be removed.')
    delete_parser.add_argument(
        '-R',
        '--keep-log-dir',
        action='store_true',
        help='When removing the logs, keep the log directory.')
    delete_parser.add_argument(
        '-s',
        '--status',
        nargs='+',
        choices=Status,
        default=Status,
        help=
        'Delete only jobs that have the given statuses; by default all jobs are deleted.'
    )
    delete_parser.set_defaults(func=delete)

    # subcommand 'run_scheduler'
    scheduler_parser = cmdparser.add_parser(
        'run-scheduler',
        aliases=['sched', 'x'],
        formatter_class=formatter,
        help=
        'Runs the scheduler on the local machine. To stop the scheduler safely, please use Ctrl-C; only valid in combination with the \'--local\' option.'
    )
    scheduler_parser.add_argument(
        '-p',
        '--parallel',
        type=int,
        default=1,
        help=
        'Select the number of parallel jobs that you want to execute locally')
    scheduler_parser.add_argument(
        '-j',
        '--job-ids',
        metavar='ID',
        nargs='+',
        help=
        'Select the job ids that should be run (by default, all submitted and queued jobs are run).'
    )
    scheduler_parser.add_argument(
        '-s',
        '--sleep-time',
        type=float,
        default=0.1,
        help='Set the sleep time between scheduler iterations in seconds.')
    scheduler_parser.add_argument(
        '-x',
        '--die-when-finished',
        action='store_true',
        help=
        'Let the job manager die when it has finished all jobs of the database.'
    )
    scheduler_parser.add_argument(
        '-l',
        '--no-log-files',
        action='store_true',
        help=
        'Overwrites the log file setup to print the results to the console.')
    scheduler_parser.add_argument(
        '-n',
        '--nice',
        type=int,
        help=
        'Jobs will be run with the given priority (can only be positive, i.e., to have lower priority).'
    )
    scheduler_parser.set_defaults(func=run_scheduler)

    # subcommand 'run-job'; this should not be seen on the command line since it is actually a wrapper script
    run_parser = cmdparser.add_parser('run-job', help=argparse.SUPPRESS)
    run_parser.set_defaults(func=run_job)

    if command_line_options:
        args = parser.parse_args(command_line_options[1:])
        args.wrapper_script = command_line_options[0]
    else:
        args = parser.parse_args()
        args.wrapper_script = sys.argv[0]

    if not hasattr(args, "func"):
        return parser.print_help(sys.stderr)

    args.func(args)

    return 0
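main() treats the first element of command_line_options as the wrapper script and parses the rest (see its last lines), so the parser defined above can also be driven programmatically; a hedged sketch, with the actual call left commented out because it would open the local 'submitted.sql3' job database as a side effect:

import sys

# Hypothetical invocation sketch: the first list element plays the role of
# sys.argv[0] (the wrapper script); the remaining items are parsed by the
# argparse tree built above.
opts = [sys.argv[0], "-vv", "--local", "list", "--print-array-jobs"]
# main(opts)  # would list the jobs stored in the default 'submitted.sql3'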
Example no. 29
def qsub(command,
         queue=None,
         cwd=True,
         name=None,
         deps=[],
         stdout='',
         stderr='',
         env=[],
         array=None,
         context='grid',
         hostname=None,
         memfree=None,
         hvmem=None,
         gpumem=None,
         pe_opt=None,
         io_big=False,
         sge_extra_args=""):
    """Submits a shell job to a given grid queue

  Keyword parameters:

  command
    The command to be submitted to the grid

  queue
    A valid queue name or None, to use the default queue

  cwd
    If the job should change to the current working directory before starting

  name
    An optional name to set for the job. If not given, defaults to the script
    name being launched.

  deps
    Job ids to which this job will be dependent on

  stdout
    The standard output directory. If not given, defaults to what qsub has as a
    default.

  stderr
    The standard error directory (if not given, defaults to the stdout
    directory).

  env
    This is a list of extra variables that will be set on the environment
    running the command of your choice.

  array
    If set should be either:

    1. a string in the form m[-n[:s]] which indicates the starting range 'm',
       the closing range 'n' and the step 's'.
    2. an integer value indicating the total number of jobs to be submitted.
       This is equivalent to setting the parameter to a string "1-k:1" where "k" is
       the passed integer value
    3. a tuple that contains either 1, 2 or 3 elements indicating the start,
       end and step arguments ("m", "n", "s").

    The minimum value for "m" is 1. Giving "0" is an error.

    If submitted with this option, the job to be created will be an SGE
    parametric job. In this mode SGE does not allow individual control of each
    job. The environment variable SGE_TASK_ID will be set on the executing
    process automatically by SGE and indicates the unique identifier in the
    range for which the current job instance is for.

  context
    The setshell context in which we should try a 'qsub'. Normally you don't
    need to change the default. This variable can also be set to a context
    dictionary in which case we just set up using that context instead of
    probing for a new one, which can be faster.

  memfree
    If set, it asks the queue for a node with a minimum amount of memory
    Used only if mem is not set
    (cf. qsub -l mem_free=<...>)

  hvmem
    If set, it asks the queue for a node with a minimum amount of memory
    Used only if mem is not set
    (cf. qsub -l h_vmem=<...>)

  gpumem
    Applicable only for GPU-based queues. If set, it asks for the GPU queue
    with a minimum amount of memory. The amount should not be more than 24.
    (cf. qsub -l gpumem=<...>)

  hostname
    If set, it asks the queue to use only a subset of the available nodes
    Symbols: | for OR, & for AND, ! for NOT, etc.
    (cf. qsub -l hostname=<...>)

  pe_opt
    If set, add a -pe option when launching a job (for instance pe_exclusive* 1-)

  io_big
    If set to true, the io_big flag will be set.
    Use this flag if your process will need a lot of Input/Output operations.

  sge_extra_args
    This is used to send extra arguments to SGE. Note that they are passed
    directly to the `qsub` command. For example, `jman submit -e "-P project_name -l pytorch=true" -- ...`
    will be translated to `qsub -P project_name -l pytorch=true -- ...`


  Returns the job id assigned to this job (integer)
  """
    import six
    from bob.extension import rc

    scmd = ['qsub']

    prepend = rc.get('gridtk.sge.extra.args.prepend') or ""
    sge_extra_args = f"{prepend} {sge_extra_args or ''}"
    scmd += shlex.split(sge_extra_args)

    if isinstance(queue,
                  six.string_types) and queue not in ('all.q', 'default'):
        scmd += ['-l', queue]

    if memfree: scmd += ['-l', 'mem_free=%s' % memfree]
    if hvmem: scmd += ['-l', 'h_vmem=%s' % hvmem]

    if gpumem: scmd += ['-l', 'gpumem=%s' % gpumem]

    if io_big: scmd += ['-l', 'io_big']

    if hostname: scmd += ['-l', 'hostname=%s' % hostname]

    if pe_opt: scmd += ['-pe'] + pe_opt.split()

    if cwd: scmd += ['-cwd']

    if name: scmd += ['-N', name]

    if deps: scmd += ['-hold_jid', ','.join(['%d' % k for k in deps])]

    if stdout:

        if not cwd:
            # pivot, temporarily, to home directory
            curdir = os.path.realpath(os.curdir)
            os.chdir(os.environ['HOME'])

        if not os.path.exists(stdout): makedirs_safe(stdout)

        if not cwd:
            # go back
            os.chdir(os.path.realpath(curdir))

        scmd += ['-o', stdout]

    if stderr:
        if not os.path.exists(stderr): makedirs_safe(stderr)
        scmd += ['-e', stderr]
    elif stdout:  #just re-use the stdout settings
        scmd += ['-e', stdout]

    scmd += ['-terse']  # simplified job identifiers returned by the command line

    for k in env:
        scmd += ['-v', k]

    if array is not None:
        scmd.append('-t')
        if isinstance(array, six.string_types):
            try:
                i = int(array)
                scmd.append('1-%d:1' % i)
            except ValueError:
                # not a plain integer: must be a complete m[-n[:s]] specification
                scmd.append('%s' % array)
        elif isinstance(array, six.integer_types):
            scmd.append('1-%d:1' % array)
        elif isinstance(array, (tuple, list)):
            if len(array) < 1 or len(array) > 3:
                raise RuntimeError(
                    "Array tuple should have length between 1 and 3")
            elif len(array) == 1:
                scmd.append('%s' % array[0])
            elif len(array) == 2:
                scmd.append('%s-%s' % (array[0], array[1]))
            elif len(array) == 3:
                scmd.append('%s-%s:%s' % (array[0], array[1], array[2]))

    if not isinstance(command, (list, tuple)): command = [command]
    scmd += command

    logger.debug("Qsub command '%s'", ' '.join(scmd))

    from .setshell import sexec
    jobid = str_(sexec(context, scmd))
    return int(jobid.split("\n")[-1].split('.', 1)[0])
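# Usage sketch: a minimal, hypothetical call to the qsub() helper above. The
# command, job name, memory request and log directory are placeholder values;
# it assumes an SGE installation reachable through the default 'grid'
# setshell context.
job_id = qsub(
    ["python", "train.py"],  # command to run on the grid node
    name="train-model",      # job name (otherwise derived from the script name)
    memfree="8G",            # cf. qsub -l mem_free=8G
    stdout="logs",           # stdout/stderr directory (created if missing)
    array=(1, 10, 1),        # expands to the SGE task range "1-10:1"
)
print("submitted job", job_id)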
Example no. 30
0
def main(command_line_options=None):

    from bob.extension import rc

    from ..config import __version__

    formatter = argparse.ArgumentDefaultsHelpFormatter
    parser = argparse.ArgumentParser(
        description=__doc__, epilog=__epilog__, formatter_class=formatter
    )
    # part of the hack to support aliases in subparsers
    parser.register("action", "parsers", AliasedSubParsersAction)

    # general options
    parser.add_argument(
        "-v",
        "--verbose",
        action="count",
        default=0,
        help="Increase the verbosity level from 0 (only error messages) to 1 (warnings), 2 (log messages), 3 (debug information) by adding the --verbose option as often as desired (e.g. '-vvv' for debug).",
    )
    parser.add_argument(
        "-V",
        "--version",
        action="version",
        version="GridTk version %s" % __version__,
    )
    parser.add_argument(
        "-d",
        "--database",
        "--db",
        metavar="DATABASE",
        default="submitted.sql3",
        help='replace the default database "submitted.sql3" by one provided by you.',
    )

    parser.add_argument(
        "-l",
        "--local",
        action="store_true",
        help="Uses the local job manager instead of the SGE one.",
    )
    cmdparser = parser.add_subparsers(
        title="commands", help="commands accepted by %(prog)s"
    )

    # subcommand 'submit'
    submit_parser = cmdparser.add_parser(
        "submit",
        aliases=["sub"],
        formatter_class=formatter,
        help="Submits jobs to the SGE queue or to the local job scheduler and logs them in a database.",
    )
    submit_parser.add_argument(
        "-q",
        "--queue",
        metavar="QNAME",
        dest="qname",
        default="all.q",
        choices=QUEUES,
        help="the name of the SGE queue to submit the job to",
    )
    submit_parser.add_argument(
        "-e",
        "--sge-extra-args",
        default=rc.get("gridtk.sge.extra.args.default", ""),
        type=str,
        help="Passes extra arguments to qsub. See the documentation of the package for usage and ways of overriding default behavior.",
    )
    submit_parser.add_argument(
        "-m",
        "--memory",
        help="Sets both the h_vmem and the mem_free parameters when submitting "
        "the job to a non-GPU queue, e.g., 8G to set the memory "
        "requirements to 8 gigabytes. Sets gpumem parameter when "
        "submitting the job to a GPU-based queue.",
    )
    submit_parser.add_argument(
        "-p",
        "--parallel",
        "--pe_mth",
        type=int,
        help="Sets the number of slots per job (-pe pe_mth) and multiplies the mem_free parameter. E.g. to get 16 G of memory, use -m 8G -p 2.",
    )
    submit_parser.add_argument(
        "-n", "--name", dest="name", help="Gives the job a name"
    )
    submit_parser.add_argument(
        "-x",
        "--dependencies",
        type=int,
        default=[],
        metavar="ID",
        nargs="*",
        help="Set job dependencies to the list of job identifiers separated by spaces",
    )
    submit_parser.add_argument(
        "-k",
        "--stop-on-failure",
        action="store_true",
        help="Stop depending jobs when this job finished with an error.",
    )
    submit_parser.add_argument(
        "-d",
        "--exec-dir",
        metavar="DIR",
        help="Sets the executing directory, where the script should be executed. If not given, jobs will be executed in the current directory",
    )
    submit_parser.add_argument(
        "-l",
        "--log-dir",
        default="logs",
        metavar="DIR",
        help="Sets the log directory.",
    )
    submit_parser.add_argument(
        "-s",
        "--environment",
        metavar="KEY=VALUE",
        dest="env",
        nargs="*",
        default=[],
        help="Passes specific environment variables to the job.",
    )
    submit_parser.add_argument(
        "-t",
        "--array",
        "--parametric",
        metavar="(first-)last(:step)",
        help="Creates a parametric (array) job. You must specify the 'last' value, but 'first' (default=1) and 'step' (default=1) can be specified as well (when specifying 'step', 'first' has to be given, too).",
    )
    submit_parser.add_argument(
        "-z",
        "--dry-run",
        action="store_true",
        help="Do not really submit anything, just print out what would submit in this case",
    )
    submit_parser.add_argument(
        "-i",
        "--io-big",
        action="store_true",
        help='Sets "io_big" on the submitted jobs so it limits the machines in which the job is submitted to those that can do high-throughput.',
    )
    submit_parser.add_argument(
        "-r",
        "--repeat",
        type=int,
        metavar="N",
        default=1,
        help="Submits the job N times. Each job will depend on the job before.",
    )
    submit_parser.add_argument(
        "-o",
        "--print-id",
        action="store_true",
        help="Prints the new job id (so that they can be parsed by automatic scripts).",
    )
    submit_parser.add_argument(
        "job",
        metavar="command",
        nargs=argparse.REMAINDER,
        help="The job that should be executed. Sometimes a -- is required to separate the job from other command line options.",
    )
    submit_parser.set_defaults(func=submit)

    # subcommand 're-submit'
    resubmit_parser = cmdparser.add_parser(
        "resubmit",
        aliases=["reset", "requeue", "re"],
        formatter_class=formatter,
        help="Re-submits a list of jobs.",
    )
    resubmit_parser.add_argument(
        "-j",
        "--job-ids",
        metavar="ID",
        nargs="+",
        help="Re-submit only the jobs with the given ids (by default, all finished jobs are re-submitted).",
    )
    resubmit_parser.add_argument(
        "-q",
        "--queue",
        metavar="QNAME",
        dest="qname",
        choices=QUEUES,
        help="Reset the SGE queue to submit the job to",
    )
    resubmit_parser.add_argument(
        "-m",
        "--memory",
        help="Resets both the h_vmem and the mem_free parameters when "
        "submitting the job to a non-GPU queue, e.g., 8G "
        "to set the memory requirements to 8 gigabytes. Resets gpumem "
        "parameter when submitting the job to a GPU-based queue.",
    )
    resubmit_parser.add_argument(
        "-p",
        "--parallel",
        "--pe_mth",
        type=int,
        help="Resets the number of slots per job (-pe pe_mth) and multiplies the mem_free parameter. E.g. to get 16 G of memory, use -m 8G -p 2.",
    )
    resubmit_parser.add_argument(
        "-i",
        "--io-big",
        action="store_true",
        help='Resubmits the job to the "io_big" queue.',
    )
    resubmit_parser.add_argument(
        "-I",
        "--no-io-big",
        action="store_true",
        help='Resubmits the job NOT to the "io_big" queue.',
    )
    resubmit_parser.add_argument(
        "-k",
        "--keep-logs",
        action="store_true",
        help="Do not clean the log files of the old job before re-submitting.",
    )
    resubmit_parser.add_argument(
        "-s",
        "--also-success",
        action="store_true",
        help="Re-submit also jobs that have finished successfully.",
    )
    resubmit_parser.add_argument(
        "-a",
        "--running-jobs",
        action="store_true",
        help="Re-submit even jobs that are running or waiting (use this flag with care).",
    )
    resubmit_parser.add_argument(
        "-o",
        "--overwrite-command",
        nargs=argparse.REMAINDER,
        help="Overwrite the command line (of a single job) that should be executed (useful to keep job dependencies).",
    )
    resubmit_parser.set_defaults(func=resubmit)

    # subcommand 'stop'
    stop_parser = cmdparser.add_parser(
        "stop",
        formatter_class=formatter,
        help="Stops the execution of jobs in the grid.",
    )
    stop_parser.add_argument(
        "-j",
        "--job-ids",
        metavar="ID",
        nargs="+",
        help="Stop only the jobs with the given ids (by default, all jobs are stopped).",
    )
    stop_parser.set_defaults(func=stop)

    # subcommand 'list'
    list_parser = cmdparser.add_parser(
        "list",
        aliases=["ls"],
        formatter_class=formatter,
        help="Lists jobs stored in the database. Use the -vv option to get a long listing.",
    )
    list_parser.add_argument(
        "-j",
        "--job-ids",
        metavar="ID",
        nargs="+",
        help="List only the jobs with the given ids (by default, all jobs are listed)",
    )
    list_parser.add_argument(
        "-n",
        "--names",
        metavar="NAME",
        nargs="+",
        help="List only the jobs with the given names (by default, all jobs are listed)",
    )
    list_parser.add_argument(
        "-a",
        "--print-array-jobs",
        action="store_true",
        help="Also list the array ids.",
    )
    list_parser.add_argument(
        "-l",
        "--long",
        action="store_true",
        help="Prints additional information about the submitted job.",
    )
    list_parser.add_argument(
        "-t",
        "--print-times",
        action="store_true",
        help="Prints timing information on when jobs were submited, executed and finished",
    )
    list_parser.add_argument(
        "-x",
        "--print-dependencies",
        action="store_true",
        help="Print the dependencies of the jobs as well.",
    )
    list_parser.add_argument(
        "-o",
        "--ids-only",
        action="store_true",
        help="Prints ONLY the job ids (so that they can be parsed by automatic scripts).",
    )
    list_parser.add_argument(
        "-s",
        "--status",
        nargs="+",
        choices=Status,
        default=Status,
        help="Delete only jobs that have the given statuses; by default all jobs are deleted.",
    )
    list_parser.set_defaults(func=list)

    # subcommand 'communicate'
    stop_parser = cmdparser.add_parser(
        "communicate",
        aliases=["com"],
        formatter_class=formatter,
        help="Communicates with the grid to see if there were unexpected errors (e.g. a timeout) during the job execution.",
    )
    stop_parser.add_argument(
        "-j",
        "--job-ids",
        metavar="ID",
        nargs="+",
        help="Check only the jobs with the given ids (by default, all jobs are checked)",
    )
    stop_parser.set_defaults(func=communicate)

    # subcommand 'report'
    report_parser = cmdparser.add_parser(
        "report",
        aliases=["rep", "r", "explain", "why"],
        formatter_class=formatter,
        help="Iterates through the result and error log files and prints out the logs.",
    )
    report_parser.add_argument(
        "-e",
        "--errors-only",
        action="store_true",
        help="Only report the error logs (by default, both logs are reported).",
    )
    report_parser.add_argument(
        "-o",
        "--output-only",
        action="store_true",
        help="Only report the output logs  (by default, both logs are reported).",
    )
    report_parser.add_argument(
        "-j",
        "--job-ids",
        metavar="ID",
        nargs="+",
        help="Report only the jobs with the given ids (by default, all finished jobs are reported)",
    )
    report_parser.add_argument(
        "-a",
        "--array-ids",
        metavar="ID",
        nargs="+",
        help="Report only the jobs with the given array ids. If specified, a single job-id must be given as well.",
    )
    report_parser.add_argument(
        "-n",
        "--name",
        help="Report only the jobs with the given name; by default all jobs are reported.",
    )
    report_parser.add_argument(
        "-s",
        "--status",
        nargs="+",
        choices=Status,
        default=Status,
        help="Report only jobs that have the given statuses; by default all jobs are reported.",
    )
    report_parser.set_defaults(func=report)

    # subcommand 'delete'
    delete_parser = cmdparser.add_parser(
        "delete",
        aliases=["del", "rm", "remove"],
        formatter_class=formatter,
        help="Removes jobs from the database; if jobs are running or are still scheduled in SGE, the jobs are also removed from the SGE queue.",
    )
    delete_parser.add_argument(
        "-j",
        "--job-ids",
        metavar="ID",
        nargs="+",
        help="Delete only the jobs with the given ids (by default, all jobs are deleted).",
    )
    delete_parser.add_argument(
        "-a",
        "--array-ids",
        metavar="ID",
        nargs="+",
        help="Delete only the jobs with the given array ids. If specified, a single job-id must be given as well. Note that the whole job including all array jobs will be removed from the SGE queue.",
    )
    delete_parser.add_argument(
        "-r",
        "--keep-logs",
        action="store_true",
        help="If set, the log files will NOT be removed.",
    )
    delete_parser.add_argument(
        "-R",
        "--keep-log-dir",
        action="store_true",
        help="When removing the logs, keep the log directory.",
    )
    delete_parser.add_argument(
        "-s",
        "--status",
        nargs="+",
        choices=Status,
        default=Status,
        help="Delete only jobs that have the given statuses; by default all jobs are deleted.",
    )
    delete_parser.set_defaults(func=delete)

    # subcommand 'run_scheduler'
    scheduler_parser = cmdparser.add_parser(
        "run-scheduler",
        aliases=["sched", "x"],
        formatter_class=formatter,
        help="Runs the scheduler on the local machine. To stop the scheduler safely, please use Ctrl-C; only valid in combination with the '--local' option.",
    )
    scheduler_parser.add_argument(
        "-p",
        "--parallel",
        type=int,
        default=1,
        help="Select the number of parallel jobs that you want to execute locally",
    )
    scheduler_parser.add_argument(
        "-j",
        "--job-ids",
        metavar="ID",
        nargs="+",
        help="Select the job ids that should be run (be default, all submitted and queued jobs are run).",
    )
    scheduler_parser.add_argument(
        "-s",
        "--sleep-time",
        type=float,
        default=0.1,
        help="Set the sleep time between for the scheduler in seconds.",
    )
    scheduler_parser.add_argument(
        "-x",
        "--die-when-finished",
        action="store_true",
        help="Let the job manager die when it has finished all jobs of the database.",
    )
    scheduler_parser.add_argument(
        "-l",
        "--no-log-files",
        action="store_true",
        help="Overwrites the log file setup to print the results to the console.",
    )
    scheduler_parser.add_argument(
        "-n",
        "--nice",
        type=int,
        help="Jobs will be run with the given priority (can only be positive, i.e., to have lower priority",
    )
    scheduler_parser.set_defaults(func=run_scheduler)

    # subcommand 'run-job'; this should not be seen on the command line since it is actually a wrapper script
    run_parser = cmdparser.add_parser("run-job", help=argparse.SUPPRESS)
    run_parser.set_defaults(func=run_job)

    if command_line_options:
        args = parser.parse_args(command_line_options[1:])
        args.wrapper_script = command_line_options[0]
    else:
        args = parser.parse_args()
        args.wrapper_script = sys.argv[0]

    if not hasattr(args, "func"):
        return parser.print_help(sys.stderr)

    args.func(args)

    return 0
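# Usage sketch: the parser above is what backs the `jman` command line. The
# hypothetical calls below drive it programmatically; the first list element
# plays the role of sys.argv[0] and every other value is a placeholder.
# Submit a toy job to the local scheduler, run it, then inspect its logs.
main(["jman", "--local", "submit", "--log-dir", "logs", "--", "echo", "hello"])
main(["jman", "--local", "run-scheduler", "--die-when-finished"])
main(["jman", "-vv", "--local", "list"])
main(["jman", "--local", "report"])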