def __init__(
    self,
    protocol,
    annotation_type="eyes-center",
    fixed_positions=None,
    dataset_original_directory=rc.get("bob.db.mobio.directory", ""),
    dataset_original_extension=rc.get("bob.db.mobio.extension", ".png"),
):
    """Configure the CSV-based interface of the ``mobio`` database.

    Parameters
    ----------
    protocol
        Name of the protocol, forwarded to the parent constructor.
    annotation_type
        Forwarded unchanged to the parent constructor.
    fixed_positions
        Forwarded unchanged to the parent constructor.
    dataset_original_directory
        Root directory of the raw data; defaults to the
        ``bob.db.mobio.directory`` configuration value (or ``""``).
    dataset_original_extension
        Extension of the raw data files; defaults to the
        ``bob.db.mobio.extension`` configuration value (or ``".png"``).
    """
    # Protocol definition archive, resolved through `get_file`
    protocol_archive = get_file(
        "mobio.tar.gz",
        MobioDatabase.urls(),
        file_hash="4a7f99b33a54b2dd337ddcaecb09edb8",
    )

    # CSV rows are turned into samples first, then `EyesAnnotations` is applied
    csv_loader = CSVToSampleLoaderBiometrics(
        data_loader=bob.io.base.load,
        dataset_original_directory=dataset_original_directory,
        extension=dataset_original_extension,
    )
    loading_pipeline = make_pipeline(csv_loader, EyesAnnotations())

    super().__init__(
        name="mobio",
        dataset_protocol_path=protocol_archive,
        protocol=protocol,
        csv_to_sample_loader=loading_pipeline,
        annotation_type=annotation_type,
        fixed_positions=fixed_positions,
    )
def __init__(
    self,
    protocol,
    dataset_original_directory=rc.get("bob.bio.face.vgg2.directory", ""),
    dataset_original_extension=rc.get("bob.bio.face.vgg2.extension", ".jpg"),
    annotation_type="eyes-center",
    fixed_positions=None,
):
    """Configure the CSV-based interface of the ``vgg2`` database.

    Parameters
    ----------
    protocol
        Name of the protocol, forwarded to the parent constructor.
    dataset_original_directory
        Root directory of the raw data; defaults to the
        ``bob.bio.face.vgg2.directory`` configuration value (or ``""``).
    dataset_original_extension
        Extension of the raw data files; defaults to the
        ``bob.bio.face.vgg2.extension`` configuration value (or ``".jpg"``).
    annotation_type
        Forwarded unchanged to the parent constructor.
    fixed_positions
        Forwarded unchanged to the parent constructor.
    """
    # Protocol definition archive, resolved through `get_file`
    protocol_archive = get_file(
        "vgg2.tar.gz",
        VGG2Database.urls(),
        file_hash="4a05d797a326374a6b52bcd8d5a89d48",
    )

    # CSV rows are turned into samples first, then `VGG2Annotations` is applied
    csv_loader = CSVToSampleLoaderBiometrics(
        data_loader=bob.io.base.load,
        dataset_original_directory=dataset_original_directory,
        extension=dataset_original_extension,
    )
    loading_pipeline = make_pipeline(csv_loader, VGG2Annotations())

    super().__init__(
        name="vgg2",
        dataset_protocol_path=protocol_archive,
        protocol=protocol,
        csv_to_sample_loader=loading_pipeline,
        annotation_type=annotation_type,
        fixed_positions=fixed_positions,
    )
def __init__(self, *args, **kwargs):
    """Initialise the scheduler with configurable timeout and failure limits.

    ``idle_timeout`` and ``allowed_failures`` are read from the
    ``bob.pipelines.sge.idle_timeout`` and
    ``bob.pipelines.sge.allowed_failures`` configuration values, falling
    back to 3600 and 100 when those values are ``None``. All positional and
    keyword arguments are forwarded to the parent constructor.
    """
    idle_timeout = rc.get("bob.pipelines.sge.idle_timeout")
    if idle_timeout is None:
        idle_timeout = 3600

    allowed_failures = rc.get("bob.pipelines.sge.allowed_failures")
    if allowed_failures is None:
        allowed_failures = 100

    super(SchedulerResourceRestriction, self).__init__(
        *args,
        idle_timeout=idle_timeout,
        allowed_failures=allowed_failures,
        synchronize_worker_interval="10s",
        **kwargs,
    )

    # Register this object's method under its own name in the handlers table
    handler_name = "get_no_worker_tasks_resource_restrictions"
    self.handlers[handler_name] = self.get_no_worker_tasks_resource_restrictions
def __init__(
    self,
    protocol,
    annotation_type="eyes-center",
    fixed_positions=None,
    dataset_original_directory=rc.get("bob.db.morph.directory", ""),
    dataset_original_extension=".JPG",
):
    """Configure the CSV-based interface of the ``morph`` database.

    Parameters
    ----------
    protocol
        Name of the protocol, forwarded to the parent constructor.
    annotation_type
        Forwarded unchanged to the parent constructor.
    fixed_positions
        Forwarded unchanged to the parent constructor.
    dataset_original_directory
        Root directory of the raw data; defaults to the
        ``bob.db.morph.directory`` configuration value. Any falsy value
        (e.g. ``None``) is replaced by ``""``.
    dataset_original_extension
        Extension of the raw data files.
    """
    # Protocol definition archive, resolved through `get_file`
    protocol_archive = get_file(
        "morph.tar.gz",
        MorphDatabase.urls(),
        file_hash="9efa1ff13ef6984ebfcf86f1b1f58873",
    )

    # CSV rows are turned into samples first, then `EyesAnnotations` is applied
    csv_loader = CSVToSampleLoaderBiometrics(
        data_loader=bob.io.base.load,
        dataset_original_directory=dataset_original_directory or "",
        extension=dataset_original_extension,
    )
    loading_pipeline = make_pipeline(csv_loader, EyesAnnotations())

    super().__init__(
        name="morph",
        dataset_protocol_path=protocol_archive,
        protocol=protocol,
        csv_to_sample_loader=loading_pipeline,
        annotation_type=annotation_type,
        fixed_positions=fixed_positions,
    )
def __init__(self, protocol, annotation_type="eyes-center", fixed_positions=None):
    """Configure the CSV-based interface of the ``frgc`` database.

    The raw data directory is read from the ``bob.bio.face.frgc.directory``
    configuration value (``""`` when unset) and no file extension is
    appended. After the parent constructor has run, ``hash_fn`` is set to
    ``hash_string``.

    Parameters
    ----------
    protocol
        Name of the protocol, forwarded to the parent constructor.
    annotation_type
        Forwarded unchanged to the parent constructor.
    fixed_positions
        Forwarded unchanged to the parent constructor.
    """
    # Protocol definition archive, resolved through `get_file`
    protocol_archive = get_file(
        "frgc.tar.gz",
        FRGCDatabase.urls(),
        file_hash="242168e993fe0f6f29bd59fccf3c79a0",
    )

    # CSV rows are turned into samples first, then `EyesAnnotations` is applied
    csv_loader = CSVToSampleLoaderBiometrics(
        data_loader=bob.io.base.load,
        dataset_original_directory=rc.get("bob.bio.face.frgc.directory", ""),
        extension="",
        reference_id_equal_subject_id=False,
    )
    loading_pipeline = make_pipeline(csv_loader, EyesAnnotations())

    super().__init__(
        name="frgc",
        dataset_protocol_path=protocol_archive,
        protocol=protocol,
        csv_to_sample_loader=loading_pipeline,
        annotation_type=annotation_type,
        fixed_positions=fixed_positions,
        score_all_vs_all=True,
        group_probes_by_reference_id=True,
        memory_demanding=True,
    )

    self.hash_fn = hash_string
def __init__(
    self,
    protocol,
    annotation_type="eyes-center",
    fixed_positions=None,
    dataset_original_directory=rc.get("bob.db.meds.directory", ""),
    dataset_original_extension=".jpg",
):
    """Configure the CSV-based interface of the ``meds`` database.

    Parameters
    ----------
    protocol
        Name of the protocol, forwarded to the parent constructor.
    annotation_type
        Forwarded unchanged to the parent constructor.
    fixed_positions
        Forwarded unchanged to the parent constructor.
    dataset_original_directory
        Root directory of the raw data; defaults to the
        ``bob.db.meds.directory`` configuration value. Any falsy value
        (e.g. ``None``) is replaced by ``""``.
    dataset_original_extension
        Extension of the raw data files.
    """
    # Protocol definition archive, resolved through `get_file`
    protocol_archive = get_file(
        "meds.tar.gz",
        MEDSDatabase.urls(),
        file_hash="3b01354d4c170672ac14120b80dace75",
    )

    # CSV rows are turned into samples first, then `EyesAnnotations` is applied
    csv_loader = CSVToSampleLoaderBiometrics(
        data_loader=bob.io.base.load,
        dataset_original_directory=dataset_original_directory or "",
        extension=dataset_original_extension,
    )
    loading_pipeline = make_pipeline(csv_loader, EyesAnnotations())

    super().__init__(
        name="meds",
        dataset_protocol_path=protocol_archive,
        protocol=protocol,
        csv_to_sample_loader=loading_pipeline,
        annotation_type=annotation_type,
        fixed_positions=fixed_positions,
    )
def test_msceleb():
    """Check class counts, demographic keys and batch shape of MSCeleb.

    The dataset is built twice, with and without the samples of unknown
    demographics, from the directory stored in the
    ``bob.bio.face.msceleb.directory`` configuration value.
    """
    database_path = rc.get("bob.bio.face.msceleb.directory")

    # WITH UNKNOWN DEMOGRAPHICS
    dataset = MSCelebTorchDataset(database_path, include_unknow_demographics=True)
    assert dataset.n_classes == 89735
    assert len(dataset.demographic_keys) == 18

    dataloader = DataLoader(dataset, batch_size=64, shuffle=True)
    batch = next(iter(dataloader))
    # This comparison used to be a bare expression and was never checked
    assert batch["data"].shape == (64, 3, 112, 112)

    # WITHOUT UNKNOWN DEMOGRAPHICS
    dataset = MSCelebTorchDataset(database_path, include_unknow_demographics=False)
    assert dataset.n_classes == 81279
    assert len(dataset.demographic_keys) == 15

    dataloader = DataLoader(dataset, batch_size=64, shuffle=True)
    batch = next(iter(dataloader))
    assert batch["data"].shape == (64, 3, 112, 112)

    # The demographic class weights are expected to sum (approximately) to 1
    weights = dataset.get_demographic_class_weights()
    assert np.allclose(sum(weights), 1, atol=0.001)
def __init__(
    self,
    protocol,
    original_directory=rc.get("bob.bio.face.ijbc.directory"),
    **kwargs,
):
    """Set up the ``ijbc`` database interface.

    Parameters
    ----------
    protocol
        Name of the protocol; validated with ``_check_protocol`` and then
        used to load the metadata.
    original_directory
        Root directory of the dataset; defaults to the
        ``bob.bio.face.ijbc.directory`` configuration value. The
        ``images`` and ``protocols`` sub-directories are derived from it.
    **kwargs
        Accepted but not used by this constructor.

    Raises
    ------
    ValueError
        If ``original_directory`` is ``None`` or does not exist.
    """
    directory_is_usable = original_directory is not None and os.path.exists(
        original_directory
    )
    if not directory_is_usable:
        raise ValueError(
            f"Invalid or non existent `original_directory`: {original_directory}"
        )

    self._check_protocol(protocol)

    super().__init__(
        name="ijbc",
        protocol=protocol,
        score_all_vs_all=False,
        annotation_type="bounding-box",
        fixed_positions=None,
        memory_demanding=True,
    )

    self.image_directory = os.path.join(original_directory, "images")
    self.protocol_directory = os.path.join(original_directory, "protocols")

    # Caches start empty
    self._cached_probes = None
    self._cached_references = None
    self.hash_fn = hash_string

    self._load_metadata(protocol)

    # The test4 protocols are scored all-vs-all
    if "test4" in protocol:
        self.score_all_vs_all = True
def test_replaymobile():
    """Check samples, annotations and list sizes of ``replaymobile-img``.

    The pixel-level checks at the end only run when the
    ``bob.db.replaymobile.directory`` configuration value is set.
    """
    database = bob.bio.base.load_resource(
        "replaymobile-img", "database", preferred_package="bob.bio.face"
    )

    # First sample of the first probe entry
    first_probe = database.probes()[0]
    sample = first_probe[0]

    expected_path = "devel/real/client005_session02_authenticate_mobile_adverse"
    assert sample.path == expected_path, sample.path
    assert sample.frame == "12", sample.frame
    assert sample.should_flip, sample
    assert hasattr(sample, "annotations")
    assert "reye" in sample.annotations
    assert "leye" in sample.annotations
    assert hasattr(sample, "path")
    assert hasattr(sample, "frame")

    # Sizes of the reference and probe lists
    assert len(database.references()) == 16
    assert len(database.references(group="eval")) == 12
    assert len(database.probes()) == 4160
    assert len(database.probes(group="eval")) == 3020

    expected_annotations = {
        "bottomright": [734, 407],
        "topleft": [436, 182],
        "leye": [541, 350],
        "reye": [540, 245],
        "mouthleft": [655, 254],
        "mouthright": [657, 338],
        "nose": [591, 299],
    }
    assert sample.annotations == expected_annotations

    # test another sample where should_flip is False
    unflipped_samples = [s for s in database.all_samples() if not s.should_flip]
    sample2 = unflipped_samples[0]

    expected_path2 = "enroll/train/client001_session01_enroll_tablet_lightoff"
    assert sample2.path == expected_path2, sample2.path
    assert sample2.frame == "12", sample2.frame
    assert not sample2.should_flip, sample2

    expected_annotations2 = {
        "reye": [515, 267],
        "leye": [516, 399],
        "nose": [576, 332],
        "mouthleft": [662, 282],
        "mouthright": [664, 384],
        "topleft": [372, 196],
        "bottomright": [761, 480],
    }
    assert sample2.annotations == expected_annotations2, dict(sample2.annotations)

    # Only if data is available
    if rc.get("bob.db.replaymobile.directory", None):
        assert sample.data.shape == (3, 1280, 720), sample.data.shape
        assert sample.data[0, 0, 0] == 94, sample.data[0, 0, 0]
        assert sample2.data.shape == (3, 1280, 720), sample2.data.shape
        assert sample2.data[0, 0, 0] == 129, sample2.data[0, 0, 0]
def __init__(
    self,
    protocol,
    annotation_type="bounding-box",
    fixed_positions=None,
    original_directory=rc.get("bob.bio.video.youtube.directory", ""),
    extension=".jpg",
    annotation_extension=".labeled_faces.txt",
    frame_selector=None,
):
    """Set up the ``youtube`` database interface.

    Parameters
    ----------
    protocol
        Name of the protocol; validated with ``_check_protocol``.
    annotation_type
        Forwarded to the parent constructor.
    fixed_positions
        Accepted for interface compatibility (see the note at the parent
        constructor call).
    original_directory
        Root directory of the data; defaults to the
        ``bob.bio.video.youtube.directory`` configuration value. A falsy
        value is replaced by ``""``; a missing directory only triggers a
        warning.
    extension
        Stored as ``self.extension``.
    annotation_extension
        Stored as ``self.annotation_extension``.
    frame_selector
        Stored as ``self.frame_selector``.
    """
    self._check_protocol(protocol)

    original_directory = original_directory or ""
    if not os.path.exists(original_directory):
        # The `f` prefix used to sit inside the literal, so the path was
        # never interpolated; the two sentences also lacked a separator.
        logger.warning(
            f"Invalid or non existent `original_directory`: {original_directory}. "
            "Please, do `bob config set bob.bio.video.youtube.directory PATH` to set the Youtube data directory."
        )

    # Protocol files, resolved (and extracted) through `get_file`
    urls = YoutubeDatabase.urls()
    cache_subdir = os.path.join("datasets", "youtube_protocols")
    self.filename = get_file(
        "youtube_protocols-6962cd2e.tar.gz",
        urls,
        file_hash="8a4792872ff30b37eab7f25790b0b10d",
        extract=True,
        cache_subdir=cache_subdir,
    )
    self.protocol_path = os.path.dirname(self.filename)

    self.references_dict = {}
    self.probes_dict = {}

    # Dict that holds a `subject_id` as a key and has
    # filenames as values
    self.subject_id_files = {}
    self.reference_id_to_subject_id = None
    self.reference_id_to_sample = None
    self.load_file_client_id()

    self.original_directory = original_directory
    self.extension = extension
    self.annotation_extension = annotation_extension
    self.frame_selector = frame_selector

    # NOTE(review): the `fixed_positions` argument is not forwarded here;
    # `None` is always passed. Kept as-is to preserve behaviour - confirm
    # whether this is intentional.
    super().__init__(
        name="youtube",
        protocol=protocol,
        score_all_vs_all=False,
        annotation_type=annotation_type,
        fixed_positions=None,
        memory_demanding=True,
    )
def __init__(
    self,
    protocol,
    original_directory=rc.get("bob.bio.face.rfw.directory"),
    **kwargs,
):
    """Set up the ``rfw`` database interface.

    Parameters
    ----------
    protocol
        Name of the protocol; validated with ``_check_protocol``.
    original_directory
        Root directory of the dataset; defaults to the
        ``bob.bio.face.rfw.directory`` configuration value.
    **kwargs
        Accepted but not used by this constructor.

    Raises
    ------
    ValueError
        If ``original_directory`` is ``None`` or does not exist.
    """
    if original_directory is None or not os.path.exists(original_directory):
        # The `f` prefix used to sit inside the literal, so the offending
        # path was never shown in the error message.
        raise ValueError(
            f"Invalid or non existent `original_directory`: {original_directory}"
        )

    self._check_protocol(protocol)
    self._races = ["African", "Asian", "Caucasian", "Indian"]
    self.original_directory = original_directory
    self._default_extension = ".jpg"

    super().__init__(
        name="rfw",
        protocol=protocol,
        score_all_vs_all=False,
        annotation_type="eyes-center",
        fixed_positions=None,
        memory_demanding=False,
    )

    self._pairs = dict()
    self._first_reference_of_subject = dict()  # Used with the Idiap protocol
    self._inverted_pairs = dict()
    self._id_race = dict()  # ID --> RACE
    self._race_ids = dict()  # RACE --> ID
    self._landmarks = dict()

    # Caches start empty
    self._cached_biometric_references = None
    self._cached_probes = None
    self._cached_zprobes = None
    self._cached_treferences = None

    self._discarded_subjects = []  # Some subjects were labeled with both races

    self._load_metadata(target_set="test")

    # Reset before computing, in case `_get_demographics_dict` reads it
    self._demographics = None
    self._demographics = self._get_demographics_dict()

    # Setting the seed for the IDIAP PROTOCOL,
    # so we have a consistent set of probes
    self._idiap_protocol_seed = 652

    # Number of samples used to Z-Norm and T-Norm (per race)
    self._nzprobes = 25
    self._ntreferences = 25
def get_cache_path(self):
    """Return the path of the cached-bucket pickle for the current protocol.

    The file name depends on whether ``self.bob_dataset.protocol`` is
    ``"vgg2-short"``. The file lives in the ``datasets`` folder under the
    ``bob_data_folder`` configuration value, which defaults to
    ``~/bob_data``.
    """
    if self.bob_dataset.protocol == "vgg2-short":
        cache_name = "vgg2_short_cached_bucket.pickle"
    else:
        cache_name = "vgg2_full_cached_bucket.pickle"

    default_root = os.path.join(os.path.expanduser("~"), "bob_data")
    data_root = rc.get("bob_data_folder", default_root)
    return os.path.join(data_root, "datasets", cache_name)
def get_cache_path(self):
    """Return the path of the cached-bucket CSV for the current setting.

    The file name depends on ``self.include_unknow_demographics``. The file
    lives in the ``datasets`` folder under the ``bob_data_folder``
    configuration value, which defaults to ``~/bob_data``.
    """
    if self.include_unknow_demographics:
        cache_name = "msceleb_cached_bucket_WITH_unknow_demographics.csv"
    else:
        cache_name = "msceleb_cached_bucket_WITHOUT_unknow_demographics.csv"

    default_root = os.path.join(os.path.expanduser("~"), "bob_data")
    data_root = rc.get("bob_data_folder", default_root)
    return os.path.join(data_root, "datasets", cache_name)
def test_siamese():
    """Check the batch shapes produced by ``SiameseDemographicWrapper``.

    The wrapped dataset is the ``vgg2-short`` protocol of
    ``VGG2TorchDataset``, read from the directory stored in the
    ``bob.bio.face.vgg2-crops.directory`` configuration value.
    """
    siamese_transforms = get_standard_data_augmentation()
    database_path = rc.get("bob.bio.face.vgg2-crops.directory")

    dataset = VGG2TorchDataset(
        protocol="vgg2-short",
        database_path=database_path,
        database_extension=".jpg",
        transform=siamese_transforms,
    )

    siamese_dataset = SiameseDemographicWrapper(
        dataset, max_positive_pairs_per_subject=5, negative_pairs_per_subject=3
    )

    dataloader = DataLoader(siamese_dataset, batch_size=64, shuffle=True)
    batch = next(iter(dataloader))

    # Both elements of the pair are checked; these comparisons used to be
    # bare expressions and were never asserted.
    assert batch["data"][0].shape == (64, 3, 112, 112)
    assert batch["data"][1].shape == (64, 3, 112, 112)
def test_morph():
    """Check batch shape and demographic class weights of ``MorphTorchDataset``.

    The data is read from ``morph/samplewrapper`` under the directory
    stored in the ``bob.bio.demographics.directory`` configuration value.
    """
    database_path = os.path.join(
        rc.get("bob.bio.demographics.directory"), "morph", "samplewrapper"
    )

    dataset = MorphTorchDataset(
        protocol="verification_fold1",
        database_path=database_path,
        take_from_znorm=False,
    )

    dataloader = DataLoader(dataset, batch_size=64, shuffle=True)
    batch = next(iter(dataloader))
    # This comparison used to be a bare expression and was never checked
    assert batch["data"].shape == (64, 3, 112, 112)

    # The demographic class weights are expected to sum (approximately) to 1
    weights = dataset.get_demographic_class_weights()
    assert np.allclose(sum(weights), 1, atol=0.0001)
def test_mobio():
    """Check batch shape and demographic class weights of ``MobioTorchDataset``.

    The data is read from ``mobio/samplewrapper`` under the directory
    stored in the ``bob.bio.demographics.directory`` configuration value.
    """
    database_path = os.path.join(
        rc.get("bob.bio.demographics.directory"), "mobio", "samplewrapper"
    )

    dataset = MobioTorchDataset(
        protocol="mobile0-male-female",
        database_path=database_path,
    )

    dataloader = DataLoader(dataset, batch_size=64, shuffle=True)
    batch = next(iter(dataloader))
    # This comparison used to be a bare expression and was never checked
    assert batch["data"].shape == (64, 3, 112, 112)

    # Testing class weights
    weights = dataset.get_demographic_class_weights()
    assert np.allclose(sum(weights), 1)
def __init__(
    self,
    protocol,
    annotation_type="eyes-center",
    fixed_positions=None,
    original_directory=rc.get("bob.bio.face.gbu.directory"),
    extension=".jpg",
):
    """Set up the ``gbu`` database interface.

    Parameters
    ----------
    protocol
        Name of the protocol, forwarded to the parent constructor.
    annotation_type
        Accepted for interface compatibility (see the note at the parent
        constructor call).
    fixed_positions
        Forwarded unchanged to the parent constructor.
    original_directory
        Stored as ``self.original_directory``; defaults to the
        ``bob.bio.face.gbu.directory`` configuration value.
    extension
        Stored as ``self.extension``.
    """
    # Protocol XML archive, resolved through `get_file`
    self.filename = get_file(
        "gbu-xmls.tar.gz",
        GBUDatabase.urls(),
        file_hash="827de43434ee84020c6a949ece5e4a4d",
    )

    self.references_dict = {}
    self.probes_dict = {}
    self.annotations = None

    self.original_directory = original_directory
    self.extension = extension

    self.background_samples = None
    self._background_files = [
        f"GBU_Training_Uncontrolledx{factor}.xml" for factor in (1, 2, 4, 8)
    ]

    # NOTE(review): the `annotation_type` argument is not forwarded here;
    # "eyes-center" is always passed - confirm whether this is intentional.
    super().__init__(
        name="gbu",
        protocol=protocol,
        score_all_vs_all=True,
        annotation_type="eyes-center",
        fixed_positions=fixed_positions,
        memory_demanding=True,
    )
def test_meds():
    """Check batch shape and demographic class weights of ``MedsTorchDataset``.

    The data is read from ``meds/samplewrapper`` under the directory
    stored in the ``bob.bio.demographics.directory`` configuration value.
    """
    database_path = os.path.join(
        rc.get("bob.bio.demographics.directory"), "meds", "samplewrapper"
    )

    dataset = MedsTorchDataset(
        protocol="verification_fold1",
        database_path=database_path,
    )

    dataloader = DataLoader(
        dataset, batch_size=64, shuffle=True, pin_memory=True, num_workers=2
    )
    batch = next(iter(dataloader))
    # This comparison used to be a bare expression and was never checked
    assert batch["data"].shape == (64, 3, 112, 112)

    # Testing class weights
    weights = dataset.get_demographic_class_weights()
    assert np.allclose(sum(weights), 1)
def test_vgg2():
    """Check ``VGG2TorchDataset`` on the ``vgg2-short`` protocol.

    Both the default construction and the ``train=False`` one are checked,
    using the directory stored in the ``bob.bio.face.vgg2-crops.directory``
    configuration value.
    """
    database_path = rc.get("bob.bio.face.vgg2-crops.directory")

    dataset = VGG2TorchDataset(protocol="vgg2-short", database_path=database_path)

    assert np.allclose(sum(dataset.get_demographic_weights(as_dict=False)), 1, atol=1)
    assert dataset.n_classes == 8631
    assert len(dataset.demographic_keys) == 8

    dataloader = DataLoader(dataset, batch_size=64, shuffle=True)
    batch = next(iter(dataloader))
    # This comparison used to be a bare expression and was never checked
    assert batch["data"].shape == (64, 3, 112, 112)

    weights = dataset.get_demographic_class_weights()
    assert np.allclose(sum(weights), 1, atol=0.001)

    # Testing dev
    dataset = VGG2TorchDataset(
        protocol="vgg2-short", database_path=database_path, train=False
    )
    assert dataset.n_classes == 8631
    assert len(dataset.demographic_keys) == 8

    dataloader = DataLoader(dataset, batch_size=64, shuffle=True)
    batch = next(iter(dataloader))
    assert batch["data"].shape == (64, 3, 112, 112)

    weights = dataset.get_demographic_class_weights()
    assert np.allclose(sum(weights), 1, atol=0.001)
def __init__(
    self,
    database_path=rc.get("bob.bio.face.webface42M.directory", ""),
    transform=None,
):
    """Set up the WebFace42M dataset.

    Parameters
    ----------
    database_path
        Stored as ``self.database_path``; defaults to the
        ``bob.bio.face.webface42M.directory`` configuration value
        (``""`` when unset).
    transform
        Stored as ``self.transform``.

    Raises
    ------
    ValueError
        If ``database_path`` is the empty string.
    """
    self.database_path = database_path

    path_is_missing = database_path == ""
    if path_is_missing:
        raise ValueError(
            "`database_path` is empty; please do `bob config set bob.bio.face.webface42M.directory` to set the absolute path of the data"
        )

    # Archive resolved through `get_file`; the CSV is then looked up in it
    archive = get_file(
        "webface42M.tar.gz",
        WebFace42M.urls(),
        file_hash="50c32cbe61de261466e1ea3af2721cea",
    )
    self.file = search_file(archive, "webface42M.csv")

    self._line_offset = 51
    self.transform = transform
def __init__(self, protocol, annotation_type="eyes-center", fixed_positions=None):
    """Configure the CSV-based interface of the ``polathermal`` database.

    The raw data directory is read from the
    ``bob.db.pola-thermal.directory`` configuration value (``""`` when
    unset) and files use the ``.png`` extension.

    Parameters
    ----------
    protocol
        Name of the protocol, forwarded to the parent constructor.
    annotation_type
        Forwarded unchanged to the parent constructor.
    fixed_positions
        Forwarded unchanged to the parent constructor.
    """

    def load(path):
        """
        Images in this dataset are stored as 16-bit PNG [0-65535]
        and bob.bio.face assumes images are between 0 and 255,
        so we divide by 257: 65535 / 255 = 257
        """
        raw_image = bob.io.base.load(path)
        return raw_image / 257

    # Protocol definition archive, resolved through `get_file`
    protocol_archive = get_file(
        "polathermal.tar.gz",
        PolaThermalDatabase.urls(),
        file_hash="4693149bc883debe5a9e1441a4f5f4ae",
    )

    data_directory = rc.get("bob.db.pola-thermal.directory", "")

    # CSV rows are turned into samples first, then `EyesAnnotations` is applied
    csv_loader = CSVToSampleLoaderBiometrics(
        data_loader=load,
        dataset_original_directory=data_directory,
        extension=".png",
    )
    loading_pipeline = make_pipeline(csv_loader, EyesAnnotations())

    super().__init__(
        name="polathermal",
        protocol=protocol,
        dataset_protocol_path=protocol_archive,
        csv_to_sample_loader=loading_pipeline,
        annotation_type=annotation_type,
        fixed_positions=fixed_positions,
    )
from bob.bio.face.pytorch.datasets import (
    MedsTorchDataset,
    MobioTorchDataset,
    MorphTorchDataset,
    MSCelebTorchDataset,
    SiameseDemographicWrapper,
    VGG2TorchDataset,
    WebFace42M,
)
from bob.bio.face.pytorch.preprocessing import get_standard_data_augmentation
from bob.extension import rc


# The skip reason used to point at the IJBC configuration key and dataset
# although the condition checks the WebFace42M key.
@pytest.mark.skipif(
    rc.get("bob.bio.face.webface42M.directory") is None,
    reason="WEBFace42M not available. Please do `bob config set bob.bio.face.webface42M.directory [WEBFACE42M PATH]` to set the WebFace42M data path.",
)
def test_webface42M():
    """Check the label and data shape of two ``WebFace42M`` samples.

    Skipped when the ``bob.bio.face.webface42M.directory`` configuration
    value is not set.
    """
    dataset = WebFace42M()

    sample = dataset[0]
    assert sample["label"] == 0
    assert sample["data"].shape == (3, 112, 112)

    sample = dataset[100000]
    assert sample["label"] == 4960
    assert sample["data"].shape == (3, 112, 112)
def __init__(
    self,
    *args,
    queue=None,
    project=rc.get("sge.project"),
    resource_spec=None,
    job_extra=None,
    config_name="sge",
    **kwargs,
):
    """Build the SGE job header and worker command for this job.

    Parameters
    ----------
    *args
        Forwarded to the parent constructor.
    queue, project, resource_spec, job_extra
        When ``None``, each is read from the dask configuration under
        ``jobqueue.<config_name>.*`` (``queue``, ``project``,
        ``resource-spec`` and ``job-extra`` respectively). ``project``
        defaults to the ``sge.project`` configuration value.
    config_name
        Section of the dask ``jobqueue`` configuration to read from; also
        forwarded to the parent constructor.
    **kwargs
        Forwarded to the parent constructor. A non-empty ``resources``
        mapping is additionally appended to the worker command as
        ``--resources``.
    """
    if queue is None:
        queue = dask.config.get("jobqueue.%s.queue" % config_name)
    if project is None:
        project = dask.config.get("jobqueue.%s.project" % config_name)
    if resource_spec is None:
        resource_spec = dask.config.get("jobqueue.%s.resource-spec" % config_name)
    if job_extra is None:
        job_extra = dask.config.get("jobqueue.%s.job-extra" % config_name)

    super().__init__(*args, config_name=config_name, death_timeout=10000, **kwargs)

    # Amending the --resources in the `distributed.cli.dask_worker` CLI command
    resources = kwargs.get("resources")
    if resources:
        # The pairs used to be concatenated with no separator, which
        # produced e.g. "a=1b=2" for two resources. A comma keeps the
        # value a single shell token.
        resources_str = ",".join(f"{k}={resources[k]}" for k in resources)
        self._command_template += f" --resources {resources_str}"

    # Only the header entries that have a value are emitted
    header_lines = []
    if self.job_name is not None:
        header_lines.append("#$ -N %(job-name)s")
    if queue is not None:
        header_lines.append("#$ -q %(queue)s")
    if project is not None:
        header_lines.append("#$ -P %(project)s")
    if resource_spec is not None:
        header_lines.append("#$ -l %(resource_spec)s")
    if self.log_directory is not None:
        header_lines.append("#$ -e %(log_directory)s/")
        header_lines.append("#$ -o %(log_directory)s/")
    header_lines.extend(["#$ -cwd", "#$ -j y"])
    header_lines.extend(["#$ %s" % arg for arg in job_extra])
    header_template = "\n".join(header_lines)

    config = {
        "job-name": self.job_name,
        "queue": queue,
        "project": project,
        "processes": self.worker_processes,
        "resource_spec": resource_spec,
        "log_directory": self.log_directory,
    }
    self.job_header = header_template % config

    logger.debug("Job script: \n %s" % self.job_script())
def __init__(
    self,
    log_directory="./logs",
    protocol="tcp://",
    dashboard_address=":8787",
    env_extra=None,
    sge_job_spec=QUEUE_DEFAULT,
    min_jobs=1,
    project=rc.get("sge.project"),
    **kwargs,
):
    """Create the cluster, scale it and enable adaptive scaling.

    Parameters
    ----------
    log_directory
        Stored as ``self.log_directory``.
    protocol
        Stored as ``self.protocol`` and passed in the scheduler options.
    dashboard_address
        Passed in the scheduler options.
    env_extra
        Extra environment line(s); ``None`` means none and a non-list
        value is wrapped in a list. An ``export PYTHONPATH=...`` line
        built from ``sys.path`` is always appended.
    sge_job_spec
        Stored as ``self.sge_job_spec``; ``get_max_jobs`` derives the
        maximum number of jobs from it.
    min_jobs
        Minimum number of jobs used for adaptive scaling.
    project
        Stored as ``self.project``; defaults to the ``sge.project``
        configuration value.
    **kwargs
        Accepted but not used by this constructor.
    """
    # Defining the job launcher
    self.job_cls = SGEIdiapJob
    self.sge_job_spec = sge_job_spec

    self.protocol = protocol
    self.log_directory = log_directory
    self.project = project

    # Normalise `env_extra` to a list before appending the PYTHONPATH export
    if env_extra is None:
        extra_lines = []
    elif isinstance(env_extra, list):
        extra_lines = env_extra
    else:
        extra_lines = [env_extra]
    python_path_export = "export PYTHONPATH=" + ":".join(sys.path)
    self.env_extra = extra_lines + [python_path_export]

    scheduler_spec = {
        "cls": SchedulerResourceRestriction,  # Use local scheduler for now
        "options": {
            "protocol": self.protocol,
            "interface": None,
            "host": None,
            "dashboard_address": dashboard_address,
            "security": None,
        },
    }

    # Starting the SpecCluster constructor
    super(JobQueueCluster, self).__init__(
        scheduler=scheduler_spec,
        worker={},
        loop=None,
        silence_logs="error",
        asynchronous=False,
        name=None,
    )

    job_limit = get_max_jobs(sge_job_spec)
    self.scale(job_limit)

    # Adapting between `min_jobs` and the maximum number of jobs
    # interval: Milliseconds between checks from the scheduler
    # wait_count: Number of consecutive times that a worker should be suggested for
    #             removal before we remove it.
    self.adapt(
        minimum=min_jobs,
        maximum=job_limit,
        wait_count=5,
        interval=10,
        target_duration="10s",
    )
def __init__(
    self,
    protocol,
    annotation_type="eyes-center",
    image_relative_path="all_images",
    fixed_positions=None,
    original_directory=rc.get("bob.bio.face.lfw.directory"),
    extension=".jpg",
    annotation_directory=rc.get("bob.bio.face.lfw.annotation_directory"),
    annotation_issuer="funneled",
):
    """Set up the ``lfw`` database interface and load its pairs.

    Parameters
    ----------
    protocol
        Name of the protocol; validated with ``_check_protocol``. Scoring
        is all-vs-all when the name starts with ``"o"``.
    annotation_type
        Forwarded to the parent constructor.
    image_relative_path
        Stored as ``self.image_relative_path``.
    fixed_positions
        Forwarded to the parent constructor.
    original_directory
        Root directory of the data; defaults to the
        ``bob.bio.face.lfw.directory`` configuration value.
    extension
        Stored as ``self.extension``.
    annotation_directory
        Directory of the annotations; defaults to the
        ``bob.bio.face.lfw.annotation_directory`` configuration value.
        When ``None`` or missing, the annotations are obtained through
        ``get_file`` and the ``annotation_issuer`` sub-directory is used.
    annotation_issuer
        One of ``"funneled"``, ``"idiap"`` or ``"named"``.

    Raises
    ------
    ValueError
        If ``original_directory`` is ``None`` or does not exist, or if
        ``annotation_issuer`` is not one of the accepted values.
    """
    if original_directory is None or not os.path.exists(original_directory):
        # A space was missing between the two sentences of this message
        raise ValueError(
            f"Invalid or non existent `original_directory`: {original_directory}. "
            "Please, do `bob config set bob.bio.face.lfw.directory PATH` to set the LFW data directory."
        )

    if annotation_issuer not in ("funneled", "idiap", "named"):
        raise ValueError(
            f"Invalid annotation issuer: {annotation_issuer}. Possible values are `idiap`, `funneled` or `named`"
        )

    if annotation_directory is None or not os.path.exists(annotation_directory):
        # Downloading annotations if not exists
        annotation_urls = LFWDatabase.urls()

        logger.info(
            f"`annotation_directory`: {annotation_directory} not set. "
            f"Fetching it from {annotation_urls[0]}"
        )

        annotation_directory = get_file(
            "lfw_annotations.tar.gz",
            annotation_urls,
            file_hash="c0ce6e090e19d0ed159172fcba2e8252",
            extract=True,
        )

        # Removing extension
        annotation_directory = annotation_directory[: -len(".tar.gz")]

        # Attaching the issuer sub-directory
        annotation_directory = os.path.join(annotation_directory, annotation_issuer)

    self.annotation_issuer = annotation_issuer

    # Hard-coding the extension of the annotations
    # I don't think we need this exposed
    # Please, open an issue if otherwise
    self.annotation_extension = (
        ".jpg.pts" if annotation_issuer == "funneled" else ".pos"
    )

    self._check_protocol(protocol)

    self.references_dict = {}
    self.probes_dict = {}
    self.pairs = {}
    self.probe_reference_keys = {}  # Inverted pairs
    self.annotations = None

    self.original_directory = original_directory
    self.annotation_directory = annotation_directory
    self.extension = extension
    self.image_relative_path = image_relative_path

    # Some path manipulation lambdas:
    # the subject id is everything before the last "_" of the file name
    self.subject_id_from_filename = lambda x: "_".join(x.split("_")[0:-1])
    self.make_path_from_filename = lambda x: os.path.join(
        self.subject_id_from_filename(x), x
    )

    super().__init__(
        name="lfw",
        protocol=protocol,
        score_all_vs_all=protocol[0] == "o",
        annotation_type=annotation_type,
        fixed_positions=fixed_positions,
        memory_demanding=False,
    )

    self.load_pairs()
def SpearBioDatabase(
    name: str,
    protocol: Optional[str] = None,
    dataset_protocol_path: Optional[str] = None,
    data_path: Optional[str] = None,
    data_ext: str = ".wav",
    annotations_path: Optional[str] = None,
    annotations_ext: str = ".json",
    force_sample_rate: Optional[int] = None,
    force_channel: Optional[int] = None,
    **kwargs,
):
    """Build the ``CSVDataset`` interface of a bob.bio.spear speaker database.

    The returned database is meant to be used with bob.bio.base pipelines.
    From a set of CSV protocol definition files it creates the samples for
    each role (enroll, probe) and group (train, dev, eval). Each sample
    carries:

    - ``data``: the wav audio data,
    - ``rate``: the sample rate of ``data``,
    - (optional) ``annotations``: loaded from files when ``annotations_path``
      is given.

    The *protocol definition* files (CSV) are distinct from the *data* files
    (WAV):

    - protocol definition files list paths and their reference names; they
      are available on the bob data server;
    - data files are the actual dataset files the definitions point to. They
      are not provided by bob; set their root folder with

      ``$ bob config set bob.db.<database_name>.directory <your_path_to_data>``

    The final data paths combine that directory with the paths found in the
    CSV protocol definition files.

    Parameters
    ----------
    name
        Name of the database, used to retrieve config keys and files.
    protocol
        Protocol to use (sub-folder holding the protocol definition files).
    dataset_protocol_path
        Path to an existing protocol definition folder structure. If None,
        the definition files are obtained with ``get_protocol_file`` (see
        :py:func:`bob.extension.download.get_file` and the
        ``bob_data_folder`` config).
    data_path
        Path to the data files of the database. If None, the
        ``bob.db.<database_name>.directory`` config value is used.
    data_ext
        File extension of the data files.
    annotations_path
        Path to the annotation files, if available. If None, no annotations
        are loaded (annotating on the fly with a transformer stays possible).
    annotations_ext
        Extension of the annotation files, used when ``annotations_path`` is
        given.
    force_sample_rate
        If not None, forces the sample rate of the data to this value;
        otherwise each loaded file specifies its own rate.
    force_channel
        If not None, forces loading the nth channel of each file. If None
        and the samples have a ``channel`` attribute, that channel is
        loaded; otherwise all channels are loaded in a 2D array when several
        are present.
    **kwargs
        Forwarded to ``CSVDataset``.

    Raises
    ------
    RuntimeError
        If no data path is given and none is configured.
    """
    if dataset_protocol_path is None:
        dataset_protocol_path = get_protocol_file(name)
    logger.info(
        f"Database: Will read the CSV protocol definitions in '{dataset_protocol_path}'."
    )

    # Some databases use a config key that differs from their name
    rc_db_name = known_databases.get(name, {}).get("rc_name", name)

    if data_path is None:
        data_path = rc.get(f"bob.db.{rc_db_name}.directory")
        if data_path is None:
            raise RuntimeError(
                f"No data path was provided! Either set 'bob.db.{rc_db_name}.directory' "
                "with the 'bob config set' command, or provide a 'data_path' to "
                "'SpearBioDatabase'."
            )
    logger.info(f"Database: Will read raw data files in '{data_path}'.")

    is_voxceleb = name in ["voxceleb"]

    # Pipeline steps: CSV rows -> path samples -> audio samples (-> annotations)
    csv_reader = CSVToSampleLoaderBiometrics(
        data_loader=path_loader,
        dataset_original_directory=data_path,
        extension=data_ext,
        reference_id_equal_subject_id=not is_voxceleb,
    )
    audio_reader = PathToAudio(
        forced_channel=force_channel, forced_sr=force_sample_rate
    )
    steps = [
        ("db:reader_loader", csv_reader),
        ("db:path_to_sample", audio_reader),
    ]

    if annotations_path is not None:
        logger.info(
            f"Database: Will read annotation files in '{annotations_path}'."
        )
        annotations_reader = AnnotationsLoader(
            annotation_directory=annotations_path,
            annotation_extension=annotations_ext,
        )
        steps.append(("db:annotations_loader", annotations_reader))

    return CSVDataset(
        name=name,
        protocol=protocol,
        dataset_protocol_path=dataset_protocol_path,
        csv_to_sample_loader=Pipeline(steps),
        score_all_vs_all=not is_voxceleb,
        is_sparse=is_voxceleb,
        **kwargs,
    )
"mouthright": [664, 384], "topleft": [372, 196], "bottomright": [761, 480], }, dict(sample2.annotations) # Only if data is available if rc.get("bob.db.replaymobile.directory", None): assert sample.data.shape == (3, 1280, 720), sample.data.shape assert sample.data[0, 0, 0] == 94, sample.data[0, 0, 0] assert sample2.data.shape == (3, 1280, 720), sample2.data.shape assert sample2.data[0, 0, 0] == 129, sample2.data[0, 0, 0] @pytest.mark.skipif( rc.get("bob.bio.face.ijbc.directory") is None, reason="IJBC original protocols not available. Please do `bob config set bob.bio.face.ijbc.directory [IJBC PATH]` to set the IJBC data path.", ) @pytest.mark.slow def test_ijbc(): from bob.bio.face.database import IJBCDatabase # test1 ##### database = IJBCDatabase(protocol="test1") # assert len(database.background_model_samples()) == 140732 assert len(database.references()) == 3531 assert len(database.probes()) == 19593 num_comparisons = sum([len(item.references) for item in database.probes()]) assert num_comparisons == 19557 + 15638932 # Genuine + Impostor
def main(command_line_options=None):
    """Build the command line parser and run the selected sub-command.

    Parameters
    ----------
    command_line_options
        Optional sequence of command line tokens. When given (and
        non-empty), element 0 is stored as ``args.wrapper_script`` and the
        remaining elements are parsed. Otherwise ``sys.argv`` is used.

    Returns
    -------
    ``0`` after the handler of the selected sub-command has returned.
    When no sub-command was selected, the help text is written to
    ``sys.stderr`` and ``None`` is returned.
    """
    from ..config import __version__
    from bob.extension import rc

    formatter = argparse.ArgumentDefaultsHelpFormatter
    parser = argparse.ArgumentParser(description=__doc__,
                                     epilog=__epilog__,
                                     formatter_class=formatter)
    # part of the hack to support aliases in subparsers
    parser.register('action', 'parsers', AliasedSubParsersAction)

    # general options
    parser.add_argument(
        '-v',
        '--verbose',
        action='count',
        default=0,
        help="Increase the verbosity level from 0 (only error messages) to 1 (warnings), 2 (log messages), 3 (debug information) by adding the --verbose option as often as desired (e.g. '-vvv' for debug).")
    parser.add_argument('-V',
                        '--version',
                        action='version',
                        version='GridTk version %s' % __version__)
    parser.add_argument(
        '-d',
        '--database',
        '--db',
        metavar='DATABASE',
        default='submitted.sql3',
        help='replace the default database "submitted.sql3" by one provided by you.')
    parser.add_argument(
        '-l',
        '--local',
        action='store_true',
        help='Uses the local job manager instead of the SGE one.')

    cmdparser = parser.add_subparsers(title='commands',
                                      help='commands accepted by %(prog)s')

    # subcommand 'submit'
    submit_parser = cmdparser.add_parser(
        'submit',
        aliases=['sub'],
        formatter_class=formatter,
        help='Submits jobs to the SGE queue or to the local job scheduler and logs them in a database.')
    submit_parser.add_argument(
        '-q',
        '--queue',
        metavar='QNAME',
        dest='qname',
        default='all.q',
        choices=QUEUES,
        help='the name of the SGE queue to submit the job to')
    submit_parser.add_argument(
        '-e',
        '--sge-extra-args',
        # the default can be overridden through the bob configuration
        default=rc.get('gridtk.sge.extra.args.default', ''),
        type=str,
        help='Passes extra arguments to qsub. See the documentation of the package for usage and ways of overriding default behavior.')
    submit_parser.add_argument(
        '-m',
        '--memory',
        help='Sets both the h_vmem and the mem_free parameters when submitting '
        'the job to a non-GPU queue, e.g., 8G to set the memory '
        'requirements to 8 gigabytes. Sets gpumem parameter when '
        'submitting the job to a GPU-based queue.')
    submit_parser.add_argument(
        '-p',
        '--parallel',
        '--pe_mth',
        type=int,
        help='Sets the number of slots per job (-pe pe_mth) and multiplies the mem_free parameter. E.g. to get 16 G of memory, use -m 8G -p 2.')
    submit_parser.add_argument('-n',
                               '--name',
                               dest='name',
                               help='Gives the job a name')
    submit_parser.add_argument(
        '-x',
        '--dependencies',
        type=int,
        default=[],
        metavar='ID',
        nargs='*',
        help='Set job dependencies to the list of job identifiers separated by spaces')
    submit_parser.add_argument(
        '-k',
        '--stop-on-failure',
        action='store_true',
        help='Stop depending jobs when this job finished with an error.')
    submit_parser.add_argument(
        '-d',
        '--exec-dir',
        metavar='DIR',
        help='Sets the executing directory, where the script should be executed. If not given, jobs will be executed in the current directory')
    submit_parser.add_argument(
        '-l',
        '--log-dir',
        metavar='DIR',
        help='Sets the log directory. By default, "logs" is selected for the SGE. If the jobs are executed locally, by default the result is written to console.')
    submit_parser.add_argument(
        '-s',
        '--environment',
        metavar='KEY=VALUE',
        dest='env',
        nargs='*',
        default=[],
        help='Passes specific environment variables to the job.')
    submit_parser.add_argument(
        '-t',
        '--array',
        '--parametric',
        metavar='(first-)last(:step)',
        help="Creates a parametric (array) job. You must specify the 'last' value, but 'first' (default=1) and 'step' (default=1) can be specified as well (when specifying 'step', 'first' has to be given, too).")
    submit_parser.add_argument(
        '-z',
        '--dry-run',
        action='store_true',
        help='Do not really submit anything, just print out what would submit in this case')
    submit_parser.add_argument(
        '-i',
        '--io-big',
        action='store_true',
        help='Sets "io_big" on the submitted jobs so it limits the machines in which the job is submitted to those that can do high-throughput.')
    submit_parser.add_argument(
        '-r',
        '--repeat',
        type=int,
        metavar='N',
        default=1,
        help='Submits the job N times. Each job will depend on the job before.')
    submit_parser.add_argument(
        '-o',
        '--print-id',
        action='store_true',
        help='Prints the new job id (so that they can be parsed by automatic scripts).')
    submit_parser.add_argument(
        'job',
        metavar='command',
        nargs=argparse.REMAINDER,
        help="The job that should be executed. Sometimes a -- is required to separate the job from other command line options.")
    submit_parser.set_defaults(func=submit)

    # subcommand 're-submit'
    resubmit_parser = cmdparser.add_parser(
        'resubmit',
        aliases=['reset', 'requeue', 're'],
        formatter_class=formatter,
        help='Re-submits a list of jobs.')
    resubmit_parser.add_argument(
        '-j',
        '--job-ids',
        metavar='ID',
        nargs='+',
        help='Re-submit only the jobs with the given ids (by default, all finished jobs are re-submitted).')
    resubmit_parser.add_argument(
        '-q',
        '--queue',
        metavar='QNAME',
        dest='qname',
        choices=QUEUES,
        help='Reset the SGE queue to submit the job to')
    resubmit_parser.add_argument(
        '-m',
        '--memory',
        help='Resets both the h_vmem and the mem_free parameters when '
        'submitting the job to a non-GPU queue, e.g., 8G '
        'to set the memory requirements to 8 gigabytes. Resets gpumem '
        'parameter when submitting the job to a GPU-based queue.')
    resubmit_parser.add_argument(
        '-p',
        '--parallel',
        '--pe_mth',
        type=int,
        help='Resets the number of slots per job (-pe pe_mth) and multiplies the mem_free parameter. E.g. to get 16 G of memory, use -m 8G -p 2.')
    resubmit_parser.add_argument(
        '-i',
        '--io-big',
        action='store_true',
        help='Resubmits the job to the "io_big" queue.')
    resubmit_parser.add_argument(
        '-I',
        '--no-io-big',
        action='store_true',
        help='Resubmits the job NOT to the "io_big" queue.')
    resubmit_parser.add_argument(
        '-k',
        '--keep-logs',
        action='store_true',
        help='Do not clean the log files of the old job before re-submitting.')
    resubmit_parser.add_argument(
        '-s',
        '--also-success',
        action='store_true',
        help='Re-submit also jobs that have finished successfully.')
    resubmit_parser.add_argument(
        '-a',
        '--running-jobs',
        action='store_true',
        help='Re-submit even jobs that are running or waiting (use this flag with care).')
    resubmit_parser.add_argument(
        '-o',
        '--overwrite-command',
        nargs=argparse.REMAINDER,
        help="Overwrite the command line (of a single job) that should be executed (useful to keep job dependencies).")
    resubmit_parser.set_defaults(func=resubmit)

    # subcommand 'stop'
    stop_parser = cmdparser.add_parser(
        'stop',
        formatter_class=formatter,
        help='Stops the execution of jobs in the grid.')
    stop_parser.add_argument(
        '-j',
        '--job-ids',
        metavar='ID',
        nargs='+',
        help='Stop only the jobs with the given ids (by default, all jobs are stopped).')
    stop_parser.set_defaults(func=stop)

    # subcommand 'list'
    list_parser = cmdparser.add_parser(
        'list',
        aliases=['ls'],
        formatter_class=formatter,
        help='Lists jobs stored in the database. Use the -vv option to get a long listing.')
    list_parser.add_argument(
        '-j',
        '--job-ids',
        metavar='ID',
        nargs='+',
        help='List only the jobs with the given ids (by default, all jobs are listed)')
    list_parser.add_argument(
        '-n',
        '--names',
        metavar='NAME',
        nargs='+',
        help='List only the jobs with the given names (by default, all jobs are listed)')
    list_parser.add_argument('-a',
                             '--print-array-jobs',
                             action='store_true',
                             help='Also list the array ids.')
    list_parser.add_argument(
        '-l',
        '--long',
        action='store_true',
        help='Prints additional information about the submitted job.')
    list_parser.add_argument(
        '-t',
        '--print-times',
        action='store_true',
        help='Prints timing information on when jobs were submited, executed and finished')
    list_parser.add_argument(
        '-x',
        '--print-dependencies',
        action='store_true',
        help='Print the dependencies of the jobs as well.')
    list_parser.add_argument(
        '-o',
        '--ids-only',
        action='store_true',
        help='Prints ONLY the job ids (so that they can be parsed by automatic scripts).')
    list_parser.add_argument(
        '-s',
        '--status',
        nargs='+',
        choices=Status,
        default=Status,
        # this option filters the listing; it does not delete anything
        help='List only jobs that have the given statuses; by default all jobs are listed.')
    # NOTE: ``list`` here is whatever the enclosing module binds to that name
    list_parser.set_defaults(func=list)

    # subcommand 'communicate'
    communicate_parser = cmdparser.add_parser(
        'communicate',
        aliases=['com'],
        formatter_class=formatter,
        help='Communicates with the grid to see if there were unexpected errors (e.g. a timeout) during the job execution.')
    communicate_parser.add_argument(
        '-j',
        '--job-ids',
        metavar='ID',
        nargs='+',
        help='Check only the jobs with the given ids (by default, all jobs are checked)')
    communicate_parser.set_defaults(func=communicate)

    # subcommand 'report'
    report_parser = cmdparser.add_parser(
        'report',
        aliases=['rep', 'r', 'explain', 'why'],
        formatter_class=formatter,
        help='Iterates through the result and error log files and prints out the logs.')
    report_parser.add_argument(
        '-e',
        '--errors-only',
        action='store_true',
        help='Only report the error logs (by default, both logs are reported).')
    report_parser.add_argument(
        '-o',
        '--output-only',
        action='store_true',
        help='Only report the output logs (by default, both logs are reported).')
    report_parser.add_argument(
        '-j',
        '--job-ids',
        metavar='ID',
        nargs='+',
        help='Report only the jobs with the given ids (by default, all finished jobs are reported)')
    report_parser.add_argument(
        '-a',
        '--array-ids',
        metavar='ID',
        nargs='+',
        help='Report only the jobs with the given array ids. If specified, a single job-id must be given as well.')
    report_parser.add_argument(
        '-n',
        '--name',
        help="Report only the jobs with the given name; by default all jobs are reported.")
    report_parser.add_argument(
        '-s',
        '--status',
        nargs='+',
        choices=Status,
        default=Status,
        help='Report only jobs that have the given statuses; by default all jobs are reported.')
    report_parser.set_defaults(func=report)

    # subcommand 'delete'
    delete_parser = cmdparser.add_parser(
        'delete',
        aliases=['del', 'rm', 'remove'],
        formatter_class=formatter,
        help='Removes jobs from the database; if jobs are running or are still scheduled in SGE, the jobs are also removed from the SGE queue.')
    delete_parser.add_argument(
        '-j',
        '--job-ids',
        metavar='ID',
        nargs='+',
        help='Delete only the jobs with the given ids (by default, all jobs are deleted).')
    delete_parser.add_argument(
        '-a',
        '--array-ids',
        metavar='ID',
        nargs='+',
        help='Delete only the jobs with the given array ids. If specified, a single job-id must be given as well. Note that the whole job including all array jobs will be removed from the SGE queue.')
    delete_parser.add_argument(
        '-r',
        '--keep-logs',
        action='store_true',
        help='If set, the log files will NOT be removed.')
    delete_parser.add_argument(
        '-R',
        '--keep-log-dir',
        action='store_true',
        help='When removing the logs, keep the log directory.')
    delete_parser.add_argument(
        '-s',
        '--status',
        nargs='+',
        choices=Status,
        default=Status,
        help='Delete only jobs that have the given statuses; by default all jobs are deleted.')
    delete_parser.set_defaults(func=delete)

    # subcommand 'run_scheduler'
    scheduler_parser = cmdparser.add_parser(
        'run-scheduler',
        aliases=['sched', 'x'],
        formatter_class=formatter,
        help="Runs the scheduler on the local machine. To stop the scheduler safely, please use Ctrl-C; only valid in combination with the '--local' option.")
    scheduler_parser.add_argument(
        '-p',
        '--parallel',
        type=int,
        default=1,
        help='Select the number of parallel jobs that you want to execute locally')
    scheduler_parser.add_argument(
        '-j',
        '--job-ids',
        metavar='ID',
        nargs='+',
        help='Select the job ids that should be run (be default, all submitted and queued jobs are run).')
    scheduler_parser.add_argument(
        '-s',
        '--sleep-time',
        type=float,
        default=0.1,
        help='Set the sleep time between for the scheduler in seconds.')
    scheduler_parser.add_argument(
        '-x',
        '--die-when-finished',
        action='store_true',
        help='Let the job manager die when it has finished all jobs of the database.')
    scheduler_parser.add_argument(
        '-l',
        '--no-log-files',
        action='store_true',
        help='Overwrites the log file setup to print the results to the console.')
    scheduler_parser.add_argument(
        '-n',
        '--nice',
        type=int,
        help='Jobs will be run with the given priority (can only be positive, i.e., to have lower priority')
    scheduler_parser.set_defaults(func=run_scheduler)

    # subcommand 'run-job'; this should not be seen on the command line since
    # it is actually a wrapper script
    run_parser = cmdparser.add_parser('run-job', help=argparse.SUPPRESS)
    run_parser.set_defaults(func=run_job)

    if command_line_options:
        # element 0 plays the role of sys.argv[0]
        args = parser.parse_args(command_line_options[1:])
        args.wrapper_script = command_line_options[0]
    else:
        args = parser.parse_args()
        args.wrapper_script = sys.argv[0]

    # no sub-command selected: only show the usage (print_help returns None)
    if not hasattr(args, "func"):
        return parser.print_help(sys.stderr)

    args.func(args)

    return 0
def qsub(command, queue=None, cwd=True, name=None, deps=[], stdout='',
         stderr='', env=[], array=None, context='grid', hostname=None,
         memfree=None, hvmem=None, gpumem=None, pe_opt=None, io_big=False,
         sge_extra_args=""):
    """Submits a shell job to a given grid queue

    Keyword parameters:

    command
      The command to be submitted to the grid (a string, or a list/tuple of
      command line tokens)

    queue
      A valid queue name or None, to use the default queue

    cwd
      If the job should change to the current working directory before
      starting

    name
      An optional name to set for the job. If not given, defaults to the
      script name being launched.

    deps
      Job ids to which this job will be dependent on

    stdout
      The standard output directory. If not given, defaults to what qsub has
      as a default.

    stderr
      The standard error directory (if not given, defaults to the stdout
      directory).

    env
      This is a list of extra variables that will be set on the environment
      running the command of your choice.

    array
      If set should be either:

      1. a string in the form m[-n[:s]] which indicates the starting range
         'm', the closing range 'n' and the step 's'.
      2. an integer value indicating the total number of jobs to be
         submitted. This is equivalent to set the parameter to a string
         "1-k:1" where "k" is the passed integer value
      3. a tuple that contains either 1, 2 or 3 elements indicating the
         start, end and step arguments ("m", "n", "s").

      The minimum value for "m" is 1. Giving "0" is an error.

      If submitted with this option, the job to be created will be an SGE
      parametric job. In this mode SGE does not allow individual control of
      each job. The environment variable SGE_TASK_ID will be set on the
      executing process automatically by SGE and indicates the unique
      identifier in the range for which the current job instance is for.

    context
      The setshell context in which we should try a 'qsub'. Normally you
      don't need to change the default. This variable can also be set to a
      context dictionary in which case we just setup using that context
      instead of probing for a new one, what can be fast.

    memfree
      If set, it asks the queue for a node with a minimum amount of memory
      Used only if mem is not set
      (cf. qsub -l mem_free=<...>)

    hvmem
      If set, it asks the queue for a node with a minimum amount of memory
      Used only if mem is not set
      (cf. qsub -l h_vmem=<...>)

    gpumem
      Applicable only for GPU-based queues. If set, it asks for the GPU
      queue with a minimum amount of memory. The amount should not be more
      than 24.
      (cf. qsub -l gpumem=<...>)

    hostname
      If set, it asks the queue to use only a subset of the available nodes
      Symbols: | for OR, & for AND, ! for NOT, etc.
      (cf. qsub -l hostname=<...>)

    pe_opt
      If set, add a -pe option when launching a job (for instance
      pe_exclusive* 1-)

    io_big
      If set to true, the io_big flag will be set.
      Use this flag if your process will need a lot of Input/Output
      operations.

    sge_extra_args
      This is used to send extra argument to SGE. Note that all its
      arguments are directly used in `qsub` command. For example,
      `jman submit -e "-P project_name -l pytorch=true" -- ...` will be
      translated to `qsub -P project_name -l pytorch=true -- ...`

    Returns the job id assigned to this job (integer)

    Raises ``RuntimeError`` if ``array`` is a tuple/list whose length is not
    between 1 and 3, or is of an unsupported type.
    """
    from bob.extension import rc

    scmd = ['qsub']

    # extra arguments configured globally come first, then the ones given
    # by the caller
    prepend = rc.get('gridtk.sge.extra.args.prepend') or ""
    sge_extra_args = f"{prepend} {sge_extra_args or ''}"
    scmd += shlex.split(sge_extra_args)

    # 'all.q' and 'default' are not passed as a resource request
    if isinstance(queue, str) and queue not in ('all.q', 'default'):
        scmd += ['-l', queue]

    if memfree:
        scmd += ['-l', 'mem_free=%s' % memfree]
    if hvmem:
        scmd += ['-l', 'h_vmem=%s' % hvmem]
    if gpumem:
        scmd += ['-l', 'gpumem=%s' % gpumem]
    if io_big:
        scmd += ['-l', 'io_big']
    if hostname:
        scmd += ['-l', 'hostname=%s' % hostname]
    if pe_opt:
        scmd += ['-pe'] + pe_opt.split()
    if cwd:
        scmd += ['-cwd']
    if name:
        scmd += ['-N', name]
    if deps:
        scmd += ['-hold_jid', ','.join(['%d' % k for k in deps])]

    if stdout:
        if cwd:
            if not os.path.exists(stdout):
                makedirs_safe(stdout)
        else:
            # pivot, temporarily, to home directory; always go back, even if
            # creating the directory fails, so that the caller's working
            # directory is never left changed
            curdir = os.path.realpath(os.curdir)
            os.chdir(os.environ['HOME'])
            try:
                if not os.path.exists(stdout):
                    makedirs_safe(stdout)
            finally:
                os.chdir(curdir)
        scmd += ['-o', stdout]

    # NOTE(review): unlike stdout, the stderr directory is created relative
    # to the current directory even when ``cwd`` is false -- confirm intent
    if stderr:
        if not os.path.exists(stderr):
            makedirs_safe(stderr)
        scmd += ['-e', stderr]
    elif stdout:
        # just re-use the stdout settings
        scmd += ['-e', stdout]

    # simplified job identifiers returned by the command line
    scmd += ['-terse']

    for k in env:
        scmd += ['-v', k]

    if array is not None:
        scmd.append('-t')
        if isinstance(array, str):
            try:
                # a plain number means "that many jobs"
                scmd.append('1-%d:1' % int(array))
            except ValueError:
                # must be complete...
                scmd.append('%s' % array)
        elif isinstance(array, int):
            scmd.append('1-%d:1' % array)
        elif isinstance(array, (tuple, list)):
            if len(array) < 1 or len(array) > 3:
                raise RuntimeError(
                    "Array tuple should have length between 1 and 3")
            elif len(array) == 1:
                scmd.append('%s' % array[0])
            elif len(array) == 2:
                scmd.append('%s-%s' % (array[0], array[1]))
            else:
                scmd.append('%s-%s:%s' % (array[0], array[1], array[2]))
        else:
            # never emit '-t' without a range: the following token (the
            # command itself) would be taken as the range by qsub
            raise RuntimeError(
                "Array should be a string, an integer or a tuple, not %s" %
                type(array).__name__)

    if not isinstance(command, (list, tuple)):
        command = [command]
    scmd += command

    logger.debug("Qsub command '%s'", ' '.join(scmd))

    from .setshell import sexec
    jobid = str_(sexec(context, scmd))
    # the identifier is on the last line of the output; for array jobs it is
    # followed by '.<range>', which is dropped here
    return int(jobid.strip().split("\n")[-1].split('.', 1)[0])
def main(command_line_options=None):
    """Build the command line parser and run the selected sub-command.

    Parameters
    ----------
    command_line_options
        Optional sequence of command line tokens. When given (and
        non-empty), element 0 is stored as ``args.wrapper_script`` and the
        remaining elements are parsed. Otherwise ``sys.argv`` is used.

    Returns
    -------
    ``0`` after the handler of the selected sub-command has returned.
    When no sub-command was selected, the help text is written to
    ``sys.stderr`` and ``None`` is returned.
    """
    from bob.extension import rc

    from ..config import __version__

    formatter = argparse.ArgumentDefaultsHelpFormatter
    parser = argparse.ArgumentParser(
        description=__doc__, epilog=__epilog__, formatter_class=formatter
    )
    # part of the hack to support aliases in subparsers
    parser.register("action", "parsers", AliasedSubParsersAction)

    # general options
    parser.add_argument(
        "-v",
        "--verbose",
        action="count",
        default=0,
        help="Increase the verbosity level from 0 (only error messages) to 1 (warnings), 2 (log messages), 3 (debug information) by adding the --verbose option as often as desired (e.g. '-vvv' for debug).",
    )
    parser.add_argument(
        "-V",
        "--version",
        action="version",
        version="GridTk version %s" % __version__,
    )
    parser.add_argument(
        "-d",
        "--database",
        "--db",
        metavar="DATABASE",
        default="submitted.sql3",
        help='replace the default database "submitted.sql3" by one provided by you.',
    )
    parser.add_argument(
        "-l",
        "--local",
        action="store_true",
        help="Uses the local job manager instead of the SGE one.",
    )

    cmdparser = parser.add_subparsers(
        title="commands", help="commands accepted by %(prog)s"
    )

    # subcommand 'submit'
    submit_parser = cmdparser.add_parser(
        "submit",
        aliases=["sub"],
        formatter_class=formatter,
        help="Submits jobs to the SGE queue or to the local job scheduler and logs them in a database.",
    )
    submit_parser.add_argument(
        "-q",
        "--queue",
        metavar="QNAME",
        dest="qname",
        default="all.q",
        choices=QUEUES,
        help="the name of the SGE queue to submit the job to",
    )
    submit_parser.add_argument(
        "-e",
        "--sge-extra-args",
        # the default can be overridden through the bob configuration
        default=rc.get("gridtk.sge.extra.args.default", ""),
        type=str,
        help="Passes extra arguments to qsub. See the documentation of the package for usage and ways of overriding default behavior.",
    )
    submit_parser.add_argument(
        "-m",
        "--memory",
        help="Sets both the h_vmem and the mem_free parameters when submitting "
        "the job to a non-GPU queue, e.g., 8G to set the memory "
        "requirements to 8 gigabytes. Sets gpumem parameter when "
        "submitting the job to a GPU-based queue.",
    )
    submit_parser.add_argument(
        "-p",
        "--parallel",
        "--pe_mth",
        type=int,
        help="Sets the number of slots per job (-pe pe_mth) and multiplies the mem_free parameter. E.g. to get 16 G of memory, use -m 8G -p 2.",
    )
    submit_parser.add_argument(
        "-n", "--name", dest="name", help="Gives the job a name"
    )
    submit_parser.add_argument(
        "-x",
        "--dependencies",
        type=int,
        default=[],
        metavar="ID",
        nargs="*",
        help="Set job dependencies to the list of job identifiers separated by spaces",
    )
    submit_parser.add_argument(
        "-k",
        "--stop-on-failure",
        action="store_true",
        help="Stop depending jobs when this job finished with an error.",
    )
    submit_parser.add_argument(
        "-d",
        "--exec-dir",
        metavar="DIR",
        help="Sets the executing directory, where the script should be executed. If not given, jobs will be executed in the current directory",
    )
    submit_parser.add_argument(
        "-l",
        "--log-dir",
        default="logs",
        metavar="DIR",
        help="Sets the log directory.",
    )
    submit_parser.add_argument(
        "-s",
        "--environment",
        metavar="KEY=VALUE",
        dest="env",
        nargs="*",
        default=[],
        help="Passes specific environment variables to the job.",
    )
    submit_parser.add_argument(
        "-t",
        "--array",
        "--parametric",
        metavar="(first-)last(:step)",
        help="Creates a parametric (array) job. You must specify the 'last' value, but 'first' (default=1) and 'step' (default=1) can be specified as well (when specifying 'step', 'first' has to be given, too).",
    )
    submit_parser.add_argument(
        "-z",
        "--dry-run",
        action="store_true",
        help="Do not really submit anything, just print out what would submit in this case",
    )
    submit_parser.add_argument(
        "-i",
        "--io-big",
        action="store_true",
        help='Sets "io_big" on the submitted jobs so it limits the machines in which the job is submitted to those that can do high-throughput.',
    )
    submit_parser.add_argument(
        "-r",
        "--repeat",
        type=int,
        metavar="N",
        default=1,
        help="Submits the job N times. Each job will depend on the job before.",
    )
    submit_parser.add_argument(
        "-o",
        "--print-id",
        action="store_true",
        help="Prints the new job id (so that they can be parsed by automatic scripts).",
    )
    submit_parser.add_argument(
        "job",
        metavar="command",
        nargs=argparse.REMAINDER,
        help="The job that should be executed. Sometimes a -- is required to separate the job from other command line options.",
    )
    submit_parser.set_defaults(func=submit)

    # subcommand 're-submit'
    resubmit_parser = cmdparser.add_parser(
        "resubmit",
        aliases=["reset", "requeue", "re"],
        formatter_class=formatter,
        help="Re-submits a list of jobs.",
    )
    resubmit_parser.add_argument(
        "-j",
        "--job-ids",
        metavar="ID",
        nargs="+",
        help="Re-submit only the jobs with the given ids (by default, all finished jobs are re-submitted).",
    )
    resubmit_parser.add_argument(
        "-q",
        "--queue",
        metavar="QNAME",
        dest="qname",
        choices=QUEUES,
        help="Reset the SGE queue to submit the job to",
    )
    resubmit_parser.add_argument(
        "-m",
        "--memory",
        help="Resets both the h_vmem and the mem_free parameters when "
        "submitting the job to a non-GPU queue, e.g., 8G "
        "to set the memory requirements to 8 gigabytes. Resets gpumem "
        "parameter when submitting the job to a GPU-based queue.",
    )
    resubmit_parser.add_argument(
        "-p",
        "--parallel",
        "--pe_mth",
        type=int,
        help="Resets the number of slots per job (-pe pe_mth) and multiplies the mem_free parameter. E.g. to get 16 G of memory, use -m 8G -p 2.",
    )
    resubmit_parser.add_argument(
        "-i",
        "--io-big",
        action="store_true",
        help='Resubmits the job to the "io_big" queue.',
    )
    resubmit_parser.add_argument(
        "-I",
        "--no-io-big",
        action="store_true",
        help='Resubmits the job NOT to the "io_big" queue.',
    )
    resubmit_parser.add_argument(
        "-k",
        "--keep-logs",
        action="store_true",
        help="Do not clean the log files of the old job before re-submitting.",
    )
    resubmit_parser.add_argument(
        "-s",
        "--also-success",
        action="store_true",
        help="Re-submit also jobs that have finished successfully.",
    )
    resubmit_parser.add_argument(
        "-a",
        "--running-jobs",
        action="store_true",
        help="Re-submit even jobs that are running or waiting (use this flag with care).",
    )
    resubmit_parser.add_argument(
        "-o",
        "--overwrite-command",
        nargs=argparse.REMAINDER,
        help="Overwrite the command line (of a single job) that should be executed (useful to keep job dependencies).",
    )
    resubmit_parser.set_defaults(func=resubmit)

    # subcommand 'stop'
    stop_parser = cmdparser.add_parser(
        "stop",
        formatter_class=formatter,
        help="Stops the execution of jobs in the grid.",
    )
    stop_parser.add_argument(
        "-j",
        "--job-ids",
        metavar="ID",
        nargs="+",
        help="Stop only the jobs with the given ids (by default, all jobs are stopped).",
    )
    stop_parser.set_defaults(func=stop)

    # subcommand 'list'
    list_parser = cmdparser.add_parser(
        "list",
        aliases=["ls"],
        formatter_class=formatter,
        help="Lists jobs stored in the database. Use the -vv option to get a long listing.",
    )
    list_parser.add_argument(
        "-j",
        "--job-ids",
        metavar="ID",
        nargs="+",
        help="List only the jobs with the given ids (by default, all jobs are listed)",
    )
    list_parser.add_argument(
        "-n",
        "--names",
        metavar="NAME",
        nargs="+",
        help="List only the jobs with the given names (by default, all jobs are listed)",
    )
    list_parser.add_argument(
        "-a",
        "--print-array-jobs",
        action="store_true",
        help="Also list the array ids.",
    )
    list_parser.add_argument(
        "-l",
        "--long",
        action="store_true",
        help="Prints additional information about the submitted job.",
    )
    list_parser.add_argument(
        "-t",
        "--print-times",
        action="store_true",
        help="Prints timing information on when jobs were submited, executed and finished",
    )
    list_parser.add_argument(
        "-x",
        "--print-dependencies",
        action="store_true",
        help="Print the dependencies of the jobs as well.",
    )
    list_parser.add_argument(
        "-o",
        "--ids-only",
        action="store_true",
        help="Prints ONLY the job ids (so that they can be parsed by automatic scripts).",
    )
    list_parser.add_argument(
        "-s",
        "--status",
        nargs="+",
        choices=Status,
        default=Status,
        # this option filters the listing; it does not delete anything
        help="List only jobs that have the given statuses; by default all jobs are listed.",
    )
    # NOTE: ``list`` here is whatever the enclosing module binds to that name
    list_parser.set_defaults(func=list)

    # subcommand 'communicate'
    communicate_parser = cmdparser.add_parser(
        "communicate",
        aliases=["com"],
        formatter_class=formatter,
        help="Communicates with the grid to see if there were unexpected errors (e.g. a timeout) during the job execution.",
    )
    communicate_parser.add_argument(
        "-j",
        "--job-ids",
        metavar="ID",
        nargs="+",
        help="Check only the jobs with the given ids (by default, all jobs are checked)",
    )
    communicate_parser.set_defaults(func=communicate)

    # subcommand 'report'
    report_parser = cmdparser.add_parser(
        "report",
        aliases=["rep", "r", "explain", "why"],
        formatter_class=formatter,
        help="Iterates through the result and error log files and prints out the logs.",
    )
    report_parser.add_argument(
        "-e",
        "--errors-only",
        action="store_true",
        help="Only report the error logs (by default, both logs are reported).",
    )
    report_parser.add_argument(
        "-o",
        "--output-only",
        action="store_true",
        help="Only report the output logs (by default, both logs are reported).",
    )
    report_parser.add_argument(
        "-j",
        "--job-ids",
        metavar="ID",
        nargs="+",
        help="Report only the jobs with the given ids (by default, all finished jobs are reported)",
    )
    report_parser.add_argument(
        "-a",
        "--array-ids",
        metavar="ID",
        nargs="+",
        help="Report only the jobs with the given array ids. If specified, a single job-id must be given as well.",
    )
    report_parser.add_argument(
        "-n",
        "--name",
        help="Report only the jobs with the given name; by default all jobs are reported.",
    )
    report_parser.add_argument(
        "-s",
        "--status",
        nargs="+",
        choices=Status,
        default=Status,
        help="Report only jobs that have the given statuses; by default all jobs are reported.",
    )
    report_parser.set_defaults(func=report)

    # subcommand 'delete'
    delete_parser = cmdparser.add_parser(
        "delete",
        aliases=["del", "rm", "remove"],
        formatter_class=formatter,
        help="Removes jobs from the database; if jobs are running or are still scheduled in SGE, the jobs are also removed from the SGE queue.",
    )
    delete_parser.add_argument(
        "-j",
        "--job-ids",
        metavar="ID",
        nargs="+",
        help="Delete only the jobs with the given ids (by default, all jobs are deleted).",
    )
    delete_parser.add_argument(
        "-a",
        "--array-ids",
        metavar="ID",
        nargs="+",
        help="Delete only the jobs with the given array ids. If specified, a single job-id must be given as well. Note that the whole job including all array jobs will be removed from the SGE queue.",
    )
    delete_parser.add_argument(
        "-r",
        "--keep-logs",
        action="store_true",
        help="If set, the log files will NOT be removed.",
    )
    delete_parser.add_argument(
        "-R",
        "--keep-log-dir",
        action="store_true",
        help="When removing the logs, keep the log directory.",
    )
    delete_parser.add_argument(
        "-s",
        "--status",
        nargs="+",
        choices=Status,
        default=Status,
        help="Delete only jobs that have the given statuses; by default all jobs are deleted.",
    )
    delete_parser.set_defaults(func=delete)

    # subcommand 'run_scheduler'
    scheduler_parser = cmdparser.add_parser(
        "run-scheduler",
        aliases=["sched", "x"],
        formatter_class=formatter,
        help="Runs the scheduler on the local machine. To stop the scheduler safely, please use Ctrl-C; only valid in combination with the '--local' option.",
    )
    scheduler_parser.add_argument(
        "-p",
        "--parallel",
        type=int,
        default=1,
        help="Select the number of parallel jobs that you want to execute locally",
    )
    scheduler_parser.add_argument(
        "-j",
        "--job-ids",
        metavar="ID",
        nargs="+",
        help="Select the job ids that should be run (be default, all submitted and queued jobs are run).",
    )
    scheduler_parser.add_argument(
        "-s",
        "--sleep-time",
        type=float,
        default=0.1,
        help="Set the sleep time between for the scheduler in seconds.",
    )
    scheduler_parser.add_argument(
        "-x",
        "--die-when-finished",
        action="store_true",
        help="Let the job manager die when it has finished all jobs of the database.",
    )
    scheduler_parser.add_argument(
        "-l",
        "--no-log-files",
        action="store_true",
        help="Overwrites the log file setup to print the results to the console.",
    )
    scheduler_parser.add_argument(
        "-n",
        "--nice",
        type=int,
        help="Jobs will be run with the given priority (can only be positive, i.e., to have lower priority",
    )
    scheduler_parser.set_defaults(func=run_scheduler)

    # subcommand 'run-job'; this should not be seen on the command line since
    # it is actually a wrapper script
    run_parser = cmdparser.add_parser("run-job", help=argparse.SUPPRESS)
    run_parser.set_defaults(func=run_job)

    if command_line_options:
        # element 0 plays the role of sys.argv[0]
        args = parser.parse_args(command_line_options[1:])
        args.wrapper_script = command_line_options[0]
    else:
        args = parser.parse_args()
        args.wrapper_script = sys.argv[0]

    # no sub-command selected: only show the usage (print_help returns None)
    if not hasattr(args, "func"):
        return parser.print_help(sys.stderr)

    args.func(args)

    return 0