def setUp(self):
     self.cwd = os.getcwd()
     self.tempdir = TemporaryDirectory()
     bandersnatch.filter.loaded_filter_plugins = defaultdict(list)
     os.chdir(self.tempdir.name)
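This setUp switches the test into a fresh TemporaryDirectory and resets bandersnatch's plugin registry. The matching tearDown is not part of the snippet; a minimal self-contained sketch of the same pattern (hypothetical class name, no bandersnatch dependency) would be:

import os
import unittest
from tempfile import TemporaryDirectory


class TempCwdTestCase(unittest.TestCase):
    def setUp(self):
        self.cwd = os.getcwd()
        self.tempdir = TemporaryDirectory()
        os.chdir(self.tempdir.name)

    def tearDown(self):
        # Restore the original working directory before cleanup; some
        # platforms refuse to delete a directory that is still the cwd.
        os.chdir(self.cwd)
        self.tempdir.cleanup()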
Example #2
def _generic_test(func,
                  in_,
                  out,
                  wrapped_eq=eq,
                  pb_mg_eq=eq,
                  pb_bytes_eq=eq,
                  dict_eq=eq,
                  json_eq=eq,
                  preload=None,
                  reqs=None,
                  skip=None):
    '''Reusable wrap test routine with swappable equality functions'''

    model = Model(transform=func)
    model_name = 'my-model'

    with TemporaryDirectory() as tdir:
        with _dump_model(model, model_name, reqs) as dump_dir:
            _copy_dir(dump_dir, tdir, model_name)

        if preload is not None:
            preload()

        copied_dump_dir = path_join(tdir, model_name)
        wrapped_model = load_model(copied_dump_dir)

        TransIn = model.transform.input_type
        TransOut = model.transform.output_type

        trans_in = TransIn(*in_)
        trans_out = TransOut(*out)

        trans_in_pb = _pack_pb_msg(trans_in, wrapped_model.transform._module)
        trans_out_pb = _pack_pb_msg(trans_out, wrapped_model.transform._module)

        trans_in_pb_bytes = trans_in_pb.SerializeToString()
        trans_out_pb_bytes = trans_out_pb.SerializeToString()

        trans_in_dict = MessageToDict(trans_in_pb)
        trans_out_dict = MessageToDict(trans_out_pb)

        trans_in_json = MessageToJson(trans_in_pb, indent=0)
        trans_out_json = MessageToJson(trans_out_pb, indent=0)

        # test all from / as combinations
        for as_method_name, as_data_expected, eq_func in (
                ('as_wrapped', trans_out, wrapped_eq),
                ('as_pb_msg', trans_out_pb, pb_mg_eq),
                ('as_pb_bytes', trans_out_pb_bytes, pb_bytes_eq),
                ('as_dict', trans_out_dict, dict_eq),
                ('as_json', trans_out_json, json_eq)):
            for from_method_name, from_data in (
                    ('from_wrapped', trans_in),
                    ('from_pb_msg', trans_in_pb),
                    ('from_pb_bytes', trans_in_pb_bytes),
                    ('from_dict', trans_in_dict),
                    ('from_json', trans_in_json)):

                if skip is not None and skip(as_method_name, from_method_name):
                    logger.info("Skipping {} -> {}".format(
                        from_method_name, as_method_name))
                    continue

                from_method = getattr(wrapped_model.transform,
                                      from_method_name)
                resp = from_method(from_data)
                as_data_method = getattr(resp, as_method_name)
                as_data = as_data_method()
                assert eq_func(as_data, as_data_expected)
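The hooks passed into _generic_test are plain callables: each *_eq hook receives (actual, expected) and returns a bool, and skip receives the as_* and from_* method names in that order. Two illustrative helpers (hypothetical, not part of the original test module):

import numpy as np


def allclose_eq(actual, expected):
    # Equality hook tolerant of floating-point noise, e.g. for models whose
    # wrapped or dict payloads carry float arrays.
    return np.allclose(np.asarray(actual), np.asarray(expected))


def skip_lossy_combo(as_method_name, from_method_name):
    # Skip hook: drop a from/as combination that is known to be lossy for a
    # particular model, e.g. JSON output checked against protobuf-bytes input.
    return (from_method_name, as_method_name) == ('from_pb_bytes', 'as_json')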
Example #3
 def __init__(self,
              training_path,
              epoch,
              tokenizer,
              num_data_epochs,
              reduce_memory=False):
     self.vocab = tokenizer.vocab
     self.tokenizer = tokenizer
     self.epoch = epoch
     self.data_epoch = epoch % num_data_epochs
     data_file = training_path / f"epoch_{self.data_epoch}.json"
     metrics_file = training_path / f"epoch_{self.data_epoch}_metrics.json"
     assert data_file.is_file() and metrics_file.is_file()
     metrics = json.loads(metrics_file.read_text())
     num_samples = metrics['num_training_examples']
     seq_len = metrics['max_seq_len']
     self.temp_dir = None
     self.working_dir = None
     if reduce_memory:
         self.temp_dir = TemporaryDirectory()
         self.working_dir = Path(self.temp_dir.name)
         input_ids = np.memmap(filename=self.working_dir /
                               'input_ids.memmap',
                               mode='w+',
                               dtype=np.int32,
                               shape=(num_samples, seq_len))
         input_masks = np.memmap(filename=self.working_dir /
                                 'input_masks.memmap',
                                 shape=(num_samples, seq_len),
                                 mode='w+',
                                 dtype=bool)
         segment_ids = np.memmap(filename=self.working_dir /
                                 'segment_ids.memmap',
                                 shape=(num_samples, seq_len),
                                 mode='w+',
                                 dtype=bool)
         lm_label_ids = np.memmap(filename=self.working_dir /
                                  'lm_label_ids.memmap',
                                  shape=(num_samples, seq_len),
                                  mode='w+',
                                  dtype=np.int32)
         lm_label_ids[:] = -1
         is_nexts = np.memmap(filename=self.working_dir / 'is_nexts.memmap',
                              shape=(num_samples, ),
                              mode='w+',
                              dtype=bool)
     else:
         input_ids = np.zeros(shape=(num_samples, seq_len), dtype=np.int32)
         input_masks = np.zeros(shape=(num_samples, seq_len), dtype=bool)
         segment_ids = np.zeros(shape=(num_samples, seq_len), dtype=bool)
         lm_label_ids = np.full(shape=(num_samples, seq_len),
                                dtype=np.int32,
                                fill_value=-1)
         is_nexts = np.zeros(shape=(num_samples, ), dtype=bool)
     logging.info(f"Loading training examples for epoch {epoch}")
     with data_file.open() as f:
         for i, line in enumerate(
                 tqdm(f, total=num_samples, desc="Training examples")):
             line = line.strip()
             example = json.loads(line)
             features = convert_example_to_features(example, tokenizer,
                                                    seq_len)
             input_ids[i] = features.input_ids
             segment_ids[i] = features.segment_ids
             input_masks[i] = features.input_mask
             lm_label_ids[i] = features.lm_label_ids
             is_nexts[i] = features.is_next
     assert i == num_samples - 1  # Assert that the sample count metric was true
     logging.info("Loading complete!")
     self.num_samples = num_samples
     self.seq_len = seq_len
     self.input_ids = input_ids
     self.input_masks = input_masks
     self.segment_ids = segment_ids
     self.lm_label_ids = lm_label_ids
     self.is_nexts = is_nexts
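The constructor above only loads the arrays; the usual Dataset companion methods are not part of the snippet. A minimal sketch of what they typically look like for this class, assuming torch is imported alongside numpy and the attribute names set above:

 def __len__(self):
     return self.num_samples

 def __getitem__(self, item):
     # Materialize one row of each (possibly memory-mapped) array as tensors.
     return (torch.tensor(self.input_ids[item].astype(np.int64)),
             torch.tensor(self.input_masks[item].astype(np.int64)),
             torch.tensor(self.segment_ids[item].astype(np.int64)),
             torch.tensor(self.lm_label_ids[item].astype(np.int64)),
             torch.tensor(int(self.is_nexts[item])))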
Example #4
def shapefile(gdf_with_data):
    with TemporaryDirectory() as d:
        filepath = pathlib.Path(d) / "temp.shp"
        filename = str(filepath.absolute())
        gdf_with_data.to_file(filename)
        yield filename
Example #5
 def setUpClass(cls):
     cls.pdf_path = pdf_path
     cls.temp = TemporaryDirectory()
Example #6
import os
import shutil
import subprocess as sp
from pathlib import Path
from tempfile import TemporaryDirectory

# Disable etelemetry during doc builds
os.environ["NIPYPE_NO_ET"] = "1"

conf_py = Path(__file__)

example_dir = conf_py.parent / 'users' / 'examples'
shutil.rmtree(example_dir, ignore_errors=True)
example_dir.mkdir(parents=True)
python_dir = conf_py.parent / "_static" / "python"
shutil.rmtree(python_dir, ignore_errors=True)

ex2rst = str(conf_py.parent.parent / "tools" / "ex2rst")

with TemporaryDirectory() as tmpdir:
    sp.run([
        "git", "clone", "--depth", "1",
        "https://github.com/niflows/nipype1-examples.git", tmpdir
    ],
           check=True)
    source_dir = Path(tmpdir) / "package" / "niflow" / "nipype1" / "examples"
    shutil.copytree(source_dir, python_dir)

sp.run([
    "python", ex2rst, "--outdir",
    str(example_dir),
    str(python_dir), "-x",
    str(python_dir / "test_spm.py"), "-x",
    str(python_dir / "__init__.py"), "-x",
    str(python_dir / "cli.py")
Example #7
 def __init__(self, url, config_context):
     self._tempdir = TemporaryDirectory()
     self._path = Path(self._tempdir.name)
     self._repo = GitRepo.clone_from(url[4:], self._tempdir.name)
     super().__init__(url, config_context)
Example #8
def tmp_path(request) -> Path:
    with TemporaryDirectory(prefix=request.node.name) as d:
        yield Path(d)
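Because the fixture yields from inside the with block, the directory and anything a test writes into it are removed when the test finishes, and the prefix taken from request.node.name makes stray directories easy to attribute while debugging. A usage sketch, assuming the fixture above is registered with pytest (the decorator is not shown in the listing) and noting that it shadows pytest's built-in tmp_path:

def test_writes_into_tmp_path(tmp_path: Path) -> None:
    target = tmp_path / "output.txt"
    target.write_text("hello")
    assert target.read_text() == "hello"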
Example #9
def _open_openml_url(openml_path: str,
                     data_home: Optional[str],
                     n_retries: int = 3,
                     delay: float = 1.0):
    """
    Returns a resource from OpenML.org. Caches it to data_home if required.

    Parameters
    ----------
    openml_path : str
        OpenML URL that will be accessed. This will be prefixed with
        _OPENML_PREFIX.

    data_home : str
        Directory to which the files will be cached. If None, no caching will
        be applied.

    n_retries : int, default=3
        Number of retries when HTTP errors are encountered. Errors with status
        code 412 won't be retried, as they represent OpenML generic errors.

    delay : float, default=1.0
        Number of seconds between retries.

    Returns
    -------
    result : stream
        A stream to the OpenML resource.
    """
    def is_gzip_encoded(_fsrc):
        return _fsrc.info().get("Content-Encoding", "") == "gzip"

    req = Request(_OPENML_PREFIX + openml_path)
    req.add_header("Accept-encoding", "gzip")

    if data_home is None:
        fsrc = _retry_on_network_error(n_retries, delay,
                                       req.full_url)(urlopen)(req,
                                                              timeout=delay)
        if is_gzip_encoded(fsrc):
            return gzip.GzipFile(fileobj=fsrc, mode="rb")
        return fsrc

    local_path = _get_local_path(openml_path, data_home)
    dir_name, file_name = os.path.split(local_path)
    if not os.path.exists(local_path):
        os.makedirs(dir_name, exist_ok=True)
        try:
            # Create a tmpdir as a subfolder of dir_name where the final file will
            # be moved to if the download is successful. This guarantees that the
            # renaming operation to the final location is atomic to ensure the
            # concurrence safety of the dataset caching mechanism.
            with TemporaryDirectory(dir=dir_name) as tmpdir:
                with closing(
                        _retry_on_network_error(n_retries, delay,
                                                req.full_url)(urlopen)(
                                                    req,
                                                    timeout=delay)) as fsrc:
                    opener: Callable
                    if is_gzip_encoded(fsrc):
                        opener = open
                    else:
                        opener = gzip.GzipFile
                    with opener(os.path.join(tmpdir, file_name), "wb") as fdst:
                        shutil.copyfileobj(fsrc, fdst)
                shutil.move(fdst.name, local_path)
        except Exception:
            if os.path.exists(local_path):
                os.unlink(local_path)
            raise

    # XXX: First time, decompression will not be necessary (by using fsrc), but
    # it will happen nonetheless
    return gzip.GzipFile(local_path, "rb")
Example #10
    def testdoDownloadPackage(self):
        """Local download tests"""

        archive = self.__getArchiveInstance({})
        archive.wantDownload(True)
        self.assertTrue(archive.canDownloadLocal())

        with TemporaryDirectory() as tmp:
            audit = os.path.join(tmp, "audit.json.gz")
            content = os.path.join(tmp, "workspace")
            self.assertTrue(
                run(
                    archive.downloadPackage(DummyStep(), DOWNLOAD_ARITFACT,
                                            audit, content)))
            self.__testWorkspace(audit, content)
            self.assertEqual(
                run(
                    archive.downloadLocalLiveBuildId(DummyStep(),
                                                     DOWNLOAD_ARITFACT)),
                b'\x00' * 20)

        # non-existent and error cases
        with TemporaryDirectory() as tmp:
            audit = os.path.join(tmp, "audit.json.gz")
            content = os.path.join(tmp, "workspace")
            self.assertFalse(
                run(
                    archive.downloadPackage(DummyStep(), NOT_EXISTS_ARTIFACT,
                                            audit, content)))
            self.assertFalse(
                run(
                    archive.downloadPackage(DummyStep(),
                                            ERROR_DOWNLOAD_ARTIFACT, audit,
                                            content)))
            self.assertFalse(
                run(
                    archive.downloadPackage(DummyStep(), ERROR_UPLOAD_ARTIFACT,
                                            audit, content)))
            self.assertEqual(
                run(
                    archive.downloadLocalLiveBuildId(DummyStep(),
                                                     NOT_EXISTS_ARTIFACT)),
                None)
            self.assertEqual(
                run(
                    archive.downloadLocalLiveBuildId(DummyStep(),
                                                     ERROR_DOWNLOAD_ARTIFACT)),
                None)
            self.assertEqual(
                run(
                    archive.downloadLocalLiveBuildId(DummyStep(),
                                                     ERROR_UPLOAD_ARTIFACT)),
                None)
            with self.assertRaises(BuildError):
                run(
                    archive.downloadPackage(DummyStep(), BROKEN_ARTIFACT,
                                            audit, content))
            with self.assertRaises(BuildError):
                run(
                    archive.downloadPackage(DummyStep(),
                                            WRONG_VERSION_ARTIFACT, audit,
                                            content))
Example #11
def train_experiment(engine=None):
    with TemporaryDirectory() as logdir:
        # sample data
        num_samples, num_features, num_classes1, num_classes2 = int(1e4), int(1e1), 4, 10
        X = torch.rand(num_samples, num_features)
        y1 = (torch.rand(num_samples) * num_classes1).to(torch.int64)
        y2 = (torch.rand(num_samples) * num_classes2).to(torch.int64)

        # pytorch loaders
        dataset = TensorDataset(X, y1, y2)
        loader = DataLoader(dataset, batch_size=32, num_workers=1)
        loaders = {"train": loader, "valid": loader}

        # model, criterion, optimizer, scheduler
        model = CustomModule(num_features, num_classes1, num_classes2)
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters())
        scheduler = optim.lr_scheduler.MultiStepLR(optimizer, [2])

        callbacks = [
            dl.CriterionCallback(metric_key="loss1",
                                 input_key="logits1",
                                 target_key="targets1"),
            dl.CriterionCallback(metric_key="loss2",
                                 input_key="logits2",
                                 target_key="targets2"),
            dl.MetricAggregationCallback(metric_key="loss",
                                         metrics=["loss1", "loss2"],
                                         mode="mean"),
            dl.BackwardCallback(metric_key="loss"),
            dl.OptimizerCallback(metric_key="loss"),
            dl.SchedulerCallback(),
            dl.AccuracyCallback(
                input_key="logits1",
                target_key="targets1",
                num_classes=num_classes1,
                prefix="one_",
            ),
            dl.AccuracyCallback(
                input_key="logits2",
                target_key="targets2",
                num_classes=num_classes2,
                prefix="two_",
            ),
            dl.CheckpointCallback(
                "./logs/one",
                loader_key="valid",
                metric_key="one_accuracy01",
                minimize=False,
                topk=1,
            ),
            dl.CheckpointCallback(
                "./logs/two",
                loader_key="valid",
                metric_key="two_accuracy03",
                minimize=False,
                topk=3,
            ),
        ]
        if SETTINGS.ml_required:
            # catalyst[ml] required
            callbacks.append(
                dl.ConfusionMatrixCallback(
                    input_key="logits1",
                    target_key="targets1",
                    num_classes=num_classes1,
                    prefix="one_cm",
                ))
            # catalyst[ml] required
            callbacks.append(
                dl.ConfusionMatrixCallback(
                    input_key="logits2",
                    target_key="targets2",
                    num_classes=num_classes2,
                    prefix="two_cm",
                ))

        # model training
        runner = CustomRunner()
        runner.train(
            engine=engine,
            model=model,
            criterion=criterion,
            optimizer=optimizer,
            scheduler=scheduler,
            loaders=loaders,
            num_epochs=1,
            verbose=False,
            callbacks=callbacks,
            loggers={
                "console": dl.ConsoleLogger(),
                "tb": dl.TensorboardLogger("./logs/tb"),
            },
        )
Example #12
 def test_filter_json(self):
     """
     Test that the ``json`` filter can be used.
     """
     # We are going to overwrite the template file, so we have to disable the
     # cache in order to avoid problems.
     engine = JinjaEngine({'cache_enabled': False})
     with TemporaryDirectory() as tmpdir:
         # We have to generate a template file that can be read by the
         # template engine.
         tmpdir_path = pathlib.Path(tmpdir)
         template_path = tmpdir_path / 'test.jinja'
         _write_file(
             template_path, """
             {{ value | json }}
             """)
         value = OrderedDict()
         value['def'] = 456
         value['abc'] = 123
         self.assertEqual(
             '{"def": 456, "abc": 123}',
             engine.render(str(template_path), {'value': value}))
         # We also want to test the sort_keys option.
         _write_file(
             template_path, """
             {{ value | json(sort_keys=False) }}
             """)
         self.assertEqual(
             '{"def": 456, "abc": 123}',
             engine.render(str(template_path), {'value': value}))
         _write_file(
             template_path, """
             {{ value | json(sort_keys=True) }}
             """)
         self.assertEqual(
             '{"abc": 123, "def": 456}',
             engine.render(str(template_path), {'value': value}))
         # And we want to test the indent option (indent=None is the default)
         _write_file(
             template_path, """
             {{ value | json }}
             """)
         self.assertEqual(
             '[1, 2]', engine.render(str(template_path), {'value': [1, 2]}))
         _write_file(
             template_path, """
             {{ value | json(indent=None) }}
             """)
         self.assertEqual(
             '[1, 2]', engine.render(str(template_path), {'value': [1, 2]}))
         _write_file(
             template_path, """
             {{ value | json(indent=0) }}
             """)
         self.assertEqual(
             '[\n1,\n2\n]',
             engine.render(str(template_path), {'value': [1, 2]}))
         _write_file(
             template_path, """
             {{ value | json(indent=2) }}
             """)
         self.assertEqual(
             '[\n  1,\n  2\n]',
             engine.render(str(template_path), {'value': [1, 2]}))
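The JinjaEngine tests above and below rely on a module-level _write_file helper that is not included in the listing. Given that the assertions compare against exact strings, the helper presumably dedents and strips the triple-quoted template text before writing it; a sketch along those lines:

import textwrap


def _write_file(path, content):
    # Normalize the indented triple-quoted template and write it to disk so
    # the engine picks up the new content on the next render (caching is
    # disabled in these tests).
    with open(str(path), 'w') as file:
        file.write(textwrap.dedent(content).strip())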
Example #13
 def test_config_provide_transform_functions(self):
     """
     Test the ``provide_transform_functions`` configuration option.
     """
     with TemporaryDirectory() as tmpdir:
         # We have to generate a template file that can be read by the
         # template engine.
         tmpdir_path = pathlib.Path(tmpdir)
         template_path = tmpdir_path / 'test.jinja'
         _write_file(
             template_path, """
             {{ transform['string.to_upper']('Some text') }}
             """)
         # We disable the cache for this test because it causes problems when
         # we rapidly change files.
         engine = JinjaEngine({'cache_enabled': False})
         self.assertEqual('SOME TEXT',
                          engine.render(str(template_path), {}))
         # Explicitly setting provide_transform_functions should not make a
         # difference.
         engine = JinjaEngine({
             'cache_enabled': False,
             'provide_transform_functions': True
         })
         self.assertEqual('SOME TEXT',
                          engine.render(str(template_path), {}))
         # If we provide our own transform object in the context, this should
         # hide the transform object provided by the template engine because
         # context objects override globals.
         _write_file(
             template_path, """
             {{ transform }}
             """)
         self.assertEqual(
             'text from context',
             engine.render(str(template_path),
                           {'transform': 'text from context'}))
         # The "is defined" check should succeed if there is a transform
         # object, and fail if there is none.
         _write_file(
             template_path, """
             {{ transform is defined }}
             """)
         self.assertEqual('True', engine.render(str(template_path), {}))
         # Now, we set provide_transform_functions to False, which should
         # remove the transform object from the context.
         engine = JinjaEngine({
             'cache_enabled': False,
             'provide_transform_functions': False
         })
         self.assertEqual('False', engine.render(str(template_path), {}))
         # If we provide our own transform object, that object should be
         # available.
         _write_file(
             template_path, """
             {{ transform }}
             """)
         self.assertEqual(
             'text from context',
             engine.render(str(template_path),
                           {'transform': 'text from context'}))
Example #14
 def setUp(self):
     self._temp_dir = TemporaryDirectory()
Example #15
    def __call__(self) -> bool:
        resolver = get_resolver(reqs=self.args.name)
        name = next(iter(resolver.graph.get_layer(0))).dependencies[0].name

        command = self.config.get('command')
        if not command:
            command = 'python'
        if isinstance(command, str):
            command = shlex.split(command)

        with TemporaryDirectory() as base_path:
            base_path = Path(base_path)

            # make venv
            venv = VEnv(path=base_path)
            if venv.exists():
                self.logger.error('already installed', extra=dict(package=name))
                return False
            python = get_python(self.config)
            self.logger.info('creating venv...', extra=dict(
                venv=str(venv.path),
                python=str(python.path),
            ))
            venv.create(python_path=python.path)

            # install
            ok = self._install(resolver=resolver, python_path=venv.python_path)
            if not ok:
                return False

            # install executable
            executable = venv.bin_path / command[0]
            if not executable.exists():
                self.logger.warning('executable is not found in venv, trying to install...', extra=dict(
                    executable=command[0],
                ))
                ok = self._install(
                    resolver=get_resolver(reqs=command[:1]),
                    python_path=venv.python_path,
                )
                if not ok:
                    return False
            if not executable.exists():
                self.logger.error('package installed, but executable is not found')
                return False

            # make startup script to import installed packages
            startup_path = base_path / '_startup.py'
            packages = self._get_startup_packages(lib_path=venv.lib_path, packages=self.args.name)
            if not packages:
                self.logger.error('cannot find any packages')
                return False
            startup_path.write_text('import ' + ', '.join(sorted(packages)))

            # run
            self.logger.info('running...')
            with override_env_vars({'PYTHONSTARTUP': str(startup_path)}):
                result = subprocess.run([str(executable)] + command[1:])
            if result.returncode != 0:
                self.logger.error('command failed', extra=dict(code=result.returncode))
                return False

            return True
Example #16
 def setUp(self):
     self.tempdir = TemporaryDirectory()
     self.config_path = path_join(self.tempdir.name, 'config.yaml')
Example #17
def main(
    output_file: str,
    entry_point: Optional[str],
    console_script: Optional[str],
    python: Optional[str],
    site_packages: Optional[str],
    compressed: bool,
    compile_pyc: bool,
    extend_pythonpath: bool,
    reproducible: bool,
    no_modify: bool,
    preamble: Optional[str],
    pip_args: List[str],
) -> None:
    """
    Shiv is a command line utility for building fully self-contained Python zipapps
    as outlined in PEP 441, but with all their dependencies included!
    """

    if not pip_args and not site_packages:
        sys.exit(NO_PIP_ARGS_OR_SITE_PACKAGES)

    if output_file is None:
        sys.exit(NO_OUTFILE)

    # check for disallowed pip arguments
    for disallowed in DISALLOWED_ARGS:
        for supplied_arg in pip_args:
            if supplied_arg in disallowed:
                sys.exit(
                    DISALLOWED_PIP_ARGS.format(
                        arg=supplied_arg, reason=DISALLOWED_ARGS[disallowed]))

    sources: List[Path] = []

    with TemporaryDirectory() as tmp_site_packages:

        # If both site_packages and pip_args are present, we need to copy the site_packages
        # dir into our staging area (tmp_site_packages) as pip may modify the contents.
        if site_packages:
            if pip_args:
                for sp in site_packages:
                    copytree(Path(sp), Path(tmp_site_packages))
            else:
                sources.extend([Path(p).expanduser() for p in site_packages])

        if pip_args:
            # Install dependencies into staged site-packages.
            pip.install(["--target", tmp_site_packages] + list(pip_args))

        if preamble:
            bin_dir = Path(tmp_site_packages, "bin")
            bin_dir.mkdir(exist_ok=True)
            shutil.copy(
                Path(preamble).absolute(), bin_dir / Path(preamble).name)

        sources.append(Path(tmp_site_packages).absolute())

        if no_modify:
            # if no_modify is specified, we need to build a map of source files and their
            # sha256 hashes, to be checked at runtime:
            hashes = {}

            for source in sources:
                for path in source.rglob("**/*.py"):
                    hashes[str(path.relative_to(source))] = hashlib.sha256(
                        path.read_bytes()).hexdigest()

        # if entry_point is a console script, get the callable
        if entry_point is None and console_script is not None:
            try:
                entry_point = find_entry_point(sources, console_script)
            except KeyError:
                if not console_script_exists(sources, console_script):
                    sys.exit(NO_ENTRY_POINT.format(entry_point=console_script))

        # Some projects need reproducible artifacts, so they can use SOURCE_DATE_EPOCH
        # environment variable to specify the timestamps in the zipapp.
        timestamp = int(
            os.environ.get(
                SOURCE_DATE_EPOCH_ENV,
                SOURCE_DATE_EPOCH_DEFAULT if reproducible else time.time()))

        # create runtime environment metadata
        env = Environment(
            built_at=datetime.utcfromtimestamp(timestamp).strftime(
                BUILD_AT_TIMESTAMP_FORMAT),
            entry_point=entry_point,
            script=console_script,
            compile_pyc=compile_pyc,
            extend_pythonpath=extend_pythonpath,
            shiv_version=__version__,
            no_modify=no_modify,
            reproducible=reproducible,
            preamble=Path(preamble).name if preamble else None,
        )

        if no_modify:
            env.hashes = hashes

        # create the zip
        builder.create_archive(
            sources,
            target=Path(output_file).expanduser(),
            interpreter=python or get_interpreter_path(),
            main="_bootstrap:bootstrap",
            env=env,
            compressed=compressed,
        )
Example #18
    def start_python_dataflow(  # pylint: disable=too-many-arguments
        self,
        job_name: str,
        variables: dict,
        dataflow: str,
        py_options: List[str],
        project_id: str,
        py_interpreter: str = "python3",
        py_requirements: Optional[List[str]] = None,
        py_system_site_packages: bool = False,
        append_job_name: bool = True,
        on_new_job_id_callback: Optional[Callable[[str], None]] = None,
        location: str = DEFAULT_DATAFLOW_LOCATION,
    ):
        """
        Starts Dataflow job.

        :param job_name: The name of the job.
        :type job_name: str
        :param variables: Variables passed to the job.
        :type variables: Dict
        :param dataflow: Name of the Dataflow process.
        :type dataflow: str
        :param py_options: Additional options.
        :type py_options: List[str]
        :param project_id: The ID of the GCP project that owns the job.
            If set to ``None`` or missing, the default project_id from the GCP connection is used.
        :type project_id: Optional[str]
        :param py_interpreter: Python version of the beam pipeline.
            If None, this defaults to python3.
            To track python versions supported by beam and related
            issues check: https://issues.apache.org/jira/browse/BEAM-1251
        :param py_requirements: Additional python package(s) to install.
            If a value is passed to this parameter, a new virtual environment will be created with
            the additional packages installed.

            You could also install the apache-beam package if it is not installed on your system or you want
            to use a different version.
        :type py_requirements: List[str]
        :param py_system_site_packages: Whether to include system_site_packages in your virtualenv.
            See virtualenv documentation for more information.

            This option is only relevant if the ``py_requirements`` parameter is not None.
        :type py_system_site_packages: bool
        :param append_job_name: True if unique suffix has to be appended to job name.
        :type append_job_name: bool
        :param project_id: Optional, the Google Cloud project ID in which to start a job.
            If set to None or missing, the default project_id from the Google Cloud connection is used.
        :param on_new_job_id_callback: Callback called when the job ID is known.
        :type on_new_job_id_callback: callable
        :param location: Job location.
        :type location: str
        """
        name = self._build_dataflow_job_name(job_name, append_job_name)
        variables['job_name'] = name
        variables['region'] = location

        def label_formatter(labels_dict):
            return [
                f'--labels={key}={value}'
                for key, value in labels_dict.items()
            ]

        if py_requirements is not None:
            if not py_requirements and not py_system_site_packages:
                warning_invalid_environment = textwrap.dedent("""\
                    Invalid method invocation. You have disabled inclusion of system packages and empty list
                    required for installation, so it is not possible to create a valid virtual environment.
                    In the virtual environment, apache-beam package must be installed for your job to be \
                    executed. To fix this problem:
                    * install apache-beam on the system, then set parameter py_system_site_packages to True,
                    * add apache-beam to the list of required packages in parameter py_requirements.
                    """)
                raise AirflowException(warning_invalid_environment)

            with TemporaryDirectory(prefix='dataflow-venv') as tmp_dir:
                py_interpreter = prepare_virtualenv(
                    venv_directory=tmp_dir,
                    python_bin=py_interpreter,
                    system_site_packages=py_system_site_packages,
                    requirements=py_requirements,
                )
                command_prefix = [py_interpreter] + py_options + [dataflow]

                self._start_dataflow(
                    variables=variables,
                    name=name,
                    command_prefix=command_prefix,
                    label_formatter=label_formatter,
                    project_id=project_id,
                    on_new_job_id_callback=on_new_job_id_callback,
                    location=location,
                )
        else:
            command_prefix = [py_interpreter] + py_options + [dataflow]

            self._start_dataflow(
                variables=variables,
                name=name,
                command_prefix=command_prefix,
                label_formatter=label_formatter,
                project_id=project_id,
                on_new_job_id_callback=on_new_job_id_callback,
                location=location,
            )
Example #19
    def execute(self, fcrepo, args):
        start_time = datetime.now().timestamp()
        count = 0
        errors = 0
        total = len(args.uris)
        try:
            serializer_class = SERIALIZER_CLASSES[args.format]
        except KeyError:
            logger.error(f'Unknown format: {args.format}')
            raise FailureException()

        if args.export_binaries and args.binary_types is not None:
            # filter files by their MIME type
            def mime_type_filter(file):
                return str(file.mimetype) in args.binary_types.split(',')
        else:
            # default filter is None; in this case filter() will return
            # all items that evaluate to true
            mime_type_filter = None

        logger.info(f'Export destination: {args.output_dest}')

        # create a bag in a temporary directory to hold exported items
        temp_dir = TemporaryDirectory()
        bag = make_bag(temp_dir.name)

        export_dir = os.path.join(temp_dir.name, 'data')
        serializer = serializer_class(directory=export_dir,
                                      public_uri_template=args.uri_template)
        for uri in args.uris:
            try:
                logger.info(f'Exporting item {count + 1}/{total}: {uri}')

                # derive an item-level directory name from the URI
                # currently this is hard-coded to look for a UUID
                # TODO: expand to other types of unique ids?
                match = UUID_REGEX.search(uri)
                if match is None:
                    raise DataReadException(f'No UUID found in {uri}')
                item_dir = match[0]

                graph = fcrepo.get_graph(uri)
                model_class = detect_resource_class(graph, uri, fallback=Item)
                obj = model_class.from_graph(graph, uri)

                if args.export_binaries:
                    logger.info(f'Gathering binaries for {uri}')
                    binaries = list(
                        filter(mime_type_filter, obj.gather_files(fcrepo)))
                    total_size = sum(int(file.size[0]) for file in binaries)
                    size, unit = format_size(total_size)
                    logger.info(
                        f'Total size of binaries: {round(size, 2)} {unit}')
                else:
                    binaries = None

                serializer.write(obj, files=binaries, binaries_dir=item_dir)

                if binaries is not None:
                    binaries_dir = os.path.join(export_dir, item_dir)
                    os.makedirs(binaries_dir, exist_ok=True)
                    for file in binaries:
                        response = fcrepo.head(file.uri)
                        accessed = parsedate(response.headers['Date'])
                        modified = parsedate(response.headers['Last-Modified'])

                        binary_filename = os.path.join(binaries_dir,
                                                       str(file.filename))
                        with open(binary_filename, mode='wb') as binary:
                            with file.source as stream:
                                for chunk in stream:
                                    binary.write(chunk)

                        # update the atime and mtime of the file to reflect the time of the
                        # HTTP request and the resource's last-modified time in the repo
                        os.utime(binary_filename,
                                 times=(mktime(accessed), mktime(modified)))
                        logger.debug(f'Copied {file.uri} to {binary.name}')

                count += 1

            except DataReadException as e:
                # log the failure, but continue to attempt to export the rest of the URIs
                logger.error(f'Export of {uri} failed: {e}')
                errors += 1
            except (RESTAPIException, ConnectionError) as e:
                # log the failure, but continue to attempt to export the rest of the URIs
                logger.error(f'Unable to retrieve {uri}: {e}')
                errors += 1

            # update the status
            now = datetime.now().timestamp()
            yield {
                'time': {
                    'started': start_time,
                    'now': now,
                    'elapsed': now - start_time
                },
                'count': {
                    'total': total,
                    'exported': count,
                    'errors': errors
                }
            }

        try:
            serializer.finish()
        except EmptyItemListError:
            logger.error("No items could be exported; skipping writing file")

        logger.info(f'Exported {count} of {total} items')

        # save the BagIt bag to send to the output destination
        bag.save(manifests=True)

        # parse the output destination to determine where to send the export
        if args.output_dest.startswith('sftp:'):
            # send over SFTP to a remote host
            sftp_uri = urlsplit(args.output_dest)
            ssh_client = get_ssh_client(sftp_uri, key_filename=args.key)
            try:
                sftp_client = SFTPClient.from_transport(
                    ssh_client.get_transport())
                root, ext = splitext(basename(sftp_uri.path))
                destination = sftp_client.open(sftp_uri.path, mode='w')
            except SSHException as e:
                raise FailureException(str(e)) from e
        else:
            # send to a local file
            zip_filename = args.output_dest
            root, ext = splitext(basename(zip_filename))
            destination = zip_filename

        # write out a single ZIP file of the whole bag
        compress_bag(bag, destination, root)

        self.result = {
            'type': 'export_complete' if count == total else 'partial_export',
            'content_type': serializer.content_type,
            'file_extension': serializer.file_extension,
            'count': {
                'total': total,
                'exported': count,
                'errors': errors
            }
        }
Example #20
 def setUp(self):
     super(RestoreTestCase, self).setUp()
     self.include = TemporaryDirectory()
     generic.create_test_files(self.sid.pathBackup(self.include.name))
Example #21
 def setUp(self):
     self.tempdir = TemporaryDirectory()
Example #22
def compile_2k_merge(path):
    vcf = setup(path)
    vcfs = [comb.transform_gvcf(vcf)] * COMBINE_GVCF_MAX
    combined = [comb.combine_gvcfs(vcfs)] * 20
    with TemporaryDirectory() as tmpdir:
        hl.experimental.write_matrix_tables(combined, os.path.join(tmpdir, 'combiner-multi-write'), overwrite=True)
Example #23
def target_file():
    with TemporaryDirectory() as d:
        filepath = pathlib.Path(d) / "temp.shp"
        filename = str(filepath.absolute())
        yield filename
Example #24
    def graph(self):
        """
        Returns a graphviz.Digraph for the directed graph the inheriting strategy represents.

        The graph can be rendered with:
        ``mystrategy.graph.render("filename") # renders to filename.png``
        """
        from graphviz import Digraph

        if (self._graph_cache['graph']
                and self._graph_cache['path'] == self.path):
            return self._graph_cache['graph']

        if not self._graph_cache['tempdir']:
            self._graph_cache['tempdir'] = TemporaryDirectory()

        dg = Digraph(
            filename=os.path.join(self._graph_cache['tempdir'].name, 'graph'),
            format='png',
        )

        edges = []

        dg.attr('node', style='filled', fillcolor='lightblue2', penwidth='1')
        dg.attr('edge', style='solid')

        for index, node_name in enumerate(self.path):
            attrs = {}

            if node_name == self.path[-1]:
                attrs = {'penwidth': '2'}

            dg.node(node_name, **attrs)

            if index < len(self.path) - 1:
                edges.append((
                    node_name,
                    self.path[index + 1],
                ))
                dg.edge(*edges[-1])

        dg.attr('node',
                style='filled',
                color='lightgrey',
                fillcolor='lightgrey')

        dg.attr('edge', style='dashed', arrowhead='empty')

        for node_name in self.states:
            if node_name not in self.path:
                dg.node(node_name)

            for edge in self.states[node_name]['dependencies']:
                if (
                        edge,
                        node_name,
                ) in edges:
                    continue

                dg.edge(edge, node_name)

        self._graph_cache['graph'] = dg

        return dg
Example #25
import os

import pytest
import subprocess
from importlib.machinery import SourceFileLoader
from datetime import datetime, timedelta
from tempfile import TemporaryDirectory
from unittest import mock
from unittest.mock import call

relpath_updater_script = "../sdw_updater_gui/Updater.py"
path_to_script = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                              relpath_updater_script)
updater = SourceFileLoader("Updater", path_to_script).load_module()
from Updater import UpdateStatus  # noqa: E402
from Updater import current_templates  # noqa: E402

temp_dir = TemporaryDirectory().name

debian_based_vms = [
    "sd-app",
    "sd-log",
    "sd-viewer",
    "sd-gpg",
    "sd-proxy",
    "sd-whonix",
    "sd-devices",
]

TEST_RESULTS_OK = {
    "dom0": UpdateStatus.UPDATES_OK,
    "fedora": UpdateStatus.UPDATES_OK,
    "sd-app": UpdateStatus.UPDATES_OK,
Example #26
def convert(inputPath, outputPath=None, **kwargs):
    """
    Take a source input file and output a pyramidal tiff file.

    :param inputPath: the path to the input file or base file of a set.
    :param outputPath: the path of the output file.

    Optional parameters that can be specified in kwargs:

    :param tileSize: the horizontal and vertical tile size.
    :param format: one of 'tiff' or 'aperio'.  Default is 'tiff'.
    :param onlyFrame: None for all frames or the 0-based frame number to just
        convert a single frame of the source.
    :param compression: one of 'jpeg', 'deflate' (zip), 'lzw', 'packbits',
        'zstd', or 'none'.
    :param quality: a jpeg or webp quality passed to vips.  0 is small, 100 is
        high quality.  90 or above is recommended.  For webp, 0 is lossless.
    :param level: compression level for zstd, 1-22 (default is 10) and deflate,
        1-9.
    :param predictor: one of 'none', 'horizontal', 'float', or 'yes' used for
        lzw and deflate.  Default is horizontal for non-geospatial data and yes
        for geospatial.
    :param psnr: psnr value for jp2k, higher results in larger files.  0 is
        lossless.
    :param cr: jp2k compression ratio.  1 is lossless, 100 will try to make
        a file 1% the size of the original, etc.
    :param subifds: if True (the default), when creating a multi-frame file,
        store lower resolution tiles in sub-ifds.  If False, store all data in
        primary ifds.
    :param overwrite: if not True, throw an exception if the output path
        already exists.

    Additional optional parameters:

    :param geospatial: if not None, a boolean indicating if this file is
        geospatial.  If not specified or None, this will be checked.
    :param _concurrency: the number of cpus to use during conversion.  None to
        use the logical cpu count.

    :returns: outputPath if successful
    """
    if kwargs.get('_concurrency'):
        os.environ['VIPS_CONCURRENCY'] = str(_concurrency_to_value(**kwargs))
    geospatial = kwargs.get('geospatial')
    if geospatial is None:
        geospatial = is_geospatial(inputPath)
    suffix = format_hook('adjust_params', geospatial, kwargs, **kwargs)
    if suffix is False:
        return
    suffix = suffix or ('.tiff' if not geospatial else '.geo.tiff')
    if not outputPath:
        outputPath = os.path.splitext(inputPath)[0] + suffix
        if outputPath.endswith('.geo' + suffix):
            outputPath = outputPath[:len(outputPath) - len(suffix) -
                                    4] + suffix
        if outputPath == inputPath:
            outputPath = (os.path.splitext(inputPath)[0] + '.' +
                          time.strftime('%Y%m%d-%H%M%S') + suffix)
    if os.path.exists(outputPath) and not kwargs.get('overwrite'):
        raise Exception('Output file already exists.')
    try:
        tiffinfo = tifftools.read_tiff(inputPath)
    except Exception:
        tiffinfo = None
    if not kwargs.get('compression', None):
        kwargs = kwargs.copy()
        lossy = _is_lossy(inputPath, tiffinfo)
        logger.debug('Is file lossy: %r', lossy)
        eightbit = _is_eightbit(inputPath, tiffinfo)
        logger.debug('Is file 8 bits per samples: %r', eightbit)
        kwargs['_compression'] = None
        kwargs['compression'] = 'jpeg' if lossy and eightbit else 'lzw'
    if geospatial:
        _generate_geotiff(inputPath, outputPath, **kwargs)
    else:
        with TemporaryDirectory() as tempDir:
            tempPath = os.path.join(tempDir, os.path.basename(outputPath))
            lidata = _data_from_large_image(inputPath, tempPath, **kwargs)
            logger.log(logging.DEBUG - 1, 'large_image information for %s: %r',
                       inputPath, lidata)
            if not is_vips(inputPath) and lidata:
                _convert_large_image(inputPath, outputPath, tempPath, lidata,
                                     **kwargs)
            elif _is_multiframe(inputPath):
                _generate_multiframe_tiff(inputPath, outputPath, tempPath,
                                          lidata, **kwargs)
            else:
                _generate_tiff(inputPath, outputPath, tempPath, lidata,
                               **kwargs)
    return outputPath
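A typical call only needs the input path plus whichever keyword options from the docstring apply; for example (hypothetical file names):

# Convert a slide to a pyramidal TIFF with lossy JPEG tiles, replacing any
# previous output; convert() returns the path it actually wrote.
result = convert('slide.svs', 'slide.tiff',
                 compression='jpeg', quality=90, overwrite=True)
print(result)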
Example #27
 def setUp(self):
     self.tmp_dir = TemporaryDirectory()
     self.tmp_dir_path = Path(self.tmp_dir.name)
     os.chdir(self.tmp_dir.name)
     self.repo = self.get_repo()
Example #28
def _create_temp_dir_next_to(path):
    return TemporaryDirectory(dir=str(Path(path).parent),
                              prefix='',
                              suffix='.tmp')
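Creating the staging directory next to the destination (dir=parent) keeps it on the same filesystem, so the final rename is atomic rather than a cross-device copy. A usage sketch built on the helper above (hypothetical function and file names):

import os
from pathlib import Path


def atomic_write_text(path, text):
    # Stage the file in a sibling *.tmp directory, then move it into place;
    # readers see either the old file or the complete new one, never a
    # partially written file.
    with _create_temp_dir_next_to(path) as tmp_dir:
        staged = Path(tmp_dir) / Path(path).name
        staged.write_text(text)
        os.replace(str(staged), str(path))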
Example #29
    def test_trityper2h5(self):
        with TemporaryDirectory() as temporary_directory:

            self.assertFalse(trityper2h5.main(
                "--input exampleTriTyper --output {} --study_name dosage"
                    .format(os.path.join(temporary_directory, "haseh5")).split(" ")))
Example #30
def colorizable_files():
    """populate temp dir with sample files.
    (too hard to emit individual test cases when fixture invoked in mark.parametrize)"""

    with TemporaryDirectory() as tempdir:
        for k, v in _cf.items():

            if v is None:
                continue
            if v.startswith("/"):
                file_path = v
            else:
                file_path = tempdir + "/" + v
            try:
                os.lstat(file_path)
            except FileNotFoundError:
                if file_path.endswith("_dir"):
                    os.mkdir(file_path)
                else:
                    open(file_path, "a").close()
                if k in ("di", "fi"):
                    pass
                elif k == "ex":
                    os.chmod(file_path, stat.S_IRWXU)  # tmpdir on windows need u+w
                elif k == "ln":  # cook ln test case.
                    os.chmod(file_path, stat.S_IRWXU)  # link to *executable* file
                    os.rename(file_path, file_path + "_target")
                    os.symlink(file_path + "_target", file_path)
                elif k == "or":
                    os.rename(file_path, file_path + "_target")
                    os.symlink(file_path + "_target", file_path)
                    os.remove(file_path + "_target")
                elif k == "pi":  # not on Windows
                    os.remove(file_path)
                    os.mkfifo(file_path)
                elif k == "su":
                    os.chmod(file_path, stat.S_ISUID)
                elif k == "sg":
                    os.chmod(file_path, stat.S_ISGID)
                elif k == "st":
                    os.chmod(
                        file_path, stat.S_ISVTX | stat.S_IRUSR | stat.S_IWUSR
                    )  # TempDir requires o:r
                elif k == "tw":
                    os.chmod(
                        file_path,
                        stat.S_ISVTX | stat.S_IWOTH | stat.S_IRUSR | stat.S_IWUSR,
                    )
                elif k == "ow":
                    os.chmod(file_path, stat.S_IWOTH | stat.S_IRUSR | stat.S_IWUSR)
                elif k == "mh":
                    os.rename(file_path, file_path + "_target")
                    os.link(file_path + "_target", file_path)
                else:
                    pass  # cauterize those elseless ifs!

                os.symlink(file_path, file_path + "_symlink")

        yield tempdir

    pass  # tempdir gets cleaned up here.