Esempio n. 1
0
def test_resolve_remote_files(tmpdir):
    """Check how resolve_remote_files rewrites a configuration.

    Two local storages are declared: ``tmp`` rooted at ``tmpdir`` and ``tmp2``
    rooted at ``tmpdir/remote``. Entries ``c`` and ``d`` reach the same file
    through each storage, ``f`` designates the root of ``tmp``, and the
    remaining entries do not reference a declared storage.
    """
    remote_root = tmpdir.join("remote")
    local_root = tmpdir.join("local")
    remote_root.join("dir").join("a.txt").write("toto", ensure=True)
    local_root.ensure_dir()

    client = StorageClient(config={
        "tmp": {"type": "local", "basedir": str(tmpdir)},
        "tmp2": {"type": "local", "basedir": str(remote_root)},
    })

    unresolved = {
        "a": "/home/ubuntu/a.txt",
        "b": "non_storage:b.txt",
        "c": "tmp:remote/dir/a.txt",
        "d": "tmp2:/dir/a.txt",
        "e": True,
        "f": "tmp:",
    }
    resolved = utility.resolve_remote_files(unresolved, str(local_root), client)

    # Expected local locations: <local>/<storage id>/<path in the storage>.
    expected_c = local_root.join("tmp/remote/dir/a.txt")
    expected_d = local_root.join("tmp2/dir/a.txt")
    expected_f = local_root.join("tmp")

    # Values that do not reference a declared storage are left untouched.
    assert resolved["a"] == "/home/ubuntu/a.txt"
    assert resolved["b"] == "non_storage:b.txt"

    # Storage references are replaced by the local path...
    assert resolved["c"] == str(expected_c)
    assert resolved["d"] == str(expected_d)

    # ...and the referenced content exists locally.
    assert expected_c.check(file=1)
    assert expected_d.check(file=1)
    assert expected_f.check(dir=1)
Esempio n. 2
0
class Utility(object):
    """Base class for utilities.

    Subclasses provide ``name``, ``declare_arguments`` and ``exec_function``.
    ``run`` parses the command line, prepares the storage client and then
    delegates the actual work to ``exec_function``.
    """

    def __init__(self):
        """Resolve the working directories and create the missing ones.

        ``CORPUS_DIR`` and ``WORKSPACE_DIR`` are read from the environment
        (the workspace defaults to ``/root/workspace``). A new temporary
        directory is also created.
        """
        self._corpus_dir = os.getenv('CORPUS_DIR')
        workspace_dir = os.getenv('WORKSPACE_DIR', '/root/workspace')
        self._output_dir = os.path.join(workspace_dir, 'output')
        self._data_dir = os.path.join(workspace_dir, 'data')
        self._shared_dir = os.path.join(workspace_dir, 'shared')
        self._tmp_dir = tempfile.mkdtemp()
        # Directory creation is best effort: a failure on one directory must
        # not prevent the creation of the other ones.
        for directory in (self._output_dir, self._data_dir, self._shared_dir):
            try:
                if not os.path.exists(directory):
                    os.makedirs(directory)
            except OSError as error:
                # The directory may have been created concurrently, which is
                # not worth reporting.
                if not os.path.isdir(directory):
                    logger.warning('Cannot create directory %s: %s', directory, error)

    @property
    @abc.abstractmethod
    def name(self):
        """Name of the utility, used in the log messages."""
        raise NotImplementedError()

    @abc.abstractmethod
    def declare_arguments(self, parser):
        """Add the utility specific options to the argparse ``parser``."""
        raise NotImplementedError()

    @abc.abstractmethod
    def exec_function(self, args):
        """Launch the utility with provided params

        Args:
          args: The parsed command line arguments.

        Returns:
          The statistics of the run (sent to the statistics endpoint when
          one is configured), or a false value when there are none.
        """
        raise NotImplementedError()

    def run(self, args=None):
        """Main entrypoint.

        Parses the common options plus the ones added by
        ``declare_arguments``, starts the beat service, creates the storage
        client, calls ``exec_function`` and finally posts the run statistics
        when ``--statistics_url`` is set.

        Args:
          args: List of command line arguments. ``None`` is forwarded to
            ``parse_args``, which then reads ``sys.argv``.
        """
        parser = argparse.ArgumentParser()
        parser.add_argument('-s', '--storage_config', default=None,
                            help=('Configuration of available storages as a file or a JSON string. '
                                  'Setting "-" will read from the standard input.'))
        parser.add_argument('-t', '--task_id', default=None,
                            help="Identifier of this run.")
        parser.add_argument('-i', '--image', default="?",
                            help="Full URL (registry/image:tag) of the image used for this run.")
        parser.add_argument('-b', '--beat_url', default=None,
                            help=("Endpoint that listens to beat requests "
                                  "(push notifications of activity)."))
        parser.add_argument('-bi', '--beat_interval', default=30, type=int,
                            help="Interval of beat requests in seconds.")
        parser.add_argument('--statistics_url', default=None,
                            help=('Endpoint that listens to statistics summaries generated '
                                  'at the end of the execution'))

        # A missing MODELS_DIR must not fail here: the storage can still be
        # given on the command line. The value stays None when neither the
        # option nor the environment variable is set.
        parser.add_argument('-ms', '--model_storage', default=os.getenv("MODELS_DIR"),
                            help='Model storage in the form <storage_id>:[<path>].')
        parser.add_argument('-msr', '--model_storage_read', default=None,
                            help=('Model storage to read from, in the form <storage_id>:[<path>] '
                                  '(defaults to model_storage).'))
        parser.add_argument('-msw', '--model_storage_write', default=None,
                            help=('Model storage to write to, in the form <storage_id>:[<path>] '
                                  '(defaults to model_storage).'))
        parser.add_argument('-c', '--config', default=None,
                            help=('Configuration as a file or a JSON string. '
                                  'Setting "-" will read from the standard input.'))
        parser.add_argument('--config_update_mode',
                            choices=['default', 'merge', 'replace'],
                            default='default',
                            help=('How to update the parent task configuration with the given '
                                  'configuration. '
                                  '"default": automatic mode based on the configuration, '
                                  '"merge": recursively update configuration fields, '
                                  '"replace": replace the top-most fields.'))
        parser.add_argument('-m', '--model', default=None,
                            help='Model to load.')
        parser.add_argument('-g', '--gpuid', default="0",
                            help="Comma-separated list of 0-indexed GPU identifiers.")
        parser.add_argument('--no_push', default=False, action='store_true',
                            help='Do not push model.')

        self.declare_arguments(parser)
        args = parser.parse_args(args=args)

        # Runs without an explicit identifier get a random one.
        if args.task_id is None:
            args.task_id = str(uuid.uuid4())

        self._task_id = args.task_id
        self._image = args.image

        start_beat_service(
            os.uname()[1],
            args.beat_url,
            args.task_id,
            interval=args.beat_interval)

        self._storage = StorageClient(
            config=load_config(args.storage_config) if args.storage_config else None)

        # The read and write storages fall back to the common model storage.
        if args.model_storage_read is None:
            args.model_storage_read = args.model_storage
        if args.model_storage_write is None:
            args.model_storage_write = args.model_storage

        self._model_storage_read = args.model_storage_read
        self._model_storage_write = args.model_storage_write

        # for backward compatibility - convert singleton in int
        args.gpuid = args.gpuid.split(',')
        args.gpuid = [int(g) for g in args.gpuid]
        if len(args.gpuid) == 1:
            args.gpuid = args.gpuid[0]

        self._gpuid = args.gpuid

        self._config = load_config(args.config) if args.config is not None else None
        self._model = args.model
        self._no_push = args.no_push

        logger.info('Starting executing utility %s=%s', self.name, args.image)
        start_time = time.time()
        stats = self.exec_function(args)
        end_time = time.time()
        logger.info('Finished executing utility in %s seconds', str(end_time-start_time))

        if args.statistics_url is not None:
            requests.post(args.statistics_url, json={
                'task_id': self._task_id,
                'start_time': start_time,
                'end_time': end_time,
                'statistics': stats or {}
            })

    def convert_to_local_file(self, nextval, is_dir=False):
        """Fetch remote inputs and return their local paths.

        Args:
          nextval: Iterable of strings, each one being a comma-separated
            list of remote paths.
          is_dir: Fetch each path with ``get_directory`` instead of
            ``get_file``.

        Returns:
          A list with one string per item of ``nextval``, in which every
          remote path is replaced by its local path.
        """
        new_val = []
        for val in nextval:
            local_inputs = []
            for remote_input in val.split(','):
                # NOTE(review): os.path.join drops the data directory when
                # the second component is absolute - confirm that the last
                # element returned by the storage split is always relative.
                local_input = os.path.join(self._data_dir, self._storage.split(remote_input)[-1])
                if is_dir:
                    self._storage.get_directory(remote_input, local_input)
                else:
                    self._storage.get_file(remote_input, local_input)
                local_inputs.append(local_input)
            new_val.append(','.join(local_inputs))
        return new_val
Esempio n. 3
0
    def run(self, args=None):
        """Main entrypoint.

        Parses the common options plus the ones added by
        ``declare_arguments``, starts the beat service, creates the storage
        client, calls ``exec_function`` and finally posts the run statistics
        when ``--statistics_url`` is set.

        Args:
          args: List of command line arguments. ``None`` is forwarded to
            ``parse_args``, which then reads ``sys.argv``.
        """
        parser = argparse.ArgumentParser()
        parser.add_argument('-s', '--storage_config', default=None,
                            help=('Configuration of available storages as a file or a JSON string. '
                                  'Setting "-" will read from the standard input.'))
        parser.add_argument('-t', '--task_id', default=None,
                            help="Identifier of this run.")
        parser.add_argument('-i', '--image', default="?",
                            help="Full URL (registry/image:tag) of the image used for this run.")
        parser.add_argument('-b', '--beat_url', default=None,
                            help=("Endpoint that listens to beat requests "
                                  "(push notifications of activity)."))
        parser.add_argument('-bi', '--beat_interval', default=30, type=int,
                            help="Interval of beat requests in seconds.")
        parser.add_argument('--statistics_url', default=None,
                            help=('Endpoint that listens to statistics summaries generated '
                                  'at the end of the execution'))

        # A missing MODELS_DIR must not fail here: the storage can still be
        # given on the command line. The value stays None when neither the
        # option nor the environment variable is set.
        parser.add_argument('-ms', '--model_storage', default=os.getenv("MODELS_DIR"),
                            help='Model storage in the form <storage_id>:[<path>].')
        parser.add_argument('-msr', '--model_storage_read', default=None,
                            help=('Model storage to read from, in the form <storage_id>:[<path>] '
                                  '(defaults to model_storage).'))
        parser.add_argument('-msw', '--model_storage_write', default=None,
                            help=('Model storage to write to, in the form <storage_id>:[<path>] '
                                  '(defaults to model_storage).'))
        parser.add_argument('-c', '--config', default=None,
                            help=('Configuration as a file or a JSON string. '
                                  'Setting "-" will read from the standard input.'))
        parser.add_argument('--config_update_mode',
                            choices=['default', 'merge', 'replace'],
                            default='default',
                            help=('How to update the parent task configuration with the given '
                                  'configuration. '
                                  '"default": automatic mode based on the configuration, '
                                  '"merge": recursively update configuration fields, '
                                  '"replace": replace the top-most fields.'))
        parser.add_argument('-m', '--model', default=None,
                            help='Model to load.')
        parser.add_argument('-g', '--gpuid', default="0",
                            help="Comma-separated list of 0-indexed GPU identifiers.")
        parser.add_argument('--no_push', default=False, action='store_true',
                            help='Do not push model.')

        self.declare_arguments(parser)
        args = parser.parse_args(args=args)

        # Runs without an explicit identifier get a random one.
        if args.task_id is None:
            args.task_id = str(uuid.uuid4())

        self._task_id = args.task_id
        self._image = args.image

        start_beat_service(
            os.uname()[1],
            args.beat_url,
            args.task_id,
            interval=args.beat_interval)

        self._storage = StorageClient(
            config=load_config(args.storage_config) if args.storage_config else None)

        # The read and write storages fall back to the common model storage.
        if args.model_storage_read is None:
            args.model_storage_read = args.model_storage
        if args.model_storage_write is None:
            args.model_storage_write = args.model_storage

        self._model_storage_read = args.model_storage_read
        self._model_storage_write = args.model_storage_write

        # for backward compatibility - convert singleton in int
        args.gpuid = args.gpuid.split(',')
        args.gpuid = [int(g) for g in args.gpuid]
        if len(args.gpuid) == 1:
            args.gpuid = args.gpuid[0]

        self._gpuid = args.gpuid

        self._config = load_config(args.config) if args.config is not None else None
        self._model = args.model
        self._no_push = args.no_push

        logger.info('Starting executing utility %s=%s', self.name, args.image)
        start_time = time.time()
        stats = self.exec_function(args)
        end_time = time.time()
        logger.info('Finished executing utility in %s seconds', str(end_time-start_time))

        if args.statistics_url is not None:
            requests.post(args.statistics_url, json={
                'task_id': self._task_id,
                'start_time': start_time,
                'end_time': end_time,
                'statistics': stats or {}
            })
Esempio n. 4
0
 def get_storage_client(accessible_storages):
     """Return a StorageClient built from ``rmprivate(accessible_storages)``."""
     return StorageClient(rmprivate(accessible_storages))
Esempio n. 5
0
def main():
    """Command line entry point to list, download, upload or stat storage paths.

    The storages are described by the JSON file given with ``--config``,
    either directly or under a top-level ``storages`` key.

    Raises:
      ValueError: when a directory download targets an existing local file.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument('-c', '--config', default=None, required=True,
                        help='Storages configuration file.')
    parser.add_argument('--info', '-v', action='store_true', help='info mode')
    parser.add_argument('--verbose', '-vv', action='store_true', help='verbose mode')

    subparsers = parser.add_subparsers(help='command help', dest='cmd')
    subparsers.required = True

    parser_list = subparsers.add_parser('list', help='list file on a storage')
    parser_list.add_argument('--recursive', '-r', action='store_true', help='recursive listing')
    parser_list.add_argument('storage', type=resolvedpath, help='path to list')

    parser_get = subparsers.add_parser('get', help='download a file or directory')
    parser_get.add_argument('storage', type=resolvedpath,
                            help='path to file or directory to download, directory must ends with /')
    parser_get.add_argument('local', type=str, help='local path')

    parser_push = subparsers.add_parser('push', help='upload a file or directory')
    parser_push.add_argument('local', type=str, help='local path to file or directory to upload')
    parser_push.add_argument('storage', type=resolvedpath,
                             help='remote path')

    parser_stat = subparsers.add_parser('stat', help='returns stat on a remote file/directory')
    parser_stat.add_argument('storage', type=resolvedpath, help='remote path')

    args = parser.parse_args()
    # basicConfig only has an effect the first time it configures the root
    # logger, so the most verbose requested level has to be applied first.
    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)
    elif args.info:
        logging.basicConfig(level=logging.INFO)

    with open(args.config) as jsonf:
        config = json.load(jsonf)
    # support configuration from automatic tests
    if 'storages' in config:
        config = config['storages']
    client = StorageClient(config=config)

    if args.cmd == "list":
        listdir = client.listdir(args.storage, args.recursive)
        for k in sorted(listdir):
            entry = listdir[k]
            if entry.get("is_dir"):
                print("dir", k)
            else:
                date = datetime.fromtimestamp(entry["last_modified"])
                print("   ", "%10d" % entry["size"], date.strftime("%Y-%m-%dT%H:%M:%S"), k)
    elif args.cmd == "get":
        # A trailing slash on the remote path requests a directory download.
        directory = args.storage.endswith('/')
        if directory:
            if os.path.isfile(args.local):
                raise ValueError("%s should be a directory" % args.local)
            client.get_directory(args.storage, args.local)
        else:
            client.get_file(args.storage, args.local)
    elif args.cmd == "push":
        client.push(args.local, args.storage)
    elif args.cmd == "stat":
        print(client.stat(args.storage))
Esempio n. 6
0
    def run(self, args=None):
        """Main entrypoint.

        Parses the common options plus the ones added by
        ``declare_arguments``, starts the beat service, creates the storage
        client, calls ``exec_function`` and finally posts the run statistics
        when ``--statistics_url`` is set.

        Args:
          args: List of command line arguments. ``None`` is forwarded to
            ``parse_args``, which then reads ``sys.argv``.
        """
        parser = argparse.ArgumentParser()
        parser.add_argument(
            "-s",
            "--storage_config",
            default=None,
            help=
            ("Configuration of available storages as a file or a JSON string. "
             'Setting "-" will read from the standard input.'),
        )
        parser.add_argument("-t",
                            "--task_id",
                            default=None,
                            help="Identifier of this run.")
        parser.add_argument(
            "-i",
            "--image",
            default="?",
            help=
            "Full URL (registry/image:tag) of the image used for this run.",
        )
        parser.add_argument(
            "-b",
            "--beat_url",
            default=None,
            help=("Endpoint that listens to beat requests "
                  "(push notifications of activity)."),
        )
        parser.add_argument(
            "-bi",
            "--beat_interval",
            default=30,
            type=int,
            help="Interval of beat requests in seconds.",
        )
        parser.add_argument(
            "--statistics_url",
            default=None,
            help=("Endpoint that listens to statistics summaries generated "
                  "at the end of the execution"),
        )

        # A missing MODELS_DIR must not fail here: the storage can still be
        # given on the command line. The value stays None when neither the
        # option nor the environment variable is set.
        parser.add_argument(
            "-ms",
            "--model_storage",
            default=os.getenv("MODELS_DIR"),
            help="Model storage in the form <storage_id>:[<path>].",
        )
        parser.add_argument(
            "-msr",
            "--model_storage_read",
            default=None,
            help=(
                "Model storage to read from, in the form <storage_id>:[<path>] "
                "(defaults to model_storage)."),
        )
        parser.add_argument(
            "-msw",
            "--model_storage_write",
            default=None,
            help=(
                "Model storage to write to, in the form <storage_id>:[<path>] "
                "(defaults to model_storage)."),
        )
        parser.add_argument(
            "-c",
            "--config",
            default=None,
            help=("Configuration as a file or a JSON string. "
                  'Setting "-" will read from the standard input.'),
        )
        parser.add_argument(
            "--config_update_mode",
            choices=["default", "merge", "replace"],
            default="default",
            help=("How to update the parent task configuration with the given "
                  "configuration. "
                  '"default": automatic mode based on the configuration, '
                  '"merge": recursively update configuration fields, '
                  '"replace": replace the top-most fields.'),
        )
        parser.add_argument("-m",
                            "--model",
                            default=None,
                            help="Model to load.")
        parser.add_argument(
            "-g",
            "--gpuid",
            default="0",
            help="Comma-separated list of 0-indexed GPU identifiers.",
        )
        parser.add_argument("--no_push",
                            default=False,
                            action="store_true",
                            help="Do not push model.")

        self.declare_arguments(parser)
        args = parser.parse_args(args=args)

        # Runs without an explicit identifier get a random one.
        if args.task_id is None:
            args.task_id = str(uuid.uuid4())

        self._task_id = args.task_id
        self._image = args.image

        start_beat_service(os.uname()[1],
                           args.beat_url,
                           args.task_id,
                           interval=args.beat_interval)

        self._storage = StorageClient(config=load_config(args.storage_config)
                                      if args.storage_config else None)

        # The read and write storages fall back to the common model storage.
        if args.model_storage_read is None:
            args.model_storage_read = args.model_storage
        if args.model_storage_write is None:
            args.model_storage_write = args.model_storage

        self._model_storage_read = args.model_storage_read
        self._model_storage_write = args.model_storage_write

        # for backward compatibility - convert singleton in int
        args.gpuid = args.gpuid.split(",")
        args.gpuid = [int(g) for g in args.gpuid]
        if len(args.gpuid) == 1:
            args.gpuid = args.gpuid[0]

        self._gpuid = args.gpuid

        self._config = load_config(
            args.config) if args.config is not None else None
        self._model = args.model
        self._no_push = args.no_push

        logger.info("Starting executing utility %s=%s", self.name, args.image)
        start_time = time.time()
        stats = self.exec_function(args)
        end_time = time.time()
        logger.info("Finished executing utility in %.1f seconds",
                    end_time - start_time)

        if args.statistics_url is not None:
            requests.post(
                args.statistics_url,
                json={
                    "task_id": self._task_id,
                    "start_time": start_time,
                    "end_time": end_time,
                    "statistics": stats or {},
                },
            )
Esempio n. 7
0
def _write_stream(chunks):
    """Write the non-empty byte ``chunks`` to stdout, decoded as UTF-8."""
    # Decode once at the end: a chunk boundary may split a multi-byte
    # character.
    payload = b''.join(chunk for chunk in chunks if chunk)
    sys.stdout.write(payload.decode("utf-8"))


def main():
    """Command line entry point for storage and corpus operations.

    The storages are described by the JSON file given with ``--config``,
    either directly or under a top-level ``storages`` key. The selected
    sub-command is then forwarded to the storage client.

    Raises:
      ValueError: when a directory download targets an existing local file.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument('-c',
                        '--config',
                        default=None,
                        required=True,
                        help='Storages configuration file.')
    parser.add_argument('--info', '-v', action='store_true', help='info mode')
    parser.add_argument('--verbose',
                        '-vv',
                        action='store_true',
                        help='verbose mode')

    subparsers = parser.add_subparsers(help='command help', dest='cmd')
    subparsers.required = True

    parser_list = subparsers.add_parser('list', help='list file on a storage')
    parser_list.add_argument('--recursive',
                             '-r',
                             action='store_true',
                             help='recursive listing')
    parser_list.add_argument('storage', type=resolvedpath, help='path to list')

    parser_get = subparsers.add_parser('get',
                                       help='download a file or directory')
    parser_get.add_argument(
        'storage',
        type=resolvedpath,
        help='path to file or directory to download, directory must ends with /'
    )
    parser_get.add_argument('local', type=str, help='local path')

    parser_push = subparsers.add_parser('push',
                                        help='upload a file or directory')
    parser_push.add_argument('local',
                             type=str,
                             help='local path to file or directory to upload')
    parser_push.add_argument('storage', type=resolvedpath, help='remote path')

    parser_delete = subparsers.add_parser('delete', help='delete a corpus')
    parser_delete.add_argument('storage',
                               type=resolvedpath,
                               help='remote path')
    parser_delete.add_argument('corpusId', type=str, help='corpus id')

    parser_stat = subparsers.add_parser(
        'stat', help='returns stat on a remote file/directory')
    parser_stat.add_argument('storage', type=resolvedpath, help='remote path')

    parser_export = subparsers.add_parser(
        'stream_corpus_manager',
        help='Export a corpus in TMX(default) or biText')
    parser_export.add_argument('storage',
                               type=resolvedpath,
                               help='remote path')
    parser_export.add_argument('corpusId', type=str, help='corpus id')
    parser_export.add_argument(
        'format',
        type=check_format,
        help='Format of the corpus (application/x-tmx+xml, text/bitext)')

    parser_search = subparsers.add_parser(
        'search', help='list corpus segments identified '
        'by corpus id')
    parser_search.add_argument('storage',
                               type=resolvedpath,
                               help='remote path')
    parser_search.add_argument('id', help='remote id')
    parser_search.add_argument('search_query',
                               type=resolvedjson,
                               help='query text for search')
    # nargs='?' makes these trailing positionals optional so that the
    # documented default actually applies when they are omitted.
    parser_search.add_argument('skip',
                               nargs='?',
                               default=0,
                               help='number of segments skip (default 0)')
    parser_search.add_argument(
        'limit',
        nargs='?',
        default=0,
        help='number of segments returned (default 0 meaning all)')

    parser_seg_delete = subparsers.add_parser(
        'seg_delete', help='Delete segments identified by id')
    parser_seg_delete.add_argument('storage',
                                   type=resolvedpath,
                                   help='remote path')
    parser_seg_delete.add_argument('corpus_id', help='corpus id')
    parser_seg_delete.add_argument('ids', help='list segment id')

    parser_stream = subparsers.add_parser(
        'stream', help='print out specific corpus by name')
    parser_stream.add_argument('storage',
                               type=resolvedpath,
                               help='remote path')

    args = parser.parse_args()
    # basicConfig only has an effect the first time it configures the root
    # logger, so the most verbose requested level has to be applied first.
    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)
    elif args.info:
        logging.basicConfig(level=logging.INFO)

    with open(args.config) as jsonf:
        config = json.load(jsonf)
    # support configuration from automatic tests
    if 'storages' in config:
        config = config['storages']
    client = StorageClient(config=config)

    if args.cmd == "list":
        listdir = client.listdir(args.storage, args.recursive)
        for k in sorted(listdir):
            entry = listdir[k]
            if entry.get("is_dir"):
                print("dir", k)
            else:
                date = datetime.fromtimestamp(entry["last_modified"])
                # Entries exposing an "entries" count show it in place of
                # the size.
                size = entry["entries"] if "entries" in entry else entry["size"]
                print("   ", "%10d" % size, date.strftime("%Y-%m-%dT%H:%M:%S"),
                      k)
    elif args.cmd == "get":
        # A trailing slash on the remote path requests a directory download.
        directory = args.storage.endswith('/')
        if directory:
            if os.path.isfile(args.local):
                raise ValueError("%s should be a directory" % args.local)
            client.get_directory(args.storage, args.local)
        else:
            client.get_file(args.storage, args.local)
    elif args.cmd == "push":
        client.push(args.local, args.storage)
    elif args.cmd == "delete":
        client.delete_corpus_manager(args.storage, args.corpusId)
    elif args.cmd == "stat":
        print(client.stat(args.storage))
    elif args.cmd == "stream_corpus_manager":
        _write_stream(
            client.stream_corpus_manager(args.storage, args.corpusId,
                                         args.format))
    elif args.cmd == "stream":
        _write_stream(client.stream(args.storage))
    elif args.cmd == "search":
        print(
            client.search(args.storage, args.id, args.search_query, args.skip,
                          args.limit))
    elif args.cmd == "seg_delete":
        print(client.seg_delete(args.storage, args.corpus_id, args.ids))
    elif args.cmd == "seg_add":
        # NOTE(review): no "seg_add" sub-command is registered above, so this
        # branch cannot be reached from the command line - confirm the
        # expected arguments before exposing it.
        print(client.seg_add(args.storage, args.corpus_id, args.ids))