Example no. 1
    def __init__(self, name, type, description=None, metadata=None):
        if not re.match(r"^[a-zA-Z0-9_\-.]+$", name):
            raise ValueError(
                'Artifact name may only contain alphanumeric characters, dashes, underscores, and dots. Invalid name: "%s"'
                % name
            )
        # TODO: this shouldn't be a property of the artifact. It's more like an
        # argument to log_artifact.
        storage_layout = StorageLayout.V2
        if env.get_use_v1_artifacts():
            storage_layout = StorageLayout.V1

        self._storage_policy = WandbStoragePolicy(
            config={
                "storageLayout": storage_layout,
                #  TODO: storage region
            }
        )
        self._api = InternalApi()
        self._final = False
        self._digest = None
        self._file_entries = None
        self._manifest = ArtifactManifestV1(self, self._storage_policy)
        self._cache = get_artifacts_cache()
        self._added_new = False
        self._added_objs = {}
        # You can write into this directory when creating artifact files
        self._artifact_dir = compat_tempfile.TemporaryDirectory(
            missing_ok_on_cleanup=True
        )
        self.type = type
        self.name = name
        self.description = description
        self.metadata = metadata
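
A quick standalone sketch of what the validation regex above accepts and rejects, using only the pattern itself:

import re

# The same pattern used in __init__ above.
pattern = r"^[a-zA-Z0-9_\-.]+$"
print(bool(re.match(pattern, "my-dataset_v1.0")))  # True
print(bool(re.match(pattern, "bad name!")))        # False: spaces and "!" are rejected
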
Example no. 2
    def __init__(self, api):
        self._api = api

        self._tempdir = tempfile.TemporaryDirectory("wandb")

        self._stats = stats.Stats()

        self._incoming_queue = queue.Queue()
        self._event_queue = queue.Queue()

        self._step_checksum = step_checksum.StepChecksum(
            self._api,
            self._tempdir,
            self._incoming_queue,
            self._event_queue,
            self._stats,
        )
        self._step_checksum.start()

        self._step_upload = step_upload.StepUpload(self._api, self._stats,
                                                   self._event_queue,
                                                   self.MAX_UPLOAD_JOBS)
        self._step_upload.start()

        # Holds refs to tempfiles if users need to make a temporary file that
        # stays around long enough for file pusher to sync
        # TODO(artifacts): maybe don't do this
        self._temp_file_refs = []
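
The two Step* workers above form a small two-stage pipeline: items enter on the incoming queue, the checksum step forwards events to the event queue, and the upload step consumes them. A minimal sketch of the same hand-off pattern with plain threads (the Step* classes themselves are outside this snippet):

import queue
import threading

incoming_queue = queue.Queue()
event_queue = queue.Queue()

def checksum_stage():
    # Stage 1: read raw items, forward processed events downstream.
    while True:
        item = incoming_queue.get()
        if item is None:              # sentinel: propagate shutdown
            event_queue.put(None)
            return
        event_queue.put(("checksummed", item))

def upload_stage():
    # Stage 2: consume events until the shutdown sentinel arrives.
    while True:
        event = event_queue.get()
        if event is None:
            return
        print("uploading", event[1])

threads = [threading.Thread(target=checksum_stage),
           threading.Thread(target=upload_stage)]
for t in threads:
    t.start()
for name in ("a.txt", "b.txt"):
    incoming_queue.put(name)
incoming_queue.put(None)              # begin shutdown
for t in threads:
    t.join()
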
Example no. 3
 def __init__(self, name, type, description=None, metadata=None):
     if not re.match(r'^[a-zA-Z0-9_\-.]+$', name):
         raise ValueError(
             'Artifact name may only contain alphanumeric characters, dashes, underscores, and dots. Invalid name: "%s"'
             % name)
     if type is None:
         raise ValueError(
             "type is required when logging artifacts, specify \"dataset\", \"model\", or a custom type"
         )
     # TODO: this shouldn't be a property of the artifact. It's more like an
     # argument to log_artifact.
     self._storage_policy = WandbStoragePolicy()
     self._file_specs = {}
     self._api = InternalApi()  # TODO: persist project in settings?
     self._final = False
     self._digest = None
     self._file_entries = None
     self._manifest = ArtifactManifestV1(self, self._storage_policy)
     self._cache = artifacts_cache.get_artifacts_cache()
     self._added_new = False
     # You can write into this directory when creating artifact files
     self._artifact_dir = compat_tempfile.TemporaryDirectory(
         missing_ok_on_cleanup=True)
     self.server_manifest = None
     self.type = type
     self.name = name
     self.description = description
     self.metadata = metadata
Example no. 4
def main(argv):
    args = parser.parse_args()
    print('Load test starting')

    project_name = args.project
    if project_name is None:
        project_name = 'artifacts-load-test-%s' % str(
            datetime.now()).replace(' ', '-').replace(':', '-').replace('.', '-')

    env_project = os.environ.get('WANDB_PROJECT')

    sweep_id = os.environ.get('WANDB_SWEEP_ID')
    if sweep_id:
        del os.environ['WANDB_SWEEP_ID']
    wandb_config_paths = os.environ.get('WANDB_CONFIG_PATHS')
    if wandb_config_paths:
        del os.environ['WANDB_CONFIG_PATHS']
    wandb_run_id = os.environ.get('WANDB_RUN_ID')
    if wandb_run_id:
        del os.environ['WANDB_RUN_ID']

    # set global entity and project before chdir'ing
    from wandb.apis import InternalApi
    api = InternalApi()
    settings_entity = api.settings('entity')
    settings_base_url = api.settings('base_url')
    os.environ['WANDB_ENTITY'] = (os.environ.get('LOAD_TEST_ENTITY') or settings_entity)
    os.environ['WANDB_PROJECT'] = project_name
    os.environ['WANDB_BASE_URL'] = (os.environ.get('LOAD_TEST_BASE_URL') or settings_base_url)

    # Change dir to avoid littering the code directory
    pwd = os.getcwd()
    tempdir = tempfile.TemporaryDirectory()
    os.chdir(tempdir.name)

    artifact_name = 'load-artifact-' + ''.join(
        random.choices(string.ascii_lowercase + string.digits, k=10))

    print('Generating source data')
    source_file_names = gen_files(
        args.gen_n_files, args.gen_max_small_size, args.gen_max_large_size)
    print('Done generating source data')

    procs = []
    stop_queue = multiprocessing.Queue()
    stats_queue = multiprocessing.Queue()

    # start all processes

    # writers
    for i in range(args.num_writers):
        file_names = source_file_names
        if args.non_overlapping_writers:
            chunk_size = int(len(source_file_names) / args.num_writers)
            file_names = source_file_names[i * chunk_size: (i+1) * chunk_size]
        if args.distributed_fanout > 1:
            p = multiprocessing.Process(
                target=proc_version_writer_distributed,
                args=(
                    stop_queue,
                    stats_queue,
                    project_name,
                    file_names,
                    artifact_name,
                    args.files_per_version_min,
                    args.files_per_version_max,
                    args.distributed_fanout,
                    args.blocking)
            )
        else:
            p = multiprocessing.Process(
                target=proc_version_writer,
                args=(
                    stop_queue,
                    stats_queue,
                    project_name,
                    file_names,
                    artifact_name,
                    args.files_per_version_min,
                    args.files_per_version_max,
                    args.blocking)
            )
        p.start()
        procs.append(p)

    # readers
    for i in range(args.num_readers):
        p = multiprocessing.Process(
            target=proc_version_reader,
            args=(
                stop_queue,
                stats_queue,
                project_name,
                artifact_name,
                i
            )
        )
        p.start()
        procs.append(p)

    # deleters
    for i in range(args.num_deleters):
        p = multiprocessing.Process(
            target=proc_version_deleter,
            args=(
                stop_queue,
                stats_queue,
                artifact_name,
                args.min_versions_before_delete,
                args.delete_period_max))
        p.start()
        procs.append(p)

    # cache garbage collector
    if args.cache_gc_period_max is None:
        print('Cache GC test process not enabled!')
    else:
        p = multiprocessing.Process(
            target=proc_cache_garbage_collector,
            args=(
                stop_queue,
                args.cache_gc_period_max))
        p.start()
        procs.append(p)
    
    # reset environment
    os.environ['WANDB_ENTITY'] = settings_entity
    os.environ['WANDB_BASE_URL'] = settings_base_url
    if env_project is None:
        del os.environ['WANDB_PROJECT']
    else:
        os.environ['WANDB_PROJECT'] = env_project
    if sweep_id:
        os.environ['WANDB_SWEEP_ID'] = sweep_id
    if wandb_config_paths:
        os.environ['WANDB_CONFIG_PATHS'] = wandb_config_paths
    if wandb_run_id:
        os.environ['WANDB_RUN_ID'] = wandb_run_id
    # go back to original dir
    os.chdir(pwd)

    # test phase
    start_time = time.time()
    stats = defaultdict(int)

    run = wandb.init(job_type='main-test-phase')
    run.config.update(args)
    while time.time() - start_time < args.test_phase_seconds:
        stat_update = None
        try:
            stat_update = stats_queue.get(True, 5)  # timeout is in seconds
        except queue.Empty:
            pass
        print('** Test time: %s' % (time.time() - start_time))
        if stat_update:
            for k, v in stat_update.items():
                stats[k] += v
        wandb.log(stats)

    print('Test phase time expired')
    # stop all processes and wait til all are done
    for i in range(len(procs)):
        stop_queue.put(True)
    print('Waiting for processes to stop')
    fail = False
    for proc in procs:
        proc.join()
        if proc.exitcode != 0:
            print('FAIL! Test phase failed')
            fail = True  # keep joining the rest; exit after stats are drained

    # drain remaining stats
    while True:
        try:
            stat_update = stats_queue.get_nowait()
        except queue.Empty:
            break
        for k, v in stat_update.items():
            stats[k] += v

    print('Stats')
    import pprint
    pprint.pprint(dict(stats))

    if fail:
        print('FAIL! Test phase failed')
        sys.exit(1)
    else:
        print('Test phase successfully completed')

    print('Starting verification phase')
    
    os.environ['WANDB_ENTITY'] = (os.environ.get('LOAD_TEST_ENTITY') or settings_entity)
    os.environ['WANDB_PROJECT'] = project_name
    os.environ['WANDB_BASE_URL'] = (os.environ.get('LOAD_TEST_BASE_URL') or settings_base_url)
    data_api = wandb.Api()
    # we need to list artifacts by walking runs; accessing via
    # project.artifactType.artifacts only returns committed artifacts
    for run in data_api.runs('%s/%s' % (api.settings('entity'), project_name)):
        for v in run.logged_artifacts():
            # TODO: allow deleted once we build deletion support
            if v.state != 'COMMITTED' and v.state != 'DELETED':
                print('FAIL! Artifact version not committed or deleted: %s' % v)
                sys.exit(1)
    
    print('Verification succeeded')
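
The save/delete/restore dance around the WANDB_* variables in main() above could be factored into a context manager; a minimal sketch (the helper name is hypothetical, not part of the original script):

import contextlib
import os

@contextlib.contextmanager
def temp_environ(**overrides):
    # Temporarily set (or unset, by passing None) environment variables,
    # restoring the previous values on exit. Illustrates the save/restore
    # pattern used in main() above.
    saved = {k: os.environ.get(k) for k in overrides}
    try:
        for k, v in overrides.items():
            if v is None:
                os.environ.pop(k, None)
            else:
                os.environ[k] = v
        yield
    finally:
        for k, v in saved.items():
            if v is None:
                os.environ.pop(k, None)
            else:
                os.environ[k] = v

# e.g. run the writers with WANDB_SWEEP_ID unset and the project overridden:
# with temp_environ(WANDB_SWEEP_ID=None, WANDB_PROJECT=project_name):
#     ...
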
Example no. 5
import collections
import threading
import time
import warnings

from six.moves import queue

import wandb
import wandb.util
from wandb.compat import tempfile

# Get rid of cleanup warnings in Python 2.7.
warnings.filterwarnings('ignore', 'Implicitly cleaning up', RuntimeWarning,
                        'wandb.compat.tempfile')

# Temporary directory for copies we make of some file types to
# reduce the probability that the file gets changed while we're
# uploading it.
TMP_DIR = tempfile.TemporaryDirectory('wandb')

EventFileChanged = collections.namedtuple('EventFileChanged',
                                          ('path', 'save_name', 'copy'))
EventJobDone = collections.namedtuple('EventJobDone', ('job',))
EventFinish = collections.namedtuple('EventFinish', ())
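# Rough event flow (inferred from this snippet; not documented upstream):
#   EventFileChanged: the file at `path` should be (re)uploaded under
#     `save_name`; `copy` requests a snapshot into TMP_DIR first.
#   EventJobDone: an UploadJob thread finished and can be reaped.
#   EventFinish: no more changes are coming; drain queues and shut down.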


class UploadJob(threading.Thread):
    def __init__(self, done_queue, push_function, save_name, path, copy=True):
        """A file upload thread.

        Arguments:
            done_queue: queue.Queue in which to put an EventJobDone event when
                the upload finishes.
            push_function: function(save_name, actual_path) which actually uploads
Example no. 6
        from pathlib import Path

        return str(Path(path).resolve())
    except Exception:
        # pathlib isn't present for Python versions earlier than 3.3
        return os.path.realpath(path)


# Get rid of cleanup warnings in Python 2.7.
warnings.filterwarnings("ignore", "Implicitly cleaning up", RuntimeWarning,
                        "wandb.compat.tempfile")

# Temporary directory for copies we make of some file types to
# reduce the probability that the file gets changed while we're
# uploading it.
TMP_DIR = tempfile.TemporaryDirectory("wandb")

logger = logging.getLogger(__file__)


class FilePusher(object):
    """Parallel file upload class.
    This manages uploading multiple files in parallel. It will restart a given file's
    upload job if it receives a notification that that file has been modified.
    The finish() method will block until all events have been processed and all
    uploads are complete.
    """

    MAX_UPLOAD_JOBS = 64

    def __init__(self, api):
Example no. 7
    from wandb.sdk.internal import datastore
    from wandb.sdk.internal import handler
    from wandb.sdk.internal import sender
    from wandb.sdk.internal import tb_watcher
    from wandb.sdk.interface import interface
else:
    from wandb.sdk_py27.internal import datastore
    from wandb.sdk_py27.internal import handler
    from wandb.sdk_py27.internal import sender
    from wandb.sdk_py27.internal import tb_watcher
    from wandb.sdk_py27.interface import interface

WANDB_SUFFIX = ".wandb"
SYNCED_SUFFIX = ".synced"
TFEVENT_SUBSTRING = ".tfevents."
TMPDIR = tempfile.TemporaryDirectory()


class _LocalRun(object):
    def __init__(self, path, synced=None):
        self.path = path
        self.synced = synced
        self.offline = os.path.basename(path).startswith("offline-")
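        # Run directories are assumed to follow wandb's naming convention,
        # e.g. "run-20210101_123456-abc123" or "offline-run-20210101_123456-abc123";
        # the timestamp segment parses with %Y%m%d_%H%M%S below.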
        self.datetime = datetime.datetime.strptime(
            os.path.basename(path).split("run-")[1].split("-")[0],
            "%Y%m%d_%H%M%S")

    def __str__(self):
        return self.path

Example no. 8
        from wandb.sdk import wandb_artifacts
    else:
        from wandb.sdk_py27 import wandb_run
        from wandb.sdk_py27 import wandb_artifacts

    return wandb_run, wandb_artifacts


# Get rid of cleanup warnings in Python 2.7.
warnings.filterwarnings(
    "ignore", "Implicitly cleaning up", RuntimeWarning, "wandb.compat.tempfile"
)

# Staging directory so we can encode raw data into files, then hash them before
# we put them into the Run directory to be uploaded.
MEDIA_TMP = tempfile.TemporaryDirectory("wandb-media")
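# A sketch of the intended flow (inferred from the comment above, not an
# exact API): a media object serializes itself into a file under MEDIA_TMP,
# the file's contents are hashed to derive a stable name, and only then is
# the file moved into the run directory for upload.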


class Table(Media):
    """This is a table designed to display sets of records.

    Arguments:
        columns: ([str]) Names of the columns in the table.
            Defaults to ["Input", "Output", "Expected"].
        data: (array) 2D Array of values that will be displayed as strings.
        dataframe: (pandas.DataFrame) DataFrame object used to create the table.
            When set, the other arguments are ignored.
        optional (Union[bool, List[bool]]): Whether None values are allowed. A
            singular bool applies to all columns; a list of bools applies to each
            respective column. Defaults to True.
        allow_mixed_types (bool): Determines if columns are allowed to have mixed
            types (disables type validation). Defaults to False.
Example no. 9
def main(argv):
    print('Load test starting')
    args = parser.parse_args()

    # set global entity and project before chdir'ing
    from wandb.apis import InternalApi
    api = InternalApi()
    os.environ['WANDB_ENTITY'] = api.settings('entity')
    os.environ['WANDB_PROJECT'] = api.settings('project')
    os.environ['WANDB_BASE_URL'] = api.settings('base_url')

    # Change dir to avoid littering the code directory
    tempdir = tempfile.TemporaryDirectory()
    os.chdir(tempdir.name)

    artifact_name = 'load-artifact-' + ''.join(
        random.choices(string.ascii_lowercase + string.digits, k=10))

    print('Generating source data')
    source_file_names = gen_files(args.gen_n_files, args.gen_max_small_size,
                                  args.gen_max_large_size)
    print('Done generating source data')

    procs = []
    stop_queue = multiprocessing.Queue()

    # start all processes

    # writers
    for i in range(args.num_writers):
        p = multiprocessing.Process(
            target=proc_version_writer,
            args=(stop_queue, source_file_names, artifact_name,
                  args.files_per_version_min, args.files_per_version_max))
        p.start()
        procs.append(p)

    # readers
    for i in range(args.num_readers):
        p = multiprocessing.Process(target=proc_version_reader,
                                    args=(stop_queue, artifact_name, i))
        p.start()
        procs.append(p)

    # cache garbage collector
    if args.cache_gc_period_max is None:
        print('Cache GC test process not enabled!')
    else:
        p = multiprocessing.Process(target=proc_cache_garbage_collector,
                                    args=(stop_queue,
                                          args.cache_gc_period_max))
        p.start()
        procs.append(p)

    # test phase
    time.sleep(args.test_phase_seconds)

    print('Test phase time expired')

    # stop all processes and wait til all are done
    for i in range(len(procs)):
        stop_queue.put(True)
    print('Waiting for processes to stop')
    for proc in procs:
        proc.join()
        if proc.exitcode != 0:
            print('FAIL! Test phase failed')
            sys.exit(1)

    print('Test phase successfully completed')

    print('Starting verification phase')

    api = wandb.Api()
    versions = api.artifact_versions('dataset', artifact_name)
    for v in versions:
        # TODO: allow deleted once we build deletion support
        if v.state != 'COMMITTED':
            print('FAIL! Artifact version not committed: %s' % v)
            sys.exit(1)

    print('Verification succeeded')
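
The proc_version_writer/proc_version_reader/proc_cache_garbage_collector workers are not shown in this snippet; presumably each loops until the parent puts a value on stop_queue (one True per process, as above). A minimal hypothetical sketch of that shutdown pattern:

import multiprocessing
import queue

def worker(stop_queue):
    # Hypothetical worker loop matching the stop_queue pattern above; the
    # real proc_* functions also take source files, artifact names, etc.
    while True:
        try:
            stop_queue.get_nowait()
            return  # parent requested shutdown
        except queue.Empty:
            pass
        # ... perform one unit of work (e.g. log or read an artifact version)

# Usage, mirroring main():
#   p = multiprocessing.Process(target=worker, args=(stop_queue,))
#   p.start(); ...; stop_queue.put(True); p.join()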