def __init__(self, auth_cookie=None, verbose=10, branch=None,
             user_startup_script=None):
    self.startup_script_file = os.path.join(
        os.path.dirname(__file__),
        'scripts/ec2_worker_startup.sh')

    self.install_studio_script = os.path.join(
        os.path.dirname(__file__),
        'scripts/install_studio.sh')

    self.client = boto3.client('ec2')
    self.asclient = boto3.client('autoscaling')
    self.cwclient = boto3.client('cloudwatch')

    self.region = self.client._client_config.region_name

    self.logger = logs.getLogger('EC2WorkerManager')
    self.logger.setLevel(verbose)
    self.auth_cookie = auth_cookie

    self.prices = self._get_ondemand_prices(_instance_specs.keys())

    self.repo_url = git_util.get_my_repo_url()
    self.branch = branch if branch else git_util.get_my_checkout_target()

    self.user_startup_script = user_startup_script
    if user_startup_script:
        self.logger.warn('User startup script argument is deprecated')
def get_worker_manager(config, cloud=None, verbose=10):
    if cloud is None:
        return None

    assert cloud in ['gcloud', 'gcspot', 'ec2', 'ec2spot']
    logger = logs.getLogger('runner.get_worker_manager')
    logger.setLevel(verbose)

    auth_cookie = None if config['database'].get('guest') \
        else os.path.join(
            auth.TOKEN_DIR,
            config['database']['apiKey'])

    branch = config['cloud'].get('branch')
    logger.info('using branch {}'.format(branch))

    if cloud in ['gcloud', 'gcspot']:
        cloudconfig = config['cloud']['gcloud']
        worker_manager = GCloudWorkerManager(
            auth_cookie=auth_cookie,
            zone=cloudconfig['zone'],
            branch=branch,
            user_startup_script=config['cloud'].get('user_startup_script'))

    if cloud in ['ec2', 'ec2spot']:
        worker_manager = EC2WorkerManager(
            auth_cookie=auth_cookie,
            branch=branch,
            user_startup_script=config['cloud'].get('user_startup_script'))

    return worker_manager
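# Usage sketch for get_worker_manager above. The config keys mirror the
# lookups in the function body; the concrete values are placeholders, not an
# authoritative schema:
#
#   config = {
#       'database': {'apiKey': 'my-api-key', 'guest': False},
#       'cloud': {'branch': 'master', 'gcloud': {'zone': 'us-east1-c'}},
#   }
#   wm = get_worker_manager(config, cloud='ec2', verbose=10)     # EC2WorkerManager
#   wm = get_worker_manager(config, cloud='gcloud', verbose=10)  # GCloudWorkerManager
#   wm = get_worker_manager(config, cloud=None)                  # returns None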
def __init__(self, func=lambda x: x, parent=None, q_in=None, q_out=None,
             num_workers=0, q_size=None, batch_size=1,
             filterf=lambda x: x is not None,
             batcher=lambda x: x, timeout=1):
    min_q_size = 10
    self.func = func
    self.parent = parent
    self.num_workers = num_workers
    self.filterf = filterf
    self.batch_size = batch_size
    self.batcher = batcher

    if num_workers > 0:
        self.q_size = q_size if q_size else 2 * num_workers

    self.q_out = q_out
    self.q_in = q_in
    # note: as written, this overrides the queue size chosen above
    # when num_workers > 0
    self.q_size = max(min_q_size, 2 * num_workers)

    self.logger = logs.getLogger('BufferedPipe')
    self.logger.setLevel(10)
    self.timeout = timeout
    self.worker_frame = Thread
def __init__(self, path=None, verbose=10):
    if path is None:
        self.path = fs_tracker.get_queue_directory()
    else:
        self.path = path

    self.logger = logs.getLogger(self.__class__.__name__)
    self.logger.setLevel(verbose)
def __init__(self, db_config,
             measure_timestamp_diff=False,
             blocking_auth=True,
             compression=None,
             verbose=10):
    guest = db_config.get('guest')

    self.app = pyrebase.initialize_app(db_config)

    if compression is None:
        compression = db_config.get('compression')

    self.auth = None
    if not guest and 'serviceAccount' not in db_config.keys():
        self.auth = get_auth(self.app,
                             db_config.get("use_email_auth"),
                             db_config.get("email"),
                             db_config.get("password"),
                             blocking_auth)

    self.logger = logs.getLogger('FirebaseArtifactStore')
    self.logger.setLevel(verbose)

    super(FirebaseArtifactStore, self).__init__(
        measure_timestamp_diff, compression=compression)
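# A minimal sketch of a db_config dict accepted by the constructor above.
# Field names are inferred from the .get() lookups plus the standard
# pyrebase fields; the values are placeholders, not an official schema:
#
#   example_db_config = {
#       'apiKey': '<firebase-api-key>',
#       'authDomain': '<project>.firebaseapp.com',
#       'databaseURL': 'https://<project>.firebaseio.com',
#       'storageBucket': '<project>.appspot.com',
#       'guest': True,        # skip interactive authentication
#       'compression': None,  # or a codec name understood by the base store
#   }
#   store = FirebaseArtifactStore(example_db_config, verbose=10)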
def getlogger():
    global logger
    if logger is None:
        logger = logs.getLogger('studio_server')
        logger.setLevel(10)
    return logger
def __init__(self, zone='us-east1-c', auth_cookie=None, verbose=10,
             branch=None, user_startup_script=None):
    assert 'GOOGLE_APPLICATION_CREDENTIALS' in os.environ.keys()
    with open(os.environ['GOOGLE_APPLICATION_CREDENTIALS'], 'r') as f:
        credentials_dict = json.loads(f.read())

    self.compute = googleapiclient.discovery.build('compute', 'v1')

    self.startup_script_file = os.path.join(
        os.path.dirname(__file__),
        'scripts/gcloud_worker_startup.sh')

    self.install_studio_script = os.path.join(
        os.path.dirname(__file__),
        'scripts/install_studio.sh')

    self.zone = zone
    self.projectid = credentials_dict['project_id']

    self.logger = logs.getLogger("GCloudWorkerManager")
    self.logger.setLevel(verbose)

    self.auth_cookie = auth_cookie
    self.user_startup_script = user_startup_script

    self.repo_url = git_util.get_my_repo_url()
    self.branch = branch if branch else git_util.get_my_checkout_target()

    self.log_bucket = "studioml-logs"

    if user_startup_script:
        self.logger.warn('User startup script argument is deprecated')
def __init__(self, args):
    self.config = args.config
    if args.guest:
        self.config['database']['guest'] = True

    self.logger = logs.getLogger('LocalExecutor')
    self.logger.setLevel(model.parse_verbosity(self.config.get('verbose')))
    self.logger.debug("Config: ")
    self.logger.debug(self.config)
def allocate_resources(experiment, config=None, verbose=10):
    logger = logs.getLogger('allocate_resources')
    logger.setLevel(verbose)
    logger.info('Allocating resources {} for experiment {}'.format(
        experiment.resources_needed, experiment.key))

    ret_val = True
    # default to 0 when resources_needed has no 'gpus' entry, so that
    # int() never receives None
    gpus_needed = int(experiment.resources_needed.get('gpus', 0)) \
        if experiment.resources_needed else 0

    if gpus_needed > 0:
        ret_val = ret_val and allocate_gpus(gpus_needed, config)
    else:
        allocate_gpus(0, config)

    return ret_val
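# Hedged usage sketch for allocate_resources: 'experiment' stands for any
# object exposing the attributes read above (key and a resources_needed
# dict); allocate_gpus is assumed to be defined elsewhere in this module.
#
#   experiment.resources_needed = {'gpus': 2, 'cpus': 4, 'ram': '8g'}
#   if allocate_resources(experiment, config=config, verbose=10):
#       executor.run(experiment)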
def __init__(self, measure_timestamp_diff=False, compression=None,
             verbose=logs.DEBUG):
    if measure_timestamp_diff:
        try:
            self.timestamp_shift = self._measure_timestamp_diff()
        except BaseException:
            self.timestamp_shift = 0
    else:
        self.timestamp_shift = 0

    self.compression = compression

    self.logger = logs.getLogger(self.__class__.__name__)
    self.logger.setLevel(verbose)
def __init__(self, name, verbose=10, receive_timeout=300, retry_time=10):
    assert boto3 is not None
    self._client = boto3.client('sqs')

    create_q_response = self._client.create_queue(QueueName=name)
    self._queue_url = create_q_response['QueueUrl']

    self.logger = logs.getLogger('SQSQueue')
    if verbose is not None:
        self.logger.setLevel(parse_verbosity(verbose))
    self._name = name
    self.logger.info('Creating SQS queue with name ' + name)
    self.logger.info('Queue url = ' + self._queue_url)

    self._receive_timeout = receive_timeout
    self._retry_time = retry_time
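# Illustrative use of SQSQueue, assuming AWS credentials and a region are
# available to boto3. The dequeue/acknowledge calls mirror how worker_loop
# further down consumes a queue; enqueue is an assumption about the rest of
# this class and is not shown above.
#
#   q = SQSQueue('studioml-test-queue', verbose=10)
#   # q.enqueue(json.dumps(payload))
#   msg = q.dequeue(acknowledge=False, timeout=60)
#   if msg:
#       body, ack_key = msg
#       q.acknowledge(ack_key)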
def __init__(self, config,
             measure_timestamp_diff=False,
             compression=None,
             verbose=10):
    self.logger = logs.getLogger('GCloudArtifactStore')
    self.logger.setLevel(verbose)

    self.config = config

    self._client = None
    self._client_timestamp = None

    compression = compression if compression else config.get('compression')

    super(GCloudArtifactStore, self).__init__(
        measure_timestamp_diff,
        compression=compression)
def __init__(self, firebase, use_email_auth=False,
             email=None, password=None, blocking=True):
    if not os.path.exists(TOKEN_DIR):
        os.makedirs(TOKEN_DIR)

    self.logger = logs.getLogger(self.__class__.__name__)
    self.logger.setLevel(logs.DEBUG)
    self.firebase = firebase
    self.user = {}
    self.use_email_auth = use_email_auth
    if use_email_auth:
        if email and password:
            self.email = email
            self.password = password
        else:
            self.email = input('Firebase token is not found or expired! ' +
                               'You need to re-login. (Or re-run with ' +
                               'studio/studio-runner ' +
                               'with --guest option ) ' +
                               '\nemail:')
            self.password = getpass.getpass('password:')

    # The original source was redacted here ('******' in the extracted
    # text); the token refresh and the blocking guard below are a
    # reconstruction of the likely control flow.
    self.expired = True
    self._update_user()

    if blocking and self.expired:
        print('Authentication required! Either specify ' +
              'use_email_auth in config file, or run ' +
              'studio and go to webui ' +
              '(localhost:5000 by default) ' +
              'to authenticate using google credentials')
        while self.expired:
            time.sleep(1)
            self._update_user()

    self.sched = BackgroundScheduler()
    self.sched.start()
    self.sched.add_job(self._update_user, 'interval', minutes=31)
    atexit.register(self.sched.shutdown)
def get_db_provider(config=None, blocking_auth=True):
    if not config:
        config = get_config()
    verbose = parse_verbosity(config.get('verbose'))

    logger = logs.getLogger("get_db_provider")
    logger.setLevel(verbose)
    logger.debug('Choosing db provider with config:')
    logger.debug(config)

    if 'storage' in config.keys():
        artifact_store = get_artifact_store(config['storage'],
                                            blocking_auth=blocking_auth,
                                            verbose=verbose)
    else:
        artifact_store = None

    assert 'database' in config.keys()
    db_config = config['database']
    if db_config['type'].lower() == 'firebase':
        return FirebaseProvider(db_config,
                                blocking_auth,
                                verbose=verbose,
                                store=artifact_store)
    elif db_config['type'].lower() == 'http':
        return HTTPProvider(db_config,
                            verbose=verbose,
                            blocking_auth=blocking_auth)
    elif db_config['type'].lower() == 's3':
        return S3Provider(db_config,
                          verbose=verbose,
                          store=artifact_store,
                          blocking_auth=blocking_auth)
    elif db_config['type'].lower() == 'gs':
        return GSProvider(db_config,
                          verbose=verbose,
                          store=artifact_store,
                          blocking_auth=blocking_auth)
    else:
        raise ValueError('Unknown type of the database ' + db_config['type'])
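# A minimal sketch of how get_db_provider is typically driven (the context
# manager and get_experiment usage match worker_loop further down; the
# config values themselves are placeholders):
#
#   config = {
#       'verbose': 'debug',
#       'database': {'type': 's3', ...},   # or 'firebase' / 'http' / 'gs'
#       'storage': {...},                  # optional artifact store config
#   }
#   with get_db_provider(config, blocking_auth=False) as db:
#       experiment = db.get_experiment('my-experiment-key')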
def __init__(self, queue_name, sub_name=None, verbose=10):
    from google.cloud import pubsub

    assert 'GOOGLE_APPLICATION_CREDENTIALS' in os.environ.keys()
    with open(os.environ['GOOGLE_APPLICATION_CREDENTIALS']) as f:
        credentials = json.loads(f.read())

    project_name = credentials['project_id']

    self.logger = logs.getLogger(self.__class__.__name__)
    if verbose is not None:
        self.logger.setLevel(parse_verbosity(verbose))

    self.pubclient = pubsub.PublisherClient()
    self.subclient = pubsub.SubscriberClient()

    self.project = project_name
    self.topic_name = self.pubclient.topic_path(project_name, queue_name)
    self.logger.info("Topic name = {}".format(self.topic_name))
    try:
        self.pubtopic = self.pubclient.get_topic(self.topic_name)
    except BaseException as e:
        self.pubtopic = self.pubclient.create_topic(self.topic_name)
        self.logger.info('topic {} created'.format(self.topic_name))

    sub_name = sub_name if sub_name else queue_name + "_sub"
    self.logger.info("Topic name = {}".format(queue_name))
    self.logger.info("Subscription name = {}".format(sub_name))

    self.sub_name = self.subclient.subscription_path(
        project_name, sub_name)
    try:
        self.subclient.get_subscription(self.sub_name)
    except BaseException as e:
        self.logger.warn(e)
        self.subclient.create_subscription(self.sub_name, self.topic_name,
                                           ack_deadline_seconds=20)
        self.logger.info('subscription {} created'.format(sub_name))
def __init__(self, config, verbose=10, blocking_auth=True, compression=None):
    # TODO: implement connection
    self.url = config.get('serverUrl')
    self.verbose = verbose
    self.logger = logs.getLogger('HTTPProvider')
    self.logger.setLevel(self.verbose)

    self.auth = None
    self.app = pyrebase.initialize_app(config)
    guest = config.get('guest')
    if not guest and 'serviceAccount' not in config.keys():
        self.auth = get_auth(self.app,
                             config.get("use_email_auth"),
                             config.get("email"),
                             config.get("password"),
                             blocking_auth)

    self.compression = compression
    if self.compression is None:
        self.compression = config.get('compression')
def __init__(self, db_config, blocking_auth=True, verbose=10, store=None,
             compression=None):
    guest = db_config.get('guest')

    self.app = pyrebase.initialize_app(db_config)

    self.logger = logs.getLogger(self.__class__.__name__)
    self.logger.setLevel(verbose)

    self.compression = compression
    if self.compression is None:
        self.compression = db_config.get('compression')

    self.auth = None
    if not guest and 'serviceAccount' not in db_config.keys():
        self.auth = get_auth(self.app,
                             db_config.get("use_email_auth"),
                             db_config.get("email"),
                             db_config.get("password"),
                             blocking_auth)

    self.store = store if store else FirebaseArtifactStore(
        db_config,
        verbose=verbose,
        blocking_auth=blocking_auth,
        compression=self.compression)

    if self.auth and not self.auth.expired:
        self.register_user(None, self.auth.get_user_email())

    self.max_keys = db_config.get('max_keys', 100)
import logs

from routes import Mapper
from webob import request

LOG = logs.getLogger(__name__)


class Server(object):

    def create(self, body):
        return {'method': 'create'}

    def index(self):
        return {'method': 'index'}

    def show(self, id):
        return {'method': 'get'}

    def delete(self, id):
        return {'method': 'delete'}

    def update(self, id, body):
        return {'method': 'update'}


class Public(object):

    def __init__(self, conf):
        self.conf = conf
        self.mapper = Mapper()
        self.mapper.resource('server', 'servers', controller=Server())
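# Hedged sketch of what the Mapper wired up in Public.__init__ resolves to,
# using the routes library directly (the WSGI glue that would normally
# dispatch to the Server methods is omitted):
#
#   m = Mapper()
#   m.resource('server', 'servers', controller='servers')
#   m.match('/servers', environ={'REQUEST_METHOD': 'GET'})
#   # -> roughly {'controller': 'servers', 'action': 'index'}
#   m.match('/servers/42', environ={'REQUEST_METHOD': 'GET'})
#   # -> roughly {'controller': 'servers', 'action': 'show', 'id': '42'}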
def worker_loop(queue, parsed_args,
                single_experiment=False,
                timeout=0,
                verbose=None):

    fetch_artifacts = True

    logger = logs.getLogger('worker_loop')

    hold_period = 4
    retval = 0
    while True:
        msg = queue.dequeue(acknowledge=False, timeout=timeout)
        if not msg:
            break

        # first_exp, ack_key = queue.dequeue(acknowledge=False)
        first_exp, ack_key = msg

        data_dict = json.loads(sixdecode(first_exp))
        experiment_key = data_dict['experiment']['key']
        config = data_dict['config']
        parsed_args.config = config
        if verbose:
            config['verbose'] = verbose
        else:
            verbose = model.parse_verbosity(config.get('verbose'))

        logger.setLevel(verbose)
        logger.debug('Received message: \n{}'.format(data_dict))

        executor = LocalExecutor(parsed_args)

        with model.get_db_provider(config) as db:
            # experiment = experiment_from_dict(data_dict['experiment'])
            def try_get_experiment():
                experiment = db.get_experiment(experiment_key)
                if experiment is None:
                    raise ValueError(
                        'experiment is not found - indicates storage failure')
                return experiment

            experiment = retry(try_get_experiment,
                               sleep_time=10, logger=logger)

            if config.get('experimentLifetime') and \
               int(str2duration(config['experimentLifetime'])
                   .total_seconds()) + experiment.time_added < time.time():
                logger.info(
                    'Experiment expired (max lifetime of {} was exceeded)'
                    .format(config.get('experimentLifetime')))
                queue.acknowledge(ack_key)
                continue

            if allocate_resources(experiment, config, verbose=verbose):

                def hold_job():
                    queue.hold(ack_key, hold_period)

                hold_job()
                sched = BackgroundScheduler()
                sched.add_job(hold_job, 'interval', minutes=hold_period / 2)
                sched.start()

                try:
                    python = 'python'
                    if experiment.pythonver == 3:
                        python = 'python3'

                    if '_singularity' not in experiment.artifacts.keys():
                        pip_diff = pip_needed_packages(
                            experiment.pythonenv, python)
                        if any(pip_diff):
                            logger.info(
                                'Setting up python packages for experiment')
                            if pip_install_packages(
                                    pip_diff, python, logger) != 0:
                                logger.info(
                                    "Installation of all packages together "
                                    " failed, "
                                    "trying one package at a time")
                                for pkg in pip_diff:
                                    pip_install_packages(
                                        [pkg], python, logger)

                    for tag, art in six.iteritems(experiment.artifacts):
                        if fetch_artifacts or 'local' not in art.keys():
                            logger.info('Fetching artifact ' + tag)
                            if tag == 'workspace':
                                art['local'] = retry(
                                    lambda: db.get_artifact(
                                        art, only_newer=False),
                                    sleep_time=10,
                                    logger=logger)
                            else:
                                art['local'] = retry(
                                    lambda: db.get_artifact(art),
                                    sleep_time=10,
                                    logger=logger)

                    returncode = executor.run(experiment)
                    if returncode != 0:
                        retval = returncode
                finally:
                    sched.shutdown()
                    queue.acknowledge(ack_key)

                if single_experiment:
                    logger.info('single_experiment is True, quitting')
                    return retval
            else:
                logger.info('Cannot run experiment ' + experiment.key +
                            ' due lack of resources. Will retry')
                time.sleep(config['sleep_time'])

    # wait_for_messages(queue, timeout, logger)
    # queue = glob.glob(fs_tracker.get_queue_directory() + "/*")
    logger.info("Queue in {} is empty, quitting".format(
        fs_tracker.get_queue_directory()))

    return retval
def get_logger():
    global logger
    if not logger:
        logger = logs.getLogger('studio-serve')
        logger.setLevel(logs.DEBUG)
    return logger
def main():
    """
    A little utility to handle reading and writing streams
    to and from a queue.
    --pub <queue>  : publish what's read from stdin to <queue>
    --sub <queue>  : read from <queue> and write the messages to stdout
    --cat          : when used with --pub, write all published messages
                     to stdout
    --clean        : check in incoming and outgoing messages.
                     Verify the message is correct JSON and add
                     an embersId if needed.
    --log_file     : Path to write the log file to
    --log_level    : Logging level
    Other standard EMBERS options (e.g. --verbose).
    """
    import args
    import message

    global log

    ap = args.get_parser()
    ap.add_argument('--clean', action="store_true",
                    help='Verify message format and add standard fields '
                         'such as embersId.')
    ap.add_argument('--addfeed', action="store_true",
                    help='Add feed and feedPath fields to published message.')
    ap.add_argument('--cat', action="store_true",
                    help='Write all published messages to stdout.')
    ap.add_argument('--rm', nargs="+", help="delete queue")
    arg = ap.parse_args()
    log = logs.getLogger(log_name=arg.log_file)
    logs.init(arg, l=arg.log_level, logfile=arg.log_file)
    init(arg)

    if arg.rm and not arg.sub:
        for queue in arg.rm:
            print "Deleting", queue,
            queue = ikqueue.Queue(queue)
            queue.maybe_bind(connect())
            queue.delete()
            print "."
        return

    try:
        # need to use the raw/utf handler unless we are doing clean
        marshal = UnicodeMarshal()
        if arg.clean or arg.addfeed:
            marshal = JsonMarshal()

        if arg.sub is None and os.environ.get('UPSTART_JOB') is None:
            arg.sub = '-'  # stdin

        subq = open(arg.sub, 'r')
        # , marshal=marshal, ssh_key=arg.ssh_key, ssh_conn=arg.tunnel)

        if arg.pub is None and os.environ.get('UPSTART_JOB') is None:
            arg.pub = '-'  # stdout

        pubq = open(arg.pub, 'w', capture=arg.cat, marshal=marshal)
    except Exception as e:
        log.exception("Exception opening queues: %s" % e)

    # "Human-readable" queue name can be retrieved as
    #
    # sname = subq.get_name()
    # pname = pubq.get_name()
    rc = 0
    try:
        it = subq.__iter__()
        while True:
            m = ''
            try:
                m = it.next()
                if arg.clean:
                    m = message.clean(m)

                if m:
                    if arg.addfeed:
                        m = message.add_embers_ids(m,
                                                   feed=pubq.get_name(),
                                                   feedPath=pubq.get_name())
                    pubq.write(m)

            except StopIteration:
                break
            except KeyboardInterrupt:
                break
            except Exception as e:
                rc += 1
                if m:
                    log.exception('Could not process message %s: %s' % (m, e))
                else:
                    log.exception('Unknown processing error %s' % e)

    except KeyboardInterrupt:
        pass
    except Exception as e:
        rc = 1
        log.exception('Top level exception %s' % e)

    return rc
from BeautifulSoup import BeautifulSoup, Comment
from langdetect import detect
import logs
import args as args_util
import re
from sys import exit
from lxml import html as doc_parser
from pycountry import languages
from goose import Goose, Crawler
from goose.text import StopWordsArabic
from goose.text import StopWordsChinese
from goose.text import StopWordsKorean
import requests
import gevent

log = logs.getLogger(__name__)


def validate_encoding(http_response):
    """
    Validates that the encoding for the HTML/text in the http_response
    is properly marked. If it isn't, the encoding is corrected so that we
    don't receive errors on special characters of foreign languages
    :param http_response:
    :return: http_response: response with correct encoding
    """
    encodings = requests.utils.get_encodings_from_content(
        http_response.content)
    if encodings and encodings[0].lower() != http_response.encoding.lower():
        # The original snippet was cut off after the log call; the correction
        # and return below follow the behavior described in the docstring.
        log.debug('Correcting encoding %s to %s' %
                  (http_response.encoding, encodings[0]))
        http_response.encoding = encodings[0]
    return http_response
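# Example use of validate_encoding on a fetched page (network call shown
# only for illustration):
#
#   resp = requests.get('http://example.com/article.html')
#   resp = validate_encoding(resp)
#   text = resp.text  # decoded with the encoding declared in the content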
def main(args=sys.argv[1:]):
    logger = logs.getLogger('studio-runner')
    parser = argparse.ArgumentParser(
        description='Studio runner. \
                     Usage: studio run <runner_arguments> \
                     script <script_arguments>')
    parser.add_argument('--config', help='configuration file', default=None)
    parser.add_argument('--project', help='name of the project', default=None)
    parser.add_argument(
        '--experiment', '-e',
        help='Name of the experiment. If none provided, ' +
             'random uuid will be generated',
        default=None)
    parser.add_argument(
        '--guest',
        help='Guest mode (does not require db credentials)',
        action='store_true')
    parser.add_argument(
        '--force-git',
        help='If run in a git directory, force running the experiment ' +
             'even if changes are not committed',
        action='store_true')
    parser.add_argument(
        '--gpus',
        help='Number of gpus needed to run the experiment',
        type=int, default=None)
    parser.add_argument(
        '--cpus',
        help='Number of cpus needed to run the experiment' +
             ' (used to configure cloud instance)',
        type=int, default=None)
    parser.add_argument(
        '--ram',
        help='Amount of RAM needed to run the experiment' +
             ' (used to configure cloud instance), ex: 10G, 10GB',
        default=None)
    parser.add_argument(
        '--gpuMem',
        help='Amount of GPU RAM needed to run the experiment',
        default=None)
    parser.add_argument(
        '--hdd',
        help='Amount of hard drive space needed to run the experiment' +
             ' (used to configure cloud instance), ex: 10G, 10GB',
        default=None)
    parser.add_argument(
        '--queue', '-q',
        help='Name of the remote execution queue',
        default=None)
    parser.add_argument(
        '--cloud',
        help='Cloud execution mode. Could be gcloud, gcspot, ec2 or ec2spot',
        default=None)
    parser.add_argument(
        '--bid',
        help='Spot instance price bid, specified in USD or in percentage ' +
             'of on-demand instance price. Default is %(default)s',
        default='100%')
    parser.add_argument(
        '--capture-once', '-co',
        help='Name of the immutable artifact to be captured. ' +
             'It will be captured once before the experiment is run',
        default=[], action='append')
    parser.add_argument(
        '--capture', '-c',
        help='Name of the mutable artifact to be captured continuously',
        default=[], action='append')
    parser.add_argument(
        '--reuse', '-r',
        help='Name of the artifact from another experiment to use',
        default=[], action='append')
    parser.add_argument(
        '--verbose', '-v',
        help='Verbosity level. Allowed values: ' +
             'debug, info, warn, error, crit ' +
             'or numerical value of logger levels.',
        default=None)
    parser.add_argument(
        '--metric',
        help='Metric to show in the summary of the experiment, ' +
             'and to base hyperparameter search on. ' +
             'Refers to a scalar value in tensorboard log, ' +
             'example: --metric=val_loss[:final | :min | :max] to report ' +
             'validation loss in the end of the keras experiment ' +
             '(or smallest or largest throughout the experiment for :min ' +
             'and :max respectively)',
        default=None)
    parser.add_argument(
        '--hyperparam', '-hp',
        help='Try out multiple values of a certain parameter. ' +
             'For example, --hyperparam=learning_rate:0.01:0.1:l10 ' +
             'will instantiate 10 versions of the script, replace ' +
             'learning_rate with one of the 10 values for learning ' +
             'rate that lies on a log grid from 0.01 to 0.1, create ' +
             'experiments and place them in the queue.',
        default=[], action='append')
    parser.add_argument(
        '--num-workers',
        help='Number of local or cloud workers to spin up',
        type=int, default=None)
    parser.add_argument(
        '--python-pkg',
        help='Python package not present in the current environment ' +
             'that is needed for experiment. Only compatible with ' +
             'remote and cloud workers for now',
        default=[], action='append')
    parser.add_argument(
        '--ssh-keypair',
        help='Name of the SSH keypair used to access the EC2 ' +
             'instances directly',
        default=None)
    parser.add_argument(
        '--optimizer', '-opt',
        help='Name of optimizer to use, by default is grid search. ' +
             'The name of the optimizer must either be in ' +
             'studio/optimizer_plugins ' +
             'directory or the path to the optimizer source file ' +
             'must be supplied. ',
        default='grid')
    parser.add_argument(
        '--cloud-timeout',
        help="Time (in seconds) that cloud workers wait for messages. " +
             "If negative, " +
             "wait for the first message in the queue indefinitely " +
             "and shut down " +
             "as soon as no new messages are available. " +
             "If zero, don't wait at all. " +
             "Default value is %(default)d",
        type=int, default=300)
    parser.add_argument(
        '--user-startup-script',
        help='Path of script to run immediately ' +
             'before running the remote worker',
        default=None)
    parser.add_argument(
        '--branch',
        help='Branch of studioml to use when running remote worker, useful ' +
             'for debugging pull requests. Default is current',
        default=None)
    parser.add_argument(
        '--max-duration',
        help='Max experiment runtime (i.e. time after which experiment ' +
             'should be killed no matter what.). Examples of values ' +
             'might include 5h, 48h2m10s',
        default=None)
    parser.add_argument(
        '--lifetime',
        help='Max experiment lifetime (i.e. wait time after which ' +
             'experiment loses relevance and should not be started)' +
             ' Examples include 240h30m10s',
        default=None)
    parser.add_argument(
        '--container',
        help='Singularity container in which experiment should be run. ' +
             'Assumes that container has all dependencies installed',
        default=None)
    parser.add_argument(
        '--port',
        help='Ports to open on a cloud instance',
        default=[], action='append')

    # detect which argument is the script filename
    # and attribute all arguments past that index as related to the script
    (runner_args, other_args) = parser.parse_known_args(args)
    py_suffix_args = [i for i, arg in enumerate(args)
                      if arg.endswith('.py') or '::' in arg]

    rerun = False
    if len(py_suffix_args) < 1:
        print('None of the arguments end with .py')
        if len(other_args) == 0:
            print("Trying to run a container job")
            assert runner_args.container is not None
            exec_filename = None
        elif len(other_args) == 1:
            print("Treating last argument as experiment key to rerun")
            rerun = True
            experiment_key = args[-1]
        else:
            print("Too many extra arguments - should be either none " +
                  "for container job or one for experiment re-run")
            sys.exit(1)
    else:
        script_index = py_suffix_args[0]
        exec_filename, other_args = args[script_index], \
            args[script_index + 1:]
        runner_args = parser.parse_args(args[:script_index])

    # TODO: Queue the job based on arguments and only then execute.
    config = model.get_config(runner_args.config)

    if runner_args.verbose:
        config['verbose'] = runner_args.verbose

    if runner_args.guest:
        config['database']['guest'] = True

    if runner_args.container:
        runner_args.capture_once.append(
            runner_args.container + ':_singularity')

    verbose = model.parse_verbosity(config['verbose'])
    logger.setLevel(verbose)

    if git_util.is_git() and not git_util.is_clean() and not rerun:
        logger.warn('Running from dirty git repo')
        if not runner_args.force_git:
            logger.error(
                'Specify --force-git to run experiment from dirty git repo')
            sys.exit(1)

    resources_needed = parse_hardware(runner_args, config['resources_needed'])
    logger.debug('resources requested: ')
    logger.debug(str(resources_needed))

    artifacts = {}
    artifacts.update(parse_artifacts(runner_args.capture, mutable=True))
    artifacts.update(parse_artifacts(runner_args.capture_once, mutable=False))
    with model.get_db_provider(config) as db:
        artifacts.update(parse_external_artifacts(runner_args.reuse, db))

    if runner_args.branch:
        config['cloud']['branch'] = runner_args.branch

    if runner_args.user_startup_script:
        config['cloud']['user_startup_script'] = \
            runner_args.user_startup_script

    if runner_args.lifetime:
        config['experimentLifetime'] = runner_args.lifetime

    if any(runner_args.hyperparam):
        # string comparison with '==' rather than 'is' (identity check)
        if runner_args.optimizer == "grid":
            experiments = add_hyperparam_experiments(
                exec_filename,
                other_args,
                runner_args,
                artifacts,
                resources_needed,
                logger)

            queue_name = submit_experiments(
                experiments,
                config=config,
                logger=logger,
                queue_name=runner_args.queue,
                cloud=runner_args.cloud)

            spin_up_workers(
                runner_args,
                config,
                resources_needed,
                queue_name=queue_name,
                verbose=verbose)
        else:
            opt_modulepath = os.path.join(
                os.path.dirname(os.path.abspath(__file__)),
                "optimizer_plugins",
                runner_args.optimizer + ".py")
            if not os.path.exists(opt_modulepath):
                opt_modulepath = os.path.abspath(
                    os.path.expanduser(runner_args.optimizer))
            logger.info('optimizer path: %s' % opt_modulepath)

            assert os.path.exists(opt_modulepath)
            sys.path.append(os.path.dirname(opt_modulepath))
            opt_module = importlib.import_module(
                os.path.basename(opt_modulepath.replace(".py", '')))

            h = HyperparameterParser(runner_args, logger)
            hyperparams = h.parse()
            optimizer = getattr(
                opt_module,
                "Optimizer")(hyperparams, config['optimizer'], logger)

            workers_started = False
            queue_name = runner_args.queue
            while not optimizer.stop():
                hyperparam_pop = optimizer.ask()
                hyperparam_tuples = h.convert_to_tuples(hyperparam_pop)

                experiments = add_hyperparam_experiments(
                    exec_filename,
                    other_args,
                    runner_args,
                    artifacts,
                    resources_needed,
                    logger,
                    optimizer=optimizer,
                    hyperparam_tuples=hyperparam_tuples)

                queue_name = submit_experiments(
                    experiments,
                    config=config,
                    logger=logger,
                    cloud=runner_args.cloud,
                    queue_name=queue_name)

                if not workers_started:
                    spin_up_workers(
                        runner_args,
                        config,
                        resources_needed,
                        queue_name=queue_name,
                        verbose=verbose)
                    workers_started = True

                fitnesses, behaviors = get_experiment_fitnesses(
                    experiments, optimizer, config, logger)

                # for i, hh in enumerate(hyperparam_pop):
                #     print fitnesses[i]
                #     for hhh in hh:
                #         print hhh

                try:
                    optimizer.tell(hyperparam_pop, fitnesses, behaviors)
                except BaseException:
                    optimizer.tell(hyperparam_pop, fitnesses)

                try:
                    optimizer.disp()
                except BaseException:
                    logger.warn('Optimizer has no disp() method')
    else:
        if rerun:
            with model.get_db_provider(config) as db:
                experiment = db.get_experiment(experiment_key)
                new_key = runner_args.experiment if runner_args.experiment \
                    else experiment_key + '_rerun' + str(uuid.uuid4())
                experiment.key = new_key
                for _, art in six.iteritems(experiment.artifacts):
                    art['mutable'] = False

            experiments = [experiment]
        else:
            experiments = [create_experiment(
                filename=exec_filename,
                args=other_args,
                experiment_name=runner_args.experiment,
                project=runner_args.project,
                artifacts=artifacts,
                resources_needed=resources_needed,
                metric=runner_args.metric,
                max_duration=runner_args.max_duration,
            )]

        queue_name = submit_experiments(
            experiments,
            config=config,
            logger=logger,
            cloud=runner_args.cloud,
            queue_name=runner_args.queue)

        spin_up_workers(
            runner_args,
            config,
            resources_needed,
            queue_name=queue_name,
            verbose=verbose)

    return
def get_logger():
    global _my_logger
    if not _my_logger:
        _my_logger = logs.getLogger('studio-runs')
    return _my_logger
import psutil
import time
import six

from pygtail import Pygtail
import threading

from apscheduler.schedulers.background import BackgroundScheduler

import fs_tracker, model, logs
from local_queue import LocalQueue
from gpu_util import get_available_gpus, get_gpu_mapping, get_gpus_summary
from experiment import Experiment
from util import sixdecode, str2duration, retry
from model import parse_verbosity

logs.getLogger('apscheduler.scheduler').setLevel(logs.ERROR)


class LocalExecutor(object):
    """Runs job while capturing environment and logs results.
    """

    def __init__(self, args):
        self.config = args.config
        if args.guest:
            self.config['database']['guest'] = True

        self.logger = logs.getLogger('LocalExecutor')
        self.logger.setLevel(
            model.parse_verbosity(self.config.get('verbose')))
        self.logger.debug("Config: ")
        self.logger.debug(self.config)