Example #1
    def __init__(self,
                 auth_cookie=None,
                 verbose=10,
                 branch=None,
                 user_startup_script=None):
        self.startup_script_file = os.path.join(
            os.path.dirname(__file__), 'scripts/ec2_worker_startup.sh')

        self.install_studio_script = os.path.join(os.path.dirname(__file__),
                                                  'scripts/install_studio.sh')

        self.client = boto3.client('ec2')
        self.asclient = boto3.client('autoscaling')
        self.cwclient = boto3.client('cloudwatch')

        self.region = self.client._client_config.region_name

        self.logger = logs.getLogger('EC2WorkerManager')
        self.logger.setLevel(verbose)
        self.auth_cookie = auth_cookie

        self.prices = self._get_ondemand_prices(_instance_specs.keys())

        self.repo_url = git_util.get_my_repo_url()
        self.branch = branch if branch else git_util.get_my_checkout_target()
        self.user_startup_script = user_startup_script

        if user_startup_script:
            self.logger.warn('User startup script argument is deprecated')
Example #2
def get_worker_manager(config, cloud=None, verbose=10):
    if cloud is None:
        return None

    assert cloud in ['gcloud', 'gcspot', 'ec2', 'ec2spot']
    logger = logs.getLogger('runner.get_worker_manager')
    logger.setLevel(verbose)

    auth_cookie = None if config['database'].get('guest') \
        else os.path.join(auth.TOKEN_DIR,
                          config['database']['apiKey'])

    branch = config['cloud'].get('branch')

    logger.info('using branch {}'.format(branch))

    if cloud in ['gcloud', 'gcspot']:

        cloudconfig = config['cloud']['gcloud']
        worker_manager = GCloudWorkerManager(
            auth_cookie=auth_cookie,
            zone=cloudconfig['zone'],
            branch=branch,
            user_startup_script=config['cloud'].get('user_startup_script')
        )

    if cloud in ['ec2', 'ec2spot']:
        worker_manager = EC2WorkerManager(
            auth_cookie=auth_cookie,
            branch=branch,
            user_startup_script=config['cloud'].get('user_startup_script')
        )
    return worker_manager
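
The function above only reads a handful of keys from the config dict. A minimal sketch of a config that would satisfy it, assuming only the key names visible in the body (all values are illustrative placeholders, not working credentials):

# Hypothetical config for get_worker_manager(); only the keys read in the
# function body are shown, and all values are placeholders.
config = {
    'database': {
        'guest': False,
        'apiKey': 'my-api-key',            # joined with auth.TOKEN_DIR
    },
    'cloud': {
        'branch': None,                     # falls back to the current checkout
        'user_startup_script': None,        # deprecated, see the warnings above
        'gcloud': {'zone': 'us-east1-c'},   # only required for gcloud/gcspot
    },
}

worker_manager = get_worker_manager(config, cloud='ec2')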
Example #3
    def __init__(self,
                 func=lambda x: x,
                 parent=None,
                 q_in=None,
                 q_out=None,
                 num_workers=0,
                 q_size=None,
                 batch_size=1,
                 filterf=lambda x: x is not None,
                 batcher=lambda x: x,
                 timeout=1):

        min_q_size = 10

        self.func = func
        self.parent = parent
        self.num_workers = num_workers
        self.filterf = filterf
        self.batch_size = batch_size
        self.batcher = batcher

        # respect an explicit q_size; otherwise default to twice the number
        # of workers, but never drop below min_q_size
        if q_size is None:
            q_size = 2 * num_workers
        self.q_size = max(min_q_size, q_size)

        self.q_out = q_out
        self.q_in = q_in

        self.logger = logs.getLogger('BufferedPipe')
        self.logger.setLevel(10)
        self.timeout = timeout
        self.worker_frame = Thread
Example #4
    def __init__(self, path=None, verbose=10):
        if path is None:
            self.path = fs_tracker.get_queue_directory()
        else:
            self.path = path
        self.logger = logs.getLogger(self.__class__.__name__)
        self.logger.setLevel(verbose)
Example #5
    def __init__(self, db_config,
                 measure_timestamp_diff=False,
                 blocking_auth=True,
                 compression=None,
                 verbose=10):

        guest = db_config.get('guest')

        self.app = pyrebase.initialize_app(db_config)

        if compression is None:
            compression = db_config.get('compression')

        self.auth = None
        if not guest and 'serviceAccount' not in db_config.keys():
            self.auth = get_auth(self.app,
                                 db_config.get("use_email_auth"),
                                 db_config.get("email"),
                                 db_config.get("password"),
                                 blocking_auth)

        self.logger = logs.getLogger('FirebaseArtifactStore')
        self.logger.setLevel(verbose)
        super(FirebaseArtifactStore, self).__init__(
            measure_timestamp_diff,
            compression=compression)
Example #6
def getlogger():
    global logger
    if logger is None:
        logger = logs.getLogger('studio_server')
        logger.setLevel(10)

    return logger
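
getlogger() relies on a module-level logger global that starts out as None, so the logger is only configured on first use rather than at import time. A minimal sketch of that surrounding module state (assumed here, not shown in the example):

import logs

# Module-level cache consulted by getlogger(); created lazily on first call
# so importing the module does not configure logging as a side effect.
logger = None

def getlogger():
    global logger
    if logger is None:
        logger = logs.getLogger('studio_server')
        logger.setLevel(10)
    return logger

# every caller shares the same configured instance
getlogger().info('server starting')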
Example #7
    def __init__(self,
                 zone='us-east1-c',
                 auth_cookie=None,
                 verbose=10,
                 branch=None,
                 user_startup_script=None):
        assert 'GOOGLE_APPLICATION_CREDENTIALS' in os.environ.keys()
        with open(os.environ['GOOGLE_APPLICATION_CREDENTIALS'], 'r') as f:
            credentials_dict = json.loads(f.read())

        self.compute = googleapiclient.discovery.build('compute', 'v1')

        self.startup_script_file = os.path.join(
            os.path.dirname(__file__), 'scripts/gcloud_worker_startup.sh')

        self.install_studio_script = os.path.join(os.path.dirname(__file__),
                                                  'scripts/install_studio.sh')

        self.zone = zone
        self.projectid = credentials_dict['project_id']
        self.logger = logs.getLogger("GCloudWorkerManager")
        self.logger.setLevel(verbose)
        self.auth_cookie = auth_cookie
        self.user_startup_script = user_startup_script
        self.repo_url = git_util.get_my_repo_url()
        self.branch = branch if branch else git_util.get_my_checkout_target()
        self.log_bucket = "studioml-logs"

        if user_startup_script:
            self.logger.warn('User startup script argument is deprecated')
Example #8
    def __init__(self, args):
        self.config = args.config

        if args.guest:
            self.config['database']['guest'] = True

        self.logger = logs.getLogger('LocalExecutor')
        self.logger.setLevel(model.parse_verbosity(self.config.get('verbose')))
        self.logger.debug("Config: ")
        self.logger.debug(self.config)
Example #9
def allocate_resources(experiment, config=None, verbose=10):
    logger = logs.getLogger('allocate_resources')
    logger.setLevel(verbose)
    logger.info('Allocating resources {} for experiment {}'.format(
        experiment.resources_needed, experiment.key))

    ret_val = True
    gpus_needed = int(experiment.resources_needed.get('gpus')) \
        if experiment.resources_needed else 0

    if gpus_needed > 0:
        ret_val = ret_val and allocate_gpus(gpus_needed, config)
    else:
        allocate_gpus(0, config)

    return ret_val
Example #10
    def __init__(self,
                 measure_timestamp_diff=False,
                 compression=None,
                 verbose=logs.DEBUG):
        if measure_timestamp_diff:
            try:
                self.timestamp_shift = self._measure_timestamp_diff()
            except BaseException:
                self.timestamp_shift = 0
        else:
            self.timestamp_shift = 0

        self.compression = compression

        self.logger = logs.getLogger(self.__class__.__name__)
        self.logger.setLevel(verbose)
Example #11
    def __init__(self, name, verbose=10, receive_timeout=300, retry_time=10):
        assert boto3 is not None
        self._client = boto3.client('sqs')

        create_q_response = self._client.create_queue(QueueName=name)

        self._queue_url = create_q_response['QueueUrl']
        self.logger = logs.getLogger('SQSQueue')
        if verbose is not None:
            self.logger.setLevel(parse_verbosity(verbose))
        self._name = name
        self.logger.info('Creating SQS queue with name ' + name)
        self.logger.info('Queue url = ' + self._queue_url)

        self._receive_timeout = receive_timeout
        self._retry_time = retry_time
Example #12
    def __init__(self,
                 config,
                 measure_timestamp_diff=False,
                 compression=None,
                 verbose=10):

        self.logger = logs.getLogger('GCloudArtifactStore')
        self.logger.setLevel(verbose)

        self.config = config
        self._client = None
        self._client_timestamp = None

        compression = compression if compression else config.get('compression')

        super(GCloudArtifactStore, self).__init__(measure_timestamp_diff,
                                                  compression=compression)
Example #13
    def __init__(self,
                 firebase,
                 use_email_auth=False,
                 email=None,
                 password=None,
                 blocking=True):
        if not os.path.exists(TOKEN_DIR):
            os.makedirs(TOKEN_DIR)

        self.logger = logs.getLogger(self.__class__.__name__)
        self.logger.setLevel(logs.DEBUG)

        self.firebase = firebase
        self.user = {}
        self.use_email_auth = use_email_auth
        if use_email_auth:
            if email and password:
                self.email = email
                self.password = password
            else:
                self.email = input('Firebase token is not found or expired! ' +
                                   'You need to re-login. (Or re-run with ' +
                                   'studio/studio-runner ' +
                                   'with --guest option ) '
                                   '\nemail:')
                self.password = getpass.getpass('password:')

        if blocking:
            print('Authentication required! Either specify ' +
                  'use_email_auth in config file, or run '
                  'studio and go to webui ' + '(localhost:5000 by default) '
                  'to authenticate using google credentials')
            while self.expired:
                time.sleep(1)
                self._update_user()

        self.sched = BackgroundScheduler()
        self.sched.start()
        self.sched.add_job(self._update_user, 'interval', minutes=31)
        atexit.register(self.sched.shutdown)
Example #14
def get_db_provider(config=None, blocking_auth=True):
    if not config:
        config = get_config()
    verbose = parse_verbosity(config.get('verbose'))

    logger = logs.getLogger("get_db_provider")
    logger.setLevel(verbose)
    logger.debug('Choosing db provider with config:')
    logger.debug(config)

    if 'storage' in config.keys():
        artifact_store = get_artifact_store(config['storage'],
                                            blocking_auth=blocking_auth,
                                            verbose=verbose)
    else:
        artifact_store = None

    assert 'database' in config.keys()
    db_config = config['database']
    if db_config['type'].lower() == 'firebase':
        return FirebaseProvider(db_config,
                                blocking_auth,
                                verbose=verbose,
                                store=artifact_store)
    elif db_config['type'].lower() == 'http':
        return HTTPProvider(db_config,
                            verbose=verbose,
                            blocking_auth=blocking_auth)
    elif db_config['type'].lower() == 's3':
        return S3Provider(db_config,
                          verbose=verbose,
                          store=artifact_store,
                          blocking_auth=blocking_auth)

    elif db_config['type'].lower() == 'gs':
        return GSProvider(db_config,
                          verbose=verbose,
                          store=artifact_store,
                          blocking_auth=blocking_auth)

    else:
        raise ValueError('Unknown type of the database ' + db_config['type'])
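
The provider choice is driven entirely by the config dict. A sketch of the shape get_db_provider() expects, with key names taken from the lookups above; the values are placeholders, and the optional 'storage' section handed to get_artifact_store() is omitted:

# Illustrative only: key names mirror the accesses in get_db_provider();
# the values are not working credentials.
config = {
    'verbose': 'debug',
    'database': {
        'type': 'firebase',      # firebase | http | s3 | gs
        'apiKey': 'my-api-key',
        'guest': True,
    },
}

db_provider = get_db_provider(config)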
Example #15
    def __init__(self, queue_name, sub_name=None, verbose=10):
        from google.cloud import pubsub

        assert 'GOOGLE_APPLICATION_CREDENTIALS' in os.environ.keys()
        with open(os.environ['GOOGLE_APPLICATION_CREDENTIALS']) as f:
            credentials = json.loads(f.read())

        project_name = credentials['project_id']
        self.logger = logs.getLogger(self.__class__.__name__)
        if verbose is not None:
            self.logger.setLevel(parse_verbosity(verbose))

        self.pubclient = pubsub.PublisherClient()
        self.subclient = pubsub.SubscriberClient()

        self.project = project_name
        self.topic_name = self.pubclient.topic_path(project_name, queue_name)
        self.logger.info("Topic name = {}".format(self.topic_name))
        try:
            self.pubtopic = self.pubclient.get_topic(self.topic_name)
        except BaseException as e:
            self.pubtopic = self.pubclient.create_topic(self.topic_name)
            self.logger.info('topic {} created'.format(self.topic_name))

        sub_name = sub_name if sub_name else queue_name + "_sub"
        self.logger.info("Topic name = {}".format(queue_name))
        self.logger.info("Subscription name = {}".format(sub_name))

        self.sub_name = self.subclient.subscription_path(
            project_name, sub_name)
        try:
            self.subclient.get_subscription(self.sub_name)
        except BaseException as e:
            self.logger.warn(e)
            self.subclient.create_subscription(self.sub_name,
                                               self.topic_name,
                                               ack_deadline_seconds=20)

        self.logger.info('subscription {} created'.format(sub_name))
Example #16
    def __init__(self,
                 config,
                 verbose=10,
                 blocking_auth=True,
                 compression=None):
        # TODO: implement connection
        self.url = config.get('serverUrl')
        self.verbose = verbose
        self.logger = logs.getLogger('HTTPProvider')
        self.logger.setLevel(self.verbose)

        self.auth = None
        self.app = pyrebase.initialize_app(config)
        guest = config.get('guest')
        if not guest and 'serviceAccount' not in config.keys():
            self.auth = get_auth(self.app, config.get("use_email_auth"),
                                 config.get("email"), config.get("password"),
                                 blocking_auth)

        self.compression = compression
        if self.compression is None:
            self.compression = config.get('compression')
Example #17
    def __init__(
            self,
            db_config,
            blocking_auth=True,
            verbose=10,
            store=None,
            compression=None):
        guest = db_config.get('guest')

        self.app = pyrebase.initialize_app(db_config)
        self.logger = logs.getLogger(self.__class__.__name__)
        self.logger.setLevel(verbose)

        self.compression = compression
        if self.compression is None:
            self.compression = db_config.get('compression')

        self.auth = None
        if not guest and 'serviceAccount' not in db_config.keys():
            self.auth = get_auth(self.app,
                                 db_config.get("use_email_auth"),
                                 db_config.get("email"),
                                 db_config.get("password"),
                                 blocking_auth)

        self.store = store if store else FirebaseArtifactStore(
            db_config,
            verbose=verbose,
            blocking_auth=blocking_auth,
            compression=self.compression
        )

        if self.auth and not self.auth.expired:
            self.register_user(None, self.auth.get_user_email())

        self.max_keys = db_config.get('max_keys', 100)
Example #18
File: app.py Project: luoyancn/rest
import logs
from routes import Mapper
from webob import request

LOG = logs.getLogger(__name__)

class Server(object):

    def create(self, body):
        return {'method': 'create'}

    def index(self):
        return {'method': 'index'}

    def show(self, id):
        return {'method': 'get'}

    def delete(self, id):
        return {'method': 'delete'}

    def update(self, id, body):
        return {'method': 'update'}


class Public(object):

    def __init__(self, conf):
        self.conf = conf
        self.mapper = Mapper()
        self.mapper.resource('server', 'servers', controller=Server())
Example #19
def worker_loop(queue,
                parsed_args,
                single_experiment=False,
                timeout=0,
                verbose=None):

    fetch_artifacts = True

    logger = logs.getLogger('worker_loop')

    hold_period = 4
    retval = 0
    while True:
        msg = queue.dequeue(acknowledge=False, timeout=timeout)
        if not msg:
            break

        # first_exp, ack_key = queue.dequeue(acknowledge=False)
        first_exp, ack_key = msg

        data_dict = json.loads(sixdecode(first_exp))
        experiment_key = data_dict['experiment']['key']
        config = data_dict['config']

        parsed_args.config = config
        if verbose:
            config['verbose'] = verbose
        else:
            verbose = model.parse_verbosity(config.get('verbose'))

        logger.setLevel(verbose)

        logger.debug('Received message: \n{}'.format(data_dict))

        executor = LocalExecutor(parsed_args)

        with model.get_db_provider(config) as db:
            # experiment = experiment_from_dict(data_dict['experiment'])
            def try_get_experiment():
                experiment = db.get_experiment(experiment_key)
                if experiment is None:
                    raise ValueError(
                        'experiment is not found - indicates storage failure')
                return experiment

            experiment = retry(try_get_experiment,
                               sleep_time=10,
                               logger=logger)

            if config.get('experimentLifetime') and \
                int(str2duration(config['experimentLifetime'])
                    .total_seconds()) + experiment.time_added < time.time():
                logger.info(
                    'Experiment expired (max lifetime of {} was exceeded)'.
                    format(config.get('experimentLifetime')))
                queue.acknowledge(ack_key)
                continue

            if allocate_resources(experiment, config, verbose=verbose):

                def hold_job():
                    queue.hold(ack_key, hold_period)

                hold_job()
                sched = BackgroundScheduler()
                sched.add_job(hold_job, 'interval', minutes=hold_period / 2)
                sched.start()

                try:
                    python = 'python'
                    if experiment.pythonver == 3:
                        python = 'python3'
                    if '_singularity' not in experiment.artifacts.keys():
                        pip_diff = pip_needed_packages(experiment.pythonenv,
                                                       python)
                        if any(pip_diff):
                            logger.info(
                                'Setting up python packages for experiment')
                            if pip_install_packages(pip_diff, python,
                                                    logger) != 0:

                                logger.info(
                                    "Installation of all packages together " +
                                    " failed, "
                                    "trying one package at a time")

                                for pkg in pip_diff:
                                    pip_install_packages([pkg], python, logger)

                    for tag, art in six.iteritems(experiment.artifacts):
                        if fetch_artifacts or 'local' not in art.keys():
                            logger.info('Fetching artifact ' + tag)
                            if tag == 'workspace':
                                art['local'] = retry(lambda: db.get_artifact(
                                    art, only_newer=False),
                                                     sleep_time=10,
                                                     logger=logger)
                            else:
                                art['local'] = retry(
                                    lambda: db.get_artifact(art),
                                    sleep_time=10,
                                    logger=logger)

                    returncode = executor.run(experiment)
                    if returncode != 0:
                        retval = returncode
                finally:
                    sched.shutdown()
                    queue.acknowledge(ack_key)

                if single_experiment:
                    logger.info('single_experiment is True, quitting')
                    return retval
            else:
                logger.info('Cannot run experiment ' + experiment.key +
                            ' due to lack of resources. Will retry')
                time.sleep(config['sleep_time'])

        # wait_for_messages(queue, timeout, logger)

        # queue = glob.glob(fs_tracker.get_queue_directory() + "/*")

    logger.info("Queue in {} is empty, quitting".format(
        fs_tracker.get_queue_directory()))

    return retval
Example #20
def get_logger():
    global logger
    if not logger:
        logger = logs.getLogger('studio-serve')
        logger.setLevel(logs.DEBUG)
    return logger
Example #21
def main():
    """
    A little utility to handle reading and writing streams
    to and from a queue.
    --pub <queue> : publish what's read from stdin to <queue>
    --sub <queue> : read from <queue> and write the messages to stdout
    --cat         : when used with --pub, write all published messages to stdout
    --clean       : check incoming and outgoing messages.
                    Verify the message is correct JSON and add
                    an embersId if needed.
    --log_file    : Path to write the log file to
    --log_level   : Logging level
    Other standard EMBERS options (e.g. --verbose).
    """
    import args
    import message
    global log

    ap = args.get_parser()
    ap.add_argument('--clean', action="store_true",
                    help='Verify message format and add standard fields such as embersId.')
    ap.add_argument('--addfeed', action="store_true", help='Add feed and feedPath fields to published message.')
    ap.add_argument('--cat', action="store_true", help='Write all published messages to stdout.')
    ap.add_argument('--rm', nargs="+", help="delete queue")
    arg = ap.parse_args()
    log = logs.getLogger(log_name=arg.log_file)
    logs.init(arg, l=arg.log_level, logfile=arg.log_file)
    init(arg)

    if arg.rm and not arg.sub:
        for queue in arg.rm:
            print "Deleting", queue,
            queue = ikqueue.Queue(queue)
            queue.maybe_bind(connect())
            queue.delete()
            print "."
        return
    try:
        # need to use the raw/utf handler unless we are doing clean
        marshal = UnicodeMarshal()
        if arg.clean or arg.addfeed:
            marshal = JsonMarshal()

        if arg.sub is None and os.environ.get('UPSTART_JOB') is None:
            arg.sub = '-'  # stdin

        subq = open(arg.sub, 'r') #, marshal=marshal, ssh_key=arg.ssh_key, ssh_conn=arg.tunnel)

        if arg.pub is None and os.environ.get('UPSTART_JOB') is None:
            arg.pub = '-'  # stdout

        pubq = open(arg.pub, 'w', capture=arg.cat, marshal=marshal)
    except Exception as e:
        log.exception("Exception opening queues: %s" % e)

    # "Human-readable" queue name can be retrieved as
    #
    # sname = subq.get_name()
    # pname = pubq.get_name()
    rc = 0
    try:
        it = subq.__iter__()
        while True:
            m = ''
            try:
                m = it.next()
                if arg.clean:
                    m = message.clean(m)

                if m:
                    if arg.addfeed:
                        m = message.add_embers_ids(m, feed=pubq.get_name(), feedPath=pubq.get_name())
                    pubq.write(m)
            except StopIteration:
                break
            except KeyboardInterrupt:
                break
            except Exception as e:
                rc += 1
                if m:
                    log.exception('Could not process message %s: %s' % (m, e))
                else:
                    log.exception('Unknown processing error %s' % e)
    except KeyboardInterrupt:
        pass
    except Exception as e:
        rc = 1
        log.exception('Top level exception %s' % e)

    return rc
Example #22
def main():
    """
    A little utility to handle reading and writing streams
    to and from a queue.
    --pub <queue> : publish what's read from stdin to <queue>
    --sub <queue> : read from <queue> and write the messages to stdout
    --cat         : when used with --pub, write all published messages to stdout
    --clean       : check incoming and outgoing messages.
                    Verify the message is correct JSON and add
                    an embersId if needed.
    --log_file    : Path to write the log file to
    --log_level   : Logging level
    Other standard EMBERS options (e.g. --verbose).
    """
    import args
    import message
    global log

    ap = args.get_parser()
    ap.add_argument(
        '--clean',
        action="store_true",
        help='Verify message format and add standard fields such as embersId.')
    ap.add_argument('--addfeed',
                    action="store_true",
                    help='Add feed and feedPath fields to published message.')
    ap.add_argument('--cat',
                    action="store_true",
                    help='Write all published messages to stdout.')
    ap.add_argument('--rm', nargs="+", help="delete queue")
    arg = ap.parse_args()
    log = logs.getLogger(log_name=arg.log_file)
    logs.init(arg, l=arg.log_level, logfile=arg.log_file)
    init(arg)

    if arg.rm and not arg.sub:
        for queue in arg.rm:
            print "Deleting", queue,
            queue = ikqueue.Queue(queue)
            queue.maybe_bind(connect())
            queue.delete()
            print "."
        return
    try:
        # need to use the raw/utf handler unless we are doing clean
        marshal = UnicodeMarshal()
        if arg.clean or arg.addfeed:
            marshal = JsonMarshal()

        if arg.sub is None and os.environ.get('UPSTART_JOB') is None:
            arg.sub = '-'  # stdin

        subq = open(
            arg.sub,
            'r')  #, marshal=marshal, ssh_key=arg.ssh_key, ssh_conn=arg.tunnel)

        if arg.pub is None and os.environ.get('UPSTART_JOB') is None:
            arg.pub = '-'  # stdout

        pubq = open(arg.pub, 'w', capture=arg.cat, marshal=marshal)
    except Exception as e:
        log.exception("Exception opening queues: %s" % e)

    # "Human-readable" queue name can be retrieved as
    #
    # sname = subq.get_name()
    # pname = pubq.get_name()
    rc = 0
    try:
        it = subq.__iter__()
        while True:
            m = ''
            try:
                m = it.next()
                if arg.clean:
                    m = message.clean(m)

                if m:
                    if arg.addfeed:
                        m = message.add_embers_ids(m,
                                                   feed=pubq.get_name(),
                                                   feedPath=pubq.get_name())
                    pubq.write(m)
            except StopIteration:
                break
            except KeyboardInterrupt:
                break
            except Exception as e:
                rc += 1
                if m:
                    log.exception('Could not process message %s: %s' % (m, e))
                else:
                    log.exception('Unknown processing error %s' % e)
    except KeyboardInterrupt:
        pass
    except Exception as e:
        rc = 1
        log.exception('Top level exception %s' % e)

    return rc
Example #23
from BeautifulSoup import BeautifulSoup, Comment
from langdetect import detect
import logs
import args as args_util
import re
from sys import exit
from lxml import html as doc_parser
from pycountry import languages
from goose import Goose, Crawler
from goose.text import StopWordsArabic
from goose.text import StopWordsChinese
from goose.text import StopWordsKorean
import requests
import gevent

log = logs.getLogger(__name__)


def validate_encoding(http_response):
    """
    Validates that the encoding for the HTML/text in the http_response is
    properly marked. If it isn't, the encoding is corrected so that we don't
    receive errors on special characters of foreign languages
    :param http_response:
    :return: http_response: response with correct encoding
    """
    encodings = requests.utils.get_encodings_from_content(
        http_response.content)

    if encodings and encodings[0].lower() != http_response.encoding.lower():
        log.debug('Correcting encoding %s to %s' %
Example #24
def main(args=sys.argv[1:]):
    logger = logs.getLogger('studio-runner')
    parser = argparse.ArgumentParser(
        description='Studio runner. \
                     Usage: studio run <runner_arguments> \
                     script <script_arguments>')
    parser.add_argument('--config', help='configuration file', default=None)
    parser.add_argument('--project', help='name of the project', default=None)
    parser.add_argument(
        '--experiment', '-e',
        help='Name of the experiment. If none provided, ' +
             'random uuid will be generated',
        default=None)

    parser.add_argument(
        '--guest',
        help='Guest mode (does not require db credentials)',
        action='store_true')

    parser.add_argument(
        '--force-git',
        help='If run in a git directory, force running the experiment ' +
             'even if changes are not commited',
        action='store_true')

    parser.add_argument(
        '--gpus',
        help='Number of gpus needed to run the experiment',
        type=int,
        default=None)

    parser.add_argument(
        '--cpus',
        help='Number of cpus needed to run the experiment' +
             ' (used to configure cloud instance)',
        type=int,
        default=None)

    parser.add_argument(
        '--ram',
        help='Amount of RAM needed to run the experiment' +
             ' (used to configure cloud instance), ex: 10G, 10GB',
        default=None)

    parser.add_argument(
        '--gpuMem',
        help='Amount of GPU RAM needed to run the experiment',
        default=None)

    parser.add_argument(
        '--hdd',
        help='Amount of hard drive space needed to run the experiment' +
             ' (used to configure cloud instance), ex: 10G, 10GB',
        default=None)

    parser.add_argument(
        '--queue', '-q',
        help='Name of the remote execution queue',
        default=None)

    parser.add_argument(
        '--cloud',
        help='Cloud execution mode. Could be gcloud, gcspot, ec2 or ec2spot',
        default=None)

    parser.add_argument(
        '--bid',
        help='Spot instance price bid, specified in USD or in percentage ' +
             'of on-demand instance price. Default is %(default)s',
        default='100%')

    parser.add_argument(
        '--capture-once', '-co',
        help='Name of the immutable artifact to be captured. ' +
        'It will be captured once before the experiment is run',
        default=[], action='append')

    parser.add_argument(
        '--capture', '-c',
        help='Name of the mutable artifact to be captured continuously',
        default=[], action='append')

    parser.add_argument(
        '--reuse', '-r',
        help='Name of the artifact from another experiment to use',
        default=[], action='append')

    parser.add_argument(
        '--verbose', '-v',
        help='Verbosity level. Allowed values: ' +
             'debug, info, warn, error, crit ' +
             'or numerical value of logger levels.',
        default=None)

    parser.add_argument(
        '--metric',
        help='Metric to show in the summary of the experiment, ' +
             'and to base hyperparameter search on. ' +
             'Refers a scalar value in tensorboard log ' +
             'example: --metric=val_loss[:final | :min | :max] to report ' +
             'validation loss in the end of the keras experiment ' +
             '(or smallest or largest throughout the experiment for :min ' +
             'and :max respectively)',
        default=None)

    parser.add_argument(
        '--hyperparam', '-hp',
        help='Try out multiple values of a certain parameter. ' +
             'For example, --hyperparam=learning_rate:0.01:0.1:l10 ' +
             'will instantiate 10 versions of the script, replace ' +
             'learning_rate with a one of the 10 values for learning ' +
             'rate that lies on a log grid from 0.01 to 0.1, create '
             'experiments and place them in the queue.',
             default=[], action='append')

    parser.add_argument(
        '--num-workers',
        help='Number of local or cloud workers to spin up',
        type=int,
        default=None)

    parser.add_argument(
        '--python-pkg',
        help='Python package not present in the current environment ' +
             'that is needed for experiment. Only compatible with ' +
             'remote and cloud workers for now',
        default=[], action='append')

    parser.add_argument(
        '--ssh-keypair',
        help='Name of the SSH keypair used to access the EC2 ' +
             'instances directly',
        default=None)

    parser.add_argument(
        '--optimizer', '-opt',
        help='Name of optimizer to use, by default is grid search. ' +
        'The name of the optimizer must either be in ' +
        'studio/optimizer_plugins ' +
        'directory or the path to the optimizer source file ' +
        'must be supplied. ',
        default='grid')

    parser.add_argument(
        '--cloud-timeout',
        help="Time (in seconds) that cloud workers wait for messages. " +
             "If negative, " +
             "wait for the first message in the queue indefinitely " +
             "and shut down " +
             "as soon as no new messages are available. " +
             "If zero, don't wait at all." +
             "Default value is %(default)d",
        type=int,
        default=300)

    parser.add_argument(
        '--user-startup-script',
        help='Path of script to run immediately ' +
             'before running the remote worker',
        default=None)

    parser.add_argument(
        '--branch',
        help='Branch of studioml to use when running remote worker, useful ' +
             'for debugging pull requests. Default is current',
        default=None)

    parser.add_argument(
        '--max-duration',
        help='Max experiment runtime (i.e. time after which experiment ' +
             'should be killed no matter what.).  Examples of values ' +
             'might include 5h, 48h2m10s',
        default=None)

    parser.add_argument(
        '--lifetime',
        help='Max experiment lifetime (i.e. wait time after which ' +
             'experiment loses relevance and should not be started)' +
             '  Examples include 240h30m10s',
        default=None)

    parser.add_argument(
        '--container',
        help='Singularity container in which experiment should be run. ' +
             'Assumes that container has all dependencies installed',
        default=None
    )

    parser.add_argument(
        '--port',
        help='Ports to open on a cloud instance',
        default=[], action='append'
    )

    # detect which argument is the script filename
    # and attribute all arguments past that index as related to the script
    (runner_args, other_args) = parser.parse_known_args(args)
    py_suffix_args = [i for i, arg in enumerate(args) if arg.endswith('.py')
                      or '::' in arg]

    rerun = False
    if len(py_suffix_args) < 1:
        print('None of the arguments end with .py')
        if len(other_args) == 0:
            print("Trying to run a container job")
            assert runner_args.container is not None
            exec_filename = None
        elif len(other_args) == 1:
            print("Treating last argument as experiment key to rerun")
            rerun = True
            experiment_key = args[-1]
        else:
            print("Too many extra arguments - should be either none " +
                  "for container job or one for experiment re-run")
            sys.exit(1)
    else:
        script_index = py_suffix_args[0]
        exec_filename, other_args = args[script_index], args[script_index + 1:]
        runner_args = parser.parse_args(args[:script_index])

    # TODO: Queue the job based on arguments and only then execute.

    config = model.get_config(runner_args.config)

    if runner_args.verbose:
        config['verbose'] = runner_args.verbose

    if runner_args.guest:
        config['database']['guest'] = True

    if runner_args.container:
        runner_args.capture_once.append(
            runner_args.container + ':_singularity')

    verbose = model.parse_verbosity(config['verbose'])
    logger.setLevel(verbose)

    if git_util.is_git() and not git_util.is_clean() and not rerun:
        logger.warn('Running from dirty git repo')
        if not runner_args.force_git:
            logger.error(
                'Specify --force-git to run experiment from dirty git repo')
            sys.exit(1)

    resources_needed = parse_hardware(runner_args, config['resources_needed'])
    logger.debug('resources requested: ')
    logger.debug(str(resources_needed))

    artifacts = {}
    artifacts.update(parse_artifacts(runner_args.capture, mutable=True))
    artifacts.update(parse_artifacts(runner_args.capture_once, mutable=False))
    with model.get_db_provider(config) as db:
        artifacts.update(parse_external_artifacts(runner_args.reuse, db))

    if runner_args.branch:
        config['cloud']['branch'] = runner_args.branch

    if runner_args.user_startup_script:
        config['cloud']['user_startup_script'] = \
            runner_args.user_startup_script

    if runner_args.lifetime:
        config['experimentLifetime'] = runner_args.lifetime

    if any(runner_args.hyperparam):
        if runner_args.optimizer == "grid":
            experiments = add_hyperparam_experiments(
                exec_filename,
                other_args,
                runner_args,
                artifacts,
                resources_needed,
                logger)

            queue_name = submit_experiments(
                experiments,
                config=config,
                logger=logger,
                queue_name=runner_args.queue,
                cloud=runner_args.cloud)

            spin_up_workers(
                runner_args,
                config,
                resources_needed,
                queue_name=queue_name,
                verbose=verbose)
        else:
            opt_modulepath = os.path.join(
                os.path.dirname(os.path.abspath(__file__)),
                "optimizer_plugins",
                runner_args.optimizer + ".py")
            if not os.path.exists(opt_modulepath):
                opt_modulepath = os.path.abspath(
                    os.path.expanduser(runner_args.optimizer))
            logger.info('optimizer path: %s' % opt_modulepath)

            assert os.path.exists(opt_modulepath)
            sys.path.append(os.path.dirname(opt_modulepath))
            opt_module = importlib.import_module(
                os.path.basename(opt_modulepath.replace(".py", '')))

            h = HyperparameterParser(runner_args, logger)
            hyperparams = h.parse()
            optimizer = getattr(
                opt_module,
                "Optimizer")(
                hyperparams,
                config['optimizer'],
                logger)

            workers_started = False
            queue_name = runner_args.queue
            while not optimizer.stop():
                hyperparam_pop = optimizer.ask()
                hyperparam_tuples = h.convert_to_tuples(hyperparam_pop)

                experiments = add_hyperparam_experiments(
                    exec_filename,
                    other_args,
                    runner_args,
                    artifacts,
                    resources_needed,
                    logger,
                    optimizer=optimizer,
                    hyperparam_tuples=hyperparam_tuples)

                queue_name = submit_experiments(
                    experiments,
                    config=config,
                    logger=logger,
                    cloud=runner_args.cloud,
                    queue_name=queue_name)

                if not workers_started:
                    spin_up_workers(
                        runner_args,
                        config,
                        resources_needed,
                        queue_name=queue_name,
                        verbose=verbose)
                    workers_started = True

                fitnesses, behaviors = get_experiment_fitnesses(
                    experiments, optimizer, config, logger)

                # for i, hh in enumerate(hyperparam_pop):
                #     print fitnesses[i]
                #     for hhh in hh:
                #         print hhh
                try:
                    optimizer.tell(hyperparam_pop, fitnesses, behaviors)
                except BaseException:
                    optimizer.tell(hyperparam_pop, fitnesses)

                try:
                    optimizer.disp()
                except BaseException:
                    logger.warn('Optimizer has no disp() method')
    else:
        if rerun:
            with model.get_db_provider(config) as db:
                experiment = db.get_experiment(experiment_key)
                new_key = runner_args.experiment if runner_args.experiment \
                    else experiment_key + '_rerun' + str(uuid.uuid4())
                experiment.key = new_key
                for _, art in six.iteritems(experiment.artifacts):
                    art['mutable'] = False

                experiments = [experiment]

        else:
            experiments = [create_experiment(
                filename=exec_filename,
                args=other_args,
                experiment_name=runner_args.experiment,
                project=runner_args.project,
                artifacts=artifacts,
                resources_needed=resources_needed,
                metric=runner_args.metric,
                max_duration=runner_args.max_duration,
            )]

        queue_name = submit_experiments(
            experiments,
            config=config,
            logger=logger,
            cloud=runner_args.cloud,
            queue_name=runner_args.queue)

        spin_up_workers(
            runner_args,
            config,
            resources_needed,
            queue_name=queue_name,
            verbose=verbose)

    return
Example #25
def get_logger():
    global _my_logger
    if not _my_logger:
        _my_logger = logs.getLogger('studio-runs')
    return _my_logger
Example #26
import psutil
import time
import six
from pygtail import Pygtail
import threading

from apscheduler.schedulers.background import BackgroundScheduler

import fs_tracker, model, logs
from local_queue import LocalQueue
from gpu_util import get_available_gpus, get_gpu_mapping, get_gpus_summary
from experiment import Experiment
from util import sixdecode, str2duration, retry
from model import parse_verbosity

logs.getLogger('apscheduler.scheduler').setLevel(logs.ERROR)


class LocalExecutor(object):
    """Runs job while capturing environment and logs results.
    """
    def __init__(self, args):
        self.config = args.config

        if args.guest:
            self.config['database']['guest'] = True

        self.logger = logs.getLogger('LocalExecutor')
        self.logger.setLevel(model.parse_verbosity(self.config.get('verbose')))
        self.logger.debug("Config: ")
        self.logger.debug(self.config)