def get_exported_table_df(table_name):
    """Retrieve an exported table file from GCS.

    Args:
        table_name (string): Name of the table to load.

    Returns:
        pandas.DataFrame
    """
    bucket = storage \
        .Client(get_config('gcp_project_name')) \
        .get_bucket(get_config('gcs_bucket_name'))
    key = \
        '{experiment_name}/exported_tables/{table_name}/' \
        '{date_descriptor}/out.csv.gzip'.format(
            experiment_name=get_config('experiment_name'),
            table_name=table_name,
            date_descriptor='{{ ds_nodash }}')
    blob = storage.Blob(key, bucket)
    bio = io.BytesIO()
    blob.download_to_file(bio)
    bio.seek(0)
    return pd.read_csv(bio, compression='gzip')

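# Illustrative sketch (not part of the original module): from a script that
# runs after the export step, the exported table for the current run can be
# loaded back as a DataFrame. The table name `users` is a made-up example.
#
#   df = get_exported_table_df('users')
#   print(df.head())
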
def get_kubernetes_pod_operator(operator_name=None,
                                operator_image=None,
                                cmds=['python', 'main.py'],
                                env_vars=None,
                                dag=None,
                                image_tag=None):
    """Get templated KubernetesPodOperator.

    Intended to be used with your own implementations of kubernetes pod
    operator bootstrapped using the `new_pod_operator` command.

    Args:
        operator_name (string): Name of the operator. Defaults to None.
            e.g. `train-operator`
        operator_image (string): Name of the operator image. Defaults to
            None. Mutually exclusive with the `operator_name` param.
            e.g. `gcr.io/my-project/my-experiment_train-operator`
        cmds (list[str]): Command overrides for the pod.
        env_vars (dict): Env var overrides for the pod.
        dag (airflow.models.DAG): DAG used by context_manager.
            e.g. `with get_dag() as dag:
            get_kubernetes_pod_operator(..., dag=dag)`.
            Defaults to None.
        image_tag (string): Image tag to run. Defaults to `LATEST`.

    Returns:
        airflow.contrib.operators.kubernetes_pod_operator.KubernetesPodOperator
    """
    if dag is None:
        logger.warning(
            'No DAG context was found. The operator may not be associated '
            'with any DAG nor appear in the Web UI')
    if (operator_name is None) == (operator_image is None):
        raise Exception(
            'You need to specify exactly one of the `operator_name` or '
            '`operator_image` params')
    if operator_name is not None:
        image = 'gcr.io/{gcp_project_name}/{experiment_name}_{operator_name}' \
            .format(
                gcp_project_name=get_config('gcp_project_name'),
                experiment_name=get_config('experiment_name'),
                operator_name=operator_name)
    else:
        image = operator_image
        # Derive a task/pod name from the image basename,
        # e.g. `gcr.io/my-project/train-operator:abc123` -> `train-operator`.
        operator_name = operator_image.split('/')[-1].split(':')[0]
    image_tag = image_tag or 'LATEST'
    return KubernetesPodOperator(
        dag=dag or models._CONTEXT_MANAGER_DAG,
        task_id='{experiment_name}_{operator_name}'.format(
            experiment_name=get_config('experiment_name').replace('-', '_'),
            operator_name=operator_name.replace('-', '_')),
        # Kubernetes pod names must be DNS-1123 compliant, so use hyphens
        # only and keep the image tag out of the name.
        name='{experiment_name}--{operator_name}'.format(
            experiment_name=get_config('experiment_name').replace('_', '-'),
            operator_name=operator_name.replace('_', '-')),
        namespace='default',  # TODO: parameterize
        image='{image}:{image_tag}'.format(image=image, image_tag=image_tag),
        image_pull_policy='Always',
        cmds=cmds,
        env_vars=env_vars,
        startup_timeout_seconds=3600)

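# Illustrative sketch (assumes an image built and pushed for a hypothetical
# `train-operator` via the `new_pod_operator` workflow):
#
#   with get_dag() as dag:
#       train = get_kubernetes_pod_operator(
#           operator_name='train-operator',
#           env_vars={'TARGET_DATE': '{{ ds_nodash }}'},
#           dag=dag)
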
def get_maybe_create_dataset_operator(
        dag=None,
        table_expiration_seconds=2678400,  # 60 * 60 * 24 * 31
        partition_expiration_seconds=2678400):
    """Get templated BigQueryMaybeCreateEmptyDatasetOperator.

    Args:
        dag (airflow.models.DAG): DAG used by context_manager.
            e.g. `with get_dag() as dag:
            get_maybe_create_dataset_operator(..., dag=dag)`.
            Defaults to None.
        table_expiration_seconds (int): Default expiration time
            (in seconds) for tables in the created dataset.
        partition_expiration_seconds (int): Default expiration time
            (in seconds) for partitions in the created dataset.

    Returns:
        BigQueryMaybeCreateEmptyDatasetOperator
    """
    dag = dag or models._CONTEXT_MANAGER_DAG
    dataset_name = '{experiment_name}_database'.format(
        experiment_name=get_config('experiment_name'))
    return BigQueryMaybeCreateEmptyDatasetOperator(
        dag=dag,
        task_id='{experiment_name}.create_dataset'.format(
            experiment_name=get_config('experiment_name')),
        project_id=get_config('gcp_project_name'),
        dataset_id=dataset_name,
        dataset_reference={
            "description": (
                "Dataset for experiment {experiment_name}."
                " Auto generated by fuga.".format(
                    experiment_name=get_config('experiment_name'))),
            "defaultTableExpirationMs": str(table_expiration_seconds * 1000),
            "defaultPartitionExpirationMs":
                str(partition_expiration_seconds * 1000)
        })

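# Illustrative sketch: shorten retention to one week (both arguments are in
# seconds and converted to milliseconds for the BigQuery API above):
#
#   create_dataset = get_maybe_create_dataset_operator(
#       dag=dag,
#       table_expiration_seconds=604800,      # 60 * 60 * 24 * 7
#       partition_expiration_seconds=604800)
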
def get_bq_to_bq_operator(
        sql_or_filename,
        dst_table_name,
        dag=None,
        params=None,
        table_expiration_seconds=None,
        partition_expiration_seconds=None):
    """Get templated BigQueryOperator.

    Args:
        sql_or_filename (string): Valid SQL statement or a path to a SQL
            file. It can be templated using Jinja in either case.
        dst_table_name (string): Name of the destination table. A date
            descriptor is appended to it.
        dag (airflow.models.DAG): DAG used by context_manager.
            e.g. `with get_dag() as dag:
            get_bq_to_bq_operator(..., dag=dag)`.
            Defaults to None.
        params (dict): Jinja params made available to the SQL template.
        table_expiration_seconds (int): Currently unused.
        partition_expiration_seconds (int): Currently unused.

    Returns:
        airflow.contrib.operators.bigquery_operator.BigQueryOperator
    """
    dag = dag or models._CONTEXT_MANAGER_DAG
    if dag is None:
        logger.warning(
            'No DAG context was found. The operator may not be associated '
            'with any DAG nor appear in the Web UI')
    dst_table_name_with_date_descriptor = \
        '{table_name}{date_descriptor}'.format(
            table_name=dst_table_name,
            date_descriptor='{{ ds_nodash }}')
    dataset_name = '{experiment_name}_database'.format(
        experiment_name=get_config('experiment_name'))
    return BigQueryOperator(
        dag=dag,
        task_id='{experiment_name}.{table_name}.bq_to_bq'.format(
            experiment_name=get_config('experiment_name'),
            table_name=dst_table_name),
        sql=sql_or_filename,
        use_legacy_sql=False,
        write_disposition="WRITE_TRUNCATE",
        destination_dataset_table=(
            '{gcp_project_name}:{dataset_name}.{table_name}'.format(
                gcp_project_name=get_config('gcp_project_name'),
                dataset_name=dataset_name,
                table_name=dst_table_name_with_date_descriptor)),
        params=params or {})

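# Illustrative sketch (`sql/users.sql` is a hypothetical templated SQL file
# shipped with the experiment; `params` are exposed to its Jinja context):
#
#   transform = get_bq_to_bq_operator(
#       'sql/users.sql',
#       dst_table_name='users',
#       params={'min_age': 20},
#       dag=dag)
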
def get_export_table_operator(table_name, dag=None):
    """Get templated BigQueryToCloudStorageOperator.

    Args:
        table_name (string): Name of the table to export.
        dag (airflow.models.DAG): DAG used by context_manager.
            e.g. `with get_dag() as dag:
            get_export_table_operator(..., dag=dag)`.
            Defaults to None.

    Returns:
        airflow.contrib.operators.bigquery_to_gcs.BigQueryToCloudStorageOperator
    """
    if dag is None:
        logger.warning(
            'No DAG context was found. The operator may not be associated '
            'with any DAG nor appear in the Web UI')
    date_descriptor = '{{ ds_nodash }}'
    table_name_with_date_descriptor = \
        '{table_name}{date_descriptor}'.format(
            table_name=table_name,
            date_descriptor=date_descriptor)
    return BigQueryToCloudStorageOperator(
        dag=dag or models._CONTEXT_MANAGER_DAG,
        task_id='{experiment_name}.{table_name}.export'.format(
            experiment_name=get_config('experiment_name'),
            table_name=table_name),
        source_project_dataset_table=(
            '{gcp_project_name}.{database_name}.{table_name}'.format(
                gcp_project_name=get_config('gcp_project_name'),
                database_name='%s_database' % get_config('experiment_name'),
                table_name=table_name_with_date_descriptor)),
        # TODO: Support exports larger than 1GB
        # https://cloud.google.com/bigquery/exporting-data-from-bigquery#exportingmultiple
        destination_cloud_storage_uris=[
            'gs://{bucket_name}/{experiment_name}/exported_tables/'
            '{table_name}/{date_descriptor}/'
            'out.csv.gzip'.format(
                bucket_name=get_config('gcs_bucket_name'),
                experiment_name=get_config('experiment_name'),
                date_descriptor=date_descriptor,
                table_name=table_name)
        ],
        compression="GZIP")

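# Illustrative sketch: the export reads the date-partitioned table written
# by `get_bq_to_bq_operator`, so the two are typically chained:
#
#   transform >> get_export_table_operator('users', dag=dag)
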
def run(self):
    experiment_root_dir = find_experiment_root_dir()
    experiment = Experiment.from_path(experiment_root_dir)
    storage_client = storage.Client()
    composer_client = composer.Client()
    environment = composer_client.get_environment(
        get_config('composer_environment_full_path'))
    if environment.state != 'RUNNING':
        raise Exception(
            'Composer environment %s is in invalid state %s.\n'
            'You need to wait until the environment is running '
            'or fix it if it\'s broken.' % (
                environment.name,
                environment.state))
    if not environment.config.get('dagGcsPrefix', None):
        raise Exception(
            'Missing dagGcsPrefix config with environment %s.\n'
            'The environment may be in an invalid state or '
            'failed to launch.' % (
                environment.name))
    bucket_url = urlparse(environment.config['dagGcsPrefix'])
    bucket_name = bucket_url.netloc
    bucket_prefix = bucket_url.path[1:]  # omit leading slash
    bucket = storage_client.get_bucket(bucket_name)
    pairs = []
    for target in ['py', 'sql', 'pod_operators']:
        for local_path in glob.iglob(
                os.path.join(experiment_root_dir, target, '**/*'),
                recursive=True):
            if self._is_ignored(local_path):
                continue
            remote_path = os.path.join(
                bucket_prefix,
                experiment.name,
                local_path[len(experiment_root_dir) + 1:])
            pairs.append((local_path, remote_path))
    click.echo(
        'The following files are going to be uploaded '
        'to GCS bucket %s' % bucket_name)
    click.echo(
        '\n'.join(
            '\t%s to %s' % (l, r)
            for l, r in pairs))
    click.echo('')
    click.confirm('Do you want to continue?', abort=True)
    for l, r in pairs:
        new_blob = bucket.blob(r)
        new_blob.upload_from_filename(l)

def run(self):
    config_overrides = {}

    if get_config('gcp_project_id'):
        resource_manager_client = resource_manager.Client()
        project = resource_manager_client.fetch_project(
            get_config('gcp_project_id'))
    else:
        project = self._setup_gcp_project()
        config_overrides['gcp_project_id'] = project.project_id

    if get_config('gcs_bucket_name'):
        storage_client = storage.Client(project=project.project_id)
        bucket = storage_client.get_bucket(get_config('gcs_bucket_name'))
    else:
        bucket = self._setup_gcs_bucket(project)
        config_overrides['gcs_bucket_name'] = bucket.name

    if get_config('composer_environment_full_path'):
        composer_client = composer.Client(project=project.project_id)
        environment = composer_client.get_environment(
            get_config('composer_environment_full_path'))
    else:
        environment = self._setup_composer_environment(
            project,
            location=bucket.location.lower())  # XXX: make it configurable
        config_overrides['composer_environment_full_path'] = \
            environment.full_path

    # Overwrite configurations
    for k, v in config_overrides.items():
        write_config(k, v)

    click.echo(
        'fuga environment is initialized. Now you can proceed to create '
        'experiments by running `fuga experiment new`')

def find_or_clone_cookiecutter_template(
        template_name=_FUGA_DEFAULT_TEMPLATE_NAME):
    template = find_cookiecutter_template(template_name)
    if template is not None:
        return template

    click.confirm(
        'Could not find fuga experiment template with name %s.\n'
        'Do you want to clone it?' % template_name,
        abort=True)
    local_git_dir = os.path.join(get_config('cookiecutters_dir'),
                                 template_name)
    remote_git_repo_name = \
        f'git://github.com/{_FUGA_GITHUB_ORG_NAME}/{template_name}.git'
    git.Git(local_git_dir).clone(remote_git_repo_name)
    # Look the template up again now that it has been cloned.
    return find_cookiecutter_template(template_name)

def save_df(df, name):  # XXX: Function name!!
    """Save dataframe to GCS.

    Args:
        df (pandas.DataFrame): Dataframe to save.
        name (string): Name of the resulting CSV file (without extension).

    Returns:
        key (string): Key of dataframe blob saved to GCS.
    """
    bucket = storage.Client(get_config('gcp_project_name')) \
        .get_bucket(get_config('gcs_bucket_name'))
    key = '{experiment_name}/output/{date_descriptor}/{name}.csv' \
        .format(
            experiment_name=get_config('experiment_name'),
            date_descriptor='{{ ds_nodash }}',
            name=name)
    blob = bucket.blob(key)
    # `to_csv` returns text, so encode it before writing to a binary buffer.
    bio = io.BytesIO(df.to_csv().encode('utf-8'))
    blob.upload_from_file(bio, rewind=True)
    return key

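# Illustrative sketch (assumes a DataFrame `result_df` computed earlier in
# the task; the returned key is the blob path under the configured bucket):
#
#   key = save_df(result_df, 'predictions')
#   logger.info('Saved results to %s', key)
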
def get_dag(start_date=None, schedule_interval=None, **xargs):
    """Get a templated DAG for the current experiment."""
    default_args = {
        'start_date': start_date or dt.datetime.today(),
        'retries': 1,
        'email_on_failure': True}
    if models.Variable.get("notification_email_address", None) is not None:
        default_args['email'] = models.Variable.get(
            "notification_email_address")
    default_dag_args = dict(itertools.chain(
        default_args.items(), xargs.items()))
    return models.DAG(
        '{experiment_name}_dag'.format(
            experiment_name=get_config('experiment_name')),
        schedule_interval=schedule_interval or dt.timedelta(days=1),
        default_args=default_dag_args,
        catchup=False)

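# Illustrative sketch: a full pipeline wiring using the helpers above.
# Extra keyword arguments are merged into the DAG's `default_args`:
#
#   with get_dag(retries=3) as dag:
#       create_dataset = get_maybe_create_dataset_operator(dag=dag)
#       transform = get_bq_to_bq_operator(
#           'sql/users.sql', dst_table_name='users', dag=dag)
#       export = get_export_table_operator('users', dag=dag)
#       create_dataset >> transform >> export
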
def run(self,
        operator_name,
        dockerfile='./Dockerfile',
        image_name=None,
        version_tag=None,
        dryrun=False,
        remote_container_repo=None):
    import sys

    import docker

    client = docker.from_env()
    experiment = Experiment.from_path(find_experiment_root_dir())
    build_path = os.path.join(experiment.root_path,
                              OPERATOR_DIR_PREFIX,
                              operator_name)
    if not os.path.isdir(build_path):
        raise Exception(f'{build_path} is not a directory')

    try:
        repo = Repo(find_experiment_root_dir())
        if len(repo.head.commit.diff(None)) > 0 \
                or len(repo.untracked_files) > 0:
            click.echo('Current Git working tree has either '
                       'untracked files or a diff against HEAD. '
                       'Please commit your changes/new files before '
                       'any deployment.')
            sys.exit(0)
        version_hash = repo.head.commit.hexsha
    except InvalidGitRepositoryError:
        raise Exception(
            f'Experiment directory ({experiment.root_path}) needs to '
            'be a valid Git repository.\n'
            'Run `git init` in your experiment root')
    except ValueError as e:
        raise Exception(
            f'ValueError ({e}) has occurred.\n'
            'The current Git repository might not have any commits.')

    image_name = image_name or f'{experiment.name}__{operator_name}'
    version_tag = version_tag or version_hash

    click.echo(f'Building docker image {image_name}:{version_tag}')
    image, _logs = client.images.build(path=build_path,
                                       dockerfile=dockerfile,
                                       tag=f'{image_name}:{version_tag}')
    click.echo('Done')

    remote_container_repo = remote_container_repo or \
        os.path.join(
            DEFAULT_REMOTE_CONTAINER_REPO_HOST,
            get_config('gcp_project_id'))
    remote_tag = os.path.join(remote_container_repo,
                              f'{image_name}:{version_tag}')
    image.tag(remote_tag)
    latest_tag = os.path.join(remote_container_repo, f'{image_name}:LATEST')
    image.tag(latest_tag)

    click.echo(f'Pushing images to {remote_container_repo}')
    click.echo('\t' + remote_tag)
    click.echo('\t' + latest_tag)
    client.images.push(remote_tag)
    client.images.push(latest_tag)
    click.echo('Done')