def less_simple_data_frame_output_materialization_config(
    context, config, value
):
    # Materialize LessSimpleDataFrame into a csv file
    csv_path = os.path.join(
        os.path.dirname(__file__), os.path.abspath(config['csv']['path'])
    )
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    with open(csv_path, 'w') as fd:
        fieldnames = list(value[0].keys())
        writer = csv.DictWriter(
            fd, fieldnames, delimiter=config['csv']['sep']
        )
        writer.writeheader()
        writer.writerows(value)
    context.log.debug(
        'Wrote dataframe as .csv to {path}'.format(path=csv_path)
    )
    yield Materialization(
        '1data_frame_csv',
        'LessSimpleDataFrame materialized as csv',
        [
            EventMetadataEntry.path(
                path=csv_path,
                label='data_frame_csv_path',
                description='LessSimpleDataFrame written to csv format',
            )
        ],
    )
    # Materialize LessSimpleDataFrame into a json file
    json_path = os.path.abspath(config['json']['path'])
    with open(json_path, 'w') as fd:
        json_value = seven.json.dumps([dict(row) for row in value])
        fd.write(json_value)
    context.log.debug(
        'Wrote dataframe as .json to {path}'.format(path=json_path)
    )
    yield Materialization(
        'data_frame_json',
        'LessSimpleDataFrame materialized as json',
        [
            EventMetadataEntry.path(
                path=json_path,
                label='data_frame_json_path',
                description='LessSimpleDataFrame written to json format',
            )
        ],
    )
def less_simple_data_frame_materializer(context, config, value):
    # Materialize LessSimpleDataFrame into a csv file
    csv_path = os.path.join(
        os.path.dirname(__file__), os.path.abspath(config["csv"]["path"])
    )
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    with open(csv_path, "w") as fd:
        fieldnames = list(value[0].keys())
        writer = csv.DictWriter(
            fd, fieldnames, delimiter=config["csv"]["sep"]
        )
        writer.writeheader()
        writer.writerows(value)
    context.log.debug(
        "Wrote dataframe as .csv to {path}".format(path=csv_path)
    )
    yield AssetMaterialization(
        "1data_frame_csv",
        "LessSimpleDataFrame materialized as csv",
        [
            EventMetadataEntry.path(
                path=csv_path,
                label="data_frame_csv_path",
                description="LessSimpleDataFrame written to csv format",
            )
        ],
    )
    # Materialize LessSimpleDataFrame into a json file
    json_path = os.path.abspath(config["json"]["path"])
    with open(json_path, "w") as fd:
        json_value = seven.json.dumps([dict(row) for row in value])
        fd.write(json_value)
    context.log.debug(
        "Wrote dataframe as .json to {path}".format(path=json_path)
    )
    yield AssetMaterialization(
        "data_frame_json",
        "LessSimpleDataFrame materialized as json",
        [
            EventMetadataEntry.path(
                path=json_path,
                label="data_frame_json_path",
                description="LessSimpleDataFrame written to json format",
            )
        ],
    )
def sort_by_calories(context, cereals):
    sorted_cereals = sorted(
        cereals, key=lambda cereal: int(cereal["calories"])
    )
    context.log.info(
        "Least caloric cereal: {least_caloric}".format(
            least_caloric=sorted_cereals[0]["name"]
        )
    )
    context.log.info(
        "Most caloric cereal: {most_caloric}".format(
            most_caloric=sorted_cereals[-1]["name"]
        )
    )
    fieldnames = list(sorted_cereals[0].keys())
    sorted_cereals_csv_path = os.path.abspath(
        "output/calories_sorted_{run_id}.csv".format(run_id=context.run_id)
    )
    os.makedirs(os.path.dirname(sorted_cereals_csv_path), exist_ok=True)
    with open(sorted_cereals_csv_path, "w") as fd:
        writer = csv.DictWriter(fd, fieldnames)
        writer.writeheader()
        writer.writerows(sorted_cereals)
    yield AssetMaterialization(
        asset_key="sorted_cereals_csv",
        description="Cereals data frame sorted by caloric content",
        metadata_entries=[
            EventMetadataEntry.path(
                sorted_cereals_csv_path, "sorted_cereals_csv_path"
            )
        ],
    )
    yield Output(None)
def sort_by_calories(context, cereals):
    sorted_cereals = sorted(
        cereals, key=lambda cereal: int(cereal['calories'])
    )
    context.log.info(
        'Least caloric cereal: {least_caloric}'.format(
            least_caloric=sorted_cereals[0]['name']
        )
    )
    context.log.info(
        'Most caloric cereal: {most_caloric}'.format(
            most_caloric=sorted_cereals[-1]['name']
        )
    )
    fieldnames = list(sorted_cereals[0].keys())
    sorted_cereals_csv_path = os.path.abspath(
        'output/calories_sorted_{run_id}.csv'.format(run_id=context.run_id)
    )
    os.makedirs(os.path.dirname(sorted_cereals_csv_path), exist_ok=True)
    with open(sorted_cereals_csv_path, 'w') as fd:
        writer = csv.DictWriter(fd, fieldnames)
        writer.writeheader()
        writer.writerows(sorted_cereals)
    yield Materialization(
        label='sorted_cereals_csv',
        description='Cereals data frame sorted by caloric content',
        metadata_entries=[
            EventMetadataEntry.path(
                sorted_cereals_csv_path, 'sorted_cereals_csv_path'
            )
        ],
    )
    yield Output(None)
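# A rough, hedged sketch (not taken from the examples above) of how a solid body
# like sort_by_calories -- which yields an AssetMaterialization followed by an
# Output -- can be declared and run in isolation with Dagster's legacy
# execute_solid helper. The solid name, input data, and metadata entry below are
# illustrative assumptions.
from dagster import AssetMaterialization, EventMetadataEntry, Output, execute_solid, solid


@solid
def report_row_count(context, rows):
    # Record the asset produced by this step, then emit a regular output value.
    yield AssetMaterialization(
        asset_key="row_count_report",
        metadata_entries=[EventMetadataEntry.int(len(rows), "row_count")],
    )
    yield Output(len(rows))


if __name__ == "__main__":
    result = execute_solid(report_row_count, input_values={"rows": [{"name": "a"}]})
    assert result.success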
def cache_file_from_s3(context, s3_coordinate: S3Coordinate) -> FileHandle:
    target_key = context.solid_config.get(
        "file_key", s3_coordinate["key"].split("/")[-1]
    )

    file_cache = context.resources.file_cache
    target_file_handle = file_cache.get_file_handle(target_key)

    if file_cache.overwrite or not file_cache.has_file_object(target_key):
        with get_temp_file_name() as tmp_file:
            context.resources.s3.download_file(
                Bucket=s3_coordinate["bucket"],
                Key=s3_coordinate["key"],
                Filename=tmp_file,
            )

            context.log.info("File downloaded to {}".format(tmp_file))

            with open(tmp_file, "rb") as tmp_file_object:
                file_cache.write_file_object(target_key, tmp_file_object)
                context.log.info(
                    "File handle written at : {}".format(
                        target_file_handle.path_desc
                    )
                )
    else:
        context.log.info(
            "File {} already present in cache".format(
                target_file_handle.path_desc
            )
        )

    yield ExpectationResult(
        success=file_cache.has_file_object(target_key),
        label="file_handle_exists",
        metadata_entries=[
            EventMetadataEntry.path(
                path=target_file_handle.path_desc, label=target_key
            )
        ],
    )
    yield Output(target_file_handle)
def _base_compute(context): time.sleep(context.solid_config["sleep"]) if random() < context.solid_config["error_rate"]: raise Exception("blah") asset_key = None if context.solid_config.get("materialization_key_list") is not None: asset_key = AssetKey( context.solid_config.get("materialization_key_list")) elif context.solid_config.get("materialization_key") is not None: asset_key = AssetKey(context.solid_config.get("materialization_key")) if asset_key: metadata_entries = [] if context.solid_config.get("materialization_text") is not None: metadata_entries.append( EventMetadataEntry.text( context.solid_config.get("materialization_text"), context.solid.name, )) if context.solid_config.get("materialization_url") is not None: metadata_entries.append( EventMetadataEntry.url( context.solid_config.get("materialization_url"), context.solid.name, )) if context.solid_config.get("materialization_path") is not None: metadata_entries.append( EventMetadataEntry.path( context.solid_config.get("materialization_url"), context.solid.name, )) if context.solid_config.get("materialization_json") is not None: metadata_entries.append( EventMetadataEntry.json( context.solid_config.get("materialization_json"), context.solid.name, )) if context.solid_config.get("materialization_value") is not None: metadata_entries = [ EventMetadataEntry.float( context.solid_config.get("materialization_value"), context.solid.name, ) ] if len(metadata_entries) == 0: metadata_entries = None yield AssetMaterialization( asset_key=asset_key, metadata_entries=metadata_entries, ) yield Output(1)
def many_materializations_and_passing_expectations(_context):
    tables = [
        'users',
        'groups',
        'events',
        'friends',
        'pages',
        'fans',
        'event_admins',
        'group_admins',
    ]

    for table in tables:
        yield AssetMaterialization(
            asset_key='table_info',
            metadata_entries=[
                EventMetadataEntry.path(
                    label='table_path', path='/path/to/{}.raw'.format(table)
                )
            ],
        )
        yield ExpectationResult(
            success=True,
            label='{table}.row_count'.format(table=table),
            description='Row count passed for {table}'.format(table=table),
        )
def event_metadata_entries(metadata_entry_datas):
    if not metadata_entry_datas:
        return

    for metadata_entry_data in metadata_entry_datas:
        typename = metadata_entry_data['__typename']
        label = metadata_entry_data['label']
        description = metadata_entry_data.get('description')
        if typename == 'EventPathMetadataEntry':
            yield EventMetadataEntry.path(
                label=label,
                description=description,
                path=metadata_entry_data['path'],
            )
        elif typename == 'EventJsonMetadataEntry':
            yield EventMetadataEntry.json(
                label=label,
                description=description,
                data=json.loads(metadata_entry_data.get('jsonString', '')),
            )
        elif typename == 'EventTextMetadataEntry':
            yield EventMetadataEntry.text(
                label=label,
                description=description,
                text=metadata_entry_data['text'],
            )
        elif typename == 'EventUrlMetadataEntry':
            yield EventMetadataEntry.url(
                label=label,
                description=description,
                url=metadata_entry_data['url'],
            )
        else:
            check.not_implemented('TODO for type {}'.format(typename))
def cache_file_from_s3(context, bucket_data):
    target_key = context.solid_config.get(
        'file_key', bucket_data['key'].split('/')[-1]
    )

    file_cache = context.resources.file_cache
    file_handle = file_cache.get_file_handle(target_key)

    if file_cache.overwrite or not file_cache.has_file_object(target_key):
        with get_temp_file_name() as tmp_file:
            context.resources.s3.session.download_file(
                Bucket=bucket_data['bucket'],
                Key=bucket_data['key'],
                Filename=tmp_file,
            )

            context.log.info('File downloaded to {}'.format(tmp_file))

            with open(tmp_file, 'rb') as tmp_file_object:
                file_cache.write_file_object(target_key, tmp_file_object)
                context.log.info(
                    'File handle written at : {}'.format(file_handle.path_desc)
                )
    else:
        context.log.info(
            'File {} already present in cache'.format(file_handle.path_desc)
        )

    yield ExpectationResult(
        success=file_cache.has_file_object(target_key),
        label='file_handle_exists',
        metadata_entries=[
            EventMetadataEntry.path(
                path=file_handle.path_desc, label=target_key
            )
        ],
    )
    yield Output(file_handle)
def many_materializations_and_passing_expectations(_context):
    tables = [
        "users",
        "groups",
        "events",
        "friends",
        "pages",
        "fans",
        "event_admins",
        "group_admins",
    ]

    for table in tables:
        yield AssetMaterialization(
            asset_key="table_info",
            metadata_entries=[
                EventMetadataEntry.path(
                    label="table_path", path="/path/to/{}.raw".format(table)
                )
            ],
        )
        yield ExpectationResult(
            success=True,
            label="{table}.row_count".format(table=table),
            description="Row count passed for {table}".format(table=table),
        )
def save(self, key, df):
    path = os.path.join(self.root_dir, key)
    df.to_parquet(path)
    return AssetMaterialization(
        asset_key=AssetKey(["local_metastore", key]),
        metadata_entries=[EventMetadataEntry.path(path, "on_disk")],
    )
def write_operation_inventory(
    context: SolidExecutionContext,
    analysis: Dict[str, RightSizeAnalysis],
    resources: DataFrame,
) -> Nothing:
    resources = resources.set_index('resource_id')
    resizes = [
        {
            'subscription_id': resources.at[resource_id, 'subscription_id'],
            'resource_id': resource_id,
            'current_sku': resources.at[resource_id, 'vm_size'],
            'new_sku': analysis.advisor_sku,
        }
        for resource_id, analysis in analysis.items()
        if analysis.advisor_sku_valid
    ]
    output = {'vm_resize_operations': resizes}
    output_path = os.path.abspath(f'operation_inventory_{context.run_id}.json')
    with open(output_path, 'w') as fd:
        json.dump(output, fd, indent=3)
    yield Materialization(
        label='operation_inventory',
        description='An inventory of the right sizing operations that are '
        'recommended and validated.',
        metadata_entries=[
            EventMetadataEntry.path(output_path, 'operation_inventory_path')
        ],
    )
    yield Output(None)
def course_roles(context: SolidExecutionContext,
                 edx_course_ids: List[String]) -> DagsterPath:
    """Retrieve information about user roles for given courses.

    :param context: Dagster execution context for propagating configuration data
    :type context: SolidExecutionContext

    :param edx_course_ids: List of edX course ID strings
    :type edx_course_ids: List[String]

    :returns: A path definition that points to the rendered data table

    :rtype: DagsterPath
    """
    access_role = Table('student_courseaccessrole')
    roles_query = Query.from_(access_role).select(
        'id', 'user_id', 'org', 'course_id', 'role'
    ).where(access_role.course_id.isin(edx_course_ids))
    query_fields, roles_data = context.resources.sqldb.run_query(roles_query)
    # Maintaining previous file name for compatibility (TMM 2020-05-01)
    roles_path = context.resources.results_dir.path.joinpath('role_query.csv')
    write_csv(query_fields, roles_data, roles_path)
    yield Materialization(
        label='role_query.csv',
        description='Course roles records from Open edX installation',
        metadata_entries=[
            EventMetadataEntry.text(
                label='course_roles_count',
                description='Number of course roles records',
                text=str(len(roles_data)),
            ),
            EventMetadataEntry.path(roles_path.name, 'role_query_csv_path'),
        ],
    )
    yield Output(roles_path, 'edx_course_roles')
def cache_properies_from_rest_api(
    context, properties: PropertyDataFrame, target_key: String
) -> FileHandle:
    property_list = []

    date = datetime.today().strftime('%y%m%d')
    date_time = datetime.now().strftime("%y%m%d_%H%M%S")
    for p in properties:
        # Is it possible to do a range instead of each separately?
        json_prop = requests.get(
            context.solid_config['immo24_api_en'] + p['id']
        ).json()

        # add metadata if flat, house, detached-house, etc.
        json_prop['propertyDetails']['propertyType'] = p['propertyType']
        json_prop['propertyDetails']['isBuyRent'] = p['rentOrBuy']
        # add metadata from search
        json_prop['propertyDetails']['propertyId'] = p['id']
        json_prop['propertyDetails']['searchCity'] = p['city']
        json_prop['propertyDetails']['searchRadius'] = p['radius']
        json_prop['propertyDetails']['searchDate'] = date
        json_prop['propertyDetails']['searchDateTime'] = date_time
        property_list.append(json_prop)

    filename = (
        property_list[0]['propertyDetails']['searchDate']
        + '_'
        + property_list[0]['propertyDetails']['searchCity']
        + '_'
        + property_list[0]['propertyDetails']['isBuyRent']
        + '_'
        + str(property_list[0]['propertyDetails']['searchRadius'])
        + '_'
        + property_list[0]['propertyDetails']['propertyType']
        + '.gz'
    )
    target_key = target_key + '/' + filename

    # caching to file
    file_cache = context.resources.file_cache
    target_file_handle = file_cache.get_file_handle(target_key)

    if file_cache.overwrite or not file_cache.has_file_object(target_key):
        json_zip_writer(property_list, target_key)
        context.log.info(
            "File handle written at : {}".format(target_file_handle.path_desc)
        )
    else:
        context.log.info(
            "File {} already present in cache".format(
                target_file_handle.path_desc
            )
        )

    yield ExpectationResult(
        success=file_cache.has_file_object(target_key),
        label="file_handle_exists",
        metadata_entries=[
            EventMetadataEntry.path(
                path=target_file_handle.path_desc, label=target_key
            )
        ],
    )
    yield Output(target_file_handle)
def raw_file_solid(_context):
    # `name` is presumably bound by an enclosing factory function (not shown here).
    yield Materialization(
        label='table_info',
        metadata_entries=[
            EventMetadataEntry.path(
                label='table_path', path='/path/to/{}.raw'.format(name)
            )
        ],
    )
    yield do_expectation(_context, name)
    yield Output(name)
def raw_file_solid(_context):
    # `name` is presumably bound by an enclosing factory function (not shown here).
    yield AssetMaterialization(
        asset_key="table_info",
        metadata_entries=[
            EventMetadataEntry.path(
                label="table_path", path="/path/to/{}.raw".format(name)
            )
        ],
    )
    yield do_expectation(_context, name)
    yield Output(name)
def _base_compute(context):
    time.sleep(context.solid_config['sleep'])

    if random() < context.solid_config['error_rate']:
        raise Exception('blah')

    if context.solid_config.get('materialization_key') is not None:
        metadata_entries = []
        if context.solid_config.get('materialization_text') is not None:
            metadata_entries.append(
                EventMetadataEntry.text(
                    context.solid_config.get('materialization_text'),
                    context.solid.name,
                )
            )

        if context.solid_config.get('materialization_url') is not None:
            metadata_entries.append(
                EventMetadataEntry.url(
                    context.solid_config.get('materialization_url'),
                    context.solid.name,
                )
            )

        if context.solid_config.get('materialization_path') is not None:
            metadata_entries.append(
                EventMetadataEntry.path(
                    context.solid_config.get('materialization_path'),
                    context.solid.name,
                )
            )

        if context.solid_config.get('materialization_json') is not None:
            metadata_entries.append(
                EventMetadataEntry.json(
                    context.solid_config.get('materialization_json'),
                    context.solid.name,
                )
            )

        if context.solid_config.get('materialization_value') is not None:
            metadata_entries = [
                EventMetadataEntry.float(
                    context.solid_config.get('materialization_value'),
                    context.solid.name,
                )
            ]

        if len(metadata_entries) == 0:
            metadata_entries = None

        yield Materialization(
            label=context.solid.name,
            asset_key=context.solid_config.get('materialization_key'),
            metadata_entries=metadata_entries,
        )

    yield Output(1)
def file_handle_to_s3(context, file_handle): bucket = context.solid_config["Bucket"] key = context.solid_config["Key"] with context.file_manager.read(file_handle, "rb") as fileobj: context.resources.s3.upload_fileobj(fileobj, bucket, key) s3_file_handle = S3FileHandle(bucket, key) yield AssetMaterialization( asset_key=s3_file_handle.s3_path, metadata_entries=[EventMetadataEntry.path(s3_file_handle.s3_path, label=last_key(key))], ) yield Output(value=s3_file_handle, output_name="s3_file_handle")
def write_html_report(context: SolidExecutionContext,
                      report_notebook: FileHandle) -> Nothing:
    with context.file_manager.read(report_notebook) as node_file:
        node = nbformat.read(node_file, nbformat.NO_CONVERT)
    html = convert_nodebook_node_to_html(node, full_width=True)
    handle = context.file_manager.write_data(html.encode(), ext='html')
    yield Materialization(
        label='resize_report',
        description='A report of all VMs utilization data and evaluation of '
        'the recommendations.',
        metadata_entries=[
            EventMetadataEntry.path(handle.path_desc, 'resize_report_path')
        ],
    )
def handle_output(
    self, context: OutputContext, obj: Union[pandas.DataFrame, pyspark.sql.DataFrame]
):
    path = self._get_path(context)
    if isinstance(obj, pandas.DataFrame):
        row_count = len(obj)
        obj.to_parquet(path=path)
    elif isinstance(obj, pyspark.sql.DataFrame):
        row_count = obj.count()
        obj.write.parquet(path=path, mode="overwrite")
    else:
        raise Exception(f"Outputs of type {type(obj)} not supported.")
    yield EventMetadataEntry.int(value=row_count, label="row_count")
    yield EventMetadataEntry.path(path=path, label="path")
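# A minimal, hedged sketch of how an IOManager with a handle_output method like
# the one above could be defined and attached to a pipeline using Dagster's legacy
# @io_manager / ModeDefinition APIs. The class name, base path, and _get_path
# logic are illustrative assumptions and are not taken from the original example.
import os

import pandas

from dagster import IOManager, ModeDefinition, io_manager, pipeline, solid


class LocalParquetIOManager(IOManager):
    def __init__(self, base_path):
        self._base_path = base_path

    def _get_path(self, context):
        # Derive a per-output file path from the step and output names.
        return os.path.join(self._base_path, f"{context.step_key}_{context.name}.parquet")

    def handle_output(self, context, obj):
        os.makedirs(self._base_path, exist_ok=True)
        obj.to_parquet(self._get_path(context))

    def load_input(self, context):
        # Read back the parquet file written for the upstream output.
        return pandas.read_parquet(self._get_path(context.upstream_output))


@io_manager
def local_parquet_io_manager(_init_context):
    return LocalParquetIOManager(base_path="/tmp/dagster_parquet")


@solid
def make_frame(_context):
    return pandas.DataFrame({"calories": [70, 120]})


@pipeline(mode_defs=[ModeDefinition(resource_defs={"io_manager": local_parquet_io_manager})])
def parquet_pipeline():
    make_frame()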
def file_handle_to_s3(context, file_handle):
    bucket = context.solid_config['Bucket']
    key = context.solid_config['Key']

    with context.file_manager.read(file_handle, 'rb') as fileobj:
        context.resources.s3.upload_fileobj(fileobj, bucket, key)
        s3_file_handle = S3FileHandle(bucket, key)

        yield Materialization(
            label='file_to_s3',
            metadata_entries=[
                EventMetadataEntry.path(s3_file_handle.s3_path, label=last_key(key))
            ],
        )

        yield Output(value=s3_file_handle, output_name='s3_file_handle')
def less_simple_data_frame_output_materialization_config(
    context, config, value
):
    csv_path = os.path.abspath(config['csv']['path'])
    with open(csv_path, 'w') as fd:
        fieldnames = list(value[0].keys())
        writer = csv.DictWriter(
            fd, fieldnames, delimiter=config['csv']['sep']
        )
        writer.writeheader()
        writer.writerows(value)
    context.log.debug(
        'Wrote dataframe as .csv to {path}'.format(path=csv_path)
    )
    return Materialization(
        'data_frame_csv',
        'LessSimpleDataFrame materialized as csv',
        [EventMetadataEntry.path(csv_path, 'data_frame_csv_path')],
    )
def _handle_pointer_output(
    self, context: OutputContext, parquet_pointer: ParquetPointer
):
    yield EventMetadataEntry.path(parquet_pointer.path, "Source Parquet Path")
    with connect_snowflake(config=context.resource_config) as con:
        # stage the data stored at the given path
        con.execute(
            f"""
            CREATE TEMPORARY STAGE tmp_s3_stage
            URL = '{parquet_pointer.path}'
            FILE_FORMAT=(TYPE=PARQUET COMPRESSION=SNAPPY)
            CREDENTIALS=(
                AWS_KEY_ID='{os.getenv("AWS_ACCESS_KEY_ID")}',
                AWS_SECRET_KEY='{os.getenv("AWS_SECRET_ACCESS_KEY")}'
            );
            """
        )
        con.execute(self._get_create_table_statement(context, parquet_pointer))
        con.execute(self._get_cleanup_statement(context))
        con.execute(self._get_copy_statement(context, parquet_pointer))
def event_metadata_entries(metadata_entry_datas):
    if not metadata_entry_datas:
        return

    for metadata_entry_data in metadata_entry_datas:
        typename = metadata_entry_data["__typename"]
        label = metadata_entry_data["label"]
        description = metadata_entry_data.get("description")
        if typename == "EventPathMetadataEntry":
            yield EventMetadataEntry.path(
                label=label,
                description=description,
                path=metadata_entry_data["path"],
            )
        elif typename == "EventJsonMetadataEntry":
            yield EventMetadataEntry.json(
                label=label,
                description=description,
                data=seven.json.loads(metadata_entry_data.get("jsonString", "")),
            )
        elif typename == "EventMarkdownMetadataEntry":
            yield EventMetadataEntry.md(
                label=label,
                description=description,
                md_str=metadata_entry_data.get("md_str", ""),
            )
        elif typename == "EventTextMetadataEntry":
            yield EventMetadataEntry.text(
                label=label,
                description=description,
                text=metadata_entry_data["text"],
            )
        elif typename == "EventUrlMetadataEntry":
            yield EventMetadataEntry.url(
                label=label,
                description=description,
                url=metadata_entry_data["url"],
            )
        elif typename == "EventPythonArtifactMetadataEntry":
            yield EventMetadataEntry(
                label=label,
                description=description,
                entry_data=PythonArtifactMetadataEntryData(
                    metadata_entry_data["module"], metadata_entry_data["name"]
                ),
            )
        else:
            check.not_implemented("TODO for type {}".format(typename))
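# A tiny, hypothetical invocation of the event_metadata_entries deserializer
# above; the payload dicts below are illustrative and simply mirror the
# __typename / label / path / text keys that the function itself reads.
entries = list(
    event_metadata_entries(
        [
            {
                "__typename": "EventPathMetadataEntry",
                "label": "table_path",
                "description": None,
                "path": "/path/to/users.raw",
            },
            {
                "__typename": "EventTextMetadataEntry",
                "label": "row_count",
                "text": "42",
            },
        ]
    )
)
assert [entry.label for entry in entries] == ["table_path", "row_count"]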
def enrolled_users(context: SolidExecutionContext,
                   edx_course_ids: List[String]) -> DagsterPath:
    """Generate a table showing which students are currently enrolled in which courses.

    :param context: Dagster execution context for propagating configuration data
    :type context: SolidExecutionContext

    :param edx_course_ids: List of course IDs to retrieve student enrollments for
    :type edx_course_ids: List[String]

    :returns: A path definition that points to the rendered data table

    :rtype: DagsterPath
    """
    course_enrollment, users = Tables('student_courseenrollment', 'auth_user')
    users_query = Query.from_(users).join(course_enrollment).on(
        users.id == course_enrollment.user_id
    ).select(
        users.id,
        users.username,
        users.first_name,
        users.last_name,
        users.email,
        users.is_staff,
        users.is_active,
        users.is_superuser,
        users.last_login,
        users.date_joined,
        course_enrollment.course_id,
    ).where(course_enrollment.course_id.isin(edx_course_ids))
    query_fields, users_data = context.resources.sqldb.run_query(users_query)
    # Maintaining previous file name for compatibility (TMM 2020-05-01)
    enrollments_path = context.resources.results_dir.path.joinpath(
        'users_query.csv'
    )
    write_csv(query_fields, users_data, enrollments_path)
    yield Materialization(
        label='users_query.csv',
        description='Information of users enrolled in available courses on '
        'Open edX installation',
        metadata_entries=[
            EventMetadataEntry.text(
                label='enrolled_users_count',
                description='Number of users who are enrolled in courses',
                text=str(len(users_data)),
            ),
            EventMetadataEntry.path(
                enrollments_path.name, 'enrollment_query_csv_path'
            ),
        ],
    )
    yield Output(enrollments_path, 'edx_enrolled_users')
def student_submissions(context: SolidExecutionContext,
                        edx_course_ids: List[String]) -> DagsterPath:
    """Retrieve details of student submissions for the given courses.

    :param context: Dagster execution context for propagating configuration data
    :type context: SolidExecutionContext

    :param edx_course_ids: List of edX course ID strings
    :type edx_course_ids: List[String]

    :returns: A path definition that points to the rendered data table

    :rtype: DagsterPath
    """
    studentmodule = Table('courseware_studentmodule')
    submissions_count = 0
    # Maintaining previous file name for compatibility (TMM 2020-05-01)
    submissions_path = context.resources.results_dir.path.joinpath(
        'studentmodule_query.csv'
    )
    for course_id in edx_course_ids:
        submission_query = Query.from_(studentmodule).select(
            'id',
            'module_type',
            'module_id',
            'student_id',
            'state',
            'grade',
            'created',
            'modified',
            'max_grade',
            'done',
            'course_id',
        ).where(studentmodule.course_id == course_id)
        query_fields, submission_data = context.resources.sqldb.run_query(
            submission_query
        )
        submissions_count += len(submission_data)
        write_csv(query_fields, submission_data, submissions_path)
    yield Materialization(
        label='enrolled_students.csv',
        description='Students enrolled in edX courses',
        metadata_entries=[
            EventMetadataEntry.text(
                label='student_submission_count',
                description='Number of student submission records',
                text=str(submissions_count),
            ),
            EventMetadataEntry.path(
                submissions_path.name, 'student_submissions_path'
            ),
        ],
    )
    yield Output(submissions_path, 'edx_student_submissions')
def file_handle_to_s3(context, file_handle):
    bucket = context.solid_config['Bucket']
    key = context.solid_config['Key']

    # the s3 put_object API expects the actual bytes to be on the 'Body' key in
    # kwargs; since we get all other fields from config, we copy the config
    # object and add 'Body' here.
    cfg = context.solid_config.copy()
    with context.file_manager.read(file_handle, 'rb') as file_obj:
        cfg['Body'] = file_obj

        context.resources.s3.put_object(**cfg)
        s3_file_handle = S3FileHandle(bucket, key)

        yield Materialization(
            label='file_to_s3',
            metadata_entries=[
                EventMetadataEntry.path(s3_file_handle.s3_path, label=last_key(key))
            ],
        )

        yield Output(value=s3_file_handle, output_name='s3_file_handle')
    yield Failure(
        description='The mongodump command for exporting the Open edX forum '
        'database failed.',
        metadata_entries=[
            EventMetadataEntry.text(
                text=mongodump_output,
                label='mongodump_output',
                description='Output of the mongodump command',
            )
        ],
    )
    yield Materialization(
        label='edx_forum_database',
        description='Exported Mongo database of forum data from Open edX '
        'installation',
        metadata_entries=[
            EventMetadataEntry.path(
                str(forum_data_path), 'edx_forum_database_export_path'
            )
        ],
    )
    yield Output(forum_data_path, 'edx_forum_data_directory')


@solid
def export_course(context: SolidExecutionContext, course_id: String) -> Nothing:
    pass


@solid(
    name='edx_upload_daily_extracts',
    description='Upload all data from daily extracts to S3 for institutional research.',