Code example #1
def less_simple_data_frame_output_materialization_config(
    context, config, value
):
    # Materialize LessSimpleDataFrame into a csv file
    csv_path = os.path.join(
        os.path.dirname(__file__), os.path.abspath(config['csv']['path'])
    )
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    with open(csv_path, 'w') as fd:
        fieldnames = list(value[0].keys())
        writer = csv.DictWriter(
            fd, fieldnames, delimiter=config['csv']['sep']
        )
        writer.writeheader()
        writer.writerows(value)

    context.log.debug(
        'Wrote dataframe as .csv to {path}'.format(path=csv_path)
    )
    yield Materialization(
        'data_frame_csv',
        'LessSimpleDataFrame materialized as csv',
        [
            EventMetadataEntry.path(
                path=csv_path,
                label='data_frame_csv_path',
                description='LessSimpleDataFrame written to csv format',
            )
        ],
    )
    # Materialize LessSimpleDataFrame into a json file
    json_path = os.path.abspath(config['json']['path'])
    with open(json_path, 'w') as fd:
        json_value = seven.json.dumps([dict(row) for row in value])
        fd.write(json_value)

    context.log.debug(
        'Wrote dataframe as .json to {path}'.format(path=json_path)
    )
    yield Materialization(
        'data_frame_json',
        'LessSimpleDataFrame materialized as json',
        [
            EventMetadataEntry.path(
                path=json_path,
                label='data_frame_json_path',
                description='LessSimpleDataFrame written to json format',
            )
        ],
    )
Code example #2
def less_simple_data_frame_materializer(context, config, value):
    # Materialize LessSimpleDataFrame into a csv file
    csv_path = os.path.join(
        os.path.dirname(__file__), os.path.abspath(config["csv"]["path"])
    )
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    with open(csv_path, "w") as fd:
        fieldnames = list(value[0].keys())
        writer = csv.DictWriter(
            fd, fieldnames, delimiter=config["csv"]["sep"]
        )
        writer.writeheader()
        writer.writerows(value)

    context.log.debug(
        "Wrote dataframe as .csv to {path}".format(path=csv_path)
    )
    yield AssetMaterialization(
        "1data_frame_csv",
        "LessSimpleDataFrame materialized as csv",
        [
            EventMetadataEntry.path(
                path=csv_path,
                label="data_frame_csv_path",
                description="LessSimpleDataFrame written to csv format",
            )
        ],
    )
    # Materialize LessSimpleDataFrame into a json file
    json_path = os.path.abspath(config["json"]["path"])
    with open(json_path, "w") as fd:
        json_value = seven.json.dumps([dict(row) for row in value])
        fd.write(json_value)

    context.log.debug(
        "Wrote dataframe as .json to {path}".format(path=json_path)
    )
    yield AssetMaterialization(
        "data_frame_json",
        "LessSimpleDataFrame materialized as json",
        [
            EventMetadataEntry.path(
                path=json_path,
                label="data_frame_json_path",
                description="LessSimpleDataFrame written to json format",
            )
        ],
    )
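Examples #1 and #2 are the same tutorial function before and after the 0.10 rename of `Materialization` to `AssetMaterialization`. For orientation, here is a minimal sketch (not from the scraped source) of how such a function is attached to a `DagsterType` via `@dagster_type_materializer`; the config schema below is inferred from the keys the function reads and is an assumption:

from dagster import DagsterType, Field, String, dagster_type_materializer

@dagster_type_materializer(
    {
        "csv": {
            "path": String,
            "sep": Field(String, is_required=False, default_value=","),
        },
        "json": {"path": String},
    }
)
def less_simple_data_frame_materializer(context, config, value):
    ...  # body as in code example #2

LessSimpleDataFrame = DagsterType(
    name="LessSimpleDataFrame",
    type_check_fn=lambda _, value: isinstance(value, list),
    materializer=less_simple_data_frame_materializer,
)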
Code example #3
File: materializations.py Project: sd2k/dagster
def sort_by_calories(context, cereals):
    sorted_cereals = sorted(
        cereals, key=lambda cereal: int(cereal["calories"])
    )
    context.log.info(
        "Least caloric cereal: {least_caloric}".format(
            least_caloric=sorted_cereals[0]["name"]
        )
    )
    context.log.info(
        "Most caloric cereal: {most_caloric}".format(
            most_caloric=sorted_cereals[-1]["name"]
        )
    )
    fieldnames = list(sorted_cereals[0].keys())
    sorted_cereals_csv_path = os.path.abspath(
        "output/calories_sorted_{run_id}.csv".format(run_id=context.run_id)
    )
    os.makedirs(os.path.dirname(sorted_cereals_csv_path), exist_ok=True)
    with open(sorted_cereals_csv_path, "w") as fd:
        writer = csv.DictWriter(fd, fieldnames)
        writer.writeheader()
        writer.writerows(sorted_cereals)
    yield AssetMaterialization(
        asset_key="sorted_cereals_csv",
        description="Cereals data frame sorted by caloric content",
        metadata_entries=[
            EventMetadataEntry.path(
                sorted_cereals_csv_path, "sorted_cereals_csv_path"
            )
        ],
    )
    yield Output(None)
Code example #4
def sort_by_calories(context, cereals):
    sorted_cereals = sorted(
        cereals, key=lambda cereal: int(cereal['calories'])
    )
    context.log.info(
        'Least caloric cereal: {least_caloric}'.format(
            least_caloric=sorted_cereals[0]['name']
        )
    )
    context.log.info(
        'Most caloric cereal: {most_caloric}'.format(
            most_caloric=sorted_cereals[-1]['name']
        )
    )
    fieldnames = list(sorted_cereals[0].keys())
    sorted_cereals_csv_path = os.path.abspath(
        'output/calories_sorted_{run_id}.csv'.format(run_id=context.run_id)
    )
    os.makedirs(os.path.dirname(sorted_cereals_csv_path), exist_ok=True)
    with open(sorted_cereals_csv_path, 'w') as fd:
        writer = csv.DictWriter(fd, fieldnames)
        writer.writeheader()
        writer.writerows(sorted_cereals)
    yield Materialization(
        label='sorted_cereals_csv',
        description='Cereals data frame sorted by caloric content',
        metadata_entries=[
            EventMetadataEntry.path(
                sorted_cereals_csv_path, 'sorted_cereals_csv_path'
            )
        ],
    )
    yield Output(None)
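Note that once a compute function yields events such as `Materialization`, it must also yield its outputs explicitly, which is why both versions end with `yield Output(None)`. Assuming `sort_by_calories` carries its original `@solid` decorator (not shown here), a hypothetical wiring looks like the following; the upstream `read_csv` solid is a stand-in:

from dagster import execute_pipeline, pipeline, solid

@solid
def read_csv(_context):
    # Stand-in upstream solid; the real tutorial loads cereal.csv here.
    return [{"name": "Corn Flakes", "calories": "100"}]

@pipeline
def materialization_pipeline():
    sort_by_calories(read_csv())

if __name__ == "__main__":
    result = execute_pipeline(materialization_pipeline)
    assert result.success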
Code example #5
File: cache_file_from_s3.py Project: yuhan/dagster
def cache_file_from_s3(context, s3_coordinate: S3Coordinate) -> FileHandle:
    target_key = context.solid_config.get("file_key",
                                          s3_coordinate["key"].split("/")[-1])

    file_cache = context.resources.file_cache

    target_file_handle = file_cache.get_file_handle(target_key)

    if file_cache.overwrite or not file_cache.has_file_object(target_key):
        with get_temp_file_name() as tmp_file:
            context.resources.s3.download_file(Bucket=s3_coordinate["bucket"],
                                               Key=s3_coordinate["key"],
                                               Filename=tmp_file)

            context.log.info("File downloaded to {}".format(tmp_file))

            with open(tmp_file, "rb") as tmp_file_object:
                file_cache.write_file_object(target_key, tmp_file_object)
                context.log.info("File handle written at : {}".format(
                    target_file_handle.path_desc))
    else:
        context.log.info("File {} already present in cache".format(
            target_file_handle.path_desc))

    yield ExpectationResult(
        success=file_cache.has_file_object(target_key),
        label="file_handle_exists",
        metadata_entries=[
            EventMetadataEntry.path(path=target_file_handle.path_desc,
                                    label=target_key)
        ],
    )
    yield Output(target_file_handle)
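This solid requires `file_cache` and `s3` resources (note `context.resources.file_cache` and `context.resources.s3`). The dagster_aws libraries ship real implementations; purely to illustrate the interface the code above calls, a toy local file cache resource could look like:

import os
import shutil

from dagster import resource

class _LocalHandle:  # minimal stand-in for dagster's LocalFileHandle
    def __init__(self, path):
        self.path_desc = path

class LocalFileCache:
    # Hypothetical cache satisfying get_file_handle / has_file_object /
    # write_file_object / overwrite as used above.
    def __init__(self, target_dir, overwrite=False):
        self.target_dir = target_dir
        self.overwrite = overwrite
        os.makedirs(target_dir, exist_ok=True)

    def _path(self, key):
        return os.path.join(self.target_dir, key)

    def has_file_object(self, key):
        return os.path.exists(self._path(key))

    def get_file_handle(self, key):
        return _LocalHandle(self._path(key))

    def write_file_object(self, key, file_obj):
        with open(self._path(key), "wb") as out:
            shutil.copyfileobj(file_obj, out)

@resource(config_schema={"target_dir": str})
def local_file_cache(init_context):
    return LocalFileCache(init_context.resource_config["target_dir"])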
Code example #6
File: longitudinal.py Project: sd2k/dagster
def _base_compute(context):
    time.sleep(context.solid_config["sleep"])

    if random() < context.solid_config["error_rate"]:
        raise Exception("blah")

    asset_key = None
    if context.solid_config.get("materialization_key_list") is not None:
        asset_key = AssetKey(
            context.solid_config.get("materialization_key_list"))
    elif context.solid_config.get("materialization_key") is not None:
        asset_key = AssetKey(context.solid_config.get("materialization_key"))

    if asset_key:
        metadata_entries = []
        if context.solid_config.get("materialization_text") is not None:
            metadata_entries.append(
                EventMetadataEntry.text(
                    context.solid_config.get("materialization_text"),
                    context.solid.name,
                ))

        if context.solid_config.get("materialization_url") is not None:
            metadata_entries.append(
                EventMetadataEntry.url(
                    context.solid_config.get("materialization_url"),
                    context.solid.name,
                ))

        if context.solid_config.get("materialization_path") is not None:
            metadata_entries.append(
                EventMetadataEntry.path(
                    context.solid_config.get("materialization_url"),
                    context.solid.name,
                ))

        if context.solid_config.get("materialization_json") is not None:
            metadata_entries.append(
                EventMetadataEntry.json(
                    context.solid_config.get("materialization_json"),
                    context.solid.name,
                ))

        if context.solid_config.get("materialization_value") is not None:
            metadata_entries = [
                EventMetadataEntry.float(
                    context.solid_config.get("materialization_value"),
                    context.solid.name,
                )
            ]

        if len(metadata_entries) == 0:
            metadata_entries = None

        yield AssetMaterialization(
            asset_key=asset_key,
            metadata_entries=metadata_entries,
        )

    yield Output(1)
Code example #7
def many_materializations_and_passing_expectations(_context):
    tables = [
        'users',
        'groups',
        'events',
        'friends',
        'pages',
        'fans',
        'event_admins',
        'group_admins',
    ]

    for table in tables:
        yield AssetMaterialization(
            asset_key='table_info',
            metadata_entries=[
                EventMetadataEntry.path(label='table_path',
                                        path='/path/to/{}.raw'.format(table))
            ],
        )
        yield ExpectationResult(
            success=True,
            label='{table}.row_count'.format(table=table),
            description='Row count passed for {table}'.format(table=table),
        )
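A sketch of asserting on the yielded events in a test, assuming the function above is declared with no outputs (e.g. `@solid(output_defs=[])`, since it never yields an `Output`); `execute_solid` is Dagster's test utility, and the result properties are per the ~0.9-era API:

from dagster import execute_solid

result = execute_solid(many_materializations_and_passing_expectations)
assert result.success
# One AssetMaterialization and one ExpectationResult per table.
assert len(result.materializations_during_compute) == 8
assert all(er.success for er in result.expectation_results_during_compute)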
Code example #8
def event_metadata_entries(metadata_entry_datas):
    if not metadata_entry_datas:
        return

    for metadata_entry_data in metadata_entry_datas:
        typename = metadata_entry_data['__typename']
        label = metadata_entry_data['label']
        description = metadata_entry_data.get('description')
        if typename == 'EventPathMetadataEntry':
            yield EventMetadataEntry.path(label=label,
                                          description=description,
                                          path=metadata_entry_data['path'])
        elif typename == 'EventJsonMetadataEntry':
            yield EventMetadataEntry.json(
                label=label,
                description=description,
                data=json.loads(metadata_entry_data.get('jsonString', '')),
            )
        elif typename == 'EventTextMetadataEntry':
            yield EventMetadataEntry.text(label=label,
                                          description=description,
                                          text=metadata_entry_data['text'])
        elif typename == 'EventUrlMetadataEntry':
            yield EventMetadataEntry.url(label=label,
                                         description=description,
                                         url=metadata_entry_data['url'])
        else:
            check.not_implemented('TODO for type {}'.format(typename))
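This helper rehydrates `EventMetadataEntry` objects from GraphQL response dictionaries, dispatching on the `__typename` discriminator. A small usage sketch with hand-built input:

entries = list(
    event_metadata_entries(
        [
            {
                "__typename": "EventPathMetadataEntry",
                "label": "output_path",
                "description": None,
                "path": "/tmp/out.csv",
            },
            {
                "__typename": "EventTextMetadataEntry",
                "label": "row_count",
                "description": "number of rows written",
                "text": "42",
            },
        ]
    )
)
assert [entry.label for entry in entries] == ["output_path", "row_count"]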
Code example #9
def cache_file_from_s3(context, bucket_data):
    target_key = context.solid_config.get('file_key',
                                          bucket_data['key'].split('/')[-1])

    file_cache = context.resources.file_cache

    file_handle = file_cache.get_file_handle(target_key)

    if file_cache.overwrite or not file_cache.has_file_object(target_key):

        with get_temp_file_name() as tmp_file:
            context.resources.s3.session.download_file(
                Bucket=bucket_data['bucket'],
                Key=bucket_data['key'],
                Filename=tmp_file)

            context.log.info('File downloaded to {}'.format(tmp_file))

            with open(tmp_file, 'rb') as tmp_file_object:
                file_cache.write_file_object(target_key, tmp_file_object)
                context.log.info('File handle written at : {}'.format(
                    file_handle.path_desc))
    else:
        context.log.info('File {} already present in cache'.format(
            file_handle.path_desc))

    yield ExpectationResult(
        success=file_cache.has_file_object(target_key),
        label='file_handle_exists',
        metadata_entries=[
            EventMetadataEntry.path(path=file_handle.path_desc,
                                    label=target_key)
        ],
    )
    yield Output(file_handle)
Code example #10
def many_materializations_and_passing_expectations(_context):
    tables = [
        "users",
        "groups",
        "events",
        "friends",
        "pages",
        "fans",
        "event_admins",
        "group_admins",
    ]

    for table in tables:
        yield AssetMaterialization(
            asset_key="table_info",
            metadata_entries=[
                EventMetadataEntry.path(label="table_path",
                                        path="/path/to/{}.raw".format(table))
            ],
        )
        yield ExpectationResult(
            success=True,
            label="{table}.row_count".format(table=table),
            description="Row count passed for {table}".format(table=table),
        )
Code example #11
File: step_six.py Project: zuik/dagster
 def save(self, key, df):
     path = os.path.join(self.root_dir, key)
     df.to_parquet(path)
     return AssetMaterialization(
         asset_key=AssetKey(["local_metastore", key]),
         metadata_entries=[EventMetadataEntry.path(path, "on_disk")],
     )
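The leading indentation shows this `save` is a method; the snippet implies only `self.root_dir`. A hypothetical wrapper class and call, just to show the shape (every name other than `save` is an assumption):

import os

import pandas as pd
from dagster import AssetKey, AssetMaterialization, EventMetadataEntry

class LocalMetastore:
    def __init__(self, root_dir):
        self.root_dir = root_dir
        os.makedirs(root_dir, exist_ok=True)

    def save(self, key, df):
        path = os.path.join(self.root_dir, key)
        df.to_parquet(path)
        return AssetMaterialization(
            asset_key=AssetKey(["local_metastore", key]),
            metadata_entries=[EventMetadataEntry.path(path, "on_disk")],
        )

store = LocalMetastore("/tmp/metastore")
materialization = store.save("users.parquet", pd.DataFrame({"id": [1, 2]}))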
Code example #12
def write_operation_inventory(context: SolidExecutionContext,
                              analysis: Dict[str, RightSizeAnalysis],
                              resources: DataFrame) -> Nothing:
    resources = resources.set_index('resource_id')
    resizes = [
        {
            'subscription_id': resources.at[resource_id, 'subscription_id'],
            'resource_id': resource_id,
            'current_sku': resources.at[resource_id, 'vm_size'],
            'new_sku': analysis.advisor_sku,
        }
        for resource_id, analysis in analysis.items()
        if analysis.advisor_sku_valid
    ]
    output = {'vm_resize_operations': resizes}

    output_path = os.path.abspath(f'operation_inventory_{context.run_id}.json')
    with open(output_path, 'w') as fd:
        json.dump(output, fd, indent=3)

    yield Materialization(
        label='operation_inventory',
        description=
        'An inventory of the right sizing operations that are recommended and validated.',
        metadata_entries=[
            EventMetadataEntry.path(output_path, 'operation_inventory_path')
        ],
    )
    yield Output(None)
Code example #13
File: solids.py Project: mitodl/ol-data-pipelines
def course_roles(context: SolidExecutionContext,
                 edx_course_ids: List[String]) -> DagsterPath:
    """Retrieve information about user roles for given courses.

    :param context: Dagster execution context for propagating configuration data
    :type context: SolidExecutionContext

    :param edx_course_ids: List of edX course ID strings
    :type edx_course_ids: List[String]

    :returns: A path definition that points to the rendered data table

    :rtype: DagsterPath
    """
    access_role = Table('student_courseaccessrole')
    roles_query = Query.from_(access_role).select(
        'id', 'user_id', 'org', 'course_id',
        'role').where(access_role.course_id.isin(edx_course_ids))
    query_fields, roles_data = context.resources.sqldb.run_query(roles_query)
    # Maintaining previous file name for compatibility (TMM 2020-05-01)
    roles_path = context.resources.results_dir.path.joinpath('role_query.csv')
    write_csv(query_fields, roles_data, roles_path)
    yield Materialization(
        label='role_query.csv',
        description='Course roles records from Open edX installation',
        metadata_entries=[
            EventMetadataEntry.text(
                label='course_roles_count',
                description='Number of course roles records',
                text=str(len(roles_data))),
            EventMetadataEntry.path(roles_path.name, 'role_query_csv_path')
        ])
    yield Output(roles_path, 'edx_course_roles')
Code example #14
def cache_properties_from_rest_api(
    context, properties: PropertyDataFrame, target_key: String
) -> FileHandle:

    property_list = []
    date = datetime.today().strftime('%y%m%d')
    date_time = datetime.now().strftime("%y%m%d_%H%M%S")
    for p in properties:

        # Is it possible to do a range instead of each separately?
        json_prop = requests.get(context.solid_config['immo24_api_en'] + p['id']).json()

        # add metadata: flat, house, detached house, etc.
        json_prop['propertyDetails']['propertyType'] = p['propertyType']
        json_prop['propertyDetails']['isBuyRent'] = p['rentOrBuy']

        # add metadata from search
        json_prop['propertyDetails']['propertyId'] = p['id']
        json_prop['propertyDetails']['searchCity'] = p['city']
        json_prop['propertyDetails']['searchRadius'] = p['radius']
        json_prop['propertyDetails']['searchDate'] = date
        json_prop['propertyDetails']['searchDateTime'] = date_time

        property_list.append(json_prop)

    filename = (
        property_list[0]['propertyDetails']['searchDate']
        + '_'
        + property_list[0]['propertyDetails']['searchCity']
        + '_'
        + property_list[0]['propertyDetails']['isBuyRent']
        + '_'
        + str(property_list[0]['propertyDetails']['searchRadius'])
        + '_'
        + property_list[0]['propertyDetails']['propertyType']
        + '.gz'
    )
    target_key = target_key + '/' + filename

    # caching to file
    file_cache = context.resources.file_cache
    target_file_handle = file_cache.get_file_handle(target_key)

    if file_cache.overwrite or not file_cache.has_file_object(target_key):
        json_zip_writer(property_list, target_key)
        context.log.info("File handle written at : {}".format(target_file_handle.path_desc))
    else:
        context.log.info("File {} already present in cache".format(target_file_handle.path_desc))

    yield ExpectationResult(
        success=file_cache.has_file_object(target_key),
        label="file_handle_exists",
        metadata_entries=[
            EventMetadataEntry.path(path=target_file_handle.path_desc, label=target_key)
        ],
    )
    yield Output(target_file_handle)
Code example #15
File: many_events.py Project: JPeer264/dagster-fork
 def raw_file_solid(_context):
     yield Materialization(
         label='table_info',
         metadata_entries=[
             EventMetadataEntry.path(label='table_path', path='/path/to/{}.raw'.format(name))
         ],
     )
     yield do_expectation(_context, name)
     yield Output(name)
Code example #16
 def raw_file_solid(_context):
     yield AssetMaterialization(
         asset_key="table_info",
         metadata_entries=[
             EventMetadataEntry.path(label="table_path",
                                     path="/path/to/{}.raw".format(name))
         ],
     )
     yield do_expectation(_context, name)
     yield Output(name)
Code example #17
def _base_compute(context):
    time.sleep(context.solid_config['sleep'])

    if random() < context.solid_config['error_rate']:
        raise Exception('blah')

    if context.solid_config.get('materialization_key') is not None:
        metadata_entries = []
        if context.solid_config.get('materialization_text') is not None:
            metadata_entries.append(
                EventMetadataEntry.text(
                    context.solid_config.get('materialization_text'), context.solid.name,
                )
            )

        if context.solid_config.get('materialization_url') is not None:
            metadata_entries.append(
                EventMetadataEntry.url(
                    context.solid_config.get('materialization_url'), context.solid.name,
                )
            )

        if context.solid_config.get('materialization_path') is not None:
            metadata_entries.append(
                EventMetadataEntry.path(
                    context.solid_config.get('materialization_path'), context.solid.name,
                )
            )

        if context.solid_config.get('materialization_json') is not None:
            metadata_entries.append(
                EventMetadataEntry.json(
                    context.solid_config.get('materialization_json'), context.solid.name,
                )
            )

        if context.solid_config.get('materialization_value') is not None:
            metadata_entries = [
                EventMetadataEntry.float(
                    context.solid_config.get('materialization_value'), context.solid.name,
                )
            ]

        if len(metadata_entries) == 0:
            metadata_entries = None

        yield Materialization(
            label=context.solid.name,
            asset_key=context.solid_config.get('materialization_key'),
            metadata_entries=metadata_entries,
        )

    yield Output(1)
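Both variants of `_base_compute` (#6 and #17) are driven entirely by solid config; #6 additionally accepts a `materialization_key_list` for multi-component asset keys. A hypothetical run config fragment exercising the metadata branches (the solid name `foo_solid` is an assumption):

run_config = {
    "solids": {
        "foo_solid": {
            "config": {
                "sleep": 0.1,
                "error_rate": 0.0,
                "materialization_key": "my_table",
                "materialization_text": "loaded ok",
                "materialization_url": "https://dashboard.example.com/my_table",
                "materialization_path": "/tmp/my_table.parquet",
                # Note: per the code above, materialization_value replaces,
                # rather than appends to, the other metadata entries.
                "materialization_value": 42.0,
            }
        }
    }
}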
Code example #18
File: solids.py Project: xaniasd/dagster
def file_handle_to_s3(context, file_handle):
    bucket = context.solid_config["Bucket"]
    key = context.solid_config["Key"]

    with context.file_manager.read(file_handle, "rb") as fileobj:
        context.resources.s3.upload_fileobj(fileobj, bucket, key)
        s3_file_handle = S3FileHandle(bucket, key)

        yield AssetMaterialization(
            asset_key=s3_file_handle.s3_path,
            metadata_entries=[EventMetadataEntry.path(s3_file_handle.s3_path, label=last_key(key))],
        )

        yield Output(value=s3_file_handle, output_name="s3_file_handle")
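This solid reads its S3 destination from solid config; examples #21 and #27 below are older variants of the same pattern. A minimal run config fragment (bucket and key values are placeholders):

run_config = {
    "solids": {
        "file_handle_to_s3": {
            "config": {"Bucket": "my-bucket", "Key": "uploads/report.csv"}
        }
    }
}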
Code example #19
def write_html_report(context: SolidExecutionContext,
                      report_notebook: FileHandle) -> Nothing:
    with context.file_manager.read(report_notebook) as node_file:
        node = nbformat.read(node_file, nbformat.NO_CONVERT)
    html = convert_nodebook_node_to_html(node, full_width=True)
    handle = context.file_manager.write_data(html.encode(), ext='html')
    yield Materialization(
        label='resize_report',
        description=
        'A report of all VMs utilization data and evaluation of the recommendations.',
        metadata_entries=[
            EventMetadataEntry.path(handle.path_desc, 'resize_report_path')
        ],
    )
Code example #20
File: parquet_io_manager.py Project: prezi/dagster
    def handle_output(self, context: OutputContext,
                      obj: Union[pandas.DataFrame, pyspark.sql.DataFrame]):

        path = self._get_path(context)
        if isinstance(obj, pandas.DataFrame):
            row_count = len(obj)
            obj.to_parquet(path=path)
        elif isinstance(obj, pyspark.sql.DataFrame):
            row_count = obj.count()
            obj.write.parquet(path=path, mode="overwrite")
        else:
            raise Exception(f"Outputs of type {type(obj)} not supported.")
        yield EventMetadataEntry.int(value=row_count, label="row_count")
        yield EventMetadataEntry.path(path=path, label="path")
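`handle_output` here is written as a generator: on later 0.1x Dagster releases, `EventMetadataEntry` values yielded from `handle_output` are attached to the materialization the framework records for the output. A self-contained, pandas-only sketch in the same spirit (the path scheme and `load_input` are assumptions, not prezi/dagster's code):

import os

import pandas
from dagster import IOManager, io_manager

class LocalParquetIOManager(IOManager):
    def _get_path(self, context):
        # One file per output, keyed by run id and output name (assumed layout).
        return os.path.join("/tmp/dagster", context.run_id, context.name + ".parquet")

    def handle_output(self, context, obj: pandas.DataFrame):
        path = self._get_path(context)
        os.makedirs(os.path.dirname(path), exist_ok=True)
        obj.to_parquet(path)

    def load_input(self, context):
        return pandas.read_parquet(self._get_path(context.upstream_output))

@io_manager
def local_parquet_io_manager(_init_context):
    return LocalParquetIOManager()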
Code example #21
File: solids.py Project: yetudada/dagster
def file_handle_to_s3(context, file_handle):
    bucket = context.solid_config['Bucket']
    key = context.solid_config['Key']

    with context.file_manager.read(file_handle, 'rb') as fileobj:
        context.resources.s3.upload_fileobj(fileobj, bucket, key)
        s3_file_handle = S3FileHandle(bucket, key)

        yield Materialization(
            label='file_to_s3',
            metadata_entries=[EventMetadataEntry.path(s3_file_handle.s3_path, label=last_key(key))],
        )

        yield Output(value=s3_file_handle, output_name='s3_file_handle')
Code example #22
def less_simple_data_frame_output_materialization_config(
        context, config, value):
    csv_path = os.path.abspath(config['csv']['path'])
    with open(csv_path, 'w') as fd:
        fieldnames = list(value[0].keys())
        writer = csv.DictWriter(fd, fieldnames, delimiter=config['csv']['sep'])
        writer.writeheader()
        writer.writerows(value)
    context.log.debug(
        'Wrote dataframe as .csv to {path}'.format(path=csv_path))
    return Materialization(
        'data_frame_csv',
        'LessSimpleDataFrame materialized as csv',
        [EventMetadataEntry.path(csv_path, 'data_frame_csv_path')],
    )
Code example #23
File: snowflake_io_manager.py Project: prezi/dagster
    def _handle_pointer_output(self, context: OutputContext, parquet_pointer: ParquetPointer):

        yield EventMetadataEntry.path(parquet_pointer.path, "Source Parquet Path")
        with connect_snowflake(config=context.resource_config) as con:
            # stage the data stored at the given path
            con.execute(
                f"""
            CREATE TEMPORARY STAGE tmp_s3_stage
                URL = '{parquet_pointer.path}'
                FILE_FORMAT=(TYPE=PARQUET COMPRESSION=SNAPPY)
                CREDENTIALS=(
                    AWS_KEY_ID='{os.getenv("AWS_ACCESS_KEY_ID")}',
                    AWS_SECRET_KEY='{os.getenv("AWS_SECRET_ACCESS_KEY")}'
                );
            """
            )
            con.execute(self._get_create_table_statement(context, parquet_pointer))
            con.execute(self._get_cleanup_statement(context))
            con.execute(self._get_copy_statement(context, parquet_pointer))
Code example #24
File: util.py Project: sd2k/dagster
def event_metadata_entries(metadata_entry_datas):
    if not metadata_entry_datas:
        return

    for metadata_entry_data in metadata_entry_datas:
        typename = metadata_entry_data["__typename"]
        label = metadata_entry_data["label"]
        description = metadata_entry_data.get("description")
        if typename == "EventPathMetadataEntry":
            yield EventMetadataEntry.path(label=label,
                                          description=description,
                                          path=metadata_entry_data["path"])
        elif typename == "EventJsonMetadataEntry":
            yield EventMetadataEntry.json(
                label=label,
                description=description,
                data=seven.json.loads(metadata_entry_data.get(
                    "jsonString", "")),
            )
        elif typename == "EventMarkdownMetadataEntry":
            yield EventMetadataEntry.md(label=label,
                                        description=description,
                                        md_str=metadata_entry_data.get(
                                            "md_str", ""))
        elif typename == "EventTextMetadataEntry":
            yield EventMetadataEntry.text(label=label,
                                          description=description,
                                          text=metadata_entry_data["text"])
        elif typename == "EventUrlMetadataEntry":
            yield EventMetadataEntry.url(label=label,
                                         description=description,
                                         url=metadata_entry_data["url"])
        elif typename == "EventPythonArtifactMetadataEntry":
            yield EventMetadataEntry(
                label=label,
                description=description,
                entry_data=PythonArtifactMetadataEntryData(
                    metadata_entry_data["module"],
                    metadata_entry_data["name"]),
            )
        else:
            check.not_implemented("TODO for type {}".format(typename))
Code example #25
File: solids.py Project: mitodl/ol-data-pipelines
def enrolled_users(context: SolidExecutionContext,
                   edx_course_ids: List[String]) -> DagsterPath:
    """Generate a table showing which students are currently enrolled in which courses.

    :param context: Dagster execution context for propagating configuration data
    :type context: SolidExecutionContext

    :param edx_course_ids: List of course IDs to retrieve student enrollments for
    :type edx_course_ids: List[String]

    :returns: A path definition that points to the rendered data table

    :rtype: DagsterPath
    """
    course_enrollment, users = Tables('student_courseenrollment', 'auth_user')
    users_query = Query.from_(users).join(course_enrollment).on(
        users.id == course_enrollment.user_id).select(
            users.id, users.username, users.first_name, users.last_name,
            users.email, users.is_staff, users.is_active, users.is_superuser,
            users.last_login, users.date_joined,
            course_enrollment.course_id).where(
                course_enrollment.course_id.isin(edx_course_ids))
    query_fields, users_data = context.resources.sqldb.run_query(users_query)
    # Maintaining previous file name for compatibility (TMM 2020-05-01)
    enrollments_path = context.resources.results_dir.path.joinpath(
        'users_query.csv')
    write_csv(query_fields, users_data, enrollments_path)
    yield Materialization(
        label='users_query.csv',
        description=
        'Information of users enrolled in available courses on Open edX installation',
        metadata_entries=[
            EventMetadataEntry.text(
                label='enrolled_users_count',
                description='Number of users who are enrolled in courses',
                text=str(len(users_data))),
            EventMetadataEntry.path(enrollments_path.name,
                                    'enrollment_query_csv_path')
        ])
    yield Output(enrollments_path, 'edx_enrolled_users')
Code example #26
File: solids.py Project: mitodl/ol-data-pipelines
def student_submissions(context: SolidExecutionContext,
                        edx_course_ids: List[String]) -> DagsterPath:
    """Retrieve details of student submissions for the given courses.

    :param context: Dagster execution context for propagating configuration data
    :type context: SolidExecutionContext

    :param edx_course_ids: List of edX course ID strings
    :type edx_course_ids: List[String]

    :returns: A path definition that points to the rendered data table

    :rtype: DagsterPath
    """
    studentmodule = Table('courseware_studentmodule')
    submissions_count = 0
    # Maintaining previous file name for compatibility (TMM 2020-05-01)
    submissions_path = context.resources.results_dir.path.joinpath(
        'studentmodule_query.csv')
    for course_id in edx_course_ids:
        submission_query = Query.from_(studentmodule).select(
            'id', 'module_type', 'module_id', 'student_id', 'state', 'grade',
            'created', 'modified', 'max_grade', 'done',
            'course_id').where(studentmodule.course_id == course_id)
        query_fields, submission_data = context.resources.sqldb.run_query(
            submission_query)
        submissions_count += len(submission_data)
        write_csv(query_fields, submission_data, submissions_path)
    yield Materialization(
        label='enrolled_students.csv',
        description='Students enrolled in edX courses',
        metadata_entries=[
            EventMetadataEntry.text(
                label='student_submission_count',
                description='Number of student submission records',
                text=str(submissions_count)),
            EventMetadataEntry.path(submissions_path.name,
                                    'student_submissions_path')
        ])
    yield Output(submissions_path, 'edx_student_submissions')
Code example #27
def file_handle_to_s3(context, file_handle):
    bucket = context.solid_config['Bucket']
    key = context.solid_config['Key']

    # the s3 put_object API expects the actual bytes to be on the 'Body' key in kwargs; since we
    # get all other fields from config, we copy the config object and add 'Body' here.
    cfg = context.solid_config.copy()
    with context.file_manager.read(file_handle, 'rb') as file_obj:
        cfg['Body'] = file_obj

        context.resources.s3.put_object(**cfg)
        s3_file_handle = S3FileHandle(bucket, key)

        yield Materialization(
            label='file_to_s3',
            metadata_entries=[
                EventMetadataEntry.path(s3_file_handle.s3_path,
                                        label=last_key(key))
            ],
        )

        yield Output(value=s3_file_handle, output_name='s3_file_handle')
Code example #28
File: solids.py Project: mitodl/ol-data-pipelines
        yield Failure(
            description=
            'The mongodump command for exporting the Open edX forum database failed.',
            metadata_entries=[
                EventMetadataEntry.text(
                    text=mongodump_output,
                    label='mongodump_output',
                    description='Output of the mongodump command')
            ])

    yield Materialization(
        label='edx_forum_database',
        description=
        'Exported Mongo database of forum data from Open edX installation',
        metadata_entries=[
            EventMetadataEntry.path(str(forum_data_path),
                                    'edx_forum_database_export_path')
        ])

    yield Output(forum_data_path, 'edx_forum_data_directory')


@solid
def export_course(context: SolidExecutionContext,
                  course_id: String) -> Nothing:
    pass


@solid(
    name='edx_upload_daily_extracts',
    description=
    'Upload all data from daily extracts to S3 for institutional research.',