# "solids":{ # "push_new_data":{ # "config":{ # "commit":"Metadata", # "branch":"dev" # } # } # } } ################ SOLIDS ################## @dg.solid( input_defs=[ dg.InputDefinition("cumulus", root_manager_key="cumulus_root"), dg.InputDefinition("wikidata", root_manager_key="wikidata_root"), dg.InputDefinition("portals", root_manager_key="portals_root"), dg.InputDefinition("camera", root_manager_key="camera_root"), dg.InputDefinition("images", root_manager_key="images_root"), ], output_defs=[dg.OutputDefinition(dagster_type=pd.DataFrame)], ) def create_metadata( context, cumulus: main_dataframe_types, wikidata: main_dataframe_types, portals: main_dataframe_types, camera: gpd.GeoDataFrame, images: main_dataframe_types, ):
class Image:
    """
    Original image path.

    Breaks the file name of ``path`` into the pieces exposed as attributes.

    Attributes:
        path: the path exactly as it was passed in.
        tif: last component of ``path`` (the file name, extension included).
        id: file name with its final extension removed.
        jpg: ``id`` with ".jpg" appended.
    """

    def __init__(self, path):
        self.path = path

        # Derive the file name once instead of re-splitting the path for
        # every attribute.
        filename = os.path.basename(path)

        # splitext strips only the last extension, so a dotted name such as
        # "IMG.0001.tif" keeps its whole stem ("IMG.0001") instead of being
        # cut at the first dot, which made distinct files share one id.
        stem = os.path.splitext(filename)[0]

        self.tif = filename
        self.id = stem
        self.jpg = stem + ".jpg"
import dagster as dg import subprocess @dg.solid(config_schema={ "commit": dg.Field(dg.String), "branch": dg.Field(dg.String) }, input_defs=[dg.InputDefinition("commit", dagster_type=dg.Nothing)]) def push_new_data(context): """ Push data to Git submodule and commit changes to main repository """ commit = context.solid_config["commit"] branch = context.solid_config["branch"] submodule_push = [ "pwd", "git checkout main", "git add .", f"git commit -a -m ':card_file_box: Update {commit} data'", "git push", ] for command in submodule_push: git_cli_sub = subprocess.Popen( command, shell=True,
from bokeh.transform import cumsum from bokeh.plotting import output_file, show, figure from bokeh.models import ( CheckboxButtonGroup, CustomJS, LinearColorMapper, Row, HoverTool, Span, Title, ) @dg.solid( input_defs=[ dg.InputDefinition("metadata", root_manager_key="metadata_root"), dg.InputDefinition("camera", root_manager_key="camera_root"), dg.InputDefinition("cumulus", root_manager_key="cumulus_root"), ], output_defs=[dg.OutputDefinition(dagster_type=dp.DataFrame)], ) def load_metadata( _, metadata: dp.DataFrame, camera: main_dataframe_types, cumulus: main_dataframe_types, ): """ Merge relevant dataframes to access objects status and properties """
collections = {} collections["data"] = collection collections["key"] = collection_path collections["type"] = "json" data.append(collections) print("Collection updated: {0}".format(collection_name)) else: pass return data @dg.solid( input_defs=[ dg.InputDefinition("metadata", root_manager_key="metadata_root"), dg.InputDefinition("mapping", root_manager_key="mapping_root"), ], output_defs=[dg.DynamicOutputDefinition(dict)], ) def get_items(context, metadata, mapping): ims = ((metadata["Latitude"].notna()) & (metadata["Source URL"].notna()) & (metadata["Media URL"].notna()) & (metadata["First Year"].notna()) & (metadata["Last Year"].notna())) jstor = metadata["Source"] != "Instituto Moreira Salles" metadata = metadata.loc[ims | jstor] metadata.fillna("", inplace=True) metadata.set_index("Source ID", inplace=True) mapping.set_index("Label:en", inplace=True)
import dagster as dg import dagster_pandas as dp from dagster_pandas.data_frame import DataFrame from numpy.core.numeric import True_ import pandas as pd from pandas._libs.tslibs import NaT from tests.dataframe_types import * from tests.objects_types import * # solids cumulus @dg.solid( input_defs=[dg.InputDefinition("root", root_manager_key="cumulus_root")], output_defs=[dg.OutputDefinition(dagster_type=dp.DataFrame)], ) def xml_to_df(context, root): """ Build Pandas DataFrame from XML file """ # Find the uids uids = {} for thing in root[0][0]: uids[thing.attrib["uid"]] = thing[0].text table = {} for field in uids.values(): table[field] = [] outDict = {"table": table, "uids": uids}