def test_shell_initializes_and_runs_multiline_cmd():
    """A multi-line script containing a heredoc runs and yields its last output line."""
    # The heredoc body and its END terminator start at column 0 of the script,
    # so the literal is intentionally not indented along with the function.
    # NOTE(review): line breaks inside this literal were restored from a
    # whitespace-collapsed copy of the test -- confirm against the upstream file.
    script = """
    TEST=$(cat <<-END
This is line one
This is line two
This is line three
boom!
END
)
for i in $TEST
do
    echo $i
done"""
    environment = {letter: "test" for letter in "abcdefgh"}

    with Flow(name="test") as flow:
        shell_result = ShellTask()(command=script, env=environment)

    state = flow.run()
    assert state.is_successful()
    assert state.result[shell_result].result == "boom!"
def test_shell_task_handles_multiline_commands():
    """A script spanning several lines can cd into a directory and cat every file in it."""
    with tempfile.TemporaryDirectory() as workdir:
        script = """
        cd {}
        for file in $(ls)
        do
            cat $file
        done
        """.format(workdir)

        # Single fixture file whose contents the script is expected to echo back.
        fixture_path = workdir + "/testfile.txt"
        with open(fixture_path, "w") as handle:
            handle.write("this is a test")

        with Flow(name="test") as flow:
            shell_result = ShellTask()(command=script)

        # Run while the temporary directory still exists.
        state = flow.run()
        assert state.is_successful()
        assert state.result[shell_result].result == "this is a test"
def test_basic_trigger_dag_triggers(self, airflow_settings):
    """Triggering the tutorial DAG leaves a running manual run visible in the Airflow CLI output."""
    trigger = AirflowTriggerDAG(
        dag_id="tutorial", execution_date="1986-09-20", env=airflow_settings
    )
    # The listing task reuses the trigger task's helper script and environment.
    list_runs = ShellTask(
        command="airflow list_dag_runs tutorial",
        helper_script=trigger.helper_script,
        env=airflow_settings,
    )

    with Flow(name="tutorial") as flow:
        listing = list_runs(upstream_tasks=[trigger])

    final_state = flow.run()
    assert final_state.is_successful()

    listing_state = final_state.result[listing]
    assert listing_state.is_successful()

    # check CLI output
    cli_output = listing_state.result
    expected_fragments = (
        "manual__1986-09-20T00:00:00+00:00",
        "running",
        "1986-09-20T00:00:00+00:00",
    )
    for fragment in expected_fragments:
        assert fragment in cli_output
def test_shell_runs_other_shells():
    """A ShellTask built with shell="zsh" runs its command under zsh."""
    zsh_task = ShellTask(shell="zsh")

    with Flow(name="test") as flow:
        # $ZSH_NAME is echoed back so the test can see which shell ran it.
        shell_result = zsh_task(command="echo -n $ZSH_NAME")

    state = flow.run()
    assert state.is_successful()
    assert state.result[shell_result].result == "zsh"
def test_shell_initializes_and_multiline_output_optionally_returns_all_lines():
    """With return_all=True the task result is the list of every output line."""
    all_lines_task = ShellTask(return_all=True)

    with Flow(name="test") as flow:
        shell_result = all_lines_task(command="echo -n 'hello world\n42'")

    state = flow.run()
    assert state.is_successful()
    assert state.result[shell_result].result == ["hello world", "42"]
def test_shell_initializes_and_multiline_output_returns_last_line():
    """By default only the final line of multi-line output becomes the task result."""
    default_task = ShellTask()

    with Flow(name="test") as flow:
        shell_result = default_task(command="echo -n 'hello world\n42'")

    state = flow.run()
    assert state.is_successful()
    assert state.result[shell_result].result == "42"
def test_shell_task_raises_fail_if_cmd_fails():
    """A failing command fails the flow run and the exit code is reported in the message."""
    with Flow(name="test") as flow:
        shell_result = ShellTask()(command="ls surely_a_dir_that_doesnt_exist")

    state = flow.run()
    assert state.is_failed()

    failure_message = str(state.result[shell_result].message)
    assert "Command failed with exit code" in failure_message
) @task def curl_cmd(url: str, file: str) -> str: """ The curl command we wish to execute. """ prefect.context.get("logger") if Path(file).exists(): raise SKIP("Image data file already exists.") return f"curl -fL -o {file} {url}" download: Runable = ShellTask(name="curl_task", max_retries=2, retry_delay=datetime.timedelta(seconds=10)) @task(skip_on_upstream_skip=False) def load_and_split(fname: str) -> list: """ Loads image data file at `fname` and splits it into multiple frames. Returns a list of bytes, one element for each frame. """ prefect.context.get("logger") with open(fname, "rb") as f: imgs = f.read() return [img for img in imgs.split(b"\n" * 4) if img]
def match_ssp(ssp, component_spec):
    """Run ``ssp.py ... match`` for the given inputs and return its output as one string.

    Args:
        ssp: value interpolated as the last argument of the command.
        component_spec: value interpolated after ``--components``.

    Returns:
        The items returned by the shell task joined with newlines.
        NOTE(review): presumably ``return_all=True`` yields one item per output
        line -- confirm against the ShellTask documentation.
    """
    command = f"python ssp.py --reader json-l match --components {component_spec} {ssp}"
    shell_task = ShellTask(command=command, return_all=True)
    output_lines = shell_task.run()
    return "\n".join(output_lines)
#!/usr/bin/env python # # Requires: # conda create -n prefect -c conda-forge prefect # from prefect import Flow, Parameter, task, unmapped from prefect.executors import LocalDaskExecutor from prefect.tasks.shell import ShellTask from prefect.utilities.debug import raise_on_exception from omero.cli import cli_login from omero.gateway import BlitzGateway name = Parameter("name") shell = ShellTask(return_all=True, log_stderr=True) COMMAND = "/opt/omero/server/OMERO.server/bin/omero" @task def render(object, name): return (f"{COMMAND} render set {object} " f"/uod/idr/metadata/idr0072-schormann-subcellref/" f"{name}/idr0072-{name}-render.yml") @task def list_children(name, ignore): with cli_login() as cli: conn = BlitzGateway(client_obj=cli.get_client())
start_date=pendulum.datetime(2020, 4, 22, 17, 30, tz="America/Toronto"), interval=timedelta(days=1) )], # but only on weekdays filters=[filters.is_weekday], # and not in January TODO: Add TSX Holidays not_filters=[filters.between_dates(1, 1, 1, 31)] ) #tsx_imb_fl.schedule = schedule ############## Storage ecr docker flow ############## dkr_ecr_scrt = PrefectSecret("docker_ecr_login").run() get_ecr_auth_token = ShellTask(helper_script="cd ~") ecr_auth_token = get_ecr_auth_token.run(command=dkr_ecr_scrt) ecr_client = boto3.client('ecr', region_name=aws_region) ecr_token = ecr_client.get_authorization_token() # # Decode the aws token username, password = base64.b64decode(ecr_token['authorizationData'][0]['authorizationToken']).decode().split(':') ecr_url = ecr_token['authorizationData'][0]['proxyEndpoint'] ############################################################ # # # Registry URL for prefect or docker push ecr_repo_name = f"{ecr_url.replace('https://', '')}"#/{aws_ecr_repo_name}" #:latest"
# import pandas as pd
from prefect import task, Flow
from prefect.tasks.shell import ShellTask


@task()
def get_dataframe():
    """Read ``./top2000.xlsx`` and return it as a pandas DataFrame."""
    # pandas is imported here so the task does not depend on a module-level
    # import being active (the one at the top of the file is commented out).
    import pandas as pd

    return pd.read_excel('./top2000.xlsx')


my_task = ShellTask()

with Flow("shell") as f:
    # Convert the workbook to CSV, keep a copy on disk, then list the directory.
    output = my_task(command="in2csv top2000.xlsx | tee top2000.csv | ls")

flow_state = f.run()

# Look up the state of the shell task and print what it returned.
shell_output = flow_state.result[output].result
print(shell_output)
def test_shell_task_env_can_be_set_at_init():
    """Environment variables given to the constructor are visible to the command."""
    task_with_env = ShellTask(env={"MYTESTVAR": "test"})

    with Flow(name="test") as flow:
        shell_result = task_with_env(command="echo -n $MYTESTVAR")

    state = flow.run()
    assert state.is_successful()
    assert state.result[shell_result].result == "test"
def test_shell_task_accepts_env():
    """Environment variables passed at call time are visible to the command."""
    call_env = {"MYTESTVAR": "test"}

    with Flow(name="test") as flow:
        shell_result = ShellTask()(command="echo -n $MYTESTVAR", env=call_env)

    state = flow.run()
    assert state.is_successful()
    assert state.result[shell_result].result == "test"
import prefect
from prefect import task, Flow
from prefect.tasks.shell import ShellTask
from prefect.tasks.templates.strings import StringFormatter

# Template for the spark-submit invocation; the trailing backslash joins the
# two lines into one command.
# NOTE(review): {yesterday} is not passed explicitly below -- presumably it is
# filled in from the Prefect run context; confirm.
spark_submit_command = StringFormatter(
    template='''
    spark-submit --packages org.apache.hadoop:hadoop-aws:3.2.0 \
    /opt/scripts/warehouse.py {yesterday}
'''
)

# Shell runner that logs stderr and returns every output line.
bash = ShellTask(log_stderr=True, return_all=True)

with Flow('warehouse') as flow:
    rendered_command = spark_submit_command()
    bash(command=rendered_command)


if __name__ == '__main__':
    flow.run()
def test_shell_log_stream_type_error_on_invalid_log_level_string(caplog):
    """Constructing a ShellTask with an unknown stream_output level string raises TypeError."""
    with pytest.raises(TypeError), raise_on_exception(), Flow(name="test"):
        ShellTask(stream_output="FOO")
from prefect import task, Flow, Parameter
from prefect.tasks.shell import ShellTask
from datetime import timedelta

# Shared shell runner: every command is executed after `cd ./scripts`.
run_script = ShellTask(helper_script="cd ./scripts",
                       cache_for=timedelta(days=1),
                       return_all=True,
                       log_stdout=True)


@task
def dated_command(script, run_date):
    """Return the shell command that runs `script` with a `--date` argument.

    The command is built inside a task so that `run_date` is the parameter's
    run-time value. Formatting the `Parameter` object in an f-string while the
    flow is being defined would embed the object's repr in the command instead.
    """
    return f"python {script} --date={run_date}"


with Flow("covid_update_data") as f:
    run_date = Parameter(name='run_date')

    # update_jhu = run_script(command="python 0_prepare_data_jhu.py")
    get_apple = run_script(command="python 1_apple_download_report.py")
    update_apple = run_script(command="python 2_prepare_data_apple.py")
    get_rki = run_script(
        command=dated_command("3_rki_report_download.py", run_date))
    parse_rki = run_script(
        command=dated_command("4_rki_report_parse.py", run_date))
    # update_rki = run_script(command="python 5_prepare_data_rki.py")

    # Each preparation/parsing step runs only after its download step.
    update_apple.set_upstream(get_apple)
    parse_rki.set_upstream(get_rki)
    # update_rki.set_upstream(parse_rki)

f.run(parameters={"run_date": "2020-06-14"})
# f.visualize()
from datetime import timedelta

from prefect import Flow, task
from prefect.triggers import any_failed
from prefect.tasks.shell import ShellTask
from prefect.engine.executors import LocalDaskExecutor
from prefect.environments import LocalEnvironment


@task(max_retries=3, retry_delay=timedelta(seconds=0))
def extract_phizz():
    """Return the list of psql commands that the shell task is mapped over."""
    return [
        # schema output
        # Raw string: the backslash in \COPY must reach psql unchanged, and in
        # a regular string literal "\C" is an invalid escape sequence.
        r'''psql "dbname='db' user='******' password='******' host='postgres.rds.amazonaws.com'" -c "\COPY (select row_to_json(t) from public.information_schema.columns as t ) to '~/s3_bucket/extractload/schema.json';"''',
    ]


# Every command is run after changing into the scripts directory.
getdata = ShellTask(name='shell task', helper_script='cd /home/ubuntu/prefect_scripts')

with Flow("Schema Extract") as flow:
    # One mapped shell run per command returned by extract_phizz.
    phizz_data_extract = getdata.map(extract_phizz)

flow.environment = LocalEnvironment(
    labels=[],
    executor=LocalDaskExecutor(scheduler="threads", num_workers=50),
)

# to run locally use flow.run()
#flow.run()

# to register to the prefect server use flow.register
# this assumes you have a project named ExtractLoad
flow.register(project_name="ExtractLoad")
from datetime import datetime, timedelta import prefect from prefect import Flow, Parameter, task from prefect.schedules import IntervalSchedule from prefect.tasks.shell import ShellTask from prefect.tasks.templates.jinja2 import JinjaTemplate ## default config settings such as this can generally be set in your ## user config file retry_delay = timedelta(minutes=5) ## create all relevant tasks t1 = ShellTask( name="print_date", command="date", max_retries=1, retry_delay=retry_delay ) t2 = ShellTask(name="sleep", command="sleep 5", max_retries=3, retry_delay=retry_delay) @task(max_retries=1, retry_delay=retry_delay) def add_7(): date = prefect.context.get("scheduled_start_time", datetime.utcnow()) return date + timedelta(days=7) ## templated command; template vars will be read from both prefect.context as well as ## any passed kwargs to the task command = """ {% for i in range(5) %} echo "{{ scheduled_start_time }}"
## - place into database @task(name="Format Command") def cmd(last_date): """ Based on the last available date in the database, creates the appropriate journalctl command to collect all sshd logs since the last seen date. """ if not last_date: since = pendulum.now("utc").add( hours=-48).strftime("%Y-%m-%d %H:%M:%S") else: since = last_date[-1][0] return f'journalctl _COMM=sshd -o json --since "{since}" --no-pager' shell_task = ShellTask(name="Extract", return_all=True) @task(name="Transform") def transform(raw_data): """ Takes the raw data returned from the journalctl command and filters / parses it down into a database-ready collection of rows. """ data = [json.loads(line) for line in raw_data] rows = [] user_patt = re.compile("user (.*?) from") network_patt = re.compile("from (.*?) port (.*?)$") db_path = os.path.expanduser("~/GeoLite/GeoLite2-City.mmdb")
retry_delay=timedelta(minutes=1), nout=2, trigger=triggers.all_finished, ) def create_parquet(_success): ts = prefect.context.scheduled_start_time dt_str = pd.to_datetime(ts).strftime("%Y-%m-%dT%H") vintage_fn = FN_STR.format(dt_str) + ".parquet" fn = FN_STR.format("") + ".parquet" df = pd.read_csv(CSV_FN, parse_dates=["dt"]) df.to_parquet(DATA_PATH / vintage_fn, index=False) df.to_parquet(DATA_PATH / fn, index=False) return vintage_fn, fn @task def get_gcs_cmd(fn): return f"gsutil acl ch -u AllUsers:R gs://can-scrape-outputs/final/{fn}" shell = ShellTask() with Flow("UpdateParquetFiles", CronSchedule("10 */2 * * *")) as f: connstr = EnvVarSecret("COVID_DB_CONN_URI") success = export_to_csv(connstr) vintage_fn, fn = create_parquet(success) shell(get_gcs_cmd(vintage_fn)) shell(get_gcs_cmd(fn)) f.register(project_name="can-scrape")
def test_shell_initializes_with_basic_cmd():
    """A command supplied to the constructor runs when the task is called with no arguments."""
    echo_task = ShellTask(command="echo -n 'hello world'")

    with Flow(name="test") as flow:
        shell_result = echo_task()

    state = flow.run()
    assert state.is_successful()
    assert state.result[shell_result].result == "hello world"
params=params) return response.text @task def load_file(filename: str) -> str: with open(filename, "r", encoding="utf-8") as file: return file.read() @task def printa(stuff): print(stuff) task = ShellTask(return_all=True) with Flow("shell") as f: translation_server_url = "http://localhost:1969" bibtex = load_file("./workspace/aksw-short.bib") zotero = import_translation(bibtex, translation_server_url) rdf = export_translation(zotero, translation_server_url, "rdf_bibliontology") turtle = task(command="rapper - -o turtle -I www.test.com > tests.ttl") printa(turtle) f.run_config = DockerRun(image="prefecthq/prefect") f.register(project_name="tutoriala") # Configure extra environment variables for this flow, # and set a custom image # f.run()
def test_shell_returns_none_if_empty_output():
    """A command that prints nothing produces a result of None."""
    with Flow(name="test") as flow:
        # Output is redirected away so the task sees no stdout at all.
        shell_result = ShellTask()(command="ls > /dev/null")

    state = flow.run()
    assert state.is_successful()
    assert state.result[shell_result].result is None
jobs = [] for k, v in data.items(): jobs.append( bq.load_table_from_dataframe( dataframe=v, destination=".".join(["cbs", k]), project=GCP.project, job_config=job_config, location=GCP.location, )) return jobs gcp = Parameter("gcp", required=True) filepath = Parameter("filepath", required=True) curl_download = ShellTask(name="curl_download") with Flow("CBS regionaal") as flow: # # TODO: fix UnicodeDecodeError when writing to Google Drive curl_command = curl_cmd(URL_PC6HUISNR, filepath) # curl_download = curl_download(command=curl_command) # gwb = pc6huisnr_to_gbq(zipfile=filepath, GCP=gcp, upstream_tasks=[curl_download]) regionaal = cbsodatav3_to_gbq.map( id=ODATA_REGIONAAL, GCP=unmapped(gcp), task_args={'skip_on_upstream_skip': False}) regionaal_column_description = column_descriptions.map( table_id=ODATA_REGIONAAL, GCP=unmapped(gcp), upstream_tasks=[regionaal])
def test_shell_raises_if_no_command_provided():
    """Running a ShellTask that was never given a command raises TypeError."""
    with Flow(name="test") as flow:
        ShellTask()()

    # raise_on_exception is active during the run so the TypeError reaches pytest.
    with pytest.raises(TypeError), raise_on_exception():
        assert flow.run()
"""
A quick demo of three little shell tasks
"""

import sys
from pathlib import Path

from prefect import Flow, task
from prefect.schedules import Schedule
from prefect.schedules.clocks import CronClock
from prefect.tasks.shell import ShellTask

shelltask = ShellTask()


@task(log_stdout=True)
def show_file():
    """Print the contents of /tmp/flow.me."""
    # Path objects are not used as context managers here: that support is
    # deprecated and removed in Python 3.13. read_text() opens and closes the
    # file on its own.
    print(Path("/tmp/flow.me").read_text())


with Flow("three_little_tasks_flow") as flow:
    t1 = shelltask(command="echo '====== start' >> /tmp/flow.me")
    t2 = shelltask(command="date >> /tmp/flow.me; sleep 3")
    t3 = shelltask(command="echo '====== stop' >> /tmp/flow.me")

    # Chain the tasks so they run strictly in order: t1 -> t2 -> t3 -> show_file.
    t1.set_downstream(t2)
    t2.set_downstream(t3)
    t3.set_downstream(show_file)

if __name__ == "__main__":
    cmd = "run"
#
# Assumes you have the SnowSQL CLI installed
# Assumes you have set up the user config
#
import prefect
from prefect import task, Flow
from prefect.tasks.shell import ShellTask

with Flow("SnowSQL") as flow:
    # Single shell task that asks Snowflake for the current timestamp.
    timestamp_task = ShellTask(name='what time is it')
    data_load_date = timestamp_task(
        command='snowsql -d dw -s public -q "select current_timestamp()"'
    )

# to run locally use flow.run()
#flow.run()

# to register with the prefect server use flow.register
# this assumes you have a project named ExtractLoad
flow.register(project_name="ExtractLoad")
from prefect import Task, Flow
from prefect.tasks.shell import ShellTask


class ShowOutput(Task):
    """Task that prints the value it receives as ``std_out``."""

    def run(self, std_out):
        print(std_out)


flow = Flow("list_files")

# First pair: list the directory and print the listing.
ls_task = ShellTask(command="ls", return_all=True)
show_output = ShowOutput()
show_output.set_upstream(ls_task, key="std_out", flow=flow)

# Second pair: count the directory entries and print the count.
ls_count = ShellTask(command="ls | wc -l", return_all=True)
show_output2 = ShowOutput()
show_output2.set_upstream(ls_count, key="std_out", flow=flow)

flow.run()