def test_cross_downstream(self):
    """Check that cross_downstream links every source task to every target task."""
    dag = DAG(dag_id="test_dag", start_date=datetime.now())

    # Create t1..t6 in order; the first three act as sources, the rest as targets.
    operators = [DummyOperator(task_id=f"t{index}", dag=dag) for index in range(1, 7)]
    sources, targets = operators[:3], operators[3:]

    cross_downstream(from_tasks=sources, to_tasks=targets)

    # Every source must have exactly the target tasks as its direct downstream.
    for source in sources:
        downstream = source.get_direct_relatives(upstream=False)
        self.assertCountEqual(downstream, targets)
# Snippet contents, in order:
#   1. `f.write('my_data')` - last statement of a helper whose header is not
#      visible here (presumably `_downloading_data`, which the DAG below uses
#      as a python_callable - confirm against the full file).
#   2. `_checking_data` - prints "checking data".
#   3. DAG 'sample_af2_dag' (daily schedule, catchup disabled): two
#      PythonOperators, one FileSensor and one BashOperator. cross_downstream
#      makes both downloading_data and checking_data upstream of both
#      waiting_for_data and processing_data.
f.write('my_data') def _checking_data(): print("checking data") with DAG(dag_id='sample_af2_dag', default_args=default_args, schedule_interval='@daily', catchup=False, start_date=datetime(2021, 1, 1)) as dag: downloading_data = PythonOperator( task_id='downloading_data', python_callable=_downloading_data ) checking_data = PythonOperator( task_id='checking_data', python_callable=_checking_data ) waiting_for_data = FileSensor( task_id='waiting_for_data', fs_conn_id='fs_default', filepath='my_files.txt' ) processing_data = BashOperator( task_id='processing_data', bash_command='exit 0' ) cross_downstream([downloading_data, checking_data], [waiting_for_data,processing_data])
# Operators that differ only in trigger rule, so each reacts to a different
# combination of upstream outcomes.
task_all_failed = DummyOperator(
    dag=dag,
    task_id='task_all_failed',
    trigger_rule='all_failed',
)
task_none_failed = DummyOperator(
    dag=dag,
    task_id='task_none_failed',
    trigger_rule='none_failed',
)
task_none_failed_or_skipped = DummyOperator(
    dag=dag,
    task_id='task_none_failed_or_skipped',
    trigger_rule='none_failed_or_skipped',
)
task_one_failed = DummyOperator(
    dag=dag,
    task_id='task_one_failed',
    trigger_rule='one_failed',
)

# Make every outcome-producing task an upstream of every trigger-rule task.
cross_downstream(
    from_tasks=[
        task_all_success,
        task_skipped,
        task_failed,
        task_one_success,
    ],
    to_tasks=[
        task_none_failed,
        task_none_failed_or_skipped,
        task_all_failed,
        task_one_failed,
    ],
)

task_all_done = DummyOperator(
    dag=dag,
    task_id='task_all_done',
    trigger_rule='all_done',
)

task_start >> task_list >> case_group >> task_all_done
def create_dag(dag_id, schedule, window, default_args):
    """Build the bealign sliding-window selection DAG for one window.

    The DAG creates the window's output directory, exports metadata and
    sequences, and then, for every gene listed in the region config, runs
    alignment export -> duplicate detection -> compression/filtering ->
    tree inference -> SLAC/FEL/MEME -> annotation copy -> gene summary.

    Args:
        dag_id: Identifier passed straight to ``DAG``.
        schedule: Value used as the DAG's ``schedule_interval``.
        window: Iterable of strings; joined with ``_`` to name the output
            directory under ``data/sliding-windows-bealign``.
        default_args: Airflow default args. Must contain a ``"params"`` dict
            providing at least ``python``. ``region_cfg`` (path to a YAML
            file keyed by gene) is read through ``dag.params`` - presumably
            it is supplied via ``default_args["params"]`` as well.

    Returns:
        The constructed ``DAG``.

    Note:
        ``default_args["params"]`` is mutated in place (output paths are
        written into it) and the same dict object is handed to several
        tasks as ``config``.
    """
    with DAG(
        dag_id,
        default_args=default_args,
        description='creates sliding windows based on months',
        schedule_interval=schedule,
        start_date=datetime.datetime(2021, 4, 30),
        on_failure_callback=dag_fail_slack_alert,
        on_success_callback=dag_success_slack_alert,
        tags=['selection', 'sliding'],
    ) as dag:
        # NOTE(review): this aliases the caller's dict. If one default_args is
        # reused for several windows, every DAG's ``config`` ends up pointing
        # at the values written last - confirm this sharing is intended.
        params = default_args["params"]

        output_dir = (
            WORKING_DIR + "/data/sliding-windows-bealign/" + '_'.join(window)
        )
        params["output-dir"] = output_dir
        params["meta-output"] = output_dir + '/master-no-sequences.json'
        params["sequence-output"] = output_dir + '/sequences'

        with open(dag.params["region_cfg"], 'r') as stream:
            regions = yaml.safe_load(stream)

        mk_dir_task = BashOperator(
            task_id='make_directory',
            bash_command='mkdir -p {{params.output}}',
            params={'output': params['output-dir']},
            dag=dag,
        )

        export_meta_task = PythonOperator(
            task_id='export_meta',
            python_callable=export_meta,
            op_kwargs={"config": params},
            pool='mongo',
            dag=dag,
        )
        export_meta_task.set_upstream(mk_dir_task)

        export_sequences_task = PythonOperator(
            task_id='export_sequences',
            python_callable=export_sequences,
            op_kwargs={"config": params},
            pool='mongo',
            dag=dag,
        )
        export_sequences_task.set_upstream(mk_dir_task)

        # Final (summary) task of every per-gene pipeline.
        export_by_gene = []

        for gene, region in regions.items():
            filepath_prefix = output_dir + '/sequences.' + gene

            nuc_sequence_output = filepath_prefix + '_nuc.fas'
            uniques_fn = filepath_prefix + '_nuc.uniques.fas'
            duplicate_output = filepath_prefix + '.duplicates.json'
            variants_csv_output = filepath_prefix + '.variants.csv'
            variants_json_output = filepath_prefix + '.variants.json'
            filtered_fasta_output = filepath_prefix + '.compressed.filtered.fas'
            filtered_json_output = filepath_prefix + '.filtered.json'
            output_edits_fn = filepath_prefix + '.filtered.edits.json'
            compressor_duplicate_out = (
                filepath_prefix + '.duplicates.variants.json'
            )
            tree_output = (
                filepath_prefix + '.compressed.filtered.fas.rapidnj.bestTree'
            )
            sto_output = filepath_prefix + '.compressed.filtered.sto'
            slac_output_fn = filepath_prefix + '.SLAC.json'
            fel_output_fn = filepath_prefix + '.FEL.json'
            meme_output_fn = filepath_prefix + '.MEME.json'
            summary_output_fn = filepath_prefix + '.json'
            annotation_file = filepath_prefix + '.annotation.json'

            # Overwritten on every iteration; see the NOTE on ``params`` above.
            params["nuc-sequence-output"] = nuc_sequence_output
            params["duplicate-output"] = duplicate_output

            with TaskGroup(f"alignment_{gene}") as alignment:
                export_bealign_task = PythonOperator(
                    task_id='export_bealign',
                    python_callable=export_bealign_sequences,
                    op_kwargs={
                        "config": params,
                        'nuc_output_fn': nuc_sequence_output,
                        'gene': gene,
                    },
                    dag=dag,
                )

                # Occasional errors when cleaning up tmp files, so or'ing true
                cleanup_task = BashOperator(
                    task_id='cleanup',
                    bash_command=(
                        "sed -i '/^>/! s/[^ACTG-]/N/g' $NUC_OUTPUT_FN || true"
                    ),
                    env={'NUC_OUTPUT_FN': nuc_sequence_output, **os.environ},
                    dag=dag,
                )

                export_bealign_task >> cleanup_task

            with TaskGroup(f"duplicates_{gene}") as duplicates_group:
                # Single task; it belongs to the group simply by being
                # created inside the ``with`` block.
                PythonOperator(
                    task_id='write_raw_duplicates',
                    python_callable=write_nuc_raw_duplicates,
                    op_kwargs={
                        "input": nuc_sequence_output,
                        "duplicate_output": duplicate_output,
                        'uniques_output': uniques_fn,
                    },
                    dag=dag,
                )

            # Airflow port of the legacy shell step:
            # $HYPHY LIBPATH=$HYPHYLIBPATH $COMPRESSOR
            #   --msa ${FILE}.${GENE}.compressed.fas --regexp "epi_isl_([0-9]+)"
            #   --duplicates ${FILE}.${GENE}.duplicates.json
            #   --output ${FILE}.${GENE}.variants.csv
            #   --json ${FILE}.${GENE}.variants.json
            #   --duplicate-out ${FILE}.${GENE}.duplicates.variants.json
            # (named ``filter_group`` so the ``filter`` builtin is not shadowed)
            with TaskGroup(f"filter_{gene}") as filter_group:
                compressor_cmd = (
                    ' {{ params.hyphy }} LIBPATH={{params.hyphy_lib_path}}'
                    ' {{ params.compressor }} --msa $FASTA_FN'
                    ' --regexp "epi_isl_([0-9]+)" --duplicates $DUPLICATE_FN'
                    ' --output $VARIANTS_CSV_FN --json $VARIANTS_JSON_FN'
                    ' --duplicate-out $COMPRESSOR_DUPLICATE_OUT '
                )
                compressor_task = BashOperator(
                    task_id='compressor',
                    bash_command=compressor_cmd,
                    env={
                        'FASTA_FN': uniques_fn,
                        'DUPLICATE_FN': duplicate_output,
                        'VARIANTS_CSV_FN': variants_csv_output,
                        'VARIANTS_JSON_FN': variants_json_output,
                        'COMPRESSOR_DUPLICATE_OUT': compressor_duplicate_out,
                        **os.environ,
                    },
                    dag=dag,
                )

                # --output-edits ${FILE}.${GENE}.filtered.edits.json
                compressor2_cmd = (
                    ' {{ params.hyphy }} LIBPATH={{params.hyphy_lib_path}}'
                    ' {{ params.compressor2 }} --msa $FASTA_FN'
                    ' --duplicates $DUPLICATE_FN --csv $VARIANTS_CSV_FN'
                    ' --byseq $VARIANTS_JSON_FN --p 0.95'
                    ' --output $FILTERED_FASTA_FN --json $FILTERED_JSON_FN'
                    ' --output-edits ${OUTPUT_EDITS} '
                )
                compressor_two_task = BashOperator(
                    task_id='compressor_two',
                    bash_command=compressor2_cmd,
                    env={
                        'FASTA_FN': uniques_fn,
                        # Second pass consumes the duplicates written by the first.
                        'DUPLICATE_FN': compressor_duplicate_out,
                        'VARIANTS_CSV_FN': variants_csv_output,
                        'VARIANTS_JSON_FN': variants_json_output,
                        'FILTERED_FASTA_FN': filtered_fasta_output,
                        'FILTERED_JSON_FN': filtered_json_output,
                        'OUTPUT_EDITS': output_edits_fn,
                        **os.environ,
                    },
                    dag=dag,
                )

                compressor_task >> compressor_two_task

            # One command per line: on a single line the ``sed`` words would be
            # passed to rapidnj as extra arguments instead of running as a
            # separate command.
            infer_tree_cmd = """
            seqmagick convert $FILTERED_FASTA_FN $STO_OUTPUT;
            rapidnj $STO_OUTPUT -i sth > $TREE_OUTPUT
            sed -i "s/'//g" $TREE_OUTPUT;
            """
            infer_tree_task = BashOperator(
                task_id=f'infer_tree_{gene}',
                bash_command=infer_tree_cmd,
                env={
                    'FILTERED_FASTA_FN': filtered_fasta_output,
                    'STO_OUTPUT': sto_output,
                    'TREE_OUTPUT': tree_output,
                    **os.environ,
                },
                dag=dag,
            )

            slac_task = BashOperator(
                task_id=f'slac_{gene}',
                bash_command=(
                    "{{ params.hyphy }} LIBPATH={{params.hyphy_lib_path}} slac"
                    " --kill-zero-lengths Constrain"
                    " ENV='_DO_TREE_REBALANCE_=1'"
                    " --alignment $FILTERED_FASTA_FN --tree $TREE_OUTPUT"
                    " --branches All --samples 0 --output $SLAC_OUTPUT"
                ),
                env={
                    'FILTERED_FASTA_FN': filtered_fasta_output,
                    'TREE_OUTPUT': tree_output,
                    'SLAC_OUTPUT': slac_output_fn,
                    **os.environ,
                },
                dag=dag,
            )

            # Expanded unquoted in the shell, so it splits into two arguments.
            big_data_flags = '--full-model No'

            fel_task = BashOperator(
                task_id=f'fel_{gene}',
                bash_command=(
                    "{{ params.hyphy }} LIBPATH={{params.hyphy_lib_path}} fel"
                    " --kill-zero-lengths Constrain"
                    " ENV='_DO_TREE_REBALANCE_=1' $BIG_DATA_FLAGS"
                    " --alignment $FILTERED_FASTA_FN --tree $TREE_OUTPUT"
                    " --branches Internal --output $FEL_OUTPUT"
                ),
                env={
                    'FILTERED_FASTA_FN': filtered_fasta_output,
                    'TREE_OUTPUT': tree_output,
                    'FEL_OUTPUT': fel_output_fn,
                    'BIG_DATA_FLAGS': big_data_flags,
                    **os.environ,
                },
                dag=dag,
            )

            meme_task = BashOperator(
                task_id=f'meme_{gene}',
                bash_command=(
                    "{{ params.hyphy }} LIBPATH={{params.hyphy_lib_path}} meme"
                    " --kill-zero-lengths Constrain"
                    " ENV='_DO_TREE_REBALANCE_=1' $BIG_DATA_FLAGS"
                    " --alignment $FILTERED_FASTA_FN --tree $TREE_OUTPUT"
                    " --branches Internal --output $MEME_OUTPUT"
                ),
                env={
                    'FILTERED_FASTA_FN': filtered_fasta_output,
                    'TREE_OUTPUT': tree_output,
                    'MEME_OUTPUT': meme_output_fn,
                    'BIG_DATA_FLAGS': big_data_flags,
                    **os.environ,
                },
                dag=dag,
            )

            copy_annotation_task = BashOperator(
                task_id=f'copy_annotation_{gene}',
                bash_command=(
                    'cp {{params.working_dir}}/data/comparative-annotation.json'
                    ' {{params.annotation_file}}'
                ),
                params={
                    'annotation_file': annotation_file,
                    'working_dir': WORKING_DIR,
                },
                dag=dag,
            )

            summarize_gene_task = BashOperator(
                task_id=f'summarize_gene_{gene}',
                bash_command=(
                    '{{ params.python }}'
                    ' {{params.working_dir}}/python/summarize_gene.py'
                    ' -T {{params.working_dir}}/data/ctl/epitopes.json'
                    ' -B {{params.working_dir}}/data/single_mut_effects.csv'
                    ' -D $MASTERNOFASTA -d $DUPLICATES -s $SLAC_OUTPUT'
                    ' -f $FEL_OUTPUT -m $MEME_OUTPUT -P 0.1'
                    ' --output $SUMMARY_OUTPUT -c $COMPRESSED_OUTPUT_FN'
                    ' -E {{params.working_dir}}/data/evo_annotation.json'
                    ' -A {{params.working_dir}}/data/mafs.csv'
                    ' -V {{params.working_dir}}/data/evo_freqs.csv'
                    ' -F $FRAGMENT --frame_shift $ADDSHIFT'
                    ' --fragment_shift $SHIFT -S $OFFSET -O $ANNOTATION'
                ),
                params={
                    'python': params['python'],
                    'working_dir': WORKING_DIR,
                },
                env={
                    'MASTERNOFASTA': params["meta-output"],
                    'DUPLICATES': duplicate_output,
                    'SLAC_OUTPUT': slac_output_fn,
                    'FEL_OUTPUT': fel_output_fn,
                    'MEME_OUTPUT': meme_output_fn,
                    'SUMMARY_OUTPUT': summary_output_fn,
                    'COMPRESSED_OUTPUT_FN': filtered_fasta_output,
                    'FRAGMENT': str(region['fragment']),
                    'ADDSHIFT': str(region['add_one']),
                    'SHIFT': str(region['shift']),
                    'OFFSET': str(region['offset']),
                    'ANNOTATION': annotation_file,
                    **os.environ,
                },
                dag=dag,
            )

            summarize_gene_task.set_upstream(export_meta_task)
            alignment.set_upstream(export_sequences_task)

            (
                alignment
                >> duplicates_group
                >> filter_group
                >> infer_tree_task
                >> [slac_task, fel_task, meme_task]
                >> copy_annotation_task
                >> summarize_gene_task
            )
            # A ``>>`` chain evaluates to its right-most operand, so this is
            # the task the chain expression yielded when it was passed to
            # ``append`` directly.
            export_by_gene.append(summarize_gene_task)

        dag.doc_md = __doc__

        # Add export meta and export sequence tasks to be executed in parallel
        cross_downstream(
            [export_meta_task, export_sequences_task], export_by_gene
        )

    return dag
env={ 'MASTERNOFASTA': default_args["params"]["meta-output"], 'DUPLICATES': duplicate_output, 'SLAC_OUTPUT': slac_output_fn, 'FEL_OUTPUT': fel_output_fn, 'MEME_OUTPUT': meme_output_fn, 'SUMMARY_OUTPUT': summary_output_fn, 'COMPRESSED_OUTPUT_FN': filtered_fasta_output, 'FRAGMENT': str(regions[gene]['fragment']), 'ADDSHIFT': str(regions[gene]['add_one']), 'SHIFT': str(regions[gene]['shift']), 'OFFSET': str(regions[gene]['offset']), 'ANNOTATION': annotation_file, **os.environ }, dag=dag, ) summarize_gene_task.set_upstream(export_meta_task) alignment.set_upstream(export_sequences_task) export_by_gene.append( alignment >> duplicates_group >> filter >> infer_tree_task >> [ slac_task, fel_task, meme_task ] >> copy_annotation_task >> summarize_gene_task) dag.doc_md = __doc__ # Add export meta and export sequence tasks to be executed in parallel # cross_downstream([export_meta_task, export_sequences_task], export_by_gene) cross_downstream([export_meta_task, export_sequences_task], export_by_gene)
def create_dag(dag_id, schedule, window, default_args):
    """Build the sliding-window selection DAG (MAFFT protein alignment) for one window.

    The DAG creates the window's output directory, exports metadata and
    sequences, and then, for every gene listed in the region config, runs
    protein alignment with MAFFT and back-translation -> duplicate merging ->
    compression/filtering -> tree inference -> SLAC/FEL/MEME -> annotation
    copy -> gene summary.

    Args:
        dag_id: Identifier passed straight to ``DAG``.
        schedule: Value used as the DAG's ``schedule_interval``.
        window: Iterable of strings; joined with ``_`` to name the output
            directory under ``data/sliding-windows``.
        default_args: Airflow default args. Must contain a ``"params"`` dict
            providing at least ``python`` and ``mafft``. ``region_cfg`` (path
            to a YAML file keyed by gene) is read through ``dag.params`` -
            presumably it is supplied via ``default_args["params"]`` as well.

    Returns:
        The constructed ``DAG``.

    Note:
        ``default_args["params"]`` is mutated in place (output paths are
        written into it) and the same dict object is handed to several
        tasks as ``config``.
    """
    with DAG(
        dag_id,
        default_args=default_args,
        description='creates sliding windows based on months',
        schedule_interval=schedule,
        start_date=datetime.datetime(2021, 4, 30),
        on_failure_callback=task_fail_slack_alert,
        on_success_callback=task_success_slack_alert,
        tags=['selection', 'sliding'],
    ) as dag:
        # NOTE(review): this aliases the caller's dict. If one default_args is
        # reused for several windows, every DAG's ``config`` ends up pointing
        # at the values written last - confirm this sharing is intended.
        params = default_args["params"]

        output_dir = WORKING_DIR + "/data/sliding-windows/" + '_'.join(window)
        params["output-dir"] = output_dir
        params["meta-output"] = output_dir + '/master-no-sequences.json'
        params["sequence-output"] = output_dir + '/sequences'

        with open(dag.params["region_cfg"], 'r') as stream:
            regions = yaml.safe_load(stream)

        mk_dir_task = BashOperator(
            task_id='make_directory',
            bash_command='mkdir -p {{params.output}}',
            params={'output': params['output-dir']},
            dag=dag,
        )

        export_meta_task = PythonOperator(
            task_id='export_meta',
            python_callable=export_meta,
            op_kwargs={"config": params},
            pool='mongo',
            dag=dag,
        )
        export_meta_task.set_upstream(mk_dir_task)

        export_sequences_task = PythonOperator(
            task_id='export_sequences',
            python_callable=export_sequences,
            op_kwargs={"config": params},
            pool='mongo',
            dag=dag,
        )
        export_sequences_task.set_upstream(mk_dir_task)

        # Final (summary) task of every per-gene pipeline.
        export_by_gene = []

        for gene, region in regions.items():
            # os.path.join supplies the separator that plain concatenation
            # omitted; every other path here joins WORKING_DIR with a leading
            # '/'. The result is unchanged if WORKING_DIR already ends in '/'.
            reference_filepath = os.path.join(
                WORKING_DIR,
                'reference_genes',
                'reference.' + gene + '_protein.fas',
            )
            filepath_prefix = output_dir + '/sequences.' + gene

            nuc_sequence_output = filepath_prefix + '_nuc.fas'
            prot_sequence_output = filepath_prefix + '_protein.fas'
            initial_duplicate_output = (
                filepath_prefix + '.initial.duplicates.json'
            )
            protein_duplicate_output = (
                filepath_prefix + '.protein.duplicates.json'
            )
            duplicate_output = filepath_prefix + '.duplicates.json'
            map_output = filepath_prefix + '.map.json'
            variants_csv_output = filepath_prefix + '.variants.csv'
            variants_json_output = filepath_prefix + '.variants.json'
            filtered_fasta_output = filepath_prefix + '.compressed.filtered.fas'
            filtered_json_output = filepath_prefix + '.filtered.json'
            output_edits_fn = filepath_prefix + '.filtered.edits.json'
            compressed_output_filepath = filepath_prefix + '.compressed.fas'
            compressor_duplicate_out = (
                filepath_prefix + '.duplicates.variants.json'
            )
            tree_output = (
                filepath_prefix + '.compressed.filtered.fas.rapidnj.bestTree'
            )
            sto_output = filepath_prefix + '.compressed.filtered.sto'
            tmp_output_fn = filepath_prefix + '.tmp.msa'
            output_fn = filepath_prefix + '.msa'
            slac_output_fn = filepath_prefix + '.SLAC.json'
            fel_output_fn = filepath_prefix + '.FEL.json'
            meme_output_fn = filepath_prefix + '.MEME.json'
            summary_output_fn = filepath_prefix + '.json'
            annotation_file = filepath_prefix + '.annotation.json'

            # Overwritten on every iteration; see the NOTE on ``params`` above.
            params["nuc-sequence-output"] = nuc_sequence_output
            params["prot-sequence-output"] = prot_sequence_output
            params["duplicate-output"] = duplicate_output
            params["protein-duplicate-output"] = protein_duplicate_output
            # Key spelling ("inital") kept as-is: whatever reads this config
            # is not visible from this function.
            params["inital-duplicate-output"] = initial_duplicate_output

            with TaskGroup(f"alignment_{gene}") as alignment:
                export_premsa_sequence_task = PythonOperator(
                    task_id=f'export_premsa_sequences_{gene}',
                    python_callable=export_premsa_sequences,
                    op_kwargs={
                        "config": params,
                        'nuc_output_fn': nuc_sequence_output,
                        'prot_output_fn': prot_sequence_output,
                        'gene': gene,
                    },
                    pool='mongo',
                    dag=dag,
                )

                # Not chained to the other alignment tasks: it only has to
                # finish before the duplicates group starts, which the
                # group-level dependency below guarantees.
                PythonOperator(
                    task_id=f'export_duplicates_{gene}',
                    python_callable=export_duplicates,
                    op_kwargs={
                        'output_fn': initial_duplicate_output,
                        'gene': gene,
                    },
                    pool='mongo',
                    dag=dag,
                )

                mafft_cmd = (
                    ' {{ params.mafft }} --thread -1 --add $INPUT_FN'
                    ' $REFERENCE_FILEPATH >| $TMP_OUTPUT_FN '
                )
                # NOTE(review): unlike the other BashOperators here, this env
                # does not include **os.environ, so the command runs without
                # the worker's environment - confirm that is intended.
                mafft_task = BashOperator(
                    task_id=f'mafft_{gene}',
                    bash_command=mafft_cmd,
                    params={'mafft': params['mafft']},
                    env={
                        'INPUT_FN': prot_sequence_output,
                        'TMP_OUTPUT_FN': tmp_output_fn,
                        'REFERENCE_FILEPATH': reference_filepath,
                    },
                    dag=dag,
                )

                # input_fn, reference_fn, output_fn
                remove_ref_task = PythonOperator(
                    task_id=f'remove_ref_{gene}',
                    python_callable=reserve_only_original_input,
                    op_kwargs={
                        "input_fn": tmp_output_fn,
                        "original_fn": prot_sequence_output,
                        "output_fn": output_fn,
                    },
                    dag=dag,
                )

                post_msa_cmd = (
                    ' {{ params.hyphy }} LIBPATH={{params.hyphy_lib_path}}'
                    ' {{ params.post_msa }} --protein-msa $INPUT_FN'
                    ' --nucleotide-sequences $NUC_INPUT_FN'
                    ' --output $COMPRESSED_OUTPUT_FN'
                    ' --duplicates $DUPLICATE_OUTPUT_FN '
                )
                # Run POST-MSA on concatenated dataset to translate back to
                # nucleotides
                reverse_translate_task = BashOperator(
                    task_id=f'post_msa_{gene}',
                    bash_command=post_msa_cmd,
                    env={
                        'INPUT_FN': output_fn,
                        'NUC_INPUT_FN': nuc_sequence_output,
                        'COMPRESSED_OUTPUT_FN': compressed_output_filepath,
                        'DUPLICATE_OUTPUT_FN': protein_duplicate_output,
                        **os.environ,
                    },
                    dag=dag,
                )

                cleanup_task = BashOperator(
                    task_id=f'cleanup_{gene}',
                    bash_command=(
                        "sed -i '/^>/! s/[^ACTG-]/N/g' $COMPRESSED_OUTPUT_FN"
                    ),
                    env={
                        'COMPRESSED_OUTPUT_FN': compressed_output_filepath,
                        **os.environ,
                    },
                    dag=dag,
                )

                (
                    export_premsa_sequence_task
                    >> mafft_task
                    >> remove_ref_task
                    >> reverse_translate_task
                    >> cleanup_task
                )

            with TaskGroup(f"duplicates_{gene}") as duplicates_group:
                merge_duplicate_task = PythonOperator(
                    task_id=f'merge_duplicates_{gene}',
                    python_callable=merge_duplicates,
                    op_kwargs={
                        'protein_duplicates': protein_duplicate_output,
                        'nuc_duplicates': initial_duplicate_output,
                        'output': duplicate_output,
                    },
                    dag=dag,
                )

                # Fix duplicates
                fix_duplicate_task = PythonOperator(
                    task_id=f'fix_duplicates_{gene}',
                    python_callable=fix_duplicates,
                    op_kwargs={
                        'duplicates': duplicate_output,
                        'map': map_output,
                        'overwrite': True,
                    },
                    dag=dag,
                )

                # Fix header files. Airflow port of the legacy shell step:
                # $PYTHON python/update_fasta_duplicates.py
                #   -f ${FILE}.${GENE}.compressed.fas -m ${FILE}.${GENE}.map.json
                update_fasta_duplicates_task = PythonOperator(
                    task_id=f'update_fasta_duplicates_{gene}',
                    python_callable=update_fasta_duplicates,
                    op_kwargs={
                        'fasta_file': compressed_output_filepath,
                        'map_file': map_output,
                    },
                    dag=dag,
                )

                (
                    merge_duplicate_task
                    >> fix_duplicate_task
                    >> update_fasta_duplicates_task
                )

            # Airflow port of the legacy shell step:
            # $HYPHY LIBPATH=$HYPHYLIBPATH $COMPRESSOR
            #   --msa ${FILE}.${GENE}.compressed.fas --regexp "epi_isl_([0-9]+)"
            #   --duplicates ${FILE}.${GENE}.duplicates.json
            #   --output ${FILE}.${GENE}.variants.csv
            #   --json ${FILE}.${GENE}.variants.json
            #   --duplicate-out ${FILE}.${GENE}.duplicates.variants.json
            # (named ``filter_group`` so the ``filter`` builtin is not shadowed)
            with TaskGroup(f"filter_{gene}") as filter_group:
                compressor_cmd = (
                    ' {{ params.hyphy }} LIBPATH={{params.hyphy_lib_path}}'
                    ' {{ params.compressor }} --msa $COMPRESSED_FN'
                    ' --regexp "epi_isl_([0-9]+)" --duplicates $DUPLICATE_FN'
                    ' --output $VARIANTS_CSV_FN --json $VARIANTS_JSON_FN'
                    ' --duplicate-out $COMPRESSOR_DUPLICATE_OUT '
                )
                compressor_task = BashOperator(
                    task_id=f'compressor_{gene}',
                    bash_command=compressor_cmd,
                    env={
                        'COMPRESSED_FN': compressed_output_filepath,
                        'DUPLICATE_FN': duplicate_output,
                        'VARIANTS_CSV_FN': variants_csv_output,
                        'VARIANTS_JSON_FN': variants_json_output,
                        'COMPRESSOR_DUPLICATE_OUT': compressor_duplicate_out,
                        **os.environ,
                    },
                    dag=dag,
                )

                # --output-edits ${FILE}.${GENE}.filtered.edits.json
                compressor2_cmd = (
                    ' {{ params.hyphy }} LIBPATH={{params.hyphy_lib_path}}'
                    ' {{ params.compressor2 }} --msa $COMPRESSED_FN'
                    ' --duplicates $DUPLICATE_FN --csv $VARIANTS_CSV_FN'
                    ' --byseq $VARIANTS_JSON_FN --p 0.95'
                    ' --output $FILTERED_FASTA_FN --json $FILTERED_JSON_FN'
                    ' --output-edits ${OUTPUT_EDITS} '
                )
                compressor_two_task = BashOperator(
                    task_id=f'compressor_two_{gene}',
                    bash_command=compressor2_cmd,
                    env={
                        'COMPRESSED_FN': compressed_output_filepath,
                        # Second pass consumes the duplicates written by the first.
                        'DUPLICATE_FN': compressor_duplicate_out,
                        'VARIANTS_CSV_FN': variants_csv_output,
                        'VARIANTS_JSON_FN': variants_json_output,
                        'FILTERED_FASTA_FN': filtered_fasta_output,
                        'FILTERED_JSON_FN': filtered_json_output,
                        'OUTPUT_EDITS': output_edits_fn,
                        **os.environ,
                    },
                    dag=dag,
                )

                compressor_task >> compressor_two_task

            # One command per line: on a single line the ``sed`` words would be
            # passed to rapidnj as extra arguments instead of running as a
            # separate command.
            infer_tree_cmd = """
            seqmagick convert $FILTERED_FASTA_FN $STO_OUTPUT;
            rapidnj $STO_OUTPUT -i sth > $TREE_OUTPUT
            sed -i "s/'//g" $TREE_OUTPUT;
            """
            infer_tree_task = BashOperator(
                task_id=f'infer_tree_{gene}',
                bash_command=infer_tree_cmd,
                env={
                    'FILTERED_FASTA_FN': filtered_fasta_output,
                    'STO_OUTPUT': sto_output,
                    'TREE_OUTPUT': tree_output,
                    **os.environ,
                },
                dag=dag,
            )

            slac_task = BashOperator(
                task_id=f'slac_{gene}',
                bash_command=(
                    "{{ params.hyphy }} LIBPATH={{params.hyphy_lib_path}} slac"
                    " --kill-zero-lengths Constrain"
                    " ENV='_DO_TREE_REBALANCE_=1'"
                    " --alignment $FILTERED_FASTA_FN --tree $TREE_OUTPUT"
                    " --branches All --samples 0 --output $SLAC_OUTPUT"
                ),
                env={
                    'FILTERED_FASTA_FN': filtered_fasta_output,
                    'TREE_OUTPUT': tree_output,
                    'SLAC_OUTPUT': slac_output_fn,
                    **os.environ,
                },
                dag=dag,
            )

            # Expanded unquoted in the shell, so it splits into two arguments.
            big_data_flags = '--full-model No'

            fel_task = BashOperator(
                task_id=f'fel_{gene}',
                bash_command=(
                    "{{ params.hyphy }} LIBPATH={{params.hyphy_lib_path}} fel"
                    " --kill-zero-lengths Constrain"
                    " ENV='_DO_TREE_REBALANCE_=1' $BIG_DATA_FLAGS"
                    " --alignment $FILTERED_FASTA_FN --tree $TREE_OUTPUT"
                    " --branches Internal --output $FEL_OUTPUT"
                ),
                env={
                    'FILTERED_FASTA_FN': filtered_fasta_output,
                    'TREE_OUTPUT': tree_output,
                    'FEL_OUTPUT': fel_output_fn,
                    'BIG_DATA_FLAGS': big_data_flags,
                    **os.environ,
                },
                dag=dag,
            )

            meme_task = BashOperator(
                task_id=f'meme_{gene}',
                bash_command=(
                    "{{ params.hyphy }} LIBPATH={{params.hyphy_lib_path}} meme"
                    " --kill-zero-lengths Constrain"
                    " ENV='_DO_TREE_REBALANCE_=1' $BIG_DATA_FLAGS"
                    " --alignment $FILTERED_FASTA_FN --tree $TREE_OUTPUT"
                    " --branches Internal --output $MEME_OUTPUT"
                ),
                env={
                    'FILTERED_FASTA_FN': filtered_fasta_output,
                    'TREE_OUTPUT': tree_output,
                    'MEME_OUTPUT': meme_output_fn,
                    'BIG_DATA_FLAGS': big_data_flags,
                    **os.environ,
                },
                dag=dag,
            )

            # (Commented-out fubar/prime task stubs used to sit here; they
            # were never wired into the DAG.)

            copy_annotation_task = BashOperator(
                task_id=f'copy_annotation_{gene}',
                bash_command=(
                    'cp {{params.working_dir}}/data/comparative-annotation.json'
                    ' {{params.annotation_file}}'
                ),
                params={
                    'annotation_file': annotation_file,
                    'working_dir': WORKING_DIR,
                },
                dag=dag,
            )

            summarize_gene_task = BashOperator(
                task_id=f'summarize_gene_{gene}',
                bash_command=(
                    '{{ params.python }}'
                    ' {{params.working_dir}}/python/summarize_gene.py'
                    ' -T {{params.working_dir}}/data/ctl/epitopes.json'
                    ' -B {{params.working_dir}}/data/single_mut_effects.csv'
                    ' -D $MASTERNOFASTA -d $DUPLICATES -s $SLAC_OUTPUT'
                    ' -f $FEL_OUTPUT -m $MEME_OUTPUT -P 0.1'
                    ' --output $SUMMARY_OUTPUT -c $COMPRESSED_OUTPUT_FN'
                    ' -E {{params.working_dir}}/data/evo_annotation.json'
                    ' -A {{params.working_dir}}/data/mafs.csv'
                    ' -V {{params.working_dir}}/data/evo_freqs.csv'
                    ' -F $FRAGMENT --frame_shift $ADDSHIFT'
                    ' --fragment_shift $SHIFT -S $OFFSET -O $ANNOTATION'
                ),
                params={
                    'python': params['python'],
                    'working_dir': WORKING_DIR,
                },
                env={
                    'MASTERNOFASTA': params["meta-output"],
                    'DUPLICATES': duplicate_output,
                    'SLAC_OUTPUT': slac_output_fn,
                    'FEL_OUTPUT': fel_output_fn,
                    'MEME_OUTPUT': meme_output_fn,
                    'SUMMARY_OUTPUT': summary_output_fn,
                    'COMPRESSED_OUTPUT_FN': filtered_fasta_output,
                    'FRAGMENT': str(region['fragment']),
                    'ADDSHIFT': str(region['add_one']),
                    'SHIFT': str(region['shift']),
                    'OFFSET': str(region['offset']),
                    'ANNOTATION': annotation_file,
                    **os.environ,
                },
                dag=dag,
            )

            summarize_gene_task.set_upstream(export_meta_task)
            alignment.set_upstream(export_sequences_task)

            (
                alignment
                >> duplicates_group
                >> filter_group
                >> infer_tree_task
                >> [slac_task, fel_task, meme_task]
                >> copy_annotation_task
                >> summarize_gene_task
            )
            # A ``>>`` chain evaluates to its right-most operand, so this is
            # the task the chain expression yielded when it was passed to
            # ``append`` directly.
            export_by_gene.append(summarize_gene_task)

        dag.doc_md = __doc__

        # Add export meta and export sequence tasks to be executed in parallel
        cross_downstream(
            [export_meta_task, export_sequences_task], export_by_gene
        )

    return dag