def create_status_json_file(self): """Create [stage name]_status.json, which will include step-level job status updates to be used in idseq-web.""" log.write(f"Creating {self.step_status_local}") with open(self.step_status_local, 'w') as status_file: json.dump({}, status_file) idseq_dag.util.s3.upload_with_retries(self.step_status_local, self.output_dir_s3 + "/")
def sync_pairs(fastq_files, max_discrepant_fraction=0): """The given fastq_files contain the same read IDs but in different order. Output the same data in synchronized order. Omit any reads missing their mate up to max_discrepant_fraction if necessary. If more must be suppressed, indicate it in the second value of the returned tuple. """ if len(fastq_files) != 2: return fastq_files, False output_fnames = [ifn + ".synchronized_pairs.fq" for ifn in fastq_files] with open(fastq_files[0], "rb") as if_0, open(fastq_files[1], "rb") as if_1: with open(output_fnames[0], "wb") as of_0, open( output_fnames[1], "wb") as of_1: outstanding_r0, outstanding_r1, max_mem, total = PipelineStepRunStar.sync_pairs_work( of_0, of_1, if_0, if_1) if max_mem: # This will be printed if some pairs were out of order. msg = "WARNING: Pair order out of sync in {fqf}. Synchronized using RAM for {max_mem} pairs." msg = msg.format(fqf=fastq_files, max_mem=max_mem) log.write(msg) discrepancies_count = len(outstanding_r0) + len(outstanding_r1) if discrepancies_count: msg = "WARNING: Found {dc} broken pairs in {fqf}, e.g., {example}." msg = msg.format(dc=discrepancies_count, fqf=fastq_files, example=(outstanding_r0 or outstanding_r1).popitem()[0]) log.write(msg) too_discrepant = (discrepancies_count > max_discrepant_fraction * total) return output_fnames, too_discrepant
def parse_tsv(path, schema, expect_headers=False, raw_lines=False, strict=True): ''' Parse TSV file with given schema, yielding a dict per line. See BLAST_OUTPUT_SCHEMA, for example. When expect_headers=True, treat the first line as column headers. When strict mode is True, all columns in schema are required. When strict mode is False, will return None values for column not found. ''' assert expect_headers == False, "Headers not yet implemented." schema_items = list(schema.items()) with open(path, "r") as stream: for line_number, raw_line in enumerate(stream): try: row = raw_line.rstrip("\n").split("\t") if strict and len(row) != len(schema): msg = f"{path}:{line_number + 1}: Parse error. Input line does not conform to schema: {schema}" log.write(msg, warning=True) length = min(len(row), len(schema)) row_dict = { cname: ctype(vstr) for vstr, ( cname, ctype) in zip(row[0:length], schema_items[0:length]) } except: msg = f"{path}:{line_number + 1}: Parse error. Input does not conform to schema: {schema}" log.write(msg, warning=True) raise if raw_lines: yield row_dict, raw_line else: yield row_dict
def update_status_json_file(self, status): log.write( f"Updating status file for step {self.name} with status {status}") # First, update own status dictionary if "description" not in self.status_dict: self.status_dict["description"] = self.step_description() if "resources" not in self.status_dict: self.status_dict["resources"] = self.step_resources() if "start_time" not in self.status_dict and status == "running": self.status_dict["start_time"] = time.time() # seconds since epoch self.status_dict["status"] = status if self.input_file_error: self.status_dict["error"] = self.input_file_error.name if status == "uploaded": self.status_dict["end_time"] = time.time() self.status_dict["additional_output"] = \ [self.relative_path(f) for f in self.additional_output_files_visible] # Then, update file by reading the json, modifying, and overwriting. with self.step_status_lock: with open(self.step_status_local, 'r') as status_file: status = json.load(status_file) status.update({self.name: self.status_dict}) with open(self.step_status_local, 'w') as status_file: json.dump(status, status_file) idseq_dag.util.s3.upload_with_retries(self.step_status_local, self.output_dir_s3 + "/")
def get_sequence_for_thread(error_flags, accession_info, accession_id, nt_loc_dict, nt_bucket, nt_key, semaphore, mutex, seq_count=[0]): #pylint: disable=dangerous-default-value try: ref_seq_len, seq_name, accession_file = PipelineStepGenerateAlignmentViz.get_sequence_by_accession_id_s3( accession_id, nt_loc_dict, nt_bucket, nt_key) with mutex: accession_info['seq_file'] = accession_file accession_info['ref_seq_len'] = ref_seq_len accession_info['name'] = seq_name seq_count[0] += 1 if seq_count[0] % 100 == 0: msg = f"{seq_count[0]} sequences fetched, most recently " \ f"{accession_id}" log.write(msg) except: with mutex: if not error_flags: traceback.print_exc() error_flags["error"] = 1 finally: semaphore.release()
def sync_pairs(fastq_files, max_discrepancies=0): """The given fastq_files contain the same read IDs but in different order. Output the same data in synchronized order. Omit up to max_discrepancies if necessary. If more must be suppressed, raise assertion. """ if len(fastq_files) != 2: return fastq_files output_fnames = [ifn + ".synchronized_pairs.fq" for ifn in fastq_files] with open(fastq_files[0], "rb") as if_0, open(fastq_files[1], "rb") as if_1: with open(output_fnames[0], "wb") as of_0, open(output_fnames[1], "wb") as of_1: outstanding_r0, outstanding_r1, max_mem = PipelineStepRunStar.sync_pairs_work( of_0, of_1, if_0, if_1) if max_mem: # This will be printed if some pairs were out of order. msg = "WARNING: Pair order out of sync in {fqf}. " \ "Synchronized using RAM for {max_mem} pairs.".format( fqf=fastq_files, max_mem=max_mem) log.write(msg) discrepancies_count = len(outstanding_r0) + len(outstanding_r1) if discrepancies_count: msg = "WARNING: Found {dc} broken pairs in {fqf}, e.g., " \ "{example}.".format( dc=discrepancies_count, fqf=fastq_files, example=(outstanding_r0 or outstanding_r1).popitem()[0]) log.write(msg) assert discrepancies_count <= max_discrepancies, msg return output_fnames
def get_taxid_mapping_for_batch(taxids, taxid2wikidict, mutex, semaphore, max_attempt=3): ''' Get wiki mapping for a list of taxids ''' taxid_str = ",".join(taxids) log.write(f"fetching batch {taxid_str}") for attempt in range(max_attempt): try: handle = Entrez.elink(dbfrom="taxonomy", id=taxid_str, cmd="llinks") record = Entrez.read(handle) handle.close() parsed = {} results = record[0]['IdUrlList']['IdUrlSet'] for result in results: taxid = result['Id'] wikiurl = "" for link in result['ObjUrl']: url = str(link['Url']) if re.search('wikipedia.org', url): wikiurl = url break parsed[taxid] = wikiurl break except: log.write(f"failed batch attempt {attempt}") time.sleep(5) semaphore.release() with mutex: taxid2wikidict.update(parsed)
def run(self): """ Generate GSNAP index. To be called from idseq-infra """ nt_db = self.input_files_local[0][0] output_nt_index_tar = self.output_files_local()[0] output_nt_index_parent_dir = os.path.dirname(output_nt_index_tar) output_tar_base = os.path.basename(output_nt_index_tar) output_nt_index_dir_base = output_tar_base[:-4] k = self.additional_attributes.get("k", 16) # kmer k log.write(f"input: {nt_db} output: {output_nt_index_tar}") command.execute( command_patterns.SingleCommand(cmd="gmap_build", args=[ "-D", output_nt_index_parent_dir, "-d", output_nt_index_dir_base, "-k", k, nt_db ])) output_nt_index_dir = os.path.join(output_nt_index_parent_dir, output_nt_index_dir_base) self.additional_output_folders_hidden.append(output_nt_index_dir) command.execute( command_patterns.SingleCommand( cd=output_nt_index_parent_dir, cmd="tar", args=["cvf", output_tar_base, output_nt_index_dir_base]))
def fetch_target_from_s3(self, target): ''' .done file should be written to the result dir when the download is complete ''' log.write("Downloading target %s" % target) if target in self.given_targets: input_path_s3 = self.given_targets[target]["s3_dir"] else: input_path_s3 = self.output_dir_s3 with log.log_context("fetch_input_files_from_s3", {"target": target}): PipelineFlow.fetch_input_files_from_s3( input_files=self.targets[target], input_dir_s3=input_path_s3, result_dir_local=self.output_dir_local) if target in self.given_targets and self.given_targets[target].get( "count_reads"): with log.log_context("count_input_reads", {"target": target}): try: PipelineFlow.count_input_reads( input_files=self.targets[target], result_dir_local=self.output_dir_local, result_dir_s3=self.output_dir_s3, target_name=target, max_fragments=self.given_targets[target] ["max_fragments"]) except AssertionError as e: # The counting methods may raise assertion errors if assumptions # about input format are not satisfied. self.write_invalid_input_json({ "error": str(e), "step": None })
def multilinefa2singlelinefa(input_fasta, output_fasta): ''' Multi-line FASTA to Single-line FASTA conversion ''' step = "Multi-line FASTA to single-line FASTA conversion" log.write(f"Starting {step}...") cmd = f"awk 'NR==1 {{print $0}} NR>1 && /^>/ {{printf(\"\\n%s\\n\",$0);next; }} NR>1 {{ printf(\"%s\",$0);}} END {{printf(\"\\n\");}}' <{input_fasta} > {output_fasta}" command.execute(cmd) log.write(f"Finished {step}.")
def get_accession_metadata(accession): ''' Retrieve metadata of an NCBI accession (e.g. name, country, collection date) TODO: Put this data in S3 instead and get it from there. ''' accession_metadata = {} efetch_command = ";".join([ f"QUERY={accession}", "BASE=https://eutils.ncbi.nlm.nih.gov/entrez/eutils", "SEARCH_URL=${BASE}/esearch.fcgi?db=nuccore\&term=${QUERY}\&usehistory=y", "OUTPUT=$(curl $SEARCH_URL)", "WEB=$(echo $OUTPUT | sed -e 's/.*<WebEnv>\(.*\)<\/WebEnv>.*/\\1/')", "KEY=$(echo $OUTPUT | sed -e 's/.*<QueryKey>\(.*\)<\/QueryKey>.*/\\1/')", "FETCH_URL=${BASE}/efetch.fcgi?db=nuccore\&query_key=${KEY}\&WebEnv=${WEB}\&rettype=gb\&retmode=xml", f"curl $FETCH_URL" ]) genbank_xml = command.execute_with_output(efetch_command) root = ET.fromstring(genbank_xml).find('GBSeq') if not root: log.write(f"WARNING: {efetch_command} did not give a result") return accession_metadata accession_metadata['name'] = root.find('GBSeq_definition').text qualifiers_needed = {'country', 'collection_date'} for entry in root.find('GBSeq_feature-table')[0].find('GBFeature_quals'): if all(key in accession_metadata for key in qualifiers_needed): break for key in qualifiers_needed - accession_metadata.keys(): if entry.find('GBQualifier_name').text == key: accession_metadata[key] = entry.find('GBQualifier_value').text return accession_metadata
def subsample_fastas(input_fas, output_fas, max_fragments): ''' In memory subsampling ''' paired = len(input_fas) >= 2 # count lines cmd = "wc -l %s | cut -f1 -d ' '" % input_fas[0] total_records = int(command.execute_with_output(cmd)) // 2 log.write("total reads: %d" % total_records) log.write("target reads: %d" % max_fragments) if total_records <= max_fragments: for infile, outfile in zip(input_fas, output_fas): command.execute("cp %s %s" % (infile, outfile)) return # total_records > max_fragments, sample randgen = random.Random(x=hash(input_fas[0])) records_to_keep = randgen.sample(range(total_records), max_fragments) PipelineStepRunSubsample.subset(input_fas[0], output_fas[0], records_to_keep) if paired: PipelineStepRunSubsample.subset(input_fas[1], output_fas[1], records_to_keep) if len(input_fas) == 3 and len(output_fas) == 3: # subset the merged fasta records_to_keep_merged = [] for r in records_to_keep: records_to_keep_merged += [2 * r, 2 * r + 1] PipelineStepRunSubsample.subset(input_fas[2], output_fas[2], records_to_keep_merged)
def run(self): input_fas = self.input_files_local[0][0:2] output_fas = self.output_files_local() output_sam_file = os.path.join( self.output_dir_local, self.additional_attributes["output_sam_file"]) self.additional_output_files_hidden.append(output_sam_file) genome_dir = fetch_reference(self.additional_files["gsnap_genome"], self.ref_dir_local, allow_s3mi=True, auto_untar=True) gsnap_base_dir = os.path.dirname(genome_dir) gsnap_index_name = os.path.basename(genome_dir) # Run Gsnap gsnap_params = [ '-A', 'sam', '--batch=0', '--use-shared-memory=0', '--gmap-mode=all', '--npaths=1', '--ordered', '-t', 32, '--max-mismatches=40', '-D', gsnap_base_dir, '-d', gsnap_index_name, '-o', output_sam_file ] + input_fas command.execute( command_patterns.SingleCommand(cmd='gsnapl', args=gsnap_params)) log.write("Finished GSNAP alignment.") # Extract out unmapped files from sam if len(input_fas) == 2: convert.generate_unmapped_pairs_from_sam(output_sam_file, output_fas) else: convert.generate_unmapped_singles_from_sam(output_sam_file, output_fas[0])
def run_locally(self, input_fas, output_m8): with tempfile.TemporaryDirectory( prefix=self.alignment_algorithm) as tmpdir: index_path = os.path.join(tmpdir, "reference") os.mkdir(index_path) run(["tar", "-xzvf", self.index, "-C", index_path], check=True) if self.alignment_algorithm == "gsnap": # Hack to determine gsnap vs gsnapl error_message = run([ 'gsnapl.avx2-2018-10-26', '-D', index_path, '-d', self.genome_name ], input='>'.encode('utf-8'), stderr=PIPE, stdout=PIPE).stderr # note, alignment uses a pinned version of gmap/gsnap gsnap_command = "gsnap.avx2-2018-10-26" if 'please run gsnap instead' in error_message.decode( 'utf-8') else "gsnapl.avx2-2018-10-26" else: gsnap_command = None # rapsearch2 expects the filename of the primary file, but still depends on the .info file being a sibbling if self.alignment_algorithm == "rapsearch2": for filename in os.listdir(index_path): if not filename.endswith(".info"): index_path = os.path.join(index_path, filename) cmd = self._get_command(index_path, input_fas, output_m8, threads=multiprocessing.cpu_count(), gsnap_command=gsnap_command) log.write(f"running command {cmd}") run(cmd, check=True)
def fq2fa(input_fastq, output_fasta): ''' FASTQ to FASTA conversion ''' step = "FASTQ to FASTA conversion" log.write(f"Starting {step}...") cmd = f"sed -n '1~4s/^@/>/p;2~4p' <{input_fastq} >{output_fasta}" command.execute(cmd) log.write(f"Finished {step}.")
def subsample_fastas(input_fas, output_fas, max_fragments): ''' In memory subsampling ''' paired = len(input_fas) >= 2 # count lines cmd_output = command.execute_with_output( command_patterns.SingleCommand(cmd="wc", args=["-l", input_fas[0]])) lines_count = int(cmd_output.strip().split(' ')[0]) total_records = lines_count // 2 log.write("total reads: %d" % total_records) log.write("target reads: %d" % max_fragments) if total_records <= max_fragments: for infile, outfile in zip(input_fas, output_fas): command.copy_file(infile, outfile) return # total_records > max_fragments, sample randgen = random.Random(x=hash(input_fas[0])) records_to_keep = randgen.sample(range(total_records), max_fragments) PipelineStepRunSubsample.subset(input_fas[0], output_fas[0], records_to_keep) if paired: PipelineStepRunSubsample.subset(input_fas[1], output_fas[1], records_to_keep) if len(input_fas) == 3 and len(output_fas) == 3: # subset the merged fasta records_to_keep_merged = [] for r in records_to_keep: records_to_keep_merged += [2 * r, 2 * r + 1] PipelineStepRunSubsample.subset(input_fas[2], output_fas[2], records_to_keep_merged)
def manage_reference_downloads_cache(self): command.make_dirs(PURGE_SENTINEL_DIR) command.touch(PURGE_SENTINEL) time.sleep( 3 ) # 1 should be enough for the sentinel to predate all current stage downloads present_set, missing_set = self.references_roll_call() total_set = present_set | missing_set if total_set: present = len(present_set) missing = len(missing_set) total = len(total_set) log.write( f"Reference download cache: {present} of {total} large files ({100 * present / total:3.1f} percent) already exist locally." ) structured_report = { "summary_stats": { "cache_requests": total, "cache_hits": present, "cache_misses": missing, "cache_hit_rate": present / total }, "per_request_stats": { f: "hit" if f in present_set else "miss" for f in sorted(total_set) } } log.log_event("Reference downloads cache efficiency report", values=structured_report) idseq_dag.util.s3.make_space( ) # make sure to touch this stage's files before deleting LRU ones
def get_wiki_content_for_page(taxid, pageid, taxname, taxid2wikicontent, mutex, semaphore, max_attempt=3): ''' Fetch wiki content for pageid ''' for attempt in range(max_attempt): try: page = None if pageid: log.write(f"fetching wiki {pageid} for {taxid}") page = wikipedia.page(pageid=pageid) elif taxname: search_results = wikipedia.search(taxname) if len(search_results) > 0: wikiname = str(search_results[0]) if taxname.lower() == wikiname.lower(): page = wikipedia.page(wikiname) if not page: # query the page directly try: page = wikipedia.page(taxname.replace(" ", "_")) except: page = None if page: output = { "pageid" : page.pageid, "description": page.content[:1000], "title": page.title, "summary": page.summary } with mutex: taxid2wikicontent[taxid] = output break except: log.write(f"having trouble fetching {taxid} wiki {pageid} attempt {attempt}") semaphore.release()
def main(): from idseq_dag.engine.pipeline_flow import PipelineFlow import idseq_dag.util.log as log log.configure_logger() parser = argparse.ArgumentParser() parser.add_argument( '--version', action='version', version='%(prog)s {version}'.format(version=__version__)) parser.add_argument('dag_json', help='pipeline dag in json file format') parser.add_argument('--no-lazy-run', dest='lazy_run', action='store_false') parser.add_argument('--key-path-s3', dest='key_path_s3', help='ssh key') parser.add_argument('--no-versioned-output', dest='versioned_output', action='store_false') parser.set_defaults(lazy_run=True) parser.set_defaults(versioned_output=True) args = parser.parse_args() if args.key_path_s3: os.environ["KEY_PATH_S3"] = args.key_path_s3 try: flow = PipelineFlow(lazy_run=args.lazy_run, dag_json=args.dag_json, versioned_output=args.versioned_output) log.write("everything is awesome. idseq dag is valid~") except: parser.print_help() raise log.write("start executing the dag") flow.start()
def process_hit(hit_type, hit_name): hit_data = read_data if hit_type == "read" else contig_data if hit_name not in hit_data: log.write(f"Could not find {hit_type} in map: {hit_name}") return hit_obj = hit_data[hit_name] # hitsummary is more strict than reassigned. # Sometimes reassigned will have a value for accession, but hitsummary won't. if hit_obj["accession"] != accession_id: log.write(f"Mismatched accession for {hit_name}: {hit_obj['accession']} (reassigned) versus {accession_id} (hitsummary)") return (accession_start, accession_end) = _align_interval(_decrement_lower_bound((hit_obj["subject_start"], hit_obj["subject_end"]))) # If the hit is larger than the bin size, treat it as an individual hit. if accession_end - accession_start >= bin_size: if hit_type == "read": individual_reads.append(hit_name) else: individual_contigs.append(hit_name) # Otherwise, put the hit into a bin based on its midpoint else: hit_midpoint = (accession_end + accession_start) / 2 hit_bin_index = _floor_with_min(hit_midpoint / bin_size, 0) if hit_type == "read": read_bins[hit_bin_index].append(hit_name) else: contig_bins[hit_bin_index].append(hit_name)
def _calculate_rpms(rpm_df, amr_df, total_reads): """Matches each gene in the amr_results dataframe to it's associated number of reads in the matched reads table and calculates the reads per million. Both total reads and reads per million are appended in order in lists and then bulk appended back in _append_rpm_to_results() to the proc_amr_results dataframe.""" rpm_list = [] total_reads_list = [] for row in amr_df.itertuples(): if len(rpm_df[rpm_df["allele"] == row.allele] ["reads"].values) == 0: # This should never happen, but it has happened before # because there were typos in allele names in ARGannot_r2.fasta that caused mismatches with argannot_genome.bed. # Log an error. The following line will crash the pipeline step, which is intended. # We prefer failing the pipeline step to showing incorrect or missing data while failing silently to the user. log.write( f"[Datadog] AmrAlleleMismatchError: {row.allele} (from ARGannot_r2.fasta) could not be found in argannot_genome.bed" ) reads_for_allele = rpm_df[rpm_df["allele"] == row.allele]["reads"].values[0] total_reads_list.append(reads_for_allele) rpm_for_allele = reads_for_allele * 1000000 / total_reads rpm_list.append(rpm_for_allele) return [rpm_list, total_reads_list]
def print_update(self, t_elapsed): """Log an update after every polling period to indicate the code section is still active. """ log.write( "LongRunningCodeSection %d (%s) still running after %3.1f seconds." % (self.id, self.name, t_elapsed))
def thread_run(self): ''' Actually running the step ''' self.status = StepStatus.STARTED self.update_status_json_file("instantiated") v = {"step": self.name} with log.log_context("dag_step", v): with log.log_context("substep_wait_for_input_files", v): self.wait_for_input_files() with log.log_context("substep_validate_input_files", v): self.validate_input_files() # If an input file error was detected, stop execution. if self.input_file_error: log.write("Invalid input detected for step %s" % self.name) self.status = StepStatus.INVALID_INPUT self.update_status_json_file("user_errored") return with log.log_context("substep_run", v): self.update_status_json_file("running") self.run() with log.log_context("substep_validate", v): self.validate() with log.log_context("substep_save_progress", v): self.save_progress() with log.log_context("substep_save_counts", v): self.save_counts() self.upload_thread = threading.Thread(target=self.uploading_results) self.upload_thread.start() self.status = StepStatus.FINISHED self.update_status_json_file("finished_running")
def wrapped_operation(*args, **kwargs): randgen = None remaining_attempts = MAX_TRIES delay = 1.0 while remaining_attempts > 1: try: t_start = time.time() return operation(*args, **kwargs) except: t_end = time.time() if randgen == None: invocation[0] += 1 randgen = random.Random( os.getpid() * 10000 + invocation[0] ).random # seed based on pid so subprocesses won't retry in lockstep if t_end - t_start > 30: # For longer operations, the delay should increase, so that the backoff will meaningfully reduce load on the failing service. delay = (t_end - t_start) * 0.2 wait_time = delay * (1.0 + 2.0 * randgen()) log.write( f"Sleeping {wait_time} seconds before retry {MAX_TRIES - remaining_attempts + 1} of {operation} with {args}, {kwargs}." ) time.sleep(wait_time) delay *= 3.0 remaining_attempts -= 1 # The last attempt is outside try/catch so caller can handle exception return operation(*args, **kwargs)
def test_write_info_obj_data(self, mock_getLogger): '''Test writing a info object data to the logs''' log.write(obj_data={"name_a": 1, "name_b": "fake_string", "name_c": 3.1415}) mock_getLogger.assert_has_calls([ call().info(msg=None, extra={"obj_data": {"name_a": 1, "name_b": "fake_string", "name_c": 3.1415}}) ])
def really_make_space(): refdir = config['REF_DIR'] while need_more_space(refdir): lru = find_oldest_reference(refdir) if not lru: log.write("WARNING: Too little available space on instance, and could not find any reference downloads to delete.") break command.remove_rf(lru)
def test_write_debug_text(self, mock_getLogger): '''Test writing a info text message to the logs''' log.write("Text message", debug=True) mock_getLogger.assert_has_calls([ call().debug(msg='Text message', extra={"obj_data": None}) ])
def execute( command: Union[command_patterns.CommandPattern, str], progress_file: str = None, timeout: int = None, grace_period: int = None, capture_stdout: bool = False, merge_stderr: bool = False, log_context_mode: log.LogContextMode = log.LogContextMode. START_END_LOG_EVENTS ) -> Union[str, None]: """Primary way to start external commands in subprocesses and handle execution with logging. """ if not isinstance(command, command_patterns.CommandPattern): # log warning if using legacy format log.write( warning=True, message= f"Command parameter is using legacy type str. Use idseq_dag.util.command_patterns.", obj_data={ "cmd": command, "type": type(command) }) cmd = command_patterns.ShellScriptCommand(script=command, args=[]) else: cmd = command with CommandTracker() as ct: log_values = {"cid": f"Command {ct.id}", "command": cmd.as_dict()} with log.log_context('command_execute', values=log_values, log_context_mode=log_context_mode) as lctx: with ProgressFile(progress_file): if timeout: ct.timeout = timeout if grace_period: ct.grace_period = grace_period if capture_stdout: # Capture only stdout. Child stderr = parent stderr unless # merge_stderr specified. Child input = parent stdin. ct.proc = cmd.open(stdin=sys.stdin.fileno(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT if merge_stderr else sys.stderr.fileno()) stdout, _ = ct.proc.communicate() else: # Capture nothing. Child inherits parent stdin/out/err. ct.proc = cmd.open() ct.proc.wait() stdout = None lctx.values.update({"returncode": ct.proc.returncode}) if ct.proc.returncode: raise subprocess.CalledProcessError( ct.proc.returncode, str(command), stdout) if capture_stdout: return stdout
def test_write_info_non_serializable(self, mock_getLogger): '''Test writing log entry with a non serializable field''' non_serializable_byte_field = bytes() log.write(obj_data={"name_a": 1, "name_b": non_serializable_byte_field, "name_c": 3.1415}) mock_getLogger.assert_has_calls([ call().info(msg=None, extra={"obj_data": {"name_a": 1, "name_b": non_serializable_byte_field, "name_c": 3.1415}}) ])
def write_invalid_input_json(self, error_json): ''' Upload an invalid_step_input.json file for this step, which can be detected by other services like idseq-web. ''' log.write("Writing invalid_step_input.json") local_input_errors_file = f"{self.output_dir_local}/invalid_step_input.json" with open(local_input_errors_file, 'w') as input_errors_file: json.dump(error_json, input_errors_file) idseq_dag.util.s3.upload_with_retries(local_input_errors_file, self.output_dir_s3 + "/")