def concatenate_files(chunk_output_files, output_m8):
    with log.log_context("run_alignment_remote.concatenate_files", {"chunk_output_files": chunk_output_files}):
        with open(output_m8, 'wb') as outf:
            for f in chunk_output_files:
                with log.log_context("run_alignment_remote.concatenate_files#chunk", {"f": f}):
                    with open(f, 'rb') as fd:
                        shutil.copyfileobj(fd, outf)

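# Hypothetical usage sketch for concatenate_files (paths are illustrative only):
# each per-chunk alignment file is streamed, in order, into one combined .m8 output.
def _example_concatenate_chunks():
    chunk_files = ["/mnt/idseq/chunks/chunk-0.m8", "/mnt/idseq/chunks/chunk-1.m8"]
    concatenate_files(chunk_files, "/mnt/idseq/results/combined.m8")
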
def fetch_target_from_s3(self, target):
    ''' A .done file should be written to the result dir when the download is complete. '''
    log.write("Downloading target %s" % target)
    if target in self.given_targets:
        input_path_s3 = self.given_targets[target]["s3_dir"]
    else:
        input_path_s3 = self.output_dir_s3

    with log.log_context("fetch_input_files_from_s3", {"target": target}):
        PipelineFlow.fetch_input_files_from_s3(
            input_files=self.targets[target],
            input_dir_s3=input_path_s3,
            result_dir_local=self.output_dir_local)

    if target in self.given_targets and self.given_targets[target].get("count_reads"):
        with log.log_context("count_input_reads", {"target": target}):
            try:
                PipelineFlow.count_input_reads(
                    input_files=self.targets[target],
                    result_dir_local=self.output_dir_local,
                    result_dir_s3=self.output_dir_s3,
                    target_name=target,
                    max_fragments=self.given_targets[target]["max_fragments"])
            except AssertionError as e:
                # The counting methods may raise assertion errors if assumptions
                # about the input format are not satisfied.
                self.write_invalid_input_json({"error": str(e), "step": None})

def prefetch_large_files(self, touch_only=False):
    successes, failures = set(), set()
    with log.log_context("touch_large_files_and_make_space" if touch_only else "prefetch_large_files",
                         values={"file_list": self.large_file_list}):
        for f in self.large_file_list:
            with log.log_context("fetch_reference", values={"file": f, "touch_only": touch_only}):
                success = idseq_dag.util.s3.fetch_reference(
                    f,
                    self.ref_dir_local,
                    auto_unzip=True,
                    auto_untar=True,
                    allow_s3mi=True,
                    touch_only=touch_only)
                if success:
                    successes.add(f)
                else:
                    failures.add(f)
    return successes, failures

def remove_rf(path: str):
    '''Mimics the behavior of the "rm -rf" linux command.'''
    def _remove_entry(path_entry):
        if os.path.isdir(path_entry) and not os.path.islink(path_entry):
            shutil.rmtree(path_entry)
        elif os.path.exists(path_entry):
            os.remove(path_entry)

    with log.log_context(context_name='command.remove_rf',
                         values={'path': path},
                         log_context_mode=log.LogContextMode.EXEC_LOG_EVENT):
        path_list = _glob.glob(path)
        if len(path_list) == 1 and path_list[0] == path:
            _remove_entry(path)
        else:
            for path_entry in path_list:
                with log.log_context(context_name='command.remove_rf._remove_entry',
                                     values={'path_entry': path_entry},
                                     log_context_mode=log.LogContextMode.EXEC_LOG_EVENT):
                    _remove_entry(path_entry)

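# Hypothetical usage sketch for remove_rf (the path is illustrative only): a glob
# pattern removes every matching file or directory tree, much like "rm -rf /tmp/scratch/*".
def _example_remove_rf_usage():
    remove_rf("/tmp/scratch/*")
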
def glob(glob_pattern: str, strip_folder_names: bool = False, max_results: int = 0):
    '''
        Execute a glob pattern against the local file system.

            Parameters:
                glob_pattern(str): A glob pattern. Ex: /tmp/*.gz
                max_results(int): Limit the number of results returned. Zero means no limit is set.
                strip_folder_names(bool): Return only the file names without folder information.
                                          Ex: "/tmp/123.txt" is returned as "123.txt"

            Returns:
                Array of strings containing the files found. Empty array if none are found.
    '''
    values = {
        'glob_pattern': glob_pattern,
        'strip_folder_names': strip_folder_names,
        'max_results': max_results
    }
    with log.log_context(context_name='command.glob', values=values, log_context_mode=log.LogContextMode.EXEC_LOG_EVENT):
        results = _glob.glob(glob_pattern)
        results.sort()
        if max_results > 0:
            results = results[:max_results]
        if strip_folder_names:
            results = list(map(os.path.basename, results))
        values["results"] = results
        return results

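# Hypothetical usage sketch for glob (paths are illustrative only): fetch up to two
# gzipped files from /tmp, keeping just the bare file names, e.g. "123.txt.gz".
def _example_glob_usage():
    names = glob("/tmp/*.gz", strip_folder_names=True, max_results=2)
    for name in names:
        log.write("found %s" % name)
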
def list_s3_keys(s3_path_prefix):
    """Returns a list of s3 keys prefixed by s3_path_prefix."""
    with log.log_context(context_name="s3.list_s3_objects",
                         values={'s3_path_prefix': s3_path_prefix},
                         log_context_mode=log.LogContextMode.EXEC_LOG_EVENT):
        parsed_url = urlparse(s3_path_prefix, allow_fragments=False)
        bucket = parsed_url.netloc
        prefix = parsed_url.path.lstrip('/')
        # Use the AWS CLI instead of boto for thread safety
        raw_response = command.execute(
            command_patterns.SingleCommand(
                cmd="aws",
                args=[
                    "s3api",
                    "list-objects-v2",
                    "--bucket", bucket,
                    "--prefix", prefix,
                ],
                env=dict(os.environ, **refreshed_credentials()),
            ),
            capture_stdout=True,
        )
        parsed_response = json.loads(raw_response)
        return [item['Key'] for item in parsed_response['Contents']]

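# Hypothetical usage sketch for list_s3_keys (bucket and prefix are illustrative only):
# the s3:// URL is split into bucket and key prefix, and the matching keys are returned.
def _example_list_s3_keys_usage():
    keys = list_s3_keys("s3://example-bucket/samples/123/results/")
    log.write("found %d keys" % len(keys))
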
def make_dirs(path: str):
    if not os.path.isdir(path):
        with log.log_context(context_name="command.make_dirs",
                             values={'path': path},
                             log_context_mode=log.LogContextMode.EXEC_LOG_EVENT):
            os.makedirs(path, exist_ok=True)

def move_file(src: str, dst: str):
    with log.log_context(context_name='command.move_file',
                         values={'src': src, 'dest': dst},
                         log_context_mode=log.LogContextMode.EXEC_LOG_EVENT):
        shutil.move(src, dst)

def _ensure_table_exists(self, conn):
    ''' Private: fail if the table does not exist. Called while self._db_conn is still None. '''
    self._assert_lock_held()
    with log.log_context("db_assert_table", {"db_path": self.db_path}):
        with conn:
            res = conn.execute(
                f"SELECT count(*) FROM sqlite_master WHERE type='table' AND name='{SQLITE_TABLE_NAME}';")
            table_exists = res.fetchone()[0] != 0
            assert table_exists, f"table {SQLITE_TABLE_NAME} doesn't exist in db {self.db_path}"

def rename(src: str, dst: str):
    with log.log_context(context_name='command.rename',
                         values={'src': src, 'dest': dst},
                         log_context_mode=log.LogContextMode.EXEC_LOG_EVENT):
        os.rename(src, dst)

def execute(
        command: Union[command_patterns.CommandPattern, str],
        progress_file: str = None,
        timeout: int = None,
        grace_period: int = None,
        capture_stdout: bool = False,
        merge_stderr: bool = False,
        log_context_mode: log.LogContextMode = log.LogContextMode.START_END_LOG_EVENTS) -> Union[str, None]:
    """Primary way to start external commands in subprocesses and handle
    execution with logging.
    """
    if not isinstance(command, command_patterns.CommandPattern):
        # Log a warning if the caller is still using the legacy string format.
        log.write(
            warning=True,
            message="Command parameter is using legacy type str. Use idseq_dag.util.command_patterns.",
            obj_data={"cmd": command, "type": type(command)})
        cmd = command_patterns.ShellScriptCommand(script=command, args=[])
    else:
        cmd = command

    with CommandTracker() as ct:
        log_values = {"cid": f"Command {ct.id}", "command": cmd.as_dict()}
        with log.log_context('command_execute', values=log_values, log_context_mode=log_context_mode) as lctx:
            with ProgressFile(progress_file):
                if timeout:
                    ct.timeout = timeout
                if grace_period:
                    ct.grace_period = grace_period
                if capture_stdout:
                    # Capture only stdout. Child stderr = parent stderr unless
                    # merge_stderr is specified. Child input = parent stdin.
                    ct.proc = cmd.open(
                        stdin=sys.stdin.fileno(),
                        stdout=subprocess.PIPE,
                        stderr=subprocess.STDOUT if merge_stderr else sys.stderr.fileno())
                    stdout, _ = ct.proc.communicate()
                else:
                    # Capture nothing. Child inherits parent stdin/stdout/stderr.
                    ct.proc = cmd.open()
                    ct.proc.wait()
                    stdout = None

                lctx.values.update({"returncode": ct.proc.returncode})

                if ct.proc.returncode:
                    raise subprocess.CalledProcessError(ct.proc.returncode, str(command), stdout)

                if capture_stdout:
                    return stdout

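# Hypothetical usage sketch for execute: prefer a command_patterns object over a raw
# shell string so arguments are passed safely; capture_stdout returns the child's output
# (depending on how cmd.open configures the pipe, it may still need decoding).
# The command shown ("ls" on an illustrative directory) is only an example.
def _example_execute_usage():
    output = execute(
        command_patterns.SingleCommand(cmd="ls", args=["-l", "/tmp"]),
        capture_stdout=True)
    return output
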
def thread_run(self):
    ''' Actually run the step. '''
    self.status = StepStatus.STARTED
    self.update_status_json_file("instantiated")
    v = {"step": self.name}
    with log.log_context("dag_step", v):
        with log.log_context("substep_wait_for_input_files", v):
            self.wait_for_input_files()
        with log.log_context("substep_validate_input_files", v):
            self.validate_input_files()
        # If an input file error was detected, stop execution.
        if self.input_file_error:
            log.write("Invalid input detected for step %s" % self.name)
            self.status = StepStatus.INVALID_INPUT
            self.update_status_json_file("user_errored")
            return
        with log.log_context("substep_run", v):
            self.update_status_json_file("running")
            self.run()
        with log.log_context("substep_validate", v):
            self.validate()
        with log.log_context("substep_save_progress", v):
            self.save_progress()
        with log.log_context("substep_save_counts", v):
            self.save_counts()
    self.upload_thread = threading.Thread(target=self.uploading_results)
    self.upload_thread.start()
    self.status = StepStatus.FINISHED
    self.update_status_json_file("finished_running")

def write_text_to_file(text: str, file_path: str):
    with log.log_context(context_name='command.write_text_to_file',
                         values={'path': file_path, 'text': text},
                         log_context_mode=log.LogContextMode.EXEC_LOG_EVENT):
        with open(file_path, "w") as f:
            print(text, file=f)

def chmod(path: str, mode: int):
    '''Execute a chmod operation. Parameter 'mode' must be in octal format. Ex: chmod('/tmp/test.txt', 0o400)'''
    with log.log_context(context_name='command.chmod',
                         values={'path': path, 'mode': oct(mode)},
                         log_context_mode=log.LogContextMode.EXEC_LOG_EVENT):
        os.chmod(path, mode)

def build_should_keep_filter(
    deuterostome_path,
    taxon_whitelist_path,
    taxon_blacklist_path
):
    # See also HOMO_SAPIENS_TAX_IDS in idseq-web
    taxids_to_remove = set(['9605', '9606'])

    if taxon_blacklist_path:
        with log.log_context("generate_taxon_count_json_from_m8", {"substep": "read_blacklist_into_set"}):
            taxids_to_remove.update(read_file_into_set(taxon_blacklist_path))

    if deuterostome_path:
        with log.log_context("generate_taxon_count_json_from_m8", {"substep": "read_file_into_set"}):
            taxids_to_remove.update(read_file_into_set(deuterostome_path))

    if taxon_whitelist_path:
        with log.log_context("generate_taxon_count_json_from_m8", {"substep": "read_whitelist_into_set"}):
            taxids_to_keep = read_file_into_set(taxon_whitelist_path)

    def is_blacklisted(hits: Iterable[str]):
        for taxid in hits:
            if int(taxid) >= 0 and taxid in taxids_to_remove:
                return True
        return False

    def is_whitelisted(hits: Iterable[str]):
        if not taxon_whitelist_path:
            return True
        for taxid in hits:
            if int(taxid) >= 0 and taxid in taxids_to_keep:
                return True
        return False

    def should_keep(hits: Iterable[str]):
        # In some places in the code taxids are ints rather than strings; that would
        # lead to a silent failure here, so it is worth the explicit check.
        non_strings = [h for h in hits if type(h) != str]
        assert not non_strings, f"should_keep received non-string inputs {non_strings}"
        return is_whitelisted(hits) and not is_blacklisted(hits)

    return should_keep

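# Hypothetical usage sketch for build_should_keep_filter: the returned should_keep
# callable takes an iterable of taxid strings (one lineage) and keeps it only if it
# passes the whitelist (when one is configured) and misses the blacklist/deuterostome
# sets. The file path and taxids below are illustrative only.
def _example_should_keep_usage():
    should_keep = build_should_keep_filter(
        deuterostome_path=None,
        taxon_whitelist_path=None,
        taxon_blacklist_path="/mnt/refs/taxon_blacklist.txt")
    # e.g. an E. coli species/genus/family lineage
    return should_keep(["562", "561", "543"])
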
def build_should_keep_filter(deuterostome_path, taxon_whitelist_path, taxon_blacklist_path):
    # See also HOMO_SAPIENS_TAX_IDS in idseq-web
    taxids_to_remove = set(['9605', '9606'])

    if taxon_blacklist_path:
        with log.log_context("generate_taxon_count_json_from_m8", {"substep": "read_blacklist_into_set"}):
            taxids_to_remove.update(read_file_into_set(taxon_blacklist_path))

    if deuterostome_path:
        with log.log_context("generate_taxon_count_json_from_m8", {"substep": "read_file_into_set"}):
            taxids_to_remove.update(read_file_into_set(deuterostome_path))

    if taxon_whitelist_path:
        with log.log_context("generate_taxon_count_json_from_m8", {"substep": "read_whitelist_into_set"}):
            taxids_to_keep = read_file_into_set(taxon_whitelist_path)

    def is_blacklisted(hits):
        for taxid in hits:
            if int(taxid) >= 0 and taxid in taxids_to_remove:
                return True
        return False

    def is_whitelisted(hits):
        if not taxon_whitelist_path:
            return True
        for taxid in hits:
            if int(taxid) >= 0 and taxid in taxids_to_keep:
                return True
        return False

    def should_keep(hits):
        return is_whitelisted(hits) and not is_blacklisted(hits)

    return should_keep

def _check_s3_presence(s3_path, allow_zero_byte_files):
    """True if s3_path exists. False otherwise."""
    with log.log_context(context_name="s3.check_s3_presence",
                         values={'s3_path': s3_path},
                         log_context_mode=log.LogContextMode.EXEC_LOG_EVENT) as lc:
        parsed_url = urlparse(s3_path, allow_fragments=False)
        bucket = parsed_url.netloc
        key = parsed_url.path.lstrip('/')

        try:
            o = boto3.resource('s3').Object(bucket, key)
            size = o.content_length
            lc.values['size'] = size
            exists = (allow_zero_byte_files and size >= 0) or (not allow_zero_byte_files and size > 0)
        except botocore.exceptions.ClientError as e:
            if e.response['Error']['Code'] == "404":
                exists = False
            else:
                # Something else has gone wrong.
                raise

        lc.values['exists'] = exists
        return exists

def _ensure_table_exists(self, conn):
    ''' Create a writable table if one doesn't exist. '''
    self._assert_lock_held()
    with log.log_context("db_assert_table", {"db_path": self.db_path}):
        with conn:
            conn.execute(
                f"CREATE TABLE IF NOT EXISTS {SQLITE_TABLE_NAME} (dict_key VARCHAR(255) PRIMARY KEY, dict_value text)")

def remove_file(file_path: str):
    with log.log_context(context_name='command.remove_file',
                         values={'path': file_path},
                         log_context_mode=log.LogContextMode.EXEC_LOG_EVENT):
        os.remove(file_path)

def touch(path, exist_ok=True):
    with log.log_context(context_name='command.touch',
                         values={'path': path},
                         log_context_mode=log.LogContextMode.EXEC_LOG_EVENT):
        pathlib.Path(path).touch(exist_ok=exist_ok)

def generate_taxon_count_json_from_m8(
        m8_file, hit_level_file, e_value_type, count_type, lineage_map_path,
        deuterostome_path, taxon_whitelist_path, taxon_blacklist_path,
        cdhit_cluster_sizes_path, output_json_file):
    # Parse through hit file and m8 input file and format a JSON file with
    # our desired attributes, including aggregated statistics.

    cdhit_cluster_sizes = load_cdhit_cluster_sizes(cdhit_cluster_sizes_path)

    should_keep = build_should_keep_filter(
        deuterostome_path, taxon_whitelist_path, taxon_blacklist_path)

    # Setup
    aggregation = {}
    with open(hit_level_file, 'r', encoding='utf-8') as hit_f, \
         open(m8_file, 'r', encoding='utf-8') as m8_f, \
         open_file_db_by_extension(lineage_map_path, IdSeqDictValue.VALUE_TYPE_ARRAY) as lineage_map:  # noqa
        # Lines in m8_file and hit_level_file correspond (same read_id)
        hit_line = hit_f.readline()
        m8_line = m8_f.readline()
        num_ranks = len(lineage.NULL_LINEAGE)
        # See https://en.wikipedia.org/wiki/Double-precision_floating-point_format
        MIN_NORMAL_POSITIVE_DOUBLE = 2.0**-1022

        with log.log_context("generate_taxon_count_json_from_m8", {"substep": "loop_1"}):
            while hit_line and m8_line:
                # Retrieve data values from files
                hit_line_columns = hit_line.rstrip("\n").split("\t")
                read_id = hit_line_columns[0]
                hit_level = hit_line_columns[1]
                hit_taxid = hit_line_columns[2]
                if int(hit_level) < 0:
                    # Skip negative levels and continue
                    hit_line = hit_f.readline()
                    m8_line = m8_f.readline()
                    continue

                # m8 files correspond to BLAST tabular output format 6:
                # Columns: read_id | _ref_id | percent_identity | alignment_length...
                #
                #   * read_id = query (e.g., gene) sequence id
                #   * _ref_id = subject (e.g., reference genome) sequence id
                #   * percent_identity = percentage of identical matches
                #   * alignment_length = length of the alignments
                #   * e_value = the expect value
                #
                # See:
                #   * http://www.metagenomics.wiki/tools/blast/blastn-output-format-6
                #   * http://www.metagenomics.wiki/tools/blast/evalue

                m8_line_columns = m8_line.split("\t")
                msg = "read_ids in %s and %s do not match: %s vs. %s" % (
                    os.path.basename(m8_file), os.path.basename(hit_level_file),
                    m8_line_columns[0], hit_line_columns[0])
                assert m8_line_columns[0] == hit_line_columns[0], msg
                percent_identity = float(m8_line_columns[2])
                alignment_length = float(m8_line_columns[3])
                e_value = float(m8_line_columns[10])

                # These have been filtered out before the creation of m8_f and hit_f
                assert alignment_length > 0
                assert -0.25 < percent_identity < 100.25
                assert e_value == e_value
                if e_value_type != 'log10':
                    # e_value could be 0 when large contigs are mapped
                    if e_value <= MIN_NORMAL_POSITIVE_DOUBLE:
                        e_value = MIN_NORMAL_POSITIVE_DOUBLE
                    e_value = math.log10(e_value)

                # Retrieve the taxon lineage and mark meaningless calls with fake
                # taxids.
                hit_taxids_all_levels = lineage_map.get(
                    hit_taxid, lineage.NULL_LINEAGE)
                cleaned_hit_taxids_all_levels = lineage.validate_taxid_lineage(
                    hit_taxids_all_levels, hit_taxid, hit_level)
                assert num_ranks == len(cleaned_hit_taxids_all_levels)

                if should_keep(cleaned_hit_taxids_all_levels):
                    # Aggregate each level and collect statistics
                    agg_key = tuple(cleaned_hit_taxids_all_levels)
                    while agg_key:
                        agg_bucket = aggregation.get(agg_key)
                        if not agg_bucket:
                            agg_bucket = {
                                'nonunique_count': 0,
                                'unique_count': 0,
                                'sum_percent_identity': 0.0,
                                'sum_alignment_length': 0.0,
                                'sum_e_value': 0.0
                            }
                            aggregation[agg_key] = agg_bucket
                        agg_bucket['nonunique_count'] += get_read_cluster_size(
                            cdhit_cluster_sizes, read_id)
                        agg_bucket['unique_count'] += 1
                        agg_bucket['sum_percent_identity'] += percent_identity
                        agg_bucket['sum_alignment_length'] += alignment_length
                        agg_bucket['sum_e_value'] += e_value
                        # Chomp off the lowest rank as we aggregate up the tree
                        agg_key = agg_key[1:]

                hit_line = hit_f.readline()
                m8_line = m8_f.readline()

    # Produce the final output
    taxon_counts_attributes = []
    with log.log_context("generate_taxon_count_json_from_m8", {"substep": "loop_2"}):
        for agg_key, agg_bucket in aggregation.items():
            unique_count = agg_bucket['unique_count']
            nonunique_count = agg_bucket['nonunique_count']
            tax_level = num_ranks - len(agg_key) + 1
            # TODO: Extend taxonomic ranks as indicated on the commented out lines.
            taxon_counts_attributes.append({
                "tax_id": agg_key[0],
                "tax_level": tax_level,
                # 'species_taxid' : agg_key[tax_level - 1] if tax_level == 1 else "-100",
                'genus_taxid': agg_key[2 - tax_level] if tax_level <= 2 else "-200",
                'family_taxid': agg_key[3 - tax_level] if tax_level <= 3 else "-300",
                # 'order_taxid' : agg_key[4 - tax_level] if tax_level <= 4 else "-400",
                # 'class_taxid' : agg_key[5 - tax_level] if tax_level <= 5 else "-500",
                # 'phyllum_taxid' : agg_key[6 - tax_level] if tax_level <= 6 else "-600",
                # 'kingdom_taxid' : agg_key[7 - tax_level] if tax_level <= 7 else "-700",
                # 'domain_taxid' : agg_key[8 - tax_level] if tax_level <= 8 else "-800",
                "count":  # this field will be consumed by the webapp
                    nonunique_count if READ_COUNTING_MODE == ReadCountingMode.COUNT_ALL
                    else unique_count,
                "nonunique_count": nonunique_count,
                "unique_count": unique_count,
                "dcr": nonunique_count / unique_count,
                "percent_identity": agg_bucket['sum_percent_identity'] / unique_count,
                "alignment_length": agg_bucket['sum_alignment_length'] / unique_count,
                "e_value": agg_bucket['sum_e_value'] / unique_count,
                "count_type": count_type
            })

    output_dict = {
        "pipeline_output": {
            "taxon_counts_attributes": taxon_counts_attributes
        }
    }
    with log.log_context(
        "generate_taxon_count_json_from_m8",
        {"substep": "json_dump", "output_json_file": output_json_file}
    ):
        with open(output_json_file, 'w') as outf:
            json.dump(output_dict, outf)
            outf.flush()

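# For reference, the JSON written above has the shape sketched below. The values are
# illustrative only (a species-level row for E. coli, taxid 562); the "count" field is
# either nonunique_count or unique_count depending on READ_COUNTING_MODE, and "e_value"
# is the mean log10 e-value when the input was not already log10.
#
# {
#   "pipeline_output": {
#     "taxon_counts_attributes": [
#       {
#         "tax_id": "562", "tax_level": 1,
#         "genus_taxid": "561", "family_taxid": "543",
#         "count": 12, "nonunique_count": 12, "unique_count": 8, "dcr": 1.5,
#         "percent_identity": 98.2, "alignment_length": 101.0, "e_value": -42.7,
#         "count_type": "NT"
#       }
#     ]
#   }
# }
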
def generate_taxon_count_json_from_m8(
        blastn_6_path, hit_level_path, count_type, lineage_map_path,
        deuterostome_path, taxon_whitelist_path, taxon_blacklist_path,
        duplicate_cluster_sizes_path, output_json_file):
    # Parse through hit file and m8 input file and format a JSON file with
    # our desired attributes, including aggregated statistics.

    duplicate_cluster_sizes = load_duplicate_cluster_sizes(duplicate_cluster_sizes_path)

    should_keep = build_should_keep_filter(
        deuterostome_path, taxon_whitelist_path, taxon_blacklist_path)

    # Setup
    aggregation = {}
    with open(hit_level_path) as hit_level_f, \
         open(blastn_6_path) as blastn_6_f, \
         open_file_db_by_extension(lineage_map_path) as lineage_map:

        num_ranks = len(lineage.NULL_LINEAGE)
        # See https://en.wikipedia.org/wiki/Double-precision_floating-point_format
        MIN_NORMAL_POSITIVE_DOUBLE = 2.0**-1022

        with log.log_context("generate_taxon_count_json_from_m8", {"substep": "loop_1"}):
            # Lines in m8_file and hit_level_file correspond (same read_id)
            for hit_row, blastn_6_row in zip(HitSummaryMergedReader(hit_level_f),
                                             BlastnOutput6NTRerankedReader(blastn_6_f)):
                # Retrieve data values from files
                read_id = hit_row["read_id"]
                hit_level = hit_row["level"]
                hit_taxid = hit_row["taxid"]
                if hit_level < 0:
                    log.write('hit_level < 0', debug=True)
                hit_source_count_type = hit_row.get("source_count_type")

                msg = "read_ids in %s and %s do not match: %s vs. %s" % (
                    os.path.basename(blastn_6_path), os.path.basename(hit_level_path),
                    blastn_6_row["qseqid"], read_id)
                assert blastn_6_row["qseqid"] == read_id, msg
                percent_identity = blastn_6_row["pident"]
                alignment_length = blastn_6_row["length"]

                if count_type == 'merged_NT_NR' and hit_source_count_type == 'NR':
                    # NOTE: At the moment of the change, applied ONLY in the scope of the prototype of the NT/NR consensus project.
                    # Protein alignments (NR) are done at the amino acid level. Each amino acid is composed of 3 nucleotides.
                    # To make alignment length values comparable across NT and NR alignments (for combined statistics),
                    # the NR alignment lengths are multiplied by 3.
                    alignment_length *= 3

                e_value = blastn_6_row["evalue"]

                # These have been filtered out before the creation of blastn_6_f and hit_level_f
                assert alignment_length > 0
                assert -0.25 < percent_identity < 100.25
                assert e_value == e_value

                if count_type == "NT" or hit_source_count_type == "NT":
                    # e_value could be 0 when large contigs are mapped
                    if e_value <= MIN_NORMAL_POSITIVE_DOUBLE:
                        e_value = MIN_NORMAL_POSITIVE_DOUBLE
                    e_value = math.log10(e_value)

                # Retrieve the taxon lineage and mark meaningless calls with fake
                # taxids.
                # lineage_map expects string ids
                hit_taxids_all_levels = lineage_map.get(
                    str(hit_taxid), lineage.NULL_LINEAGE)
                cleaned_hit_taxids_all_levels = lineage.validate_taxid_lineage(
                    hit_taxids_all_levels, hit_taxid, hit_level)
                assert num_ranks == len(cleaned_hit_taxids_all_levels)

                if should_keep(cleaned_hit_taxids_all_levels):
                    # Aggregate each level and collect statistics
                    agg_key = tuple(cleaned_hit_taxids_all_levels)
                    while agg_key:
                        agg_bucket = aggregation.get(agg_key)
                        if not agg_bucket:
                            agg_bucket = {
                                'nonunique_count': 0,
                                'unique_count': 0,
                                'sum_percent_identity': 0.0,
                                'sum_alignment_length': 0.0,
                                'sum_e_value': 0.0
                            }
                            aggregation[agg_key] = agg_bucket
                        agg_bucket['nonunique_count'] += get_read_cluster_size(
                            duplicate_cluster_sizes, read_id)
                        agg_bucket['unique_count'] += 1
                        agg_bucket['sum_percent_identity'] += percent_identity
                        agg_bucket['sum_alignment_length'] += alignment_length
                        agg_bucket['sum_e_value'] += e_value
                        if hit_source_count_type:
                            agg_bucket.setdefault('source_count_type', set()).add(hit_source_count_type)
                        # Chomp off the lowest rank as we aggregate up the tree
                        agg_key = agg_key[1:]

    # Produce the final output
    taxon_counts_attributes = []
    with log.log_context("generate_taxon_count_json_from_m8", {"substep": "loop_2"}):
        for agg_key, agg_bucket in aggregation.items():
            unique_count = agg_bucket['unique_count']
            nonunique_count = agg_bucket['nonunique_count']
            tax_level = num_ranks - len(agg_key) + 1
            # TODO: Extend taxonomic ranks as indicated on the commented out lines.
            taxon_counts_row = {
                "tax_id": agg_key[0],
                "tax_level": tax_level,
                # 'species_taxid' : agg_key[tax_level - 1] if tax_level == 1 else "-100",
                'genus_taxid': agg_key[2 - tax_level] if tax_level <= 2 else "-200",
                'family_taxid': agg_key[3 - tax_level] if tax_level <= 3 else "-300",
                # 'order_taxid' : agg_key[4 - tax_level] if tax_level <= 4 else "-400",
                # 'class_taxid' : agg_key[5 - tax_level] if tax_level <= 5 else "-500",
                # 'phyllum_taxid' : agg_key[6 - tax_level] if tax_level <= 6 else "-600",
                # 'kingdom_taxid' : agg_key[7 - tax_level] if tax_level <= 7 else "-700",
                # 'domain_taxid' : agg_key[8 - tax_level] if tax_level <= 8 else "-800",
                "count":  # this field will be consumed by the webapp
                    nonunique_count if READ_COUNTING_MODE == ReadCountingMode.COUNT_ALL
                    else unique_count,
                "nonunique_count": nonunique_count,
                "unique_count": unique_count,
                "dcr": nonunique_count / unique_count,
                "percent_identity": agg_bucket['sum_percent_identity'] / unique_count,
                "alignment_length": agg_bucket['sum_alignment_length'] / unique_count,
                "e_value": agg_bucket['sum_e_value'] / unique_count,
                "count_type": count_type
            }
            if agg_bucket.get('source_count_type'):
                taxon_counts_row['source_count_type'] = list(agg_bucket['source_count_type'])
            taxon_counts_attributes.append(taxon_counts_row)

    output_dict = {
        "pipeline_output": {
            "taxon_counts_attributes": taxon_counts_attributes
        }
    }
    with log.log_context(
        "generate_taxon_count_json_from_m8",
        {"substep": "json_dump", "output_json_file": output_json_file}
    ):
        with open(output_json_file, 'w') as outf:
            json.dump(output_dict, outf)
            outf.flush()

def _connect(self):
    self._assert_lock_held()
    uri_db_path = self._uri_base()
    with log.log_context("db_open", {"db_path": self.db_path, "uri_db_path": uri_db_path}):
        return sqlite3.connect(uri_db_path, uri=True)

def run(self):
    '''
        1. summarize hits
        2. build blast index
        3. blast assembled contigs to the index
        4. update the summary
    '''
    _align_m8, deduped_m8, hit_summary, orig_counts_with_dcr = self.input_files_local[0]
    assembled_contig, _assembled_scaffold, bowtie_sam, _contig_stats = self.input_files_local[1]
    reference_fasta, = self.input_files_local[2]
    duplicate_cluster_sizes_path, = self.input_files_local[3]

    blast_m8, refined_m8, refined_hit_summary, refined_counts_with_dcr, contig_summary_json, blast_top_m8 = self.output_files_local()
    assert refined_counts_with_dcr.endswith("with_dcr.json"), self.output_files_local()
    assert orig_counts_with_dcr.endswith("with_dcr.json"), self.output_files_local()
    db_type = self.additional_attributes["db_type"]
    no_assembled_results = (
        os.path.getsize(assembled_contig) < MIN_ASSEMBLED_CONTIG_SIZE or
        os.path.getsize(reference_fasta) < MIN_REF_FASTA_SIZE)

    if no_assembled_results:
        # No assembled results or refseq fasta available.
        # Create empty output files.
        command.write_text_to_file(' ', blast_m8)
        command.write_text_to_file(' ', blast_top_m8)
        command.copy_file(deduped_m8, refined_m8)
        command.copy_file(hit_summary, refined_hit_summary)
        command.copy_file(orig_counts_with_dcr, refined_counts_with_dcr)
        command.write_text_to_file('[]', contig_summary_json)
        return  # return in the middle of the function

    (read_dict, accession_dict, _selected_genera) = m8.summarize_hits(hit_summary)
    PipelineStepBlastContigs.run_blast(db_type, blast_m8, assembled_contig, reference_fasta, blast_top_m8)
    read2contig = {}
    PipelineStepRunAssembly.generate_info_from_sam(bowtie_sam, read2contig, duplicate_cluster_sizes_path)

    (updated_read_dict, read2blastm8, contig2lineage, added_reads) = self.update_read_dict(
        read2contig, blast_top_m8, read_dict, accession_dict, db_type)
    self.generate_m8_and_hit_summary(updated_read_dict, added_reads, read2blastm8,
                                     hit_summary, deduped_m8,
                                     refined_hit_summary, refined_m8)

    # Generating taxon counts based on updated results
    lineage_db = s3.fetch_reference(
        self.additional_files["lineage_db"],
        self.ref_dir_local,
        allow_s3mi=False)  # Too small to waste s3mi

    deuterostome_db = None
    if self.additional_files.get("deuterostome_db"):
        deuterostome_db = s3.fetch_reference(self.additional_files["deuterostome_db"],
                                             self.ref_dir_local,
                                             allow_s3mi=False)  # Too small for s3mi

    blacklist_s3_file = self.additional_files.get('taxon_blacklist', DEFAULT_BLACKLIST_S3)
    taxon_blacklist = s3.fetch_reference(blacklist_s3_file, self.ref_dir_local)

    taxon_whitelist = None
    if self.additional_attributes.get("use_taxon_whitelist"):
        taxon_whitelist = s3.fetch_reference(self.additional_files.get("taxon_whitelist", DEFAULT_WHITELIST_S3),
                                             self.ref_dir_local)

    with TraceLock("PipelineStepBlastContigs-CYA", PipelineStepBlastContigs.cya_lock, debug=False):
        with log.log_context("PipelineStepBlastContigs",
                             {"substep": "generate_taxon_count_json_from_m8",
                              "db_type": db_type,
                              "refined_counts": refined_counts_with_dcr}):
            m8.generate_taxon_count_json_from_m8(refined_m8, refined_hit_summary,
                                                 db_type.upper(), lineage_db, deuterostome_db,
                                                 taxon_whitelist, taxon_blacklist,
                                                 duplicate_cluster_sizes_path,
                                                 refined_counts_with_dcr)

    # generate contig stats at genus/species level
    with log.log_context("PipelineStepBlastContigs", {"substep": "generate_taxon_summary"}):
        contig_taxon_summary = self.generate_taxon_summary(
            read2contig,
            contig2lineage,
            updated_read_dict,
            added_reads,
            db_type,
            duplicate_cluster_sizes_path,
            # same filter as applied in generate_taxon_count_json_from_m8
            m8.build_should_keep_filter(deuterostome_db, taxon_whitelist, taxon_blacklist)
        )

    with log.log_context("PipelineStepBlastContigs",
                         {"substep": "generate_taxon_summary_json",
                          "contig_summary_json": contig_summary_json}):
        with open(contig_summary_json, 'w') as contig_outf:
            json.dump(contig_taxon_summary, contig_outf)

    # Upload additional file
    contig2lineage_json = os.path.join(os.path.dirname(contig_summary_json), f"contig2lineage.{db_type}.json")
    with log.log_context("PipelineStepBlastContigs",
                         {"substep": "contig2lineage_json",
                          "contig2lineage_json": contig2lineage_json}):
        with open(contig2lineage_json, 'w') as c2lf:
            json.dump(contig2lineage, c2lf)

    self.additional_output_files_hidden.append(contig2lineage_json)

def run(self):
    '''
        1. summarize hits
        2. build blast index
        3. blast assembled contigs to the index
        4. update the summary
    '''
    (_align_m8, deduped_m8, hit_summary, orig_counts) = self.input_files_local[0]
    assembled_contig, _assembled_scaffold, bowtie_sam, _contig_stats = self.input_files_local[1]
    reference_fasta = self.input_files_local[2][0]

    (blast_m8, refined_m8, refined_hit_summary, refined_counts,
     contig_summary_json, blast_top_m8) = self.output_files_local()
    db_type = self.additional_attributes["db_type"]

    if os.path.getsize(assembled_contig) < MIN_ASSEMBLED_CONTIG_SIZE or \
       os.path.getsize(reference_fasta) < MIN_REF_FASTA_SIZE:
        # No assembled results or refseq fasta available.
        # Create empty output files.
        command.write_text_to_file(' ', blast_m8)
        command.write_text_to_file(' ', blast_top_m8)
        command.copy_file(deduped_m8, refined_m8)
        command.copy_file(hit_summary, refined_hit_summary)
        command.copy_file(orig_counts, refined_counts)
        command.write_text_to_file('[]', contig_summary_json)
        return  # return in the middle of the function

    (read_dict, accession_dict, _selected_genera) = m8.summarize_hits(hit_summary)
    PipelineStepBlastContigs.run_blast(db_type, blast_m8, assembled_contig,
                                       reference_fasta, blast_top_m8)
    read2contig = {}
    contig_stats = defaultdict(int)
    PipelineStepRunAssembly.generate_info_from_sam(bowtie_sam, read2contig, contig_stats)

    (updated_read_dict, read2blastm8, contig2lineage,
     added_reads) = self.update_read_dict(read2contig, blast_top_m8, read_dict,
                                          accession_dict, db_type)
    self.generate_m8_and_hit_summary(updated_read_dict, added_reads,
                                     read2blastm8, hit_summary, deduped_m8,
                                     refined_hit_summary, refined_m8)

    # Generating taxon counts based on updated results
    lineage_db = s3.fetch_reference(
        self.additional_files["lineage_db"],
        self.ref_dir_local,
        allow_s3mi=False)  # Too small to waste s3mi

    deuterostome_db = None
    evalue_type = 'raw'
    if self.additional_files.get("deuterostome_db"):
        deuterostome_db = s3.fetch_reference(
            self.additional_files["deuterostome_db"],
            self.ref_dir_local,
            allow_s3mi=False)  # Too small for s3mi

    with TraceLock("PipelineStepBlastContigs-CYA", PipelineStepBlastContigs.cya_lock, debug=False):
        with log.log_context(
                "PipelineStepBlastContigs", {
                    "substep": "generate_taxon_count_json_from_m8",
                    "db_type": db_type,
                    "refined_counts": refined_counts
                }):
            m8.generate_taxon_count_json_from_m8(
                refined_m8, refined_hit_summary, evalue_type, db_type.upper(),
                lineage_db, deuterostome_db, refined_counts)

    # generate contig stats at genus/species level
    with log.log_context("PipelineStepBlastContigs", {"substep": "generate_taxon_summary"}):
        contig_taxon_summary = self.generate_taxon_summary(
            read2contig, contig2lineage, updated_read_dict, added_reads, db_type)

    with log.log_context(
            "PipelineStepBlastContigs", {
                "substep": "generate_taxon_summary_json",
                "contig_summary_json": contig_summary_json
            }):
        with open(contig_summary_json, 'w') as contig_outf:
            json.dump(contig_taxon_summary, contig_outf)

    # Upload additional file
    contig2lineage_json = os.path.join(
        os.path.dirname(contig_summary_json), f"contig2lineage.{db_type}.json")
    with log.log_context(
            "PipelineStepBlastContigs", {
                "substep": "contig2lineage_json",
                "contig2lineage_json": contig2lineage_json
            }):
        with open(contig2lineage_json, 'w') as c2lf:
            json.dump(contig2lineage, c2lf)

    self.additional_output_files_hidden.append(contig2lineage_json)