def __init__(self, fna_file, contamination_report):
    """Load a FASTA file and drop the records flagged by a contamination report.

    The report is read with ``Utilities.load_2d_array`` and scanned for the
    section markers "Exclude:", "Trim:" and "Duplicated:" (expected in that
    order). Every sequence header listed in any of these sections is marked
    for removal; trim spans are NOT applied, the whole record is dropped.

    Sets the attributes ``sequence_file``, ``contamination_file``,
    ``contamination_lines``, ``seq_records`` (all parsed FASTA records) and
    ``valid_records`` (deduplicated records that are not flagged and are at
    least ``self._NCBI_MIN_SEQ_LENGTH`` long).

    :param fna_file: FASTA file handed to ``SeqIO.parse``.
    :param contamination_report: report file handed to
        ``Utilities.load_2d_array``.
    :raises ValueError: if a non-empty trim span contains a non-integer
        position.
    """
    self.sequence_file = fna_file
    self.contamination_file = contamination_report
    self.contamination_lines = Utilities.load_2d_array(
        self.contamination_file)
    report_lines = self.contamination_lines

    # A section counts as present only when its index is truthy.
    # NOTE(review): this also skips a marker found at index 0 -- confirm
    # what `find_index` returns when the marker is missing.
    exclude_index = self.find_index(report_lines, ["Exclude:"])
    trim_index = self.find_index(report_lines, ["Trim:"])
    duplicated_index = self.find_index(report_lines, ["Duplicated:"])

    # Issue lines order: exclude, trim, duplicated.
    # Each section body starts two lines after its marker (presumably the
    # marker line plus a column-header line -- verify against the report
    # format) and runs up to the next present section or the end of file.
    headers_to_remove = []

    if exclude_index:
        exclude_end = trim_index or duplicated_index or None
        exclude_lines = report_lines[exclude_index + 2:exclude_end]
        headers_to_remove.extend(
            Utilities.remove_empty_values([i[0] for i in exclude_lines]))

    if trim_index:
        trim_end = duplicated_index or None
        trim_lines = report_lines[trim_index + 2:trim_end]
        for trim_line in Utilities.remove_empty_values(trim_lines):
            has_span = False
            for trim_span in Utilities.remove_empty_values(
                    trim_line[2].split(",")):
                # The span positions are parsed only to validate them, so
                # a malformed report still fails loudly; the values are
                # not needed because flagged records are removed whole.
                for position in trim_span.split(".."):
                    int(position.strip())
                has_span = True
            # A header is flagged only if it has at least one non-empty span.
            if has_span:
                headers_to_remove.append(trim_line[0])

    if duplicated_index:
        for duplicated_line in report_lines[duplicated_index + 2:]:
            duplicated_str = duplicated_line[0]
            if duplicated_str.startswith("# "):
                continue
            # Removing only the first occurrence in Duplicates: just the
            # first space-separated identifier of the line is flagged.
            headers_to_remove.append(duplicated_str.strip().split(" ")[0])

    self.seq_records = list(SeqIO.parse(self.sequence_file, "fasta"))
    print(
        f"Imported {len(self.seq_records)} raw records from '{self.sequence_file}'"
    )

    headers_to_remove = sorted(
        set(Utilities.remove_empty_values(headers_to_remove)))
    print("{} headers were marked to remove: '{}'".format(
        len(headers_to_remove), "', '".join(headers_to_remove)))

    # Set lookup keeps the per-record membership test O(1).
    flagged_ids = set(headers_to_remove)
    # Some positions from the "Trim" entry after NCBI processing were moved
    # into the "Exclude" entry. The point is removing them instead of trimming.
    out_records = [
        record_raw
        for record_raw in Utilities.remove_duplicate_sequences(
            self.seq_records)
        if record_raw.id.split(" ")[0].strip() not in flagged_ids
    ]
    self.valid_records = [
        i for i in out_records if len(i) >= self._NCBI_MIN_SEQ_LENGTH
    ]
"{}_assembly_contigs_number_valid".format( assembly_type)] = len(seq_records_valid) assemblies_annotation["{}_assembly_bp_valid".format( assembly_type)] = sum([len(i) for i in seq_records_valid]) for seq_record_raw in seq_records_valid: # Example `contigs.fasta` header: # '>NODE_1_length_42950_cov_12.6852_component_0' # contig_number = int(Utilities.safe_findall("^NODE_([0-9]+)", seq_record_raw.id)) # Processed FASTA header example: # >contig02 [organism=Clostridium difficile] [strain=ABDC] [plasmid-name=pABDC1] [topology=circular] [completeness=complete] seq_record_processed = deepcopy(seq_record_raw) if assembly_type == "plasmid": plasmid_counter += 1 seq_record_processed.description += " PLASMID" seq_records_processed.append(seq_record_processed) seq_records_processed = Utilities.remove_duplicate_sequences( seq_records_processed) for idx, seq_record_processed in enumerate(seq_records_processed): seq_record_processed.id = "contig{a:03d} [organism={b}] [strain={c}_{d}]".format( a=idx + 1, b=ORGANISM, c=ISOLATE_PREFIX, d=sample_number) if seq_record_processed.description.endswith(" PLASMID"): plasmid_counter += 1 seq_record_processed.description = "[plasmid-name=unnamed{0:02d}]".format( plasmid_counter) else: seq_record_processed.description = "" assemblies_annotations.append(assemblies_annotation) # SeqIO.write(seq_records_processed, assembly_target_file, "fasta") INDEX_COL_NAME = "sample_name" assemblies_statistics_df = pd.DataFrame(assemblies_annotations).set_index(