Example #1
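The listing below is a fragment; the imports and class context it relies on are not shown. A minimal sketch of what they would look like follows (the class name ContaminationRemover, the Utilities helper module, and the _NCBI_MIN_SEQ_LENGTH value are assumptions inferred from the method body, not given in the original listing):

from copy import deepcopy    # used further below when post-processing assembly records
from Bio import SeqIO        # Biopython FASTA reader/writer
import pandas as pd          # used for the final statistics table

# Project-specific helpers (2D table loading, empty-value filtering, sequence
# deduplication); the import path is an assumption
import Utilities


class ContaminationRemover:  # hypothetical class name, not given in the fragment
    # Assumed minimum contig length accepted by NCBI for assembly submission
    _NCBI_MIN_SEQ_LENGTH = 200
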
    def __init__(self, fna_file, contamination_report):
        self.sequence_file = fna_file
        self.contamination_file = contamination_report
        self.contamination_lines = Utilities.load_2d_array(
            self.contamination_file)

        exclude_index = self.find_index(self.contamination_lines, ["Exclude:"])
        trim_index = self.find_index(self.contamination_lines, ["Trim:"])
        duplicated_index = self.find_index(self.contamination_lines,
                                           ["Duplicated:"])
        # The report lists issues in the order: Exclude, Trim, Duplicated;
        # each section title is followed by a column-header line, hence the + 2 slice offsets below
        headers_to_remove = []
        if exclude_index:
            exclude_lines = []
            if trim_index:
                exclude_lines = self.contamination_lines[exclude_index +
                                                         2:trim_index]
            elif duplicated_index:
                exclude_lines = self.contamination_lines[exclude_index +
                                                         2:duplicated_index]
            else:
                exclude_lines = self.contamination_lines[exclude_index + 2:]
            exclude_lines = Utilities.remove_empty_values(
                [i[0] for i in exclude_lines])
            headers_to_remove.extend(exclude_lines)

        if trim_index:
            trim_lines_processed = dict()
            if duplicated_index:
                trim_lines_raw = self.contamination_lines[trim_index +
                                                          2:duplicated_index]
            else:
                trim_lines_raw = self.contamination_lines[trim_index + 2:]
            for trim_line_raw in Utilities.remove_empty_values(trim_lines_raw):
                for trim_span in Utilities.remove_empty_values(
                        trim_line_raw[2].split(",")):
                    trim_indices = [
                        int(i.strip()) for i in trim_span.split("..")
                    ]
                    # Reported sequence positions appear to be one-based,
                    # so shift the span start to a zero-based index
                    trim_indices[0] -= 1
                    trim_lines_processed[trim_line_raw[0]] = trim_indices
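                    # NOTE: if a header lists several trim spans, only the last
                    # span is kept here; this is acceptable because the header
                    # is removed entirely below instead of being trimmed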
            headers_to_remove.extend(list(trim_lines_processed.keys()))

        if duplicated_index:
            processed_duplicated_lines = list()
            duplicated_lines = self.contamination_lines[duplicated_index + 2:]
            # Within each group of duplicates, only the first listed sequence
            # is marked for removal; the remaining copies are kept
            for duplicated_line in duplicated_lines:
                duplicated_str = duplicated_line[0]
                if duplicated_str.startswith("# "):
                    continue
                processed_duplicated_lines.append(
                    duplicated_str.strip().split(" ")[0])
            headers_to_remove.extend(processed_duplicated_lines)

        self.seq_records = list(SeqIO.parse(self.sequence_file, "fasta"))
        print(
            f"Imported {len(self.seq_records)} raw records from '{self.sequence_file}'"
        )
        headers_to_remove = sorted(
            set(Utilities.remove_empty_values(headers_to_remove)))
        print("{} headers were marked to remove: '{}'".format(
            len(headers_to_remove), "', '".join(headers_to_remove)))
        # After NCBI processing, some spans from the "Trim" entry were moved
        # into the "Exclude" entry, so sequences flagged for trimming are
        # removed outright instead of being trimmed
        out_records = []
        for record_raw in Utilities.remove_duplicate_sequences(
                self.seq_records):
            record_id = record_raw.id.split(" ")[0].strip()
            if record_id not in headers_to_remove:
                out_records.append(record_raw)
        self.valid_records = [
            i for i in out_records if len(i) >= self._NCBI_MIN_SEQ_LENGTH
        ]
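
A hypothetical usage example for the class above; the input and output file names are placeholders:

if __name__ == "__main__":
    # "assembly.fna" and "Contamination.txt" are placeholder file names
    remover = ContaminationRemover("assembly.fna", "Contamination.txt")
    # Write the decontaminated, length-filtered records back to FASTA
    SeqIO.write(remover.valid_records, "assembly_decontaminated.fna", "fasta")
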
                    "{}_assembly_contigs_number_valid".format(
                        assembly_type)] = len(seq_records_valid)
                assemblies_annotation["{}_assembly_bp_valid".format(
                    assembly_type)] = sum([len(i) for i in seq_records_valid])
                for seq_record_raw in seq_records_valid:
                    # Example `contigs.fasta` header:
                    # '>NODE_1_length_42950_cov_12.6852_component_0'
                    # contig_number = int(Utilities.safe_findall("^NODE_([0-9]+)", seq_record_raw.id))
                    # Processed FASTA header example:
                    # >contig02 [organism=Clostridium difficile] [strain=ABDC] [plasmid-name=pABDC1] [topology=circular] [completeness=complete]
                    seq_record_processed = deepcopy(seq_record_raw)
                    if assembly_type == "plasmid":
                        plasmid_counter += 1
                        seq_record_processed.description += " PLASMID"
                    seq_records_processed.append(seq_record_processed)
    seq_records_processed = Utilities.remove_duplicate_sequences(
        seq_records_processed)
    for idx, seq_record_processed in enumerate(seq_records_processed):
        seq_record_processed.id = "contig{a:03d} [organism={b}] [strain={c}_{d}]".format(
            a=idx + 1, b=ORGANISM, c=ISOLATE_PREFIX, d=sample_number)
        if seq_record_processed.description.endswith(" PLASMID"):
            plasmid_counter += 1
            seq_record_processed.description = "[plasmid-name=unnamed{0:02d}]".format(
                plasmid_counter)
        else:
            seq_record_processed.description = ""
    assemblies_annotations.append(assemblies_annotation)
    #
    SeqIO.write(seq_records_processed, assembly_target_file, "fasta")

INDEX_COL_NAME = "sample_name"
assemblies_statistics_df = pd.DataFrame(assemblies_annotations).set_index(
    INDEX_COL_NAME)
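
For reference, a minimal self-contained sketch of the pattern used for the statistics table: each sample contributes one annotation dict, the dicts are collected into a list, and pandas turns that list into a table indexed by sample_name. The sample names, the counts, and the "genome" assembly type in the column name are purely illustrative:

import pandas as pd

# Illustrative values only; column names follow the
# "{assembly_type}_assembly_contigs_number_valid" pattern from the code above
example_annotations = [
    {"sample_name": "sample_01", "genome_assembly_contigs_number_valid": 128},
    {"sample_name": "sample_02", "genome_assembly_contigs_number_valid": 97},
]
example_df = pd.DataFrame(example_annotations).set_index("sample_name")
# The resulting table can then be exported, e.g. as a TSV file
example_df.to_csv("assemblies_statistics.tsv", sep="\t")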