def test_suitability(self):
    # Exercises gff_parser.check_gff_suitability() in three situations:
    # a renamed first record, the original id with features inserted, and
    # the record list with its first record dropped.
    mismatch_message = "GFF3 record IDs don't match sequence file record IDs"
    first_record = self.sequences[0]

    # a renamed first record is expected to trigger the ID mismatch error
    first_record.id = "NOT_CONTIG_1"
    with self.assertRaises(ValueError) as context:
        gff_parser.check_gff_suitability(self.config, self.sequences)
    self.assertIn(mismatch_message, str(context.exception))

    # doesn't test very much
    first_record.id = "CONTIG_1"
    gff_parser.run(first_record, self.single_entry, self.config)  # insert the features
    self.assertFalse(gff_parser.check_gff_suitability(self.config, self.sequences))

    # test force correlation
    self.sequences = self.sequences[1:]  # CONTIG_2
    self.assertTrue(gff_parser.check_gff_suitability(self.config, self.sequences))
def test_suitability(self):
    # Exercises gff_parser.check_gff_suitability() in three situations:
    # a renamed first record, the original id with features inserted, and
    # the record list with its first record dropped.
    mismatch_pattern = "GFF3 record IDs don't match sequence file record IDs"
    leading = self.sequences[0]

    # a renamed first record is expected to raise an input error
    leading.id = "NOT_CONTIG_1"
    with self.assertRaisesRegex(errors.AntismashInputError, mismatch_pattern):
        gff_parser.check_gff_suitability(self.config, self.sequences)

    # doesn't test very much
    leading.id = "CONTIG_1"
    gff_parser.run(leading, self.single_entry, self.config)  # insert the features
    suitability = gff_parser.check_gff_suitability(self.config, self.sequences)
    self.assertFalse(suitability)

    # test force correlation
    self.sequences = self.sequences[1:]  # CONTIG_2
    suitability = gff_parser.check_gff_suitability(self.config, self.sequences)
    self.assertTrue(suitability)
def test_suitability(self):
    # Exercises gff_parser.check_gff_suitability() against the GFF file in
    # three situations: a renamed first record, the original id with the
    # parsed CDS features attached, and the record list minus its first record.
    expected_error = "GFF3 record IDs don't match sequence file record IDs"
    record = self.sequences[0]

    # a renamed first record is expected to raise an input error
    record.id = "NOT_CONTIG_1"
    with self.assertRaisesRegex(errors.AntismashInputError, expected_error):
        gff_parser.check_gff_suitability(self.gff_file, self.sequences)

    # restore the id and attach whatever the parser returns for that record
    record.id = "CONTIG_1"
    parsed_features = gff_parser.run("CONTIG_1", self.single_entry, self.gff_file)
    record.features.extend(parsed_features)
    self.assertFalse(gff_parser.check_gff_suitability(self.gff_file, self.sequences))

    # test force correlation
    self.sequences = self.sequences[1:]  # CONTIG_2
    self.assertTrue(gff_parser.check_gff_suitability(self.gff_file, self.sequences))
def test_suitability(self):
    # Exercises gff_parser.check_gff_suitability() against the GFF file.
    # Only the mismatch case asserts anything explicitly; the remaining calls
    # discard their return values, so they only fail the test by raising.
    expected_error = "GFF3 record IDs don't match sequence file record IDs"
    record = self.sequences[0]

    # a renamed first record is expected to raise an input error
    record.id = "NOT_CONTIG_1"
    with self.assertRaisesRegex(errors.AntismashInputError, expected_error):
        gff_parser.check_gff_suitability(self.gff_file, self.sequences)

    # with the original id restored, the check should pass without raising
    record.id = "CONTIG_1"
    gff_parser.check_gff_suitability(self.gff_file, self.sequences)

    # test force correlation
    self.sequences = self.sequences[1:]  # CONTIG_2
    gff_parser.check_gff_suitability(self.gff_file, self.sequences)
def pre_process_sequences(sequences: List[Record], options: ConfigType,
                          genefinding: AntismashModule) -> List[Record]:
    """ Runs the preprocessing steps over all records, in order:

        - rejects inputs containing shotgun scaffolds
        - numbers the records (1-indexed)
        - sanitises and checks record content, unless results are being
          reused or sanitisation is disabled
        - marks records to skip by name filter, minimum length, and record limit
        - checks GFF3 suitability, if a GFF3 file is configured
        - ensures CDS features are complete and have no duplicate ids
          (same condition as the sanitising step)
        - makes record ids unique and valid

        Note: Record instances will be altered in-place.

        Arguments:
            sequences: the secmet.Record instances to process
            options: an antismash Config instance
            genefinding: the module to use for genefinding, must have
                         run_on_record() implemented

        Returns:
            A list of altered secmet.Record

        Raises:
            AntismashInputError: if the records contain shotgun scaffolds,
                                 a record has no id, or the GFF3 check fails
    """
    logging.debug("Preprocessing %d sequences", len(sequences))

    # WGS master or supercontig entries are rejected outright
    if records_contain_shotgun_scaffolds(sequences):
        raise AntismashInputError("incomplete whole genome shotgun records are not supported")

    for index, record in enumerate(sequences, start=1):  # 1-indexed
        record.record_index = index

    checking_required = not options.reuse_results and not options.skip_sanitisation

    # keep sequences as clean as possible and make sure they're valid
    if checking_required:
        logging.debug("Sanitising record sequences")
        if len(sequences) == 1:
            # a lone record is handled directly rather than via parallel_function
            sequences = [check_content(sanitise_sequence(sequences[0]))]
        else:
            sequences = parallel_function(sanitise_sequence, ([rec] for rec in sequences))
            sequences = parallel_function(check_content, ([rec] for rec in sequences))

    for record in sequences:
        if record.skip or not record.seq:
            logging.warning("Record %s has no sequence, skipping.", record.id)
        if not record.id:
            raise AntismashInputError("record has no name")

    # skip anything not matching the filter
    filter_records_by_name(sequences, options.limit_to_record)

    # now mark contigs below the minimum length as skipped
    minimum = options.minlength
    logging.debug("Removing sequences smaller than %d bases", minimum)
    for record in sequences:
        if len(record.seq) < minimum:
            record.skip = "smaller than minimum length (%d)" % minimum

    # make sure we don't waste weeks of runtime on huge records, unless requested by the user
    limit_hit = filter_records_by_count(sequences, options.limit)
    if limit_hit:
        logging.warning("Only analysing the first %d records (increase via --limit)", options.limit)
    update_config({"triggered_limit": limit_hit})

    # check GFF suitability, passing input errors through untouched and
    # wrapping anything else so callers only see AntismashInputError
    single_entry = False
    if options.genefinding_gff3:
        try:
            single_entry = gff_parser.check_gff_suitability(options, sequences)
        except AntismashInputError:
            raise
        except Exception as err:
            raise AntismashInputError("could not parse records from GFF3 file") from err

    if checking_required:
        # ensure CDS features have all relevant information
        logging.debug("Ensuring CDS features have all required information")
        assert hasattr(genefinding, "run_on_record")
        cds_checker = functools.partial(ensure_cds_info, single_entry, genefinding.run_on_record)
        sequences = parallel_function(cds_checker, ([rec] for rec in sequences))

        # check that no duplicate locus tags / gene IDs are present
        logging.debug("Ensuring CDS features do not have duplicate IDs")
        ensure_no_duplicate_cds_gene_ids(sequences)

    # ensure all records have unique names
    record_ids = {record.id for record in sequences}
    if len(record_ids) < len(sequences):
        # at least one id is repeated, so rebuild the set while renaming repeats
        record_ids = set()
        for record in sequences:
            if record.id in record_ids:
                record.original_id = record.id
                record.id = generate_unique_id(record.id, record_ids)[0]
            record_ids.add(record.id)
        assert len(record_ids) == len(sequences), "%d != %d" % (len(record_ids), len(sequences))

    # ensure all records have valid names
    for record in sequences:
        fix_record_name_id(record, record_ids)
    return sequences
def pre_process_sequences(sequences, options, genefinding) -> List[Record]:
    """ Runs the preprocessing steps over all records, in order:

        - rejects inputs containing shotgun scaffolds
        - numbers the records (0-indexed)
        - sanitises and checks record content, unless results are being
          reused or sanitisation is disabled
        - marks records to skip by record id filter, minimum length, and
          record limit
        - checks GFF3 suitability, if a GFF3 file is configured
        - ensures CDS features are complete and have no duplicate ids
          (same condition as the sanitising step)
        - makes record ids unique and valid

        Note: Record instances will be altered in-place.

        Arguments:
            sequences: the secmet.Record instances to process
            options: an antismash Config instance
            genefinding: the module to use for genefinding, must have
                         run_on_record() implemented

        Returns:
            A list of altered secmet.Record

        Raises:
            RuntimeError: if the records contain shotgun scaffolds
            ValueError: if no record matched the record id filter
    """
    logging.debug("Preprocessing %d sequences", len(sequences))

    # WGS master or supercontig entries are rejected outright
    if records_contain_shotgun_scaffolds(sequences):
        raise RuntimeError("Incomplete whole genome shotgun records are not supported")

    for index, record in enumerate(sequences):
        record.record_index = index

    checking_required = not options.reuse_results and not options.skip_sanitisation

    # keep sequences as clean as possible and make sure they're valid
    if checking_required:
        logging.debug("Sanitising record sequences")
        if len(sequences) == 1:
            # a lone record is handled directly rather than via parallel_function
            sequences = [check_content(sanitise_sequence(sequences[0]))]
        else:
            sequences = parallel_function(sanitise_sequence, ([rec] for rec in sequences))
            sequences = parallel_function(check_content, ([rec] for rec in sequences))

    for record in sequences:
        if record.skip or not record.seq:
            logging.warning("Record %s has no sequence, skipping.", record.id)

    limit = options.limit_to_record
    if limit:
        logging.debug("Limiting to record id: %s", limit)

    # run the filter, keeping count of how many records matched;
    # with no filter set, every record counts as matching
    matching_filter = 0
    for record in sequences:
        if limit and limit != record.id:
            record.skip = "did not match filter: %s" % limit
        else:
            matching_filter += 1

    if matching_filter == 0:
        logging.error("No sequences matched filter: %s", limit)
        raise ValueError("No sequences matched filter: %s" % limit)
    if matching_filter != len(sequences):
        logging.info("Skipped %d sequences not matching filter: %s",
                     len(sequences) - matching_filter, limit)

    # now mark contigs below the minimum length as skipped
    minimum = options.minlength
    logging.debug("Removing sequences smaller than %d bases", minimum)
    for record in sequences:
        if len(record.seq) < minimum:
            record.skip = "smaller than minimum length (%d)" % minimum

    # make sure we don't waste weeks of runtime on huge records, unless requested by the user
    warned = False
    if options.limit > -1:
        meaningful = 0
        for record in sequences:
            if record.skip:
                continue
            meaningful += 1
            if meaningful <= options.limit:
                continue
            # everything past the limit is skipped, but only warn once
            if not warned:
                logging.warning("Only analysing the first %d records (increase via --limit)",
                                options.limit)
                warned = True
            record.skip = "skipping all but first {0} meaningful records (--limit {0}) ".format(
                options.limit)

    # the config returned by the update replaces the local reference from here on
    options = update_config({"triggered_limit": warned})  # TODO is there a better way

    # check GFF suitability
    single_entry = False
    if options.genefinding_gff3:
        single_entry = gff_parser.check_gff_suitability(options, sequences)

    if checking_required:
        # ensure CDS features have all relevant information
        logging.debug("Ensuring CDS features have all required information")
        cds_checker = functools.partial(ensure_cds_info, single_entry, genefinding.run_on_record)
        sequences = parallel_function(cds_checker, ([rec] for rec in sequences))

        # check that no duplicate locus tags / gene IDs are present
        logging.debug("Ensuring CDS features do not have duplicate IDs")
        ensure_no_duplicate_cds_gene_ids(sequences)

    # ensure all records have unique names
    record_ids = {record.id for record in sequences}
    if len(record_ids) < len(sequences):
        # at least one id is repeated, so rebuild the set while renaming repeats
        record_ids = set()
        for record in sequences:
            if record.id in record_ids:
                record.original_id = record.id
                record.id = generate_unique_id(record.id, record_ids)[0]
            record_ids.add(record.id)
        assert len(record_ids) == len(sequences), "%d != %d" % (len(record_ids), len(sequences))

    # ensure all records have valid names
    for record in sequences:
        fix_record_name_id(record, record_ids)
    return sequences
def parse_input_sequence(filename: str, taxon: str = "bacteria", minimum_length: int = -1,
                         start: int = -1, end: int = -1, gff_file: str = "") -> List[Record]:
    """ Reads the records in a file, optionally trims and annotates them,
        then converts them to secmet records

        Arguments:
            filename: the path of the file to read
            taxon: the taxon of the input, e.g. 'bacteria', 'fungi'
            minimum_length: records with length less than this will be ignored
                            if not positive, all records are included
            start: a start location for trimming the sequence, or -1 to use all
            end: an end location for trimming the sequence, or -1 to use all
            gff_file: a GFF file to use for gene/CDS annotations

        Returns:
            A list of secmet.Record instances, one for each record kept
            from the file

        Raises:
            TypeError: if minimum_length is not an int
            ValueError: if trimming is requested with more than one record
            AntismashInputError: if no records remain, a record is protein,
                                 the GFF3 check fails, or secmet rejects a record
    """
    logging.info('Parsing input sequence %r', filename)

    if not isinstance(minimum_length, int):
        raise TypeError("minimum_length must be an int")

    def long_enough_or_wgs(candidate) -> bool:
        """ True for records meeting the minimum length (or when no minimum
            applies) and for records carrying any WGS-style annotation key
        """
        if minimum_length < 1 or len(candidate.seq) >= minimum_length:
            return True
        return any(key in candidate.annotations for key in ("contig", "wgs_scafld", "wgs"))

    records = [rec for rec in _strict_parse(filename)
               if long_enough_or_wgs(rec)]  # type: List[SeqRecord]

    # if no records are left, that's a problem
    if not records:
        raise AntismashInputError(
            "all input records smaller than minimum length (%d)" % minimum_length)

    for record in records:
        is_protein = isinstance(record.seq.alphabet, Bio.Alphabet.ProteinAlphabet)
        if is_protein or not is_nucl_seq(record.seq):
            raise AntismashInputError("protein records are not supported: %s" % record.id)

    # before conversion to secmet records, trim if required
    if start > -1 or end > -1:
        if len(records) > 1:
            raise ValueError("--start and --end options cannot be used with multiple records")
        untrimmed = records[0]
        trim_start = max(start, 0)
        # NOTE(review): when only start is given, end is still -1 and so
        # trim_end is -1; presumably trim_sequence treats that as "use all" - confirm
        trim_end = min(len(untrimmed), end)
        records[0] = trim_sequence(untrimmed, trim_start, trim_end)

    # add GFF features before conversion, if relevant
    if gff_file:
        logging.debug("Loading annotations from GFF file")
        # check GFF suitability first, passing input errors through untouched
        try:
            gff_parser.check_gff_suitability(gff_file, records)
        except AntismashInputError:
            raise
        except Exception as err:
            # avoid swallowing details if possible
            if str(err):
                logging.error(err)
            raise AntismashInputError("could not parse records from GFF3 file") from err

        gff_features = gff_parser.run(gff_file)
        for record in records:
            # records that already have CDS features keep their own annotations
            has_cds = any(feature.type == "CDS" for feature in record.features)
            if not has_cds:
                record.features.extend(gff_features.get(record.id, []))

    # remove any previous or obsolete antiSMASH annotations to minimise incompatibilities
    for record in records:
        strip_record(record)

    logging.debug("Converting records from biopython to secmet")
    try:
        converted = [Record.from_biopython(record, taxon) for record in records]
    except SecmetInvalidInputError as err:
        raise AntismashInputError(str(err)) from err

    # if parsable by secmet, it has a better context on what to strip, so run
    # the secmet stripping to ensure there are no surprises
    for secmet_record in converted:
        secmet_record.strip_antismash_annotations()

    return converted
def parse_input_sequence(filename: str, taxon: str = "bacteria", minimum_length: int = -1,
                         start: int = -1, end: int = -1, gff_file: str = "") -> List[Record]:
    """ Reads the records in a file, optionally trims and annotates them,
        then converts them to secmet records

        Arguments:
            filename: the path of the file to read
            taxon: the taxon of the input, e.g. 'bacteria', 'fungi'
            minimum_length: records with length less than this will be ignored
                            if not positive, all records are included
            start: a start location for trimming the sequence, or -1 to use all
            end: an end location for trimming the sequence, or -1 to use all
            gff_file: a GFF file to use for gene/CDS annotations

        Returns:
            A list of secmet.Record instances, one for each record kept
            from the file

        Raises:
            TypeError: if minimum_length is not an int
            ValueError: if trimming is requested with more than one record
            AntismashInputError: if no records remain, a record is protein,
                                 the GFF3 check fails, or secmet rejects a record
    """
    logging.info('Parsing input sequence %r', filename)

    if not isinstance(minimum_length, int):
        raise TypeError("minimum_length must be an int")

    def long_enough_or_wgs(candidate) -> bool:
        """ True for records meeting the minimum length (or when no minimum
            applies) and for records carrying any WGS-style annotation key
        """
        if minimum_length < 1 or len(candidate.seq) >= minimum_length:
            return True
        return any(key in candidate.annotations for key in ("contig", "wgs_scafld", "wgs"))

    records = [rec for rec in _strict_parse(filename)
               if long_enough_or_wgs(rec)]  # type: List[SeqRecord]

    # if no records are left, that's a problem
    if not records:
        raise AntismashInputError("no valid records found in file %r" % filename)

    for record in records:
        is_protein = isinstance(record.seq.alphabet, Bio.Alphabet.ProteinAlphabet)
        if is_protein or not is_nucl_seq(record.seq):
            raise AntismashInputError("protein records are not supported")

    # before conversion to secmet records, trim if required
    if start > -1 or end > -1:
        if len(records) > 1:
            raise ValueError("--start and --end options cannot be used with multiple records")
        untrimmed = records[0]
        trim_start = max(start, 0)
        # NOTE(review): when only start is given, end is still -1 and so
        # trim_end is -1; presumably trim_sequence treats that as "use all" - confirm
        trim_end = min(len(untrimmed), end)
        records[0] = trim_sequence(untrimmed, trim_start, trim_end)

    # add GFF features before conversion, if relevant
    if gff_file:
        logging.debug("Loading annotations from GFF file")
        # check GFF suitability first, passing input errors through untouched
        # and wrapping anything else so callers only see AntismashInputError
        try:
            single_entry = gff_parser.check_gff_suitability(gff_file, records)
        except AntismashInputError:
            raise
        except Exception as err:
            raise AntismashInputError("could not parse records from GFF3 file") from err

        # then add any features found for any record with no CDS features
        feature_adder = functools.partial(_add_gff_features, single_entry, gff_file)
        records = parallel_function(feature_adder, ([rec] for rec in records))

        # NOTE(review): this second pass may duplicate the work done by
        # _add_gff_features above - confirm whether both are still needed
        for record in records:
            has_cds = any(feature.type == "CDS" for feature in record.features)
            if not has_cds:
                gff_features = gff_parser.run(record.id, single_entry, gff_file)
                record.features.extend(gff_features)

    # remove any previous or obsolete antiSMASH features so conversion can be clean
    for record in records:
        strip_record(record)

    logging.debug("Converting records from biopython to secmet")
    try:
        converted = [Record.from_biopython(record, taxon) for record in records]
    except SecmetInvalidInputError as err:
        raise AntismashInputError(str(err)) from err
    return converted