def _strict_parse(filename: str) -> List[SeqRecord]:
    """ Parses the input record with extra wrappers to catch biopython warnings
        as errors.

        Arguments:
            filename: the name of the file to parse

        Returns:
            a list of SeqRecords parsed
    """
    filter_messages = [
        r".*invalid location.*",
        r".*Expected sequence length.*",
        r".*Couldn't parse feature location.*",
    ]
    try:
        # prepend warning filters to raise exceptions on certain messages
        for message in filter_messages:
            warnings.filterwarnings("error", message=message)
        records = list(seqio.parse(filename))
    except Exception as err:
        message = str(err)
        # strip the "Ignoring" part, since it's not being ignored
        if message.startswith("Ignoring invalid location"):
            message = message[9:]
        logging.error('Parsing %r failed: %s', filename, message)
        raise AntismashInputError(message) from err
    finally:
        # remove the new warning filters (works in at least Python 3.5 and 3.6)
        # since mypy doesn't recognise this attribute, ignore the type
        warnings.filters = warnings.filters[len(filter_messages):]  # type: ignore
    if not records:
        raise AntismashInputError("no valid records found in file %s" % filename)
    return records
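
# Illustrative sketch (not part of the module's API): shows how the warning
# filters above turn matching warnings into exceptions, then restores the
# filter stack. "noisy_parser" is a hypothetical stand-in for seqio.parse.
def _example_warning_filters() -> None:
    def noisy_parser() -> None:
        warnings.warn("Ignoring invalid location in record X")

    messages = [r".*invalid location.*"]
    for message in messages:
        warnings.filterwarnings("error", message=message)
    try:
        noisy_parser()
    except UserWarning as err:
        print("caught as an error:", err)
    finally:
        # drop only the filters prepended above
        warnings.filters = warnings.filters[len(messages):]  # type: ignore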
def parse_input_sequence(filename: str, taxon: str = "bacteria", minimum_length: int = -1,
                         start: int = -1, end: int = -1) -> List[Record]:
    """ Parse input records contained in a file

        Arguments:
            filename: the path of the file to read
            taxon: the taxon of the input, e.g. 'bacteria', 'fungi'
            minimum_length: records with length less than this will be ignored
                            if not positive, all records are included
            start: a start location for trimming the sequence, or -1 to use all
            end: an end location for trimming the sequence, or -1 to use all

        Returns:
            A list of secmet.Record instances, one for each record in the file
    """
    logging.info('Parsing input sequence %r', filename)
    if not isinstance(minimum_length, int):
        raise TypeError("minimum_length must be an int")

    records = []  # type: List[SeqRecord]
    try:
        record_list = list(seqio.parse(filename))
    except Exception as err:
        logging.error('Parsing %r failed: %s', filename, err)
        raise AntismashInputError(str(err)) from err

    for record in record_list:
        if minimum_length < 1 \
                or len(record.seq) >= minimum_length \
                or 'contig' in record.annotations \
                or 'wgs_scafld' in record.annotations \
                or 'wgs' in record.annotations:
            records.append(record)

    # if no records are left, that's a problem
    if not records:
        raise AntismashInputError("no valid records found in file %r" % filename)

    for record in records:
        if isinstance(record.seq.alphabet, Bio.Alphabet.ProteinAlphabet):
            raise AntismashInputError("protein records are not supported")

    # before conversion to secmet records, trim if required
    if start > -1 or end > -1:
        if len(records) > 1:
            raise ValueError("--start and --end options cannot be used with multiple records")
        records[0] = trim_sequence(records[0], max(start, 0), min(len(records[0]), end))

    try:
        return [Record.from_biopython(record, taxon) for record in records]
    except SecmetInvalidInputError as err:
        raise AntismashInputError(str(err)) from err
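
# Illustrative sketch (hypothetical data): the minimum-length filter above
# keeps short records anyway when they carry WGS-style annotations.
def _example_minimum_length_filter() -> None:
    minimum_length = 1000
    fake_records = [
        {"seq": "A" * 1500, "annotations": {}},           # kept: long enough
        {"seq": "A" * 200, "annotations": {"wgs": "X"}},  # kept: WGS annotation
        {"seq": "A" * 200, "annotations": {}},            # dropped: too short
    ]
    kept = [rec for rec in fake_records
            if minimum_length < 1
            or len(rec["seq"]) >= minimum_length
            or {"contig", "wgs_scafld", "wgs"} & rec["annotations"].keys()]
    assert len(kept) == 2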
def filter_records_by_name(sequences: List[Record], target: str) -> None:
    """ Mark records as skipped if their id does not match the given target.

        If the target is an empty string, all records will match.
        If no records match, an error will be raised.

        Arguments:
            sequences: the Records to filter
            target: the name to match, must be exact

        Returns:
            None
    """
    if not target:
        return

    logging.debug("Limiting to record id: %s", target)

    # run the filter
    matching_filter = 0
    for sequence in sequences:
        if sequence.id != target:
            sequence.skip = "did not match filter: %s" % target
        else:
            matching_filter += 1

    if matching_filter == 0:
        logging.error("No sequences matched filter: %s", target)
        raise AntismashInputError("no sequences matched filter: %s" % target)

    logging.info("Skipped %d sequences not matching filter: %s",
                 len(sequences) - matching_filter, target)
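
# Illustrative sketch: the skip-marking behaviour above, using a hypothetical
# stand-in with only the attributes the filter actually touches.
def _example_filter_records_by_name() -> None:
    class FakeRecord:
        def __init__(self, record_id: str) -> None:
            self.id = record_id
            self.skip = ""

    records = [FakeRecord("contig_1"), FakeRecord("contig_2")]
    filter_records_by_name(records, "contig_1")  # type: ignore
    assert not records[0].skip
    assert records[1].skip.startswith("did not match filter")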
def check_gff_suitability(gff_file: str, sequences: List[SeqRecord]) -> None:
    """ Checks that the provided GFF3 file is acceptable

        If both the sequences and the GFF contain only a single record, the two
        records are assumed to be the same.

        Arguments:
            gff_file: the path of the GFF file to check
            sequences: a list of SeqRecords

        Returns:
            None
    """
    try:
        examiner = GFF.GFFExaminer()
        # file handle is automatically closed by GFF lib
        gff_data = examiner.available_limits(open(gff_file))
        # check that at least one GFF record ID appears in the sequences
        gff_ids = set(n[0] for n in gff_data['gff_id'])

        if len(gff_ids) == 1 and len(sequences) == 1:
            # both inputs have only one record, so assume they are the same,
            # but first check coordinate compatibility
            logging.info("GFF3 and sequence have only one record. Assuming they are "
                         "the same as long as coordinates are compatible.")
            limit_info = dict(gff_type=['CDS'])
            record_iter = GFF.parse(open(gff_file), limit_info=limit_info)
            try:
                record = next(record_iter)
            except StopIteration:
                raise AntismashInputError("could not parse records from GFF3 file")

            if not record.features:
                raise AntismashInputError('GFF3 record %s contains no features' % record.id)

            coord_max = max(n.location.end.real for n in record.features)
            if coord_max > len(sequences[0]):
                logging.error('GFF3 record and sequence coordinates are not compatible.')
                raise AntismashInputError('incompatible GFF record and sequence coordinates')

        elif not gff_ids.intersection({seq.id for seq in sequences}):
            logging.error('No GFF3 record IDs match any sequence record IDs.')
            raise AntismashInputError("GFF3 record IDs don't match sequence file record IDs.")

        # check that the GFF contains CDS features
        if ('CDS',) not in gff_data['gff_type']:
            logging.error('GFF3 does not contain any CDS.')
            raise AntismashInputError("no CDS features in GFF3 file.")

        # check that CDS features are childless but not parentless
        if 'CDS' in set(n for key in examiner.parent_child_map(open(gff_file)) for n in key):
            logging.error('GFF3 structure is not suitable. CDS features must be childless but not parentless.')
            raise AntismashInputError('GFF3 structure is not suitable.')
    except AssertionError as err:
        logging.error('Parsing %r failed: %s', gff_file, err)
        raise AntismashInputError(str(err)) from err
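
# Illustrative sketch of the bcbio-gff examiner output used above (the file
# name is hypothetical): available_limits() maps limit names to {tuple: count}
# dicts, so the record IDs checked above come from the 'gff_id' keys.
def _example_gff_examiner(gff_file: str = "example.gff3") -> None:
    examiner = GFF.GFFExaminer()
    limits = examiner.available_limits(open(gff_file))
    print(sorted(key[0] for key in limits['gff_id']))  # record IDs in the GFF
    print(('CDS',) in limits['gff_type'])              # whether any CDS features exist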
def get_features_from_file(record: Record, handle: IO,
                           limit_to_seq_id: Union[bool, Dict[str, List[str]]] = False
                           ) -> List[SeqFeature]:
    """ Generates new SeqFeatures from a GFF file.

        Arguments:
            record: the Record that features belong to
            handle: a file handle/stream with the GFF contents
            limit_to_seq_id: False or a dictionary of GFF.parse options

        Returns:
            a list of SeqFeatures parsed from the GFF file
    """
    features = []
    try:
        gff_records = list(GFF.parse(handle, limit_info=limit_to_seq_id))
    except Exception as err:
        raise AntismashInputError("could not parse records from GFF3 file") from err

    for gff_record in gff_records:
        for feature in gff_record.features:
            if feature.type == 'CDS':
                new_features = [feature]
            else:
                new_features = check_sub(feature, record)
                if not new_features:
                    continue

            name = feature.id
            locus_tag = feature.qualifiers.get("locus_tag")

            for qtype in ["gene", "name", "Name"]:
                if qtype in feature.qualifiers:
                    name_tmp = feature.qualifiers[qtype][0]
                    # Assume name/Name to be sane if they don't contain a space
                    if " " in name_tmp:
                        continue
                    name = name_tmp
                    break

            for i, new_feature in enumerate(new_features):
                variant = name
                if len(new_features) > 1:
                    variant = "{0}_{1}".format(name, i)
                new_feature.qualifiers['gene'] = [variant]
                if locus_tag is not None:
                    new_feature.qualifiers["locus_tag"] = locus_tag
                features.append(new_feature)
    return features
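
# Illustrative sketch (hypothetical values) of the qualifier-priority lookup
# above: 'gene' beats 'name'/'Name', names containing spaces are skipped, and
# feature.id is the fallback.
def _example_feature_name_choice() -> None:
    qualifiers = {"Name": ["putative protein 1"], "gene": ["abc1"]}
    name = "feature_0001"  # stands in for feature.id
    for qtype in ["gene", "name", "Name"]:
        if qtype in qualifiers:
            if " " in qualifiers[qtype][0]:
                continue
            name = qualifiers[qtype][0]
            break
    assert name == "abc1"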
def generate_details_from_subfeature(sub_feature: SeqFeature,
                                     existing_qualifiers: Dict[str, List[str]],
                                     locations: List[FeatureLocation],
                                     trans_locations: List[FeatureLocation]) -> Set[str]:
    """ Finds the locations of a subfeature and any mismatching qualifiers

        Arguments:
            sub_feature: the GFF subfeature to work on
            existing_qualifiers: a dict of any existing qualifiers from other subfeatures
            locations: a list of any existing FeatureLocations from other subfeatures
            trans_locations: a list of any existing FeatureLocations for translations

        Returns:
            a set of qualifiers from the subfeature for which an existing
            qualifier existed but had a different value
    """
    mismatching_qualifiers = set()
    start = sub_feature.location.start.real
    end = sub_feature.location.end.real
    if MODIFY_LOCATIONS_BY_PHASE:
        phase = int(sub_feature.qualifiers.get('phase', [0])[0])
        if sub_feature.strand == 1:
            start += phase
        else:
            end -= phase
    try:
        locations.append(FeatureLocation(start, end, strand=sub_feature.strand))
    except ValueError as err:
        raise AntismashInputError(str(err)) from err

    # make sure CDS lengths are a multiple of three, otherwise extend to the
    # next full codon; this only applies to the translation
    modulus = (end - start) % 3
    if modulus and sub_feature.strand == 1:
        end += 3 - modulus
    elif modulus and sub_feature.strand == -1:
        start -= 3 - modulus
    trans_locations.append(FeatureLocation(start, end, strand=sub_feature.strand))

    # for split features (CDSs), the final feature will have the same qualifiers
    # as the children ONLY if they are identical in every child,
    # e.g. all children have the same "protein_ID" (key and value)
    for qual in sub_feature.qualifiers:
        if qual not in existing_qualifiers:
            existing_qualifiers[qual] = sub_feature.qualifiers[qual]
        elif existing_qualifiers[qual] != sub_feature.qualifiers[qual]:
            mismatching_qualifiers.add(qual)
    return mismatching_qualifiers
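
# Worked example (hypothetical numbers) of the coordinate adjustments above:
# a forward-strand CDS part at [100, 205) with phase 2.
def _example_phase_and_codon_adjustment() -> None:
    start, end, strand, phase = 100, 205, 1, 2
    if strand == 1:
        start += phase             # phase trims the start -> 102
    modulus = (end - start) % 3    # 103 % 3 == 1
    if modulus and strand == 1:
        end += 3 - modulus         # extend to a full codon -> 207
    assert (end - start) % 3 == 0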
def get_features_from_file(handle: IO) -> Dict[str, List[SeqFeature]]:
    """ Generates new SeqFeatures from a GFF file.

        Arguments:
            handle: a file handle/stream with the GFF contents

        Returns:
            a dictionary mapping record ID to a list of SeqFeatures for that record
    """
    try:
        gff_records = list(GFF.parse(handle))
    except Exception as err:
        raise AntismashInputError("could not parse records from GFF3 file") from err

    results = {}
    for gff_record in gff_records:
        features = []
        for feature in gff_record.features:
            if feature.type == 'CDS':
                new_features = [feature]
            else:
                new_features = check_sub(feature)
                if not new_features:
                    continue

            name = feature.id
            locus_tag = feature.qualifiers.get("locus_tag")

            for qtype in ["gene", "name", "Name"]:
                if qtype in feature.qualifiers:
                    name_tmp = feature.qualifiers[qtype][0]
                    # Assume name/Name to be sane if they don't contain a space
                    if " " in name_tmp:
                        continue
                    name = name_tmp
                    break

            for i, new_feature in enumerate(new_features):
                variant = name
                if len(new_features) > 1:
                    variant = "{0}_{1}".format(name, i)
                new_feature.qualifiers['gene'] = [variant]
                if locus_tag is not None:
                    new_feature.qualifiers["locus_tag"] = locus_tag
                features.append(new_feature)
        results[gff_record.id] = features
    return results
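
# Illustrative usage (hypothetical file name): the result maps each GFF
# record ID to the SeqFeatures generated for it.
def _example_features_by_record(gff_file: str = "example.gff3") -> None:
    with open(gff_file) as handle:
        features_by_record = get_features_from_file(handle)
    for record_id, features in features_by_record.items():
        print(record_id, len(features))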
def run_on_record(record: Record, options: ConfigType) -> None:
    """ Find genes in a Record using glimmerhmm or prodigal.
        Genes will be added to the record as they are found.
    """
    if options.genefinding_tool == 'error':
        raise AntismashInputError(
            f"Record {record.id} contains no genes and no genefinding tool specified")
    if options.taxon == 'fungi':
        if options.genefinding_tool == "none":
            return None
        assert options.genefinding_tool == "glimmerhmm"
        logging.debug("Running glimmerhmm genefinding")
        return run_glimmerhmm(record)
    if options.genefinding_tool in ["prodigal", "prodigal-m"]:
        logging.debug("Running prodigal based genefinding")
        return run_prodigal(record, options)
    raise ValueError("Unknown genefinding tool: %s" % options.genefinding_tool)
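
# Illustrative sketch with a hypothetical stand-in for ConfigType (the real
# config carries many more fields): the dispatch above is driven entirely by
# the taxon and genefinding_tool options.
def _example_genefinding_dispatch() -> None:
    from types import SimpleNamespace
    fungal = SimpleNamespace(genefinding_tool="glimmerhmm", taxon="fungi")
    bacterial = SimpleNamespace(genefinding_tool="prodigal", taxon="bacteria")
    # run_on_record(record, fungal) would dispatch to run_glimmerhmm(record)
    # run_on_record(record, bacterial) would dispatch to run_prodigal(record, options)
    print(fungal.genefinding_tool, bacterial.genefinding_tool)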
def pre_process_sequences(sequences: List[Record], options: ConfigType,
                          genefinding: AntismashModule) -> List[Record]:
    """ Custom preprocessing of records before analysis, ensuring
        - gaps removed
        - record ids adjusted to be unique
        - record ids are valid

        Note: Record instances will be altered in-place.

        Arguments:
            sequences: the secmet.Record instances to process
            options: an antismash Config instance
            genefinding: the module to use for genefinding,
                         must have run_on_record() implemented

        Returns:
            A list of altered secmet.Record
    """
    logging.debug("Preprocessing %d sequences", len(sequences))

    # catch WGS master or supercontig entries
    if records_contain_shotgun_scaffolds(sequences):
        raise AntismashInputError("incomplete whole genome shotgun records are not supported")

    for i, seq in enumerate(sequences):
        seq.record_index = i + 1  # 1-indexed

    checking_required = not (options.reuse_results or options.skip_sanitisation)

    # keep sequences as clean as possible and make sure they're valid
    if checking_required:
        logging.debug("Sanitising record sequences")
        if len(sequences) == 1:
            sequences = [sanitise_sequence(sequences[0])]
            sequences = [check_content(sequences[0])]
        else:
            sequences = parallel_function(sanitise_sequence, ([record] for record in sequences))
            sequences = parallel_function(check_content, ([sequence] for sequence in sequences))

    for record in sequences:
        if record.skip or not record.seq:
            logging.warning("Record %s has no sequence, skipping.", record.id)
        if not record.id:
            raise AntismashInputError("record has no name")

    # skip anything not matching the filter
    filter_records_by_name(sequences, options.limit_to_record)

    # now remove any contigs smaller than the minimum length
    logging.debug("Removing sequences smaller than %d bases", options.minlength)
    for sequence in sequences:
        if len(sequence.seq) < options.minlength:
            sequence.skip = "smaller than minimum length (%d)" % options.minlength

    # make sure we don't waste weeks of runtime on huge records, unless requested by the user
    limit_hit = filter_records_by_count(sequences, options.limit)
    if limit_hit:
        logging.warning("Only analysing the first %d records (increase via --limit)", options.limit)
    update_config({"triggered_limit": limit_hit})

    # check GFF suitability
    single_entry = False
    if options.genefinding_gff3:
        try:
            single_entry = gff_parser.check_gff_suitability(options, sequences)
        except AntismashInputError:
            raise
        except Exception as err:
            raise AntismashInputError("could not parse records from GFF3 file") from err

    if checking_required:
        # ensure CDS features have all relevant information
        logging.debug("Ensuring CDS features have all required information")
        assert hasattr(genefinding, "run_on_record")
        partial = functools.partial(ensure_cds_info, single_entry, genefinding.run_on_record)
        sequences = parallel_function(partial, ([sequence] for sequence in sequences))

        # check that no duplicate locus tags / gene IDs are present
        logging.debug("Ensuring CDS features do not have duplicate IDs")
        ensure_no_duplicate_cds_gene_ids(sequences)

    all_record_ids = {seq.id for seq in sequences}
    # ensure all records have unique names
    if len(all_record_ids) < len(sequences):
        all_record_ids = set()
        for record in sequences:
            if record.id in all_record_ids:
                record.original_id = record.id
                record.id = generate_unique_id(record.id, all_record_ids)[0]
            all_record_ids.add(record.id)
        assert len(all_record_ids) == len(sequences), "%d != %d" % (len(all_record_ids), len(sequences))
    # ensure all records have valid names
    for record in sequences:
        fix_record_name_id(record, all_record_ids)

    return sequences
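
# Illustrative sketch of the deduplication loop above, with a hypothetical
# stand-in for generate_unique_id that appends an increasing suffix.
def _example_unique_record_ids() -> None:
    def unique_id(base: str, existing: Set[str]) -> str:
        candidate, counter = base, 0
        while candidate in existing:
            candidate = "%s_%d" % (base, counter)
            counter += 1
        return candidate

    seen = set()  # type: Set[str]
    for name in ["ctg1", "ctg1", "ctg2"]:
        seen.add(unique_id(name, seen))
    assert sorted(seen) == ["ctg1", "ctg1_0", "ctg2"]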
def parse_input_sequence(filename: str, taxon: str = "bacteria", minimum_length: int = -1,
                         start: int = -1, end: int = -1, gff_file: str = "") -> List[Record]:
    """ Parse input records contained in a file

        Arguments:
            filename: the path of the file to read
            taxon: the taxon of the input, e.g. 'bacteria', 'fungi'
            minimum_length: records with length less than this will be ignored
                            if not positive, all records are included
            start: a start location for trimming the sequence, or -1 to use all
            end: an end location for trimming the sequence, or -1 to use all
            gff_file: a GFF file to use for gene/CDS annotations

        Returns:
            A list of secmet.Record instances, one for each record in the file
    """
    logging.info('Parsing input sequence %r', filename)
    if not isinstance(minimum_length, int):
        raise TypeError("minimum_length must be an int")

    records = []  # type: List[SeqRecord]
    for record in _strict_parse(filename):
        if minimum_length < 1 \
                or len(record.seq) >= minimum_length \
                or 'contig' in record.annotations \
                or 'wgs_scafld' in record.annotations \
                or 'wgs' in record.annotations:
            records.append(record)

    # if no records are left, that's a problem
    if not records:
        raise AntismashInputError("all input records smaller than minimum length (%d)" % minimum_length)

    for record in records:
        if isinstance(record.seq.alphabet, Bio.Alphabet.ProteinAlphabet) or not is_nucl_seq(record.seq):
            raise AntismashInputError("protein records are not supported: %s" % record.id)

    # before conversion to secmet records, trim if required
    if start > -1 or end > -1:
        if len(records) > 1:
            raise ValueError("--start and --end options cannot be used with multiple records")
        records[0] = trim_sequence(records[0], max(start, 0), min(len(records[0]), end))

    # add GFF features before conversion, if relevant
    if gff_file:
        logging.debug("Loading annotations from GFF file")
        # check GFF suitability first
        try:
            gff_parser.check_gff_suitability(gff_file, records)
        except AntismashInputError:
            raise
        except Exception as err:
            # avoid swallowing details if possible
            if str(err):
                logging.error(err)
            raise AntismashInputError("could not parse records from GFF3 file") from err
        gff_features = gff_parser.run(gff_file)
        for record in records:
            if any(feature.type == "CDS" for feature in record.features):
                continue
            record.features.extend(gff_features.get(record.id, []))

    # remove any previous or obsolete antiSMASH annotations to minimise incompatibilities
    for record in records:
        strip_record(record)

    logging.debug("Converting records from biopython to secmet")
    try:
        records = [Record.from_biopython(record, taxon) for record in records]
    except SecmetInvalidInputError as err:
        raise AntismashInputError(str(err)) from err

    # if parsable by secmet, it has a better context on what to strip, so run
    # the secmet stripping to ensure there are no surprises
    for record in records:
        record.strip_antismash_annotations()

    return records
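
# A hedged sketch of the kind of check is_nucl_seq might perform (the real
# helper is defined elsewhere): treat a sequence as nucleotide if it is
# dominated by unambiguous DNA letters. The threshold here is hypothetical.
def _example_nucleotide_check(threshold: float = 0.8) -> None:
    def looks_like_nucleotides(seq: str) -> bool:
        if not seq:
            return False
        matches = sum(seq.upper().count(base) for base in "ACGTN")
        return matches / len(seq) >= threshold

    assert looks_like_nucleotides("ACGTACGTAAGG")
    assert not looks_like_nucleotides("MKLVVITGAS")  # protein-like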
def parse_input_sequence(filename: str, taxon: str = "bacteria", minimum_length: int = -1,
                         start: int = -1, end: int = -1, gff_file: str = "") -> List[Record]:
    """ Parse input records contained in a file

        Arguments:
            filename: the path of the file to read
            taxon: the taxon of the input, e.g. 'bacteria', 'fungi'
            minimum_length: records with length less than this will be ignored
                            if not positive, all records are included
            start: a start location for trimming the sequence, or -1 to use all
            end: an end location for trimming the sequence, or -1 to use all
            gff_file: a GFF file to use for gene/CDS annotations

        Returns:
            A list of secmet.Record instances, one for each record in the file
    """
    logging.info('Parsing input sequence %r', filename)
    if not isinstance(minimum_length, int):
        raise TypeError("minimum_length must be an int")

    records = []  # type: List[SeqRecord]
    for record in _strict_parse(filename):
        if minimum_length < 1 \
                or len(record.seq) >= minimum_length \
                or 'contig' in record.annotations \
                or 'wgs_scafld' in record.annotations \
                or 'wgs' in record.annotations:
            records.append(record)

    # if no records are left, that's a problem
    if not records:
        raise AntismashInputError("no valid records found in file %r" % filename)

    for record in records:
        if isinstance(record.seq.alphabet, Bio.Alphabet.ProteinAlphabet) or not is_nucl_seq(record.seq):
            raise AntismashInputError("protein records are not supported")

    # before conversion to secmet records, trim if required
    if start > -1 or end > -1:
        if len(records) > 1:
            raise ValueError("--start and --end options cannot be used with multiple records")
        records[0] = trim_sequence(records[0], max(start, 0), min(len(records[0]), end))

    # add GFF features before conversion, if relevant
    if gff_file:
        logging.debug("Loading annotations from GFF file")
        # check GFF suitability first
        single_entry = False
        try:
            single_entry = gff_parser.check_gff_suitability(gff_file, records)
        except AntismashInputError:
            raise
        except Exception as err:
            raise AntismashInputError("could not parse records from GFF3 file") from err
        # then add any features found for any record with no CDS features
        partial = functools.partial(_add_gff_features, single_entry, gff_file)
        records = parallel_function(partial, ([record] for record in records))

    # remove any previous or obsolete antiSMASH features so conversion can be clean
    for record in records:
        strip_record(record)

    logging.debug("Converting records from biopython to secmet")
    try:
        return [Record.from_biopython(record, taxon) for record in records]
    except SecmetInvalidInputError as err:
        raise AntismashInputError(str(err)) from err
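
# Illustrative sketch of the partial-application pattern above: the shared
# arguments are baked in so each record can be processed independently (plain
# map stands in for parallel_function; the worker and file name are hypothetical).
def _example_partial_per_record() -> None:
    def add_features(single_entry: bool, gff_file: str, record: dict) -> dict:
        record["source"] = (single_entry, gff_file)  # stand-in for feature addition
        return record

    partial = functools.partial(add_features, False, "example.gff3")
    processed = list(map(partial, [{"id": "r1"}, {"id": "r2"}]))
    assert processed[0]["source"] == (False, "example.gff3")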