Example #1
def _strict_parse(filename: str) -> List[SeqRecord]:
    """ Parses the input record with extra wrappers to catch biopython warnings
        as errors.

        Arguments:
            filename: the name of the file to parse

        Returns:
            a list of SeqRecords parsed
    """
    filter_messages = [
        r".*invalid location.*",
        r".*Expected sequence length.*",
        r".*Couldn't parse feature location.*",
    ]
    try:
        # prepend warning filters to raise exceptions on certain messages
        for message in filter_messages:
            warnings.filterwarnings("error", message=message)
        records = list(seqio.parse(filename))
    except Exception as err:
        message = str(err)
        # strip the "Ignoring" part, since it's not being ignored
        if message.startswith("Ignoring invalid location"):
            message = message[9:]
        logging.error('Parsing %r failed: %s', filename, message)
        raise AntismashInputError(message) from err
    finally:
        # remove the new warning filters (works in at least Python 3.5 and 3.6)
        # since mypy doesn't recognise this attribute, ignore the type
        warnings.filters = warnings.filters[len(filter_messages):]   # type: ignore

    if not records:
        raise AntismashInputError("no valid records found in file %s" % filename)
    return records
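
The warnings-as-errors technique above can be reproduced in isolation with the standard library; a minimal sketch, using warnings.catch_warnings() so the temporary filters are undone automatically rather than by slicing warnings.filters:

import warnings

with warnings.catch_warnings():
    # any warning matching the pattern is raised as an exception instead
    warnings.filterwarnings("error", message=r".*invalid location.*")
    try:
        warnings.warn("Ignoring invalid location for feature X")
    except Warning as err:
        print("caught:", err)
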
Example #2
def parse_input_sequence(filename: str,
                         taxon: str = "bacteria",
                         minimum_length: int = -1,
                         start: int = -1,
                         end: int = -1) -> List[Record]:
    """ Parse input records contained in a file

        Arguments:
            filename: the path of the file to read
            taxon: the taxon of the input, e.g. 'bacteria', 'fungi'
            minimum_length: records with length less than this will be ignored;
                            if not positive, all records are included
            start: a start location for trimming the sequence, or -1 to use all
            end: an end location for trimming the sequence, or -1 to use all

        Returns:
            A list of secmet.Record instances, one for each record in the file
    """
    logging.info('Parsing input sequence %r', filename)
    if not isinstance(minimum_length, int):
        raise TypeError("minimum_length must be an int")

    records = []  # type: List[SeqRecord]
    try:
        record_list = list(seqio.parse(filename))
    except Exception as err:
        logging.error('Parsing %r failed: %s', filename, err)
        raise AntismashInputError(str(err)) from err

    for record in record_list:
        if minimum_length < 1 \
                or len(record.seq) >= minimum_length \
                or 'contig' in record.annotations \
                or 'wgs_scafld' in record.annotations \
                or 'wgs' in record.annotations:
            records.append(record)

    # if no records are left, that's a problem
    if not records:
        raise AntismashInputError("no valid records found in file %r" %
                                  filename)

    for record in records:
        if isinstance(record.seq.alphabet, Bio.Alphabet.ProteinAlphabet):
            raise AntismashInputError("protein records are not supported")

    # before conversion to secmet records, trim if required
    if start > -1 or end > -1:
        if len(records) > 1:
            raise ValueError(
                "--start and --end options cannot be used with multiple records"
            )
        records[0] = trim_sequence(records[0], max(start, 0),
                                   min(len(records[0]), end))

    try:
        return [Record.from_biopython(record, taxon) for record in records]
    except SecmetInvalidInputError as err:
        raise AntismashInputError(str(err)) from err
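
A possible call site for this version, with a hypothetical input path; the single record is trimmed to its first 50kb before conversion to secmet records:

# "input/genome.gbk" is a hypothetical example path
records = parse_input_sequence("input/genome.gbk", taxon="bacteria",
                               minimum_length=1000, start=0, end=50000)
for record in records:
    print(record.id, len(record.seq))
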
Example #3
def filter_records_by_name(sequences: List[Record], target: str) -> None:
    """ Mark records as skipped if their id does not match the given target or
        they are above .

        If the target is an empty string, all records will match.
        If no records match, an error will be raised.

        Arguments:
            sequences: the Records to filter
            target: the name to match, must be exact

        Returns:
            None
    """
    if not target:
        return

    logging.debug("Limiting to record id: %s", target)

    # run the filter
    matching_filter = 0
    for sequence in sequences:
        if sequence.id != target:
            sequence.skip = "did not match filter: %s" % target
        else:
            matching_filter += 1

    if matching_filter == 0:
        logging.error("No sequences matched filter: %s", target)
        raise AntismashInputError("no sequences matched filter: %s" % target)

    logging.info("Skipped %d sequences not matching filter: %s",
                 len(sequences) - matching_filter, target)
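
A minimal sketch of the skip-marking behaviour, using a hypothetical stub in place of secmet.Record with only the attributes the function touches:

class StubRecord:
    """ Stand-in for secmet.Record: just an id and a skip marker. """
    def __init__(self, record_id: str) -> None:
        self.id = record_id
        self.skip = ""

records = [StubRecord("contig_1"), StubRecord("contig_2")]
filter_records_by_name(records, "contig_1")
assert not records[0].skip
assert records[1].skip == "did not match filter: contig_1"
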
Example #4
def check_gff_suitability(gff_file: str, sequences: List[SeqRecord]) -> None:
    """
        Checks that the provided GFF3 file is acceptable

        If only a single record is contained in both sequences and GFF, they
        are assumed to be the same.

        Arguments:
            gff_file: the path of the GFF file to check
            sequences: a list of SeqRecords

        Returns:
            None
    """
    try:
        examiner = GFF.GFFExaminer()
        # file handle is automatically closed by GFF lib
        gff_data = examiner.available_limits(open(gff_file))
        # Check if at least one GFF locus appears in sequence
        gff_ids = {n[0] for n in gff_data['gff_id']}

        if len(gff_ids) == 1 and len(sequences) == 1:
            # If both inputs only have one record, assume they are the same,
            # but first check coordinate compatibility
            logging.info("GFF3 and sequence have only one record. Assuming they are "
                         "the same as long as coordinates are compatible.")
            limit_info = dict(gff_type=['CDS'])

            record_iter = GFF.parse(open(gff_file), limit_info=limit_info)
            try:
                record = next(record_iter)
            except StopIteration:
                raise AntismashInputError("could not parse records from GFF3 file")

            if not record.features:
                raise AntismashInputError('GFF3 record %s contains no features' % record.id)

            coord_max = max([n.location.end.real for n in record.features])
            if coord_max > len(sequences[0]):
                logging.error('GFF3 record and sequence coordinates are not compatible.')
                raise AntismashInputError('incompatible GFF record and sequence coordinates')

        elif not gff_ids.intersection({seq.id for seq in sequences}):
            logging.error('No GFF3 record IDs match any sequence record IDs.')
            raise AntismashInputError("GFF3 record IDs don't match sequence file record IDs.")

        # Check GFF contains CDSs
        if ('CDS',) not in gff_data['gff_type']:
            logging.error('GFF3 does not contain any CDS.')
            raise AntismashInputError("no CDS features in GFF3 file.")

        # Check CDS are childless but not parentless
        if 'CDS' in {n for key in examiner.parent_child_map(open(gff_file)) for n in key}:
            logging.error('GFF3 structure is not suitable. CDS features must be childless but not parentless.')
            raise AntismashInputError('GFF3 structure is not suitable.')

    except AssertionError as err:
        logging.error('Parsing %r failed: %s', gff_file, err)
        raise AntismashInputError(str(err)) from err
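
The same pre-checks can be run by hand to see what the examiner reports; a small sketch using bcbio-gff's GFFExaminer on a hypothetical GFF3 path:

from BCBio import GFF

with open("annotations.gff3") as handle:  # hypothetical path
    limits = GFF.GFFExaminer().available_limits(handle)

print(sorted(key[0] for key in limits['gff_id']))  # record ids present in the GFF
print(('CDS',) in limits['gff_type'])              # True if any CDS features exist
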
Example #5
def get_features_from_file(
    record: Record,
    handle: IO,
    limit_to_seq_id: Union[bool, Dict[str, List[str]]] = False
) -> List[SeqFeature]:
    """ Generates new SeqFeatures from a GFF file.

        Arguments:
            record: the Record that features belong to
            handle: a file handle/stream with the GFF contents
            limit_to_seq_id: False or a dictionary of GFF.parse options

        Returns:
            a list of SeqFeatures parsed from the GFF file
    """
    features = []
    try:
        gff_records = list(GFF.parse(handle, limit_info=limit_to_seq_id))
    except Exception as err:
        raise AntismashInputError(
            "could not parse records from GFF3 file") from err

    for gff_record in gff_records:
        for feature in gff_record.features:
            if feature.type == 'CDS':
                new_features = [feature]
            else:
                new_features = check_sub(feature, record)
                if not new_features:
                    continue

            name = feature.id
            locus_tag = feature.qualifiers.get("locus_tag")

            for qtype in ["gene", "name", "Name"]:
                if qtype in feature.qualifiers:
                    name_tmp = feature.qualifiers[qtype][0]
                    # Assume name/Name to be sane if they don't contain a space
                    if " " in name_tmp:
                        continue
                    name = name_tmp
                    break

            for i, new_feature in enumerate(new_features):
                variant = name
                if len(new_features) > 1:
                    variant = "{0}_{1}".format(name, i)
                new_feature.qualifiers['gene'] = [variant]
                if locus_tag is not None:
                    new_feature.qualifiers["locus_tag"] = locus_tag
                features.append(new_feature)
    return features
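
The 'gene'/'name'/'Name' precedence used for naming features can be checked in isolation; a toy sketch with a plain biopython feature (the qualifier values are made up):

from Bio.SeqFeature import SeqFeature, FeatureLocation

feature = SeqFeature(FeatureLocation(0, 300, strand=1), type="gene", id="gene0001")
feature.qualifiers = {"name": ["contains a space"], "Name": ["abcD"]}

name = feature.id
for qtype in ["gene", "name", "Name"]:
    if qtype in feature.qualifiers:
        candidate = feature.qualifiers[qtype][0]
        if " " in candidate:  # names with spaces are assumed unusable
            continue
        name = candidate
        break

assert name == "abcD"  # "name" contains a space, so "Name" is used instead
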
Example #6
def generate_details_from_subfeature(
        sub_feature: SeqFeature, existing_qualifiers: Dict[str, List[str]],
        locations: List[FeatureLocation],
        trans_locations: List[FeatureLocation]) -> Set[str]:
    """ Finds the locations of a subfeature and any mismatching qualifiers

        Arguments:
            sub_feature: the GFF subfeature to work on
            existing_qualifiers: a dict of any existing qualifiers from other
                                 subfeatures
            locations: a list of any existing FeatureLocations from other
                       subfeatures
            trans_locations: a list of any existing FeatureLocations for
                             translations

        Returns:
            a set of qualifiers from the subfeature for which an existing
            qualifier existed but had a different value
    """
    mismatching_qualifiers = set()
    start = sub_feature.location.start.real
    end = sub_feature.location.end.real
    if MODIFY_LOCATIONS_BY_PHASE:
        phase = int(sub_feature.qualifiers.get('phase', [0])[0])
        if sub_feature.strand == 1:
            start += phase
        else:
            end -= phase
    try:
        locations.append(FeatureLocation(start, end,
                                         strand=sub_feature.strand))
    except ValueError as err:
        raise AntismashInputError(str(err)) from err
    # Make sure CDS lengths are a multiple of three; otherwise extend to the
    # next full codon. This only applies to the translation.
    modulus = (end - start) % 3
    if modulus and sub_feature.strand == 1:
        end += 3 - modulus
    elif modulus and sub_feature.strand == -1:
        start -= 3 - modulus
    trans_locations.append(
        FeatureLocation(start, end, strand=sub_feature.strand))
    # For split features (CDSs), the final feature will have the same qualifiers as the children ONLY if
    # they're the same, i.e.: all children have the same "protein_ID" (key and value).
    for qual in sub_feature.qualifiers:
        if qual not in existing_qualifiers:
            existing_qualifiers[qual] = sub_feature.qualifiers[qual]
        elif existing_qualifiers[qual] != sub_feature.qualifiers[qual]:
            mismatching_qualifiers.add(qual)
    return mismatching_qualifiers
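
The codon-padding arithmetic is easiest to follow with concrete numbers; a quick sketch for a forward-strand sub-feature whose length is not a multiple of three:

start, end, strand = 10, 102, 1  # length 92, not a multiple of three

modulus = (end - start) % 3      # 92 % 3 == 2
if modulus and strand == 1:
    end += 3 - modulus           # forward strand: extend to the next full codon
elif modulus and strand == -1:
    start -= 3 - modulus         # reverse strand: extend backwards instead

assert (start, end) == (10, 103) and (end - start) % 3 == 0
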
Example #7
def get_features_from_file(handle: IO) -> Dict[str, List[SeqFeature]]:
    """ Generates new SeqFeatures from a GFF file.

        Arguments:
            handle: a file handle/stream with the GFF contents

        Returns:
            a dictionary mapping record ID to a list of SeqFeatures for that record
    """
    try:
        gff_records = list(GFF.parse(handle))
    except Exception as err:
        raise AntismashInputError(
            "could not parse records from GFF3 file") from err

    results = {}
    for gff_record in gff_records:
        features = []
        for feature in gff_record.features:
            if feature.type == 'CDS':
                new_features = [feature]
            else:
                new_features = check_sub(feature)
                if not new_features:
                    continue

            name = feature.id
            locus_tag = feature.qualifiers.get("locus_tag")

            for qtype in ["gene", "name", "Name"]:
                if qtype in feature.qualifiers:
                    name_tmp = feature.qualifiers[qtype][0]
                    # Assume name/Name to be sane if they don't contain a space
                    if " " in name_tmp:
                        continue
                    name = name_tmp
                    break

            for i, new_feature in enumerate(new_features):
                variant = name
                if len(new_features) > 1:
                    variant = "{0}_{1}".format(name, i)
                new_feature.qualifiers['gene'] = [variant]
                if locus_tag is not None:
                    new_feature.qualifiers["locus_tag"] = locus_tag
                features.append(new_feature)
        results[gff_record.id] = features
    return results
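
A possible call for this variant, again assuming a hypothetical path; the result maps each record id in the GFF to its renamed features:

with open("annotations.gff3") as handle:  # hypothetical path
    features_by_record = get_features_from_file(handle)

for record_id, features in features_by_record.items():
    print(record_id, len(features))
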
Example #8
def run_on_record(record: Record, options: ConfigType) -> None:
    """ Find genes in a Record using glimmerhmm or prodigal.
        Genes will be added to the record as they are found.
    """
    if options.genefinding_tool == 'error':
        raise AntismashInputError(
            f"Record {record.id} contains no genes and no genefinding tool specified"
        )

    if options.taxon == 'fungi':
        if options.genefinding_tool == ["none"]:
            return None
        assert options.genefinding_tool == "glimmerhmm"
        logging.debug("Running glimmerhmm genefinding")
        return run_glimmerhmm(record)

    if options.genefinding_tool in ["prodigal", "prodigal-m"]:
        logging.debug("Running prodigal based genefinding")
        return run_prodigal(record, options)

    raise ValueError("Unknown genefinding tool: %s" % options.genefinding_tool)
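
The dispatch can be exercised with a throwaway config; a minimal sketch using types.SimpleNamespace for the options and a stub record (run_glimmerhmm and run_prodigal are never reached on this path):

from types import SimpleNamespace

record = SimpleNamespace(id="contig_1")  # stub with only the .id attribute used here
options = SimpleNamespace(genefinding_tool="error", taxon="bacteria")

try:
    run_on_record(record, options)
except AntismashInputError as err:
    print(err)  # Record contig_1 contains no genes and no genefinding tool specified
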
Example #9
def pre_process_sequences(sequences: List[Record], options: ConfigType, genefinding: AntismashModule) -> List[Record]:
    """ hmm

        - gaps removed
        - record ids adjusted to be unique
        - record ids are valid

        Note: Record instances will be altered in-place.

        Arguments:
            sequences: the secmet.Record instances to process
            options: an antismash Config instance
            genefinding: the module to use for genefinding, must have
                         run_on_record() implemented

        Returns:
            A list of altered secmet.Record
    """
    logging.debug("Preprocessing %d sequences", len(sequences))

    # catch WGS master or supercontig entries
    if records_contain_shotgun_scaffolds(sequences):
        raise AntismashInputError("incomplete whole genome shotgun records are not supported")

    for i, seq in enumerate(sequences):
        seq.record_index = i + 1  # 1-indexed

    checking_required = not (options.reuse_results or options.skip_sanitisation)

    # keep sequences as clean as possible and make sure they're valid
    if checking_required:
        logging.debug("Sanitising record sequences")
        if len(sequences) == 1:
            sequences = [sanitise_sequence(sequences[0])]
            sequences = [check_content(sequences[0])]
        else:
            sequences = parallel_function(sanitise_sequence, ([record] for record in sequences))
            sequences = parallel_function(check_content, ([sequence] for sequence in sequences))

    for record in sequences:
        if record.skip or not record.seq:
            logging.warning("Record %s has no sequence, skipping.", record.id)
        if not record.id:
            raise AntismashInputError("record has no name")

    # skip anything not matching the filter
    filter_records_by_name(sequences, options.limit_to_record)

    # Now remove small contigs < minimum length again
    logging.debug("Removing sequences smaller than %d bases", options.minlength)
    for sequence in sequences:
        if len(sequence.seq) < options.minlength:
            sequence.skip = "smaller than minimum length (%d)" % options.minlength

    # Make sure we don't waste weeks of runtime on huge records, unless requested by the user
    limit_hit = filter_records_by_count(sequences, options.limit)
    if limit_hit:
        logging.warning("Only analysing the first %d records (increase via --limit)", options.limit)
    update_config({"triggered_limit": limit_hit})

    # Check GFF suitability
    single_entry = False
    if options.genefinding_gff3:
        try:
            single_entry = gff_parser.check_gff_suitability(options, sequences)
        except AntismashInputError:
            raise
        except Exception as err:
            raise AntismashInputError("could not parse records from GFF3 file") from err

    if checking_required:
        # ensure CDS features have all relevant information
        logging.debug("Ensuring CDS features have all required information")
        assert hasattr(genefinding, "run_on_record")
        partial = functools.partial(ensure_cds_info, single_entry, genefinding.run_on_record)
        sequences = parallel_function(partial, ([sequence] for sequence in sequences))

        # Check if no duplicate locus tags / gene IDs are found
        logging.debug("Ensuring CDS features do not have duplicate IDs")
        ensure_no_duplicate_cds_gene_ids(sequences)

        all_record_ids = {seq.id for seq in sequences}
        # Ensure all records have unique names
        if len(all_record_ids) < len(sequences):
            all_record_ids = set()
            for record in sequences:
                if record.id in all_record_ids:
                    record.original_id = record.id
                    record.id = generate_unique_id(record.id, all_record_ids)[0]
                all_record_ids.add(record.id)
            assert len(all_record_ids) == len(sequences), "%d != %d" % (len(all_record_ids), len(sequences))
        # Ensure all records have valid names
        for record in sequences:
            fix_record_name_id(record, all_record_ids)

    return sequences
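
The duplicate-id handling above relies on antismash's generate_unique_id() helper; a hypothetical stand-in (not the real implementation) showing the intended effect:

from typing import Set, Tuple

def generate_unique_id_stub(prefix: str, existing: Set[str]) -> Tuple[str, int]:
    """ Returns the first numbered variant of prefix not already in existing. """
    index = 0
    while "%s_%d" % (prefix, index) in existing:
        index += 1
    return "%s_%d" % (prefix, index), index

existing_ids = {"contig", "contig_0"}
print(generate_unique_id_stub("contig", existing_ids))  # ('contig_1', 1)
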
Example #10
def parse_input_sequence(filename: str,
                         taxon: str = "bacteria",
                         minimum_length: int = -1,
                         start: int = -1,
                         end: int = -1,
                         gff_file: str = "") -> List[Record]:
    """ Parse input records contained in a file

        Arguments:
            filename: the path of the file to read
            taxon: the taxon of the input, e.g. 'bacteria', 'fungi'
            minimum_length: records with length less than this will be ignored;
                            if not positive, all records are included
            start: a start location for trimming the sequence, or -1 to use all
            end: an end location for trimming the sequence, or -1 to use all
            gff_file: a GFF file to use for gene/CDS annotations

        Returns:
            A list of secmet.Record instances, one for each record in the file
    """
    logging.info('Parsing input sequence %r', filename)
    if not isinstance(minimum_length, int):
        raise TypeError("minimum_length must be an int")

    records = []  # type: List[SeqRecord]

    for record in _strict_parse(filename):
        if minimum_length < 1 \
                or len(record.seq) >= minimum_length \
                or 'contig' in record.annotations \
                or 'wgs_scafld' in record.annotations \
                or 'wgs' in record.annotations:
            records.append(record)

    # if no records are left, that's a problem
    if not records:
        raise AntismashInputError(
            "all input records smaller than minimum length (%d)" %
            minimum_length)

    for record in records:
        if isinstance(
                record.seq.alphabet,
                Bio.Alphabet.ProteinAlphabet) or not is_nucl_seq(record.seq):
            raise AntismashInputError("protein records are not supported: %s" %
                                      record.id)

    # before conversion to secmet records, trim if required
    if start > -1 or end > -1:
        if len(records) > 1:
            raise ValueError(
                "--start and --end options cannot be used with multiple records"
            )
        records[0] = trim_sequence(records[0], max(start, 0),
                                   min(len(records[0]), end))

    # add GFF features before conversion, if relevant
    if gff_file:
        logging.debug("Loading annotations from GFF file")
        # check GFF suitability first
        try:
            gff_parser.check_gff_suitability(gff_file, records)
        except AntismashInputError:
            raise
        except Exception as err:
            # avoid swallowing details if possible
            if str(err):
                logging.error(err)
            raise AntismashInputError(
                "could not parse records from GFF3 file") from err
        gff_features = gff_parser.run(gff_file)
        for record in records:
            if any(feature.type == "CDS" for feature in record.features):
                continue
            record.features.extend(gff_features.get(record.id, []))

    # remove any previous or obsolete antiSMASH annotations to minimise incompatibilities
    for record in records:
        strip_record(record)

    logging.debug("Converting records from biopython to secmet")
    try:
        records = [Record.from_biopython(record, taxon) for record in records]
    except SecmetInvalidInputError as err:
        raise AntismashInputError(str(err)) from err

    # if parsable by secmet, it has better context on what to strip, so run
    # the secmet stripping to ensure there are no surprises
    for record in records:
        record.strip_antismash_annotations()

    return records
Example #11
def parse_input_sequence(filename: str, taxon: str = "bacteria", minimum_length: int = -1,
                         start: int = -1, end: int = -1, gff_file: str = "") -> List[Record]:
    """ Parse input records contained in a file

        Arguments:
            filename: the path of the file to read
            taxon: the taxon of the input, e.g. 'bacteria', 'fungi'
            minimum_length: records with length less than this will be ignored;
                            if not positive, all records are included
            start: a start location for trimming the sequence, or -1 to use all
            end: an end location for trimming the sequence, or -1 to use all
            gff_file: a GFF file to use for gene/CDS annotations

        Returns:
            A list of secmet.Record instances, one for each record in the file
    """
    logging.info('Parsing input sequence %r', filename)
    if not isinstance(minimum_length, int):
        raise TypeError("minimum_length must be an int")

    records = []  # type: List[SeqRecord]

    for record in _strict_parse(filename):
        if minimum_length < 1 \
                or len(record.seq) >= minimum_length \
                or 'contig' in record.annotations \
                or 'wgs_scafld' in record.annotations \
                or 'wgs' in record.annotations:
            records.append(record)

    # if no records are left, that's a problem
    if not records:
        raise AntismashInputError("no valid records found in file %r" % filename)

    for record in records:
        if isinstance(record.seq.alphabet, Bio.Alphabet.ProteinAlphabet) or not is_nucl_seq(record.seq):
            raise AntismashInputError("protein records are not supported")

    # before conversion to secmet records, trim if required
    if start > -1 or end > -1:
        if len(records) > 1:
            raise ValueError("--start and --end options cannot be used with multiple records")
        records[0] = trim_sequence(records[0], max(start, 0), min(len(records[0]), end))

    # add GFF features before conversion, if relevant
    if gff_file:
        logging.debug("Loading annotations from GFF file")
        # check GFF suitability first
        single_entry = False
        try:
            single_entry = gff_parser.check_gff_suitability(gff_file, records)
        except AntismashInputError:
            raise
        except Exception as err:
            raise AntismashInputError("could not parse records from GFF3 file") from err
        # then add any features found for any record with no CDS features
        partial = functools.partial(_add_gff_features, single_entry, gff_file)
        records = parallel_function(partial, ([record] for record in records))
        for record in records:
            if any(feature.type == "CDS" for feature in record.features):
                continue
            gff_features = gff_parser.run(record.id, single_entry, gff_file)
            record.features.extend(gff_features)

    # remove any previous or obsolete antiSMASH features so conversion can be clean
    for record in records:
        strip_record(record)

    logging.debug("Converting records from biopython to secmet")
    try:
        return [Record.from_biopython(record, taxon) for record in records]
    except SecmetInvalidInputError as err:
        raise AntismashInputError(str(err)) from err
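
Tying the final version together, a possible end-to-end call with hypothetical sequence and annotation paths:

# both paths are hypothetical example inputs
records = parse_input_sequence("input/genome.fasta", taxon="bacteria",
                               gff_file="input/genome.gff3")
print("parsed %d record(s)" % len(records))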