Ejemplo n.º 1
0
    def test_suitability(self):
        """ Checks check_gff_suitability() with a renamed first record, with
            the record id restored, and with the first record dropped from
            the list.
        """
        mismatch_text = "GFF3 record IDs don't match sequence file record IDs"

        # with the first record renamed, the check is expected to raise
        self.sequences[0].id = "NOT_CONTIG_1"
        with self.assertRaises(ValueError) as context:
            gff_parser.check_gff_suitability(self.config, self.sequences)
        assert mismatch_text in str(context.exception)

        # with the id restored and the parser run on the first record, the
        # check is expected to return a falsy value
        # (this doesn't test very much)
        first_record = self.sequences[0]
        first_record.id = "CONTIG_1"
        gff_parser.run(first_record, self.single_entry, self.config)  # insert the features
        outcome = gff_parser.check_gff_suitability(self.config, self.sequences)
        assert not outcome

        # test force correlation: without the first record, the check is
        # expected to return a truthy value
        remaining = self.sequences[1:]  # CONTIG_2
        self.sequences = remaining
        outcome = gff_parser.check_gff_suitability(self.config, self.sequences)
        assert outcome
Ejemplo n.º 2
0
    def test_suitability(self):
        """ Checks check_gff_suitability() with a renamed first record, with
            the record id restored, and with the first record dropped from
            the list.
        """
        mismatch_pattern = "GFF3 record IDs don't match sequence file record IDs"

        # with the first record renamed, the check is expected to raise an
        # input error carrying the mismatch message
        self.sequences[0].id = "NOT_CONTIG_1"
        with self.assertRaisesRegex(errors.AntismashInputError, mismatch_pattern):
            gff_parser.check_gff_suitability(self.config, self.sequences)

        # with the id restored and the parser run on the first record, the
        # check is expected to return a falsy value
        # (this doesn't test very much)
        first_record = self.sequences[0]
        first_record.id = "CONTIG_1"
        gff_parser.run(first_record, self.single_entry, self.config)  # insert the features
        outcome = gff_parser.check_gff_suitability(self.config, self.sequences)
        assert not outcome

        # test force correlation: without the first record, the check is
        # expected to return a truthy value
        remaining = self.sequences[1:]  # CONTIG_2
        self.sequences = remaining
        outcome = gff_parser.check_gff_suitability(self.config, self.sequences)
        assert outcome
Ejemplo n.º 3
0
    def test_suitability(self):
        """ Checks check_gff_suitability() with a renamed first record, with
            the record id restored and parsed features attached, and with
            the first record dropped from the list.
        """
        mismatch_pattern = "GFF3 record IDs don't match sequence file record IDs"

        # with the first record renamed, the check is expected to raise an
        # input error carrying the mismatch message
        self.sequences[0].id = "NOT_CONTIG_1"
        with self.assertRaisesRegex(errors.AntismashInputError, mismatch_pattern):
            gff_parser.check_gff_suitability(self.gff_file, self.sequences)

        # restore the id, attach whatever the parser returns for that id to
        # the first record, then expect a falsy result from the check
        first_record = self.sequences[0]
        first_record.id = "CONTIG_1"
        parsed = gff_parser.run("CONTIG_1", self.single_entry, self.gff_file)
        first_record.features.extend(parsed)
        outcome = gff_parser.check_gff_suitability(self.gff_file, self.sequences)
        assert not outcome

        # test force correlation: without the first record, the check is
        # expected to return a truthy value
        remaining = self.sequences[1:]  # CONTIG_2
        self.sequences = remaining
        outcome = gff_parser.check_gff_suitability(self.gff_file, self.sequences)
        assert outcome
    def test_suitability(self):
        """ Checks that check_gff_suitability() raises for a renamed first
            record and runs without raising once the id is restored, both
            with the full record list and with the first record dropped.
        """
        mismatch_pattern = "GFF3 record IDs don't match sequence file record IDs"

        # with the first record renamed, the check is expected to raise an
        # input error carrying the mismatch message
        self.sequences[0].id = "NOT_CONTIG_1"
        with self.assertRaisesRegex(errors.AntismashInputError, mismatch_pattern):
            gff_parser.check_gff_suitability(self.gff_file, self.sequences)

        # with the id restored, the check must complete without raising;
        # its return value is not inspected here
        first_record = self.sequences[0]
        first_record.id = "CONTIG_1"
        gff_parser.check_gff_suitability(self.gff_file, self.sequences)

        # test force correlation: the same holds without the first record
        remaining = self.sequences[1:]  # CONTIG_2
        self.sequences = remaining
        gff_parser.check_gff_suitability(self.gff_file, self.sequences)
Ejemplo n.º 5
0
def pre_process_sequences(sequences: List[Record], options: ConfigType, genefinding: AntismashModule) -> List[Record]:
    """ Prepares parsed records for analysis.

        Every record is given a 1-based record_index, then records are
        filtered by name (options.limit_to_record), by minimum length
        (options.minlength) and by count (options.limit). If a GFF3 file is
        configured, it is checked for suitability against the records.

        Unless options.reuse_results or options.skip_sanitisation is set,
        records are also:
        - passed through sanitise_sequence() and check_content()
        - passed through ensure_cds_info(), using genefinding.run_on_record
        - checked for duplicate CDS gene ids
        - given unique record ids
        - given valid record ids

        Note: Record instances will be altered in-place.

        Arguments:
            sequences: the secmet.Record instances to process
            options: an antismash Config instance
            genefinding: the module to use for genefinding, must have
                         run_on_record() implemented

        Returns:
            A list of altered secmet.Record

        Raises:
            AntismashInputError: if the records contain shotgun scaffolds,
                                 if a record has no id, or if the GFF3
                                 suitability check fails for any reason
    """
    logging.debug("Preprocessing %d sequences", len(sequences))

    # catch WGS master or supercontig entries
    if records_contain_shotgun_scaffolds(sequences):
        raise AntismashInputError("incomplete whole genome shotgun records are not supported")

    # record indices are 1-indexed
    for index, record in enumerate(sequences, start=1):
        record.record_index = index

    checking_required = not options.reuse_results and not options.skip_sanitisation

    # keep sequences as clean as possible and make sure they're valid
    if checking_required:
        logging.debug("Sanitising record sequences")
        if len(sequences) == 1:
            # a lone record is handled directly instead of via parallel_function
            sequences = [check_content(sanitise_sequence(sequences[0]))]
        else:
            sequences = parallel_function(sanitise_sequence, ([entry] for entry in sequences))
            sequences = parallel_function(check_content, ([entry] for entry in sequences))

    for record in sequences:
        if record.skip or not record.seq:
            logging.warning("Record %s has no sequence, skipping.", record.id)
        if not record.id:
            raise AntismashInputError("record has no name")

    # skip anything not matching the filter
    filter_records_by_name(sequences, options.limit_to_record)

    # Now remove small contigs < minimum length again
    logging.debug("Removing sequences smaller than %d bases", options.minlength)
    for record in sequences:
        if len(record.seq) < options.minlength:
            record.skip = "smaller than minimum length (%d)" % options.minlength

    # Make sure we don't waste weeks of runtime on huge records, unless requested by the user
    limit_hit = filter_records_by_count(sequences, options.limit)
    if limit_hit:
        logging.warning("Only analysing the first %d records (increase via --limit)", options.limit)
    update_config({"triggered_limit": limit_hit})

    # Check GFF suitability, keeping the result for the CDS checks below
    single_entry = False
    if options.genefinding_gff3:
        try:
            single_entry = gff_parser.check_gff_suitability(options, sequences)
        except AntismashInputError:
            # already the right kind of error, pass it on untouched
            raise
        except Exception as err:
            raise AntismashInputError("could not parse records from GFF3 file") from err

    # everything below only applies when sanitisation is required
    if not checking_required:
        return sequences

    # ensure CDS features have all relevant information
    logging.debug("Ensuring CDS features have all required information")
    assert hasattr(genefinding, "run_on_record")
    cds_checker = functools.partial(ensure_cds_info, single_entry, genefinding.run_on_record)
    sequences = parallel_function(cds_checker, ([entry] for entry in sequences))

    # Check if no duplicate locus tags / gene IDs are found
    logging.debug("Ensuring CDS features do not have duplicate IDs")
    ensure_no_duplicate_cds_gene_ids(sequences)

    # Ensure all records have unique names
    all_record_ids = {record.id for record in sequences}
    if len(all_record_ids) < len(sequences):
        # at least one id is shared, so rebuild the set in record order,
        # renaming each record whose id has already been seen
        all_record_ids = set()
        for record in sequences:
            if record.id in all_record_ids:
                record.original_id = record.id
                record.id = generate_unique_id(record.id, all_record_ids)[0]
            all_record_ids.add(record.id)
        assert len(all_record_ids) == len(sequences), "%d != %d" % (len(all_record_ids), len(sequences))

    # Ensure all records have valid names
    for record in sequences:
        fix_record_name_id(record, all_record_ids)

    return sequences
Ejemplo n.º 6
0
def pre_process_sequences(sequences, options, genefinding) -> List[Record]:
    """ Prepares parsed records for analysis.

        Every record is given a 0-based record_index, then records are
        marked as skipped if they do not match options.limit_to_record, are
        shorter than options.minlength, or fall beyond the first
        options.limit unskipped records. If a GFF3 file is configured, it is
        checked for suitability against the records.

        Unless options.reuse_results or options.skip_sanitisation is set,
        records are also:
        - passed through sanitise_sequence() and check_content()
        - passed through ensure_cds_info(), using genefinding.run_on_record
        - checked for duplicate CDS gene ids
        - given unique record ids
        - given valid record ids

        Note: Record instances will be altered in-place.

        Arguments:
            sequences: the secmet.Record instances to process
            options: an antismash Config instance
            genefinding: the module to use for genefinding, must have
                         run_on_record() implemented

        Returns:
            A list of altered secmet.Record

        Raises:
            RuntimeError: if the records contain shotgun scaffolds
            ValueError: if options.limit_to_record is set and no record id
                        matches it
    """
    logging.debug("Preprocessing %d sequences", len(sequences))

    # catch WGS master or supercontig entries
    if records_contain_shotgun_scaffolds(sequences):
        raise RuntimeError(
            "Incomplete whole genome shotgun records are not supported")

    for index, record in enumerate(sequences):
        record.record_index = index

    checking_required = not options.reuse_results and not options.skip_sanitisation

    # keep sequences as clean as possible and make sure they're valid
    if checking_required:
        logging.debug("Sanitising record sequences")
        if len(sequences) == 1:
            # a lone record is handled directly instead of via parallel_function
            sequences = [check_content(sanitise_sequence(sequences[0]))]
        else:
            sequences = parallel_function(sanitise_sequence,
                                          ([entry] for entry in sequences))
            sequences = parallel_function(check_content,
                                          ([entry] for entry in sequences))

    for record in sequences:
        if record.skip or not record.seq:
            logging.warning("Record %s has no sequence, skipping.", record.id)

    limit = options.limit_to_record
    if limit:
        logging.debug("Limiting to record id: %s", limit)
        # run the filter, keeping count of how many records matched
        matching_filter = 0
        for record in sequences:
            if limit != record.id:
                record.skip = "did not match filter: %s" % limit
            else:
                matching_filter += 1
        if matching_filter == 0:
            logging.error("No sequences matched filter: %s", limit)
            raise ValueError("No sequences matched filter: %s" % limit)
        if matching_filter != len(sequences):
            logging.info("Skipped %d sequences not matching filter: %s",
                         len(sequences) - matching_filter, limit)

    # Now remove small contigs < minimum length again
    logging.debug("Removing sequences smaller than %d bases",
                  options.minlength)
    for record in sequences:
        if len(record.seq) < options.minlength:
            record.skip = "smaller than minimum length (%d)" % options.minlength

    # Make sure we don't waste weeks of runtime on huge records, unless requested by the user
    warned = False
    if options.limit > -1:
        meaningful = 0
        for record in sequences:
            # records already marked as skipped don't count towards the limit
            if record.skip:
                continue
            meaningful += 1
            if meaningful <= options.limit:
                continue
            # only warn for the first record over the limit
            if not warned:
                logging.warning(
                    "Only analysing the first %d records (increase via --limit)",
                    options.limit)
                warned = True
            record.skip = "skipping all but first {0} meaningful records (--limit {0}) ".format(
                options.limit)

    # the config returned by the update is the one used from here on
    options = update_config({"triggered_limit": warned})  # TODO is there a better way

    # Check GFF suitability, keeping the result for the CDS checks below
    single_entry = False
    if options.genefinding_gff3:
        single_entry = gff_parser.check_gff_suitability(options, sequences)

    # everything below only applies when sanitisation is required
    if not checking_required:
        return sequences

    # ensure CDS features have all relevant information
    logging.debug("Ensuring CDS features have all required information")
    cds_checker = functools.partial(ensure_cds_info, single_entry,
                                    genefinding.run_on_record)
    sequences = parallel_function(cds_checker,
                                  ([entry] for entry in sequences))

    # Check if no duplicate locus tags / gene IDs are found
    logging.debug("Ensuring CDS features do not have duplicate IDs")
    ensure_no_duplicate_cds_gene_ids(sequences)

    # Ensure all records have unique names
    all_record_ids = {record.id for record in sequences}
    if len(all_record_ids) < len(sequences):
        # at least one id is shared, so rebuild the set in record order,
        # renaming each record whose id has already been seen
        all_record_ids = set()
        for record in sequences:
            if record.id in all_record_ids:
                record.original_id = record.id
                record.id = generate_unique_id(record.id, all_record_ids)[0]
            all_record_ids.add(record.id)
        assert len(all_record_ids) == len(sequences), \
            "%d != %d" % (len(all_record_ids), len(sequences))

    # Ensure all records have valid names
    for record in sequences:
        fix_record_name_id(record, all_record_ids)

    return sequences
Ejemplo n.º 7
0
def parse_input_sequence(filename: str,
                         taxon: str = "bacteria",
                         minimum_length: int = -1,
                         start: int = -1,
                         end: int = -1,
                         gff_file: str = "") -> List[Record]:
    """ Reads the records contained in a file and converts them to
        secmet.Record instances, optionally trimming a single record and
        adding features from a GFF file along the way.

        Arguments:
            filename: the path of the file to read
            taxon: the taxon of the input, e.g. 'bacteria', 'fungi'
            minimum_length: records with length less than this will be ignored
                            if not positive, all records are included
            start: a start location for trimming the sequence, or -1 to use all
            end: an end location for trimming the sequence, or -1 to use all
            gff_file: a GFF file to use for gene/CDS annotations

        Returns:
            A list of secmet.Record instances, one for each record in the file

        Raises:
            TypeError: if minimum_length is not an int
            ValueError: if start or end is given and more than one record
                        was kept
            AntismashInputError: if no records were kept, if a record is a
                                 protein record, if the GFF file fails its
                                 suitability check, or if conversion to
                                 secmet fails
    """
    logging.info('Parsing input sequence %r', filename)
    if not isinstance(minimum_length, int):
        raise TypeError("minimum_length must be an int")

    # annotation keys that exempt a record from the minimum length filter
    exempting_annotations = ('contig', 'wgs_scafld', 'wgs')

    records = []  # type: List[SeqRecord]
    for candidate in _strict_parse(filename):
        long_enough = minimum_length < 1 or len(candidate.seq) >= minimum_length
        if long_enough or any(key in candidate.annotations for key in exempting_annotations):
            records.append(candidate)

    # if no records are left, that's a problem
    if not records:
        raise AntismashInputError(
            "all input records smaller than minimum length (%d)" % minimum_length)

    for record in records:
        is_protein = isinstance(record.seq.alphabet, Bio.Alphabet.ProteinAlphabet)
        if is_protein or not is_nucl_seq(record.seq):
            raise AntismashInputError("protein records are not supported: %s" % record.id)

    # before conversion to secmet records, trim if required
    trimming_requested = start > -1 or end > -1
    if trimming_requested:
        if len(records) > 1:
            raise ValueError("--start and --end options cannot be used with multiple records")
        only_record = records[0]
        records[0] = trim_sequence(only_record, max(start, 0), min(len(only_record), end))

    # add GFF features before conversion, if relevant
    if gff_file:
        logging.debug("Loading annotations from GFF file")
        # check GFF suitability first
        try:
            gff_parser.check_gff_suitability(gff_file, records)
        except AntismashInputError:
            # already the right kind of error, pass it on untouched
            raise
        except Exception as err:
            # avoid swallowing details if possible
            if str(err):
                logging.error(err)
            raise AntismashInputError("could not parse records from GFF3 file") from err
        features_by_record = gff_parser.run(gff_file)
        for record in records:
            # records that already have CDS features are left as they are
            has_cds = any(feature.type == "CDS" for feature in record.features)
            if not has_cds:
                record.features.extend(features_by_record.get(record.id, []))

    # remove any previous or obsolete antiSMASH annotations to minimise incompatibilities
    for record in records:
        strip_record(record)

    logging.debug("Converting records from biopython to secmet")
    try:
        secmet_records = [Record.from_biopython(record, taxon) for record in records]
    except SecmetInvalidInputError as err:
        raise AntismashInputError(str(err)) from err

    # if parsable by secmet, it has a better context on what to strip, so run
    # the secmet stripping to ensure there's no surprises
    for secmet_record in secmet_records:
        secmet_record.strip_antismash_annotations()

    return secmet_records
Ejemplo n.º 8
0
def parse_input_sequence(filename: str, taxon: str = "bacteria", minimum_length: int = -1,
                         start: int = -1, end: int = -1, gff_file: str = "") -> List[Record]:
    """ Reads the records contained in a file and converts them to
        secmet.Record instances, optionally trimming a single record and
        adding features from a GFF file along the way.

        Arguments:
            filename: the path of the file to read
            taxon: the taxon of the input, e.g. 'bacteria', 'fungi'
            minimum_length: records with length less than this will be ignored
                            if not positive, all records are included
            start: a start location for trimming the sequence, or -1 to use all
            end: an end location for trimming the sequence, or -1 to use all
            gff_file: a GFF file to use for gene/CDS annotations

        Returns:
            A list of secmet.Record instances, one for each record in the file

        Raises:
            TypeError: if minimum_length is not an int
            ValueError: if start or end is given and more than one record
                        was kept
            AntismashInputError: if no records were kept, if a record is a
                                 protein record, if the GFF file fails its
                                 suitability check, or if conversion to
                                 secmet fails
    """
    logging.info('Parsing input sequence %r', filename)
    if not isinstance(minimum_length, int):
        raise TypeError("minimum_length must be an int")

    # annotation keys that exempt a record from the minimum length filter
    exempting_annotations = ('contig', 'wgs_scafld', 'wgs')

    records = []  # type: List[SeqRecord]
    for candidate in _strict_parse(filename):
        long_enough = minimum_length < 1 or len(candidate.seq) >= minimum_length
        if long_enough or any(key in candidate.annotations for key in exempting_annotations):
            records.append(candidate)

    # if no records are left, that's a problem
    if not records:
        raise AntismashInputError("no valid records found in file %r" % filename)

    for record in records:
        is_protein = isinstance(record.seq.alphabet, Bio.Alphabet.ProteinAlphabet)
        if is_protein or not is_nucl_seq(record.seq):
            raise AntismashInputError("protein records are not supported")

    # before conversion to secmet records, trim if required
    trimming_requested = start > -1 or end > -1
    if trimming_requested:
        if len(records) > 1:
            raise ValueError("--start and --end options cannot be used with multiple records")
        only_record = records[0]
        records[0] = trim_sequence(only_record, max(start, 0), min(len(only_record), end))

    # add GFF features before conversion, if relevant
    if gff_file:
        logging.debug("Loading annotations from GFF file")
        # check GFF suitability first
        single_entry = False
        try:
            single_entry = gff_parser.check_gff_suitability(gff_file, records)
        except AntismashInputError:
            # already the right kind of error, pass it on untouched
            raise
        except Exception as err:
            raise AntismashInputError("could not parse records from GFF3 file") from err
        # then add any features found for any record with no CDS features
        feature_adder = functools.partial(_add_gff_features, single_entry, gff_file)
        records = parallel_function(feature_adder, ([entry] for entry in records))
        # NOTE(review): records still lacking CDS features are run through
        # gff_parser.run() again here, after _add_gff_features() has already
        # been applied to every record above. _add_gff_features() is not
        # visible from this function, so confirm whether this second pass
        # is still needed.
        for record in records:
            has_cds = any(feature.type == "CDS" for feature in record.features)
            if not has_cds:
                extra_features = gff_parser.run(record.id, single_entry, gff_file)
                record.features.extend(extra_features)

    # remove any previous or obsolete antiSMASH features so conversion can be clean
    for record in records:
        strip_record(record)

    logging.debug("Converting records from biopython to secmet")
    try:
        secmet_records = [Record.from_biopython(record, taxon) for record in records]
    except SecmetInvalidInputError as err:
        raise AntismashInputError(str(err)) from err
    return secmet_records