def actual_phage_cds(gbkf, verbose=False):
    """
    Split the CDS features of a genbank file into phage and non-phage proteins.

    A CDS counts as phage when it has an ``is_phage`` qualifier; every other
    CDS counts as non-phage. A CDS without a ``product`` qualifier is given a
    placeholder "Hypothetical protein" product (stored on the feature itself).

    :param gbkf: the test genbank file with CDS marked with is_phage
    :param verbose: more output
    :return: a tuple ``(phage, nonphage)`` of two dicts. Each dict maps the
        upper-cased translation of a CDS to the first entry of its product
        qualifier; CDS with identical translations share a single entry
        (the last one read wins).
    """

    if verbose:
        sys.stderr.write(f"Reading {gbkf}\n")

    phage, nonphage = {}, {}
    for record in genbank_seqio(gbkf):
        for cds in record.features:
            if cds.type != 'CDS':
                continue

            quals = cds.qualifiers
            if 'product' not in quals:
                quals['product'] = [
                    f"Hypothetical protein (not annotated in {gbkf})"
                ]

            # choose the destination first so there is a single translate call
            bucket = phage if 'is_phage' in quals else nonphage
            product = quals['product'][0]
            protein = str(cds.translate(record, cds=False).seq).upper()
            bucket[protein] = product

    return phage, nonphage
def predicted_genbank(predf, verbose=False):
    """
    Collect the proteins of every CDS in a genbank file of predictions.

    :param predf: the predictions file
    :param verbose: more output
    :return: a dict mapping the upper-cased translation of each CDS to the
        first entry of its product qualifier, or to a placeholder
        "Hypothetical protein" string when the CDS has no product qualifier
    """

    if verbose:
        sys.stderr.write(f"Reading {predf}\n")

    predicted = {}
    for record in genbank_seqio(predf):
        coding = (f for f in record.features if f.type == 'CDS')
        for cds in coding:
            product = (
                cds.qualifiers['product'][0]
                if 'product' in cds.qualifiers
                else f"Hypothetical protein (not annotated in {predf})"
            )
            protein = str(cds.translate(record, cds=False).seq).upper()
            predicted[protein] = product

    if verbose:
        sys.stderr.write(
            f"Found {len(predicted)} predicted prophage features\n")

    return predicted
def predicted_regions(regf, gbkf, verbose=False):
    """
    Pull the phage genes from the regions

    The regions file is tab separated with exactly three columns per line:
    contig, start, stop. Start and stop may be given in either order; they
    are stored as (smaller, larger). Blank lines are ignored.

    NOTE(review): start and stop are used directly as Python slice indices on
    the sequence record (``seq[start:stop]``), i.e. 0-based and end-exclusive.
    Confirm that this matches the coordinate convention of the tool that
    wrote the regions file.

    :param regf: the regions file with contigs/start/stop
    :param gbkf: the genbank file used to make those predictions
    :param verbose: more output
    :return: a dict mapping the upper-cased translation of each CDS found in
        a sliced region to the first entry of its product qualifier, or to a
        placeholder "Hypothetical protein" string when it has none
    :raises AssertionError: if a non-blank line does not have three columns
    :raises ValueError: if start or stop is not an integer
    """

    if verbose:
        sys.stderr.write(f"Reading {regf}\n")

    regions = {}
    with open(regf, 'r') as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # tolerate empty lines instead of aborting the whole run
                continue
            p = stripped.split("\t")
            if len(p) != 3:
                # Raised explicitly (rather than via ``assert``) so that the
                # check still runs under ``python -O``; the exception type is
                # kept for backward compatibility.
                raise AssertionError(
                    f"Expected a tuple of [contig, start, stop] in {regf}")
            start, stop = sorted((int(p[1]), int(p[2])))
            regions.setdefault(p[0], []).append([start, stop])

    if verbose:
        sys.stderr.write(f"Reading {gbkf} again to get the phage regions\n")

    predicted = {}
    for seq in genbank_seqio(gbkf):
        if seq.id not in regions:
            continue
        for start, stop in regions[seq.id]:
            if verbose:
                sys.stderr.write(f"Getting from {start} to {stop}\n")
            # slice the record once per region, not once per feature
            region = seq[start:stop]
            for feat in region.features:
                if feat.type != 'CDS':
                    continue
                if 'product' in feat.qualifiers:
                    product = feat.qualifiers['product'][0]
                else:
                    product = f"Hypothetical protein (not annotated in {gbkf})"
                protein = str(feat.translate(region, cds=False).seq).upper()
                predicted[protein] = product

    if verbose:
        sys.stderr.write(
            f"Found {len(predicted)} predicted prophage features\n")

    return predicted
Example #4
0
def genbank_to_pandas(gbkf,
                      mincontiglen,
                      ignorepartials=True,
                      convert_selenocysteine=False):
    """
    Convert the CDS features of a genbank file to the data frame used by
    phage_boost.

    It is a simple dataframe with one row per retained CDS and these columns:
        ['contig',     the sequence id
         'id',         a running counter over all retained CDS in the file
         'start',      feature location start
         'stop',       feature location end
         'direction',  feature strand
         'partial',    1 if the CDS has a 'truncated' qualifier, otherwise 0
         'DNAseq',     the extracted nucleotide sequence
         'AAseq',      a de novo translation with trailing '*' removed
         'header']     '<contig>_<id>'

    A CDS is skipped (with a message on stderr) when its extracted DNA is
    empty, when its translation is empty, or, if ignorepartials is set, when
    its translation contains any of B, Z, J, X or an internal '*'. Contigs
    shorter than mincontiglen are skipped entirely.

    :param gbkf: Genbank file to parse
    :param mincontiglen: minimum contig length
    :param ignorepartials: Ignore any gene call with a frameshift (ie. a stop codon in the middle of the sequence)
        or an ambiguous amino acid code
    :param convert_selenocysteine: PhageBoost crashes with a selenocysteine protein because it is not in Biopython,
        so replace U with C in the translation
    :return: a pandas data frame
    """

    # Partial amino acid codes we should ignore. These are not present in BioPython's SeqUtils::ProtParam
    # and it crashes the system. A tuple keeps the order of the stderr
    # messages stable between runs.
    paa = ('B', 'Z', 'J', 'X', '*')

    c = 0
    genes = []
    for seq in genbank_seqio(gbkf):
        if len(seq) < mincontiglen:
            # both halves must be f-strings or {mincontiglen} is printed literally
            sys.stderr.write(
                f"Skipped {seq.id} because it's length ({len(seq)}) is less than the "
                f"minimum contig length ({mincontiglen})\n")
            continue

        for feat in seq.features:
            if feat.type != 'CDS':
                continue

            # NOTE: the original author doubted that 'truncated' is the right
            # marker for a partial gene
            partial = 1 if 'truncated' in feat.qualifiers else 0

            # extract once and reuse it for both the DNA and the translation
            extracted = feat.extract(seq)
            dnaseq = str(extracted.seq)
            if not dnaseq:
                sys.stderr.write(
                    f"The DNA sequence for {feature_id(seq, feat)} was zero, so skipped\n"
                )
                continue

            # we just do a de novo translation rather than relying on the translation provided
            # in the genbank file that is often wrong
            trans = str(extracted.translate().seq).rstrip('*')

            if ignorepartials:
                # after the rstrip above, any remaining '*' is an internal stop
                offending = [aa for aa in paa if aa in trans]
                for aa in offending:
                    sys.stderr.write(
                        f"There is a {aa} in  {feature_id(seq, feat)} so skipped.\n"
                    )
                if offending:
                    continue

            if not trans:
                sys.stderr.write(
                    f"The translation for {feature_id(seq, feat)} was zero, so skipped.\n"
                )
                continue

            if convert_selenocysteine:
                trans = trans.replace('U', 'C')

            genes.append([
                seq.id, c, feat.location.start.position,
                feat.location.end.position, feat.strand, partial, dnaseq,
                trans, f"{seq.id}_{c}"
            ])
            c += 1

    return pd.DataFrame(genes,
                        columns=[
                            'contig', 'id', 'start', 'stop', 'direction',
                            'partial', 'DNAseq', 'AAseq', 'header'
                        ])