def actual_phage_cds(gbkf, verbose=False):
    """
    Split the CDS features of a genbank file into phage and non-phage proteins.

    A CDS counts as phage when it carries an ``is_phage`` qualifier. Proteins
    are keyed on their upper-cased translation, so two CDS with an identical
    translation collapse into one entry (the last one seen wins).

    :param gbkf: the test genbank file with CDS marked with is_phage
    :param verbose: more output
    :return: a tuple ``(phage, nonphage)`` of dicts, each mapping the
        translated protein sequence to the first ``product`` qualifier value
    """

    if verbose:
        sys.stderr.write(f"Reading {gbkf}\n")

    phage, nonphage = {}, {}
    for record in genbank_seqio(gbkf):
        for cds in record.features:
            if cds.type != 'CDS':
                continue
            # Unannotated CDS get a placeholder product stored on the feature
            # itself; an existing product qualifier is left untouched.
            cds.qualifiers.setdefault(
                'product',
                [f"Hypothetical protein (not annotated in {gbkf})"]
            )
            bucket = phage if 'is_phage' in cds.qualifiers else nonphage
            product = cds.qualifiers['product'][0]
            protein = str(cds.translate(record, cds=False).seq).upper()
            bucket[protein] = product

    return phage, nonphage
def predicted_genbank(predf, verbose=False):
    """
    Collect the proteins of every CDS in a genbank file of predictions.

    Proteins are keyed on their upper-cased translation, so CDS sharing an
    identical translation collapse into a single entry.

    :param predf: the predictions file
    :param verbose: more output
    :return: a dict mapping each translated protein sequence to the first
        ``product`` qualifier value, or to a "Hypothetical protein"
        placeholder when the CDS has no product
    """

    if verbose:
        sys.stderr.write(f"Reading {predf}\n")

    predicted = {}
    for record in genbank_seqio(predf):
        for cds in record.features:
            if cds.type != 'CDS':
                continue
            if 'product' in cds.qualifiers:
                product = cds.qualifiers['product'][0]
            else:
                product = f"Hypothetical protein (not annotated in {predf})"
            protein = str(cds.translate(record, cds=False).seq).upper()
            predicted[protein] = product

    if verbose:
        sys.stderr.write(
            f"Found {len(predicted)} predicted prophage features\n")

    return predicted
def predicted_regions(regf, gbkf, verbose=False):
    """
    Pull the phage genes from the predicted regions.

    The regions file is tab-separated with three columns per line:
    contig, start, stop. Start and stop may be given in either order; they
    are stored as (smaller, larger). Blank lines are ignored. Each region is
    then cut out of the matching record in the genbank file (matched on the
    record id) and the CDS features of that slice are translated.

    :param regf: the regions file with contigs/start/stop
    :param gbkf: the genbank file used to make those predictions
    :param verbose: more output
    :return: a dict mapping each upper-cased translated protein sequence to
        the first ``product`` qualifier value, or to a "Hypothetical protein"
        placeholder when the CDS has no product
    :raises AssertionError: if a non-blank line does not have three columns
    :raises ValueError: if start or stop is not an integer
    """

    regions = {}
    if verbose:
        sys.stderr.write(f"Reading {regf}\n")
    with open(regf, 'r') as f:
        for raw in f:
            line = raw.strip()
            if not line:
                # A blank line (typically a trailing newline at the end of
                # the file) is not a malformed region, so skip it.
                continue
            p = line.split("\t")
            assert (len(p) == 3
                    ), f"Expected a tple of [contig, start, stop] in {regf}"
            start, stop = int(p[1]), int(p[2])
            # Normalise the coordinates so the slice below is never reversed.
            regions.setdefault(p[0], []).append(sorted([start, stop]))

    if verbose:
        sys.stderr.write(f"Reading {gbkf} again to get the phage regions\n")

    predicted = {}
    for seq in genbank_seqio(gbkf):
        if seq.id not in regions:
            continue
        for begin, end in regions[seq.id]:
            if verbose:
                sys.stderr.write(f"Getting from {begin} to {end}\n")
            # Slice the record once per region: the slice is both the source
            # of the features and the parent sequence they are translated
            # against, and re-slicing it for every CDS is wasted work.
            region = seq[begin:end]
            for feat in region.features:
                if feat.type != 'CDS':
                    continue
                if 'product' in feat.qualifiers:
                    product = feat.qualifiers['product'][0]
                else:
                    product = f"Hypothetical protein (not annotated in {gbkf})"
                protein = str(feat.translate(region, cds=False).seq).upper()
                predicted[protein] = product

    if verbose:
        sys.stderr.write(
            f"Found {len(predicted)} predicted prophage features\n")

    return predicted
def genbank_to_pandas(gbkf, mincontiglen, ignorepartials=True, convert_selenocysteine=False):
    """
    Convert the CDS features of a genbank file into a pandas data frame.

    This is a bit of a specific format used by phage_boost. It is a simple
    dataframe with the columns:
    ['contig', 'id', 'start', 'stop', 'direction', 'partial', 'DNAseq', 'AAseq', 'header']

    ``id`` is a running counter over the genes that are kept (across all
    contigs), ``header`` is ``<contig id>_<id>``, ``direction`` is the
    feature strand, and ``partial`` is 1 when the feature has a ``truncated``
    qualifier and 0 otherwise.

    Contigs shorter than ``mincontiglen`` and CDS whose DNA sequence or
    translation is empty are skipped, with a message written to stderr.

    :param gbkf: Genbank file to parse
    :param mincontiglen: minimum contig length
    :param ignorepartials: Skip any gene whose translation contains one of
        the codes B, Z, J, X or an internal stop codon (``*``)
    :param convert_selenocysteine: Replace U with C in the translation.
        PhageBoost crashes with a selenocysteine protein because it is not
        in Biopython
    :return: a pandas data frame
    """

    # Partial amino acid codes we should ignore. These are not present in
    # BioPython's SeqUtils::ProtParam and it crashes the system.
    paa = {'B', 'Z', 'J', 'X', '*'}

    c = 0
    genes = []
    for seq in genbank_seqio(gbkf):
        if len(seq) < mincontiglen:
            # Both halves must be f-strings, otherwise the second placeholder
            # is printed literally instead of the actual minimum length.
            sys.stderr.write(
                f"Skipped {seq.id} because it's length ({len(seq)}) is less than the "
                f"minimum contig length ({mincontiglen})\n")
            continue
        for feat in seq.features:
            if feat.type != 'CDS':
                continue

            partial = 0
            # I don't think this is exactly right
            if 'truncated' in feat.qualifiers:
                partial = 1

            # Extract once and reuse for both the DNA and the translation.
            extracted = feat.extract(seq)
            dnaseq = str(extracted.seq)
            if len(dnaseq) == 0:
                sys.stderr.write(
                    f"The DNA sequence for {feature_id(seq, feat)} was zero, so skipped\n"
                )
                continue

            # we just do a de novo translation rather than relying on the
            # translation provided in the genbank file that is often wrong
            trans = str(extracted.translate().seq).rstrip('*')

            if ignorepartials:
                # Trailing stops were stripped above, so a '*' found here is
                # an internal stop codon.
                offending = [aa for aa in paa if aa in trans]
                for aa in offending:
                    sys.stderr.write(
                        f"There is a {aa} in {feature_id(seq, feat)} so skipped.\n"
                    )
                if offending:
                    continue

            if len(trans) == 0:
                sys.stderr.write(
                    f"The translation for {feature_id(seq, feat)} was zero, so skipped.\n"
                )
                continue

            if convert_selenocysteine:
                trans = trans.replace('U', 'C')

            tid = seq.id + "_" + str(c)
            row = [
                seq.id, c, feat.location.start.position,
                feat.location.end.position, feat.strand, partial, dnaseq,
                trans, tid
            ]
            c += 1

            genes.append(row)

    gc = pd.DataFrame(genes,
                      columns=[
                          'contig', 'id', 'start', 'stop', 'direction',
                          'partial', 'DNAseq', 'AAseq', 'header'
                      ])
    return gc