def save_genbank_m4(gb_m3, gb_m4):
    """Generate modified-IV GenBank file (reduced version of the genome).

    The locations of all deletion features are removed from the sequence and
    every feature lying entirely inside a retained region is re-anchored to
    the shortened coordinates. The input record is not modified: shifted
    features are written to shallow copies.

    Args:
        gb_m3 (Bio.SeqRecord.SeqRecord): modified-III GenBank annotation.
        gb_m4 (str): modified-IV GenBank file path.
    """
    import copy  # local import: only this function needs it

    # NOTE(review): Biopython's CompoundLocation rejects fewer than two
    # parts, so a record holding a single deletion fails here -- confirm
    # whether complementary_compoundloc can take a plain location instead.
    deletions = CompoundLocation(
        [feature.location
         for feature in gb_m3.features
         if is_deletion(feature)])
    non_deletions = complementary_compoundloc(0, len(gb_m3), deletions)
    reduced_annot = SeqRecord(seq=non_deletions.extract(gb_m3.seq),
                              id=gb_m3.id,
                              name=gb_m3.name,
                              description=gb_m3.description,
                              dbxrefs=gb_m3.dbxrefs,
                              annotations=gb_m3.annotations)
    # Shift features' positions according to deletions
    end = 0
    for nondel in non_deletions.parts:
        # Distance this retained region moves towards the sequence start.
        offset = nondel.start - end
        for feature in gb_m3.features:
            if (feature.location.start in nondel
                    and feature.location.end in nondel):
                # Work on a copy so the caller's gb_m3 keeps its original
                # coordinates.
                shifted = copy.copy(feature)
                shifted.location = feature.location + (-offset)
                reduced_annot.features.append(shifted)
        # NOTE(review): the "+ 2" presumably compensates for how
        # complementary_compoundloc trims region boundaries -- confirm.
        end = nondel.end - offset + 2
    SeqIO.write(reduced_annot, gb_m4, "genbank")
def best_deletion_order(gb_m3, ori, ter, log):
    """Calculate optimal deletion order for minimising replichore imbalance
    at each step.

    The order is built greedily: at every step the remaining deletion whose
    contribution brings the running imbalance closest to zero is chosen.
    The result is appended to the report file.

    Args:
        gb_m3 (Bio.SeqRecord.SeqRecord): modified-III GenBank annotation.
        ori (int): position of origin of replication.
        ter (int): position of terminus of replication.
        log (str): report file path.
    """
    genome_len = len(gb_m3)
    # When ter <= ori the 0-180º replichore runs past the end of the
    # sequence and wraps around to its start.
    wraps = not ter > ori
    # Replichore imbalance = len(genome)/2 - len(0-180º replichore)
    if wraps:
        len_1 = genome_len - (ori - ter)
    else:
        len_1 = ter - ori
    imbalance = genome_len / 2 - len_1

    # Calculate each deletion's contribution to the imbalance
    deletions = []
    for feature in gb_m3.features:
        if not is_deletion(feature):
            continue
        name = feature.qualifiers["note"][0].split()[-1]
        midpoint = (feature.location.end + feature.location.start) / 2
        # The membership test must follow the same ori/ter layout used for
        # len_1: a plain "or" is true for every position when ter > ori.
        if wraps:
            in_first = midpoint > ori or midpoint < ter
        else:
            in_first = ori < midpoint < ter
        # NOTE(review): with the definition above, removing n bp shifts the
        # imbalance by n/2 rather than n; the full length is kept so the
        # reported figures stay comparable -- confirm the intended metric.
        if in_first:  # 0-180º replichore
            contrib = len(feature)
        else:  # 180-360º replichore
            contrib = -len(feature)
        deletions.append((name, contrib))

    # Best next deletion is the one that minimises the imbalance
    with open(log, "a") as f:
        f.write("Initial replichore imbalance: %d\n" % imbalance)
        f.write("Best deletion order for minimising imbalance at each step:\n")
        n = 0
        while deletions:
            n += 1
            best = deletions.pop(
                np.argmin(
                    [abs(imbalance + deletion[1]) for deletion in deletions]))
            imbalance += best[1]
            f.write(" %d. %s -> imbalance = %d\n" % (n, best[0], imbalance))
        f.write(sep)
def _is_hypothetical(feature):
    """Tell whether the feature's first product qualifier mentions
    'hypothetical'."""
    return ("product" in feature.qualifiers
            and "hypothetical" in feature.qualifiers["product"][0])


def get_stats(gb_m3):
    """Calculate essentiality prediction and deletion statistics.

    Args:
        gb_m3 (Bio.SeqRecord.SeqRecord): modified-III GenBank annotation.

    Returns:
        stats (dict): results. Holds the organism name, the record length,
            the "l" and "e" values parsed from the deletion notes, the
            number/size/percentage of deletions, and gene counts split into
            essential ("ess_"), non-essential ("ness_") and deleted ("del_")
            classes.
    """
    param_l = 0
    param_e = 0
    deletion_locs = []
    for feature in gb_m3.features:
        if is_deletion(feature):
            deletion_locs.append(feature.location)
            # The values are read from the second and third note of a
            # deletion feature, until both are non-zero.
            if not (param_l and param_e):
                notes = feature.qualifiers["note"]
                param_l = int(notes[1].split("=")[-1])
                param_e = float(notes[2].split("=")[-1])

    def in_deletion(position):
        # Same membership test CompoundLocation performs, but also valid for
        # zero or one deletion (CompoundLocation needs at least two parts).
        return any(position in loc for loc in deletion_locs)

    stats = {
        "org": gb_m3.annotations["organism"],
        "size": len(gb_m3),
        "l": param_l,
        "e": param_e,
        "del_n": 0,
        "del_kb": 0,
        "del_perc": 0,
        "del_genes_pseudo": 0,
        "del_genes_hypot": 0,
        "del_genes_nonhypot": 0,
        "ess_genes_rna": 0,
        "ess_genes_hypot": 0,
        "ess_genes_nonhypot": 0,
        "ness_genes_pseudo": 0,
        "ness_genes_hypot": 0,
        "ness_genes_nonhypot": 0
    }
    for feature in gb_m3.features:
        is_tagged_cds = (feature.type == "CDS"
                         and "locus_tag" in feature.qualifiers)
        if is_essential(feature, param_e):
            # essential
            if feature.type in ("tRNA", "rRNA", "tmRNA", "ncRNA"):
                stats["ess_genes_rna"] += 1
            elif is_tagged_cds:
                if _is_hypothetical(feature):
                    stats["ess_genes_hypot"] += 1
                else:
                    stats["ess_genes_nonhypot"] += 1
        elif is_tagged_cds:
            # non-essential
            if "pseudo" in feature.qualifiers:
                gene_class = "pseudo"
            elif _is_hypothetical(feature):
                gene_class = "hypot"
            else:
                gene_class = "nonhypot"
            stats["ness_genes_" + gene_class] += 1
            if (in_deletion(feature.location.start)
                    or in_deletion(feature.location.end)):
                # in a deletion
                stats["del_genes_" + gene_class] += 1
        elif is_deletion(feature):
            stats["del_n"] += 1
            stats["del_kb"] += len(feature) / 1000
            stats["del_perc"] += len(feature) / len(gb_m3) * 100
    return stats
# Output locations, all derived from the output directory and the input name
os.makedirs(OUT_DIR, exist_ok=True)
genbank_id = os.path.splitext(os.path.basename(GENBANK_M3))[0]
GENBANK_M4 = os.path.join(OUT_DIR, genbank_id + ".gbm4")
OUT_IMG = os.path.join(OUT_DIR, "genome_reduction")
REPORT_LOG = os.path.join(OUT_DIR, "reduction_stats.txt")
ORI, TER = args.ORI, args.TER

# Check input: the file must parse as a single GenBank record...
try:
    genbank_m3 = SeqIO.read(GENBANK_M3, "genbank")
except (FileNotFoundError, ValueError):
    raise SystemExit("\n\terror: could not read .gbm3 file\n")
# ...and must carry at least one deletion feature.
contains_dels = any(is_deletion(feature) for feature in genbank_m3.features)
if not contains_dels:
    raise SystemExit("\n\terror: invalid GenBank file (must be .gbm3)\n")

if ORI and TER:
    # Both coordinates supplied: each must lie within 1..len(record).
    if not all(coord in range(1, len(genbank_m3) + 1)
               for coord in (ORI, TER)):
        raise SystemExit("\n\terror: invalid ori/ter coordinates\n")
else:
    # Get ori + ter coordinates
    ORI, TER = find_ori_ter(genbank_m3)
# Draw circular genome plot