Exemple #1
0
def save_genbank_m4(gb_m3, gb_m4):
    """Write the reduced genome (modified-IV GenBank file).

    The sequence of the output record is what remains of ``gb_m3`` once
    every region flagged by ``is_deletion`` has been removed. Features that
    lie inside a retained region are carried over with their coordinates
    shifted to match the shorter sequence.

    Note that the feature objects of ``gb_m3`` are re-used: their
    ``location`` attribute is overwritten in place.

    Args:
        gb_m3 (Bio.SeqRecord.SeqRecord): modified-III GenBank annotation.
        gb_m4 (str): modified-IV GenBank file path.
    """
    deleted = CompoundLocation(
        [feat.location for feat in gb_m3.features if is_deletion(feat)])
    kept = complementary_compoundloc(0, len(gb_m3), deleted)
    reduced = SeqRecord(seq=kept.extract(gb_m3.seq),
                        id=gb_m3.id,
                        name=gb_m3.name,
                        description=gb_m3.description,
                        dbxrefs=gb_m3.dbxrefs,
                        annotations=gb_m3.annotations)

    # Walk the retained regions in order, moving each one (and the
    # features it contains) so that it starts where the previous one ended.
    prev_end = 0
    for region in kept.parts:
        shift = region.start - prev_end
        for feat in gb_m3.features:
            loc = feat.location
            # Keep only features whose start and end both fall in the region
            if loc.start not in region or loc.end not in region:
                continue
            feat.location = loc + (-shift)
            reduced.features.append(feat)
        # NOTE(review): the "+ 2" presumably compensates for how
        # complementary_compoundloc trims region boundaries - confirm
        # against that helper.
        prev_end = region.end - shift + 2
    SeqIO.write(reduced, gb_m4, "genbank")
Exemple #2
0
def best_deletion_order(gb_m3, ori, ter, log):
    """Calculate optimal deletion order for minimising replichore
    imbalance at each step.

    The order is built greedily: at every step the remaining deletion whose
    contribution brings the running imbalance closest to zero is chosen.
    The initial imbalance and the resulting order are appended to the
    report file; nothing is returned.

    Args:
        gb_m3 (Bio.SeqRecord.SeqRecord): modified-III GenBank annotation.
        ori (int): position of origin of replication.
        ter (int): position of terminus of replication.
        log (str): report file path (opened in append mode).
    """
    genome_len = len(gb_m3)
    # The 0-180º replichore is the arc running from ori forwards to ter.
    # It wraps past the end of the sequence when ter does not lie after ori.
    wraps = not ter > ori
    if wraps:
        len_1 = genome_len - (ori - ter)
    else:
        len_1 = ter - ori
    # Replichore imbalance = len(genome)/2 - len(0-180º replichore)
    imbalance = genome_len / 2 - len_1

    # Calculate each deletion's contribution to the imbalance
    deletions = []
    for feature in gb_m3.features:
        if not is_deletion(feature):
            continue
        name = feature.qualifiers["note"][0].split()[-1]
        midpoint = (feature.location.end + feature.location.start) / 2
        # Membership in the 0-180º replichore must follow the same two
        # cases used for len_1: a plain interval when ori < ter, and the
        # union of both sequence ends when the arc wraps.
        if wraps:
            in_first = midpoint > ori or midpoint < ter
        else:
            in_first = ori < midpoint < ter
        # NOTE(review): by the definition above, removing d bases from one
        # replichore changes the imbalance by d/2, whereas the full
        # len(feature) is applied here - confirm this is intended.
        contrib = len(feature) if in_first else -len(feature)
        deletions.append((name, contrib))

    # Best next deletion is the one that minimises the imbalance
    with open(log, "a") as f:
        f.write("Initial replichore imbalance: %d\n" % imbalance)
        f.write("Best deletion order for minimising imbalance at each step:\n")
        n = 0
        while deletions:
            n += 1
            best = deletions.pop(
                np.argmin(
                    [abs(imbalance + deletion[1]) for deletion in deletions]))
            imbalance += best[1]
            f.write("  %d. %s -> imbalance = %d\n" % (n, best[0], imbalance))
        f.write(sep)
Exemple #3
0
def get_stats(gb_m3):
    """Calculate essentiality prediction and deletion statistics.

    Args:
        gb_m3 (Bio.SeqRecord.SeqRecord): modified-III GenBank annotation.
    Returns:
        stats (dict): results, with keys:
            org, size: organism annotation and sequence length.
            l, e: values parsed from the 2nd and 3rd "note" entries of
                the deletion features (text after the last "=").
            del_n, del_kb, del_perc: number of deletion features, their
                summed length in kb and as a percentage of the sequence.
            del_genes_*: non-essential CDS with a locus tag whose start
                or end lies in a deletion (pseudo / hypothetical / other).
            ess_genes_*: essential RNA features and essential CDS with a
                locus tag (hypothetical / other).
            ness_genes_*: non-essential CDS with a locus tag
                (pseudo / hypothetical / other).
    """
    def is_hypothetical(feature):
        # True when the first "product" qualifier mentions "hypothetical"
        qualifiers = feature.qualifiers
        return ("product" in qualifiers
                and "hypothetical" in qualifiers["product"][0])

    l = 0
    e = 0
    # Deleted regions are kept as a plain list of locations: they are only
    # used for membership tests, and CompoundLocation cannot be built from
    # fewer than two parts (it raises ValueError for a single deletion).
    deletions = []
    for feature in gb_m3.features:
        if is_deletion(feature):
            deletions.append(feature.location)
            if not (l and e):
                l = int(feature.qualifiers["note"][1].split("=")[-1])
                e = float(feature.qualifiers["note"][2].split("=")[-1])

    genome_len = len(gb_m3)
    stats = {
        "org": gb_m3.annotations["organism"],
        "size": genome_len,
        "l": l,
        "e": e,
        "del_n": 0,
        "del_kb": 0,
        "del_perc": 0,
        "del_genes_pseudo": 0,
        "del_genes_hypot": 0,
        "del_genes_nonhypot": 0,
        "ess_genes_rna": 0,
        "ess_genes_hypot": 0,
        "ess_genes_nonhypot": 0,
        "ness_genes_pseudo": 0,
        "ness_genes_hypot": 0,
        "ness_genes_nonhypot": 0
    }
    for feature in gb_m3.features:
        is_gene = (feature.type == "CDS"
                   and "locus_tag" in feature.qualifiers)
        if is_essential(feature, e):  # essential
            if feature.type in ("tRNA", "rRNA", "tmRNA", "ncRNA"):
                stats["ess_genes_rna"] += 1
            elif is_gene:
                if is_hypothetical(feature):
                    stats["ess_genes_hypot"] += 1
                else:
                    stats["ess_genes_nonhypot"] += 1
        elif is_gene:  # non-essential
            if "pseudo" in feature.qualifiers:
                category = "pseudo"
            elif is_hypothetical(feature):
                category = "hypot"
            else:
                category = "nonhypot"
            stats["ness_genes_" + category] += 1
            # A gene counts as deleted when either of its ends falls
            # inside any deleted region.
            ends = (feature.location.start, feature.location.end)
            if any(pos in region for region in deletions for pos in ends):
                stats["del_genes_" + category] += 1
        elif is_deletion(feature):
            stats["del_n"] += 1
            stats["del_kb"] += len(feature) / 1000
            stats["del_perc"] += len(feature) / genome_len * 100
    return stats
Exemple #4
0
    os.makedirs(OUT_DIR, exist_ok=True)
    genbank_id = os.path.splitext(os.path.basename(GENBANK_M3))[0]
    GENBANK_M4 = os.path.join(OUT_DIR, genbank_id + ".gbm4")
    OUT_IMG = os.path.join(OUT_DIR, "genome_reduction")
    REPORT_LOG = os.path.join(OUT_DIR, "reduction_stats.txt")
    ORI = args.ORI
    TER = args.TER

    # Check input
    try:
        genbank_m3 = SeqIO.read(GENBANK_M3, "genbank")
    except (FileNotFoundError, ValueError):
        raise SystemExit("\n\terror: could not read .gbm3 file\n")
    contains_dels = False
    for feature in genbank_m3.features:
        if is_deletion(feature):
            contains_dels = True
            break
    if not contains_dels:
        raise SystemExit("\n\terror: invalid GenBank file (must be .gbm3)\n")
    if ORI and TER:
        if not all(
            [coord in range(1,
                            len(genbank_m3) + 1) for coord in (ORI, TER)]):
            raise SystemExit("\n\terror: invalid ori/ter coordinates\n")

    # Get ori + ter coordinates
    if not (ORI and TER):
        ORI, TER = find_ori_ter(genbank_m3)

    # Draw circular genome plot