def main(target, strain_lst):
    """
    Collect the score information stored in the per-strain score database
    for every row of the RefSeq gff, and write one table for all strains.

    target:     name of the data set (used to build input / output paths)
    strain_lst: list of strain names to process

    output: ../data/<target>/orf2score.csv (one row per gff row)
    """
    column_lst = [
        "orf_id", "Beg", "End", "Std", "Total", "CodPot", "StrtSc", "Codon",
        "RBSMot", "Spacer", "RBSScr", "UpsScr", "TypeScr", "GCCont"
    ]

    print("START: query about {} strains".format(len(strain_lst)))
    record_lst = []
    for strain in strain_lst:
        gffFilepath = "/data/mitsuki/data/denovo/{}/annotation/refseq/gff/{}.gff".format(
            target, strain)
        refseq_df = read_gff(gffFilepath, ["orf_id"])

        scoreDbFilepath = "/data/mitsuki/data/denovo/{}/annotation/prodigal/sup/{}.sq3".format(
            target, strain)
        sdc = ScoreDbController(scoreDbFilepath)

        foundCount = 0
        for _, row in refseq_df.iterrows():
            record = {"orf_id": row["orf_id"]}
            info_dct = sdc.info(row["seqname"], row["start"], row["end"])
            hasInfo = len(info_dct) > 0
            if hasInfo:
                foundCount += 1
            # an empty info_dct leaves the record with orf_id only
            record.update(info_dct)
            record_lst.append(record)
        print("\tDONE: found information for {}/{} genes in {}".format(
            foundCount, refseq_df.shape[0], strain))

    # fix the column order of the output table
    score_df = pd.DataFrame(record_lst)[column_lst]
    outFilepath = "../data/{}/orf2score.csv".format(target)
    score_df.to_csv(outFilepath, index=False)
    print("DONE: output {}".format(outFilepath))
def main(strain, clusterFilepath, inFilepath, outFilepath):
    """
    Append ";family=<comma separated families>" to the attribute of every
    gff row whose orf_id is listed in the cluster table, and write the
    result as a new gff.

    strain:          column name of the cluster table to look at
    clusterFilepath: tab separated cluster table with a "family" column
    inFilepath:      input gff (needs orf_id in its attribute)
    outFilepath:     output gff
    """
    cluster_df = pd.read_csv(clusterFilepath, delimiter="\t")
    present_df = cluster_df[cluster_df[strain].notnull()]

    # key: orf_id, val: list of families the orf belongs to
    orf2family = defaultdict(list)
    for family, joinedIds in zip(present_df["family"], present_df[strain]):
        for orfId in joinedIds.split(","):
            orf2family[orfId].append(family)

    # add family information to attribute
    gff_df = read_gff(inFilepath, additional_lst=["orf_id"])
    newAttribute_lst = []
    assignCount = 0
    for _, row in gff_df.iterrows():  #!!! assuming every row is CDS
        orfId = row["orf_id"]
        if orfId not in orf2family:
            # when this CDS is pseudogene, no sequence information in FASTA
            # so that no family information is given
            newAttribute_lst.append(row["attribute"])
            continue
        assignCount += 1
        familyText = ",".join(orf2family[orfId])
        newAttribute_lst.append("{};family={}".format(row["attribute"], familyText))

    # every orf_id should appear exactly once in gff, as orf_id is uniquely defined
    assert assignCount == len(orf2family)

    gff_df["attribute"] = newAttribute_lst
    write_gff(gff_df, outFilepath)
    print("DONE: output {}".format(outFilepath))
    print("\tassigned family to {}/{} CDS".format(assignCount, gff_df.shape[0]))
def main(target, strain_lst):
    """
    Gather the nucleotide (.fna) and protein (.faa) records of every strain
    by family, and output them per family through output_family2rec.

    target:     name of the data set (used to build the directory path)
    strain_lst: list of strain names to process
    """
    annotType = "refseq"
    direc = "/data/mitsuki/data/denovo/{}/annotation/{}".format(target, annotType)

    print("START: parse {} * 2 FASTA files".format(len(strain_lst)))
    family2fna, family2faa = defaultdict(list), defaultdict(list)
    for strain in strain_lst:
        gffFilepath = "{}/gff/{}.gff".format(direc, strain)
        fnaFilepath = "{}/fna/{}.fna".format(direc, strain)
        faaFilepath = "{}/faa/{}.faa".format(direc, strain)

        # record id in FASTA is looked up as orf_id of the gff
        gff_df = read_gff(gffFilepath, ["orf_id", "family"])
        id2family = {row["orf_id"]: row["family"] for _, row in gff_df.iterrows()}

        for fastaFilepath, family2rec in ((fnaFilepath, family2fna),
                                          (faaFilepath, family2faa)):
            for rec in SeqIO.parse(fastaFilepath, "fasta"):
                family2rec[id2family[rec.id]].append(rec)

    print("START: output fna & faa for every family")
    fnaDirec = "{}/family/fna".format(direc)
    faaDirec = "{}/family/faa".format(direc)
    for family2rec, outDirec, ext in ((family2fna, fnaDirec, "fna"),
                                      (family2faa, faaDirec, "faa")):
        os.makedirs(outDirec, exist_ok=True)
        output_family2rec(family2rec, outDirec, ext)
        print("\tDONE: output {} family in {}".format(len(family2rec), outDirec))
def main(inFilepath, outFilepath):
    """
    add orf_id to the attribute of .gff created by prodigal

    orf_id is "<seqname>_<text after the last underscore of ID>".
    """
    gff_df = read_gff(inFilepath, additional_lst=["ID"])

    def append_orf_id(row):
        # only the part after the last "_" of ID is used
        serial = row["ID"].rsplit("_", 1)[-1]
        orfId = "{}_{}".format(row["seqname"], serial)
        return "{};orf_id={}".format(row["attribute"], orfId)

    gff_df["attribute"] = [append_orf_id(row) for _, row in gff_df.iterrows()]
    write_gff(gff_df, outFilepath)
    print("DONE: output {}".format(outFilepath))
def refseq(inFilepath, outFilepath):
    """
    rewrite attribute of every gff row through format_attribute:
    Parent / ID / gene are passed as columns to delete, and
    ID="<family>(<orf_id>)" is passed as the column to add
    """
    refseq_df = read_gff(inFilepath, ["family", "orf_id"])

    def renew_attribute(row):
        uniqueId = "{}({})".format(row["family"], row["orf_id"])
        return format_attribute(
            row["attribute"],
            delCol_lst=["Parent", "ID", "gene"],
            addCol_dct={"ID": uniqueId})

    refseq_df["attribute"] = [renew_attribute(row) for _, row in refseq_df.iterrows()]
    write_gff(refseq_df, outFilepath)
def main(strain, hitFilepath, geneFilepath, overlapFilepath):
    """
    Build the overlap table between hits and annotated genes, and write it
    as csv indexed by overlap_id.

    strain:          strain name (accepted for interface compatibility,
                     not used inside this function)
    hitFilepath:     csv of hits, read with pandas
    geneFilepath:    gff of genes (needs orf_id and family in its attribute)
    overlapFilepath: output csv

    The overlap table itself is built by get_overlap_df, add_sbjct_pos and
    add_query_pos; this function only fixes the column order and writes it.
    """
    hit_df = pd.read_csv(hitFilepath)
    # read the gff given as argument (the original body referred to an
    # undefined name "gffFilepath" and never used geneFilepath)
    gff_df = read_gff(geneFilepath, ["orf_id", "family"])

    ovr_df = get_overlap_df(gff_df, hit_df)
    ovr_df = add_sbjct_pos(ovr_df, gff_df)
    ovr_df = add_query_pos(ovr_df, hit_df)

    # fix the column order of the output table
    column_lst = [
        "overlap_id", "region_id", "ostart", "oend", "olength", "chr_name",
        "qstrain", "sstrain", "qfamily", "sfamily", "qstrand", "sstrand",
        "qorf_id", "sorf_id", "qostart_dna", "qostart_pro", "qoend_dna",
        "qoend_pro", "sostart_dna", "sostart_pro", "soend_dna", "soend_pro",
        "qstart", "qend", "sstart", "send", "cstart", "cend", "qosp", "qoep"
    ]
    ovr_df = ovr_df[column_lst]
    ovr_df = ovr_df.set_index("overlap_id")
    ovr_df.to_csv(overlapFilepath)
    print("DONE: {} overlaps in {}".format(ovr_df.shape[0], overlapFilepath))
def get_orf2synteny(strain_lst, gffDirec):
    """
    key: orf_id
    val: dictionary of ["neighbor", "left", "right"]

    neighbor: families of up to 3 rows before and up to 3 rows after the orf,
              in the order sorted by (start, end) on the same seqname
    left / right: family of the adjacent row; for "-" strand the previous row
              is "right" and the next row is "left". "" when there is no
              adjacent row.

    Rows without family information are dropped before looking at neighbors.
    """
    orf2synteny = {}
    for strain in strain_lst:
        gffFilepath = "{}/{}.gff".format(gffDirec, strain)
        gff_df = read_gff(gffFilepath, ["orf_id", "family"])
        # drop rows for pseudogenes which does not have family information
        gff_df = gff_df[gff_df["family"].notnull()]

        for seqname in set(gff_df["seqname"]):
            sorted_df = (gff_df[gff_df["seqname"] == seqname]
                         .copy()
                         .sort_values(by=["start", "end"])
                         .reset_index(drop=True))
            family_lst = list(sorted_df["family"])

            # after reset_index, idx is the position of the row in family_lst
            for idx, row in sorted_df.iterrows():
                before_lst = family_lst[max(0, idx - 3):idx]
                after_lst = family_lst[idx + 1:idx + 4]
                strand = row["strand"]

                left, right = "", ""
                if strand == "+":
                    if before_lst:
                        left = before_lst[-1]
                    if after_lst:
                        right = after_lst[0]
                elif strand == "-":
                    if before_lst:
                        right = before_lst[-1]
                    if after_lst:
                        left = after_lst[0]
                else:
                    # reported once per existing side, left / right stay ""
                    for flank_lst in (before_lst, after_lst):
                        if flank_lst:
                            print("ERROR: unknown strand {}".format(row["strand"]))

                orf2synteny[row["orf_id"]] = {
                    "neighbor": before_lst + after_lst,
                    "left": left,
                    "right": right,
                }
    return orf2synteny
def main():
    """
    Write one line per gene and one line per merged exon of the genes on
    options.chrom.

    Each line is
        gene_id gene_name chrom_name number start end strand
    where number is a 1-based counter: over genes in the gene file, and over
    exons within a gene in the exon file.

    Both output files are opened in a with-statement so that they are closed
    even if reading the gff or writing a line raises.
    """
    options = parse_options()

    with open(options.exon_output_filename, "w") as exon_out_f, \
            open(options.gene_output_filename, "w") as gene_out_f:
        util.info.write_info(exon_out_f, options)
        util.info.write_info(gene_out_f, options)

        chrom_dict = chromosome.get_chromosome_dict(options.chrom_file)

        # only the per-chromosome gene dictionary is used below
        gene_dict, tr_dict, gene_chrom_dict, tr_chrom_dict = \
            gff.read_gff(options.gff, chrom_dict,
                         region_chrom=options.chrom,
                         region_start=options.start,
                         region_end=options.end)

        for gene_num, gene in enumerate(gene_chrom_dict[options.chrom], start=1):
            exons = gene.get_merged_exons()

            gene_out_f.write("%s %s %s %d %d %d %d\n" %
                             (gene.gene_id, gene.gene_name, gene.chrom.name,
                              gene_num, gene.start, gene.end, gene.strand))

            # exons of strand -1 genes are numbered in reversed order
            # (presumably so that numbering follows transcription direction)
            if gene.strand == -1:
                exons = exons[::-1]

            for exon_num, ex in enumerate(exons, start=1):
                exon_out_f.write("%s %s %s %d %d %d %d\n" %
                                 (gene.gene_id, gene.gene_name, gene.chrom.name,
                                  exon_num, ex.start, ex.end, gene.strand))
def filter_gff(inFilepath, outFilepath):
    """
    Copy a gff, dropping every row whose seqname is "train".

    inFilepath:  input gff
    outFilepath: output gff
    """
    gff_df = read_gff(inFilepath)
    filtered_df = gff_df[gff_df["seqname"] != "train"]
    # write_gff takes (dataframe, filepath), as in every other caller;
    # the original call passed them in the opposite order
    write_gff(filtered_df, outFilepath)
def calc_stat(target, strain):
    """
    Sum up, over every sequence of the strain, the length covered by genic /
    pseudogene / remaining regions and by hits, and their intersections.

    target: name of the data set (used to build input paths)
    strain: strain name

    return: dictionary with keys
        length, genic_sum, pseudo_sum, inter_sum, hit_sum, hit_justsum,
        genic_hit_sum, pseudo_hit_sum, inter_hit_sum

    Intervals are built as (start - 1, end) from the gff and hit tables
    (presumably 1-based inclusive -> 0-based half-open; confirm with Interval).
    """
    seqFilepath = "/data/mitsuki/data/denovo/{}/dnaseq/{}.dnaseq".format(target, strain)
    rec_lst = list(SeqIO.parse(seqFilepath, "fasta"))

    hitFilepath = "../blastn/result/{}/{}.csv".format(target, strain)
    hit_df = pd.read_csv(hitFilepath)

    gffFilepath = "/data/mitsuki/data/denovo/{}/annotation/refseq/gff/{}.gff".format(target, strain)
    gff_df = read_gff(gffFilepath, ["pseudo"])
    # a row is pseudogene when it has any value in pseudo column
    if "pseudo" in gff_df.columns:
        gff_df["pseudo_b"] = gff_df["pseudo"].notnull()
    else:
        gff_df["pseudo_b"] = False

    stat = {
        "length": 0,
        "genic_sum": 0,
        "pseudo_sum": 0,
        "inter_sum": 0,
        "hit_sum": 0,
        "hit_justsum": 0,
        "genic_hit_sum": 0,
        "pseudo_hit_sum": 0,
        "inter_hit_sum": 0
    }

    for rec in rec_lst:
        seqname = rec.id

        # define genic_lst, pseudo_lst, inter_lst according to gff_df
        genic_lst, pseudo_lst = [], []
        for _, row in gff_df[gff_df["seqname"] == seqname].iterrows():
            bucket_lst = pseudo_lst if row["pseudo_b"] else genic_lst
            bucket_lst.append(Interval(row["start"] - 1, row["end"]))
        inter_lst = interval_not(genic_lst + pseudo_lst, len(rec))

        # define hit_lst according to hit_df
        hit_lst = []
        for _, row in hit_df[hit_df["sseqid"] == seqname].iterrows():
            # sstart / send are swapped unless hit_strand is 1
            if row["hit_strand"] == 1:
                first, last = row["sstart"], row["send"]
            else:
                first, last = row["send"], row["sstart"]
            hit_lst.append(Interval(first - 1, last))

        stat["length"] += len(rec)
        stat["genic_sum"] += interval_sum(genic_lst)
        stat["pseudo_sum"] += interval_sum(pseudo_lst)
        stat["inter_sum"] += interval_sum(inter_lst)
        stat["hit_sum"] += interval_sum(hit_lst)
        stat["hit_justsum"] += interval_justsum(hit_lst)
        stat["genic_hit_sum"] += interval_and(genic_lst, hit_lst)
        stat["pseudo_hit_sum"] += interval_and(pseudo_lst, hit_lst)
        stat["inter_hit_sum"] += interval_and(inter_lst, hit_lst)

    return stat
def main(strain, seedFilepath, gffFilepath):
    """
    Simulate 4 genomes from a seed sequence: rows of the gff are evolved
    with pyvolve along the tree in tmp.tree, and everything between them is
    copied unchanged.

    strain:       name used as prefix of the output genome ids
    seedFilepath: FASTA file; only its first record is used as seed
    gffFilepath:  gff whose rows define the regions to evolve

    output: ../data/dnaseq/<strain>_sim<1-4>.dnaseq
    """
    # take the first record only
    for record in SeqIO.parse(seedFilepath, "fasta"):
        seedRec = record
        break
    gff_df = read_gff(gffFilepath)

    #get all the shuffle region
    # pos_lst holds (category, start, end, strand) covering the seed from 0 to
    # len(seedRec): "c" for a gff row, "nc" for the gap before / after it.
    # NOTE(review): assumes gff rows are sorted by position and do not
    # overlap; otherwise the gaps are not well defined -- confirm
    prv = 0
    pos_lst = []
    for _, row in gff_df.iterrows():
        pos_lst.append(("nc", prv, row["start"] - 1, "+"))
        pos_lst.append(("c", row["start"] - 1, row["end"], row["strand"]))
        prv = row["end"]
    pos_lst.append(("nc", prv, len(seedRec), "+"))

    # configuration for evolution
    treeFilepath = "tmp.tree"
    mytree = pyvolve.read_tree(file=treeFilepath)
    ncm = pyvolve.Model("nucleotide")  # non-coding model (only referred from the commented-out code below)
    cm = pyvolve.Model("ECMrest")  # coding model

    outputSeq_lst = [Seq("") for _ in range(4)]  # assuming tree has 4 nodes
    for pos in pos_lst:
        category, start, end, strand = pos

        # get rootSeq according to start, end, strand info
        rootSeq = seedRec.seq[start:end]
        if strand == "-":
            rootSeq = rootSeq.reverse_complement()
        rootSeq = str(rootSeq)

        # get simulated sequences
        if category == "nc":
            # partition = pyvolve.Partition(models = ncm, root_sequence = rootSeq)
            # evolver = pyvolve.Evolver(partition = partition, tree = mytree)
            # rec_lst = get_evolved(evolver)
            # "nc" region is not evolved: the same sequence goes to all 4 outputs
            rec_lst = [SeqRecord(Seq(rootSeq)) for _ in range(4)]
        elif category == "c":
            partition = pyvolve.Partition(
                models=cm, root_sequence=rootSeq[3:-3])  #remove start & stop codon
            evolver = pyvolve.Evolver(partition=partition, tree=mytree)
            rec_lst = get_evolved(evolver)
            for rec in rec_lst:
                # put the first 3 and the last 3 bases of the seed back
                rec.seq = rootSeq[:3] + rec.seq + rootSeq[-3:]
        assert len(rec_lst) == len(outputSeq_lst)

        # concat to outputSeq_lst
        for i, rec in enumerate(rec_lst):
            simSeq = rec.seq
            # "-" region was reverse complemented before simulation,
            # so turn it back to the orientation of the seed
            if strand == "-":
                simSeq = simSeq.reverse_complement()
            outputSeq_lst[i] += simSeq

    # one FASTA file with a single record per simulated genome
    for i, outputSeq in enumerate(outputSeq_lst):
        genomeId = "{}_sim{}".format(strain, i + 1)
        outFilepath = "../data/dnaseq/{}.dnaseq".format(genomeId)
        with open(outFilepath, "w") as f:
            seqname = "{}:seq".format(genomeId)
            rec = SeqRecord(outputSeq, id=seqname, description="")
            SeqIO.write(rec, f, "fasta")
        print("DONE: output {}".format(outFilepath))
import sys
import os
import pandas as pd

sys.path.append("../helper")
from gff import read_gff
from myio import *

# Check that the cluster table and the gff files of <target> are ready.
# usage: <this script> <target>
# exit status: 0 = ok
#              1 = cluster table lacks a required column
#              2 = a gff raised KeyError when read with "family"

target = sys.argv[1]
annotType = "refseq"
strain_lst = get_strain_lst(target)

# check cluster.tsv
cluster_df = get_cluster_df(target)
for col in ["family", "lineage", "size"] + strain_lst:
    if col not in cluster_df.columns:
        # the path of the table is not known here (get_cluster_df hides it),
        # so the table is identified by target; the original message used an
        # undefined name and died with NameError instead of reporting
        print("ERROR: cluster table of {} does not have column {}".format(target, col),
              file=sys.stderr)
        sys.exit(1)

# check gff
for strain in strain_lst:
    gffFilepath = "/data/mitsuki/data/denovo/{}/annotation/{}/gff/{}.gff".format(
        target, annotType, strain)
    try:
        read_gff(gffFilepath, ["family"])
    except KeyError:
        print("ERROR: {} does not have family information".format(gffFilepath),
              file=sys.stderr)
        sys.exit(2)

sys.exit(0)
#!/usr/bin/env python3
# Check that the input files of every strain of <target> are ready.
# usage: <this script> <target>
# exit status: 0 = ok
#              1 = dnaseq / fna / faa file is missing
#              2 = gff has no orf_id column after read_gff
import sys
import os

sys.path.append("../helper")
from myio import get_strain_lst
from gff import read_gff

target = sys.argv[1]
strain_lst = get_strain_lst(target, full=True)
direc = "/data/mitsuki/data/denovo/{}".format(target)

for strain in strain_lst:
    # files which only need to exist
    required_lst = [
        "{}/dnaseq/{}.dnaseq".format(direc, strain),
        "{}/annotation/refseq/fna/{}.fna".format(direc, strain),
        "{}/annotation/refseq/faa/{}.faa".format(direc, strain),
    ]
    for fp in required_lst:
        if os.path.isfile(fp):
            continue
        print("ERROR: {} not found".format(fp), file=sys.stderr)
        exit(1)

    # gff needs to be readable and to carry orf_id
    gfffp = "{}/annotation/refseq/gff/{}.gff".format(direc, strain)
    gff_df = read_gff(gfffp, ["orf_id"])
    if "orf_id" not in gff_df.columns:
        print("ERROR: orf_id not found in {}".format(gfffp), file=sys.stderr)
        exit(2)

exit(0)