def OnSeq(Label, Seq):
    """Per-sequence callback: emit one reference sequence and its taxonomy.

    The accession from Label is upper-cased and prefixed with "REF_" to form
    the output label. The sequence is written to the module-level FASTA
    handle fFa, and a "<label>\\t<taxonomy>" line is written to fTax.

    The taxonomy string is assembled with AppendRank from the lowest rank
    upwards (species, genus, family, order, class, phylum, domain/kingdom).
    The species rank is included only when the label's taxonomy contains
    ",s:" after its first character.

    Label -- FASTA label of the sequence.
    Seq   -- sequence passed through unchanged to fasta.WriteSeq.
    """
    Accession = fasta.GetAccFromLabel(Label)
    TaxStr = utax2.GetTaxFromLabel(Label)

    # Top rank letter: 'd' when the taxonomy contains "d:", otherwise 'k'.
    TopRank = 'd' if TaxStr.find("d:") >= 0 else 'k'

    # Example source record:
    # NR_117221.1 species:Mycobacterium arosiense;genus:Mycobacterium;family:Mycobacteriaceae;order:Corynebacteriales;class:Actinobacteria;phylum:Actinobacteria;superkingdom:Bacteria;
    RankOrder = ['g', 'f', 'o', 'c', 'p', TopRank]
    if TaxStr.find(",s:") > 0:
        RankOrder.insert(0, 's')

    Tax = ""
    for RankLetter in RankOrder:
        Tax = AppendRank(Tax, TaxStr, RankLetter)

    NewLabel = "REF_" + Accession.upper()
    fasta.WriteSeq(fFa, Seq, NewLabel)
    print >> fTax, "%s\t%s" % (NewLabel, Tax)
def FixQueryLabel(Label):
    """Return Label rewritten as "<acc>;tax=<r>:<name>,<r>:<name>,...;".

    The taxonomy taken from Label is split on ','. For each field the first
    character is kept as the rank letter, the second character (whatever
    separator it is) is replaced by ':', and the remainder is kept as-is.

    Label -- label from which fasta extracts the accession and taxonomy.
    """
    Acc = fasta.GetAccFromLabel(Label)
    Tax = fasta.GetTaxFromLabel(Label)
    Prefix = Acc + ";tax="
    # Field[1] is dropped on purpose: it is the original separator character.
    Fixed = [Field[0] + ":" + Field[2:] for Field in Tax.split(',')]
    return Prefix + ",".join(Fixed) + ";"
def OnSeq(Label, Seq):
    """Per-sequence callback: record the taxonomy of one labelled sequence.

    Stores the label's taxonomy in the global AccToTax dict, keyed by the
    upper-cased accession with a single leading "REF_" prefix removed.
    While the global DomainRank is still the placeholder '?', it is set to
    the first character of this sequence's taxonomy string.

    Label -- FASTA label of the sequence.
    Seq   -- unused; present because the callback receives (Label, Seq).
    """
    global AccToTax
    global DomainRank
    Acc = fasta.GetAccFromLabel(Label)
    Tax = fasta.GetTaxFromLabel(Label)
    Acc = Acc.upper()
    # Strip only the leading prefix. str.replace() would also delete any
    # later "REF_" inside the accession (e.g. "REF_XREF_1" -> "X1") and so
    # corrupt the lookup key.
    Prefix = "REF_"
    if Acc.startswith(Prefix):
        Acc = Acc[len(Prefix):]
    AccToTax[Acc] = Tax
    if DomainRank == '?':
        DomainRank = Tax[0]
def OnSeq(Label, Seq):
    """Per-sequence callback: record taxonomy and fix up the rank letters.

    Stores the label's taxonomy in the global AccToTax dict under the
    upper-cased accession. On the first call where the global DomainRank is
    still '?', DomainRank becomes the first character of the taxonomy
    string and that character replaces the first letter of the global Ranks
    string.

    Label -- FASTA label of the sequence.
    Seq   -- unused; present because the callback receives (Label, Seq).
    """
    global AccToTax, DomainRank, Ranks

    Accession = fasta.GetAccFromLabel(Label)
    Lineage = fasta.GetTaxFromLabel(Label)
    AccToTax[Accession.upper()] = Lineage

    # Nothing more to do once the top-level rank letter is known.
    if DomainRank != '?':
        return

    DomainRank = Lineage[0]
    Ranks = DomainRank + Ranks[1:]
    # Only the first letter was replaced; phylum must still be second.
    assert Ranks[1] == 'p'
def OnSeq(Label, Seq):
    """Per-sequence callback: emit one sequence and its taxonomy line.

    The upper-cased accession from Label is used as the output label. The
    sequence is written to the module-level FASTA handle fFa, and a
    "<acc>\\t<taxonomy>" line is written to fTax.

    The taxonomy string is assembled with AppendRank from the highest rank
    downwards (domain/kingdom, phylum, class, order, family, genus,
    species).

    Label -- FASTA label of the sequence.
    Seq   -- sequence passed through unchanged to fasta.WriteSeq.
    """
    Accession = fasta.GetAccFromLabel(Label)
    TaxStr = utax2.GetTaxFromLabel(Label)

    # Top rank letter: 'd' when the taxonomy contains "d:", otherwise 'k'.
    TopRank = 'd' if TaxStr.find("d:") >= 0 else 'k'

    # Example source record:
    # DQ200983.1.1404.B Bacteria;Actinobacteria;Actinobacteria;Frankiales;Geodermatophilaceae;Blastococcus;Blastococcus jejuensis;
    Tax = ""
    for RankLetter in (TopRank, 'p', 'c', 'o', 'f', 'g', 's'):
        Tax = AppendRank(Tax, TaxStr, RankLetter)

    Accession = Accession.upper()
    fasta.WriteSeq(fFa, Seq, Accession)
    print >> fTax, "%s\t%s" % (Accession, Tax)
def DoPredFile(FileName, NameToCountA, NameToCountB):
    """Tally prediction outcomes from a two-column, tab-separated file.

    Each line is "<query label>\\t<prediction>". The query's name at the
    module-level Rank is compared with the predicted name at that rank, and
    the query is "known" when sortdict.GetCount(NameToCountB, name) > 0.

    Outcome per line:
      TP -- predicted name equals the query name
      TN -- no predicted name and the query name has count 0
      FN -- no predicted name and the query name has count > 0
      OC -- a different name predicted and the query name has count 0
      MC -- a different name predicted and the query name has count > 0
    NC counts the lines that have a non-empty predicted name.

    When the module-level REPORT flag is true, one tab-separated line per
    query (accession, outcome, known/novel, query name, predicted name or
    "-", count) is printed to stdout.

    FileName     -- path of the prediction file.
    NameToCountA -- not used here; kept so existing callers keep working.
    NameToCountB -- name-to-count mapping queried via sortdict.GetCount.

    Returns the tuple (Known, Novel, TP, TN, FN, OC, MC, NC).
    Raises AssertionError when a line does not have exactly two fields or
    the query name is empty.
    """
    TP = TN = FN = OC = MC = NC = 0
    Known = Novel = 0

    # 'with' guarantees the handle is closed, including on assert/die paths.
    with open(FileName) as f:
        for Line in f:
            # Strip only the newline; slicing off the last character would
            # eat real data when the final line has no trailing newline.
            Fields = Line.rstrip('\n').split('\t')
            assert len(Fields) == 2
            QueryLabel = Fields[0]
            Pred = Fields[1]
            if Pred.endswith(';'):
                Pred = Pred[:-1]

            QueryName = utax2.GetNameFromLabel(QueryLabel, Rank)
            assert QueryName != ""

            # Accept either a bare taxonomy string or a "...tax=..." label.
            if Pred.find("tax=") >= 0:
                Pred = Pred.split("tax=")[1]

            # "*" marks a line without a prediction.
            if Pred == "*":
                PredName = ""
            else:
                PredName = utax2.GetNameFromTaxStr(Pred, Rank)

            if PredName != "":
                NC += 1

            Count = sortdict.GetCount(NameToCountB, QueryName)
            IsKnown = (Count > 0)
            if IsKnown:
                Known += 1
            else:
                Novel += 1

            # A novel name can never be predicted correctly; treat a match
            # as an inconsistency in the inputs.
            if PredName == QueryName and not IsKnown:
                die.Die("QueryName=%s, PredName=%s >%s" % (QueryName, PredName, QueryLabel))

            if PredName == QueryName:
                XX = "TP"
                TP += 1
            elif PredName == "":
                if Count == 0:
                    XX = "TN"
                    TN += 1
                else:
                    XX = "FN"
                    FN += 1
            else:
                if Count == 0:
                    XX = "OC"
                    OC += 1
                else:
                    XX = "MC"
                    MC += 1

            if REPORT:
                Acc = fasta.GetAccFromLabel(QueryLabel)
                k = "known" if IsKnown else "novel"
                PredNameStr = PredName if PredName != "" else "-"
                s = "\t".join([Acc, XX, k, QueryName, PredNameStr, str(Count)])
                # Parenthesised form prints identically under Python 2.
                print(s)

    return Known, Novel, TP, TN, FN, OC, MC, NC
# Sanity check on the module-level rank-letter string: the second letter
# must be 'p'.
assert Ranks[1] == 'p'

# Invoke OnSeq for the sequences of the FASTA file.
# NOTE(review): presumably this is what populates AccToTax before the
# lookup below -- confirm against the OnSeq defined in this file.
fasta.ReadSeqsOnSeq(FastaFileName, OnSeq)

# Walk the tab-separated input file line by line; an empty read means EOF.
f = open(FileName)
while 1:
    Line = f.readline()
    if len(Line) == 0:
        break
    # Example input line:
    # gi_1018196556 Bacteria;Proteobacteria;Gammaproteobacteria;Oceanospirillales;Halomonadaceae; 96.58 292 91.03
    Fields = Line[:-1].split('\t')
    Label = Fields[0]
    Acc = fasta.GetAccFromLabel(Label)
    Acc = Acc.upper()
    # Taxonomy recorded for this accession; raises KeyError if it is absent.
    QTax = AccToTax[Acc]
    Tax = Fields[1]
    Fields2 = Tax.split(";")
    # The last element of the split is not counted as a name (in the
    # example above it is the empty string after the trailing ';').
    n = len(Fields2) - 1
    assert n <= len(Ranks)
    # Build "r:name,r:name,..." by pairing the i-th name with the i-th
    # rank letter.
    Pred = ""
    for i in range(0, n):
        if i > 0:
            Pred += ","
        Rank = Ranks[i]
        Pred += Rank + ":" + Fields2[i]