def cutLongBranches(aln, dAlnTree, logger):
    """
    Check for overly long branches in a tree and separate both tree and corresponding alignment if found.

    @param1 aln: Fasta alignment
    @param2 dAlnTree: Dictionary of alignments and their corresponding trees
    @param3 logger: Logging object
    @return dAlnTree: Updated dictionary of alignments and their corresponding trees
    """
    logger.info("Looking for long branches.")
    loadTree = ete3.Tree(dAlnTree[aln])
    matches = [leaf for leaf in loadTree.traverse() if leaf.dist > 50.0]

    if len(matches) > 0:
        logger.info("{} long branches found, separating alignments.".format(len(matches)))

        seqs = SeqIO.parse(open(aln), 'fasta')
        dID2Seq = {gene.id: gene.seq for gene in seqs}

        for node in matches:
            gp = node.get_children()
            lNewGp = list(chain.from_iterable([x.get_leaf_names() for x in gp]))

            newAln = aln.split(".")[0] + "_split" + str(matches.index(node) + 1) + ".fasta"

            dNewAln = {gene: dID2Seq[gene] for gene in lNewGp}
            for k in lNewGp:
                dID2Seq.pop(k, None)

            # create new file of sequences
            with open(newAln, "w") as fasta:
                fasta.write(FastaResFunc.dict2fasta(dNewAln))
            dAlnTree[newAln] = ""

        alnLeft = aln.split(".")[0] + "_split" + str(len(matches) + 1) + ".fasta"
        with open(alnLeft, "w") as fasta:
            fasta.write(FastaResFunc.dict2fasta(dID2Seq))
        dAlnTree[alnLeft] = ""
        dAlnTree.pop(aln, None)
    else:
        logger.info("No long branches found.")

    return dAlnTree
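
# Minimal sketch of how the leaf names on both sides of a long branch are pooled
# above (toy newick; assumes ete3 and itertools.chain are imported as in this module):
def _leafPoolDemo():
    node = ete3.Tree("((a,b),(c,d));")
    gp = node.get_children()
    return list(chain.from_iterable([x.get_leaf_names() for x in gp]))
    # -> ['a', 'b', 'c', 'd']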
def supData(filePath, corFile, dirName):
    """
    Filter a fasta file, keeping only the genes listed in the correspondence file.

    @param1 filePath: Path to a fasta file
    @param2 corFile: Path to the file with correspondence between gene and species
    @param3 dirName: Name of a directory
    @return out: Path to the filtered fasta file
    """
    with open(corFile, "r") as corSG:
        lCor = corSG.readlines()

    lGene = [i.split("\t")[0] for i in lCor]

    newDico = {}
    for accn in SeqIO.parse(open(filePath, "r"), "fasta"):
        if accn.id in lGene:
            newDico[accn.id] = accn.seq

    out = dirName + filePath.replace(".fasta", "_filtered.fasta").split("/")[-1]
    with open(out, "w") as newVer:
        newVer.write(FastaResFunc.dict2fasta(newDico))

    return out
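
# Minimal sketch of the correspondence-file parsing used in supData (toy tab-separated
# lines, with the gene id assumed to sit in the first column):
def _corFileDemo():
    corLines = ["geneA\tspeciesX\n", "geneB\tspeciesY\n"]
    lGene = [i.split("\t")[0] for i in corLines]
    return lGene  # -> ['geneA', 'geneB']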
def covAln(aln, cov, queryName, o):
    """
    Function to discard sequences from alignment according to coverage to query.

    @param1 aln: Path to prank alignment
    @param2 cov: Minimum coverage necessary to keep sequence (from parameter file)
    @param3 queryName: Full identifier of the sequence against which to check coverage
    @param4 o: Output directory
    @return outCov: Path to file of conserved sequences
    """
    dId2Seq = {fasta.id: str(fasta.seq) for fasta in SeqIO.parse(open(aln), 'fasta')}
    logger = logging.getLogger("main.alignment")

    if queryName in dId2Seq:
        logger.info("Discarding sequences with less than {:d}% coverage of query.".format(cov))
        outCov = o + aln.split("/")[-1].split(".")[0] + "_mincov.fasta"

        lIndexes = [pos for pos, char in enumerate(dId2Seq[queryName]) if char != "-"]

        dKeep = {}
        for ID, seq in dId2Seq.items():
            seqPos = [seq[x] for x in lIndexes]
            seqCov = (len(seqPos) - seqPos.count("-")) / len(seqPos) * 100

            if seqCov > cov:
                dKeep[ID] = seq

        nbOut = len(dId2Seq) - len(dKeep)

        with open(outCov, "w") as outC:
            outC.write(FastaResFunc.dict2fasta(dKeep))

        logger.info("Discarded {:d} sequences".format(nbOut))

        return (outCov, nbOut)
    else:
        logger.warning("Provided query name not found in the alignment, skipping coverage check.")
        return (aln, 0)
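
# Minimal self-contained sketch of the coverage computation used in covAln, on toy
# strings (names and values here are illustrative only): coverage is measured over
# the columns where the query has no gap.
def _coverageDemo():
    query = "ATG--CGT"
    seq = "AT---CG-"
    lIndexes = [pos for pos, char in enumerate(query) if char != "-"]  # query columns
    seqPos = [seq[x] for x in lIndexes]
    seqCov = (len(seqPos) - seqPos.count("-")) / len(seqPos) * 100
    return seqCov  # ~66.67: 4 of the query's 6 non-gap columns are covered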
parameters["remote"], parameters["entryQuery"]) elif lSteps[i] == "accessions": if parameters["step"] == "accessions": Data = LoadFileFunc.accnEntry(Data) ExtractFunc.treatAccns(Data) elif lSteps[i] == "fasta": if parameters["step"] == "fasta": Data = LoadFileFunc.getSeqEntry(Data, parameters["duplication"]) FastaResFunc.fastaCreation(Data, logger, parameters["remote"], parameters["APIKey"], parameters["step"], parameters["duplication"]) elif lSteps[i] == "orf": if parameters["step"] == "orf": Data = LoadFileFunc.orfEntry(Data, parameters["duplication"]) if lSteps[i] == firstStep: LoadFileFunc.spTreeCheck(Data, firstStep, parameters["duplication"]) AnalysisFunc.orfFinder(Data) elif lSteps[i] == "alignment": if parameters["step"] == "alignment":
def parseGard(kh, aln, o, logger):
    """
    Function returning the cut fragments following GARD analysis and identification of significant breakpoints.

    @param1 kh: Path to GARD.json output file
    @param2 aln: Path to alignment file
    @param3 o: Path to output directory
    @param4 logger: Logging object
    @return lOutFrag: List of Paths (fragments in fasta files)
    """
    lBP = []
    with open(kh, "r") as f:
        lLine = f.readline()
        while lLine:
            if lLine.find("\"breakpoints\"") != -1:
                lLine = f.readline()
                lLine = lLine[lLine.find("[") + 1:lLine.find("]")]
                lBP = list(map(int, lLine.split(",")))
                break
            lLine = f.readline()

    if len(lBP) == 0:
        logger.info("There are no significant breakpoints in alignment {:s}.".format(aln))
        return []

    logger.info("There are {:d} significant breakpoints in alignment {:s} at positions {}".format(len(lBP), aln, lBP))

    # cut the sequences into subsequences according to the breakpoints
    dFname2Fseq = {}
    for fasta in SeqIO.parse(open(aln), 'fasta'):
        dFname2Fseq[fasta.id] = str(fasta.seq)

    # encode each sequence as binary (nucleotides -> 1, gaps -> 0); only its length is used below
    lSeqBin = []
    lNameGene = []
    for fastaSeq in dFname2Fseq:
        lSeqBin.append(dFname2Fseq[fastaSeq].lower().replace("a", "1").replace("t", "1").replace("c", "1").replace("g", "1").replace("-", "0"))
        lNameGene.append(fastaSeq)

    # round each breakpoint up to a multiple of 3 so that every subsequence ends on a codon boundary
    nbSeq = len(lNameGene)
    lenSeq = len(lSeqBin[0])
    lPos = [0]
    lFrag = []
    for bp in lBP:
        while bp % 3 != 0:
            bp += 1
        lPos.append(bp)
        lFrag += [dFname2Fseq[lNameGene[j]][lPos[-2]:lPos[-1]] for j in range(nbSeq)]

    # add the subsequences that run from the last breakpoint to the end
    lFrag += [dFname2Fseq[lNameGene[i]][lPos[-1]:] for i in range(nbSeq)]

    lBP = lPos + [lenSeq]
    lOutFrag = []
    index = 0
    for x in range(1, len(lBP)):
        dFrag = {}
        if lBP[x - 1] == 0:
            extension = "_{:d}_{:d}".format(lBP[x - 1], lBP[x])
        else:
            extension = "_{:d}_{:d}".format(lBP[x - 1] - 1, lBP[x])
        outFrag = o + aln.split("/")[-1].split(".")[0] + "_frag" + extension + ".best.fas"

        for name in lNameGene:
            dFrag[name] = lFrag[index]
            index += 1

        with open(outFrag, "w") as outF:
            outF.write(FastaResFunc.dict2fasta(dFrag))
        logger.info("\tNew alignment: %s" % outFrag)
        lOutFrag.append(outFrag)

    return lOutFrag
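
# Minimal sketch of the breakpoint handling above, on a toy sequence (values are
# illustrative): each breakpoint is rounded up to the next multiple of 3 so that every
# fragment ends on a codon boundary, then the sequence is sliced between successive positions.
def _fragmentDemo():
    seq = "ATGAAACCCGGGTTT"   # 15 nt
    lBP = [4, 10]             # hypothetical GARD breakpoints
    lPos = [0]
    for bp in lBP:
        while bp % 3 != 0:
            bp += 1
        lPos.append(bp)       # -> [0, 6, 12]
    lPos.append(len(seq))
    return [seq[lPos[i]:lPos[i + 1]] for i in range(len(lPos) - 1)]
    # -> ['ATGAAA', 'CCCGGG', 'TTT']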
def getORFs(catFile, queryName, geneDir):
    """
    Function to find Open Reading Frames within the sequence of each gene and select the longest one.

    @param1 catFile: Path to the input fasta file
    @param2 queryName: Full identifier of the query sequence
    @param3 geneDir: Gene directory
    @return outORF: Path to the file containing the longest ORFs
    """
    outORFraw = geneDir + catFile.split("/")[-1].split(".")[0] + "_allORFs.fasta"
    logger = logging.getLogger("main.orf")

    cmdORF = "getorf -sequence {:s} -outseq {:s} -table 0 -find 3 -noreverse".format(catFile, outORFraw)
    cmd(cmdORF, False)
    logger.debug(cmdORF)

    dId2ORFs = defaultdict(list)
    f = SeqIO.parse(open(outORFraw), 'fasta')
    for fasta in f:
        fname, fseq = fasta.id, str(fasta.seq)
        if len(fname.split("_")) > 2:
            fname2 = "_".join(fname.split("_")[0:-1])
        else:
            fname2 = fname.split("_")[0]
        dId2ORFs[fname2].append(fseq)

    dId2Longest = {}
    for k, v in dId2ORFs.items():
        dId2Longest[k] = max(v, key=len)

    # delete duplicate sequences, keeping the query (or the first occurrence per species)
    dRev = {}
    for k, v in dId2Longest.items():
        dRev.setdefault(v, set()).add(k)

    AllDupl = [values for key, values in dRev.items() if len(values) > 1]
    n = 0
    for dupl in AllDupl:
        species = set([x.split("_")[0] for x in dupl])

        for sp in species:
            if queryName in dupl:
                firstOcc = queryName
            else:
                lOcc = [x for x in dupl if sp in x]
                if len(lOcc) > 0:
                    firstOcc = lOcc[0]
                else:
                    # no remaining occurrence for this species, nothing to keep
                    continue
            dupl.remove(firstOcc)

        for i in dupl:
            dId2Longest.pop(i, None)
            n += 1
            logger.debug("Deleted sequence {:s} (duplicate)".format(i))

    logger.info("Deleted {} sequences as duplicates".format(n))

    outORF = outORFraw.replace("_allORFs.fasta", "_longestORFs.fasta")
    with open(outORF, "w") as outO:
        outO.write(FastaResFunc.dict2fasta(dId2Longest))

    logger.info("Extracted longest ORFs: {:s}".format(outORF))
    return outORF
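
# Minimal sketch of the longest-ORF selection above (toy identifiers): getorf suffixes
# each ORF with "_<n>", so the gene id is recovered by dropping the last underscore
# field, and the longest ORF is kept per gene.
def _longestOrfDemo():
    orfIds = ["geneA_1", "geneA_2", "spX_geneB_1"]
    orfSeqs = ["ATG", "ATGATG", "ATGCCC"]
    dId2ORFs = defaultdict(list)
    for fname, fseq in zip(orfIds, orfSeqs):
        if len(fname.split("_")) > 2:
            fname2 = "_".join(fname.split("_")[0:-1])
        else:
            fname2 = fname.split("_")[0]
        dId2ORFs[fname2].append(fseq)
    return {k: max(v, key=len) for k, v in dId2ORFs.items()}
    # -> {'geneA': 'ATGATG', 'spX_geneB': 'ATGCCC'}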
def cutLongBranches(aln, dAlnTree, nbSp, LBOpt, logger):
    """
    Check for overly long branches in a tree and separate both tree and corresponding alignment if found.

    @param1 aln: Fasta alignment
    @param2 dAlnTree: Dictionary of alignments and their corresponding trees
    @param3 nbSp: Minimum number of species for a group to be kept
    @param4 LBOpt: Long-branch detection option ("cutoff" or "IQR", with an optional factor in parentheses)
    @param5 logger: Logging object
    @return dAlnTree: Updated dictionary of alignments and their corresponding trees
    """
    logger.info("Looking for long branches.")
    loadTree = ete3.Tree(dAlnTree[aln])
    dist = [leaf.dist for leaf in loadTree.traverse()]

    if "cutoff" in LBOpt:
        if "(" in LBOpt:
            factor = float(LBOpt.split("(")[1].replace(")", ""))
        else:
            factor = 50
        meanDist = mean(dist)
        longDist = meanDist * factor
    elif "IQR" in LBOpt:
        if "(" in LBOpt:
            factor = float(LBOpt.split("(")[1].replace(")", ""))
        else:
            factor = 50
        df = pd.DataFrame(dist)
        Q1 = df.quantile(0.25)
        Q3 = df.quantile(0.75)
        IQR = Q3 - Q1
        lDist = Q3 + (factor * IQR)
        longDist = lDist[0]

    logger.info("Long branches will be evaluated through the {} method (factor {})".format(LBOpt, factor))
    nbSp = int(nbSp)
    matches = [leaf for leaf in loadTree.traverse() if leaf.dist > longDist]

    if len(matches) > 0:
        logger.info("{} long branches found, separating alignments.".format(len(matches)))

        seqs = SeqIO.parse(open(aln), 'fasta')
        dID2Seq = {gene.id: gene.seq for gene in seqs}

        for node in matches:
            gp = node.get_children()
            lNewGp = list(chain.from_iterable([x.get_leaf_names() for x in gp]))

            newAln = aln.split(".")[0] + "_part" + str(matches.index(node) + 1) + ".fasta"

            dNewAln = {gene: dID2Seq[gene] for gene in lNewGp if gene in dID2Seq}
            for k in lNewGp:
                dID2Seq.pop(k, None)

            # create a new file of sequences if the group is large enough
            if len(dNewAln) > nbSp - 1:
                with open(newAln, "w") as fasta:
                    fasta.write(FastaResFunc.dict2fasta(dNewAln))
                dAlnTree[newAln] = ""
            else:
                logger.info("Sequences {} will not be considered for downstream analyses as they do not compose a large enough group.".format(list(dNewAln.keys())))

        alnLeft = aln.split(".")[0] + "_part" + str(len(matches) + 1) + ".fasta"

        if len(dID2Seq) > nbSp - 1:
            with open(alnLeft, "w") as fasta:
                fasta.write(FastaResFunc.dict2fasta(dID2Seq))
            logger.info("\tNew alignment: %s" % alnLeft)
            dAlnTree[alnLeft] = ""
        else:
            logger.info("Sequences {} will not be considered for downstream analyses as they do not compose a large enough group.".format(list(dID2Seq.keys())))

        dAlnTree.pop(aln, None)
    else:
        logger.info("No long branches found.")

    return dAlnTree
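
# Minimal sketch of the two long-branch thresholds used above (toy distances; the
# factor value is illustrative): "cutoff" flags branches longer than factor * mean,
# while "IQR" uses a Tukey-style fence of Q3 + factor * (Q3 - Q1). Assumes mean and
# pandas (pd) are imported as in this module.
def _longBranchThresholdDemo():
    dist = [0.01, 0.02, 0.03, 0.02, 5.0]
    factor = 50
    cutoffDist = mean(dist) * factor            # "cutoff" rule
    df = pd.DataFrame(dist)
    Q1, Q3 = df.quantile(0.25)[0], df.quantile(0.75)[0]
    iqrDist = Q3 + factor * (Q3 - Q1)           # "IQR" rule
    return cutoffDist, iqrDist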
def parseGard(kh, aln, pvalue, o, logger):
    """
    Function returning the cut fragments following GARD analysis and identification of significant breakpoints.

    @param1 kh: Path to GARDprocessor output file
    @param2 aln: Path to alignment file
    @param3 pvalue: Significance threshold (float)
    @param4 o: Path to output directory
    @param5 logger: Logging object
    @return lOutFrag: List of Paths (fragments in fasta files)
    """
    lBP = []
    with open(kh, "r") as f:
        lLine = f.readlines()

    finalIndex = len(lLine)
    index = 0
    while index < finalIndex and not lLine[index].startswith("Breakpoint"):
        index += 1

    # if breakpoints were reported, keep those with both p-values below the threshold
    if index + 1 < finalIndex and lLine[index + 1] != "":
        index += 1
        while index < finalIndex and lLine[index].startswith(" "):
            line = [float(item.strip()) for item in lLine[index].split("|")]
            if line[2] < pvalue and line[4] < pvalue:
                lBP.append(int(line[0]))
            index += 1

        if len(lBP) > 0:
            logger.info("There are {:d} significant breakpoints in alignment {:s} at positions {}".format(len(lBP), aln, lBP))
        else:
            logger.info("There are no significant breakpoints in alignment {:s}.".format(aln))

        # if there are breakpoints, cut the sequences into subsequences accordingly
        if len(lBP) > 0:
            dFname2Fseq = {}
            for fasta in SeqIO.parse(open(aln), 'fasta'):
                dFname2Fseq[fasta.id] = str(fasta.seq)

            # encode each sequence as binary (nucleotides -> 1, gaps -> 0); only its length is used below
            lSeqBin = []
            lNameGene = []
            for fastaSeq in dFname2Fseq:
                lSeqBin.append(dFname2Fseq[fastaSeq].lower().replace("a", "1").replace("t", "1").replace("c", "1").replace("g", "1").replace("-", "0"))
                lNameGene.append(fastaSeq)

            # round each breakpoint up to a multiple of 3 so that every subsequence ends on a codon boundary
            nbSeq = len(lNameGene)
            lenSeq = len(lSeqBin[0])
            lPos = [0]
            lFrag = []
            for bp in lBP:
                while bp % 3 != 0:
                    bp += 1
                lPos.append(bp)
                lFrag += [dFname2Fseq[lNameGene[j]][lPos[-2]:lPos[-1]] for j in range(nbSeq)]

            # add the subsequences that run from the last breakpoint to the end
            lFrag += [dFname2Fseq[lNameGene[i]][lPos[-1]:] for i in range(nbSeq)]

            lBP = lPos + [lenSeq]
            lOutFrag = []
            index = 0
            for x in range(1, len(lBP)):
                dFrag = {}
                extension = "{:d}to{:d}".format(lBP[x - 1], lBP[x])
                outFrag = o + aln.split("/")[-1].split(".")[0] + "_frag" + extension + ".best.fas"

                for name in lNameGene:
                    dFrag[name] = lFrag[index]
                    index += 1

                with open(outFrag, "w") as outF:
                    outF.write(FastaResFunc.dict2fasta(dFrag))
                lOutFrag.append(outFrag)

            return lOutFrag
        else:
            return []
    else:
        return []
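
# Minimal sketch of the breakpoint-table parsing above, on a hypothetical line in the
# pipe-separated format this parseGard assumes (position | ... | LHS p-value | ... |
# RHS p-value): a breakpoint is kept only if both p-values pass the threshold.
def _breakpointLineDemo():
    lineTxt = "  351 | 0.12 | 0.0021 | 0.30 | 0.0043"
    pvalue = 0.01
    line = [float(item.strip()) for item in lineTxt.split("|")]
    if line[2] < pvalue and line[4] < pvalue:
        return int(line[0])  # -> 351
    return None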
def treeParsing(ORF, recTree, nbSp, o, logger):
    """
    Function which partitions gene data into groups according to duplications in the reconciled tree.

    @param1 ORF: Path to the ORFs file
    @param2 recTree: Path to a tree file
    @param3 nbSp: Minimum number of species per group
    @param4 o: Output directory
    @param5 logger: Logging object
    @return lOut: List of paths (fasta files)
    """
    with open(recTree, "r") as tree:
        reconTree = tree.readlines()[1]

    testTree = ete3.Tree(reconTree)
    seqs = SeqIO.parse(open(ORF), 'fasta')
    dID2Seq = {gene.id: gene.seq for gene in seqs}

    # get all nodes annotated with a duplication event
    dupl = testTree.search_nodes(D="Y")
    dNb2Node = {int(node.ND): node for node in dupl}

    nDuplSign = 0
    lOut = []
    sp = set([leaf.S for leaf in testTree])
    dDupl2Seq = {}

    # as long as the number of species left in the tree is equal or superior to the cut-off
    # specified by the user and there still are nodes annotated with duplication events
    while len(sp) > int(nbSp) - 1 and len(dNb2Node.keys()) > 0:
        # start from the most recent duplications (ie, the furthest node)
        sp = set([leaf.S for leaf in testTree])
        nodeNb = min(dNb2Node.keys())
        node = dNb2Node[nodeNb]

        nGp = 1

        # do not consider dubious duplications (no intersection between the species on either side of the annotated duplication)
        lf = [set([leaf.S for leaf in gp]) for gp in node.get_children()]
        interok = (len(lf[0].intersection(lf[1])) != 0 and len(lf[0]) > int(nbSp) / 2 - 1 and len(lf[1]) > int(nbSp) / 2 - 1)

        if not interok:
            dNb2Node.pop(nodeNb, None)
        # otherwise check each of the branches concerned by the duplication
        else:
            for gp in node.get_children():
                spGp = set([leaf.S for leaf in gp])

                # check if the number of species in the branch is equal or superior to the cut-off specified by the user
                if len(spGp) > int(nbSp) - 1:
                    orthos = gp.get_leaf_names()
                    dOrtho2Seq = {ortho: dID2Seq[ortho] for ortho in orthos if not ortho == ""}

                    # check if the orthologues have already been included in another, more recent, duplication event
                    already = False
                    for doneDupl in dDupl2Seq:
                        if all(ortho in dDupl2Seq[doneDupl] for ortho in orthos):
                            already = True
                            break

                    if not already:
                        nDuplSign += 1
                        outFile = o + ORF.split("/")[-1].split(".")[0] + "_D" + str(nodeNb) + "gp" + str(nGp) + ".fasta"
                        lOut.append(outFile)

                        # create new file of orthologous sequences
                        with open(outFile, "w") as fasta:
                            fasta.write(FastaResFunc.dict2fasta(dOrtho2Seq))

                        # remove the node from the tree
                        gp.detach()

                    dDupl2Seq["{:d}-{:d}".format(nodeNb, nGp)] = orthos
                    nGp += 1

            dNb2Node.pop(nodeNb, None)

    # if duplication groups have been extracted, pool the remaining sequences into a new
    # file (if they span enough different species - per user's specification)
    if len(lOut) > 0:
        leftovers = filter(None, testTree.get_leaf_names())
        dRemain = {left: dID2Seq[left] for left in leftovers}

        if len(dRemain.keys()) > int(nbSp) - 1:
            outFile = o + ORF.split("/")[-1].split(".")[0] + "_duplication_remainingsequences.fasta"
            with open(outFile, "w") as fasta:
                fasta.write(FastaResFunc.dict2fasta(dRemain))
            lOut.append(outFile)
        else:
            logger.info("Ignoring remaining sequences {} as they do not compose a group of enough orthologs.".format(list(dRemain.keys())))

    logger.info("{:d} duplications detected by Treerecs, extracting {:d} groups of at least {} orthologs.".format(len(dupl), nDuplSign, nbSp))

    return lOut
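
# Minimal sketch of the reconciled-tree annotations treeParsing relies on (toy NHX
# string; assumes Treerecs-style NHX tags where D marks duplications, S the species
# and ND the node number, which ete3 exposes as node features):
def _duplicationNodeDemo():
    nhx = "((a[&&NHX:S=sp1],b[&&NHX:S=sp2])[&&NHX:D=Y:ND=0],c[&&NHX:S=sp3]);"
    t = ete3.Tree(nhx)
    dupl = t.search_nodes(D="Y")
    return {int(node.ND): [leaf.S for leaf in node] for node in dupl}
    # -> {0: ['sp1', 'sp2']}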
def isoformAln(aln, o):
    """
    Function to cluster isoforms according to the alignment. Isoforms come from the same
    species (recognized through the keyword xxxXxx at the beginning of their name) and
    have the same letters or indels at the same positions in the alignment.

    @param1 aln: Path to alignment
    @param2 o: Output directory
    @return outCov: Path to file of resulting alignment
    """
    logger = logging.getLogger("main.alignment")
    logger.info("Clustering isoforms.")

    dRem = {}     # sequences to be written out (unclustered or merged)
    dId2Seq = {}  # sequences grouped by species
    laln = 0      # alignment length
    for fasta in SeqIO.parse(open(aln), 'fasta'):
        post = fasta.id.find("_")
        if post != -1:  # regular format
            sp = fasta.id[:post]
            tag = fasta.id[post + 1:]
            if not sp in dId2Seq:
                dId2Seq[sp] = {}
            dId2Seq[sp][tag] = str(fasta.seq)
            if laln == 0:
                laln = len(fasta.seq)
        else:
            dRem[fasta.id] = str(fasta.seq)

    outCov = o + aln.split("/")[-1].split(".")[0] + "_clustiso.fasta"
    clustok = False  # flag to check whether any clustering has occurred

    for sp, dtagseq in dId2Seq.items():
        lclust = [list(dtagseq)]  # list of clusters of tags to be split
        for pos in range(laln):
            lclust2 = []
            for clust in lclust:
                dlet = {tag: dtagseq[tag][pos] for tag in clust}
                llet = set([x for x in dlet.values() if x != "-"])
                if len(llet) <= 1:  # one letter at most, keep all
                    lclust2.append(clust)
                    continue
                else:
                    for x in llet:
                        lclust2.append([tag for tag in clust if dlet[tag] == x])
                    lind = [tag for tag in clust if dlet[tag] == "-"]
                    # conservative: gapped sequences form their own cluster, as we do not
                    # know which letter cluster to merge them into; may be improved
                    if len(lind) != 0:
                        lclust2.append(lind)
            lclust = lclust2

        # now merge the sequences in each cluster
        for clust in lclust:
            if len(clust) == 1:
                dRem[sp + "_" + clust[0]] = dtagseq[clust[0]]
            else:
                clustok = True
                ntag = clust[-1] + "_clust"
                logger.info("Clustered sequences " + sp + "_" + (", %s_" % (sp)).join(clust) + " into %s_" % (sp) + ntag)
                nseq = "".join([max([dtagseq[tag][pos] for tag in clust]) for pos in range(laln)])
                dRem[sp + "_" + ntag] = nseq

    if clustok:
        with open(outCov, "w") as outC:
            outC.write(FastaResFunc.dict2fasta(dRem))
        return outCov
    else:
        return aln
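
# Minimal sketch of the column-wise splitting rule used in isoformAln (toy tags and a
# single alignment column): tags sharing a letter stay together, while gapped tags are
# kept in their own, conservative cluster.
def _isoformSplitDemo():
    dlet = {"iso1": "A", "iso2": "A", "iso3": "T", "iso4": "-"}
    clust = list(dlet)
    llet = set([x for x in dlet.values() if x != "-"])
    lclust2 = [[tag for tag in clust if dlet[tag] == x] for x in llet]
    lind = [tag for tag in clust if dlet[tag] == "-"]
    if lind:
        lclust2.append(lind)
    return lclust2  # e.g. [['iso1', 'iso2'], ['iso3'], ['iso4']]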