Example #1
def cutLongBranches(aln, dAlnTree, logger):
    """
	Check for overly long branches in a tree and separate both tree and corresponding alignment if found.
	
	@param1 aln: Fasta alignment
	@param2 tree: Tree corresponding to the alignment
	@param3 logger: Logging object
	@return dAlnTree: Updated dictionary of alignments and their corresponding trees
	"""
    logger.info("Looking for long branches.")
    loadTree = ete3.Tree(dAlnTree[aln])
    matches = [leaf for leaf in loadTree.traverse() if leaf.dist > 50.0]

    if len(matches) > 0:
        logger.info("{} long branches found, separating alignments.".format(
            len(matches)))

        seqs = SeqIO.parse(open(aln), 'fasta')
        dID2Seq = {gene.id: gene.seq for gene in seqs}

        for node in matches:
            gp = node.get_children()
            lNewGp = list(chain.from_iterable([x.get_leaf_names()
                                               for x in gp]))

            newAln = aln.split(".")[0] + "_split" + str(
                matches.index(node) + 1) + ".fasta"

            dNewAln = {gene: dID2Seq[gene] for gene in lNewGp}
            for k in lNewGp:
                dID2Seq.pop(k, None)

            # create new file of sequences
            with open(newAln, "w") as fasta:
                fasta.write(FastaResFunc.dict2fasta(dNewAln))

            dAlnTree[newAln] = ""

        alnLeft = aln.split(".")[0] + "_split" + str(len(matches) +
                                                     1) + ".fasta"
        with open(alnLeft, "w") as fasta:
            fasta.write(FastaResFunc.dict2fasta(dID2Seq))

        dAlnTree[alnLeft] = ""
        dAlnTree.pop(aln, None)

    else:
        logger.info("No long branches found.")

    return dAlnTree
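
# The examples in this collection all write sequences through
# FastaResFunc.dict2fasta, whose source is not shown here. A minimal
# sketch of what it presumably does (an assumption, not the module's
# actual code): serialize an {id: sequence} mapping into fasta text.
def dict2fasta(dSeq):
    return "".join(">{:s}\n{:s}\n".format(sid, str(seq))
                   for sid, seq in dSeq.items())

# Hypothetical call for cutLongBranches above (file names invented):
# dAlnTree = {"gene1_aln.fasta": "gene1_aln.tree"}
# dAlnTree = cutLongBranches("gene1_aln.fasta", dAlnTree, logger)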
Example #2
def supData(filePath, corFile, dirName):
    """
	Delete genes in a fasta file

	@param1 filePath: Path to a fasta file
	@param2 corFile: Path to the file with correspondence beetween gene and species
	@param3 dirName: Name of a directory
	@return out: Path  	
	"""

    with open(corFile, "r") as corSG:
        lGene = [line.split("\t")[0] for line in corSG]

    newDico = {}
    for accn in SeqIO.parse(open(filePath, "r"), "fasta"):
        if accn.id in lGene:
            newDico[accn.id] = accn.seq

    out = dirName + filePath.replace(".fasta",
                                     "_filtered.fasta").split("/")[-1]
    with open(out, "w") as newVer:
        newVer.write(FastaResFunc.dict2fasta(newDico))

    return out
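
# Minimal sketch of the filtering step supData performs, without the
# file I/O (Bio.SeqIO and FastaResFunc are assumed available in the real
# pipeline; plain dicts stand in for parsed fasta records here).
dSeq = {"geneA": "ATG---ATG", "geneB": "ATGCCCTAA"}
corLines = ["geneA\thomo_sapiens\n"]  # gene<TAB>species, one per line
lGene = [line.split("\t")[0] for line in corLines]
kept = {sid: seq for sid, seq in dSeq.items() if sid in lGene}
assert kept == {"geneA": "ATG---ATG"}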
Example #3
def covAln(aln, cov, queryName, o):
    """
	Function to discard sequences from alignment according to coverage to query.
	
	@param1 aln: Path to prank alignment
	@param2 cov: minimum coverage necessary to keep sequence (from parameter file)
	@param4 queryName: full identifier of the sequence against which to check coverage
	@param5 o: output directory
	@return outCov: Path to file of conserved sequences
	"""

    dId2Seq = {
        fasta.id: str(fasta.seq)
        for fasta in SeqIO.parse(open(aln), 'fasta')
    }
    logger = logging.getLogger("main.alignment")

    if queryName in dId2Seq:
        logger.info(
            "Discarding sequences with less than {:d}% coverage of query.".
            format(cov))
        outCov = o + aln.split("/")[-1].split(".")[0] + "_mincov.fasta"

        lIndexes = [
            pos for pos, char in enumerate(dId2Seq[queryName]) if char != "-"
        ]

        dKeep = {}
        for ID, seq in dId2Seq.items():
            seqPos = [seq[x] for x in lIndexes]
            seqCov = (len(seqPos) - seqPos.count("-")) / len(seqPos) * 100

            if seqCov > cov:
                dKeep[ID] = seq

        nbOut = len(dId2Seq) - len(dKeep)

        with open(outCov, "w") as outC:
            outC.write(FastaResFunc.dict2fasta(dKeep))
        logger.info("Discarded {:d} sequences".format(nbOut))

        return (outCov, nbOut)

    else:
        logger.warning(
            "Provided query name not found in the alignment, skipping coverage check."
        )
        return (aln, 0)
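
# Worked sketch of the coverage computation above: the columns where the
# query is ungapped are the ones that count, and a candidate's coverage
# is the percentage of those columns in which it is also ungapped.
query = "ATG--GCC"
cand = "A-G--G--"
lIndexes = [pos for pos, char in enumerate(query) if char != "-"]
seqPos = [cand[x] for x in lIndexes]  # ['A', '-', 'G', 'G', '-', '-']
seqCov = (len(seqPos) - seqPos.count("-")) / len(seqPos) * 100
assert seqCov == 50.0  # the sequence is kept only if seqCov > cov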
Example #4
                                            parameters["remote"],
                                            parameters["entryQuery"])

            elif lSteps[i] == "accessions":
                if parameters["step"] == "accessions":
                    Data = LoadFileFunc.accnEntry(Data)

                ExtractFunc.treatAccns(Data)

            elif lSteps[i] == "fasta":
                if parameters["step"] == "fasta":
                    Data = LoadFileFunc.getSeqEntry(Data,
                                                    parameters["duplication"])

                FastaResFunc.fastaCreation(Data, logger, parameters["remote"],
                                           parameters["APIKey"],
                                           parameters["step"],
                                           parameters["duplication"])

            elif lSteps[i] == "orf":
                if parameters["step"] == "orf":
                    Data = LoadFileFunc.orfEntry(Data,
                                                 parameters["duplication"])

                    if lSteps[i] == firstStep:
                        LoadFileFunc.spTreeCheck(Data, firstStep,
                                                 parameters["duplication"])

                AnalysisFunc.orfFinder(Data)

            elif lSteps[i] == "alignment":
                if parameters["step"] == "alignment":
Example #5
def parseGard(kh, aln, o, logger):
    """
	Function returning the cut fragments following GARD analysis and identification of significant breakpoints.

	@param1 kh: Path to GARD.json output file
	@param2 aln: Path to alignment file
	@param3 pvalue: Float
	@param4 o: Path to output directory
	@return lOutFrag: List of Path (Fragments in fasta files)
	"""
    lBP = []
    with open(kh, "r") as f:
        lLine = f.readline()
        while lLine:
            if lLine.find("\"breakpoints\"") != -1:
                lLine = f.readline()
                lLine = lLine[lLine.find("[") + 1:lLine.find("]")]
                lBP = list(map(int, lLine.split(",")))
                break
            lLine = f.readline()

    # Log whether significant breakpoints were found; stop early if there are none
    if len(lBP) > 0:
        logger.info(
            "There are {:d} significant breakpoints in alignment {:s} at positions {}"
            .format(len(lBP), aln, lBP))
    else:
        logger.info(
            "There are no significant breakpoints in alignment {:s}.".format(
                aln))
        return []

    # Cut the sequences into subsequences according to the breakpoints
    dFname2Fseq = {}
    for fasta in SeqIO.parse(open(aln), 'fasta'):
        dFname2Fseq[fasta.id] = str(fasta.seq)

    lNameGene = list(dFname2Fseq)
    nbSeq = len(lNameGene)
    lenSeq = len(dFname2Fseq[lNameGene[0]])

    # Round each breakpoint up to the next multiple of 3 so that every
    # fragment ends on a codon boundary, then slice out the fragments
    lPos = [0]
    lFrag = []
    for bp in lBP:
        while bp % 3 != 0:
            bp += 1
        lPos.append(bp)
        lFrag += [
            dFname2Fseq[lNameGene[j]][lPos[-2]:lPos[-1]]
            for j in range(nbSeq)
        ]

    # Add the subsequences running from the last breakpoint to the end
    lFrag += [dFname2Fseq[lNameGene[i]][lPos[-1]:] for i in range(nbSeq)]

    lBP = lPos + [lenSeq]
    lOutFrag = []
    index = 0
    for x in range(1, len(lBP)):
        dFrag = {}
        if lBP[x - 1] == 0:
            extension = "_{:d}_{:d}".format(lBP[x - 1], lBP[x])
        else:
            extension = "_{:d}_{:d}".format(lBP[x - 1] - 1, lBP[x])

        outFrag = o + aln.split("/")[-1].split(
            ".")[0] + "_frag" + extension + ".best.fas"
        for name in lNameGene:
            dFrag[name] = lFrag[index]
            index += 1
        with open(outFrag, "w") as outF:
            outF.write(FastaResFunc.dict2fasta(dFrag))
        logger.info("\tNew alignment: %s" % outFrag)
        lOutFrag.append(outFrag)

    return lOutFrag
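
# parseGard finds the "breakpoints" array by scanning the raw text of
# GARD.json. A hedged alternative sketch using the json module, assuming
# the file is well-formed JSON with a top-level "breakpoints" entry (the
# exact nesting of that entry is an assumption about the HyPhy output):
import json

def readBreakpoints(path):
    with open(path) as fin:
        data = json.load(fin)
    bps = data.get("breakpoints", [])
    flat = []
    for entry in bps:
        # flatten one level in case breakpoints come as [start, end] pairs
        flat.extend(entry if isinstance(entry, list) else [entry])
    return [int(b) for b in flat]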
Example #6
def getORFs(catFile, queryName, geneDir):
    """
	Function to find Open Reading Frames within the sequence of each gene and select the longest one.

	@param1 catFile: Path
	@param2 geneName: Gene name
	@param3 geneDir: Gene directory
	@return outORF: Path to the file containing the longest ORFs
	"""

    outORFraw = geneDir + catFile.split("/")[-1].split(
        ".")[0] + "_allORFs.fasta"
    logger = logging.getLogger("main.orf")

    strCmd = "getorf -sequence {:s} -outseq {:s} -table 0 -find 3 -noreverse".format(
        catFile, outORFraw)
    cmd(strCmd, False)

    logger.debug(strCmd)

    dId2ORFs = defaultdict(list)
    f = SeqIO.parse(open(outORFraw), 'fasta')
    for fasta in f:
        fname, fseq = fasta.id, str(fasta.seq)
        if len(fname.split("_")) > 2:
            fname2 = "_".join(fname.split("_")[0:-1])
        else:
            fname2 = fname.split("_")[0]
        dId2ORFs[fname2].append(fseq)

    dId2Longest = {}
    for k, v in dId2ORFs.items():
        dId2Longest[k] = max(v, key=len)

    # delete duplicate sequences
    dRev = {}
    for k, v in dId2Longest.items():
        dRev.setdefault(v, set()).add(k)

    AllDupl = [values for key, values in dRev.items() if len(values) > 1]
    n = 0
    for dupl in AllDupl:
        species = set([x.split("_")[0] for x in dupl])

        # keep one representative per species, preferring the query sequence
        for sp in species:
            if queryName in dupl:
                firstOcc = queryName
            else:
                lOcc = [x for x in dupl if sp in x]
                firstOcc = lOcc[0] if len(lOcc) > 0 else None

            if firstOcc is not None:
                # discard() is safe if the representative was already removed
                dupl.discard(firstOcc)

        for i in dupl:
            dId2Longest.pop(i, None)
            n += 1
            logger.debug("Deleted sequence {:s} (duplicate)".format(i))

    logger.info("Deleted {} sequences as duplicates".format(n))

    outORF = outORFraw.replace("_allORFs.fasta", "_longestORFs.fasta")

    with open(outORF, "w") as outO:
        outO.write(FastaResFunc.dict2fasta(dId2Longest))

    logger.info("Extracted longest ORFs: {:s}".format(outORF))

    return outORF
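
# Worked sketch of the duplicate detection above: inverting the
# id -> sequence dict collapses identical sequences onto one key, so any
# value set with more than one id is a group of duplicates.
dId2Longest = {"sp1_g1": "ATGAAA", "sp1_g2": "ATGAAA", "sp2_g1": "ATGCCC"}
dRev = {}
for k, v in dId2Longest.items():
    dRev.setdefault(v, set()).add(k)
AllDupl = [ids for seq, ids in dRev.items() if len(ids) > 1]
assert AllDupl == [{"sp1_g1", "sp1_g2"}]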
Example #7
def cutLongBranches(aln, dAlnTree, nbSp, LBOpt, logger):
    """
	Check for overly long branches in a tree and separate both tree and corresponding alignment if found.
	
	@param1 aln: Fasta alignment
	@param2 tree: Tree corresponding to the alignment
	@param3 logger: Logging object
	@return dAlnTree: Updated dictionary of alignments and their corresponding trees
	"""
    logger.info("Looking for long branches.")
    loadTree = ete3.Tree(dAlnTree[aln])
    dist = [leaf.dist for leaf in loadTree.traverse()]

    if "cutoff" in LBOpt:
        if "(" in LBOpt:
            factor = float(LBOpt.split("(")[1].replace(")", ""))
        else:
            factor = 50
        meanDist = mean(dist)
        longDist = meanDist * factor
    elif "IQR" in LBOpt:
        if "(" in LBOpt:
            factor = float(LBOpt.split("(")[1].replace(")", ""))
        else:
            factor = 50
        df = pd.DataFrame(dist)
        Q1 = df.quantile(0.25)
        Q3 = df.quantile(0.75)
        IQR = Q3 - Q1
        lDist = Q3 + (factor * IQR)
        longDist = lDist[0]
    else:
        # fall back to a fixed mean-based cutoff so that factor and
        # longDist are always defined, even for an unrecognized LBOpt
        factor = 50
        longDist = mean(dist) * factor

    logger.info(
        "Long branches will be evaluated through the {} method (factor {})".
        format(LBOpt, factor))
    nbSp = int(nbSp)
    matches = [leaf for leaf in loadTree.traverse() if leaf.dist > longDist]

    if len(matches) > 0:
        logger.info("{} long branches found, separating alignments.".format(
            len(matches)))

        seqs = SeqIO.parse(open(aln), 'fasta')
        dID2Seq = {gene.id: gene.seq for gene in seqs}

        for node in matches:
            gp = node.get_children()
            lNewGp = list(chain.from_iterable([x.get_leaf_names()
                                               for x in gp]))

            newAln = aln.split(".")[0] + "_part" + str(
                matches.index(node) + 1) + ".fasta"

            dNewAln = {
                gene: dID2Seq[gene]
                for gene in lNewGp if gene in dID2Seq
            }
            for k in lNewGp:
                dID2Seq.pop(k, None)

            # create a new file of sequences if the group is large enough
            if len(dNewAln) > nbSp - 1:
                with open(newAln, "w") as fasta:
                    fasta.write(FastaResFunc.dict2fasta(dNewAln))
                dAlnTree[newAln] = ""
            else:
                logger.info(
                    "Sequences {} will not be considered for downstream analyses as they do not compose a large enough group."
                    .format(list(dNewAln.keys())))

        alnLeft = aln.split(".")[0] + "_part" + str(len(matches) +
                                                    1) + ".fasta"

        if len(dID2Seq) > nbSp - 1:
            with open(alnLeft, "w") as fasta:
                fasta.write(FastaResFunc.dict2fasta(dID2Seq))
            logger.info("\tNew alignment: %s" % alnLeft)
            dAlnTree[alnLeft] = ""
        else:
            logger.info(
                "Sequences in {} will not be considered for downstream analyses as they do not compose a large enough group."
                .format(list(dID2Seq.keys())))

        dAlnTree.pop(aln, None)

    else:
        logger.info("No long branches found.")

    return dAlnTree
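
# Worked sketch of the IQR rule above, using factor 1.5 (the classic
# Tukey fence; cutLongBranches defaults to 50): a branch is flagged as
# long when its length exceeds Q3 + factor * IQR.
import pandas as pd

dist = [0.01, 0.02, 0.03, 0.02, 5.0]
df = pd.DataFrame(dist)
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
longDist = (Q3 + 1.5 * (Q3 - Q1))[0]
assert [d for d in dist if d > longDist] == [5.0]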
Example #8
def parseGard(kh, aln, pvalue, o, logger):
    """
	Function returning the cut fragments following GARD analysis and identification of significant breakpoints.

	@param1 kh: Path to GARDprocessor output file
	@param2 aln: Path to alignment file
	@param3 pvalue: Float
	@param4 o: Path to output directory
	@return lOutFrag: List of Path (Fragments in fasta files)
	"""
    lBP = []
    with open(kh, "r") as f:
        lLine = f.readlines()
        finalIndex = len(lLine)

    index = 0
    while index < finalIndex and not lLine[index].startswith("Breakpoint"):
        index += 1

    # No breakpoint section found: nothing to cut
    if index >= finalIndex - 1:
        return []

    # Collect the breakpoints from the table that follows, keeping those
    # whose p-values (third and fifth columns) both pass the threshold
    if lLine[index + 1].strip() != "":
        index += 1
        while index < finalIndex and lLine[index].startswith(" "):
            line = [float(item.strip()) for item in lLine[index].split("|")]
            if line[2] < pvalue and line[4] < pvalue:
                lBP.append(int(line[0]))
            index += 1

        if len(lBP) > 0:
            logger.info(
                "There are {:d} significant breakpoints in alignment {:s} at positions {}"
                .format(len(lBP), aln, lBP))
        else:
            logger.info(
                "There are no significant breakpoints in alignment {:s}.".
                format(aln))

        # If there are breakpoints, cut the sequences into subsequences
        if len(lBP) > 0:
            dFname2Fseq = {}
            for fasta in SeqIO.parse(open(aln), 'fasta'):
                dFname2Fseq[fasta.id] = str(fasta.seq)

            lNameGene = list(dFname2Fseq)
            nbSeq = len(lNameGene)
            lenSeq = len(dFname2Fseq[lNameGene[0]])

            # Round each breakpoint up to the next multiple of 3 so that
            # every fragment ends on a codon boundary, then slice them out
            lPos = [0]
            lFrag = []
            for bp in lBP:
                while bp % 3 != 0:
                    bp += 1
                lPos.append(bp)
                lFrag += [
                    dFname2Fseq[lNameGene[j]][lPos[-2]:lPos[-1]]
                    for j in range(nbSeq)
                ]

            # Add the subsequences running from the last breakpoint to the end
            lFrag += [
                dFname2Fseq[lNameGene[i]][lPos[-1]:] for i in range(nbSeq)
            ]

            lBP = lPos + [lenSeq]
            lOutFrag = []
            index = 0
            for x in range(1, len(lBP)):
                dFrag = {}
                extension = "{:d}to{:d}".format(lBP[x - 1], lBP[x])
                outFrag = o + aln.split("/")[-1].split(
                    ".")[0] + "_frag" + extension + ".best.fas"
                for name in lNameGene:
                    dFrag[name] = lFrag[index]
                    index += 1
                with open(outFrag, "w") as outF:
                    outF.write(FastaResFunc.dict2fasta(dFrag))
                lOutFrag.append(outFrag)

            return lOutFrag
        else:
            return []
    else:
        return []
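
# Worked sketch of the codon-boundary rounding used by both parseGard
# variants: each breakpoint is pushed up to the next multiple of 3,
# which is the same as 3 * ceil(bp / 3).
import math

def roundToCodon(bp):
    while bp % 3 != 0:
        bp += 1
    return bp

assert [roundToCodon(b) for b in (299, 300, 301)] == [300, 300, 303]
assert all(roundToCodon(b) == 3 * math.ceil(b / 3) for b in range(100))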
Example #9
def treeParsing(ORF, recTree, nbSp, o, logger):
    """
	Function which parse gene data in many group according to duplication in the reconciliated tree

	@param1 ORFs: Path to the ORFs file
	@param2 tree: Path to a tree file
	@param3 geneName: Gene name
	@param4 o: Output directory
	@param5 logger: Logging object
	@return lOut: List of path (fasta files)
	"""

    with open(recTree, "r") as tree:
        reconTree = tree.readlines()[1]
        testTree = ete3.Tree(reconTree)

        seqs = SeqIO.parse(open(ORF), 'fasta')
        dID2Seq = {gene.id: gene.seq for gene in seqs}

        # get all nodes annotated with a duplication event
        dupl = testTree.search_nodes(D="Y")
        dNb2Node = {int(node.ND): node for node in dupl}
        nDuplSign = 0
        lOut = []
        sp = set([leaf.S for leaf in testTree])
        dDupl2Seq = {}
        print("TRY 4")
        # as long as the number of species left in the tree is equal to or
        # greater than the user-specified cut-off and there are still nodes
        # annotated with duplication events
        while len(sp) > int(nbSp) - 1 and len(dNb2Node.keys()) > 0:
            # start from the most recent duplications (ie, the furthest node)
            sp = set([leaf.S for leaf in testTree])
            nodeNb = min(dNb2Node.keys())
            node = dNb2Node[nodeNb]

            # for each of the branches concerned by the duplication
            nGp = 1

            # do not consider dubious duplications (no intersection between
            # the species on either side of the annotated duplication)
            lf = [set([leaf.S for leaf in gp]) for gp in node.get_children()]
            interok = (len(lf[0].intersection(lf[1])) != 0
                       and len(lf[0]) > int(nbSp) / 2 - 1
                       and len(lf[1]) > int(nbSp) / 2 - 1)

            if not interok:
                dNb2Node.pop(nodeNb, None)

            # otherwise check it out
            else:
                for gp in node.get_children():
                    spGp = set([leaf.S for leaf in gp])

                    # check if the numbers of species in the branch is equal or superior to the cut-off specified by the user
                    if len(spGp) > int(nbSp) - 1:

                        orthos = gp.get_leaf_names()
                        dOrtho2Seq = {
                            ortho: dID2Seq[ortho]
                            for ortho in orthos if not ortho == ""
                        }

                        #check if orthologues have already been included in another, more recent, duplication event
                        already = False
                        for doneDupl in dDupl2Seq:
                            if all(ortho in dDupl2Seq[doneDupl]
                                   for ortho in orthos):
                                already = True
                                break

                        if not already:
                            nDuplSign += 1
                            outFile = o + ORF.split("/")[-1].split(
                                ".")[0] + "_D" + str(nodeNb) + "gp" + str(
                                    nGp) + ".fasta"
                            lOut.append(outFile)

                            # create new file of orthologous sequences
                            with open(outFile, "w") as fasta:
                                fasta.write(
                                    FastaResFunc.dict2fasta(dOrtho2Seq))

                            # remove the node from the tree
                            gp.detach()

                        dDupl2Seq["{:d}-{:d}".format(nodeNb, nGp)] = orthos
                    nGp += 1

                dNb2Node.pop(nodeNb, None)

        # once all duplication events have been processed, pool the remaining
        # sequences (if they span enough species, per the user's cut-off)
        # into a new file
        if len(lOut) > 0:
            leftovers = filter(None, testTree.get_leaf_names())
            dRemain = {left: dID2Seq[left] for left in leftovers}

            if len(dRemain.keys()) > int(nbSp) - 1:
                outFile = o + ORF.split("/")[-1].split(
                    ".")[0] + "_duplication_remainingsequences.fasta"

                with open(outFile, "w") as fasta:
                    fasta.write(FastaResFunc.dict2fasta(dRemain))
                lOut.append(outFile)
            else:
                logger.info(
                    "Ignoring remaining sequences {} as they do not compose a group of enough orthologs."
                    .format(list(dRemain.keys())))

    logger.info(
        "{:d} duplications detected by Treerecs, extracting {:d} groups of at least {} orthologs."
        .format(len(dupl), nDuplSign, nbSp))

    return lOut
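
# Hedged sketch of the tree annotations treeParsing relies on: the
# reconciled tree is expected to carry NHX features, with D="Y" marking
# duplication nodes, ND a node number, and S the species of each leaf.
# The toy topology below is invented; only the feature names follow the
# code above.
import ete3

nhx = ("((a_1:1[&&NHX:S=spA],b_1:1[&&NHX:S=spB])[&&NHX:D=Y:ND=0],"
       "c_1:1[&&NHX:S=spC]);")
t = ete3.Tree(nhx, format=1)
dupl = t.search_nodes(D="Y")  # nodes annotated as duplications
dNb2Node = {int(node.ND): node for node in dupl}
assert sorted(dNb2Node) == [0]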
Example #10
def isoformAln(aln, o):
    """
	Function to cluster isoforms according to the alignment and return
	the resulting alignment.

	Isoforms are from the same species (recognized through the keyword
	xxxXxx at the beginning of their name) and have the same letters or
	indels at the same positions in the alignment.

	@param1 aln: Path to alignment
	@param2 o: Output directory
	@return outCov: Path to the file of the resulting alignment
	"""

    logger = logging.getLogger("main.alignment")
    logger.info("Clustering isoforms.")

    dRem = {}     # sequences that will not be clustered
    dId2Seq = {}  # sequences per species, keyed by isoform tag
    laln = 0      # alignment length
    for fasta in SeqIO.parse(open(aln), 'fasta'):
        post = fasta.id.find("_")
        if post != -1:  # regular format
            sp = fasta.id[:post]
            tag = fasta.id[post + 1:]
            if not sp in dId2Seq:
                dId2Seq[sp] = {}
            dId2Seq[sp][tag] = str(fasta.seq)
            if laln == 0:
                laln = len(fasta.seq)
        else:
            dRem[fasta.id] = str(fasta.seq)

    outCov = o + aln.split("/")[-1].split(".")[0] + "_clustiso.fasta"
    clustok = False  # flag to check whether any clustering has occurred
    for sp, dtagseq in dId2Seq.items():
        lclust = [list(dtagseq)]  # list of clusters of tags to be split
        for pos in range(laln):
            lclust2 = []
            for clust in lclust:
                dlet = {tag: dtagseq[tag][pos] for tag in clust}
                llet = set([x for x in dlet.values() if x != "-"])
                if len(llet) <= 1:  # one letter at most, keep the cluster
                    lclust2.append(clust)
                    continue
                else:
                    for x in llet:
                        lclust2.append(
                            [tag for tag in clust if dlet[tag] == x])
                    # conservative: gapped sequences form their own cluster,
                    # as we do not know which letter cluster to merge them
                    # with; may be improved
                    lind = [tag for tag in clust if dlet[tag] == "-"]
                    if len(lind) != 0:
                        lclust2.append(lind)
            lclust = lclust2

        # now merge the sequences in each cluster
        for clust in lclust:
            if len(clust) == 1:
                dRem[sp + "_" + clust[0]] = dtagseq[clust[0]]
            else:
                clustok = True
                ntag = clust[-1] + "_clust"
                logger.info("Clustered sequences {} into {}_{}".format(
                    ", ".join(sp + "_" + tag for tag in clust), sp, ntag))
                # consensus: at each position keep the letter over the gap
                # ("-" sorts below letters in ASCII)
                nseq = "".join([
                    max([dtagseq[tag][pos] for tag in clust])
                    for pos in range(laln)
                ])
                dRem[sp + "_" + ntag] = nseq

    if clustok:
        with open(outCov, "w") as outC:
            outC.write(FastaResFunc.dict2fasta(dRem))

        return outCov
    else:
        return aln
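
# Worked sketch of the column-wise clustering above: two isoforms of the
# same species that never disagree at an ungapped position stay in one
# cluster and are merged by taking the letter over the gap ("-" sorts
# below letters) at each column.
dtagseq = {"iso1": "ATG---", "iso2": "ATGCCC"}
clust = list(dtagseq)
laln = 6
nseq = "".join(max(dtagseq[tag][pos] for tag in clust)
               for pos in range(laln))
assert nseq == "ATGCCC"  # stored under e.g. "homSap_iso2_clust"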