コード例 #1
0
def main(proteinTree, outFile, sortAttr=None):
    """Rewrite every tree of `proteinTree` to `outFile` after sorting its leaves.

    Args:
        proteinTree: source handed to `myProteinTree.loadTree`.
        outFile: destination handed to `myFile.openFile` in write mode.
        sortAttr: when truthy, leaves are sorted on this node attribute
            (through `ProteinTree_getnodeattr`); otherwise
            `ProteinTree_getId` is used as the sort key getter.
    """
    def _node_attribute(*args):
        # Forward whatever the sorter passes, pinning the attribute name.
        return ProteinTree_getnodeattr(*args, attrname=sortAttr)

    get_attribute = _node_attribute if sortAttr else ProteinTree_getId

    with myFile.openFile(outFile, 'w') as out:
        for tree in myProteinTree.loadTree(proteinTree):
            ProteinTree_LeafSort(tree, get_attribute)
            tree.printTree(out)
コード例 #2
0
def main(ensembltree, outputfile):
    """Strip gene-split subtrees from each tree and write the pruned trees.

    Every tree loaded from `ensembltree` is walked with
    `dfw_descendants_generalized`. A child whose ``info[...]['Duplication']``
    equals 10 (treated as a gene split by this code) is removed from its
    parent together with all of its descendants. The pruned tree is printed
    to `outputfile` and summary counters are reported on stderr.

    Args:
        ensembltree: source handed to `ProteinTree.loadTree`.
        outputfile: destination handed to `myFile.openFile` in write mode.
    """
    def get_ch(tree, node):
        # Children ids only; nodes without an entry in tree.data have none.
        return [x[0] for x in tree.data.get(node, [])]

    def get_chd(tree, nodedist):
        # (child, distance) pairs of the node held in a (node, distance) pair.
        return tree.data.get(nodedist[0], [])

    count_trees = 0
    count_treenodes = []
    count_splits = 0
    count_split_desc = 0
    with myFile.openFile(outputfile, 'w') as out:
        for tree in ProteinTree.loadTree(ensembltree):
            # The counter was reported but never incremented.
            count_trees += 1
            count_treenodes.append(0)
            for (node, dist), childrendists in dfw_descendants_generalized(
                    tree, get_chd, queue=[(None, (tree.root, 0))]):
                count_treenodes[-1] += 1
                # Same key spelling as the gene-split test below.
                assert tree.info[node]['Duplication'] != 10, \
                        "Unexpected. parent node is a split gene: %s: %s" % \
                                (node, tree.info[node])
                # Iterate over a snapshot: `childrendists` may be the very
                # list stored in tree.data[node], which is edited below.
                for child, chdist in list(childrendists):
                    if tree.info[child]['Duplication'] == 10:
                        # It's a gene split
                        # Recurse through all the descendants to remove them
                        count_splits += 1
                        subtree = list(
                            dfw_pairs_generalized(tree,
                                                  get_ch,
                                                  queue=[(None, child)],
                                                  include_root=True))
                        for _, GS_descendant in reversed(subtree):
                            tree.info.pop(GS_descendant)
                            # Nodes without children may have no data entry
                            # (get_ch/get_chd already rely on `.get`).
                            tree.data.pop(GS_descendant, None)
                            count_split_desc += 1

                        tree.data[node].remove((child, chdist))

            tree.printTree(out)
    print("%d trees" % count_trees, file=stderr)
    print("treenodes:",
          " ".join(str(nn) for nn in count_treenodes),
          file=stderr)
    print("Splits: %d  Split descendants: %d" %
          (count_splits, count_split_desc),
          file=stderr)
コード例 #3
0
def loadVals(filename, func):
    """Load per-branch values from a tab-separated file.

    Each line holds two names and a number. The two names must be
    parent and child in the module-level `phylTree` (asserted). The stored
    value is ``func(number, age_difference)`` where the age difference is
    ``abs(phylTree.ages[s1] - phylTree.ages[s2])``; it is registered under
    both ``(s1, s2)`` and ``(s2, s1)``. Each parsed line is echoed on stderr.

    Args:
        filename: path handed to `myFile.openFile`; ``""`` means "no file".
        func: callable taking ``(value, age_difference)`` as two floats.

    Returns:
        dict mapping name pairs to the computed values (empty when
        `filename` is ``""``).
    """
    vals = {}
    if filename == "":
        return vals

    # `with` guarantees the handle is released even when an assertion or a
    # parsing error interrupts the loop.
    with myFile.openFile(filename, "r") as f:
        for l in f:
            t = l.replace('\n', '').split("\t")
            s1 = sys.intern(t[0])
            s2 = sys.intern(t[1])
            assert (s1 == phylTree.parent[s2][0]) or (
                s2 == phylTree.parent[s1][0]), (s1, s2)
            x = func(float(t[2]),
                     float(abs(phylTree.ages[s1] - phylTree.ages[s2])))
            vals[(s1, s2)] = x
            vals[(s2, s1)] = x
            print('%s/%s (%s)' % (s1, s2, t[2]), file=sys.stderr)
    return vals
コード例 #4
0
ファイル: count_robusts.py プロジェクト: DyogenIBENS/Phylorgs
def ancgenes_extract_genecounts(ancgenes_file,
                                ensembl_version,
                                spset=set(('Homo sapiens', ))):
    """Count, per ancestral gene, the descendant genes found in each species.

    Each non-blank line of `ancgenes_file` is whitespace-separated: the first
    token is the ancestral gene, the remaining tokens are gene names whose
    species is resolved with `convert_gene2species(g, ensembl_version)`.

    Args:
        ancgenes_file: path handed to `myFile.openFile`.
        ensembl_version: forwarded to `convert_gene2species`.
        spset: species for which the gene names themselves are kept
            (only read, never modified here).

    Returns:
        Three parallel lists, one entry per line of the file:
        the ancestral gene names, a ``defaultdict(int)`` of gene counts per
        species, and a ``defaultdict(list)`` of gene names per species
        restricted to `spset`.
    """
    anc_genecounts = []
    anc_spgenes = []
    ancgenes = []
    with myFile.openFile(ancgenes_file, 'r') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank line: nothing to unpack, skip instead of crashing.
                continue
            ancgene, *genes = fields
            ancgenes.append(ancgene)

            gcounts = defaultdict(int)
            spgenes = defaultdict(list)
            for g in genes:
                species = convert_gene2species(g, ensembl_version)
                gcounts[species] += 1
                if species in spset:
                    spgenes[species].append(g)

            anc_genecounts.append(gcounts)
            anc_spgenes.append(spgenes)

    return ancgenes, anc_genecounts, anc_spgenes
コード例 #5
0
def readerDependingOnFileWithDebAndEnd(fileName):
    """Return a lazy reader over a 5- or 6-column tab-separated genome file.

    The column count is taken from the first line of the file. Every yielded
    tuple is ``(c, beg, end, strand, gName)`` where `beg`, `end` and `strand`
    went through `intOrNone`:

    * 6 columns: the trailing transcript-name column is dropped;
    * 5 columns: when the name column holds several space-separated names
      only the first one is kept.

    Raises:
        ValueError: if the first line has neither 5 nor 6 columns.
    """
    # NOTE(review): the handle opened here is only used to peek at the first
    # line and is never closed explicitly.
    flb = myFile.firstLineBuffer(myFile.openFile(fileName, 'r'))
    nbColumns = len(flb.firstLine.split("\t"))

    if nbColumns == 6:
        print("(c, beg, end, s, gName, transcriptName) -> (c, s, gName)", file=sys.stderr)
        # c, beg, end, s, gName, transcriptName
        rows = myFile.myTSV.readTabular(fileName, [str, str, str, str, str, str])
        return ((c, intOrNone(beg), intOrNone(end), intOrNone(strand), gName)
                for (c, beg, end, strand, gName, tName) in rows)

    if nbColumns == 5:
        print("(c, beg, end, s, gName) -> (c, s, gName)", file=sys.stderr)
        # c, beg, end, s, gName(s)
        rows = myFile.myTSV.readTabular(fileName, [str, str, str, str, str])
        # The former first-row probe tested `len(gNames.split(' ')) > 0`,
        # which is always true, so the "single name" branch was unreachable.
        # Keeping the first space-separated name covers both cases: for a
        # single name, `split(' ')[0]` is that name itself.
        return ((c, intOrNone(beg), intOrNone(end), intOrNone(strand), gNames.split(' ')[0])
                for (c, beg, end, strand, gNames) in rows)

    raise ValueError("%s file is badly formatted" % fileName)
コード例 #6
0
def main():
    """Extract gene families from protein trees and write one file per ancestor.

    Command-line arguments are parsed with `myTools.checkArgs`. Families are
    computed by `extractGeneFamilies`; when the ``out:ancGenesFiles`` template
    is non-empty, each ancestor's families are written (one space-joined
    family per line) to ``template % phylTree.fileName[anc]``.
    """
    # Arguments
    arguments = myTools.checkArgs([("phylTree.conf", myTools.File),
                                   ("proteinTree", myTools.File)],
                                  [("out:ancGenesFiles", str, ""),
                                   ("reuseNames", bool, False)],
                                  __doc__)

    phylTree = myPhylTree.PhylogeneticTree(arguments["phylTree.conf"])
    proteinTrees = myProteinTree.loadTree(arguments["proteinTree"])

    # Only the families are used here; the two counters are ignored.
    _count, _dupCount, geneFamilies = extractGeneFamilies(phylTree,
                                                          proteinTrees,
                                                          arguments["reuseNames"])

    outTemplate = arguments["out:ancGenesFiles"]
    if not outTemplate:
        return

    for (anc, lst) in geneFamilies.items():
        print("Ecriture des familles de %s ..." % anc, end=' ', file=sys.stderr)
        # `with` closes the output even if a write fails midway.
        with myFile.openFile(outTemplate % phylTree.fileName[anc], "w") as f:
            for gg in lst:
                print(" ".join(gg), file=f)
        print(len(lst), "OK", file=sys.stderr)
# If necessary, reorder the chromosomes according to the 'orderChromosomesBy'
# argument; `rankOfChrHasChanged` records whether a re-sort was needed.
rankOfChrHasChanged = False
if arguments['orderChromosomesBy'] == 'decreasingNbOfGenes':
    # Key = number of genes on the chromosome, largest first.
    if not myTools.isSorted(list(genomeListByChr.items()), key=lambda x: len(x[1]), stricly=False, increasingOrder=False):
        genomeListByChr = collections.OrderedDict(sorted(list(genomeListByChr.items()), key=lambda x: len(x[1]), reverse=True))
        rankOfChrHasChanged = True
elif arguments['orderChromosomesBy'] == 'names':
    # Key = natural-sort key of the chromosome name.
    # NOTE(review): the check relies on isSorted's default direction while the
    # sort below uses reverse=True -- confirm both agree on the direction.
    if not myTools.isSorted(list(genomeListByChr.items()), key=lambda x: myTools.keyNaturalSort(x[0])):
        genomeListByChr = collections.OrderedDict(sorted(list(genomeListByChr.items()), key=lambda x: myTools.keyNaturalSort(x[0]), reverse=True))
        rankOfChrHasChanged = True
if rankOfChrHasChanged:
    # NOTE(review): this message mentions chromosome lengths even when the
    # 'names' ordering triggered the re-sort.
    print('The rank of at least one chromosome has changed while sorting chrNames using the length of chromosomes', file=sys.stderr)

# 2) If necessary rank genes by increasing beg coordinates
geneRankHasChanged = False
# NOTE(review): the loop variable `chr` shadows the builtin of the same name.
for chr, chrom in  genomeListByChr.items():
    # x[0] is the first field of each gene tuple (`beg` in the loop of step 3).
    if not myTools.isSorted(chrom, key=lambda x: x[0], stricly=False, increasingOrder=True):
        chrom.sort(key=lambda x: x[0])
        geneRankHasChanged = True
if geneRankHasChanged:
    print('The rank of at least one gene has changed while sorting using the 5\' extremities', file=sys.stderr)
# Sorting must neither add nor drop genes.
assert sum(len(chrom) for chrom in list(genomeListByChr.values())) == iniGenomeLength

# 3) Print the genome
# One tab-separated line per gene: chromosome, beg, end, strand, gene names.
f = myFile.openFile(arguments['out:genome'], 'w')
for chr, chrom in genomeListByChr.items():
    for (beg, end, s, gNames) in chrom:
        print(myFile.myTSV.printLine([chr, beg, end, s, gNames]), file=f)
f.close()

コード例 #8
0
    # Les genes des especes modernes
    genes[e] = myGenomes.Genome(arguments["in:genesFiles"] %
                                phylTree.fileName[e])
    diags[e] = []
    for (c, l) in genes[e].lstGenes.items():
        diags[e].append([((c, i), l[i].strand) for i in range(len(l))])

# For every ancestor: load its genome and its diagonals file, and check that
# every ancestral gene index appears in at least one diagonal.
for a in phylTree.listAncestr:
    # The ancestral genes
    genes[a] = myGenomes.Genome(arguments["in:ancGenesFiles"] %
                                phylTree.fileName[a])
    # The diagonals
    diags[a] = []
    # Take the opportunity to list the diagonals and the lone genes
    # (`notseen` starts with every gene index of genes[a].lstGenes[None]).
    notseen = set(range(len(genes[a].lstGenes[None])))
    f = myFile.openFile(arguments["in:diagsFiles"] % phylTree.fileName[a], "r")
    for l in f:
        t = l.split("\t")
        # Third column: whitespace-separated gene indices.
        d = [int(x) for x in t[2].split()]
        # Fourth column: whitespace-separated integers, reduced to a sign.
        s = [int(x) for x in t[3].split()]
        # Map every value to +1 when it is >= 0 and to -1 otherwise.
        s = [2 * int(x >= 0) - 1 for x in s]
        # Each diagonal is stored as a list of ((None, geneIndex), sign) pairs.
        diags[a].append(list(zip([(None, i) for i in d], s)))
        notseen.difference_update(d)
    f.close()
    # Every gene must have been seen in some diagonal.
    assert len(notseen) == 0
# print >> sys.stderr, len(notseen)
# diags[a].extend( [((None,g),0)] for g in notseen)

# Creation des dictionnaires genes -> diags
for (esp, lst) in diags.items():
    dic = {}
コード例 #9
0
#!/usr/bin/env python3
"""
	Convertit un genome (scaffolds = suite de contigs) en genome (uniquement des contigs)
"""

import sys

from LibsDyogen import myDiags, myFile, myTools, myGenomes

# `file` is not a builtin in Python 3 (this script targets python3), so the
# argument type is `myTools.File`.
arguments = myTools.checkArgs([("scaffoldsFile", myTools.File),
                               ("contigsFile", myTools.File)],
                              [], __doc__)

(diags, singletons) = myDiags.loadIntegr(arguments["scaffoldsFile"])

# Index every raw line of the contigs file by its 1-based line number.
ref = {}
with myFile.openFile(arguments["contigsFile"], "r") as f:
    for (i, l) in enumerate(f):
        ref[i + 1] = l

for (chrom, weights) in diags:
    li = []
    ls = []
    lw = []
    n = 0
    for (i, (c, s)) in enumerate(chrom):
        t = ref.pop(c)[:-1].split("\t")
        if i >= 1:
            lw.append(weights[i - 1])
        n += len(t[2].split())
        if s > 0:
コード例 #10
0
#print(extantGenes, file=sys.stderr)

for x in lstAncGenomes:
    if arguments["except2XSpecies"] == "True":
        lstDescSpecies = [
            y for y in phylTree.listSpecies
            if phylTree.dicParents[y][x] == x and y not in phylTree.lstEsp2X
        ]
    else:
        lstDescSpecies = [
            y for y in phylTree.listSpecies if phylTree.dicParents[y][x] == x
        ]

    if len(lstDescSpecies) > 0:
        f = myFile.openFile(
            arguments["OUT.ancGenesFiles"] % phylTree.fileName[x], "w")

        print(x, lstDescSpecies, file=sys.stdout)
        for ancGene in phylTree.dicGenomes[x]:
            #print(ancGene, file=sys.stderr)
            nbDesc = {}
            ancGenename = ancGene.names[0]
            for descSpecies in lstDescSpecies:
                nbDesc[descSpecies] = 0
            for modernGene in ancGene.names[1:]:

                if (modernGene in extantGenes):
                    #print("modernGene:", modernGene,extantGenes[modernGene], file=sys.stderr)
                    nbDesc[extantGenes[modernGene]] += 1
                else:
                    next
コード例 #11
0

def sort_children(tree):
    """Sort in place the child list attached to every node of `tree.data`."""
    for children in tree.data.values():
        children.sort()


if __name__ == '__main__':
    # Help requested: show the module documentation and exit normally.
    if {'-h', '--help'}.intersection(argv[1:]):
        print(__doc__)
        exit()
    # More than two positional arguments is a usage error.
    if len(argv) > 3:
        print(__doc__)
        exit(1)

    # Missing positional arguments fall back to the standard streams.
    infile = argv[1] if len(argv) > 1 else stdin
    outfile = argv[2] if len(argv) > 2 else stdout

    with myFile.openFile(outfile, 'w') as out:
        for tree in myProteinTree.loadTree(infile):
            sort_children(tree)
            tree.printTree(out)
コード例 #12
0
ファイル: mkODS.py プロジェクト: DyogenIBENS/ToolsDyogen_py3
def main():
    """Compute block-length statistics per ancestor and export them.

    For every directory of the ``dirList`` argument and every ancestor of the
    phylogenetic tree, the ancestor's diags file is read; the second
    tab-separated column of each line is taken as a block length. Lengths
    >= 2 count as blocks, the others as singletons. The per-ancestor row is
    ``[name, age, total, nbBlocks, total - singletons, %Cov, nbIntervals,
    %CovInt] + valSummary(blocks)[:-2] + [LongBlocks]`` where LongBlocks is
    the number of largest blocks needed to reach 75% of ``total - singletons``.

    Output: tab-separated tables on stdout when ``outputODS`` is empty,
    otherwise an OpenDocument spreadsheet saved to that path (one sheet per
    directory, one per ancestor, and a "Summary" sheet).
    """
    # Arguments
    arguments = myTools.checkArgs(
        [("phylTree.conf", myTools.File), ("dirList", myTools.FileList(1))],
        [("diagsFile", str, "diags/integr/diags.%s.list.bz2"), ("outputODS", str, "")],
        __doc__
    )

    # The phylogenetic tree
    phylTree = myPhylTree.PhylogeneticTree(arguments["phylTree.conf"])

    lstEspeces = sorted(set(phylTree.listAncestr))

    allEvents = list(arguments["dirList"])

    titles = [
        "AncGenes", "Blocks", "Genes in blocks", "%Cov", "NbInt", "%CovInt",
        "Min", "25%", "50%", "75%", "N75", "N50", "N25", "Max", "Mean",
        "LongBlocks"
    ]

    alldata = {}
    # Never filled in this function; kept so the optional "diff" tables below
    # stay wired for whoever populates it.
    alldiff = {}

    for events in allEvents:

        print(events, "...", end=' ', file=sys.stderr)

        # Collect the block-length data
        alldata[events] = data = {}
        for e in lstEspeces:
            lst = []
            sing = 0
            tot = 0
            interv = 0
            path = events + "/" + (arguments["diagsFile"] % phylTree.fileName[e])
            # `with` releases the handle even if a line fails to parse.
            with myFile.openFile(path, "r") as f:
                for l in f:
                    x = int(l.split("\t")[1])
                    tot += x
                    if x >= 2:
                        lst.append(x)
                        interv += (x - 1)
                    else:
                        sing += 1

            # NOTE(review): an empty file (tot == 0) or tot == 20 raises
            # ZeroDivisionError here; the meaning of the constant 20 is not
            # visible in this function.
            data[e] = [
                e, phylTree.ages[e], tot,
                len(lst), tot - sing, (100. * (tot - sing)) / tot, interv,
                (100. * interv) / (tot - 20.)
            ]
            data[e].extend(myMaths.myStats.valSummary(lst)[:-2])

            # Count how many of the largest blocks are needed to reach 75%
            # of (tot - sing).
            lstSort = sorted(lst)
            nbBlock = 0
            ValKaryo75 = (tot - sing) * 75 / 100
            Karyo75 = 0
            while Karyo75 < ValKaryo75:
                Karyo75 += lstSort.pop()
                nbBlock += 1

            data[e].append(nbBlock)
            print(e, "...", nbBlock, "...", end=' ', file=sys.stderr)

        print("OK", file=sys.stderr)

    if arguments["outputODS"] == "":
        for events in allEvents:
            print(events, file=sys.stdout)
            print(myFile.myTSV.printLine(["Ancestor", "Age (My)"] + titles))
            for e in lstEspeces:
                print(myFile.myTSV.printLine(alldata[events][e]))
            # This test used to sit outside the loop, where it only looked at
            # the last directory (and failed when there was none).
            if events in alldiff:
                print(
                    myFile.myTSV.printLine(
                        ["Ancestor", "Age (My)", "%Useful Gene Loss"] + titles))
                for e in lstEspeces:
                    print(myFile.myTSV.printLine(alldiff[events][e]))
        return

    import odf.opendocument
    from odfpy_datatable import DataTable

    textdoc = odf.opendocument.OpenDocumentSpreadsheet()

    def addSheet(name, rows):
        """Append `rows` to the document as a sheet called `name`."""
        table = DataTable(rows)
        table.datasourcehaslabels = "both"
        t = table()
        t.setAttribute('name', name)
        textdoc.spreadsheet.addElement(t)

    # One sheet per directory with the raw statistics
    for events in allEvents:
        addSheet(events,
                 [["Ancestor", "Age (My)"] + titles] +
                 [alldata[events][e] for e in lstEspeces])

    # One sheet per ancestor (name and age columns dropped)
    for esp in lstEspeces:
        addSheet(esp,
                 [["events"] + titles] +
                 [[events] + alldata[events][esp][2:] for events in allEvents])

    # Final summary: for each statistic, a header row followed by one row per
    # directory. The number is the position of the statistic in a data row.
    summaryColumns = [("N50", 13), ("Mean", 16), ("NbBlocks", 3),
                      ("MaxLength", 15), ("LongBlocks", 17)]
    val = []
    for (label, column) in summaryColumns:
        val.append([label, "events"] + list(lstEspeces))
        for events in allEvents:
            val.append(["", events] +
                       [alldata[events][e][column] for e in lstEspeces])
    addSheet("Summary", val)

    textdoc.save(arguments["outputODS"])
コード例 #13
0
    def calc(anc, val):
        """Scale the cutoff `val` for ancestor `anc` by its share of non-2X species."""
        descendants = phylTree.species[anc]
        nbSpecies = len(descendants)
        nb2X = len(phylTree.lstEsp2X.intersection(descendants))
        # Half of the non-2X species saw the duplication (at least 1 species)
        ratio = max(1., val * (nbSpecies - nb2X)) / nbSpecies
        return round(ratio, 3) - 2e-3


# Minimum duplication score per taxon, filled from the "cutoff" argument.
minDuplicationScore = {}
try:
    # A single limit for everybody: "cutoff" parses as a number.
    val = float(arguments["cutoff"])
    for anc in phylTree.listAncestr:
        minDuplicationScore[anc] = calc(anc, val)
except ValueError:
    # Otherwise "cutoff" is opened as a file whose lines hold a taxon name
    # followed by its own limit (whitespace-separated).
    f = myFile.openFile(arguments["cutoff"], "r")
    for l in f:
        t = l.split()
        anc = phylTree.officialName[t[0]]
        minDuplicationScore[anc] = calc(anc, float(t[1]))
    f.close()
print("minDuplicationScore:", minDuplicationScore, file=sys.stderr)

# Scores in the tree for modern species are always 1, they must always be accepted
for esp in phylTree.listSpecies:
    minDuplicationScore[esp] = 0

# Starting id for nodes created later, taken from the "newNodeID" argument.
myProteinTree.nextNodeID = arguments["newNodeID"]


@myTools.memoize
コード例 #14
0
def setupScoring(phylTree, scoreMethod=1, cutoff=-1):
    """Build the `hasLowScore(tree, rnode)` predicate for duplication nodes.

    Args:
        phylTree: species tree; its `listAncestr`, `listSpecies`, `species`,
            `lstEsp2X` and `officialName` attributes are read.
        scoreMethod: 1 or 3 use the cutoff as is; 2 rescales it per ancestor
            by the share of species not in `lstEsp2X`; 3 additionally
            restricts the species sets to non-2X species when scoring.
            Any other value leaves the cutoff transform undefined, which
            fails as soon as a non-negative cutoff has to be applied.
        cutoff: either a number (one limit for every ancestor; a negative
            value disables scoring) or the path of a file whose lines hold
            an ancestor name followed by its limit.

    Returns:
        `alwaysFalse` when the numeric cutoff is negative, otherwise a
        function ``hasLowScore(tree, rnode)`` returning True when the
        species overlap between the children of `rnode` is below the limit
        registered for the taxon of `rnode`.
    """

    # Automatic duplication-score limits
    if scoreMethod in [1, 3]:
        def calc(anc, val):
            return val
    elif scoreMethod == 2:
        def calc(anc, val):
            nesp = len(phylTree.species[anc])
            n2X = len(phylTree.lstEsp2X.intersection(phylTree.species[anc]))
            # Half of the non-2X species saw the duplication (at least 1 species)
            return round(max(1., val*(nesp-n2X)) / nesp, 3) - 2e-3

    minDuplicationScore = {}
    try:
        # Only the conversion is guarded, so an error raised while applying
        # the cutoff is no longer mistaken for "cutoff is a file name".
        val = float(cutoff)
    except ValueError:
        # One limit per ancestor, read from a file.
        with myFile.openFile(cutoff, "r") as f:
            for l in f:
                t = l.split()
                anc = phylTree.officialName[t[0]]
                minDuplicationScore[anc] = calc(anc, float(t[1]))
    else:
        # Shortcut
        if val < 0:
            return alwaysFalse

        # One limit for everybody
        for anc in phylTree.listAncestr:
            minDuplicationScore[anc] = calc(anc, val)
    logger.debug("minDuplicationScore:\n%s", minDuplicationScore)

    # Scores in the tree for modern species are always 1, they must always
    # be accepted
    for esp in phylTree.listSpecies:
        minDuplicationScore[esp] = 0

    @myTools.memoize
    def goodSpecies(anc):
        return phylTree.species[anc].difference(phylTree.lstEsp2X)

    # This is a Jaccard Index of species on each side of the duplication.
    def hasLowScore(tree, rnode):

        logger.debug("# hasLowScore is used.")

        @myTools.memoize
        def getSpeciesSets(node):
            if node in tree.data:
                return set().union(*(getSpeciesSets(x) for (x, _) in tree.data[node]))
            else:
                logger.debug('Node without data (leaf) at %r', tree.info[node]["taxon_name"])
                assert tree.info[node]["taxon_name"] in phylTree.listSpecies
                return set([tree.info[node]["taxon_name"]])

        if rnode not in tree.data:
            return False

        speciessets = [getSpeciesSets(x) for (x, _) in tree.data[rnode]]
        # Species present under at least two children of rnode.
        inters = set()
        for (s1, s2) in itertools.combinations(speciessets, 2):
            inters.update(s1.intersection(s2))
        # Named so that the builtin `all` is not shadowed.
        allSpecies = set().union(*speciessets)
        anc = tree.info[rnode]["taxon_name"]

        if scoreMethod == 3:
            inters.intersection_update(goodSpecies(anc))
            allSpecies.intersection_update(goodSpecies(anc))
        return ((len(inters) == 0) and (minDuplicationScore[anc] == 0)) or (len(inters) < (minDuplicationScore[anc] * len(allSpecies)))
    ###TODO: this should update the 'duplication_confidence_score' tag.

    return hasLowScore
コード例 #15
0
	Run the XMLfile BIOMART Query
	Usage:
		./ENSEMBL.biomartQuery.py XMLfiles/BIOMART.HumanProteinCodingGene.xml   -> will generate ouput.txt
		./ENSEMBL.biomartQuery.py XMLfiles/BIOMART.HumanProteinCodingGene.xml -outputFileName=HumanProteinCodingGene.txt
"""

from __future__ import print_function

import sys
import urllib.request, urllib.parse, urllib.error

from LibsDyogen import myFile, myTools

# Arguments
arguments = myTools.checkArgs(
    [("xmlRequest", myTools.File)],
    [
        ("biomartServer", str, "http://www.ensembl.org/biomart/martservice"),
        ("outputFileName", str, "output.txt"),
    ],
    __doc__)

# The query: the whole XML file is sent as the "query" form field
with myFile.openFile(arguments["xmlRequest"], "r") as f:
    request = f.read()
postData = urllib.parse.urlencode({"query": request}).encode()

print("Downloading XML Query", end=' ', file=sys.stderr)
# Passing `data` makes urlretrieve issue a POST; the reply is written
# straight to the output file.
urllib.request.urlretrieve(
    arguments["biomartServer"],
    filename=arguments["outputFileName"],
    data=postData)
print("OK", file=sys.stderr)
コード例 #16
0
def main():
    """Compute block-length statistics per ancestor and export them.

    For every directory of the ``dirList`` argument and every ancestor of the
    phylogenetic tree, the ancestor's diags file is read; the second
    tab-separated column of each line is taken as a block length. Lengths
    >= 2 count as blocks, the others as singletons. The per-ancestor row is
    ``[name, age, total, nbBlocks, total - singletons, %Cov, nbIntervals,
    %CovInt] + valSummary2(blocks)[:-2] + [LongBlocks]`` where LongBlocks is
    the number of largest blocks needed to reach 75% of ``total - singletons``.

    Ancestors are listed in a hand-tuned clade order when the expected taxa
    exist in the tree, and alphabetically otherwise.

    Output: tab-separated tables on stdout when ``outputODS`` is empty,
    otherwise an OpenDocument spreadsheet saved to that path (one sheet per
    directory, one per ancestor, and a "Summary" sheet).
    """
    # Arguments
    # `file` is not a builtin in Python 3; `myTools.File` is the argument
    # type used for file arguments.
    arguments = myTools.checkArgs(
        [("phylTree.conf", myTools.File), ("dirList", myTools.FileList(1))],
        [("diagsFile", str, "diags/integr/final/anc/diags.%s.list.bz2"), ("outputODS", str, "")],
        __doc__
    )

    # The phylogenetic tree
    phylTree = myPhylTree.PhylogeneticTree(arguments["phylTree.conf"])

    # List of ancestors in display order
    todo = set(phylTree.listAncestr)
    try:
        # The species key was the garbled "H**o sapiens", which can never be
        # found and silently forced the alphabetical fallback below.
        l1 = phylTree.dicLinks["Euteleostomi"]["Homo sapiens"][:-1]
        todo.difference_update(l1)
        l2 = phylTree.dicLinks["Glires"]["Murinae"]
        todo.difference_update(l2)
        l3 = [e for e in todo if phylTree.isChildOf(e, "Mammalia")]
        l3 = sorted(l3, key=lambda e: phylTree.ages[e], reverse=True)
        todo.difference_update(l3)
        l4 = [e for e in todo if phylTree.isChildOf(e, "Clupeocephala")]
        l4 = sorted(l4, key=lambda e: phylTree.ages[e], reverse=True)
        todo.difference_update(l4)
        l5 = [e for e in todo if phylTree.isChildOf(e, "Amniota")]
        l5 = sorted(l5, key=lambda e: phylTree.ages[e], reverse=True)
        todo.difference_update(l5)
        l6 = sorted(todo, key=lambda e: phylTree.ages[e], reverse=True)
        lstEspeces = l6 + l5 + l4 + l1 + l3 + l2
    except KeyError:
        # One of the expected taxa is missing from this tree.
        lstEspeces = sorted(phylTree.listAncestr)

    allEvents = list(arguments["dirList"])

    titles = [
        "AncGenes", "Blocks", "Genes in blocks", "%Cov", "NbInt", "%CovInt",
        "Min", "25%", "50%", "75%", "N75", "N50", "N25", "WeigthedAverage",
        "Max", "Mean", "LongBlocks"
    ]

    alldata = {}
    # Never filled in this function; kept so the optional "diff" tables below
    # stay wired for whoever populates it.
    alldiff = {}

    for events in allEvents:

        print(events, "...", end=' ', file=sys.stderr)

        # Collect the block-length data
        alldata[events] = data = {}
        for e in lstEspeces:
            lst = []
            sing = 0
            tot = 0
            interv = 0
            path = events + "/" + (arguments["diagsFile"] % phylTree.fileName[e])
            # `with` releases the handle even if a line fails to parse.
            with myFile.openFile(path, "r") as f:
                for l in f:
                    x = int(l.split("\t")[1])
                    tot += x
                    if x >= 2:
                        lst.append(x)
                        interv += (x - 1)
                    else:
                        sing += 1

            # NOTE(review): an empty file (tot == 0) or tot == 20 raises
            # ZeroDivisionError here; the meaning of the constant 20 is not
            # visible in this function.
            data[e] = [
                e, phylTree.ages[e], tot,
                len(lst), tot - sing, (100. * (tot - sing)) / tot, interv,
                (100. * interv) / (tot - 20.)
            ]
            data[e].extend(myMaths.myStats.valSummary2(lst)[:-2])

            # Count how many of the largest blocks are needed to reach 75%
            # of (tot - sing).
            lstSort = sorted(lst)
            nbBlock = 0
            ValKaryo75 = (tot - sing) * 75 / 100
            Karyo75 = 0
            while Karyo75 < ValKaryo75:
                Karyo75 += lstSort.pop()
                nbBlock += 1

            data[e].append(nbBlock)
            print(e, "...", nbBlock, "...", end=' ', file=sys.stderr)

        print("OK", file=sys.stderr)

    if arguments["outputODS"] == "":
        for events in allEvents:
            print(events, file=sys.stdout)
            print(myFile.myTSV.printLine(["Ancestor", "Age (My)"] + titles))
            for e in lstEspeces:
                print(myFile.myTSV.printLine(alldata[events][e]))
            # This test used to sit outside the loop, where it only looked at
            # the last directory (and failed when there was none).
            if events in alldiff:
                print(
                    myFile.myTSV.printLine(
                        ["Ancestor", "Age (My)", "%Useful Gene Loss"] + titles))
                for e in lstEspeces:
                    print(myFile.myTSV.printLine(alldiff[events][e]))
        return

    import odf.opendocument
    from odfpy_datatable import DataTable

    textdoc = odf.opendocument.OpenDocumentSpreadsheet()

    def addSheet(name, rows):
        """Append `rows` to the document as a sheet called `name`."""
        table = DataTable(rows)
        table.datasourcehaslabels = "both"
        t = table()
        t.setAttribute('name', name)
        textdoc.spreadsheet.addElement(t)

    # One sheet per directory with the raw statistics
    for events in allEvents:
        addSheet(events,
                 [["Ancestor", "Age (My)"] + titles] +
                 [alldata[events][e] for e in lstEspeces])

    # One sheet per ancestor (name and age columns dropped)
    for esp in lstEspeces:
        addSheet(esp,
                 [["events"] + titles] +
                 [[events] + alldata[events][esp][2:] for events in allEvents])

    # For the curves: for each statistic, a header row followed by one row
    # per directory. Each entry is (label, position in a data row, optional
    # conversion applied to every cell).
    summaryColumns = [
        ("AncGenes", 2, None),
        ("WeigthedAverage", 15, int),
        ("N50", 13, None),
        ("Mean", 17, None),
        ("NbBlocks", 3, None),
        ("MaxLength", 16, None),
        ("LongBlocks", 18, None),
    ]
    val = []
    for (label, column, convert) in summaryColumns:
        val.append([label, "events"] + list(lstEspeces))
        for events in allEvents:
            cells = [alldata[events][e][column] for e in lstEspeces]
            if convert is not None:
                cells = [convert(c) for c in cells]
            val.append(["", events] + cells)
    addSheet("Summary", val)

    textdoc.save(arguments["outputODS"])
コード例 #17
0
    ("OUT.geneInfo", str, "geneInfoFromTrees.txt"),
], __doc__)

phylTree = myPhylTree.PhylogeneticTree(arguments["phylTree.conf"])

###################################
# Loading of the Ensembl database #
###################################

# Load the taxon_id -> species name links
##########################################
print("Chargement des liens taxon_id -> species_name ...",
      end=' ',
      file=sys.stderr)
taxonName = {}
# The table file lives under IN.EnsemblURL and is named by IN.genome_db.
f = myFile.openFile(
    os.path.join(arguments["IN.EnsemblURL"], arguments["IN.genome_db"]), "r")
for ligne in myFile.myTSV.MySQLFileLoader(f):
    t = ligne.split("\t")
    # Second column is used as the key, third column as the value.
    taxonName[t[1]] = t[2]
f.close()
print(len(taxonName), "especes OK", file=sys.stderr)

# On charge les liens member_id -> protein name
################################################
print("Chargement des liens member_id -> protein_name ...",
      end=' ',
      file=sys.stderr)
tmpLinks = {}
f = myFile.openFile(
    os.path.join(arguments["IN.EnsemblURL"], arguments["IN.member"]), "r")
for ligne in myFile.myTSV.MySQLFileLoader(f):
コード例 #18
0
            info["gene_name"] = x

        print("%sinfo\t%s" % (indent, info))

        if node in tree.items:
            indent = indent + "\t"
            for (e, l) in tree.items[node]:
                print("%slen\t%g" % (indent, l))
                printTree(indent, e)

    printTree("", tree.root)


arguments = myTools.checkArgs([("tree", myTools.File)], [], __doc__)

# First pass over the input: count its lines so that the progress bar
# knows the total.
f = myFile.openFile(arguments["tree"], "r")
totalNbLines = sum(1 for line in f)
f.close()
# Reopen the same input for the actual processing pass.
f = myFile.openFile(arguments["tree"], "r")
progressBar = myTools.ProgressBar(totalNbLines)
# Number of lines read so far.
cptLines = 0
for line in f:
    cptLines += 1
    progressBar.printProgressIn(sys.stderr, cptLines)
    if len(line.replace(" ", "").replace("\n", "")) == 0:
        #Do nothing : empty line
        continue
    elif line.find(";\n"):
        processData(line)
    else:
        raise NameError(
コード例 #19
0
def main():
    """Report block-length statistics of every ancestor for each cutoff directory.

    Command-line arguments (parsed by ``myTools.checkArgs``):
      * ``phylTree.conf``: species tree file.
      * ``dirList``: the cutoff directories to compare; the first one is the
        reference that the other ones are diffed against.
      * ``diagsFile``: path template (``%s`` = ancestor file name) of the block
        file inside each cutoff directory; the second tab-separated column of
        each line is read as the block size.
      * ``outputODS``: when empty, the tables are printed as TSV on stdout,
        otherwise they are saved as sheets of that OpenDocument spreadsheet.

    Raises whatever the file/parsing helpers raise; a ``ZeroDivisionError`` is
    possible when an ancestor has no gene at all (``tot == 0``).
    """
    # Arguments
    arguments = myTools.checkArgs(
        [("phylTree.conf", myTools.File), ("dirList", myTools.FileList(1))],
        [("diagsFile", str, "diags/integr/final/anc/diags.%s.list.bz2"), ("outputODS", str, "")],
        __doc__
    )

    # The phylogenetic tree
    phylTree = myPhylTree.PhylogeneticTree(arguments["phylTree.conf"])

    def oldestFirst(names):
        # Sort ancestor names by decreasing age.
        return sorted(names, key=lambda e: phylTree.ages[e], reverse=True)

    # List of the ancestors in a meaningful display order
    todo = set(phylTree.listAncestr)
    try:
        # The species name used to read "H**o sapiens" (a censoring artefact),
        # which always raised KeyError and silently forced the alphabetical
        # fallback below.
        l1 = phylTree.dicLinks["Euteleostomi"]["Homo sapiens"][:-1]
        todo.difference_update(l1)
        l2 = phylTree.dicLinks["Glires"]["Murinae"]
        todo.difference_update(l2)
        l3 = oldestFirst([e for e in todo if phylTree.isChildOf(e, "Mammalia")])
        todo.difference_update(l3)
        l4 = oldestFirst([e for e in todo if phylTree.isChildOf(e, "Clupeocephala")])
        todo.difference_update(l4)
        l5 = oldestFirst([e for e in todo if phylTree.isChildOf(e, "Amniota")])
        todo.difference_update(l5)
        l6 = oldestFirst(todo)
        lstEspeces = l6 + l5 + l4 + l1 + l3 + l2
    except KeyError:
        # The tree does not contain the expected clades: plain alphabetical order.
        lstEspeces = sorted(phylTree.listAncestr)

    allCutoff = arguments["dirList"]

    titles = ["AncGenes", "Blocks", "Genes in blocks", "%Cov", "NbInt", "%CovInt", "Min", "25%", "50%", "75%", "N75", "N50",
              "N25", "Max", "Mean"]
    dataHeader = ["Ancestor", "Age (My)"] + titles
    diffHeader = ["Ancestor", "Age (My)", "%Useful Gene Loss"] + titles

    alldata = {}
    alldiff = {}

    for cutoff in allCutoff:

        print(cutoff, "...", end=' ', file=sys.stderr)

        # Collect the block-length data of this cutoff
        alldata[cutoff] = data = {}
        for e in lstEspeces:

            lst = []    # sizes of the blocks of at least 2 genes
            sing = 0    # number of singletons
            tot = 0     # total number of genes
            interv = 0  # number of intervals inside the blocks
            path = cutoff + "/" + (arguments["diagsFile"] % phylTree.fileName[e])
            # "with" closes the file even if a line cannot be parsed.
            with myFile.openFile(path, "r") as f:
                for line in f:
                    x = int(line.split("\t")[1])
                    tot += x
                    if x >= 2:
                        lst.append(x)
                        interv += (x - 1)
                    else:
                        sing += 1

            # NOTE(review): the constant 20 in the interval coverage is not
            # explained anywhere in this function -- confirm its meaning.
            data[e] = [e, phylTree.ages[e], tot, len(lst), tot - sing, (100. * (tot - sing)) / tot, interv,
                       (100. * interv) / (tot - 20.)]
            data[e].extend(myMaths.myStats.valSummary(lst)[:-2])

        if cutoff == allCutoff[0]:
            # The first cutoff is the reference of all the differences.
            ref = data

        # Differences with the reference; the two first columns (name, age)
        # are copied unchanged and a "%Useful Gene Loss" column is inserted.
        alldiff[cutoff] = diff = {}
        for e in lstEspeces:
            newdata = [(x - ref[e][i] if i >= 2 else x) for (i, x) in enumerate(data[e])]
            newdata.insert(2, 100 * (1. - float(newdata[4]) / newdata[2]) if newdata[2] != 0 else None)
            diff[e] = newdata
        print("OK", file=sys.stderr)

    if arguments["outputODS"] == "":
        for cutoff in allCutoff:
            print(myFile.myTSV.printLine(dataHeader))
            for e in lstEspeces:
                print(myFile.myTSV.printLine(alldata[cutoff][e]))
            # The diff table belongs to each cutoff, as in the ODS output; it
            # used to sit outside the loop and was printed for the last cutoff
            # only.
            if cutoff in alldiff:
                print(myFile.myTSV.printLine(diffHeader))
                for e in lstEspeces:
                    print(myFile.myTSV.printLine(alldiff[cutoff][e]))

    else:
        import odf.opendocument
        from odfpy_datatable import DataTable

        textdoc = odf.opendocument.OpenDocumentSpreadsheet()

        def addSheet(rows, name):
            # Append `rows` (header row first) to the document as a sheet called `name`.
            table = DataTable(rows)
            table.datasourcehaslabels = "both"
            t = table()
            t.setAttribute('name', name)
            textdoc.spreadsheet.addElement(t)

        for cutoff in allCutoff:
            valCutoff = cutoff.split("/")[-1]

            # First table: the raw statistics
            addSheet([dataHeader] + [alldata[cutoff][e] for e in lstEspeces], valCutoff)

            if cutoff in alldiff:
                # Second table: the differences with the reference
                addSheet([diffHeader] + [alldiff[cutoff][e] for e in lstEspeces], "d" + valCutoff)

        # One table per ancestor, with one row per cutoff
        for esp in lstEspeces:
            val = [["cutoff"] + titles]
            for cutoff in allCutoff:
                valCutoff = cutoff.split("/")[-1]
                val.append([valCutoff] + alldata[cutoff][esp][2:])
            addSheet(val, esp)

        # Final summary
        val = [["cutoff", "Mean gain", "Median gain", "N50 gain", "%Cov gain", "%CovInt gain", "BlockLength %gain (mean)",
                "BlockLength %gain (Median)", "BlockLength %gain (N50)", "Cov %gain", "CovInt %gain"]]
        # Column indices in the diff rows; the same columns sit at i - 1 in
        # the raw rows because the diff rows carry one extra inserted column.
        summaryCols = [17, 12, 14, 6, 8]
        base = alldata[allCutoff[0]]
        for cutoff in allCutoff:
            valCutoff = cutoff.split("/")[-1]
            gains = [myMaths.myStats.mean([alldiff[cutoff][e][i] for e in lstEspeces]) for i in summaryCols]
            pctGains = [myMaths.myStats.mean([100 * float(alldata[cutoff][e][i - 1] - base[e][i - 1]) / base[e][i - 1]
                                              for e in lstEspeces]) for i in summaryCols]
            val.append([valCutoff] + gains + pctGains)
        addSheet(val, "cutoff")

        textdoc.save(arguments["outputODS"])
コード例 #20
0
#! /usr/bin/env python
"""
	Decoupe un fichier d'arbres en fichiers separes

	usage:
			./splitTrees.py GeneTreeForest.phylTree.bz2 Fam.%s
"""

from LibsDyogen import myFile, myTools, myProteinTree

arguments = myTools.checkArgs([("proteinTree", myTools.File), ("output", str)],
                              [], __doc__)

# Write each tree of the forest to its own file.  The index printed on stdout
# is 0-based while the output file names are numbered from 1.
for (i, tree) in enumerate(myProteinTree.loadTree(arguments["proteinTree"])):
    print(i)
    # The context manager closes the output even if printTree raises.
    with myFile.openFile(arguments["output"] % (i + 1), "w") as f:
        tree.printTree(f)
コード例 #21
0
	Min  [Q25/Q50/Q75]  [N75/N50/N25]   Max   [Mean/Stddev-Length]
	
	
	Usage: ./printStats.py filename 
		./printStats.py filename +long +colNames
	
"""

from LibsDyogen import myFile, myMaths, myTools

# `file` is not a builtin in Python 3 (NameError at start-up); the sibling
# scripts declare file arguments with myTools.File.
arguments = myTools.checkArgs([("file", myTools.File)], [("long", bool, False),
                                                         ("colNames", bool, False)],
                              __doc__)

# Collect every whitespace-separated token of the input as a number.
lst = []
with myFile.openFile(arguments["file"], 'r') as f:
    for l in f:
        c = l.split()
        for x in c:
            # Keep integers as int; anything else must parse as a float
            # (a non-numeric token raises ValueError, as before).
            try:
                x = int(x)
            except ValueError:
                x = float(x)
            lst.append(x)

# returns results

if arguments["long"]:
    if arguments["colNames"]:
コード例 #22
0
def main():
    """Print, for every ancestor, the number of species comparisons and block statistics.

    Command-line arguments (parsed by ``myTools.checkArgs``):
      * ``phylTree.conf``: species tree file.
      * ``diags``: optional path template (``%s`` = ancestor file name) of the
        block file; the second tab-separated column of each line is read as the
        block size.  When empty, the three block columns are printed as "NONE".
      * ``colNames``: print a header line first.

    One TSV line per ancestor is written on stdout.
    """
    # Arguments
    arguments = myTools.checkArgs(
        [("phylTree.conf", myTools.File)], [("diags", str, ""), ("colNames", bool, False)],
        __doc__
    )

    # The phylogenetic tree
    phylTree = myPhylTree.PhylogeneticTree(arguments["phylTree.conf"])

    if arguments["colNames"]:
        print(myFile.myTSV.printLine([
            "Ancestor", "NbComp", "Nb(In/Out)Comp", "Nb(In/In)Comp", "Age",
            "MeanSize_OfBlocks", "N50Size_OfBlocks", "WASize_OfBlocks",
            "NbComp/Age"
        ]),
              file=sys.stdout)

    for anc in phylTree.listAncestr:
        # Number of outgroup species
        nb_outgroup = len(phylTree.outgroupSpecies[anc])

        # Number of species below each child branch of the ancestor
        nbInSpec = [len(phylTree.species[x]) for (x, _) in phylTree.items[anc]]

        # Every group taking part in a comparison: the child branches plus
        # the outgroup.
        allGroups = nbInSpec + [nb_outgroup]

        # Comparisons ingroup species / outgroup species
        compInOut = sum(nb_outgroup * n1 for n1 in nbInSpec)

        # Comparisons between ingroup species of different branches
        compInIn = sum(n1 * n2
                       for (n1, n2) in itertools.combinations(nbInSpec, 2))

        # All the comparisons
        nbc = sum(n1 * n2 for (n1, n2) in itertools.combinations(allGroups, 2))

        # Block statistics
        if arguments["diags"] != "":
            # "with" closes the file even if a line cannot be parsed.
            with myFile.openFile(arguments["diags"] % phylTree.fileName[anc],
                                 "r") as f:
                sizes = (int(line.split("\t")[1]) for line in f)
                # Singletons are not blocks.
                r = [x for x in sizes if x > 1]
            totalStat = myMaths.myStats.valSummary2(r)
            # Positions 9, 6 and 7 feed the Mean / N50 / WA columns of the header.
            meanSize = totalStat[9]
            n50Size = totalStat[6]
            waSize = int(totalStat[7])
        else:
            # Without a diags file there is nothing to summarise.  Indexing
            # the empty statistics list raised IndexError on this (default)
            # path, so the "NONE" placeholder is printed instead.
            meanSize = n50Size = waSize = "NONE"

        print(
            myFile.myTSV.printLine([
                anc, nbc, compInOut, compInIn, phylTree.ages[anc],
                meanSize, n50Size, waSize,
                float(nbc) / phylTree.ages[anc]
            ]))
コード例 #23
0
                    newAnc, newLastWritten))

    else:  # when the node is a leaf
        allGenes = [tree.info[node]["gene_name"]]

    for a in toWrite:  # 'a'= name of the ancestor to print
        geneFamilies[a].append(
            [currName] + allGenes
        )  # write the name of the gene of Anc followed by the names of the genes of the children (this is done for all the species in toWrite)
        #FIXME geneFamilies is defined in the main, it is modified whereas it is not even a parameter

    return allGenes  # for the recurrence


# Families collected per ancestor name; filled as a side effect of
# extractGeneFamilies.
geneFamilies = collections.defaultdict(list)
for tree in myProteinTree.loadTree(
        arguments["geneTreeForest"]):  # for all gene trees in the forest
    extractGeneFamilies(
        tree.root, tree.info[tree.root]["tree_name"], None, None
    )  # FIXME this function modifies tree and geneFamilies even if tree and gene families are not parameters
    tree.printTree(sys.stdout)

# Write one file per ancestor, one family (space-separated names) per line.
for (anc, lst) in geneFamilies.items():
    print("Write %s family ..." % anc, end=' ', file=sys.stderr)
    # The context manager closes the output even if a family cannot be
    # written (e.g. a non-string name makes join() raise).
    with myFile.openFile(arguments["out:ancGenes"] % speciesTree.fileName[anc],
                         "w") as f:
        for gg in lst:
            print(" ".join(gg), file=f)
    print(len(lst), "OK", file=sys.stderr)
コード例 #24
0
"""

import sys
import io

from LibsDyogen import myFile, myTools, myPhylTree

# `file` is not a builtin in Python 3 (NameError at start-up); the sibling
# scripts declare file arguments with myTools.File.
arguments = myTools.checkArgs([("IN.protein_tree_tag", myTools.File)], [], __doc__)

# Per-node tag values, keyed by the first column of the tag file.
dicTaxonName = {}
dicTaxonID = {}
dicTaxonAlias = {}

# Load the data
print("Chargement des tags ...", end=' ', file=sys.stderr)
# NOTE(review): the handle is left open here, as in the original; confirm it
# is closed further down.
f = myFile.openFile(arguments["IN.protein_tree_tag"], "r")
for ligne in f:
    # Strip only the line terminator: slicing off the last character
    # unconditionally ate real data on an unterminated final line.
    t = ligne.rstrip("\n").split("\t")
    if t[1] == "taxon_name":
        dicTaxonName[t[0]] = t[2]
    elif t[1] == "taxon_id":
        dicTaxonID[t[0]] = t[2]
    elif (t[1] == "taxon_alias") and (t[0] not in dicTaxonAlias):
        # A plain alias never replaces an alias that is already recorded...
        dicTaxonAlias[t[0]] = t[2]
    elif t[1] == "taxon_alias_mya":
        # ...whereas a "mya" alias always takes precedence.
        dicTaxonAlias[t[0]] = t[2]
    elif t[1] == "species_tree_string":
        tree = t[2]
print("OK (lengths:",
      len(dicTaxonName),
      len(dicTaxonID),