Esempio n. 1
0
def getRange(s):
    """Expand a range specification into the integers it designates.

    If `myFile.hasAccess(s)` is true, `s` is opened as a file and every
    whitespace-separated token of every line is converted to an int; the
    values are returned as a list, in file order.

    Otherwise `s` is parsed as "start:end" and the inclusive range
    start..end is returned.

    Raises ValueError when a token or a bound is not a valid integer (this
    includes a `s` without ':' since the end bound is then empty).
    """
    if myFile.hasAccess(s):
        f = myFile.openFile(s, "r")
        try:
            # split() without argument already discards the trailing newline
            return [int(x) for l in f for x in l.split()]
        finally:
            # release the handle even when a token is not an integer
            f.close()
    (start, _, end) = s.partition(":")
    return range(int(start), int(end) + 1)
Esempio n. 2
0
def loadTree(name):
    """Iterate over the gene trees stored in a forest file.

    `name` is either a path (opened with myFile.openFile) or an object that
    is used directly as the line iterator.  Empty lines and lines starting
    with '#' are skipped.  Every other line is split on tabs: the number of
    fields minus two is the indentation level, and the last two fields are
    the (key, value) pair.

    Yields one ProteinTree per root.  An input without any data line yields
    nothing.  The input is closed when the generator finishes, fails or is
    closed by the caller.  Progress messages are written to sys.stderr.
    """

    ns = myTools.Namespace()

    # return the buffered line and buffer the next one
    def nextLine():
        old = ns.curr
        try:
            l = ""
            while (l == "") or l.startswith("#"):
                # the final '\n' is removed before the line is examined
                l = f.next().replace('\n', '')
            l = l.split('\t')
            # the triplet (indentation, key, value) is recorded
            ns.curr = (len(l)-2, l[-2], l[-1])
        except StopIteration:
            # end of input: nothing left in the buffer
            ns.curr = None
        return old

    # recursive analysis of the lines describing one node and its subtree
    def recLoad(tree, indent):

        # id of the node
        currID = int(nextLine()[2])
        # associated information
        # NOTE(review): eval() executes whatever expression the file
        # contains; only load files from a trusted source.
        tree.info[currID] = eval(nextLine()[2])

        # children are the following blocks indented one level deeper
        child = []
        while (ns.curr is not None) and (ns.curr[0] == indent+1):
            length = float(nextLine()[2])
            child.append((recLoad(tree, indent+1), length))
        if len(child) > 0:
            tree.data[currID] = child

        return currID

    print >> sys.stderr, "Loading the forest of gene trees %s ..." % name,
    f = myFile.openFile(name, "r") if isinstance(name, str) else name
    try:
        ns.curr = None
        # prime the one-line look-ahead buffer
        nextLine()
        n = (0,0,0)
        # testing the buffer first makes an empty input yield no tree
        # instead of failing on a missing line
        while ns.curr is not None:
            tree = ProteinTree()
            tree.root = recLoad(tree, 0)
            yield tree
            n = (n[0]+1, n[1]+len(tree.data), n[2]+len(tree.info)-len(tree.data))
        print >> sys.stderr, "%d roots, %d branches, %d nodes OK" % n
    finally:
        # also reached when the consumer abandons the generator
        f.close()
Esempio n. 3
0
 def iterFile(name):
     """Iterate over the records of a FASTA-like file.

     The file is opened with myFile.openFile.  A line starting with '>'
     opens a new record whose name is the rest of that line, stripped.  The
     following lines, stripped and upper-cased, are concatenated to form the
     sequence.  Lines found before the first '>' are ignored.

     Yields (name, sequence) tuples in file order.  The file is closed when
     the generator finishes, fails or is closed by the caller.
     """
     f = myFile.openFile(name, "r")
     try:
         name = None
         tmp = []
         for ligne in f:
             ligne = ligne.replace('\n', '').strip()
             # chevrons indicate the beginning of a new sequence
             if ligne.startswith('>'):
                 # emit the previous record BEFORE resetting the buffer,
                 # otherwise its sequence would be lost
                 if name is not None:
                     yield (name, "".join(tmp))
                 name = ligne[1:].strip()
                 tmp = []
             # lines must be concatenated
             elif name is not None:
                 tmp.append(ligne.upper())
         # last record of the file
         if name is not None:
             yield (name, "".join(tmp))
     finally:
         f.close()
Esempio n. 4
0
 def __init__(self, *args):
     """Create the list of families, empty or loaded from a file.

     Without argument the object is left empty (fidMax == 0, no g2fid
     entry).  With a single str argument, that file is read through
     myFile.openFile: each line is split on single spaces, the first field
     is the family name and the remaining fields are the modern names.

     Any other combination of arguments raises ValueError.
     Progress messages are written to sys.stderr.
     """
     list.__init__(self)
     self.fidMax = 0
     self.g2fid = {}

     # null constructor
     if len(args) == 0:
         return
     if not (len(args) == 1 and isinstance(args[0], str)):
         raise ValueError('Constructor needs a file')

     (fileName,) = args
     self.name = fileName
     print >> sys.stderr, "Loading Families from", fileName, "...",
     # FIXME use myFile.firstLineBuffer to choose which format is in input.
     # A more synthetic format would have only 3 columns: c, s and gName
     handle = myFile.openFile(fileName, 'r')
     for line in handle:
         fields = line.replace('\n', '').split(' ')
         familyName = fields[0]
         # FIXME: the modern names should not be a set, since the first
         # element is the positional ortholog as much as possible
         modernNames = set(fields[1:])
         self.append(Family(familyName, modernNames))
         # The family id is the 0-based rank of the line: (fid + 1) is the
         # line number (counted from 1) in the output of self.printIn(file)
         fid = self.fidMax
         for geneName in {familyName} | modernNames:
             self.g2fid[geneName] = fid
         self.fidMax = fid + 1
     handle.close()
     assert self.fidMax == len(self)
     print >> sys.stderr, 'OK'
Esempio n. 5
0
 def __init__(self, file, skipInit=False, stream=open(os.devnull, 'w')):
     """Build a phylogenetic tree from ready-made data or from a file.

     file: either a tuple (items, root, officialName) whose members are
         stored as-is, or a value accepted by myFile.openFile.
     skipInit: when loading from a file, skip the call to self.reinitTree()
         and only print "OK" on `stream`.
     stream: where progress messages are printed.  The default object is
         created once, when the function is defined, and shared by all
         calls.
     """
     if type(file) == tuple:
         print >> stream, "Creation of the phylogenetic tree ...",
         (self.items, self.root, self.officialName) = file
         # NOTE(review): skipInit is not consulted on this path, so
         # reinitTree() is never called here - confirm this is intended
         return

     print >> stream, "Loading phylogenetic tree %s ..." % file,
     self.officialName = {}
     self.items = self.newCommonNamesMapperInstance()

     # the tree is named after the opened file when it exposes a name
     handle = myFile.openFile(file, 'r')
     try:
         self.name = handle.name
     except AttributeError:
         self.name = file

     # the first line tells which of the two formats is used
     buffered = myFile.firstLineBuffer(handle)
     header = buffered.firstLine
     if (';' not in header) and ('(' not in header):
         self.__loadFromMyFormat__(buffered)
     else:
         # Newick: all lines are merged into a single string
         newick = ' '.join(buffered).replace('\n', '')
         self.__loadFromNewick__(newick + " ;")
     buffered.close()

     if skipInit:
         print >> stream, "OK"
     else:
         self.reinitTree()
Esempio n. 6
0
    def __init__(self, *args, **kwargs):
        """Build a LightGenome, empty or from one source object.

        Positional arguments: none (empty genome) or exactly one of
          - str: path of a tab-separated file with 3, 5 or 6 columns,
            (c, s, gName), (c, beg, end, s, gName) or
            (c, beg, end, s, gName, transcriptName); only (c, s, gName) is
            kept, and for the 5-column format only the first
            space-separated gene name is kept;
          - myGenomes.Genome: genes are copied chromosome by chromosome;
          - LightGenome: copy of the genes (and of g2p when present);
          - dict: {chromosome: [(gName, strand), ...]}.

        Keyword arguments:
          withDict (default False): also fill self.g2p, mapping each gene
            name to GeneP(chromosome, index in the chromosome list).

        Raises ValueError for a file with another number of columns or for
        an unsupported argument type.  Progress is printed to sys.stderr.
        """
        self.name = None
        # this dict contains the sets of chromosomes per type of contig (cf class ContigType)
        self.chrSet = collections.defaultdict(set)
        myTools.DefaultOrderedDict.__init__(self, default_factory=list)
        self.withDict = kwargs.get("withDict", False)
        if self.withDict:
            self.g2p = {}
        if len(args) == 0:
            return
        assert len(args) == 1, args
        arg = args[0]

        if isinstance(arg, str):
            fileName = arg
            self.name = fileName
            print >> sys.stderr, "Loading LightGenome from", fileName,
            # The first line is only used to count the columns.
            # NOTE(review): this handle is never closed explicitly; the
            # rows themselves are read by readTabular from fileName.
            flb = myFile.firstLineBuffer(myFile.openFile(fileName, 'r'))
            nbColumns = len(flb.firstLine.split("\t"))
            if nbColumns == 6:
                print >> sys.stderr, "(c, beg, end, s, gName, transcriptName) -> (c, s, gName)",
                # c, beg, end, s,  gName, transcriptName
                reader = myFile.myTSV.readTabular(fileName, [str, int, int, int, str, str])
                reader = ((c, strand, gName) for (c, beg, end, strand, gName, tName) in reader)
            elif nbColumns == 3:
                print >> sys.stderr, "(c, s, gName)",
                # c, s, gName
                reader = myFile.myTSV.readTabular(fileName, [str, int, str])
            elif nbColumns == 5:
                print >> sys.stderr, "(c, beg, end, s, gName) -> (c, s, gName)",
                # c, beg, end, s,  gName
                reader = myFile.myTSV.readTabular(fileName, [str, int, int, int, str])
                # The last column may hold several space-separated gene
                # names; only the first one is kept.  With a single name
                # split(' ')[0] is the name itself, so no peek at the first
                # row is needed.
                reader = ((c, strand, gNames.split(' ')[0]) for (c, beg, end, strand, gNames) in reader)
            else:
                raise ValueError("%s file is badly formatted" % fileName)
            print >> sys.stderr, "...",
            # c = chromosome name
            # strand = orientation of the gene
            # gName = gene name
            for (c, strand, gName) in reader:
                self.chrSet[contigType(c)].add(c)
                genes = self[c]
                # position of the gene in its chromosome list; taken from
                # the list itself so that it stays right even if the rows
                # of one chromosome are not contiguous in the file
                idx = len(genes)
                genes.append(OGene(gName, strand))
                if self.withDict:
                    # dict 'G'ene to (pronounced '2') 'P'osition
                    self.g2p[gName] = GeneP(c, idx)
            print >> sys.stderr, 'OK'
        elif isinstance(arg, myGenomes.Genome):
            genome = arg
            self.name = genome.name
            # NOTE(review): chrSet is the very object of `arg`, not a copy
            self.chrSet = arg.chrSet
            for c in genome.lstGenes.keys():
                for (idx, g) in enumerate(genome.lstGenes[c]):
                    # chromosome names are stored as str, genes under their first name
                    self[str(c)].append(OGene(g.names[0], g.strand))
                    if self.withDict:
                        self.g2p[g.names[0]] = GeneP(str(c), idx)
        elif isinstance(arg, LightGenome):
            self.name = arg.name
            # NOTE(review): chrSet is the very object of `arg`, not a copy
            self.chrSet = arg.chrSet
            # the withDict keyword is overridden by the source genome
            self.withDict = arg.withDict
            for c in arg:
                self[c] = [OGene(gene.n, gene.s) for gene in arg[c]]
            if self.withDict:
                self.g2p = dict((gn, GeneP(gp.c, gp.idx)) for (gn, gp) in arg.g2p.iteritems())
        elif isinstance(arg, dict):
            genome = arg
            for c in genome:
                for (idx, (gName, strand)) in enumerate(genome[c]):
                    self[c].append(OGene(gName, strand))
                    if self.withDict:
                        # dict 'G'ene to (pronounced '2') 'P'osition
                        self.g2p[gName] = GeneP(c, idx)
        else:
            raise ValueError('Constructor needs a file')
Esempio n. 7
0
    def __init__(self, fichier, **kwargs):
        """Load a genome from a file, or build it by filtering another genome.

        fichier: when it is a str, the file is opened with myFile.openFile
            and its format is guessed from the first line (see the branches
            below).  Otherwise it is used as a source genome: its `name`
            and `lstGenes` are read and its genes are filtered.

        Keyword arguments read here:
            ancGenes: object whose `lstGenes[None]` list is indexed to turn
                gene indices into names (index-based ancestral formats).
            filterIn: names to keep; a gene is kept when any of its names
                is in this collection.
            filterOut: names to drop; only consulted when filterIn is not
                given; a gene is kept when none of its names is in it.
        All keyword arguments are then forwarded to self.init().

        Progress messages are written to sys.stderr.
        """
        if isinstance(fichier, str):
            print >> sys.stderr, "Loading genome of", fichier, "...",
            f = myFile.firstLineBuffer(myFile.openFile(fichier, 'r'))

            # list of genes per chromosome
            self.lstGenes = collections.defaultdict(list)

            # choice of the loading function
            c = f.firstLine.split("\t")
            if f.firstLine.startswith(">") or f.firstLine.endswith("$"):
                # GRIMM-Synteny format
                ######################
                if f.firstLine.startswith(">"):
                    # NOTE(review): this name is overwritten by
                    # `self.name = fichier` after the file is closed
                    self.name = f.firstLine[1:].strip()
                chrom = 1
                for l in f:
                    l = l.strip()
                    # only the lines ending with '$' describe a chromosome
                    if not l.endswith("$"):
                        continue
                    for (i,x) in enumerate(l.replace("$","").split()):
                        # a leading '-' marks the reverse strand; the sign is removed from the name
                        strand = -1 if x.startswith("-") else 1
                        self.addGene([x[1:] if x[0] in "-+" else x], chrom, i, i+1, strand)
                    chrom += 1
                print >> sys.stderr, "(GRIMM)",

            elif len(c) == 1:
                # ancestral genes: "NAMES"
                ##########################
                for (i,l) in enumerate(f):
                    self.lstGenes[None].append( Gene(None, i, i+1, 0, tuple(intern(x) for x in l.split())) )
                print >> sys.stderr, "(ancestral genes)",

            elif (len(c) == 2) and not set(c[1]).issubset("01-"):
                # ancestral genome: "CHR NAMES"
                ###############################
                lastC = None
                for (i,l) in enumerate(f):
                    c = l.split("\t")
                    # `dec` is the line index at which the current chromosome started
                    if lastC != c[0]:
                        lastC = c[0]
                        dec = i
                    self.addGene(c[1].split(), c[0], i-dec, i-dec+1, 0)
                print >> sys.stderr, "(ancestral genome: chrom+noms)",

            elif (len(c) >= 5) and (" " not in c[3]) and (len(c[4]) > 0):
                # Ensembl: "CHR BEG END STRAND NAMES"
                #####################################
                for l in f:
                    c = l.replace('\n', '').split('\t')
                    # the literal string 'None' in the strand column gives a None strand
                    self.addGene(c[4].split(), c[0], int(c[1]), int(c[2]), int(c[3]) if c[3]!='None' else None)
                print >> sys.stderr, "(Ensembl)",

            # NOTE(review): int(c[1]) raises ValueError on a non-numeric
            # second column instead of falling through to the last branch
            elif (len(c) == 4) and int(c[1]) < 2:
                # ancestral genome: "CHR STRAND LST-INDEX LST-STRANDS"
                ######################################################
                if 'ancGenes' in kwargs:
                    ancGenes = kwargs["ancGenes"].lstGenes[None]
                lastC = None
                for l in f:
                    c = l.split("\t")
                    # positions restart at 0 each time the chromosome name changes
                    if lastC != c[0]:
                        lastC = c[0]
                        pos = 0
                        currC = commonChrName(c[0])
                    data = zip([int(x) for x in c[2].split()], [int(x) for x in c[3].split()])
                    # a negative line strand reverses the block and flips every gene strand
                    if int(c[1]) < 0:
                        data = [(i,-s) for (i,s) in data.__reversed__()]
                    for (index,strand) in data:
                        if 'ancGenes' in kwargs:
                            self.lstGenes[currC].append( Gene(currC, pos, pos+1, strand, ancGenes[index].names) )
                        else:
                            self.lstGenes[currC].append( Gene(currC, pos, pos+1, strand, (index,)) )
                        pos += 1
                print >> sys.stderr, "(ancestral genome: chrom+diags)",

            else:
                # (ili, ils) = columns holding the list of indices and the list of strands
                if len(c) == 2:
                    (ili,ils) = (0,1)
                else:
                    assert len(c) >= 4
                    (ili,ils) = (2,3)
                    self.ancName = c[0]

                if 'ancGenes' in kwargs:
                    ancGenes = kwargs["ancGenes"].lstGenes[None]

                # ancestral genome: "LST-INDEX LST-STRANDS"
                #############################################
                for (i,l) in enumerate(f):
                    c = l.split("\t")
                    # each line is one chromosome, numbered from 1
                    chrom = i+1
                    lchrom = self.lstGenes[chrom]
                    for (pos,(index,strand)) in enumerate(itertools.izip(c[ili].split(), c[ils].split())):
                        if 'ancGenes' in kwargs:
                            lchrom.append( Gene(chrom, pos, pos+1, int(strand), ancGenes[int(index)].names) )
                        else:
                            lchrom.append( Gene(chrom, pos, pos+1, int(strand), (int(index),) ) )
                print >> sys.stderr, "(ancestral genome: diags)",

            f.close()
            # the genome is always named after the file path
            self.name = fichier

        else:
            # `fichier` is used as an existing genome whose genes are filtered
            genomeBase = fichier
            print >> sys.stderr, "Filtering of", genomeBase.name, "...",
            filterIn = set(kwargs["filterIn"]) if "filterIn" in kwargs else None
            filterOut = set(kwargs["filterOut"]) if "filterOut" in kwargs else None

            # filterIn takes precedence: filterOut is only used when filterIn is absent
            def filt(gene):
                if filterIn is not None:
                    return any(s in filterIn for s in gene.names)
                if filterOut is not None:
                    return all(s not in filterOut for s in gene.names)
                return True

            self.lstGenes = {}
            for (chrom,l) in genomeBase.lstGenes.iteritems():
                l = [gene for gene in l if filt(gene)]
                # chromosomes left without any gene are dropped
                if len(l) > 0:
                    self.lstGenes[chrom] = l
            self.name = "Filter from " + genomeBase.name
            print >> sys.stderr, "%d genes -> %d genes" % (sum(len(x) for x in genomeBase.lstGenes.itervalues()), sum(len(x) for x in self.lstGenes.itervalues())),

        self.init(**kwargs)
        print >> sys.stderr, "OK"