def __init__(self, file, skipInit=False, stream=open(os.devnull, 'w')):
    """Build the tree from pre-computed data or load it from a file.

    file     -- either a tuple ``(items, root, officialName)`` that is
                unpacked straight into the instance, or anything accepted
                by ``myFile.openFile`` (the tree is then parsed from it).
    skipInit -- when false (default) ``self.reinitTree()`` is called once
                the data is in place; when true, "OK" is written to
                ``stream`` instead.
    stream   -- file-like object receiving the progress messages.  The
                default is a handle on ``os.devnull`` that is created once,
                when the function is defined, and shared by every call.
    """
    if isinstance(file, tuple):
        print >> stream, "Creation of the phylogenetic tree ...",
        (self.items, self.root, self.officialName) = file
    else:
        print >> stream, "Loading phylogenetic tree %s ..." % file,
        self.officialName = {}
        self.items = self.newCommonNamesMapperInstance()

        # name and instance of file: use the handle's own name when it has
        # one, otherwise fall back on the argument itself
        handle = myFile.openFile(file, 'r')
        self.name = getattr(handle, 'name', file)

        f = myFile.firstLineBuffer(handle)
        try:
            # A ';' or '(' on the first line selects the Newick loader;
            # anything else goes to the other loader.
            if (';' in f.firstLine) or ('(' in f.firstLine):
                self.__loadFromNewick__(' '.join(f).replace('\n', '') + " ;")
            else:
                self.__loadFromMyFormat__(f)
        finally:
            # release the input even when the parser raises
            f.close()

    if not skipInit:
        self.reinitTree()
    else:
        print >> stream, "OK"
def __init__(self, *args, **kwargs):
    """Create an empty LightGenome or fill it from a single source argument.

    Positional arguments: none (empty genome) or exactly one, which may be
      * a ``str``: name of a tab-separated file with 3, 5 or 6 columns
        (``c, s, gName`` / ``c, beg, end, s, gName`` /
        ``c, beg, end, s, gName, transcriptName``);
      * a ``myGenomes.Genome``;
      * another ``LightGenome`` (its gene lists are rebuilt gene by gene);
      * a ``dict`` mapping chromosome -> iterable of ``(gName, strand)``.

    Keyword arguments:
      withDict -- when true, ``self.g2p`` maps each gene name to a
                  ``GeneP(chromosome, index)``.  Defaults to False.

    Raises ``ValueError`` if the file does not have 3, 5 or 6 columns or if
    the argument is of an unsupported type.
    """
    self.name = None
    # this dict contains the sets chromosomes per type of contig (cf class ContigType)
    self.chrSet = collections.defaultdict(set)
    myTools.DefaultOrderedDict.__init__(self, default_factory=list)
    self.withDict = kwargs.get("withDict", False)
    if self.withDict:
        # dict 'G'ene to (pronounced '2') 'P'osition
        self.g2p = {}

    if not args:
        return
    assert len(args) == 1, args
    arg = args[0]

    if isinstance(arg, str):
        fileName = arg
        self.name = fileName
        print >> sys.stderr, "Loading LightGenome from", fileName,

        # The first line is only probed to count the columns; the file is
        # re-read from scratch by readTabular below, so the probing handle
        # can be released right away.
        # NOTE(review): assumes openFile returns an independent handle for
        # fileName -- confirm for special inputs.
        flb = myFile.firstLineBuffer(myFile.openFile(fileName, 'r'))
        try:
            nbColumns = len(flb.firstLine.split("\t"))
        finally:
            flb.close()

        # c = chromosome name
        # beg = coordinate in nucleotides of the beginning of
        #       transcription of the shortest transcript
        # end = coordinate in nucleotides of the ending of
        #       transcription of the shortest transcript
        # gName = gene name
        # tName = transcript name
        # Only (c, strand, gName) is kept, whatever the input format.
        if nbColumns == 6:
            print >> sys.stderr, "(c, beg, end, s, gName, transcriptName) -> (c, s, gName)",
            rows = myFile.myTSV.readTabular(fileName, [str, int, int, int, str, str])
            reader = ((c, strand, gName) for (c, beg, end, strand, gName, tName) in rows)
        elif nbColumns == 3:
            print >> sys.stderr, "(c, s, gName)",
            reader = myFile.myTSV.readTabular(fileName, [str, int, str])
        elif nbColumns == 5:
            print >> sys.stderr, "(c, beg, end, s, gName) -> (c, s, gName)",
            rows = myFile.myTSV.readTabular(fileName, [str, int, int, int, str])
            # The last column may hold several space-separated gene names
            # (the format genome of Matthieu contains several gene names):
            # only the first one is kept.  With a single name this is a
            # no-op, so no per-file detection is needed.
            reader = ((c, strand, gNames.split(' ')[0]) for (c, beg, end, strand, gNames) in rows)
        else:
            raise ValueError("%s file is badly formatted" % fileName)
        print >> sys.stderr, "...",

        for (c, strand, gName) in reader:
            self.chrSet[contigType(c)].add(c)
            genes = self[c]
            # Index the gene will occupy in its chromosome list; taking it
            # from the list keeps g2p right even when the rows of one
            # chromosome are not contiguous in the file.
            idx = len(genes)
            genes.append(OGene(gName, strand))
            if self.withDict:
                self.g2p[gName] = GeneP(c, idx)
        print >> sys.stderr, 'OK'

    elif isinstance(arg, myGenomes.Genome):
        genome = arg
        self.name = genome.name
        # the chrSet object is shared with the source genome, not copied
        self.chrSet = arg.chrSet
        for c in genome.lstGenes:
            for (idx, g) in enumerate(genome.lstGenes[c]):
                # chromosome keys are stored as strings; only the first
                # name of each gene is kept
                self[str(c)].append(OGene(g.names[0], g.strand))
                if self.withDict:
                    self.g2p[g.names[0]] = GeneP(str(c), idx)

    elif isinstance(arg, LightGenome):
        self.name = arg.name
        # the chrSet object is shared with the source genome, not copied
        self.chrSet = arg.chrSet
        # the source's withDict setting overrides the keyword argument
        self.withDict = arg.withDict
        for c in arg:
            self[c] = [OGene(gene.n, gene.s) for gene in arg[c]]
        if self.withDict:
            self.g2p = dict((gn, GeneP(gp.c, gp.idx)) for (gn, gp) in arg.g2p.iteritems())

    elif isinstance(arg, dict):
        genome = arg
        # NOTE(review): chrSet is left empty for this kind of input --
        # confirm that this is intended.
        for c in genome:
            for (idx, (gName, strand)) in enumerate(genome[c]):
                self[c].append(OGene(gName, strand))
                if self.withDict:
                    self.g2p[gName] = GeneP(c, idx)

    else:
        raise ValueError('Constructor needs a file')
def __init__(self, fichier, **kwargs):
    """Load a genome from a file, or build it by filtering another genome.

    fichier -- a ``str`` file name, or an existing genome object exposing
               ``name`` and ``lstGenes``.

    When ``fichier`` is a string, the layout is guessed from the first line:
      * GRIMM-Synteny (first line starts with ">" or ends with "$");
      * ancestral genes: "NAMES" (one column);
      * ancestral genome: "CHR NAMES" (two columns, the second not made
        only of the characters 0, 1 and -);
      * Ensembl: "CHR BEG END STRAND NAMES" (five columns or more);
      * ancestral genome: "CHR STRAND LST-INDEX LST-STRANDS" (four columns,
        second one < 2);
      * ancestral genome: "LST-INDEX LST-STRANDS" (anything else).

    Keyword arguments:
      ancGenes  -- genome whose ``lstGenes[None]`` list translates the
                   indices of the two index-based ancestral formats into
                   gene names; without it the index itself is the name.
      filterIn  -- (filtering mode) keep genes having at least one name in
                   this collection; takes precedence over ``filterOut``.
      filterOut -- (filtering mode) keep genes having no name in this
                   collection.
    All keyword arguments are also forwarded to ``self.init``.
    """
    if isinstance(fichier, str):
        print >> sys.stderr, "Loading genome of", fichier, "...",

        f = myFile.firstLineBuffer(myFile.openFile(fichier, 'r'))
        try:
            # list of genes per chromosome
            self.lstGenes = collections.defaultdict(list)

            # choice of the loading function
            c = f.firstLine.split("\t")
            if f.firstLine.startswith(">") or f.firstLine.endswith("$"):
                # GRIMM-Synteny format
                ######################
                if f.firstLine.startswith(">"):
                    # NOTE(review): this value is overwritten by
                    # `self.name = fichier` once loading is done -- confirm
                    # which name is wanted.
                    self.name = f.firstLine[1:].strip()
                chrom = 1
                for line in f:
                    line = line.strip()
                    # only lines terminated by "$" describe a chromosome
                    if not line.endswith("$"):
                        continue
                    for (i, x) in enumerate(line.replace("$", "").split()):
                        strand = -1 if x.startswith("-") else 1
                        self.addGene([x[1:] if x[0] in "-+" else x], chrom, i, i+1, strand)
                    chrom += 1
                print >> sys.stderr, "(GRIMM)",

            elif len(c) == 1:
                # ancestral genes: "NAMES"
                ##########################
                for (i, line) in enumerate(f):
                    self.lstGenes[None].append(Gene(None, i, i+1, 0, tuple(intern(x) for x in line.split())))
                print >> sys.stderr, "(ancestral genes)",

            elif (len(c) == 2) and not set(c[1]).issubset("01-"):
                # ancestral genome: "CHR NAMES"
                ###############################
                lastC = None
                for (i, line) in enumerate(f):
                    c = line.split("\t")
                    if lastC != c[0]:
                        lastC = c[0]
                        # line number at which the current chromosome
                        # starts, so positions restart at 0 for each one
                        dec = i
                    self.addGene(c[1].split(), c[0], i-dec, i-dec+1, 0)
                print >> sys.stderr, "(ancestral genome: chrom+noms)",

            elif (len(c) >= 5) and (" " not in c[3]) and (len(c[4]) > 0):
                # Ensembl: "CHR BEG END STRAND NAMES"
                #####################################
                for line in f:
                    c = line.replace('\n', '').split('\t')
                    self.addGene(c[4].split(), c[0], int(c[1]), int(c[2]), int(c[3]) if c[3] != 'None' else None)
                print >> sys.stderr, "(Ensembl)",

            elif (len(c) == 4) and int(c[1]) < 2:
                # ancestral genome: "CHR STRAND LST-INDEX LST-STRANDS"
                ######################################################
                useAncGenes = 'ancGenes' in kwargs
                if useAncGenes:
                    ancGenes = kwargs["ancGenes"].lstGenes[None]
                lastC = None
                for line in f:
                    c = line.split("\t")
                    if lastC != c[0]:
                        lastC = c[0]
                        pos = 0
                        currC = commonChrName(c[0])
                    data = zip([int(x) for x in c[2].split()], [int(x) for x in c[3].split()])
                    if int(c[1]) < 0:
                        # a negative block strand means the block is read
                        # backwards, with every gene strand flipped
                        data = [(i, -s) for (i, s) in reversed(data)]
                    for (index, strand) in data:
                        names = ancGenes[index].names if useAncGenes else (index,)
                        self.lstGenes[currC].append(Gene(currC, pos, pos+1, strand, names))
                        pos += 1
                print >> sys.stderr, "(ancestral genome: chrom+diags)",

            else:
                # ancestral genome: "LST-INDEX LST-STRANDS"
                #############################################
                # (ili, ils) = columns holding the indices and the strands
                if len(c) == 2:
                    (ili, ils) = (0, 1)
                else:
                    assert len(c) >= 4
                    (ili, ils) = (2, 3)
                    self.ancName = c[0]
                useAncGenes = 'ancGenes' in kwargs
                if useAncGenes:
                    ancGenes = kwargs["ancGenes"].lstGenes[None]

                for (i, line) in enumerate(f):
                    c = line.split("\t")
                    # each line is one chromosome, numbered from 1
                    chrom = i+1
                    lchrom = self.lstGenes[chrom]
                    for (pos, (index, strand)) in enumerate(itertools.izip(c[ili].split(), c[ils].split())):
                        names = ancGenes[int(index)].names if useAncGenes else (int(index),)
                        lchrom.append(Gene(chrom, pos, pos+1, int(strand), names))
                print >> sys.stderr, "(ancestral genome: diags)",
        finally:
            # release the input even when a line cannot be parsed
            f.close()

        self.name = fichier

    else:
        genomeBase = fichier
        print >> sys.stderr, "Filtering of", genomeBase.name, "...",

        filterIn = set(kwargs["filterIn"]) if "filterIn" in kwargs else None
        filterOut = set(kwargs["filterOut"]) if "filterOut" in kwargs else None

        def filt(gene):
            # filterIn wins over filterOut when both are given
            if filterIn is not None:
                return any(s in filterIn for s in gene.names)
            if filterOut is not None:
                return all(s not in filterOut for s in gene.names)
            return True

        self.lstGenes = {}
        for (chrom, genes) in genomeBase.lstGenes.iteritems():
            kept = [gene for gene in genes if filt(gene)]
            # chromosomes left without any gene are dropped
            if kept:
                self.lstGenes[chrom] = kept
        self.name = "Filter from " + genomeBase.name
        print >> sys.stderr, "%d genes -> %d genes" % (sum(len(x) for x in genomeBase.lstGenes.itervalues()), sum(len(x) for x in self.lstGenes.itervalues())),

    self.init(**kwargs)
    print >> sys.stderr, "OK"