def getRange(s):
    """Return the integers described by `s`.

    If `myFile.hasAccess(s)` is true, `s` is opened as a file and every
    whitespace-separated token of every line is converted with `int`; the
    values are returned as a list, in file order.

    Otherwise `s` is parsed as "start:end" and `range(start, end + 1)` is
    returned (both bounds included).

    Raises ValueError when a token, `start` or `end` is not an integer
    literal (for instance when `s` is not a file and contains no ':').
    """
    if myFile.hasAccess(s):
        f = myFile.openFile(s, "r")
        try:
            values = []
            for line in f:
                # split() without argument also discards the trailing newline
                values.extend(int(x) for x in line.split())
        finally:
            # release the handle even if a token is not an integer
            f.close()
        return values

    (start, _, end) = s.partition(":")
    return range(int(start), int(end) + 1)
def loadTree(name):
    """Iterate over the gene trees stored in a forest file.

    `name` is either a file name (str), opened with `myFile.openFile`, or an
    already opened line iterator. Each non-empty, non-'#' line is split on
    tabs; the number of leading fields gives the indentation level and the
    last two fields are a (key, value) pair.

    Yields one `ProteinTree` per root, with `root`, `info` and `data` filled.
    A summary is written to stderr once the whole input has been consumed.
    The input is closed when the generator finishes, fails or is closed.
    An input without any data line yields nothing.
    """
    ns = myTools.Namespace()

    # read the next line of the file (and buffer the following one)
    def nextLine():
        old = ns.curr
        try:
            line = ""
            # skip empty lines and comment lines
            while (line == "") or line.startswith("#"):
                # the final '\n' is removed
                line = next(f).replace('\n', '')
            fields = line.split('\t')
            # the triplet (indentation, key, value) is recorded
            ns.curr = (len(fields) - 2, fields[-2], fields[-1])
        except StopIteration:
            ns.curr = None
        return old

    # the analysing process of the lines of the file
    def recLoad(tree, indent):
        # id of the point
        currID = int(nextLine()[2])
        # associated information
        # SECURITY NOTE(review): eval() executes whatever expression is stored
        # in the file; only load forests coming from a trusted source.
        tree.info[currID] = eval(nextLine()[2])
        # children: every following record that is exactly one level deeper
        child = []
        while (ns.curr is not None) and (ns.curr[0] == indent + 1):
            length = float(nextLine()[2])
            child.append((recLoad(tree, indent + 1), length))
        if len(child) > 0:
            tree.data[currID] = child
        return currID

    print >> sys.stderr, "Loading the forest of gene trees %s ..." % name,
    f = myFile.openFile(name, "r") if isinstance(name, str) else name
    try:
        ns.curr = None
        # prime the one-line look-ahead buffer
        nextLine()
        # (number of roots, number of branches, number of nodes)
        n = (0, 0, 0)
        # testing the buffer first lets an empty input produce no tree
        # instead of failing on a missing first record
        while ns.curr is not None:
            tree = ProteinTree()
            tree.root = recLoad(tree, 0)
            yield tree
            n = (n[0] + 1, n[1] + len(tree.data), n[2] + len(tree.info) - len(tree.data))
        print >> sys.stderr, "%d roots, %d branches, %d nodes OK" % n
    finally:
        # also reached on a parse error or when the consumer stops early
        f.close()
def iterFile(name):
    """Iterate over the records of a FASTA file.

    `name` is opened with `myFile.openFile`. A line starting with '>' begins
    a new record whose name is the rest of that line (stripped). The lines
    that follow are stripped, upper-cased and concatenated to form the
    sequence. Lines found before the first '>' are ignored.

    Yields (name, sequence) tuples in file order. The file is closed when the
    generator finishes, fails or is closed.
    """
    f = myFile.openFile(name, "r")
    try:
        seqName = None
        chunks = []
        for ligne in f:
            ligne = ligne.strip()
            # chevrons indicate the beginning of a new sequence
            if ligne.startswith('>'):
                # emit the previous record BEFORE resetting the buffer,
                # otherwise its sequence would be lost
                if seqName is not None:
                    yield (seqName, "".join(chunks))
                seqName = ligne[1:].strip()
                chunks = []
            # lines must be concatenated
            elif seqName is not None:
                chunks.append(ligne.upper())
        # last record of the file
        if seqName is not None:
            yield (seqName, "".join(chunks))
    finally:
        f.close()
def __init__(self, *args):
    """Create an empty collection of families or load one from a file.

    Without argument the object is left empty (`fidMax` is 0, `g2fid` is {}).
    With a single str argument the file of that name is read: each line is
    split on single spaces, the first field is the family name and the
    remaining fields are the names of its members. One `Family` is appended
    per line and `g2fid` maps every name of the line (family name included)
    to the 0-based index of that line.

    Raises ValueError for any other combination of arguments.
    """
    list.__init__(self)
    self.fidMax = 0
    self.g2fid = {}

    if len(args) == 0:
        # null constructor
        return
    if not (len(args) == 1 and isinstance(args[0], str)):
        raise ValueError('Constructor needs a file')

    fileName = args[0]
    self.name = fileName
    print >> sys.stderr, "Loading Families from", fileName, "...",
    # FIXME use myFile.firstLineBuffer to choose which format is in
    # input.
    # A more synthetic format would have only 3 columns:
    # c, s and gName
    famFile = myFile.openFile(fileName, 'r')
    try:
        for line in famFile:
            names = line.replace('\n', '').split(' ')
            # family name
            fn = names[0]
            # modern names
            # FIXME: dns cannot be a set, since the first element is the
            # positional ortholog as much as possible
            dns = set(names[1:])
            self.append(Family(fn, dns))
            # Each (fID + 1) corresponds to the line number in the output
            # file of families obtained with self.printIn(file).
            # Be careful, a fID number is equal to the line number - 1
            # (if line number begins from 1)
            for n in {fn} | dns:
                self.g2fid[n] = self.fidMax
            self.fidMax += 1
    finally:
        # release the handle even if a line cannot be processed
        famFile.close()
    assert self.fidMax == len(self)
    print >> sys.stderr, 'OK'
def __init__(self, file, skipInit=False, stream=open(os.devnull, 'w')):
    """Build the tree from a pre-computed tuple or from a file.

    file     -- either a tuple (items, root, officialName) stored as-is, or
                something accepted by `myFile.openFile`. In the second case
                the first line decides the parser: a ';' or a '(' selects
                `__loadFromNewick__`, anything else `__loadFromMyFormat__`.
    skipInit -- when false, `reinitTree()` is called at the end; when true,
                "OK" is written to `stream` instead.
    stream   -- where progress messages are written.
                NOTE(review): the default is opened once, when the function
                is defined, and shared by all calls.
    """
    if isinstance(file, tuple):
        print >> stream, "Creation of the phylogenetic tree ...",
        (self.items, self.root, self.officialName) = file
    else:
        print >> stream, "Loading phylogenetic tree %s ..." % file,
        self.officialName = {}
        self.items = self.newCommonNamesMapperInstance()
        f = myFile.openFile(file, 'r')
        try:
            # name of the tree: the name of the opened file when it has one,
            # the argument itself otherwise
            self.name = getattr(f, 'name', file)
            f = myFile.firstLineBuffer(f)
            if (';' in f.firstLine) or ('(' in f.firstLine):
                self.__loadFromNewick__(' '.join(f).replace('\n', '') + " ;")
            else:
                self.__loadFromMyFormat__(f)
        finally:
            # release the handle even if the parser fails
            f.close()

    if not skipInit:
        self.reinitTree()
    else:
        print >> stream, "OK"
def __init__(self, *args, **kwargs):
    """Build a LightGenome, empty or from one source argument.

    Accepted positional argument (at most one):
      * str               -- name of a tabular file with 3 columns
                             (c, s, gName), 5 columns (c, beg, end, s, gNames)
                             or 6 columns (c, beg, end, s, gName,
                             transcriptName); only (c, s, gName) is kept and,
                             for 5 columns, only the first space-separated
                             gene name.
      * myGenomes.Genome  -- genes are taken from `lstGenes`, using the first
                             name of each gene and `str(c)` as chromosome key.
      * LightGenome       -- copy of the genes (and of `g2p` if `withDict`).
      * dict              -- maps a chromosome to a list of (gName, strand).

    Keyword argument:
      * withDict (default False) -- also maintain `self.g2p`, mapping a gene
        name to GeneP(chromosome, index in the chromosome list).

    Raises ValueError for a file with another number of columns or for an
    argument of an unsupported type.
    """
    self.name = None
    # this dict contains the sets of chromosomes per type of contig
    # (cf class ContigType)
    self.chrSet = collections.defaultdict(set)
    myTools.DefaultOrderedDict.__init__(self, default_factory=list)
    self.withDict = kwargs.get("withDict", False)
    if self.withDict:
        self.g2p = {}

    if len(args) == 0:
        return
    assert len(args) == 1, args
    arg = args[0]

    if isinstance(arg, str):
        fileName = arg
        self.name = fileName
        print >> sys.stderr, "Loading LightGenome from", fileName,
        # The first line is only peeked at to choose the loading function;
        # readTabular re-opens the file itself, so this handle is released
        # immediately.
        flb = myFile.firstLineBuffer(myFile.openFile(fileName, 'r'))
        try:
            nbColumns = len(flb.firstLine.split("\t"))
        finally:
            flb.close()

        if nbColumns == 6:
            print >> sys.stderr, "(c, beg, end, s, gName, transcriptName) -> (c, s, gName)",
            # c, beg, end, s, gName, transcriptName
            reader = myFile.myTSV.readTabular(fileName, [str, int, int, int, str, str])
            reader = ((c, strand, gName) for (c, beg, end, strand, gName, tName) in reader)
        elif nbColumns == 3:
            print >> sys.stderr, "(c, s, gName)",
            # c, s, gName
            reader = myFile.myTSV.readTabular(fileName, [str, int, str])
        elif nbColumns == 5:
            print >> sys.stderr, "(c, beg, end, s, gName) -> (c, s, gName)",
            # c, beg, end, s, gNames
            reader = myFile.myTSV.readTabular(fileName, [str, int, int, int, str])
            # The last column may hold several space-separated gene names
            # (genome format of Matthieu); only the first one is kept. For a
            # single name, split(' ')[0] is the name itself, so no separate
            # case is needed.
            reader = ((c, strand, gNames.split(' ')[0]) for (c, beg, end, strand, gNames) in reader)
        else:
            raise ValueError("%s file is badly formatted" % fileName)
        print >> sys.stderr, "...",
        # c = chromosome name
        # gName = gene name
        for (c, strand, gName) in reader:
            self.chrSet[contigType(c)].add(c)
            genes = self[c]
            if self.withDict:
                # dict 'G'ene to (pronounced '2') 'P'osition.
                # The index is the position the gene is about to take in
                # the chromosome list, which stays correct even when the
                # rows of one chromosome are not contiguous in the file.
                self.g2p[gName] = GeneP(c, len(genes))
            genes.append(OGene(gName, strand))
        print >> sys.stderr, 'OK'

    elif isinstance(arg, myGenomes.Genome):
        genome = arg
        self.name = genome.name
        # NOTE(review): the chrSet object is shared with `arg`, not copied
        self.chrSet = arg.chrSet
        for c in genome.lstGenes.keys():
            for (idx, g) in enumerate(genome.lstGenes[c]):
                self[str(c)].append(OGene(g.names[0], g.strand))
                if self.withDict:
                    self.g2p[g.names[0]] = GeneP(str(c), idx)

    elif isinstance(arg, LightGenome):
        self.name = arg.name
        # NOTE(review): the chrSet object is shared with `arg`, not copied
        self.chrSet = arg.chrSet
        # the `withDict` of the copied genome overrides the keyword argument
        self.withDict = arg.withDict
        for c in arg:
            self[c] = [OGene(gene.n, gene.s) for gene in arg[c]]
        if self.withDict:
            self.g2p = dict((gn, GeneP(gp.c, gp.idx)) for (gn, gp) in arg.g2p.iteritems())

    elif isinstance(arg, dict):
        genome = arg
        for c in genome:
            for (idx, (gName, strand)) in enumerate(genome[c]):
                self[c].append(OGene(gName, strand))
                if self.withDict:
                    # dict 'G'ene to (pronounced '2') 'P'osition
                    self.g2p[gName] = GeneP(c, idx)

    else:
        raise ValueError('Constructor needs a file')
def __init__(self, fichier, **kwargs):
    """Load a genome from a file, or build a filtered copy of a genome.

    fichier -- when it is a str, the file of that name is opened and its
               format is chosen from the first line (see the branches
               below). Otherwise it is used as an existing genome object:
               its `name` and `lstGenes` attributes are read and a filtered
               copy of its genes is stored.

    Keyword arguments read here:
      ancGenes  -- object whose `lstGenes[None]` list is indexed to obtain
                   the names of genes in the two "diags" formats.
      filterIn  -- (filtered copy) keep a gene if any of its names is in
                   this collection.
      filterOut -- (filtered copy, only used when filterIn is absent) keep a
                   gene if none of its names is in this collection.
    All keyword arguments are finally forwarded to `self.init(**kwargs)`.
    """

    if isinstance(fichier, str):

        print >> sys.stderr, "Loading genome of", fichier, "...",

        f = myFile.firstLineBuffer(myFile.openFile(fichier, 'r'))

        # list of genes per chromosome
        self.lstGenes = collections.defaultdict(list)

        # choice of the loading function: the tab-separated fields of the
        # first line are inspected, and the first matching branch wins
        c = f.firstLine.split("\t")

        if f.firstLine.startswith(">") or f.firstLine.endswith("$"):
            # GRIMM-Synteny format
            ######################
            if f.firstLine.startswith(">"):
                # NOTE(review): this value is overwritten by
                # `self.name = fichier` at the end of the str branch
                self.name = f.firstLine[1:].strip()
            # chromosomes are numbered from 1, one per '$'-terminated line
            chrom = 1
            for l in f:
                l = l.strip()
                if not l.endswith("$"):
                    continue
                for (i,x) in enumerate(l.replace("$","").split()):
                    # a leading '-' gives strand -1, anything else strand 1;
                    # the leading sign, if any, is removed from the name
                    strand = -1 if x.startswith("-") else 1
                    self.addGene([x[1:] if x[0] in "-+" else x], chrom, i, i+1, strand)
                chrom += 1
            print >> sys.stderr, "(GRIMM)",

        elif len(c) == 1:
            # ancestral genes: "NAMES"
            ##########################
            # every line becomes one gene stored under the chromosome key
            # None, located at [i, i+1] where i is the line index
            for (i,l) in enumerate(f):
                self.lstGenes[None].append( Gene(None, i, i+1, 0, tuple(intern(x) for x in l.split())) )
            print >> sys.stderr, "(ancestral genes)",

        elif (len(c) == 2) and not set(c[1]).issubset("01-"):
            # ancestral genome: "CHR NAMES"
            ###############################
            lastC = None
            for (i,l) in enumerate(f):
                c = l.split("\t")
                if lastC != c[0]:
                    lastC = c[0]
                    # line index at which the current chromosome started,
                    # so that positions restart from 0
                    dec = i
                self.addGene(c[1].split(), c[0], i-dec, i-dec+1, 0)
            print >> sys.stderr, "(ancestral genome: chrom+noms)",

        elif (len(c) >= 5) and (" " not in c[3]) and (len(c[4]) > 0):
            # Ensembl: "CHR BEG END STRAND NAMES"
            #####################################
            for l in f:
                c = l.replace('\n', '').split('\t')
                # the literal text 'None' in the strand column gives None
                self.addGene(c[4].split(), c[0], int(c[1]), int(c[2]), int(c[3]) if c[3]!='None' else None)
            print >> sys.stderr, "(Ensembl)",

        elif (len(c) == 4) and int(c[1]) < 2:
            # ancestral genome: "CHR STRAND LST-INDEX LST-STRANDS"
            ######################################################
            if 'ancGenes' in kwargs:
                ancGenes = kwargs["ancGenes"].lstGenes[None]
            lastC = None
            for l in f:
                c = l.split("\t")
                if lastC != c[0]:
                    lastC = c[0]
                    pos = 0
                    currC = commonChrName(c[0])
                # pairs (index, strand) built from the 3rd and 4th columns
                data = zip([int(x) for x in c[2].split()], [int(x) for x in c[3].split()])
                if int(c[1]) < 0:
                    # negative STRAND column: the pairs are taken in reverse
                    # order with their strands negated
                    data = [(i,-s) for (i,s) in data.__reversed__()]
                for (index,strand) in data:
                    if 'ancGenes' in kwargs:
                        self.lstGenes[currC].append( Gene(currC, pos, pos+1, strand, ancGenes[index].names) )
                    else:
                        # without ancGenes the index itself is used as name
                        self.lstGenes[currC].append( Gene(currC, pos, pos+1, strand, (index,)) )
                    pos += 1
            print >> sys.stderr, "(ancestral genome: chrom+diags)",

        else:
            # (ili, ils): columns holding the list of indices and the list
            # of strands
            if len(c) == 2:
                (ili,ils) = (0,1)
            else:
                assert len(c) >= 4
                (ili,ils) = (2,3)
                self.ancName = c[0]
            if 'ancGenes' in kwargs:
                ancGenes = kwargs["ancGenes"].lstGenes[None]
            # ancestral genome: "LST-INDEX LST-STRANDS"
            #############################################
            for (i,l) in enumerate(f):
                c = l.split("\t")
                # one chromosome per line, numbered from 1
                chrom = i+1
                lchrom = self.lstGenes[chrom]
                for (pos,(index,strand)) in enumerate(itertools.izip(c[ili].split(), c[ils].split())):
                    if 'ancGenes' in kwargs:
                        lchrom.append( Gene(chrom, pos, pos+1, int(strand), ancGenes[int(index)].names) )
                    else:
                        # without ancGenes the index itself is used as name
                        lchrom.append( Gene(chrom, pos, pos+1, int(strand), (int(index),) ) )
            print >> sys.stderr, "(ancestral genome: diags)",

        f.close()
        self.name = fichier

    else:
        # `fichier` is used as an existing genome: build a filtered copy
        genomeBase = fichier
        print >> sys.stderr, "Filtering of", genomeBase.name, "...",

        filterIn = set(kwargs["filterIn"]) if "filterIn" in kwargs else None
        filterOut = set(kwargs["filterOut"]) if "filterOut" in kwargs else None

        # tell whether a gene is kept; filterIn takes precedence over
        # filterOut, and without any filter every gene is kept
        def filt(gene):
            if filterIn is not None:
                return any(s in filterIn for s in gene.names)
            if filterOut is not None:
                return all(s not in filterOut for s in gene.names)
            return True

        self.lstGenes = {}
        for (chrom,l) in genomeBase.lstGenes.iteritems():
            l = [gene for gene in l if filt(gene)]
            # chromosomes left without any gene are dropped
            if len(l) > 0:
                self.lstGenes[chrom] = l
        self.name = "Filter from " + genomeBase.name
        print >> sys.stderr, "%d genes -> %d genes" % (sum(len(x) for x in genomeBase.lstGenes.itervalues()), sum(len(x) for x in self.lstGenes.itervalues())),

    self.init(**kwargs)
    print >> sys.stderr, "OK"