def getOrganismsColors(project):
    '''
    Check the colors assigned to the organisms and return a dictionary

    If no colors are assigned, they are assigned automatically: each
    organism lacking a color gets one picked from the 'jet' colormap
    according to its position among all the organisms, and the chosen
    color is saved through Organism.setColor.

    Returns a dictionary: organism ID -> color
    '''
    organism = Organism(project)

    colors = {}
    for org in organism.getAll():
        # An empty string is falsy too, one truthiness test covers both cases
        colors[org.org_id] = org.color if org.color else None

    orgs = list(colors)
    # With a single organism (len - 1) is zero: clamp the span to avoid
    # a ZeroDivisionError, the lone organism then sits at position 0.0
    span = max(len(orgs) - 1, 1)

    for position, org in enumerate(orgs):
        # Automatic assignment, probably not the best choice
        # if we got some organisms assigned and some others not
        if colors[org]:
            continue

        autocolor = plt.get_cmap('jet')(float(position) / span)
        autocolor = pltcls.rgb2hex(autocolor)

        colors[org] = autocolor
        organism.setColor(org, autocolor)
        logger.info('Automatically assigned color to %s'%org)

    return colors
def dMutAdd(project, mutID, mutparent,kind, name='', descr='', color=None):
    '''
    Check and add a mutant

    The mutant is refused (False is returned) when its ID is already in
    use, when the parent is unknown, when the parent is itself a mutant,
    or when the project does not hold exactly one non-mutant organism.
    Returns True once the mutant has been added.
    '''
    organisms = Organism(project)

    # Never override an organism that is already there
    if organisms.isOrg(mutID):
        logger.warning('Organism %s is already present'%mutID)
        logger.warning('Remove it before addition')
        return False

    # The parent must exist...
    if not organisms.isOrg(mutparent):
        logger.error('Parent organism %s not present!'%mutparent)
        return False
    # ...and must not be a mutant itself
    if organisms.isMutant(mutparent):
        logger.error('Parent organism %s cannot be a mutant!'%mutparent)
        return False

    # Non-mutant organisms currently stored
    if len(organisms) - organisms.howManyMutants() != 1:
        logger.error('Only one parent is allowed!')
        return False

    # The color is passed along only when the caller provided one
    options = dict(name=name, description=descr, mutant=True,
                   reference=mutparent, mkind=kind)
    if color:
        options['color'] = color
    organisms.addOrg(mutID, **options)

    logger.info('Mutant %s (%s) added'
                %(mutID, organisms.getOrg(mutID).mkind))
    return True
def dClear(project):
    '''
    Clear all the organisms data

    Always returns True
    '''
    Organism(project).delAllOrgs(True)

    logger.info('Successfully removed all organisms data')
    return True
def dGenomeRemove(project, organisms):
    '''
    Remove all the genomic data about specific organism ID(s)

    IDs that are not organisms of the project are skipped with a warning.
    Always returns True
    '''
    genome = Genome(project)
    known = Organism(project)

    for orgID in organisms:
        if known.isOrg(orgID):
            genome.delProteome(orgID)
            logger.info('Successfully removed genome %s'%orgID)
        else:
            logger.warning('Genome %s is not present: skipping'%orgID)

    return True
def dPhenomeRemove(project, organisms):
    '''
    Remove all the phenomic data about specific organism ID(s)

    IDs that are not organisms of the project are skipped with a warning.
    Once done, a warning is issued if Biolog.atLeastOneParameter() is true.
    Always returns True
    '''
    phenome = Biolog(project)
    known = Organism(project)

    for orgID in organisms:
        if known.isOrg(orgID):
            phenome.delOrg(orgID)
            logger.info('Successfully removed phenome %s'%orgID)
        else:
            logger.warning('Phenome %s is not present: skipping'%orgID)

    if phenome.atLeastOneParameter():
        logger.warning('The activity must be recalculated')

    return True
def dRemove(project, organisms):
    '''
    Remove all the organism info regarding a particular organism ID(s)

    IDs that are not organisms of the project are skipped with a warning.
    Always returns True
    '''
    organism = Organism(project)

    for orgID in organisms:
        if not organism.isOrg(orgID):
            logger.warning('Organism %s is not present: skipping'%orgID)
            continue

        # Collect the mutants before the deletion, to report them afterwards
        mutants = list(organism.getOrgMutants(orgID))

        organism.delOrg(orgID, True)
        logger.info('Successfully removed organism %s'%orgID)

        if mutants:
            logger.info('Removed also %d %s mutant(s)'%(len(mutants),orgID))

    return True
def dGenomeMutAdd(project, mutID, mutfasta):
    '''
    Check and add a mutant

    mutfasta is the Fasta file passed to Genome.addProteome for mutID.
    Returns False when the file does not exist or when mutID is not an
    organism of the project yet, True once the proteome has been added.
    '''
    if not os.path.exists(mutfasta):
        logger.error('Fasta file %s may not be present'%(mutfasta))
        return False
    # Hand an absolute path over, as the other genome/phenome additions do
    mutfasta = os.path.abspath(mutfasta)

    org = Organism(project)
    if not org.isOrg(mutID):
        logger.warning('Organism %s is not present yet!'%mutID)
        return False

    gen = Genome(project)
    gen.addProteome(mutID, mutfasta)

    logger.info('Mutant %s (%s) added, having %d mutated genes'
                %(mutID, org.getOrg(mutID).mkind, gen.howMany(mutID)))
    return True
def dGenomeAdd(project, orgID, filename):
    '''
    Add a single genome

    Returns False when the Fasta file does not exist or when orgID is not
    an organism of the project yet, True once the proteome has been added.
    '''
    if not os.path.exists(filename):
        logger.error('Fasta file %s may not be present'%(filename))
        return False
    fasta = os.path.abspath(filename)

    if not Organism(project).isOrg(orgID):
        logger.warning('Organism %s is not present yet!'%orgID)
        return False

    genome = Genome(project)
    genome.addProteome(orgID, fasta)

    nprots = genome.howMany(orgID)
    logger.info('Added genome %s, having %d proteins'%(orgID, nprots))
    return True
def dGenomeDirAdd(project, folder, extension):
    '''
    Add a series of genomes contained in a directory

    Each file whose last dot-separated token equals extension (a leading
    dot in extension is tolerated) is considered; the organism ID is the
    part of the file name before the first dot. Files whose ID is not an
    organism of the project are skipped; mutants go through dGenomeMutAdd,
    the other organisms through dGenomeAdd.

    Returns False if folder is missing or is not a directory, or as soon
    as one genome cannot be added; True otherwise (even if nothing was
    added).
    '''
    if not os.path.exists(folder):
        logger.error('Fasta folder %s may not be present'%(folder))
        return False
    if not os.path.isdir(folder):
        # os.listdir would raise OSError on a regular file
        logger.error('Fasta folder %s is not a directory'%(folder))
        return False

    logger.info('Looking for files with extension %s'%extension)
    # Accept both 'faa' and '.faa'
    wanted = extension.lstrip('.') if extension else extension

    org = Organism(project)

    added = 0
    # Sorted, so that the additions happen in a reproducible order
    for infile in sorted(os.listdir(folder)):
        if infile.split('.')[-1] != wanted:
            logger.debug('Skipping file %s'%infile)
            continue

        orgID = infile.split('.')[0]
        filename = os.path.join(folder, infile)
        if os.path.isdir(filename):
            continue

        if not org.isOrg(orgID):
            logger.warning('Organism %s is not present yet! Skipping...'%orgID)
            continue

        # Mutants and regular organisms have distinct addition routines
        addGenome = dGenomeMutAdd if org.isMutant(orgID) else dGenomeAdd
        if not addGenome(project, orgID, filename):
            logger.error('Could not add genome %s'%infile)
            return False

        added += 1

    if added > 0:
        logger.info('Added %d genomes from %s'%(added, folder))
    else:
        logger.warning('No genomes were added from %s'%folder)

    return True
def dAdd(project, orgID, name='', descr='', color=None):
    '''
    Add a single organism

    Returns False (with a warning) if orgID is already present,
    True once the organism has been added.
    '''
    organisms = Organism(project)

    # An organism already present is never overridden
    if organisms.isOrg(orgID):
        logger.warning('Organism %s is already present'%orgID)
        logger.warning('Remove it before addition')
        return False

    # The color is passed along only when the caller provided one
    options = dict(name=name, description=descr)
    if color:
        options['color'] = color
    organisms.addOrg(orgID, **options)

    logger.info('Added organism %s'%orgID)
    return True
def dSetKind(project):
    '''
    Set the kind of genomic project and return its value

    Returns 'mutants' if at least one mutant is present, 'single' with
    exactly one organism, 'pangenome' with more than one, and None
    (leaving the project kind untouched) when there are no organisms.
    '''
    proj = Project(project)
    proj.getProject()

    org = Organism(project)

    # Mutants take precedence over any other consideration
    mutants = org.howManyMutants()
    if mutants > 0:
        logger.info('%d mutants are present'%mutants)
        proj.setKind('mutants')
        return 'mutants'

    howmany = org.howMany()
    if howmany == 0:
        logger.info('No organisms are present yet')
        return None

    if howmany == 1:
        logger.info('Just one organism is present')
        kind = 'single'
    else:
        logger.info('%d organisms are present'%howmany)
        kind = 'pangenome'

    proj.setKind(kind)
    return kind
def dPhenomeAdd(project, orgID, filename):
    '''
    Add a single phenome

    The parsed plates are searched for orgID in their sample, strainName
    and strainNumber fields (in this order); the first field containing
    it becomes the strain of every plate. If that field holds more than
    one distinct value the file is refused. If no field contains orgID,
    orgID itself is used as the strain of every plate.

    Returns False when the file is missing, orgID is not an organism of
    the project, no plates are found or the file looks like it contains
    several organisms; True once the wells have been added.
    '''
    if not os.path.exists(filename):
        logger.error('Phenomic file %s may not be present'%(filename))
        return False

    org = Organism(project)
    if not org.isOrg(orgID):
        logger.warning('Organism %s is not present yet!'%orgID)
        return False

    filename = os.path.abspath(filename)

    bparser = BiologParser(filename)
    bparser.parse()

    if len(bparser.plates) == 0:
        logger.warning('No biolog data was found!')
        return False

    # Check the organisms id inside the biolog files
    # (plate attribute, distinct values found), in order of priority
    fields = [(attr, set([getattr(plate, attr) for plate in bparser.plates]))
              for attr in ('sample', 'strainName', 'strainNumber')]

    for attr, values in fields:
        if orgID not in values:
            logger.debug('No sign of %s in %s field'%(orgID, attr))

    # TODO: regular expression search
    for attr, values in fields:
        if orgID not in values:
            continue

        if len(values) > 1:
            logger.warning('''More than one organism ID may be present in this phenomic data file!''')
            # '%s' formatting: the values may not all be strings (e.g. None)
            logger.warning(' '.join(['%s'%value for value in values]))
            return False

        for plate in bparser.plates:
            plate.strain = getattr(plate, attr)
        break
    else:
        logger.warning('''The organism ID you provided was not found inside the phenomic data file''')
        logger.info('''Using it anyway to add this data''')
        # The strain is read below: it has to be set in this case as well
        for plate in bparser.plates:
            plate.strain = orgID

    # Prepare a series of Plate objects to catch the replicas
    dPlates = {}
    for plate in bparser.plates:
        if plate.plate_id not in dPlates:
            dPlates[plate.plate_id] = Plate(plate.plate_id)
        dPlates[plate.plate_id].addData(plate.strain, plate)

    # Grep the wells
    wells = [w for plate in dPlates.values() for w in plate.getWells()]

    # Add to the project
    biolog = Biolog(project)
    biolog.addWells(wells, clustered=False)

    logger.info('Added phenome %s, having %d biolog plates (%d wells)'%
                (orgID, len(dPlates), len(wells)))

    return True
def _exportStripPrefix(identifier, prefix):
    '''
    Return identifier without its leading prefix (unchanged if absent)
    '''
    # str.lstrip would strip a set of characters, not a prefix
    if identifier.startswith(prefix):
        return identifier[len(prefix):]
    return identifier

def _exportKeggLinks(links, fname, prefix):
    '''
    Write (protein ID, Kegg ID) pairs to a tab-delimited file

    The prefix is removed from each Kegg ID; a file left empty is deleted.
    Returns the number of links written.
    '''
    written = 0
    with open(fname, 'w') as fout:
        for prot_id, kegg_id in links:
            fout.write('%s\t%s\n'%(prot_id,
                                   _exportStripPrefix(kegg_id, prefix)))
            written += 1

    if written == 0:
        os.remove(fname)

    return written

def dGenomeExport(project):
    '''
    Export the genomic data to files in the current working directory

    For each organism: the proteins (<org_id>.faa, Fasta), the KO links
    (ko_<org_id>.tsv) and the Kegg reactions links
    (reactions_<org_id>.tsv). For a pangenome project with ortholog
    groups: pangenome.tsv (group, protein) and pangenome_category.tsv
    (group, core/accessory/unique, organisms joined by '-').

    Returns False when no organisms are present, True otherwise.
    '''
    # Is there something to be exported?
    organism = Organism(project)

    if organism.howMany() == 0:
        logger.info('No genomic data can be exported at this time')
        return False

    logger.info('Exporting protein data')

    genome = Genome(project)

    for org in organism.getAll():
        fname = '%s.faa'%org.org_id
        # The with statement closes (and flushes) the handle in any case
        with open(fname, 'w') as fout:
            nprots = SeqIO.write(list(genome.getRecords(org.org_id)),
                                 fout, 'fasta')
        logger.info('Saved %d proteins from %s (%s)'%(nprots,
                                                      org.org_id,
                                                      fname))

    logger.info('Exporting Kegg data')

    logger.info('Exporting KO map data')

    kegg = Kegg(project)

    for org in organism.getAll():
        fname = 'ko_%s.tsv'%org.org_id
        i = _exportKeggLinks(kegg.getAllKO(org.org_id), fname, 'ko:')

        if i == 0:
            logger.warning('No KO links available for %s'%org.org_id)
        else:
            logger.info('Saved %d KO links for %s (%s)'%(i, org.org_id,
                                                         fname))

    logger.info('Exporting Kegg reactions data')

    for org in organism.getAll():
        fname = 'reactions_%s.tsv'%org.org_id
        i = _exportKeggLinks(kegg.getAllReactions(org.org_id), fname, 'rn:')

        if i == 0:
            logger.warning('No Kegg reactions available for %s'%org.org_id)
        else:
            logger.info('Saved %d Kegg reactions links for %s (%s)'%
                        (i, org.org_id, fname))

    proj = Project(project)

    if proj.isPanGenome():
        logger.info('Exporting pangenome data')

        dG = genome.getPanGenome()
        if len(dG) == 0:
            logger.warning('No pangenome available')
        else:
            fname = 'pangenome.tsv'
            with open(fname, 'w') as fout:
                for group, prots in dG.items():
                    for prot in prots:
                        fout.write('%s\t%s\n'%(group, prot))

            logger.info('Exported %d orthologs (%s)'%(len(dG), fname))

            # The category table makes sense only if ortholog groups exist
            fname = 'pangenome_category.tsv'
            dOrgs = genome.getPanGenomeOrgs()
            # The getters are called one at a time, right before their use
            categories = (('core', genome.getCore),
                          ('accessory', genome.getAcc),
                          ('unique', genome.getUni))
            with open(fname, 'w') as fout:
                for category, getGroups in categories:
                    for group in getGroups():
                        fout.write('%s\t%s\t%s\n'%(group.group_id,
                                                   category,
                                                   '-'.join(dOrgs[group.group_id])))

            logger.info('Exported orthologs informations (%s)'%fname)

    return True
def _statsEmit(line, doPrint):
    '''
    Send a line of statistics to the standard output or to the logger
    '''
    if doPrint:
        print(line)
    else:
        logger.info(line)

def _statsRow(values):
    '''
    Join the provided values as a tab-delimited line
    '''
    return '\t'.join([str(x) for x in values])

def dGenomeStats(project, svg=False, doPrint=True):
    '''
    Compute, output and plot the genomic statistics of the project

    The project kind is refreshed through dSetKind and then:
    - 'single' or 'pangenome': one line for each organism, plus one line
      for each pangenome category (core, accessory, unique) if the
      project is a pangenome
    - 'mutants': for each non-mutant organism, one line for it and one
      for each of its mutants; the mutants counts are relative to the
      reference (subtracted for deletions, added for insertions)

    The tables are tab-delimited and are printed when doPrint is true,
    logged otherwise; svg is handed over to the plotting functions.

    Returns False if no statistics can be computed, True otherwise.
    '''
    # Which project are we talking about?
    kind = dSetKind(project)

    proj = Project(project)
    organism = Organism(project)
    genome = Genome(project)
    kegg = Kegg(project)

    if kind == 'single' or kind == 'pangenome':
        logger.info('Single genomes stats')
        # Single genomes stats
        # Header
        header = '\t'.join( ['ID', 'name', 'description', 'proteome size',
                             'mapped to kegg', 'KEGG orthology IDs',
                             'pathways', 'reactions'] )
        _statsEmit(header, doPrint)

        lOrg = []
        for org in organism.getAll():
            org_id = org.org_id
            name = org.name if org.name else 'NONE'
            description = org.description if org.description else 'NONE'

            prots = genome.howMany(org_id)

            mapped = kegg.howManyMapped(org_id)
            ko = kegg.howManyKO(org_id)
            react = kegg.howManyReactions(org_id)
            path = kegg.howManyPathways(org_id)

            _statsEmit(_statsRow([org_id, name, description, prots,
                                  mapped, ko, path, react]), doPrint)

            lOrg.append([org_id, prots, mapped, react])

        plotMapBars(lOrg, 'Single genomes statistics', 'single', svg)

        if proj.isPanGenome():
            logger.info('Pangenome stats')
            # Pangenome stats
            # Header
            header = '\t'.join( ['kind', 'size', 'mapped to kegg',
                                 'KEGG orthology IDs', 'pathways',
                                 'reactions'] )
            _statsEmit(header, doPrint)

            core, acc, uni = (genome.getLenCore(), genome.getLenAcc(),
                              genome.getLenUni())

            # Each count is queried once and used for table and plot alike
            lPanGenome = []
            for category, label, size in (('core', 'Core', core),
                                          ('accessory', 'Accessory', acc),
                                          ('unique', 'Unique', uni)):
                mapped = kegg.howManyMapped(pangenome=category)
                ko = kegg.howManyKO(pangenome=category)
                path = kegg.howManyPathways(pangenome=category)
                react = kegg.howManyReactions(pangenome=category)

                _statsEmit(_statsRow([category, size, mapped, ko,
                                      path, react]), doPrint)

                lPanGenome.append([label, size, mapped, react])

            plotMapBars(lPanGenome, 'PanGenome statistics',
                        'pangenome_stats', svg)
            plotPanGenome(core, acc, uni, svg)

    elif kind == 'mutants':
        refs = [org.org_id
                for org in organism.getAll()
                if not organism.isMutant(org.org_id)]

        # Header
        header = '\t'.join( ['ID', 'name', 'description', 'kind',
                             'proteome size', 'mapped to kegg',
                             'reactions'] )

        for ref_id in refs:
            logger.info('Mutants of %s stats'%ref_id)
            _statsEmit(header, doPrint)

            muts = list(organism.getOrgMutants(ref_id))

            # Reference counts: the same for each one of its mutants
            refProts = genome.howMany(ref_id)
            refMapped = kegg.howManyMapped(ref_id)
            refReact = kegg.howManyReactions(ref_id)

            lOrg = []
            for org_id in [ref_id] + muts:
                org = organism.getOrg(org_id)

                name = org.name if org.name else 'NONE'
                description = org.description if org.description else 'NONE'
                # Anything that is not a deletion/insertion is a wild-type
                if org.mkind in ('deletion', 'insertion'):
                    mkind = org.mkind
                else:
                    mkind = 'wild-type'

                prots = genome.howMany(org_id)
                mapped = kegg.howManyMapped(org_id)
                react = kegg.howManyReactions(org_id)

                # The mutants counts are relative to the reference
                if mkind == 'deletion':
                    prots = refProts - prots
                    mapped = refMapped - mapped
                    react = refReact - react
                elif mkind == 'insertion':
                    prots = refProts + prots
                    mapped = refMapped + mapped
                    react = refReact + react

                _statsEmit(_statsRow([org_id, name, description, mkind,
                                      prots, mapped, react]), doPrint)

                lOrg.append([org_id, prots, mapped, react])

            plotMapBars(lOrg, 'Wild-type (%s) and mutants statistics'%ref_id,
                        '%s'%ref_id, svg)

    else:
        logger.info('No statistics can be computed at this time')
        return False

    return True
def dPhenomeMultiAdd(project, filename):
    '''
    Add a single phenomic file with multiple organisms in it

    The organism IDs are taken from the strainName field of the parsed
    plates (None and empty values are ignored); the IDs that are not
    organisms of the project are skipped with a warning.

    Returns False when the file is missing, no plates are found or the
    strainName field is always empty; True otherwise.
    '''
    if not os.path.exists(filename):
        logger.error('Phenomic file %s may not be present'%(filename))
        return False

    filename = os.path.abspath(filename)

    bparser = BiologParser(filename)
    bparser.parse()

    if len(bparser.plates) == 0:
        logger.warning('No biolog data was found!')
        return False

    # Check the organism ids inside the biolog files
    # Assuming the names are correct AND stored inside the strainName field
    logger.debug('Assuming organism IDs are correct and inside the field strainName')
    strainNames = set([plate.strainName for plate in bparser.plates])
    strainNames.discard(None)
    strainNames.discard('')

    if len(strainNames) == 0:
        logger.warning('''Field strainName doesn't contain any value (%s)'''%filename)
        return False

    logger.info('Found the following organism IDs: %s'%' '.join(strainNames))

    # Set the strain and group the plates by organism in a single pass
    platesByOrg = {}
    for plate in bparser.plates:
        plate.strain = plate.strainName
        platesByOrg.setdefault(plate.strain, []).append(plate)

    # TODO: regular expressions verification

    # Loop-invariant objects: built once, not once for each organism
    org = Organism(project)
    biolog = Biolog(project)

    for orgID in strainNames:
        if not org.isOrg(orgID):
            logger.warning('Organism %s is not present yet! Skipping...'%orgID)
            continue

        # Prepare a series of Plate objects to catch the replicas
        dPlates = {}
        for plate in platesByOrg.get(orgID, []):
            if plate.plate_id not in dPlates:
                dPlates[plate.plate_id] = Plate(plate.plate_id)
            dPlates[plate.plate_id].addData(plate.strain, plate)

        # Grep the wells
        wells = [w for plate in dPlates.values() for w in plate.getWells()]

        # Add to the project
        biolog.addWells(wells, clustered=False)

        logger.info('Added phenome %s, having %d biolog plates (%d wells)'%
                    (orgID, len(dPlates), len(wells)))

    return True