def addOrganismToChado(gff, organismName):
    """Make sure the organism exists in Chado, then bulk load its GFF3 file.

    The organism is looked up by common name. When no row exists, a new
    Organism is created whose genus and species are the first two
    whitespace-separated tokens of ``organismName`` and whose id is one more
    than the highest existing organism_id (0 for an empty table). Afterwards
    ``gmod_bulk_load_gff3.pl`` is run with the credentials of the default
    Django database.

    Parameters:
        gff          -- path of the GFF3 file handed to the bulk loader
        organismName -- common name of the organism, e.g. "Genus species ..."

    Returns:
        The organism_id of the existing or newly created organism.

    Raises:
        Exception -- a new organism has to be created but ``organismName``
                     has fewer than two tokens.
    """
    try:
        organism = Organism.objects.get(common_name=organismName)
    except ObjectDoesNotExist:
        organism = None

    if organism is not None:
        # The organism is already registered: report its id. (The id used to
        # be bound only while creating a new organism, so this path ended in
        # a NameError at the return statement.)
        organismId = organism.organism_id
    else:
        # Only the row with the highest id is needed to pick the next id.
        newest = list(Organism.objects.order_by('-organism_id')[:1])
        organismId = 0
        if newest:
            organismId = newest[0].organism_id + 1

        organismNameArr = organismName.split()
        if len(organismNameArr) < 2:
            raise Exception(
                'Organism name does not have enough tokens to find a genus and species: '
                + organismName)
        genus = organismNameArr[0]
        species = organismNameArr[1]

        organism = Organism(organism_id=organismId,
                            abbreviation=genus[0] + '. ' + species,
                            genus=genus,
                            species=species,
                            common_name=organismName)
        organism.save()

    database = settings.DATABASES['default']
    # The database name has to be introduced by --dbname (it was preceded by
    # a blank argument before), matching the loader call used for changed
    # GFF files.
    args = [
        '--organism', organismName,
        "--gfffile", gff,
        "--dbname", database['NAME'],
        "--dbuser", database['USER'],
        "--dbpass", database['PASSWORD'],
        "--random_tmp_dir",
    ]
    runProgram('gmod_bulk_load_gff3.pl', args)
    return organismId
def __processGffFilesNotNew(self, changed):
    """Reload the GBrowse SQLite database and Chado for changed GFF files.

    For each path in ``changed`` the GFF file is run through GFFRewriter
    (unknown cv terms and colors), loaded into a SQLite database next to the
    file with ``bp_seqfeature_load.pl``, its GBrowse entry is edited, and the
    file is bulk loaded into Chado with ``gmod_bulk_load_gff3.pl``.

    Parameters:
        changed -- iterable of paths to GFF files; a GenBank file with the
                   same stem and a ``.gbk`` extension is read from the same
                   directory to obtain the organism name.
    """
    database = settings.DATABASES['default']

    for gff in changed:
        loc = os.path.dirname(gff)
        dbName = os.path.splitext(os.path.basename(gff))[0] + '.db'
        dbName = os.path.join(loc, dbName)

        # NOTE(review): genbank_id is not defined inside this method;
        # presumably it is a module-level name -- confirm.
        gffRewriter = GFFRewriter(filename=gff,
                                  outfile=gff + ".sorted.prepared",
                                  accession=genbank_id)
        gffRewriter.addUnknownCvTerms({
            'user': database['USER'],
            'password': database['PASSWORD'],
            'db': database['NAME']
        })
        gffRewriter.addColor({
            'user': database['USER'],
            'password': database['PASSWORD'],
            'db': 'go'
        })
        # The rewriter's error is fetched here but not reported anywhere.
        error = gffRewriter.getError()

        # run the sqlite database loader to be able to add it to GBrowse
        # since the name should be preserved, no changes need to be made
        # to the GBrowse configuration file
        # NOTE(review): the original gff is loaded here, not the
        # ".sorted.prepared" output written by the rewriter -- confirm that
        # this is intended.
        args = ['-a', 'DBI::SQLite', '-c', '-f', '-d', dbName, gff]
        runProgram('bp_seqfeature_load.pl', args)

        # The GenBank file sits next to the GFF file and differs only in its
        # extension. (os.path.join used to be applied here, which produced
        # "<stem>/.gbk" instead of "<stem>.gbk".)
        gbk = os.path.splitext(gff)[0] + '.gbk'
        parser = GenBank.RecordParser()
        with open(gbk) as gbkHandle:
            record = parser.parse(gbkHandle)
        organismName = record.organism

        organismDir = os.path.basename(loc)
        GenomeDBUtil.editGBrowseEntry(gff, dbName, organismDir, organismName)

        # now edit the record in Chado
        args = [
            '--organism', organismName,
            "--gfffile", gff,
            "--dbname", database['NAME'],
            "--dbuser", database['USER'],
            "--dbpass", database['PASSWORD'],
            "--random_tmp_dir",
        ]
        runProgram('gmod_bulk_load_gff3.pl', args)
def __createGFFFiles(self, newOrganisms):
    """Create GFF3 files from the GenBank files of new organisms.

    Every entry of ``newOrganisms`` names a directory below
    NEW_GENOMIC_DATA_DIR. The first file with a ``.gbk`` extension found
    directly inside that directory is converted with ``bp_genbank2gff3.pl``,
    writing its output into the same directory. Directories without a
    ``.gbk`` file are skipped.

    Parameters:
        newOrganisms -- iterable of directory names relative to
                        NEW_GENOMIC_DATA_DIR
    """
    for dirName in newOrganisms:
        organismPath = os.path.join(NEW_GENOMIC_DATA_DIR, dirName)

        # the first os.walk() entry describes the directory itself; index 2
        # holds the names of the files directly inside it
        fileNames = next(os.walk(organismPath))[2]
        gbkFiles = [name for name in fileNames
                    if os.path.splitext(name)[1] == '.gbk']
        if not gbkFiles:
            continue

        gbkPath = os.path.join(organismPath, gbkFiles[0])
        runProgram("bp_genbank2gff3.pl",
                   ["-noCDS", "-s", "-o", organismPath, gbkPath])
def runFormatDB(fastaName, loc, protein=False):
    """Run ``formatdb`` on a FASTA file inside a dedicated sub-directory.

    The FASTA file is temporarily moved into ``proteinDB`` (protein) or
    ``nucleotideDB`` (nucleotide) below ``loc`` so that formatdb is run on it
    from inside that directory, and is moved back to ``loc`` afterwards.

    Parameters:
        fastaName -- file name of the FASTA file inside ``loc``
        loc       -- directory that contains the FASTA file
        protein   -- True passes ``-p T`` to formatdb and uses ``proteinDB``;
                     False passes ``-p F`` and uses ``nucleotideDB``
    """
    if protein:
        option, dbDir = 'T', 'proteinDB'
    else:
        option, dbDir = 'F', 'nucleotideDB'

    # must move the file into the new directory to correctly place the
    # formatdb information
    newLoc = os.path.join(loc, dbDir)
    originalFile = os.path.join(loc, fastaName)
    newFile = os.path.join(newLoc, fastaName)

    # The directory may be left over from an earlier run; creating it
    # unconditionally would raise OSError in that case.
    if not os.path.isdir(newLoc):
        os.mkdir(newLoc)

    shutil.move(originalFile, newLoc)
    try:
        args = ['-p', option, '-i', newFile]
        runProgram('formatdb', args)
    finally:
        # Always put the FASTA file back, even when formatdb fails, so that
        # it is not stranded in the database directory.
        shutil.move(newFile, loc)
def addOrganismToChado(gff, organismName):
    """Register the organism in Chado if needed and bulk load its GFF3 file.

    An Organism with ``common_name == organismName`` is reused when present.
    Otherwise one is created: genus and species are the first two
    whitespace-separated tokens of ``organismName`` and the id is the highest
    existing organism_id plus one (0 when there are no organisms yet).
    ``gmod_bulk_load_gff3.pl`` is then run against the default Django
    database.

    Parameters:
        gff          -- path of the GFF3 file to load
        organismName -- common name of the organism

    Returns:
        The organism_id of the organism that was found or created.

    Raises:
        Exception -- the organism has to be created but ``organismName``
                     contains fewer than two tokens.
    """
    try:
        organism = Organism.objects.get(common_name=organismName)
    except ObjectDoesNotExist:
        organism = None

    if organism is None:
        # Fetch just the row with the highest id to derive the next free id.
        highest = list(Organism.objects.order_by('-organism_id')[:1])
        organismId = 0
        if highest:
            organismId = highest[0].organism_id + 1

        tokens = organismName.split()
        if len(tokens) < 2:
            raise Exception('Organism name does not have enough tokens to find a genus and species: ' + organismName)
        genus, species = tokens[0], tokens[1]

        organism = Organism(organism_id=organismId,
                            abbreviation=genus[0] + '. ' + species,
                            genus=genus,
                            species=species,
                            common_name=organismName)
        organism.save()
    else:
        # Existing organism: return its own id. This branch previously hit a
        # NameError because the id was only assigned during creation.
        organismId = organism.organism_id

    database = settings.DATABASES['default']
    # "--dbname" replaces the blank argument that used to precede the
    # database name, in line with the other gmod_bulk_load_gff3.pl call in
    # this file.
    args = ['--organism', organismName,
            "--gfffile", gff,
            "--dbname", database['NAME'],
            "--dbuser", database['USER'],
            "--dbpass", database['PASSWORD'],
            "--random_tmp_dir"]
    runProgram('gmod_bulk_load_gff3.pl', args)
    return organismId
def __processGffFilesNotNew(self, changed):
    """Refresh GBrowse and Chado for GFF files of already known organisms.

    Each path in ``changed`` is passed through GFFRewriter (unknown cv terms
    and colors), loaded into a SQLite database beside the file using
    ``bp_seqfeature_load.pl``, has its GBrowse entry edited, and is finally
    bulk loaded into Chado using ``gmod_bulk_load_gff3.pl``.

    Parameters:
        changed -- iterable of GFF file paths; the organism name is read from
                   the GenBank file that shares the GFF file's stem and has a
                   ``.gbk`` extension.
    """
    database = settings.DATABASES['default']

    for gff in changed:
        loc = os.path.dirname(gff)
        dbName = os.path.splitext(os.path.basename(gff))[0] + '.db'
        dbName = os.path.join(loc, dbName)

        # NOTE(review): genbank_id is not assigned anywhere in this method;
        # presumably a module-level name -- confirm.
        gffRewriter = GFFRewriter(filename=gff,
                                  outfile=gff + ".sorted.prepared",
                                  accession=genbank_id)
        gffRewriter.addUnknownCvTerms({
            'user': database['USER'],
            'password': database['PASSWORD'],
            'db': database['NAME']
        })
        gffRewriter.addColor({
            'user': database['USER'],
            'password': database['PASSWORD'],
            'db': 'MyGO'
        })
        # The rewriter's error is fetched here but not reported anywhere.
        error = gffRewriter.getError()

        # run the sqlite database loader to be able to add it to GBrowse
        # since the name should be preserved, no changes need to be made
        # to the GBrowse configuration file
        # NOTE(review): this loads the original gff rather than the
        # ".sorted.prepared" file produced by the rewriter -- confirm that
        # this is intended.
        args = ['-a', 'DBI::SQLite', '-c', '-f', '-d', dbName, gff]
        runProgram('bp_seqfeature_load.pl', args)

        # Swap the extension to find the GenBank file. Joining '.gbk' as a
        # path component (as was done before) yields "<stem>/.gbk", which is
        # not the sibling file.
        gbk = os.path.splitext(gff)[0] + '.gbk'
        parser = GenBank.RecordParser()
        with open(gbk) as gbkHandle:
            record = parser.parse(gbkHandle)
        organismName = record.organism

        organismDir = os.path.basename(loc)
        GenomeDBUtil.editGBrowseEntry(gff, dbName, organismDir, organismName)

        # now edit the record in Chado
        args = ['--organism', organismName,
                "--gfffile", gff,
                "--dbname", database['NAME'],
                "--dbuser", database['USER'],
                "--dbpass", database['PASSWORD'],
                "--random_tmp_dir"]
        runProgram('gmod_bulk_load_gff3.pl', args)
def __processGffFilesNew(self, newOrganismDirs):
    """Set up BLAST, GBrowse and Chado data for newly added organisms.

    Every entry of ``newOrganismDirs`` names a directory below
    NEW_GENOMIC_DATA_DIR. For each directory:

    * ``formatdb`` is run on the ``.faa`` (protein) and ``.ffn``
      (nucleotide) files that are present;
    * when both a ``.gff`` and a ``.gbk`` file are present, the GFF file is
      rewritten with colors, loaded into a SQLite database for GBrowse, the
      organism is added to Chado and a new GBrowse entry is created.

    Progress is recorded through ``self.report.addLogEntry``.

    Parameters:
        newOrganismDirs -- iterable of directory names relative to
                           NEW_GENOMIC_DATA_DIR
    """
    database = settings.DATABASES['default']

    for newOrganism in newOrganismDirs:
        # start by creating the BLAST database
        newOrganism = os.path.join(NEW_GENOMIC_DATA_DIR, newOrganism)
        print(newOrganism)

        # only files directly inside the organism directory are considered
        organismFiles = next(os.walk(newOrganism))[2]
        faa = None
        ffn = None
        gff = None
        gbk = None
        for organismFile in organismFiles:
            extension = os.path.splitext(organismFile)[1]
            if (extension == '.ffn'):
                ffn = organismFile
            elif (extension == '.faa'):
                faa = organismFile
            elif (extension == '.gff'):
                gff = organismFile
            elif (extension == '.gbk'):
                gbk = organismFile
            # stop scanning as soon as one file of every kind was found
            if (faa and ffn and gff and gbk):
                break

        if (faa):
            GenomeDBUtil.runFormatDB(os.path.basename(faa), newOrganism, protein=True)
            self.report.addLogEntry('Ran formatdb successully on ' + faa)
        if (ffn):
            GenomeDBUtil.runFormatDB(os.path.basename(ffn), newOrganism, protein=False)
            self.report.addLogEntry('Ran formatdb successully on ' + ffn)

        # process the gff and genbank files for creating the databases
        if not (gff and gbk):
            continue

        # create the sqlite database for GBrowse and create the configuration file
        # for GBrowse hook up
        dbName = os.path.splitext(os.path.basename(gff))[0] + '.db'
        dbName = os.path.join(newOrganism, dbName)
        gff = os.path.join(newOrganism, gff)
        gbk = os.path.join(newOrganism, gbk)

        parser = GenBank.RecordParser()
        # close the GenBank file once the record has been parsed
        with open(gbk) as gbkHandle:
            record = parser.parse(gbkHandle)
        organismName = record.organism
        accession = record.accession[0]
        self.report.addLogEntry('Found organism name ' + organismName)

        # create a brand new GBrowse configuration file
        examiner = GFFExaminer()
        # close the GFF file once its limits have been collected
        with open(gff) as gffHandle:
            limits = examiner.available_limits(gffHandle)
        landmark = list(limits['gff_id'].keys())[0][0]

        gffRewriter = GFFRewriter(filename=gff,
                                  outfile=gff + ".sorted.prepared",
                                  accession=accession)
        # NOTE: the addUnknownCvTerms step was already disabled (commented
        # out) for new organisms; only the colors are added here.
        gffRewriter.addColor({
            'user': database['USER'],
            'password': database['PASSWORD'],
            'db': 'MyGO'
        })
        error = gffRewriter.getError()
        print(error)

        # from here on the rewritten file is the one that gets loaded
        gff = gff + ".sorted.prepared"
        args = ['-a', 'DBI::SQLite', '-c', '-f', '-d', dbName, gff]
        runProgram('bp_seqfeature_load.pl', args)
        self.report.addLogEntry('Successfully created sqlite database for ' + str(gff))

        organismDir = os.path.basename(newOrganism)

        # now edit the record in Chado by first adding the organism and then adding
        # bulk loading the information from gff3
        organismId = GenomeDBUtil.addOrganismToChado(gff, organismName)
        GenomeDBUtil.createNewGBrowseEntry(landmark, dbName, organismDir, organismName, organismId)
        # logged only after the entry really exists, so a failure above does
        # not leave a misleading success message in the report
        self.report.addLogEntry('Added new GBrowse entry for ' + organismName)