Example #1
0
def addOrganismToChado(gff, organismName):
    """Register an organism in Chado if needed, then bulk load its GFF3 file.

    Looks up an ``Organism`` whose ``common_name`` equals ``organismName``.
    When none exists, a new row is saved using the first two
    whitespace-separated tokens of the name as genus and species, and the
    next free ``organism_id`` (highest existing id + 1, or 0 when there are
    no organisms yet).  ``gmod_bulk_load_gff3.pl`` is then run on ``gff``
    with the connection details of the default Django database.

    Args:
        gff: Path of the GFF3 file passed to the loader via ``--gfffile``.
        organismName: Common name of the organism; it needs at least two
            tokens when the organism has to be created.

    Returns:
        The ``organism_id`` of the existing or newly created organism.

    Raises:
        Exception: If the organism has to be created but ``organismName``
            has fewer than two whitespace-separated tokens.
    """
    try:
        organism = Organism.objects.get(common_name=organismName)
    except ObjectDoesNotExist:
        organismNameArr = organismName.split()
        if len(organismNameArr) < 2:
            raise Exception(
                'Organism name does not have enough tokens to find a genus and species: '
                + organismName)
        genus = organismNameArr[0]
        species = organismNameArr[1]

        # Only the row with the highest id is needed, so limit the query
        # instead of loading every organism just to count them.
        latest = Organism.objects.order_by('-organism_id')[:1]
        nextId = 0
        if latest:
            nextId = latest[0].organism_id + 1

        organism = Organism(organism_id=nextId,
                            abbreviation=genus[0] + '. ' + species,
                            genus=genus,
                            species=species,
                            common_name=organismName)
        organism.save()

    dbSettings = settings.DATABASES['default']
    # NOTE(review): the database password is passed as a command-line
    # argument; confirm that is acceptable for this deployment.
    args = [
        '--organism', organismName,
        '--gfffile', gff,
        # The database name must follow --dbname, matching the other
        # gmod_bulk_load_gff3.pl invocation in this code base.
        '--dbname', dbSettings['NAME'],
        '--dbuser', dbSettings['USER'],
        '--dbpass', dbSettings['PASSWORD'],
        '--random_tmp_dir',
    ]
    runProgram('gmod_bulk_load_gff3.pl', args)

    # Read the id from the model so this also works when the organism
    # already existed (nextId is only bound in the creation branch).
    return organism.organism_id
Example #2
0
    def __processGffFilesNotNew(self, changed):
        """Reprocess GFF files that changed for already-known organisms.

        For each path in ``changed`` this runs the file through
        ``GFFRewriter``, reloads the SQLite database that sits next to the
        GFF with ``bp_seqfeature_load.pl``, reads the organism name from the
        matching GenBank file, updates the GBrowse entry and finally bulk
        loads the GFF into Chado with ``gmod_bulk_load_gff3.pl``.

        Args:
            changed: Iterable of GFF file paths.
        """
        dbSettings = settings.DATABASES['default']

        for gff in changed:
            loc = os.path.dirname(gff)
            gffStem = os.path.splitext(gff)[0]
            # <dir>/<gff base name>.db, in the same directory as the GFF
            dbName = os.path.join(
                loc, os.path.splitext(os.path.basename(gff))[0] + '.db')

            # NOTE(review): genbank_id is not defined in this method; it
            # must come from an enclosing scope -- confirm it exists.
            gffRewriter = GFFRewriter(filename=gff,
                                      outfile=gff + ".sorted.prepared",
                                      accession=genbank_id)

            gffRewriter.addUnknownCvTerms({
                'user': dbSettings['USER'],
                'password': dbSettings['PASSWORD'],
                'db': dbSettings['NAME'],
            })

            gffRewriter.addColor({
                'user': dbSettings['USER'],
                'password': dbSettings['PASSWORD'],
                'db': 'go',
            })

            # NOTE(review): the rewriter's error value is fetched but never
            # inspected or reported -- confirm whether it should be.
            error = gffRewriter.getError()

            # run the sqlite database loader to be able to add it to GBrowse
            # since the name should be preserved, no changes need to be made
            # to the GBrowse configuration file
            # NOTE(review): this loads the original gff, not the
            # ".sorted.prepared" output written above -- confirm intended.
            args = ['-a', 'DBI::SQLite', '-c', '-f', '-d', dbName, gff]
            runProgram('bp_seqfeature_load.pl', args)

            # The GenBank file shares the GFF's path stem.  Append the
            # extension instead of os.path.join(), which would produce
            # "<stem>/.gbk".
            gbk = gffStem + '.gbk'
            parser = GenBank.RecordParser()
            with open(gbk) as gbkHandle:
                record = parser.parse(gbkHandle)
            organismName = record.organism
            organismDir = os.path.basename(loc)

            GenomeDBUtil.editGBrowseEntry(gff, dbName, organismDir,
                                          organismName)

            # now edit the record in Chado
            args = [
                '--organism', organismName,
                '--gfffile', gff,
                '--dbname', dbSettings['NAME'],
                '--dbuser', dbSettings['USER'],
                '--dbpass', dbSettings['PASSWORD'],
                '--random_tmp_dir',
            ]
            runProgram('gmod_bulk_load_gff3.pl', args)
Example #3
0
	def __createGFFFiles(self, newOrganisms):
		"""Convert the GenBank file of each new organism directory to GFF3.

		Each entry of ``newOrganisms`` is joined onto ``NEW_GENOMIC_DATA_DIR``.
		The first file directly inside that directory whose extension is
		``.gbk`` is passed to ``bp_genbank2gff3.pl`` with the directory itself
		as the output location.  Directories without a ``.gbk`` file are
		skipped.
		"""
		for organismDirName in newOrganisms:
			organismPath = os.path.join(NEW_GENOMIC_DATA_DIR, organismDirName)
			# The first tuple yielded by os.walk describes the top directory;
			# index 2 holds the names of the files directly inside it.
			topLevelFiles = next(os.walk(organismPath))[2]

			genbankFile = next(
				(name for name in topLevelFiles
					if os.path.splitext(name)[1] == '.gbk'),
				None)
			if not genbankFile:
				continue

			genbankPath = os.path.join(organismPath, genbankFile)
			runProgram("bp_genbank2gff3.pl",  ["-noCDS", "-s", "-o", organismPath, genbankPath])
Example #4
0
def runFormatDB(fastaName, loc, protein=False):
    """Run ``formatdb`` on a FASTA file inside a dedicated sub-directory.

    The file ``loc/fastaName`` is moved into ``loc/proteinDB`` (when
    ``protein`` is true) or ``loc/nucleotideDB`` (otherwise), ``formatdb``
    is run on it there, and the file is then moved back into ``loc``.

    Args:
        fastaName: File name of the FASTA file, relative to ``loc``.
        loc: Directory that contains the FASTA file.
        protein: Selects the ``-p`` option passed to ``formatdb``
            ('T' when true, 'F' when false) and the sub-directory name.
    """
    if protein:
        option = 'T'
        dbDir = 'proteinDB'
    else:
        option = 'F'
        dbDir = 'nucleotideDB'

    # must move the file into the new directory to correctly place the
    # formatdb information
    newLoc = os.path.join(loc, dbDir)
    originalFile = os.path.join(loc, fastaName)
    newFile = os.path.join(newLoc, fastaName)

    # Tolerate a directory left over from an earlier run instead of
    # failing in os.mkdir.
    if not os.path.isdir(newLoc):
        os.mkdir(newLoc)
    shutil.move(originalFile, newLoc)

    try:
        args = ['-p', option, '-i', newFile]
        runProgram('formatdb', args)
    finally:
        # Put the FASTA file back even when formatdb fails, so it is not
        # left stranded in the database directory.
        shutil.move(newFile, loc)
def addOrganismToChado(gff, organismName):
    """Make sure the organism exists in Chado and bulk load its GFF3 file.

    An ``Organism`` with ``common_name == organismName`` is fetched.  If it
    does not exist, one is created: genus and species are the first two
    whitespace-separated tokens of ``organismName`` and the id is one more
    than the highest existing ``organism_id`` (0 when there are none).
    Afterwards ``gmod_bulk_load_gff3.pl`` is run for ``gff`` using the
    default Django database settings.

    Args:
        gff: Path of the GFF3 file passed to the loader via ``--gfffile``.
        organismName: Common name of the organism; at least two tokens are
            required when the organism has to be created.

    Returns:
        The ``organism_id`` of the existing or newly created organism.

    Raises:
        Exception: If a new organism is needed and ``organismName`` has
            fewer than two whitespace-separated tokens.
    """
    try:
        organism = Organism.objects.get(common_name=organismName)
    except ObjectDoesNotExist:
        nameTokens = organismName.split()
        if len(nameTokens) < 2:
            raise Exception('Organism name does not have enough tokens to find a genus and species: ' + organismName)
        genus, species = nameTokens[0], nameTokens[1]

        # Fetch only the highest-id row rather than the whole table.
        highest = Organism.objects.order_by('-organism_id')[:1]
        nextId = highest[0].organism_id + 1 if highest else 0

        organism = Organism(organism_id=nextId,
                            abbreviation=genus[0] + '. ' + species,
                            genus=genus,
                            species=species,
                            common_name=organismName)
        organism.save()

    database = settings.DATABASES['default']
    # NOTE(review): the password ends up on the loader's command line;
    # confirm that is acceptable for this deployment.
    args = [
        '--organism', organismName,
        '--gfffile', gff,
        # "--dbname" replaces a stray " " argument, so the database name is
        # bound to its option as in the other loader call in this code.
        '--dbname', database['NAME'],
        '--dbuser', database['USER'],
        '--dbpass', database['PASSWORD'],
        '--random_tmp_dir',
    ]
    runProgram('gmod_bulk_load_gff3.pl', args)

    # nextId only exists when the organism was just created; the model
    # instance carries the id in both cases.
    return organism.organism_id
def runFormatDB(fastaName, loc, protein=False):
    """Build a ``formatdb`` database for a FASTA file in its own directory.

    ``loc/fastaName`` is temporarily moved into a sub-directory of ``loc``
    (``proteinDB`` when ``protein`` is true, ``nucleotideDB`` otherwise),
    ``formatdb`` is run on the moved file, and the file is moved back into
    ``loc`` afterwards.

    Args:
        fastaName: File name of the FASTA file, relative to ``loc``.
        loc: Directory that contains the FASTA file.
        protein: When true ``formatdb`` is called with ``-p T``, otherwise
            with ``-p F``.
    """
    option = 'T' if protein else 'F'
    dbDir = 'proteinDB' if protein else 'nucleotideDB'

    # must move the file into the new directory to correctly place the
    # formatdb information
    newLoc = os.path.join(loc, dbDir)
    originalFile = os.path.join(loc, fastaName)
    newFile = os.path.join(newLoc, fastaName)

    # A previous run may already have created the directory; os.mkdir
    # would raise in that case.
    if not os.path.isdir(newLoc):
        os.mkdir(newLoc)
    shutil.move(originalFile, newLoc)

    try:
        runProgram('formatdb', ['-p', option, '-i', newFile])
    finally:
        # Always restore the FASTA file to its original directory, even
        # when formatdb raised.
        shutil.move(newFile, loc)
Example #7
0
	def __processGffFilesNotNew(self, changed):
		"""Reprocess changed GFF files of organisms that already exist.

		Every path in ``changed`` is run through ``GFFRewriter``, loaded into
		the SQLite database next to it with ``bp_seqfeature_load.pl``, used to
		update the GBrowse entry (with the organism name read from the
		matching GenBank file), and bulk loaded into Chado with
		``gmod_bulk_load_gff3.pl``.

		Args:
			changed: Iterable of GFF file paths.
		"""
		dbSettings = settings.DATABASES['default']

		for gff in changed:
			loc = os.path.dirname(gff)
			gffStem = os.path.splitext(gff)[0]
			# <dir>/<gff base name>.db, in the same directory as the GFF
			dbName = os.path.join(loc, os.path.splitext(os.path.basename(gff))[0] + '.db')

			# NOTE(review): genbank_id is not defined in this method; it
			# must come from an enclosing scope -- confirm it exists.
			gffRewriter = GFFRewriter(filename=gff, outfile=gff + ".sorted.prepared", accession=genbank_id)

			gffRewriter.addUnknownCvTerms({
				'user': dbSettings['USER'],
				'password': dbSettings['PASSWORD'],
				'db': dbSettings['NAME'],
			})

			gffRewriter.addColor({
				'user': dbSettings['USER'],
				'password': dbSettings['PASSWORD'],
				'db': 'MyGO',
			})

			# NOTE(review): the rewriter's error value is fetched but never
			# inspected or reported -- confirm whether it should be.
			error = gffRewriter.getError()

			# run the sqlite database loader to be able to add it to GBrowse
			# since the name should be preserved, no changes need to be made
			# to the GBrowse configuration file
			# NOTE(review): this loads the original gff, not the
			# ".sorted.prepared" output written above -- confirm intended.
			args = ['-a', 'DBI::SQLite', '-c', '-f', '-d', dbName, gff]
			runProgram('bp_seqfeature_load.pl', args)

			# The GenBank file shares the GFF's path stem.  Append the
			# extension instead of os.path.join(), which would produce
			# "<stem>/.gbk".
			gbk = gffStem + '.gbk'
			parser = GenBank.RecordParser()
			with open(gbk) as gbkHandle:
				record = parser.parse(gbkHandle)
			organismName = record.organism
			organismDir = os.path.basename(loc)

			GenomeDBUtil.editGBrowseEntry(gff, dbName, organismDir, organismName)

			# now edit the record in Chado
			args = [
				'--organism', organismName,
				'--gfffile', gff,
				'--dbname', dbSettings['NAME'],
				'--dbuser', dbSettings['USER'],
				'--dbpass', dbSettings['PASSWORD'],
				'--random_tmp_dir',
			]
			runProgram('gmod_bulk_load_gff3.pl', args)
Example #8
0
	def __processGffFilesNew(self, newOrganismDirs):
		"""Set up BLAST, GBrowse and Chado data for newly added organisms.

		Each entry of ``newOrganismDirs`` is joined onto
		``NEW_GENOMIC_DATA_DIR`` and the files directly inside that directory
		are searched by extension:

		* ``.faa`` / ``.ffn`` -- ``GenomeDBUtil.runFormatDB`` is run on them
		  (as protein and nucleotide input respectively).
		* ``.gff`` together with ``.gbk`` -- the organism name and accession
		  are read from the GenBank record, the GFF is rewritten by
		  ``GFFRewriter``, the rewritten file is loaded into a SQLite database
		  with ``bp_seqfeature_load.pl``, the organism is added to Chado and a
		  new GBrowse entry is created.

		When several files share an extension, the last one seen before all
		four kinds have been found is used.

		Args:
			newOrganismDirs: Iterable of directory names relative to
				``NEW_GENOMIC_DATA_DIR``.
		"""
		dbSettings = settings.DATABASES['default']

		for newOrganism in newOrganismDirs:
			# start by creating the BLAST database
			newOrganism = os.path.join(NEW_GENOMIC_DATA_DIR, newOrganism)
			print(newOrganism)
			# The first tuple yielded by os.walk describes the top directory;
			# index 2 holds the names of the files directly inside it.
			organismFiles = next(os.walk(newOrganism))[2]
			faa = None
			ffn = None
			gff = None
			gbk = None
			for organismFile in organismFiles:
				extension = os.path.splitext(organismFile)[1]
				if extension == '.ffn':
					ffn = organismFile
				elif extension == '.faa':
					faa = organismFile
				elif extension == '.gff':
					gff = organismFile
				elif extension == '.gbk':
					gbk = organismFile
				if faa and ffn and gff and gbk:
					break

			if faa:
				GenomeDBUtil.runFormatDB(os.path.basename(faa), newOrganism, protein=True)
				self.report.addLogEntry('Ran formatdb successully on ' + faa)
			if ffn:
				GenomeDBUtil.runFormatDB(os.path.basename(ffn), newOrganism, protein=False)
				self.report.addLogEntry('Ran formatdb successully on ' + ffn)

			# process the gff and genbank files for creating the databases
			if not (gff and gbk):
				continue

			# create the sqlite database for GBrowse and create the configuration file
			# for GBrowse hook up
			dbName = os.path.splitext(os.path.basename(gff))[0] + '.db'
			dbName = os.path.join(newOrganism, dbName)
			gff = os.path.join(newOrganism, gff)
			gbk = os.path.join(newOrganism, gbk)

			# Close the GenBank file as soon as the record has been parsed.
			parser = GenBank.RecordParser()
			with open(gbk) as gbkHandle:
				record = parser.parse(gbkHandle)
			organismName = record.organism
			accession = record.accession[0]
			self.report.addLogEntry('Found organism name ' + organismName)

			# create a brand new GBrowse configuration file
			# The landmark is the first element of the first 'gff_id' key
			# reported by the examiner.
			examiner = GFFExaminer()
			with open(gff) as gffHandle:
				gffIds = examiner.available_limits(gffHandle)['gff_id']
			landmark = list(gffIds.keys())[0][0]

			gffRewriter = GFFRewriter(filename=gff, outfile=gff + ".sorted.prepared", accession=accession)

			# NOTE: gffRewriter.addUnknownCvTerms(...) is deliberately not
			# called here; the call was already disabled in this method.

			gffRewriter.addColor({
				'user': dbSettings['USER'],
				'password': dbSettings['PASSWORD'],
				'db': 'MyGO',
			})

			error = gffRewriter.getError()
			print(error)

			# From here on work with the rewritten file.
			gff = gff + ".sorted.prepared"

			args = ['-a', 'DBI::SQLite', '-c', '-f', '-d', dbName, gff]
			runProgram('bp_seqfeature_load.pl', args)
			self.report.addLogEntry('Successfully created sqlite database for ' + str(gff))

			organismDir = os.path.basename(newOrganism)

			# now edit the record in Chado by first adding the organism and then adding
			# bulk loading the information from gff3
			organismId = GenomeDBUtil.addOrganismToChado(gff, organismName)
			GenomeDBUtil.createNewGBrowseEntry(landmark, dbName, organismDir, organismName, organismId)
			# Log only once the entry has actually been created.
			self.report.addLogEntry('Added new GBrowse entry for ' + organismName)