Example #1
0
    def __processGffFilesNotNew(self, changed):
        """Reprocess each GFF file listed in ``changed``.

        For every path the method:

        1. runs ``GFFRewriter`` on the file (output is written to
           ``<gff>.sorted.prepared``), adding unknown CV terms and colors;
        2. loads the GFF into a SQLite database named ``<basename>.db`` in
           the GFF file's directory via ``bp_seqfeature_load.pl``;
        3. reads the organism name from the GenBank file that shares the
           GFF file's base name and edits the GBrowse entry;
        4. bulk loads the GFF into Chado via ``gmod_bulk_load_gff3.pl``.

        Args:
            changed: iterable of GFF file paths.
        """
        # The connection settings do not vary per file, so look them up once.
        dbSettings = settings.DATABASES['default']

        for gff in changed:
            loc = os.path.dirname(gff)
            dbName = os.path.splitext(os.path.basename(gff))[0] + '.db'
            dbName = os.path.join(loc, dbName)

            # NOTE(review): genbank_id is not defined inside this method;
            # presumably it is a module-level name -- confirm.
            gffRewriter = GFFRewriter(filename=gff,
                                      outfile=gff + ".sorted.prepared",
                                      accession=genbank_id)

            gffRewriter.addUnknownCvTerms({
                'user': dbSettings['USER'],
                'password': dbSettings['PASSWORD'],
                'db': dbSettings['NAME']
            })

            gffRewriter.addColor({
                'user': dbSettings['USER'],
                'password': dbSettings['PASSWORD'],
                'db': 'go'
            })

            # NOTE(review): the rewriter's error state is fetched but not
            # acted upon here -- confirm whether it should be reported.
            error = gffRewriter.getError()

            # run the sqlite database loader to be able to add it to GBrowse
            # since the name should be preserved, no changes need to be made
            # to the GBrowse configuration file
            # NOTE(review): the original gff is loaded here, not the
            # ".sorted.prepared" file written by the rewriter -- confirm.
            args = ['-a', 'DBI::SQLite', '-c', '-f', '-d', dbName, gff]
            runProgram('bp_seqfeature_load.pl', args)

            # splitext() keeps the directory part, so appending the suffix
            # yields the sibling "<base>.gbk"; os.path.join() would instead
            # produce "<base>/.gbk".
            gbk = os.path.splitext(gff)[0] + '.gbk'
            parser = GenBank.RecordParser()
            # Close the GenBank file as soon as the record has been parsed.
            with open(gbk) as gbkHandle:
                record = parser.parse(gbkHandle)
            organismName = record.organism
            organismDir = os.path.basename(loc)

            GenomeDBUtil.editGBrowseEntry(gff, dbName, organismDir,
                                          organismName)

            # now edit the record in Chado
            # NOTE(review): the password is passed on the command line and
            # may be visible to other local users in the process list.
            args = [
                '--organism', organismName, "--gfffile", gff, "--dbname",
                dbSettings['NAME'], "--dbuser",
                dbSettings['USER'], "--dbpass",
                dbSettings['PASSWORD'], "--random_tmp_dir"
            ]
            runProgram('gmod_bulk_load_gff3.pl', args)
Example #2
0
    def __processFastaFilesNotNew(self, unchanged, changed):
        """Create or carry over BLAST databases for FASTA files.

        Files in ``changed`` always get a fresh database via
        ``GenomeDBUtil.runFormatDB``. For files in ``unchanged`` an existing
        ``nucleotideDB`` / ``proteinDB`` directory found under
        ``CUR_GENOMIC_DATA_DIR + <file dir>`` is copied to the matching
        location under ``NEW_GENOMIC_DATA_DIR``; if no such directory exists
        the database is built with ``runFormatDB`` instead.

        Only ``.ffn`` (nucleotide) and ``.faa`` (protein) files are handled;
        any other extension is ignored.

        Args:
            unchanged: iterable of FASTA file paths whose content is unchanged.
            changed: iterable of FASTA file paths whose content changed.
        """
        for fasta in changed:
            # os.path.splitext() returns the extension WITH its leading dot.
            extension = os.path.splitext(fasta)[1]
            fileDir = os.path.dirname(fasta)

            if extension == '.ffn':
                isProtein, blastKind = False, 'BLASTn'
            elif extension == '.faa':
                isProtein, blastKind = True, 'BLASTp'
            else:
                continue

            GenomeDBUtil.runFormatDB(os.path.basename(fasta),
                                     fileDir,
                                     NEW_GENOMIC_DATA_DIR,
                                     protein=isProtein)
            self.report.addLogEntry('Created ' + blastKind +
                                    ' database for ' + fasta +
                                    ' (replaced old file)...')

        for fasta in unchanged:
            fileDir = os.path.dirname(fasta)
            extension = os.path.splitext(fasta)[1]
            oldDir = CUR_GENOMIC_DATA_DIR + fileDir
            newDir = NEW_GENOMIC_DATA_DIR + fileDir

            # since these are unchanged files we wish to simply copy the old
            # database directories over to save cpu time; if they do not
            # exist yet we build them in the new directory
            if extension == '.ffn':
                nucleotideDB = os.path.join(oldDir, 'nucleotideDB')
                if os.path.isdir(nucleotideDB):
                    # copytree() needs a destination that does not exist yet,
                    # so copy into a sub-directory of the same name rather
                    # than onto the organism directory itself (which the
                    # protein branch would otherwise collide with).
                    shutil.copytree(nucleotideDB,
                                    os.path.join(newDir, 'nucleotideDB'))
                else:
                    # NOTE(review): this call passes two positional
                    # arguments while the other runFormatDB calls in this
                    # method pass three -- confirm against its signature.
                    GenomeDBUtil.runFormatDB(os.path.basename(fasta),
                                             newDir,
                                             protein=False)
            elif extension == '.faa':
                proteinDB = os.path.join(oldDir, 'proteinDB')
                if os.path.isdir(proteinDB):
                    shutil.copytree(proteinDB,
                                    os.path.join(newDir, 'proteinDB'))
                else:
                    GenomeDBUtil.runFormatDB(os.path.basename(fasta),
                                             fileDir,
                                             NEW_GENOMIC_DATA_DIR,
                                             protein=True)
Example #3
0
	def __processGffFilesNotNew(self, changed):
		"""Reprocess each GFF file listed in ``changed``.

		For every path the method:

		1. runs ``GFFRewriter`` on the file (output is written to
		   ``<gff>.sorted.prepared``), adding unknown CV terms and colors;
		2. loads the GFF into a SQLite database named ``<basename>.db`` in
		   the GFF file's directory via ``bp_seqfeature_load.pl``;
		3. reads the organism name from the GenBank file that shares the
		   GFF file's base name and edits the GBrowse entry;
		4. bulk loads the GFF into Chado via ``gmod_bulk_load_gff3.pl``.

		Args:
			changed: iterable of GFF file paths.
		"""
		# The connection settings do not vary per file, so look them up once.
		dbSettings = settings.DATABASES['default']

		for gff in changed:
			loc = os.path.dirname(gff)
			dbName = os.path.splitext(os.path.basename(gff))[0] + '.db'
			dbName = os.path.join(loc, dbName)

			# NOTE(review): genbank_id is not defined inside this method;
			# presumably it is a module-level name -- confirm.
			gffRewriter = GFFRewriter(filename=gff, outfile=gff + ".sorted.prepared", accession=genbank_id)

			gffRewriter.addUnknownCvTerms({
				'user': dbSettings['USER'],
				'password': dbSettings['PASSWORD'],
				'db': dbSettings['NAME']
			})

			gffRewriter.addColor({
				'user': dbSettings['USER'],
				'password': dbSettings['PASSWORD'],
				'db': 'MyGO'
			})

			# NOTE(review): the rewriter's error state is fetched but not
			# acted upon here -- confirm whether it should be reported.
			error = gffRewriter.getError()

			# run the sqlite database loader to be able to add it to GBrowse
			# since the name should be preserved, no changes need to be made
			# to the GBrowse configuration file
			# NOTE(review): the original gff is loaded here, not the
			# ".sorted.prepared" file written by the rewriter -- confirm.
			args = ['-a', 'DBI::SQLite', '-c', '-f', '-d', dbName, gff]
			runProgram('bp_seqfeature_load.pl', args)

			# splitext() keeps the directory part, so appending the suffix
			# yields the sibling "<base>.gbk"; os.path.join() would instead
			# produce "<base>/.gbk".
			gbk = os.path.splitext(gff)[0] + '.gbk'
			parser = GenBank.RecordParser()
			# Close the GenBank file as soon as the record has been parsed.
			with open(gbk) as gbkHandle:
				record = parser.parse(gbkHandle)
			organismName = record.organism
			organismDir = os.path.basename(loc)

			GenomeDBUtil.editGBrowseEntry(gff, dbName, organismDir, organismName)

			# now edit the record in Chado
			# NOTE(review): the password is passed on the command line and
			# may be visible to other local users in the process list.
			args = [
				'--organism', organismName,
				"--gfffile", gff,
				"--dbname", dbSettings['NAME'],
				"--dbuser", dbSettings['USER'],
				"--dbpass", dbSettings['PASSWORD'],
				"--random_tmp_dir"
			]
			runProgram('gmod_bulk_load_gff3.pl', args)
Example #4
0
	def __processFastaFilesNotNew(self, unchanged, changed):
		"""Create or carry over BLAST databases for FASTA files.

		Files in ``changed`` always get a fresh database via
		``GenomeDBUtil.runFormatDB``. For files in ``unchanged`` an existing
		``nucleotideDB`` / ``proteinDB`` directory found under
		``CUR_GENOMIC_DATA_DIR + <file dir>`` is copied to the matching
		location under ``NEW_GENOMIC_DATA_DIR``; if no such directory exists
		the database is built with ``runFormatDB`` instead.

		Only ``.ffn`` (nucleotide) and ``.faa`` (protein) files are handled;
		any other extension is ignored.

		Args:
			unchanged: iterable of FASTA file paths whose content is unchanged.
			changed: iterable of FASTA file paths whose content changed.
		"""
		for fasta in changed:
			# os.path.splitext() returns the extension WITH its leading dot.
			extension = os.path.splitext(fasta)[1]
			fileDir = os.path.dirname(fasta)

			if extension == '.ffn':
				isProtein, blastKind = False, 'BLASTn'
			elif extension == '.faa':
				isProtein, blastKind = True, 'BLASTp'
			else:
				continue

			GenomeDBUtil.runFormatDB(os.path.basename(fasta), fileDir, NEW_GENOMIC_DATA_DIR, protein=isProtein)
			self.report.addLogEntry('Created ' + blastKind + ' database for ' + fasta + ' (replaced old file)...')

		for fasta in unchanged:
			fileDir = os.path.dirname(fasta)
			extension = os.path.splitext(fasta)[1]
			oldDir = CUR_GENOMIC_DATA_DIR + fileDir
			newDir = NEW_GENOMIC_DATA_DIR + fileDir

			# since these are unchanged files we wish to simply copy the old
			# database directories over to save cpu time; if they do not
			# exist yet we build them in the new directory
			if extension == '.ffn':
				nucleotideDB = os.path.join(oldDir, 'nucleotideDB')
				if os.path.isdir(nucleotideDB):
					# copytree() needs a destination that does not exist yet,
					# so copy into a sub-directory of the same name rather
					# than onto the organism directory itself (which the
					# protein branch would otherwise collide with).
					shutil.copytree(nucleotideDB, os.path.join(newDir, 'nucleotideDB'))
				else:
					# NOTE(review): this call passes two positional
					# arguments while the other runFormatDB calls in this
					# method pass three -- confirm against its signature.
					GenomeDBUtil.runFormatDB(os.path.basename(fasta), newDir, protein=False)
			elif extension == '.faa':
				proteinDB = os.path.join(oldDir, 'proteinDB')
				if os.path.isdir(proteinDB):
					shutil.copytree(proteinDB, os.path.join(newDir, 'proteinDB'))
				else:
					GenomeDBUtil.runFormatDB(os.path.basename(fasta), fileDir, NEW_GENOMIC_DATA_DIR, protein=True)
Example #5
0
	def __processGffFilesNew(self, newOrganismDirs):
		"""Set up BLAST, GBrowse and Chado data for newly added organisms.

		Each entry of ``newOrganismDirs`` is joined onto
		``NEW_GENOMIC_DATA_DIR`` and its top-level files are scanned for one
		``.faa``, ``.ffn``, ``.gff`` and ``.gbk`` file. ``formatdb`` is run on
		the FASTA files that are present. When both a GFF and a GenBank file
		exist, the GFF is rewritten, loaded into a SQLite database with
		``bp_seqfeature_load.pl``, the organism is added to Chado and a new
		GBrowse entry is created.

		Args:
			newOrganismDirs: iterable of organism directory names, relative
				to ``NEW_GENOMIC_DATA_DIR``.
		"""
		# The connection settings do not vary per organism; look them up once.
		dbSettings = settings.DATABASES['default']

		for newOrganism in newOrganismDirs:
			# start by creating the BLAST database
			newOrganism = os.path.join(NEW_GENOMIC_DATA_DIR, newOrganism)
			print(newOrganism)

			# os.walk() yields nothing for a directory it cannot read, so
			# guard against that instead of leaking a bare StopIteration.
			topLevel = next(os.walk(newOrganism), None)
			if topLevel is None:
				self.report.addLogEntry('Skipping ' + newOrganism + ': directory could not be read')
				continue
			organismFiles = topLevel[2]

			faa = None
			ffn = None
			gff = None
			gbk = None
			for organismFile in organismFiles:
				extension = os.path.splitext(organismFile)[1]
				if extension == '.ffn':
					ffn = organismFile
				elif extension == '.faa':
					faa = organismFile
				elif extension == '.gff':
					gff = organismFile
				elif extension == '.gbk':
					gbk = organismFile
				# stop scanning once one file of every kind has been found
				if faa and ffn and gff and gbk:
					break

			if faa:
				GenomeDBUtil.runFormatDB(os.path.basename(faa), newOrganism, protein=True)
				self.report.addLogEntry('Ran formatdb successully on ' + faa)
			if ffn:
				GenomeDBUtil.runFormatDB(os.path.basename(ffn), newOrganism, protein=False)
				self.report.addLogEntry('Ran formatdb successully on ' + ffn)

			# the GBrowse / Chado steps need both the gff and the genbank file
			if not (gff and gbk):
				continue

			# create the sqlite database for GBrowse and create the
			# configuration file for GBrowse hook up
			dbName = os.path.splitext(os.path.basename(gff))[0] + '.db'
			dbName = os.path.join(newOrganism, dbName)
			gff = os.path.join(newOrganism, gff)
			gbk = os.path.join(newOrganism, gbk)

			# Close the GenBank file as soon as the record has been parsed.
			with open(gbk) as gbkHandle:
				record = GenBank.RecordParser().parse(gbkHandle)
			organismName = record.organism
			accession = record.accession[0]
			self.report.addLogEntry('Found organism name ' + organismName)

			# The landmark is the first element of the first 'gff_id' key
			# reported by the examiner.
			with open(gff) as gffHandle:
				limits = GFFExaminer().available_limits(gffHandle)
			landmark = list(limits['gff_id'].keys())[0][0]

			gffRewriter = GFFRewriter(filename=gff, outfile=gff + ".sorted.prepared", accession=accession)

			# NOTE: addUnknownCvTerms was disabled (commented out) in the
			# original code for this path, so it is not called here.

			gffRewriter.addColor({
				'user': dbSettings['USER'],
				'password': dbSettings['PASSWORD'],
				'db': 'MyGO'
			})

			error = gffRewriter.getError()
			print(error)

			# from here on work with the rewritten file
			gff = gff + ".sorted.prepared"

			args = ['-a', 'DBI::SQLite', '-c', '-f', '-d', dbName, gff]
			runProgram('bp_seqfeature_load.pl', args)
			self.report.addLogEntry('Successfully created sqlite database for ' + str(gff))

			organismDir = os.path.basename(newOrganism)

			# now edit the record in Chado by first adding the organism, then
			# create the GBrowse entry using the value it returned
			organismId = GenomeDBUtil.addOrganismToChado(gff, organismName)
			GenomeDBUtil.createNewGBrowseEntry(landmark, dbName, organismDir, organismName, organismId)
			# Log only after the entry has actually been created.
			self.report.addLogEntry('Added new GBrowse entry for ' + organismName)