Python FigureOutTaxID Examples

Programming Language: Python

Namespace/Package Name: pymodule.utils

Class/Type: FigureOutTaxID

Examples at hotexamples.com: 4

Python FigureOutTaxID - 4 examples found. These are the top-rated real-world Python examples of pymodule.utils.FigureOutTaxID, extracted from open-source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

FigureOutTaxID(1)

returnTaxIDGivenSentence(1)

Example #1

Show file

	def __init__(self, inputFnameLs=None, **keywords):
		"""
		Initialise the job: run the base-class constructor, create the taxonomy-ID resolver,
		optionally derive self.tax_id from self.organism, and prepare the header regexes and
		the run_type -> header-parser dispatch table.

		inputFnameLs: input file names, forwarded unchanged to AbstractDBInteractingJob.__init__().
		keywords: all remaining options, forwarded unchanged to AbstractDBInteractingJob.__init__().

		2008-07-27
			use option_default_dict
		2008-07-06
			use the firstline (header) of the fasta file to extract which chromosome. using filename is unreliable.
		"""
		# per the original note, self.connectDB() is called within the base constructor
		AbstractDBInteractingJob.__init__(self, inputFnameLs=inputFnameLs, **keywords)

		taxIDResolver = FigureOutTaxID(db_user=self.db_user, db_passwd=self.db_passwd,
									hostname=self.hostname, dbname=self.dbname)
		self.FigureOutTaxID_ins = taxIDResolver
		if self.organism is not None:
			# imported here because it is only needed when an organism was given
			from annot.bin.codense.common import org_short2long, org2tax_id
			if not org_short2long(self.organism):
				# org_short2long() returned a false value: let the resolver interpret the text instead
				self.tax_id = taxIDResolver.returnTaxIDGivenSentence(self.organism)
			else:
				self.tax_id = org2tax_id(org_short2long(self.organism))

		# word following "chromosome "; the trailing [,\n\r] is optional
		self.p_chromosome = re.compile(r'chromosome (\w+)[,\n\r]?')
		# "<accession>.<version>"
		self.p_acc_ver = re.compile(r'(\w+)\.(\d+)')

		# run_type -> bound method that parses one fasta header line
		parserByRunType = {}
		parserByRunType[1] = self.parseFastaDescriptionForGenBank
		parserByRunType[2] = self.parseFastaDescriptionForWUSTLVervetScaffolds
		parserByRunType[3] = self.parseFastaDescriptionForFullVervetBACs
		parserByRunType[4] = self.parseFastaDescriptionForWUSTLVervetChromosomeGenome
		self.parseFastaDescriptionDict = parserByRunType

Example #2

Show file

	def __init__(self, inputFnameLs=None, **keywords):
		"""
		Initialise the job: run the base-class constructor, create the taxonomy-ID resolver,
		optionally derive self.tax_id from self.organism, and prepare the header regexes and
		the run_type -> header-parser dispatch table.

		inputFnameLs: input file names, forwarded unchanged to AbstractDBInteractingJob.__init__().
		keywords: all remaining options, forwarded unchanged to AbstractDBInteractingJob.__init__().

		2008-07-27
			use option_default_dict
		2008-07-06
			use the firstline (header) of the fasta file to extract which chromosome. using filename is unreliable.
		"""
		# per the original note, self.connectDB() is called within the base constructor
		AbstractDBInteractingJob.__init__(self, inputFnameLs=inputFnameLs, **keywords)

		taxIDResolver = FigureOutTaxID(db_user=self.db_user, db_passwd=self.db_passwd,
									hostname=self.hostname, dbname=self.dbname)
		self.FigureOutTaxID_ins = taxIDResolver
		if self.organism is not None:
			# imported here because it is only needed when an organism was given
			from annot.bin.codense.common import org_short2long, org2tax_id
			if not org_short2long(self.organism):
				# org_short2long() returned a false value: let the resolver interpret the text instead
				self.tax_id = taxIDResolver.returnTaxIDGivenSentence(self.organism)
			else:
				self.tax_id = org2tax_id(org_short2long(self.organism))

		# word following "chromosome "; the trailing [,\n\r] is optional
		self.p_chromosome = re.compile(r'chromosome (\w+)[,\n\r]?')
		# "<accession>.<version>"
		self.p_acc_ver = re.compile(r'(\w+)\.(\d+)')

		# run_type -> bound method that parses one fasta header line
		parserByRunType = {}
		parserByRunType[1] = self.parseFastaDescriptionForGenBank
		parserByRunType[2] = self.parseFastaDescriptionForWUSTLVervetScaffolds
		parserByRunType[3] = self.parseFastaDescriptionForFullVervetBACs
		parserByRunType[4] = self.parseFastaDescriptionForWUSTLVervetChromosomeGenome
		self.parseFastaDescriptionDict = parserByRunType

Example #3

Show file

class chromosome_fasta2db(AbstractDBInteractingJob):
	# the class reuses the module docstring as its own documentation
	__doc__ = __doc__
	# start from a copy of the base-class options so that adding/removing keys here stays local
	option_default_dict = AbstractDBInteractingJob.option_default_dict.copy()
	# NOTE(review): keys look like (option name, required flag[, type]) and values like
	# [default, short flag, has-argument flag, help text] -- confirm against the option parser.
	option_default_dict.update({
							('organism', 0, ): [None, 'g', 1, '2-letter abbreviation for organism. Optional, if specified, only sequence from this organism would be extracted.'],\
							('sequence_type_id', 0, int):[9, '', 1, 'column SequenceType.id in database GenomeDB'],\
							('sequence_type_name', 0, ):[None, 's', 1, 'column SequenceType.short_name'],\
							('tax_id', 0, int):[60711, '', 1, 'taxonomy ID, if not given, query argument organism against tax db'],\
							('version', 0, int):[1, '', 1, 'which version'],\
							('run_type', 1, int):[1, 'y', 1, 'run type. 1: genBank fasta files. \n\
							2: scaffolds from WUSTL. \n\
							3: fully sequenced vervet BACs. \n\
							4: fully-assembled vervet ref genome from WUSTL. '],\
							('maxNoOfFastaRecords', 1, int):[500, 'x', 1, 'maximum number of fasta records to be inserted (in the input file order)'],\
							})
	# dict.copy() is shallow: the value lists are still shared with AbstractDBInteractingJob.
	# Assign a new list instead of writing into the shared one, so the base class (and any
	# sibling job class) keeps its own default schema.
	option_default_dict[('schema', 0, )] = ['genome'] + option_default_dict[('schema', 0, )][1:]
	# this job has no output file
	option_default_dict.pop((('outputFname', 0, )))
	
	def __init__(self, inputFnameLs=None, **keywords):
		"""
		Initialise the job: run the base-class constructor, create the taxonomy-ID resolver,
		optionally derive self.tax_id from self.organism, and prepare the header regexes and
		the run_type -> header-parser dispatch table.

		inputFnameLs: input file names, forwarded unchanged to AbstractDBInteractingJob.__init__().
		keywords: all remaining options, forwarded unchanged to AbstractDBInteractingJob.__init__().

		2008-07-27
			use option_default_dict
		2008-07-06
			use the firstline (header) of the fasta file to extract which chromosome. using filename is unreliable.
		"""
		# per the original note, self.connectDB() is called within the base constructor
		AbstractDBInteractingJob.__init__(self, inputFnameLs=inputFnameLs, **keywords)

		taxIDResolver = FigureOutTaxID(db_user=self.db_user, db_passwd=self.db_passwd,
									hostname=self.hostname, dbname=self.dbname)
		self.FigureOutTaxID_ins = taxIDResolver
		if self.organism is not None:
			# imported here because it is only needed when an organism was given
			from annot.bin.codense.common import org_short2long, org2tax_id
			if not org_short2long(self.organism):
				# org_short2long() returned a false value: let the resolver interpret the text instead
				self.tax_id = taxIDResolver.returnTaxIDGivenSentence(self.organism)
			else:
				self.tax_id = org2tax_id(org_short2long(self.organism))

		# word following "chromosome "; the trailing [,\n\r] is optional
		self.p_chromosome = re.compile(r'chromosome (\w+)[,\n\r]?')
		# "<accession>.<version>"
		self.p_acc_ver = re.compile(r'(\w+)\.(\d+)')

		# run_type -> bound method that parses one fasta header line
		parserByRunType = {}
		parserByRunType[1] = self.parseFastaDescriptionForGenBank
		parserByRunType[2] = self.parseFastaDescriptionForWUSTLVervetScaffolds
		parserByRunType[3] = self.parseFastaDescriptionForFullVervetBACs
		parserByRunType[4] = self.parseFastaDescriptionForWUSTLVervetChromosomeGenome
		self.parseFastaDescriptionDict = parserByRunType
	
	def saveRawSequence(self, session, seq_to_db, passingdata, aa_attr_instance):
		"""
		Store one sequence segment as a RawSequence object and advance the running coordinates.

		session: database session; the new object is added to it (and flushed for the first segment).
		seq_to_db: the sequence text of this segment.
		passingdata: running state. current_start and raw_sequence_initiated are read;
			current_stop, current_start and raw_sequence_initiated are updated.
		aa_attr_instance: object assigned to RawSequence.annot_assembly; on the first segment
			its raw_sequence_start_id is set to the id of the new RawSequence.

		2010-12-17
			RawSequence.annot_assembly is a foreign key element now.
		2008-07-29
			to store one sequence segment
		"""
		segmentStart = passingdata.current_start
		segmentLength = len(seq_to_db)
		# coordinates are inclusive on both ends
		passingdata.current_stop = segmentStart + segmentLength - 1

		segment = RawSequence(start=segmentStart, stop=passingdata.current_stop, sequence=seq_to_db)
		segment.annot_assembly = aa_attr_instance
		session.add(segment)

		if not passingdata.raw_sequence_initiated:
			# 2010-12-17 flush so that segment.id is available for the first segment
			session.flush()
			passingdata.raw_sequence_initiated = True
			aa_attr_instance.raw_sequence_start_id = segment.id

		# the next segment starts right after this one
		passingdata.current_start += segmentLength
	
	def parseFastaDescriptionForGenBank(self, descriptionLine=None, FigureOutTaxID_ins=None):
		"""
		Parse a GenBank-style fasta header line ('|'-separated) into a PassingData object.

		descriptionLine: the raw header line, starting with '>'. A trailing '\\n' or '\\r\\n'
			is removed; a line without any terminator is no longer truncated.
		FigureOutTaxID_ins: object whose returnTaxIDGivenSentence() is called with the
			free-text description (5th '|' field) to obtain the tax_id.

		Returns PassingData(tax_id, gi, comment, acc_ver, chromosome) where gi is the 2nd field
		as int, acc_ver the 4th field and comment the 5th field. chromosome is, in order of
		preference: the word after "chromosome ", 'mitochondrion', 'chloroplast', or everything
		before the first ',' of the description.

		Raises IndexError if the header has fewer than five '|' fields and ValueError if the
		gi field is not an integer.

		2011-7-6

		possible header lines:

		>gi|51511461|ref|NC_000001.8|NC_000001 Homo sapiens chromosome 1, complete sequence
		>gi|186497660|ref|NC_003070.6| Arabidopsis thaliana chromosome 1, complete sequence
		>gi|26556996|ref|NC_001284.2| Arabidopsis thaliana mitochondrion, complete genome
		>gi|115442598|ref|NC_008394.1| Oryza sativa (japonica cultivar-group) genomic DNA, chromosome 1
		"""
		# drop the leading '>' and only the line terminator (not an arbitrary last character)
		header = descriptionLine[1:].rstrip('\r\n').split('|')
		description = header[4]
		_tax_id = FigureOutTaxID_ins.returnTaxIDGivenSentence(description)

		chromosomeMatch = self.p_chromosome.search(description)	#search once, reuse the match
		if chromosomeMatch is not None:
			chromosome = chromosomeMatch.group(1)
		elif 'mitochondrion' in description:
			chromosome = 'mitochondrion'
		elif 'chloroplast' in description:
			chromosome = 'chloroplast'
		else:	#something else, take the whole before ','
			chromosome = description.split(',')[0]

		gi = int(header[1])
		acc_ver = header[3]
		return PassingData(tax_id=_tax_id, gi=gi, comment=description, acc_ver=acc_ver, chromosome=chromosome)

	def parseFastaDescriptionForWUSTLVervetScaffolds(self, descriptionLine=None, FigureOutTaxID_ins=None):
		"""
		Parse a whitespace-separated scaffold header line into a PassingData object.

		descriptionLine: the raw header line, starting with '>'. A trailing '\\n' or '\\r\\n'
			is removed; a line without any terminator is no longer truncated.
		FigureOutTaxID_ins: unused; accepted so that all header parsers share one signature.

		Returns PassingData with chromosome set to the first whitespace-separated field (the
		contig name is taken as chromosome) and tax_id, gi, comment, acc_ver all None.
		If the header is empty, chromosome is None instead of raising IndexError.

		2011-7-6

		possible header lines:
		>Contig0  12652774 13406928
		"""
		# drop the leading '>' and only the line terminator (not an arbitrary last character)
		fields = descriptionLine[1:].rstrip('\r\n').split()
		if fields:
			chromosome = fields[0]	#contig name is taken as chromosome
		else:
			chromosome = None
		return PassingData(tax_id=None, gi=None, comment=None, acc_ver=None, chromosome=chromosome)
	
	
	def parseFastaDescriptionForWUSTLVervetChromosomeGenome(self, descriptionLine=None, FigureOutTaxID_ins=None):
		"""
		Parse a header line of the form ">CAE<number or X/Y>" into a PassingData object.

		descriptionLine: the raw header line, starting with '>'. A trailing '\\n' or '\\r\\n'
			is removed; a line without any terminator is no longer truncated.
		FigureOutTaxID_ins: unused; accepted so that all header parsers share one signature.

		Returns PassingData with chromosome set to the 'CAE...' identifier found in the first
		whitespace-separated field, or None if the header is empty or has no such identifier.
		tax_id, gi, comment and acc_ver are all None.

		2013.05.09 include 'CAE' in the chromosome ID name.
		2013.04.12 header looks like, CAE stands for C. aethiops:

			>CAE1
			GTGAAAGAAGCCAAAAAG

		"""
		# drop the leading '>' and only the line terminator (not an arbitrary last character)
		fields = descriptionLine[1:].rstrip('\r\n').split()
		chromosome = None
		if fields:	#an empty header has no first field to inspect
			chromosomeMatch = re.compile(r'(CAE[\dXYxy]+)').search(fields[0])
			if chromosomeMatch is not None:
				chromosome = chromosomeMatch.group(1)
		return PassingData(tax_id=None, gi=None, comment=None, acc_ver=None, chromosome=chromosome)
	
	def parseFastaDescriptionForFullVervetBACs(self, descriptionLine=None, FigureOutTaxID_ins=None):
		"""
		Parse a GenBank-style BAC clone header line ('|'-separated) into a PassingData object.

		descriptionLine: the raw header line, starting with '>'. A trailing '\\n' or '\\r\\n'
			is removed; a line without any terminator is no longer truncated.
		FigureOutTaxID_ins: unused; accepted so that all header parsers share one signature.

		Returns PassingData(tax_id=None, gi, comment, acc_ver, chromosome) where gi is the 2nd
		field as int, acc_ver the 4th field and comment the 5th field. chromosome is the text
		between "clone " and the next ',' in the description, or None if there is none.
		NOTE(review): for "BAC clone X from chromosome N, ..." headers the captured text is
		"X from chromosome N", not just the clone name -- confirm that this is intended.

		Raises IndexError if the header has fewer than five '|' fields and ValueError if the
		gi field is not an integer.

		2011-7-6

		possible header lines:

		>gi|285026568|gb|AC239257.2| Chlorocebus aethiops chromosome UNK clone CH252-270J24, WORKING DRAFT SEQUENCE, 2 unordered pieces
		>gi|281332227|gb|AC238852.3| Chlorocebus aethiops BAC clone CH252-133A18 from chromosome 3, complete sequence
		>gi|285002488|gb|AC239185.3| Chlorocebus aethiops BAC clone CH252-404N12 from chromosome unknown, complete sequence
		"""
		# drop the leading '>' and only the line terminator (not an arbitrary last character)
		header = descriptionLine[1:].rstrip('\r\n').split('|')
		description = header[4]

		chromosome = None
		# tried in order: 1st type of clone description, then 2nd type
		for clonePattern in (r'UNK clone ([^,]+),', r'clone ([^,]+),'):
			cloneMatch = re.search(clonePattern, description)
			if cloneMatch is not None:
				chromosome = cloneMatch.group(1)
				break

		gi = int(header[1])
		acc_ver = header[3]
		return PassingData(tax_id=None, gi=gi, comment=description, acc_ver=acc_ver, chromosome=chromosome)
	
	def parse_chromosome_fasta_file(self, db=None, filename=None, tax_id=None, version=None, chunk_size=10000, \
								sequence_type_name=None, sequence_type_id=None, run_type=1,
								maxNoOfFastaRecords=500):
		"""
		Read one fasta file and store every fasta block in the database: one AnnotAssembly
		entry per block, with the block's sequence saved through saveRawSequence() in
		segments of at most chunk_size characters.
		
		db: database object; this method uses db.getSequenceType(), db.checkAnnotAssembly(),
			db.getAnnotAssembly() and db.session.
		filename: path of the fasta file; opened through utils.openGzipFile().
		tax_id: taxonomy ID used to look up / create the AnnotAssembly. If it is not None and
			the header parser reports a different, non-empty tax_id, that block is skipped.
		version: version used in the AnnotAssembly lookup, and assigned to a newly obtained
			AnnotAssembly when its acc_ver does not match "accession.version".
		chunk_size: maximum length of one stored sequence segment.
		sequence_type_name, sequence_type_id: passed to db.getSequenceType().
		run_type: key into self.parseFastaDescriptionDict, i.e. which header parser is used.
		maxNoOfFastaRecords: stop once this many blocks have been stored.
		
		Nothing is returned and nothing is committed here; objects are added to db.session
		and flushed.
		
		2011-7-10
			add argument maxNoOfFastaRecords: the max number of fasta records before quitting
		2011-7-6
			add argument run_type
				1: chromosome sequences from NCBI genbank
				2: vervet scaffolds from WUSTL
				3: full vervet BACs from McGill
		2010-12-15
			fix a bug that _tax_id shall be used in query AnnotAssembly.
			This bug caused the db redundancy check to fail.
		2010-12-15
			if entry already exists in AnnotAssembly, skip it.
		2008-07-29
			figure out tax_id via FigureOutTaxID
			filename could contain multiple fasta blocks
		2008-07-27
			change to use data structures from GenomeDB.py
		2008-07-06
			use the firstline (header) of the fasta file to extract which chromosome. using filename is unreliable.
		"""
		inf = utils.openGzipFile(filename, openMode='r')
		
		line = inf.readline()
		# 'line' alone cannot stop the 'while' loop: after "for line in inf:" exhausts the file,
		# 'line' still holds the last line. new_fasta_block says whether 'line' is fresh input.
		new_fasta_block = 1
		no_of_fasta_blocks = 0
		while line and new_fasta_block:
			new_fasta_block = 0	#set it to 0, assuming only one fasta block, change upon new fasta block
			if line[0]!='>':	#not fasta block header
				for line in inf:	#exhaust this fasta block as it's not what's wanted.
					if line[0]=='>':
						new_fasta_block = 1
						break	#start from while again
				# if no further header was found, new_fasta_block is still 0 and the while loop ends
				continue
			# 'line' is a header: dispatch to the parser selected by run_type
			headerData = self.parseFastaDescriptionDict[run_type](line, self.FigureOutTaxID_ins)
			if not headerData.chromosome:
				sys.stderr.write("Error chromosome for header %s is empty %s.\n"%(line, headerData.chromosome))
				# NOTE(review): this drops into the interactive debugger; once the debugger
				# continues, processing goes on with the empty chromosome.
				import pdb
				pdb.set_trace()
			if tax_id is not None and headerData.tax_id and tax_id!=headerData.tax_id:
				sys.stderr.write("tax_id (%s) not matching the one given (%s). Ignore.\n"%(headerData.tax_id, tax_id))
				# skip this block: read the next line and let the top of the loop consume
				# the rest of the block through the "not fasta block header" branch
				line = inf.readline()
				new_fasta_block = 1
				continue
			
			chromosome = headerData.chromosome
			sequence_type = db.getSequenceType(short_name=sequence_type_name, id=sequence_type_id)
			start = 1
			# NOTE(review): the lookup uses the tax_id argument, not headerData.tax_id --
			# compare with the 2010-12-15 entry in the docstring.
			aa_attr_instance = db.checkAnnotAssembly(version=version, tax_id=tax_id, \
								chromosome=chromosome, start=start, stop=None, \
								sequence_type_id=sequence_type.id)
			if aa_attr_instance and aa_attr_instance.raw_sequence_start_id is not None:
				# raw sequences have already been associated with this AnnotAssembly: skip the block
				sys.stderr.write("raw sequences have been associated with this AnnotAssembly (tax_id %s, chr=%s, start=%s). Ignore.\n"%\
								(tax_id, chromosome, start))
				line = inf.readline()
				new_fasta_block = 1
				continue
			# an existing entry without raw sequences is reused as is; otherwise obtain one below
			if aa_attr_instance is None:
				# version is passed as None here and assigned right after, from acc_ver or the argument
				aa_attr_instance = db.getAnnotAssembly(gi=headerData.gi, acc_ver=headerData.acc_ver, accession =None, \
						version =None, tax_id=tax_id, chromosome =chromosome, \
						start =start, stop =None, orientation=None, sequence = None,\
						raw_sequence_start_id=None, original_path=os.path.abspath(filename),\
						sequence_type_id=sequence_type.id, \
						chromosome_type_id=None, chromosome_type_name=None, comment=headerData.comment)
				if aa_attr_instance.acc_ver and self.p_acc_ver.search(aa_attr_instance.acc_ver):
					# split "accession.version" and store the version as int
					aa_attr_instance.accession, aa_attr_instance.version = self.p_acc_ver.search(aa_attr_instance.acc_ver).groups()
					aa_attr_instance.version = int(aa_attr_instance.version)
				else:
					aa_attr_instance.accession = None
					aa_attr_instance.version = version
				if self.debug:
					sys.stderr.write("tax_id=%s for %s.\n"%(aa_attr_instance.tax_id, line))
				#aa_attr_instance.raw_sequence_start_id = self.get_current_max_raw_sequence_id(curs, raw_sequence_table)+1
			# running coordinates for this block, updated by saveRawSequence()
			passingdata = PassingData()
			passingdata.current_start = 1
			passingdata.raw_sequence_initiated = False
			seq = ''	#sequence read so far but not yet saved
			for line in inf:
				if line[0]=='>':
					if seq:	#last segment from the previous fasta block
						self.saveRawSequence(db.session, seq, passingdata, aa_attr_instance)
						seq = ''	#set to nothing to avoid saving one more RawSequence
					new_fasta_block = 1
					break	#start from while again
				
				seq += line.strip()
				if len(seq)>=chunk_size:
					# save exactly chunk_size characters and keep the remainder for the next segment
					seq_to_db = seq[:chunk_size]
					self.saveRawSequence(db.session, seq_to_db, passingdata, aa_attr_instance)
					seq = seq[chunk_size:]	#remove the one already in db
					if self.report:
						# progress: 20 backspace characters, then block count and a segment counter
						sys.stderr.write("%s\t%s\t%s"%('\x08'*20, no_of_fasta_blocks, passingdata.current_start/chunk_size+1))
			if seq:	# last segment from last line
				self.saveRawSequence(db.session, seq, passingdata, aa_attr_instance)
			# NOTE(review): current_stop is only assigned inside saveRawSequence(); for a block
			# without any sequence line this read depends on how PassingData treats a missing
			# attribute -- confirm.
			aa_attr_instance.stop = passingdata.current_stop
			db.session.add(aa_attr_instance)
			db.session.flush()
			no_of_fasta_blocks += 1
			if no_of_fasta_blocks>=maxNoOfFastaRecords:
				break
		sys.stderr.write("  Number of fasta blocks/chromosomes: %s.\n"%(no_of_fasta_blocks))
		# NOTE(review): this only drops the reference; close() is never called explicitly.
		del inf
	
	def connectDB(self):
		"""
		Create a GenomeDatabase from this job's connection options, run its setup() without
		creating tables, and keep it as self.db.

		2013.3.14
		"""
		connectionOptions = dict(drivername=self.drivername, username=self.db_user,
								password=self.db_passwd, hostname=self.hostname,
								database=self.dbname, schema=self.schema)
		genomeDB = GenomeDatabase(**connectionOptions)
		genomeDB.setup(create_tables=False)
		# assigned only after setup() succeeded
		self.db = genomeDB
		
	def run(self):
		"""
		Process every file in self.inputFnameLs inside one database session, then commit if
		self.commit is true and roll back otherwise.

		Progress ("<position>/<total>:\\t<filename> ") is written to stderr. The position comes
		from enumerate(), so it is correct even when the same file name is listed more than
		once, and no list search is done per file.

		2008-07-27

			--GenomeDatabase
			--parse_chromosome_fasta_file()
		"""
		if self.debug:
			# debug mode deliberately starts in the interactive debugger
			import pdb
			pdb.set_trace()

		noOfFiles = len(self.inputFnameLs)
		sys.stderr.write("\tTotally, %d files to be processed.\n"%(noOfFiles))

		session = self.db.session
		session.begin()
		for fileIndex, filename in enumerate(self.inputFnameLs):
			sys.stderr.write("%d/%d:\t%s "%(fileIndex+1, noOfFiles, filename))
			self.parse_chromosome_fasta_file(db=self.db, filename=filename, tax_id=self.tax_id,
									version=self.version, chunk_size=10000,
									sequence_type_name=self.sequence_type_name,
									sequence_type_id=self.sequence_type_id,
									run_type=self.run_type,
									maxNoOfFastaRecords=self.maxNoOfFastaRecords)
		if self.commit:
			session.commit()
		else:
			session.rollback()

Example #4

Show file

class chromosome_fasta2db(AbstractDBInteractingJob):
	# the class reuses the module docstring as its own documentation
	__doc__ = __doc__
	# start from a copy of the base-class options so that adding/removing keys here stays local
	option_default_dict = AbstractDBInteractingJob.option_default_dict.copy()
	# NOTE(review): keys look like (option name, required flag[, type]) and values like
	# [default, short flag, has-argument flag, help text] -- confirm against the option parser.
	option_default_dict.update({
							('organism', 0, ): [None, 'g', 1, '2-letter abbreviation for organism. Optional, if specified, only sequence from this organism would be extracted.'],\
							('sequence_type_id', 0, int):[9, '', 1, 'column SequenceType.id in database GenomeDB'],\
							('sequence_type_name', 0, ):[None, 's', 1, 'column SequenceType.short_name'],\
							('tax_id', 0, int):[60711, '', 1, 'taxonomy ID, if not given, query argument organism against tax db'],\
							('version', 0, int):[1, '', 1, 'which version'],\
							('run_type', 1, int):[1, 'y', 1, 'run type. 1: genBank fasta files. \n\
							2: scaffolds from WUSTL. \n\
							3: fully sequenced vervet BACs. \n\
							4: fully-assembled vervet ref genome from WUSTL. '],\
							('maxNoOfFastaRecords', 1, int):[500, 'x', 1, 'maximum number of fasta records to be inserted (in the input file order)'],\
							})
	# dict.copy() is shallow: the value lists are still shared with AbstractDBInteractingJob.
	# Assign a new list instead of writing into the shared one, so the base class (and any
	# sibling job class) keeps its own default schema.
	option_default_dict[('schema', 0, )] = ['genome'] + option_default_dict[('schema', 0, )][1:]
	# this job has no output file
	option_default_dict.pop((('outputFname', 0, )))
	
	def __init__(self, inputFnameLs=None, **keywords):
		"""
		Initialise the job: run the base-class constructor, create the taxonomy-ID resolver,
		optionally derive self.tax_id from self.organism, and prepare the header regexes and
		the run_type -> header-parser dispatch table.

		inputFnameLs: input file names, forwarded unchanged to AbstractDBInteractingJob.__init__().
		keywords: all remaining options, forwarded unchanged to AbstractDBInteractingJob.__init__().

		2008-07-27
			use option_default_dict
		2008-07-06
			use the firstline (header) of the fasta file to extract which chromosome. using filename is unreliable.
		"""
		# per the original note, self.connectDB() is called within the base constructor
		AbstractDBInteractingJob.__init__(self, inputFnameLs=inputFnameLs, **keywords)

		taxIDResolver = FigureOutTaxID(db_user=self.db_user, db_passwd=self.db_passwd,
									hostname=self.hostname, dbname=self.dbname)
		self.FigureOutTaxID_ins = taxIDResolver
		if self.organism is not None:
			# imported here because it is only needed when an organism was given
			from annot.bin.codense.common import org_short2long, org2tax_id
			if not org_short2long(self.organism):
				# org_short2long() returned a false value: let the resolver interpret the text instead
				self.tax_id = taxIDResolver.returnTaxIDGivenSentence(self.organism)
			else:
				self.tax_id = org2tax_id(org_short2long(self.organism))

		# word following "chromosome "; the trailing [,\n\r] is optional
		self.p_chromosome = re.compile(r'chromosome (\w+)[,\n\r]?')
		# "<accession>.<version>"
		self.p_acc_ver = re.compile(r'(\w+)\.(\d+)')

		# run_type -> bound method that parses one fasta header line
		parserByRunType = {}
		parserByRunType[1] = self.parseFastaDescriptionForGenBank
		parserByRunType[2] = self.parseFastaDescriptionForWUSTLVervetScaffolds
		parserByRunType[3] = self.parseFastaDescriptionForFullVervetBACs
		parserByRunType[4] = self.parseFastaDescriptionForWUSTLVervetChromosomeGenome
		self.parseFastaDescriptionDict = parserByRunType
	
	def saveRawSequence(self, session, seq_to_db, passingdata, aa_attr_instance):
		"""
		Store one sequence segment as a RawSequence object and advance the running coordinates.

		session: database session; the new object is added to it (and flushed for the first segment).
		seq_to_db: the sequence text of this segment.
		passingdata: running state. current_start and raw_sequence_initiated are read;
			current_stop, current_start and raw_sequence_initiated are updated.
		aa_attr_instance: object assigned to RawSequence.annot_assembly; on the first segment
			its raw_sequence_start_id is set to the id of the new RawSequence.

		2010-12-17
			RawSequence.annot_assembly is a foreign key element now.
		2008-07-29
			to store one sequence segment
		"""
		segmentStart = passingdata.current_start
		segmentLength = len(seq_to_db)
		# coordinates are inclusive on both ends
		passingdata.current_stop = segmentStart + segmentLength - 1

		segment = RawSequence(start=segmentStart, stop=passingdata.current_stop, sequence=seq_to_db)
		segment.annot_assembly = aa_attr_instance
		session.add(segment)

		if not passingdata.raw_sequence_initiated:
			# 2010-12-17 flush so that segment.id is available for the first segment
			session.flush()
			passingdata.raw_sequence_initiated = True
			aa_attr_instance.raw_sequence_start_id = segment.id

		# the next segment starts right after this one
		passingdata.current_start += segmentLength
	
	def parseFastaDescriptionForGenBank(self, descriptionLine=None, FigureOutTaxID_ins=None):
		"""
		Parse a GenBank-style fasta header line ('|'-separated) into a PassingData object.

		descriptionLine: the raw header line, starting with '>'. A trailing '\\n' or '\\r\\n'
			is removed; a line without any terminator is no longer truncated.
		FigureOutTaxID_ins: object whose returnTaxIDGivenSentence() is called with the
			free-text description (5th '|' field) to obtain the tax_id.

		Returns PassingData(tax_id, gi, comment, acc_ver, chromosome) where gi is the 2nd field
		as int, acc_ver the 4th field and comment the 5th field. chromosome is, in order of
		preference: the word after "chromosome ", 'mitochondrion', 'chloroplast', or everything
		before the first ',' of the description.

		Raises IndexError if the header has fewer than five '|' fields and ValueError if the
		gi field is not an integer.

		2011-7-6

		possible header lines:

		>gi|51511461|ref|NC_000001.8|NC_000001 Homo sapiens chromosome 1, complete sequence
		>gi|186497660|ref|NC_003070.6| Arabidopsis thaliana chromosome 1, complete sequence
		>gi|26556996|ref|NC_001284.2| Arabidopsis thaliana mitochondrion, complete genome
		>gi|115442598|ref|NC_008394.1| Oryza sativa (japonica cultivar-group) genomic DNA, chromosome 1
		"""
		# drop the leading '>' and only the line terminator (not an arbitrary last character)
		header = descriptionLine[1:].rstrip('\r\n').split('|')
		description = header[4]
		_tax_id = FigureOutTaxID_ins.returnTaxIDGivenSentence(description)

		chromosomeMatch = self.p_chromosome.search(description)	#search once, reuse the match
		if chromosomeMatch is not None:
			chromosome = chromosomeMatch.group(1)
		elif 'mitochondrion' in description:
			chromosome = 'mitochondrion'
		elif 'chloroplast' in description:
			chromosome = 'chloroplast'
		else:	#something else, take the whole before ','
			chromosome = description.split(',')[0]

		gi = int(header[1])
		acc_ver = header[3]
		return PassingData(tax_id=_tax_id, gi=gi, comment=description, acc_ver=acc_ver, chromosome=chromosome)

	def parseFastaDescriptionForWUSTLVervetScaffolds(self, descriptionLine=None, FigureOutTaxID_ins=None):
		"""
		Parse a whitespace-separated scaffold header line into a PassingData object.

		descriptionLine: the raw header line, starting with '>'. A trailing '\\n' or '\\r\\n'
			is removed; a line without any terminator is no longer truncated.
		FigureOutTaxID_ins: unused; accepted so that all header parsers share one signature.

		Returns PassingData with chromosome set to the first whitespace-separated field (the
		contig name is taken as chromosome) and tax_id, gi, comment, acc_ver all None.
		If the header is empty, chromosome is None instead of raising IndexError.

		2011-7-6

		possible header lines:
		>Contig0  12652774 13406928
		"""
		# drop the leading '>' and only the line terminator (not an arbitrary last character)
		fields = descriptionLine[1:].rstrip('\r\n').split()
		if fields:
			chromosome = fields[0]	#contig name is taken as chromosome
		else:
			chromosome = None
		return PassingData(tax_id=None, gi=None, comment=None, acc_ver=None, chromosome=chromosome)
	
	
	def parseFastaDescriptionForWUSTLVervetChromosomeGenome(self, descriptionLine=None, FigureOutTaxID_ins=None):
		"""
		Parse a header line of the form ">CAE<number or X/Y>" into a PassingData object.

		descriptionLine: the raw header line, starting with '>'. A trailing '\\n' or '\\r\\n'
			is removed; a line without any terminator is no longer truncated.
		FigureOutTaxID_ins: unused; accepted so that all header parsers share one signature.

		Returns PassingData with chromosome set to the 'CAE...' identifier found in the first
		whitespace-separated field, or None if the header is empty or has no such identifier.
		tax_id, gi, comment and acc_ver are all None.

		2013.05.09 include 'CAE' in the chromosome ID name.
		2013.04.12 header looks like, CAE stands for C. aethiops:

			>CAE1
			GTGAAAGAAGCCAAAAAG

		"""
		# drop the leading '>' and only the line terminator (not an arbitrary last character)
		fields = descriptionLine[1:].rstrip('\r\n').split()
		chromosome = None
		if fields:	#an empty header has no first field to inspect
			chromosomeMatch = re.compile(r'(CAE[\dXYxy]+)').search(fields[0])
			if chromosomeMatch is not None:
				chromosome = chromosomeMatch.group(1)
		return PassingData(tax_id=None, gi=None, comment=None, acc_ver=None, chromosome=chromosome)
	
	def parseFastaDescriptionForFullVervetBACs(self, descriptionLine=None, FigureOutTaxID_ins=None):
		"""
		Parse a GenBank-style BAC clone header line ('|'-separated) into a PassingData object.

		descriptionLine: the raw header line, starting with '>'. A trailing '\\n' or '\\r\\n'
			is removed; a line without any terminator is no longer truncated.
		FigureOutTaxID_ins: unused; accepted so that all header parsers share one signature.

		Returns PassingData(tax_id=None, gi, comment, acc_ver, chromosome) where gi is the 2nd
		field as int, acc_ver the 4th field and comment the 5th field. chromosome is the text
		between "clone " and the next ',' in the description, or None if there is none.
		NOTE(review): for "BAC clone X from chromosome N, ..." headers the captured text is
		"X from chromosome N", not just the clone name -- confirm that this is intended.

		Raises IndexError if the header has fewer than five '|' fields and ValueError if the
		gi field is not an integer.

		2011-7-6

		possible header lines:

		>gi|285026568|gb|AC239257.2| Chlorocebus aethiops chromosome UNK clone CH252-270J24, WORKING DRAFT SEQUENCE, 2 unordered pieces
		>gi|281332227|gb|AC238852.3| Chlorocebus aethiops BAC clone CH252-133A18 from chromosome 3, complete sequence
		>gi|285002488|gb|AC239185.3| Chlorocebus aethiops BAC clone CH252-404N12 from chromosome unknown, complete sequence
		"""
		# drop the leading '>' and only the line terminator (not an arbitrary last character)
		header = descriptionLine[1:].rstrip('\r\n').split('|')
		description = header[4]

		chromosome = None
		# tried in order: 1st type of clone description, then 2nd type
		for clonePattern in (r'UNK clone ([^,]+),', r'clone ([^,]+),'):
			cloneMatch = re.search(clonePattern, description)
			if cloneMatch is not None:
				chromosome = cloneMatch.group(1)
				break

		gi = int(header[1])
		acc_ver = header[3]
		return PassingData(tax_id=None, gi=gi, comment=description, acc_ver=acc_ver, chromosome=chromosome)
	
	def parse_chromosome_fasta_file(self, db=None, filename=None, tax_id=None, version=None, chunk_size=10000, \
								sequence_type_name=None, sequence_type_id=None, run_type=1,
								maxNoOfFastaRecords=500):
		"""
		Read one fasta file and store every fasta block in the database: one AnnotAssembly
		entry per block, with the block's sequence saved through saveRawSequence() in
		segments of at most chunk_size characters.
		
		db: database object; this method uses db.getSequenceType(), db.checkAnnotAssembly(),
			db.getAnnotAssembly() and db.session.
		filename: path of the fasta file; opened through utils.openGzipFile().
		tax_id: taxonomy ID used to look up / create the AnnotAssembly. If it is not None and
			the header parser reports a different, non-empty tax_id, that block is skipped.
		version: version used in the AnnotAssembly lookup and creation, and assigned to a
			newly obtained AnnotAssembly when its acc_ver does not match "accession.version".
		chunk_size: maximum length of one stored sequence segment.
		sequence_type_name, sequence_type_id: passed to db.getSequenceType().
		run_type: key into self.parseFastaDescriptionDict, i.e. which header parser is used.
		maxNoOfFastaRecords: stop once this many blocks have been stored.
		
		Nothing is returned and nothing is committed here; objects are added to db.session
		and flushed.
		
		2011-7-10
			add argument maxNoOfFastaRecords: the max number of fasta records before quitting
		2011-7-6
			add argument run_type
				1: chromosome sequences from NCBI genbank
				2: vervet scaffolds from WUSTL
				3: full vervet BACs from McGill
		2010-12-15
			fix a bug that _tax_id shall be used in query AnnotAssembly.
			This bug caused the db redundancy check to fail.
		2010-12-15
			if entry already exists in AnnotAssembly, skip it.
		2008-07-29
			figure out tax_id via FigureOutTaxID
			filename could contain multiple fasta blocks
		2008-07-27
			change to use data structures from GenomeDB.py
		2008-07-06
			use the firstline (header) of the fasta file to extract which chromosome. using filename is unreliable.
		"""
		inf = utils.openGzipFile(filename, openMode='r')
		
		line = inf.readline()
		# 'line' alone cannot stop the 'while' loop: after "for line in inf:" exhausts the file,
		# 'line' still holds the last line. new_fasta_block says whether 'line' is fresh input.
		new_fasta_block = 1
		no_of_fasta_blocks = 0
		while line and new_fasta_block:
			new_fasta_block = 0	#set it to 0, assuming only one fasta block, change upon new fasta block
			if line[0]!='>':	#not fasta block header
				for line in inf:	#exhaust this fasta block as it's not what's wanted.
					if line[0]=='>':
						new_fasta_block = 1
						break	#start from while again
				# if no further header was found, new_fasta_block is still 0 and the while loop ends
				continue
			# 'line' is a header: dispatch to the parser selected by run_type
			headerData = self.parseFastaDescriptionDict[run_type](line, self.FigureOutTaxID_ins)
			if not headerData.chromosome:
				sys.stderr.write("Error chromosome for header %s is empty %s.\n"%(line, headerData.chromosome))
				# NOTE(review): this drops into the interactive debugger; once the debugger
				# continues, processing goes on with the empty chromosome.
				import pdb
				pdb.set_trace()
			if tax_id is not None and headerData.tax_id and tax_id!=headerData.tax_id:
				sys.stderr.write("tax_id (%s) not matching the one given (%s). Ignore.\n"%(headerData.tax_id, tax_id))
				# skip this block: read the next line and let the top of the loop consume
				# the rest of the block through the "not fasta block header" branch
				line = inf.readline()
				new_fasta_block = 1
				continue
			
			chromosome = headerData.chromosome
			sequence_type = db.getSequenceType(short_name=sequence_type_name, id=sequence_type_id)
			start = 1
			# NOTE(review): the lookup uses the tax_id argument, not headerData.tax_id --
			# compare with the 2010-12-15 entry in the docstring.
			aa_attr_instance = db.checkAnnotAssembly(version=version, tax_id=tax_id, \
								chromosome=chromosome, start=start, stop=None, \
								sequence_type_id=sequence_type.id)
			if aa_attr_instance and aa_attr_instance.raw_sequence_start_id is not None:
				# raw sequences have already been associated with this AnnotAssembly: skip the block
				sys.stderr.write("raw sequences have been associated with this AnnotAssembly (tax_id %s, chr=%s, start=%s). Ignore.\n"%\
								(tax_id, chromosome, start))
				line = inf.readline()
				new_fasta_block = 1
				continue
			# an existing entry without raw sequences is reused as is; otherwise obtain one below
			if aa_attr_instance is None:
				aa_attr_instance = db.getAnnotAssembly(gi=headerData.gi, acc_ver=headerData.acc_ver, accession =None, \
						version =version, tax_id=tax_id, chromosome =chromosome, \
						start =start, stop =None, orientation=None, sequence = None,\
						raw_sequence_start_id=None, original_path=os.path.abspath(filename),\
						sequence_type_id=sequence_type.id, \
						chromosome_type_id=None, chromosome_type_name=None, comment=headerData.comment)
				if aa_attr_instance.acc_ver and self.p_acc_ver.search(aa_attr_instance.acc_ver):
					# split "accession.version" and store the version as int
					aa_attr_instance.accession, aa_attr_instance.version = self.p_acc_ver.search(aa_attr_instance.acc_ver).groups()
					aa_attr_instance.version = int(aa_attr_instance.version)
				else:
					aa_attr_instance.accession = None
					aa_attr_instance.version = version
				if self.debug:
					sys.stderr.write("tax_id=%s for %s.\n"%(aa_attr_instance.tax_id, line))
				#aa_attr_instance.raw_sequence_start_id = self.get_current_max_raw_sequence_id(curs, raw_sequence_table)+1
			# running coordinates for this block, updated by saveRawSequence()
			passingdata = PassingData()
			passingdata.current_start = 1
			passingdata.raw_sequence_initiated = False
			seq = ''	#sequence read so far but not yet saved
			for line in inf:
				if line[0]=='>':
					if seq:	#last segment from the previous fasta block
						self.saveRawSequence(db.session, seq, passingdata, aa_attr_instance)
						seq = ''	#set to nothing to avoid saving one more RawSequence
					new_fasta_block = 1
					break	#start from while again
				
				seq += line.strip()
				if len(seq)>=chunk_size:
					# save exactly chunk_size characters and keep the remainder for the next segment
					seq_to_db = seq[:chunk_size]
					self.saveRawSequence(db.session, seq_to_db, passingdata, aa_attr_instance)
					seq = seq[chunk_size:]	#remove the one already in db
					if self.report:
						# progress: 20 backspace characters, then block count and a segment counter
						sys.stderr.write("%s\t%s\t%s"%('\x08'*20, no_of_fasta_blocks, passingdata.current_start/chunk_size+1))
			if seq:	# last segment from last line
				self.saveRawSequence(db.session, seq, passingdata, aa_attr_instance)
			# NOTE(review): current_stop is only assigned inside saveRawSequence(); for a block
			# without any sequence line this read depends on how PassingData treats a missing
			# attribute -- confirm.
			aa_attr_instance.stop = passingdata.current_stop
			db.session.add(aa_attr_instance)
			db.session.flush()
			no_of_fasta_blocks += 1
			if no_of_fasta_blocks>=maxNoOfFastaRecords:
				break
		sys.stderr.write("  Number of fasta records/chromosomes: %s.\n"%(no_of_fasta_blocks))
		# NOTE(review): this only drops the reference; close() is never called explicitly.
		del inf
	
	def connectDB(self):
		"""
		Create a GenomeDatabase from this job's connection options, run its setup() without
		creating tables, and keep it as self.db.

		2013.3.14
		"""
		connectionOptions = dict(drivername=self.drivername, username=self.db_user,
								password=self.db_passwd, hostname=self.hostname,
								database=self.dbname, schema=self.schema)
		genomeDB = GenomeDatabase(**connectionOptions)
		genomeDB.setup(create_tables=False)
		# assigned only after setup() succeeded
		self.db = genomeDB
		
	def run(self):
		"""
		Process every file in self.inputFnameLs inside one database session, then commit if
		self.commit is true and roll back otherwise.

		Progress ("<position>/<total>:\\t<filename> ") is written to stderr. The position comes
		from enumerate(), so it is correct even when the same file name is listed more than
		once, and no list search is done per file.

		2008-07-27

			--GenomeDatabase
			--parse_chromosome_fasta_file()
		"""
		if self.debug:
			# debug mode deliberately starts in the interactive debugger
			import pdb
			pdb.set_trace()

		noOfFiles = len(self.inputFnameLs)
		sys.stderr.write("\tTotally, %d files to be processed.\n"%(noOfFiles))

		session = self.db.session
		session.begin()
		for fileIndex, filename in enumerate(self.inputFnameLs):
			sys.stderr.write("%d/%d:\t%s "%(fileIndex+1, noOfFiles, filename))
			self.parse_chromosome_fasta_file(db=self.db, filename=filename, tax_id=self.tax_id,
									version=self.version, chunk_size=10000,
									sequence_type_name=self.sequence_type_name,
									sequence_type_id=self.sequence_type_id,
									run_type=self.run_type,
									maxNoOfFastaRecords=self.maxNoOfFastaRecords)
		if self.commit:
			session.commit()
		else:
			session.rollback()