예제 #1
0
    def run(self):
        """
        Gather the inputs needed by improveTAIRGeneGFF() and invoke it.

        A MySQL cursor is handed to get_gene_symbol2gene_id_set(), the gene annotation is obtained through
        a DrawSNPRegion instance, and both are passed on together with self.input_fname/self.output_fname.
        """
        if self.debug:
            import pdb

            pdb.set_trace()

        import MySQLdb

        # NOTE(review): the host is hard-coded here, while DrawSNPRegion below receives self.hostname.
        connection = MySQLdb.connect(
            db=self.dbname,
            host="banyan.usc.edu",
            user=self.db_user,
            passwd=self.db_passwd,
        )
        cursor = connection.cursor()

        from pymodule import get_gene_symbol2gene_id_set

        arabidopsis_tax_id = 3702  # 3702 is At's tax id
        symbol2id_set = get_gene_symbol2gene_id_set(
            cursor,
            arabidopsis_tax_id,
            table="genome.gene_symbol2id",
            upper_case_gene_symbol=1,
        )

        from variation.src.DrawSNPRegion import DrawSNPRegion

        # input_fname and output_dir are dummy values; the instance is only used for dealWithGeneAnnotation().
        drawer_options = dict(
            db_user=self.db_user,
            db_passwd=self.db_passwd,
            hostname=self.hostname,
            database=self.dbname,
            input_fname="/tmp/dumb",
            output_dir="/tmp",
            debug=0,
        )
        region_drawer = DrawSNPRegion(**drawer_options)
        annotation = region_drawer.dealWithGeneAnnotation(
            self.gene_annotation_picklef,
            cls_with_db_args=region_drawer,
        )

        self.improveTAIRGeneGFF(self.input_fname, symbol2id_set, annotation, self.output_fname)
예제 #2
0
	def run(self):
		"""
		Load the data structure through DrawSNPRegion, derive the SNP regions from self.input_fname
		and pass them to checkRegions() together with the per-method thresholds.
		"""
		if self.debug==1:
			import pdb
			pdb.set_trace()
		
		from DrawSNPRegion import DrawSNPRegion
		# input_fname and output_dir are dummy values required by the constructor
		drawer_options = dict(db_user=self.db_user, db_passwd=self.db_passwd, hostname=self.hostname,
			database=self.dbname, input_fname='/tmp/dumb', output_dir='/tmp', debug=0)
		drawer = DrawSNPRegion(**drawer_options)
		
		data_structure = drawer.loadDataStructure(self.gene_annotation_picklef, self.LD_info_picklef,
			self.LD_fname, min_MAF=self.min_MAF, min_distance=20000, list_type_id=None)
		snp_info = data_structure.snp_info
		region_ls = self.get_snp_region_ls(self.input_fname, snp_info, self.min_distance)
		
		# minimum threshold for different analysis methods, keyed by (analysis method id, 'value' or 'rank')
		thresholds = {}
		thresholds[(1, 'value')] = 8.
		thresholds[(7, 'value')] = 6.
		thresholds[(5, 'rank')] = self.min_margarita_rank
		thresholds[(6, 'rank')] = self.min_rf_rank
		
		self.checkRegions(drawer, snp_info, region_ls, self.output_fname, thresholds)
예제 #3
0
	def run(self):
		"""
		Set up the Stock_250kDB connection, collect SNP context, gene annotation and SNP info,
		then call _constructSNPAnnotation(). The session is flushed and committed only when
		self.commit is true.
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		
		connection_options = dict(drivername=self.drivername, username=self.db_user, password=self.db_passwd,
			hostname=self.hostname, database=self.dbname, schema=self.schema)
		db = Stock_250kDB.Stock_250kDB(**connection_options)
		db.setup(create_tables=False)
		session = db.session	# no explicit session.begin() here
		
		context_wrapper = GeneListRankTest.dealWithSnpsContextWrapper(self.snps_context_picklef,
			self.min_distance, self.get_closest)
		annotation = DrawSNPRegion.dealWithGeneAnnotation(self.gene_annotation_picklef)
		snp_info = DrawSNPRegion.getSNPInfo(db)
		short_name2id = self.getSNPAnnotationShortName2id()
		
		self._constructSNPAnnotation(session, snp_info, context_wrapper, annotation, short_name2id)
		
		if not self.commit:
			return
		session.flush()
		session.commit()
예제 #4
0
    def calculateOverlappingStatForOneCombo(self, db, phenotype_method_id, call_method_id, analysis_method_id_ls, \
             no_of_top_snps=1000, association_overlapping_type=None, commit=False, \
             results_directory=None):
        """
        Count the top-ranked SNPs shared by several analysis methods and save the count.

        For each id in analysis_method_id_ls the ResultsMethod row matching (phenotype_method_id,
        call_method_id, analysis_method_id) is fetched, and the snps_id values of its Results rows with
        rank <= no_of_top_snps are collected. The collected set is cached in self.results_id2snp_id_set
        under the ResultsMethod id. When fewer than no_of_top_snps rows exist,
        ResultsMethod2Results.rm2result() is called for the missing rank range before re-querying.
        The size of the intersection of all sets is stored in a new AssociationOverlappingStat row
        (session.save() + session.flush()).

        Arguments:
            db: database object providing .session; also forwarded to getSNPInfo() and rm2result().
            analysis_method_id_ls: analysis method ids to intersect; must not be empty.
            no_of_top_snps: rank cutoff, and the exact number of rows each result must provide.
            association_overlapping_type: assigned to the new row's overlapping_type.
            commit, results_directory: only forwarded to ResultsMethod2Results.rm2result().

        Returns None. On any problem (empty combo, missing ResultsMethod, wrong number of rows) an error
        is written to stderr and nothing is saved.

        2012.3.23
            pass argument db_250k to ResultsMethod2Results.rm2result()
        2009-11-2
        """
        sys.stderr.write("Calculating overlapping stat for phenotype %s and combo %s ...\n"%(phenotype_method_id, \
                            repr(analysis_method_id_ls),))
        if not analysis_method_id_ls:
            # nothing to intersect; previously this ended in an IndexError on snp_id_set_ls[0]
            sys.stderr.write("Error: empty analysis method combo for phenotype %s.\n" % phenotype_method_id)
            return
        session = db.session
        snp_id_set_ls = []
        for analysis_method_id in analysis_method_id_ls:
            rm = Stock_250kDB.ResultsMethod.query.filter_by(phenotype_method_id=phenotype_method_id).\
              filter_by(call_method_id=call_method_id).filter_by(analysis_method_id=analysis_method_id).first()
            if rm is None:
                # .first() yields None when no row matches; bail out instead of failing on rm.id
                sys.stderr.write(
                    "Error: no ResultsMethod for phenotype_method_id %s, call_method_id %s, analysis_method_id %s.\n"
                    % (phenotype_method_id, call_method_id, analysis_method_id))
                return
            if rm.id in self.results_id2snp_id_set:
                snp_id_set = self.results_id2snp_id_set.get(rm.id)
            else:
                association_entries = Stock_250kDB.Results.query.filter_by(results_id=rm.id).\
                  filter(Stock_250kDB.Results.rank<=no_of_top_snps)
                no_of_association_entries = association_entries.count()
                if no_of_association_entries < no_of_top_snps:
                    # not enough rows in the Results table yet: have rm2result() add the missing ranks
                    min_rank = no_of_association_entries + 1
                    max_rank = no_of_top_snps
                    if self.snp_info is None:
                        self.snp_info = DrawSNPRegion.getSNPInfo(db)
                    ResultsMethod2Results.rm2result(session, rm, self.snp_info, min_rank=min_rank, max_rank=max_rank, \
                           commit=commit, results_directory=results_directory, db_250k=db)
                    association_entries = Stock_250kDB.Results.query.filter_by(results_id=rm.id).\
                      filter(Stock_250kDB.Results.rank<=no_of_top_snps)
                    # recount only after a refill; the first count is still valid otherwise
                    no_of_association_entries = association_entries.count()
                if no_of_association_entries != no_of_top_snps:
                    sys.stderr.write(
                        "Error: The number of SNPs %s from Result %s (analysis_method_id %s) doesn't match the no_of_top_snps %s.\n"
                        % (no_of_association_entries, rm.id,
                           rm.analysis_method_id, no_of_top_snps))
                    return
                snp_id_set = set([entry.snps_id for entry in association_entries])
                self.results_id2snp_id_set[rm.id] = snp_id_set
            snp_id_set_ls.append(snp_id_set)

        # SNPs present in every method's top list
        overlapping_snp_id_set = set.intersection(*snp_id_set_ls)
        no_of_overlapping_snps = len(overlapping_snp_id_set)

        entry = Stock_250kDB.AssociationOverlappingStat(phenotype_method_id=phenotype_method_id, call_method_id=call_method_id, \
                no_of_top_snps=no_of_top_snps, no_of_overlapping_snps=no_of_overlapping_snps)
        entry.overlapping_type = association_overlapping_type
        session.save(entry)
        session.flush()
        sys.stderr.write("%s overlapping SNPs out of %s results. Done.\n" %
                         (no_of_overlapping_snps, len(snp_id_set_ls)))
예제 #5
0
	def run(self):
		"""
		Set up the Stock_250kDB connection, collect SNP context, gene annotation (for self.tax_id)
		and SNP info, then call _constructSNPAnnotation(). The session is flushed and committed
		only when self.commit is true.
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		
		connection_options = dict(drivername=self.drivername, username=self.db_user, password=self.db_passwd,
			hostname=self.hostname, database=self.dbname, schema=self.schema)
		db = Stock_250kDB.Stock_250kDB(**connection_options)
		db.setup(create_tables=False)
		session = db.session	# no explicit session.begin() here
		
		context_wrapper = GeneListRankTest.dealWithSnpsContextWrapper(self.snps_context_picklef,
			self.min_distance, self.get_closest)
		# this object itself is handed over as the carrier of the db arguments
		annotation = DrawSNPRegion.dealWithGeneAnnotation(self.gene_annotation_picklef,
			tax_id=self.tax_id, cls_with_db_args=self)
		snp_info = DrawSNPRegion.getSNPInfo(db)
		short_name2id = self.getSNPAnnotationShortName2id()
		
		self._constructSNPAnnotation(session, snp_info, context_wrapper, annotation, short_name2id)
		
		if not self.commit:
			return
		session.flush()
		session.commit()
예제 #6
0
	def calculateOverlappingStatForOneCombo(self, db, phenotype_method_id, call_method_id, analysis_method_id_ls, \
										no_of_top_snps=1000, association_overlapping_type=None, commit=False, \
										results_directory=None):
		"""
		Count the top-ranked SNPs shared by several analysis methods and save the count.
		
		For each id in analysis_method_id_ls the ResultsMethod row matching (phenotype_method_id,
		call_method_id, analysis_method_id) is fetched, and the snps_id values of its Results rows with
		rank <= no_of_top_snps are collected. The collected set is cached in self.results_id2snp_id_set
		under the ResultsMethod id. When fewer than no_of_top_snps rows exist,
		ResultsMethod2Results.rm2result() is called for the missing rank range before re-querying.
		The size of the intersection of all sets is stored in a new AssociationOverlappingStat row
		(session.save() + session.flush()).
		
		Arguments:
			db: database object providing .session; also forwarded to getSNPInfo().
			analysis_method_id_ls: analysis method ids to intersect; must not be empty.
			no_of_top_snps: rank cutoff, and the exact number of rows each result must provide.
			association_overlapping_type: assigned to the new row's overlapping_type.
			commit, results_directory: only forwarded to ResultsMethod2Results.rm2result().
		
		Returns None. On any problem (empty combo, missing ResultsMethod, wrong number of rows) an error
		is written to stderr and nothing is saved.
		
		2009-11-2
		"""
		sys.stderr.write("Calculating overlapping stat for phenotype %s and combo %s ...\n"%(phenotype_method_id, \
																						repr(analysis_method_id_ls),))
		if not analysis_method_id_ls:
			# nothing to intersect; previously this ended in an IndexError on snp_id_set_ls[0]
			sys.stderr.write("Error: empty analysis method combo for phenotype %s.\n"%phenotype_method_id)
			return
		session = db.session
		snp_id_set_ls = []
		for analysis_method_id in analysis_method_id_ls:
			rm = Stock_250kDB.ResultsMethod.query.filter_by(phenotype_method_id=phenotype_method_id).\
					filter_by(call_method_id=call_method_id).filter_by(analysis_method_id=analysis_method_id).first()
			if rm is None:
				# .first() yields None when no row matches; bail out instead of failing on rm.id
				sys.stderr.write("Error: no ResultsMethod for phenotype_method_id %s, call_method_id %s, analysis_method_id %s.\n"%\
								(phenotype_method_id, call_method_id, analysis_method_id))
				return
			if rm.id in self.results_id2snp_id_set:
				snp_id_set = self.results_id2snp_id_set.get(rm.id)
			else:
				association_entries = Stock_250kDB.Results.query.filter_by(results_id=rm.id).\
						filter(Stock_250kDB.Results.rank<=no_of_top_snps)
				no_of_association_entries = association_entries.count()
				if no_of_association_entries<no_of_top_snps:
					# not enough rows in the Results table yet: have rm2result() add the missing ranks
					min_rank = no_of_association_entries+1
					max_rank = no_of_top_snps
					if self.snp_info is None:
						self.snp_info = DrawSNPRegion.getSNPInfo(db)
					ResultsMethod2Results.rm2result(session, rm, self.snp_info, min_rank=min_rank, max_rank=max_rank, \
												commit=commit, results_directory=results_directory)
					association_entries = Stock_250kDB.Results.query.filter_by(results_id=rm.id).\
							filter(Stock_250kDB.Results.rank<=no_of_top_snps)
					# recount only after a refill; the first count is still valid otherwise
					no_of_association_entries = association_entries.count()
				if no_of_association_entries!=no_of_top_snps:
					sys.stderr.write("Error: The number of SNPs %s from Result %s (analysis_method_id %s) doesn't match the no_of_top_snps %s.\n"%(no_of_association_entries, rm.id, rm.analysis_method_id, no_of_top_snps))
					return
				snp_id_set = set([entry.snps_id for entry in association_entries])
				self.results_id2snp_id_set[rm.id] = snp_id_set
			snp_id_set_ls.append(snp_id_set)
		
		# SNPs present in every method's top list
		overlapping_snp_id_set = set.intersection(*snp_id_set_ls)
		no_of_overlapping_snps = len(overlapping_snp_id_set)
		
		entry = Stock_250kDB.AssociationOverlappingStat(phenotype_method_id=phenotype_method_id, call_method_id=call_method_id, \
										no_of_top_snps=no_of_top_snps, no_of_overlapping_snps=no_of_overlapping_snps)
		entry.overlapping_type = association_overlapping_type
		session.save(entry)
		session.flush()
		sys.stderr.write("%s overlapping SNPs out of %s results. Done.\n"%(no_of_overlapping_snps, len(snp_id_set_ls)))
예제 #7
0
	def on_button_draw_annotation_clicked(self, widget, data=None):
		"""
		Draw gene models on self.axe_gene_model for the x-axis range it currently shows.
		
		Does nothing but print a message when no genome-wide plot exists yet (self.chr_id2size empty).
		Calls db_connect() first when self.gene_annotation is not available.
		
		2008-12-16
			use DrawSNPRegion.drawGeneModel() to draw gene models
			(the former per-gene plot_one_gene() loop is gone)
		2008-02-02
		"""
		if not self.chr_id2size:
			sys.stderr.write("No genome-wide pvalue plot has been drawn yet. Do it first!\n")
			return
		if not self.gene_annotation:
			self.db_connect()
		
		axe = self.axe_gene_model
		xlim = axe.get_xlim()
		# translate both ends of the visible x-range into (chromosome, position)
		start_chr, start_pos = get_chr_pos_from_x_axis_pos(xlim[0], self.chr_gap, self.chr_id2cumu_size, self.chr_id_ls)
		stop_chr, stop_pos = get_chr_pos_from_x_axis_pos(xlim[1], self.chr_gap, self.chr_id2cumu_size, self.chr_id_ls)
		
		# fake a snps_within_this_region for drawGeneModel()
		visible_region = PassingData(chr_pos_ls=[[start_chr, start_pos], [stop_chr, stop_pos]])
		
		base_y_value = 1
		gene_width = 0.8
		gene_position_cycle = 5
		
		# drawGeneOnTheBoundary is False because later adding text to these genes would corrupt the running program.
		draw_options = dict(candidate_gene_set=None,
			gene_width=gene_width, gene_position_cycle=gene_position_cycle, base_y_value=base_y_value,
			gene_box_text_gap=20, label_gene=0, rotate_xy=False,
			chr_id2cumu_size=self.chr_id2cumu_size, chr_id2size=self.chr_id2size, chr_gap=self.chr_gap,
			artist_obj_id2artist_gene_id_ls=self.artist_obj_id2artist_gene_id_ls,
			gene_id2artist_object_id=self.gene_id2artist_object_id, drawGeneOnTheBoundary=False)
		DrawSNPRegion.drawGeneModel(axe, visible_region, self.gene_annotation, **draw_options)
		
		y_bottom = base_y_value-gene_width
		y_top = gene_position_cycle+gene_width*2
		self.axe_gene_model.set_ylim([y_bottom, y_top])
		
		self.canvas_matplotlib.draw()
예제 #8
0
    def run(self):
        """
        Prepare the gene symbol lookup and the gene annotation, then run improveTAIRGeneGFF()
        on self.input_fname, writing to self.output_fname.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        import MySQLdb
        # NOTE(review): host is fixed here although self.hostname is used for DrawSNPRegion below.
        conn = MySQLdb.connect(db=self.dbname, host='banyan.usc.edu', user=self.db_user, passwd=self.db_passwd)
        curs = conn.cursor()

        from pymodule import get_gene_symbol2gene_id_set
        lookup_options = {'table': 'genome.gene_symbol2id', 'upper_case_gene_symbol': 1}
        # 3702 is At's tax id
        symbol2gene_id_set = get_gene_symbol2gene_id_set(curs, 3702, **lookup_options)

        from variation.src.DrawSNPRegion import DrawSNPRegion
        # input_fname and output_dir are just random stuff
        drawer = DrawSNPRegion(
            db_user=self.db_user,
            db_passwd=self.db_passwd,
            hostname=self.hostname,
            database=self.dbname,
            input_fname='/tmp/dumb',
            output_dir='/tmp',
            debug=0)
        annotation = drawer.dealWithGeneAnnotation(self.gene_annotation_picklef, cls_with_db_args=drawer)

        self.improveTAIRGeneGFF(self.input_fname, symbol2gene_id_set, annotation, self.output_fname)
예제 #9
0
	def db_connect(self):
		"""
		Read the connection settings from the dialog widgets, open the MySQL connection and the
		Stock_250kDB session, and load self.gene_annotation if it is not set yet.
		
		Sets self.drivername, self.hostname, self.dbname, self.db_user, self.db_passwd and, on success,
		self.mysql_conn, self.mysql_curs, self.db and self.session. A connection failure is reported on
		stderr (with traceback) and does not abort the method: the gene annotation step still runs.
		
		2010-1-15
			pass "cls_with_db_args=self" to DrawSNPRegion.dealWithGeneAnnotation()
		2009-12-09
			add db_user, db_passwd to MySQLdb.connect()
		2008-12-16
			add gene_annotation_picklef
		2008-02-01
			read the data in dialog_db_connect and establish the connections to two databases
		"""
		sys.stderr.write("Database Connecting ...")
		self.drivername = 'mysql'
		self.hostname = self.entry_mysql_hostname.get_text()
		self.dbname = self.entry_mysql_dbname.get_text()
		self.db_user = self.xml.get_widget("entry_db_user").get_text()
		self.db_passwd = self.xml.get_widget("entry_db_passwd").get_text()
		
		import MySQLdb
		try:
			self.mysql_conn = MySQLdb.connect(db=self.dbname, host=self.hostname, user=self.db_user, passwd=self.db_passwd)
			self.mysql_curs = self.mysql_conn.cursor()
			self.db = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.db_user,
					   password=self.db_passwd, hostname=self.hostname, database=self.dbname)
			self.db.setup(create_tables=False)
			self.session = self.db.session
		except Exception:
			# Exception rather than a bare except, so KeyboardInterrupt/SystemExit are no longer swallowed.
			sys.stderr.write('DB connection error: %s\n'%repr(sys.exc_info()))
			traceback.print_exc()
		
		if not self.gene_annotation:
			gene_annotation_picklef = self.entry_gene_annotation_picklef.get_text()
			self.gene_annotation = DrawSNPRegion.dealWithGeneAnnotation(gene_annotation_picklef, cls_with_db_args=self)
		
		#2010-1-13 for postgresql. commented out
		#hostname = self.entry_postgres_hostname.get_text()
		#dbname = self.entry_postgres_dbname.get_text()
		#schema = self.entry_postgres_schema.get_text()
		
		#from annot.bin.codense.common import db_connect			#2008-12-16 don't need postgres conn anymore
		#self.postgres_conn, self.postgres_curs = db_connect(hostname, dbname, schema)
		
		sys.stderr.write("Done.\n")
예제 #10
0
	def run(self):
		"""
		Call self.rm2result() for every ResultsMethod row of self.call_method_id whose analysis
		method id is in self.analysis_method_id_ls. The session is committed only when self.commit is true.
		
		2009-6-10
			set Results.beta = getattr(data_obj, 'beta1', None)
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		
		connection_options = dict(drivername=self.drivername, username=self.db_user, password=self.db_passwd,
			hostname=self.hostname, database=self.dbname, schema=self.schema)
		db = Stock_250kDB.Stock_250kDB(**connection_options)
		db.setup(create_tables=False)
		session = db.session
		session.begin()
		
		snp_info = DrawSNPRegion.getSNPInfo(db)
		
		ResultsMethod = Stock_250kDB.ResultsMethod
		by_call_method = ResultsMethod.query.filter_by(call_method_id=self.call_method_id)
		selected_results = by_call_method.filter(ResultsMethod.analysis_method_id.in_(self.analysis_method_id_ls))
		for results_method in selected_results:
			self.rm2result(session, results_method, snp_info, max_rank=self.max_rank, commit=self.commit,
				results_directory=self.results_directory)
		
		if not self.commit:
			return
		session.commit()
예제 #11
0
    def improveTAIRGeneGFF(self, input_fname, gene_symbol2gene_id_set, gene_annotation, output_fname):
        """
        Copy a TAIR gene GFF file to output_fname, adding "Alias" and "description" attributes.

        Every row is written out. For a non-chromosome row whose last column holds an "ID=...;" entry,
        the id is resolved through getGeneIDSetGivenAccVer(); when exactly one gene id is found and
        gene_annotation.gene_id2model has a model for it, "Alias=<gene symbol>;description=<...>" is
        appended to the last column. A trailing ';' of the last column is stripped.

        Blank rows, rows with fewer than three columns and rows with an empty last column are passed
        through instead of raising IndexError. Both files are closed when the method returns or raises.

        Arguments:
            input_fname: path of the GFF file to read; its delimiter comes from figureOutDelimiter().
            gene_symbol2gene_id_set: lookup handed to getGeneIDSetGivenAccVer().
            gene_annotation: object whose gene_id2model maps a gene id to its gene model.
            output_fname: path of the file to write (overwritten).

        2009-2-5
            apply the improvement to any non-chromosome lines with 'ID' entry
            escape ';' by '%3B', which is regarded as a separator for every "name=value"
            escape ',' by '%2C', which is regarded as a separator for every "value"
            escape gene_symbol/Alias whose value matches individual chromosome (like gene 'CHR5' = 'Gene CHR5')
        2009-2-4
            if the last column has 'ID' in it, find corresponding gene id using its value and add gene_symbol and description
        """
        sys.stderr.write("Improving TAIR Gene GFF with symbols and descriptions ...\n")
        import re

        # Tried in this order; a later pattern that matches overrides an earlier one.
        id_patterns = (
            re.compile(r"ID=(\w+)\.(\d+);"),
            re.compile(r"ID=(\w+);"),
            re.compile(r"ID=(\w+)\.(\d+)-Protein;"),
        )
        p_chr_name = re.compile(r"CHR\d+$")  # to escape gene_symbol/Alias whose value matches individual chromosome
        # the upper-cased field names do not change from row to row
        upper_gene_desc_names = [name.upper() for name in self.gene_desc_names]
        delimiter = figureOutDelimiter(input_fname)
        counter = 0
        success_counter = 0
        with open(input_fname) as inf:
            with open(output_fname, "w") as outf:
                reader = csv.reader(inf, delimiter=delimiter)
                # lineterminator is important. GFF3Loader would break down if it's dos terminator('\r\n').
                writer = csv.writer(outf, delimiter=delimiter, lineterminator="\n")
                for row in reader:
                    counter += 1
                    if not row:
                        # csv.reader yields [] for a blank line: nothing to inspect, keep it as is
                        writer.writerow(row)
                        continue
                    last_col = row[-1]
                    tair_id = None
                    for pattern in id_patterns:
                        id_match = pattern.search(last_col)
                        if id_match:
                            tair_id = id_match.group(1)

                    gene_model = None
                    if tair_id is not None and len(row) > 2 and row[2] != "chromosome":
                        gene_id_set = getGeneIDSetGivenAccVer(tair_id, gene_symbol2gene_id_set)
                        gene_id = None
                        if gene_id_set is None:
                            sys.stderr.write(
                                "Linking to gene id failed for %s. No such gene_symbol, %s, in gene_symbol2gene_id_set.\n"
                                % (last_col, tair_id)
                            )
                        elif len(gene_id_set) == 1:
                            gene_id = list(gene_id_set)[0]
                            success_counter += 1
                        elif len(gene_id_set) > 1:
                            sys.stderr.write("Too many gene_ids: %s, %s.\n" % (tair_id, gene_id_set))
                        else:
                            sys.stderr.write(
                                "Linking to gene id failed for %s. There is gene_symbol, %s, in gene_symbol2gene_id_set but it's empty.\n"
                                % (last_col, tair_id)
                            )
                        if gene_id is not None:
                            gene_model = gene_annotation.gene_id2model.get(gene_id)

                    if gene_model is not None:
                        gene_commentary = gene_model.gene_commentaries[0]
                        gene_desc_ls = DrawSNPRegion.returnGeneDescLs(
                            self.gene_desc_names,
                            gene_model,
                            gene_commentary,
                            cutoff_length=600,
                            replaceNoneElemWithEmptyStr=1,
                        )
                        description = ",  ".join(
                            [": ".join(entry) for entry in zip(upper_gene_desc_names, gene_desc_ls)]
                        )
                        # escape ';', which is regarded as a separator for every "name=value"
                        description = description.replace(";", "%3B")
                        # escape ',', which is regarded as a separator for every "value"
                        description = description.replace(",", "%2C")

                        if not last_col.endswith(";"):  # no ; delimiter at the end, append one
                            last_col += ";"
                        gene_symbol = gene_model.gene_symbol
                        gene_symbol = gene_symbol.replace(";", "%3B")
                        gene_symbol = gene_symbol.replace(",", "%2C")
                        if p_chr_name.match(gene_symbol):  # match the chromosome name, change
                            gene_symbol = "Gene %s" % gene_symbol
                        last_col += "Alias=%s;" % gene_symbol
                        last_col += "description=%s" % description
                        row[-1] = last_col

                    if last_col.endswith(";"):
                        # drop the trailing separator; endswith() is also safe for an empty column
                        last_col = last_col[:-1]
                        row[-1] = last_col
                    if counter % 5000 == 0:
                        sys.stderr.write("%s%s\t%s" % ("\x08" * 80, success_counter, counter))
                    writer.writerow(row)

        sys.stderr.write("%s/%s Done.\n" % (success_counter, counter))
예제 #12
0
	def run(self):
		"""
		For each phenotype, collect score/rank data of its results and plot 'score' and 'rank'
		histograms via plotHistForOnePhenotype(); when self.commit is set, the figures are stored
		in a ScoreRankHistogram row.
		
		Exits the process with status 4 when the candidate gene list is smaller than self.min_sample_size
		and with status 3 when the gene list type is not found. Returns None on an invalid results type.
		
		2008-12-08 if the plot under configuration is already in db, abort only if the program is gonna commit the database transaction.
		2008-10-19
			save figures in database if commit
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		db = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.db_user,
				   password=self.db_passwd, hostname=self.hostname, database=self.dbname, schema=self.schema)
		db.setup(create_tables=False)
		session = db.session
		#session.begin()
		
		# results_type selects the table class the results are read from
		# NOTE(review): snps_context_wrapper is only assigned for results_type==1, yet it is used further down
		# whenever null_distribution_type_id is 2 or 3 -- results_type==2 with those types looks like a NameError; confirm.
		if self.results_type==1:
			ResultsClass = Stock_250kDB.ResultsMethod
			snps_context_wrapper = self.dealWithSnpsContextWrapper(self.snps_context_picklef, self.min_distance, self.get_closest)
		elif self.results_type==2:
			ResultsClass = Stock_250kDB.ResultsByGene
		else:
			sys.stderr.write("Invalid results type : %s.\n"%self.results_type)
			return None
		
		hist_type = self.getHistType(self.call_method_id, self.min_distance, self.get_closest, self.min_MAF, \
									self.allow_two_sample_overlapping, self.results_type, self.null_distribution_type_id)
		
		candidate_gene_list = self.getGeneList(self.list_type_id)
		if len(candidate_gene_list)<self.min_sample_size:
			sys.stderr.write("Candidate gene list of %s too small: %s.\n"%(self.list_type_id, len(candidate_gene_list)))
			sys.exit(4)
		#candidate_gene_list = []		#2009-01-12 just to plot the histogram of pvalue
		
		candidate_gene_set = Set(candidate_gene_list)
		list_type = Stock_250kDB.GeneListType.get(self.list_type_id)
		if list_type is None:
			# unknown list type: exit without a message
			sys.exit(3)
		
		phenotype_id2results_id_ls = self.getResultsIDLs(db, ResultsClass, self.results_type, self.phenotype_id_ls, \
														self.min_distance, self.get_closest, self.min_MAF, self.call_method_id)
		
			
		param_data = PassingData(results_directory=self.results_directory, candidate_gene_list=candidate_gene_list, \
			min_MAF=self.min_MAF, allow_two_sample_overlapping=self.allow_two_sample_overlapping, need_the_value=1, \
			do_log10_transformation=False)
			#need_the_value means to get the pvalue/score
			#force no log10 transformation. otherwise, transformation based on analysis_method
		# For null distribution types 2 and 3 the candidate / non-candidate SNP index lists are built once here
		# and reused for every result in the loop below.
		if self.null_distribution_type_id==2 or self.null_distribution_type_id==3:	#gw-looping or random gene list
			snp_info = DrawSNPRegion.getSNPInfo(db)
			candidate_gene_snp_index_ls = self.get_candidate_gene_snp_index_ls(candidate_gene_set, snp_info.chr_pos_ls, snps_context_wrapper)
			no_of_snps = len(snp_info.chr_pos_ls)
			# NOTE(review): presumably relies on Python 2 integer division; raises ZeroDivisionError if
			# candidate_gene_snp_index_ls is empty -- confirm that cannot happen.
			no_of_permutations = no_of_snps/len(candidate_gene_snp_index_ls) + 1
			param_data.chr_pos2index = snp_info.chr_pos2index	#pass to getGenomeWideResultFromFile
			if self.null_distribution_type_id==2:
				non_candidate_gene_snp_index_ls = self.get_non_candidate_gene_snp_index_ls_by_permutation(candidate_gene_snp_index_ls, no_of_snps, no_of_permutations)
				
			elif self.null_distribution_type_id == 3:
				gene_id_ls = get_total_gene_ls(db.metadata.bind)
				no_of_candidate_genes = len(candidate_gene_set)
				non_candidate_gene_snp_index_ls = numpy.zeros(0, numpy.int)
				# keep sampling random gene sets of the candidate set's size until at least no_of_snps indices are gathered
				while len(non_candidate_gene_snp_index_ls)<no_of_snps:
					non_candidate_gene_set = Set(random.sample(gene_id_ls, no_of_candidate_genes))
					_non_candidate_gene_snp_index_ls = self.get_candidate_gene_snp_index_ls(non_candidate_gene_set, snp_info.chr_pos_ls, snps_context_wrapper)
					non_candidate_gene_snp_index_ls = numpy.hstack((non_candidate_gene_snp_index_ls, _non_candidate_gene_snp_index_ls))
		
		for phenotype_id, results_id_ls in phenotype_id2results_id_ls.iteritems():
			if hist_type.id:	#hist_type already in database
				rows = Stock_250kDB.ScoreRankHistogram.query.filter_by(phenotype_method_id=phenotype_id).\
					filter_by(list_type_id=self.list_type_id).filter_by(hist_type_id=hist_type.id)
				if rows.count()>0 and self.commit:	#2008-12-08 only skip if the database transaction is gonna commit.
					row = rows.first()
					sys.stderr.write("Histogram already in database. id=%s, phenotype_id=%s, list_type_id=%s, hist_type_id=%s.\n"%\
									(row.id, row.phenotype_method_id, row.list_type_id, row.hist_type_id))
					continue
			phenotype_method = Stock_250kDB.PhenotypeMethod.get(phenotype_id)
			if not phenotype_method:
				# no such phenotype: skip silently
				continue
			score_rank_data_ls = []
			sys.stderr.write("Checking phenotype %s (%s) on list_type %s (%s) ...\n"%\
							(phenotype_method.id, phenotype_method.short_name, list_type.id, list_type.short_name))
			
			for results_id in results_id_ls:
				# best effort per result: any failure below is logged and the loop moves on to the next result
				try:
					rm = ResultsClass.get(results_id)
					score_rank_data = None
					if self.null_distribution_type_id==1:
						if self.results_type==1:
							permData = self.prepareDataForPermutationRankTest(rm, snps_context_wrapper, param_data)
							if not permData:
								continue
							score_rank_data = PassingData(candidate_score_ls=permData.candidate_gene_snp_value_ls, \
													candidate_rank_ls=permData.candidate_gene_snp_rank_ls,\
									non_candidate_score_ls=permData.non_candidate_gene_snp_value_ls, non_candidate_rank_ls=permData.non_candidate_gene_snp_rank_ls,\
									analysis_method=rm.analysis_method)
							del permData
						elif self.results_type==2:
							score_rank_data = self.getScoreRankFromRBG(rm, candidate_gene_set, self.results_directory)
					elif self.null_distribution_type_id==2 or self.null_distribution_type_id==3:
						genome_wide_result = self.getResultMethodContent(rm, param_data.results_directory, param_data.min_MAF, pdata=param_data)
						if not genome_wide_result:
							continue
						score_rank_data = self.getScoreRankFromPermIndexLs(genome_wide_result, candidate_gene_snp_index_ls, non_candidate_gene_snp_index_ls)
						if score_rank_data:
							score_rank_data.analysis_method = rm.analysis_method
					
					if score_rank_data:
						score_rank_data_ls.append(score_rank_data)
				except:
						# NOTE(review): bare except also catches KeyboardInterrupt/SystemExit
						sys.stderr.write("Exception happened for results_id=%s, phenotype_id=%s.\n"%(results_id, phenotype_id))
						traceback.print_exc()
						sys.stderr.write('%s.\n'%repr(sys.exc_info()))
						continue
			if score_rank_data_ls:

				# one figure per data type; each call returns a (png, svg) pair of buffers read with getvalue() below
				score_png_data, score_svg_data = self.plotHistForOnePhenotype(phenotype_method, list_type, score_rank_data_ls, self.output_dir, data_type='score', commit=self.commit)
				rank_png_data, rank_svg_data = self.plotHistForOnePhenotype(phenotype_method, list_type, score_rank_data_ls, self.output_dir, data_type='rank', commit=self.commit)
				if self.commit:
					score_rank_hist = Stock_250kDB.ScoreRankHistogram(phenotype_method_id=phenotype_id, list_type_id=list_type.id)
					score_rank_hist.hist_type = hist_type
					score_rank_hist.score_hist = score_png_data.getvalue()
					score_rank_hist.score_hist_svg = score_svg_data.getvalue()
					score_rank_hist.rank_hist = rank_png_data.getvalue()
					score_rank_hist.rank_hist_svg = rank_svg_data.getvalue()
					session.save(score_rank_hist)
					session.flush()
					del score_png_data, score_svg_data, rank_png_data, rank_svg_data
		"""
예제 #13
0
    def improveTAIRGeneGFF(self, input_fname, gene_symbol2gene_id_set,
                           gene_annotation, output_fname):
        """
        Copy a TAIR gene GFF file to output_fname, adding 'Alias' and 'description' attributes.

        Every row is written out. For a non-chromosome row whose last column holds an 'ID=...;' entry,
        the id is resolved through getGeneIDSetGivenAccVer(); when exactly one gene id is found and
        gene_annotation.gene_id2model has a model for it, 'Alias=<gene symbol>;description=<...>' is
        appended to the last column. A trailing ';' of the last column is stripped.

        Blank rows, rows with fewer than three columns and rows with an empty last column are passed
        through instead of raising IndexError. Both files are closed when the method returns or raises.

        Arguments:
            input_fname: path of the GFF file to read; its delimiter comes from figureOutDelimiter().
            gene_symbol2gene_id_set: lookup handed to getGeneIDSetGivenAccVer().
            gene_annotation: object whose gene_id2model maps a gene id to its gene model.
            output_fname: path of the file to write (overwritten).

        2009-2-5
            apply the improvement to any non-chromosome lines with 'ID' entry
            escape ';' by '%3B', which is regarded as a separator for every "name=value"
            escape ',' by '%2C', which is regarded as a separator for every "value"
            escape gene_symbol/Alias whose value matches individual chromosome (like gene 'CHR5' = 'Gene CHR5')
        2009-2-4
            if the last column has 'ID' in it, find corresponding gene id using its value and add gene_symbol and description
        """
        sys.stderr.write(
            "Improving TAIR Gene GFF with symbols and descriptions ...\n")
        import re
        # Tried in this order; a later pattern that matches overrides an earlier one.
        id_patterns = (
            re.compile(r'ID=(\w+)\.(\d+);'),
            re.compile(r'ID=(\w+);'),
            re.compile(r'ID=(\w+)\.(\d+)-Protein;'),
        )
        #to escape gene_symbol/Alias whose value matches individual chromosome
        p_chr_name = re.compile(r'CHR\d+$')
        #the upper-cased field names do not change from row to row
        upper_gene_desc_names = [name.upper() for name in self.gene_desc_names]
        delimiter = figureOutDelimiter(input_fname)
        counter = 0
        success_counter = 0
        with open(input_fname) as inf:
            with open(output_fname, 'w') as outf:
                reader = csv.reader(inf, delimiter=delimiter)
                #lineterminator is important. GFF3Loader would break down if it's dos terminator('\r\n').
                writer = csv.writer(outf, delimiter=delimiter, lineterminator='\n')
                for row in reader:
                    counter += 1
                    if not row:
                        #csv.reader yields [] for a blank line: nothing to inspect, keep it as is
                        writer.writerow(row)
                        continue
                    last_col = row[-1]
                    tair_id = None
                    for pattern in id_patterns:
                        id_match = pattern.search(last_col)
                        if id_match:
                            tair_id = id_match.group(1)

                    gene_model = None
                    if tair_id is not None and len(row) > 2 and row[2] != 'chromosome':
                        gene_id_set = getGeneIDSetGivenAccVer(tair_id,
                                                              gene_symbol2gene_id_set)
                        gene_id = None
                        if gene_id_set is None:
                            sys.stderr.write(
                                "Linking to gene id failed for %s. No such gene_symbol, %s, in gene_symbol2gene_id_set.\n"
                                % (last_col, tair_id))
                        elif len(gene_id_set) == 1:
                            gene_id = list(gene_id_set)[0]
                            success_counter += 1
                        elif len(gene_id_set) > 1:
                            sys.stderr.write("Too many gene_ids: %s, %s.\n" %
                                             (tair_id, gene_id_set))
                        else:
                            sys.stderr.write(
                                "Linking to gene id failed for %s. There is gene_symbol, %s, in gene_symbol2gene_id_set but it's empty.\n"
                                % (last_col, tair_id))
                        if gene_id is not None:
                            gene_model = gene_annotation.gene_id2model.get(gene_id)

                    if gene_model is not None:
                        gene_commentary = gene_model.gene_commentaries[0]
                        gene_desc_ls = DrawSNPRegion.returnGeneDescLs(
                            self.gene_desc_names, gene_model, gene_commentary,
                            cutoff_length=600, replaceNoneElemWithEmptyStr=1)
                        description = ',  '.join([
                            ': '.join(entry)
                            for entry in zip(upper_gene_desc_names, gene_desc_ls)
                        ])
                        #escape ';', which is regarded as a separator for every "name=value"
                        description = description.replace(';', '%3B')
                        #escape ',', which is regarded as a separator for every "value"
                        description = description.replace(',', '%2C')

                        if not last_col.endswith(';'):  #no ; delimiter at the end, append one
                            last_col += ';'
                        gene_symbol = gene_model.gene_symbol
                        gene_symbol = gene_symbol.replace(';', '%3B')
                        gene_symbol = gene_symbol.replace(',', '%2C')
                        if p_chr_name.match(gene_symbol):  #match the chromosome name, change
                            gene_symbol = 'Gene %s' % gene_symbol
                        last_col += 'Alias=%s;' % gene_symbol
                        last_col += 'description=%s' % description
                        row[-1] = last_col

                    if last_col.endswith(';'):
                        #drop the trailing separator; endswith() is also safe for an empty column
                        last_col = last_col[:-1]
                        row[-1] = last_col
                    if counter % 5000 == 0:
                        sys.stderr.write("%s%s\t%s" %
                                         ('\x08' * 80, success_counter, counter))
                    writer.writerow(row)

        sys.stderr.write("%s/%s Done.\n" % (success_counter, counter))