def readInMonkeysFromDB(self, db_vervet=None, countryIDList=None, maxLongitude=None, addOnlyVWPMonkeys=False):
		"""
		Fetch the monkeys of the given countries from view_individual and return them keyed by ucla_id.
		
		Only rows with non-null latitude and longitude are selected.
		
		Arguments:
			db_vervet: database object. Its metadata.bind.execute() runs the query and its
				getProperAlignmentGivenIndividualID() is called once per fetched row.
			countryIDList: country IDs, rendered into the "country_id in (...)" clause
				through utils.getStrOutOfList().
			maxLongitude: if not None, adds the filter "longitude<=maxLongitude".
			addOnlyVWPMonkeys: if true, adds the filter "ucla_id ~ '^VWP'"
				(presumably a regular-expression match on IDs starting with VWP; confirm against the db engine).
		
		Return:
			dictionary, ucla_id -> PassingData(latitude, longitude, sex, db_id, country, site_name,
				alignment_id, alignment_depth). alignment_id and alignment_depth remain None
				if no proper alignment is returned for that monkey.
		
		2012.12.2 add argument addOnlyVWPMonkeys
		2012.11.26
		"""
		sys.stderr.write("Fetching all monkeys of country %s from db , maxLongitude = %s ..."%\
						(utils.getStrOutOfList(countryIDList), maxLongitude))
		
		#the two mandatory filters come first, optional ones are appended after them
		conditionList = ["country_id in (%s)"%(utils.getStrOutOfList(countryIDList))]
		conditionList.append("latitude is not null and longitude is not null")
		if maxLongitude is not None:
			conditionList.append(" longitude<=%s "%(maxLongitude))
		if addOnlyVWPMonkeys:
			conditionList.append(" ucla_id ~ '^VWP' ")
		sqlStatement = "select * from view_individual where %s"%(" and ".join(conditionList))
		
		monkeyID2Info = {}
		for row in db_vervet.metadata.bind.execute(sqlStatement):
			monkeyInfo = PassingData(latitude=row.latitude, longitude=row.longitude, sex=row.sex, db_id=row.id,\
									country=row.country, site_name=row.site_name, \
									alignment_id=None, alignment_depth=None)
			monkeyID2Info[row.ucla_id] = monkeyInfo
			#fill in the alignment fields only if the db reports a proper alignment for this monkey
			properAlignment = db_vervet.getProperAlignmentGivenIndividualID(ucla_id=row.ucla_id)
			if properAlignment:
				monkeyInfo.alignment_id = properAlignment.alignment_id
				monkeyInfo.alignment_depth = properAlignment.median_depth
		sys.stderr.write("%s monkeys.\n"%(len(monkeyID2Info)))
		return monkeyID2Info
	def readInSequencedMonkeys(self, db_vervet=None, countryIDList=None):
		"""
		Return the set of ucla_id values found in view_individual_sequence for the given countries.
		
		Arguments:
			db_vervet: database object. Its metadata.bind.execute() runs the query.
			countryIDList: country IDs, rendered into the "country_id in (...)" clause
				through utils.getStrOutOfList().
		
		Return:
			set of ucla_id, one entry per distinct ucla_id among the fetched rows.
		
		2012.11.26
		"""
		sys.stderr.write("Fetching sequenced monkeys of country %s from db ..."%\
							(utils.getStrOutOfList(countryIDList)))
		
		sqlStatement = "select * from view_individual_sequence where country_id in (%s)"%\
								(utils.getStrOutOfList(countryIDList))
		#a set collapses rows that share the same ucla_id
		monkey_UCLAID_set = set(row.ucla_id for row in db_vervet.metadata.bind.execute(sqlStatement))
		
		sys.stderr.write("%s monkeys.\n"%(len(monkey_UCLAID_set)))
		return monkey_UCLAID_set
    def handleTitle(self, ):
        """
        Apply a title to the current pylab figure and return the title string.

        self.title is used when it is set (truthy). Otherwise a title is
        assembled from self.min_overlap_ratio_set, self.total_no_of_results_set
        and the number of entries in self.phenotype_method_id_set.

        2012.8.16
            add min_overlap_ratio, no of phenotypes, total_no_of_results into the title.
        """
        title = self.title
        if not title:
            # no user-supplied title: summarize the data that went into the plot
            no_of_phenotypes = len(list(self.phenotype_method_id_set))
            min_overlap_in_str = utils.getStrOutOfList(list(self.min_overlap_ratio_set))
            no_of_results_in_str = utils.getStrOutOfList(list(self.total_no_of_results_set))
            title = 'min_overlap %s, #results %s, #phenotypes %s'%(min_overlap_in_str,\
               no_of_results_in_str, no_of_phenotypes)
        pylab.title(title)
        return title
Esempio n. 4
0
	def discoverAssociationLocus(self, associationPeakGraph=None, min_overlap_ratio=0.1):
		"""
		Turn every connected component of associationPeakGraph into one association locus.
		
		Arguments:
			associationPeakGraph: graph whose node attribute dictionaries carry the keys
				'chromosome', 'span' (span[0] is used as start, span[1] as stop),
				'association_peak_ls', 'result_id' and 'phenotype_method_id'.
			min_overlap_ratio: not referenced inside this method.
		
		Return:
			list of PassingDataList, one per connected component, with attributes assigned in this order:
				chromosome, start (median of node starts), stop (median of node stops),
				no_of_peaks (number of nodes), connectivity, no_of_results (number of distinct result_id),
				association_peak_ls (peaks of all nodes concatenated),
				phenotype_id_ls_in_str (sorted distinct phenotype ids via utils.getStrOutOfList()).
		
		The program exits with status 7 if one connected component contains more than one chromosome.
		
		2012.12.12 try to output the peaks that are associated with one locus. for each peak, output
				* result-id 
				* phenotype id
				* chromosome
				* start
				* stop
				* start_locus
				* stop_locus
				* no_of_loci
				* peak_locus
				* peak-score
		2012.11.20
		2012.6.24
		"""
		sys.stderr.write("Discovering association loci from graph of %s nodes. %s edges. %s connected components..."%\
						(associationPeakGraph.number_of_nodes(), associationPeakGraph.number_of_edges(), \
						nx.number_connected_components(associationPeakGraph) ))
		associationLocusList = []
		for componentGraph in nx.connected_component_subgraphs(associationPeakGraph):
			noOfEdges = componentGraph.number_of_edges()
			noOfNodes = componentGraph.number_of_nodes()
			#connectivity = number of edges divided by the number of node pairs, nn*(nn-1)/2.
			#a single-node component has no pair, its connectivity is set to 1.
			if noOfNodes<=1:
				connectivity = 1
			else:
				connectivity = noOfEdges/float(noOfNodes*(noOfNodes-1)/2)
			
			#gather the span and the identifiers of every node (=peak) in this component
			spanStartList = []
			spanStopList = []
			componentPeakList = []
			resultIDSet = set()
			chromosomeSet = set()	#expected to hold exactly one chromosome
			phenotypeIDSet = set()
			for node in componentGraph:
				nodeData = associationPeakGraph.node[node]
				chromosomeSet.add(nodeData['chromosome'])
				nodeSpan = nodeData['span']
				spanStartList.append(nodeSpan[0])
				spanStopList.append(nodeSpan[1])
				componentPeakList.extend(nodeData['association_peak_ls'])
				resultIDSet.add(nodeData['result_id'])
				phenotypeIDSet.add(nodeData['phenotype_method_id'])
			if len(chromosomeSet)>1:
				sys.stderr.write("Error: %s chromosomes (%s) in one connected component.\n"%(len(chromosomeSet), repr(chromosomeSet)))
				sys.exit(7)
			
			#the locus boundary is the median of all node starts/stops
			locusStart = numpy.median(spanStartList)
			locusStop = numpy.median(spanStopList)
			sortedPhenotypeIDList = sorted(phenotypeIDSet)
			
			#attributes are assigned one by one, in this exact order, because the assignment order
			#determines the order of variables in the PassingDataList's internal list
			#(PassingDataList is sortable via (chromosome, start, stop ...))
			associationLocus = PassingDataList()
			associationLocus.chromosome = chromosomeSet.pop()
			associationLocus.start = locusStart
			associationLocus.stop = locusStop
			associationLocus.no_of_peaks = noOfNodes
			associationLocus.connectivity = connectivity
			associationLocus.no_of_results = len(resultIDSet)
			associationLocus.association_peak_ls = componentPeakList
			associationLocus.phenotype_id_ls_in_str = utils.getStrOutOfList(sortedPhenotypeIDList)
			associationLocusList.append(associationLocus)
		sys.stderr.write("%s association loci.\n"%(len(associationLocusList)))
		return associationLocusList