def readInMonkeysFromDB(self, db_vervet=None, countryIDList=None, maxLongitude=None, addOnlyVWPMonkeys=False):
    """
    Fetch individuals from the view_individual view and index them by ucla_id.

    Only rows whose country_id is among countryIDList and whose latitude and
    longitude are both non-null are selected.

    Arguments:
        db_vervet: database handle; its metadata.bind.execute() runs the query
            and its getProperAlignmentGivenIndividualID() is called once per row.
        countryIDList: country ids, rendered into the SQL "in (...)" clause
            through utils.getStrOutOfList().
        maxLongitude: if not None, adds a "longitude<=maxLongitude" filter.
        addOnlyVWPMonkeys: if true, adds a filter that ucla_id matches '^VWP'.

    Returns:
        dict mapping ucla_id to a PassingData carrying latitude, longitude,
        sex, db_id, country, site_name, alignment_id and alignment_depth.
        The last two stay None when no proper alignment is returned.

    2012.12.2 add argument addOnlyVWPMonkeys
    2012.11.26
    """
    progress_msg = "Fetching all monkeys of country %s from db , maxLongitude = %s ..." % (
        utils.getStrOutOfList(countryIDList), maxLongitude)
    sys.stderr.write(progress_msg)

    # NOTE(review): the SQL below is assembled by string interpolation, so the
    # arguments are presumably expected to come from trusted callers -- confirm.
    filter_ls = [
        "country_id in (%s)" % (utils.getStrOutOfList(countryIDList)),
        "latitude is not null and longitude is not null",
    ]
    if maxLongitude is not None:
        filter_ls.append(" longitude<=%s " % (maxLongitude))
    if addOnlyVWPMonkeys:
        filter_ls.append(" ucla_id ~ '^VWP' ")
    # filter_ls always holds at least the two mandatory conditions above
    sql = "%s where %s" % ("select * from view_individual", " and ".join(filter_ls))

    monkeyID2Info = {}
    for record in db_vervet.metadata.bind.execute(sql):
        monkey_id = record.ucla_id
        info = PassingData(latitude=record.latitude, longitude=record.longitude,
            sex=record.sex, db_id=record.id,
            country=record.country, site_name=record.site_name,
            alignment_id=None, alignment_depth=None)
        monkeyID2Info[monkey_id] = info
        # alignment fields are only filled in when a proper alignment exists
        alignment = db_vervet.getProperAlignmentGivenIndividualID(ucla_id=monkey_id)
        if alignment:
            info.alignment_id = alignment.alignment_id
            info.alignment_depth = alignment.median_depth

    sys.stderr.write("%s monkeys.\n" % (len(monkeyID2Info)))
    return monkeyID2Info
def readInSequencedMonkeys(self, db_vervet=None, countryIDList=None):
    """
    Collect the ucla_id of every row in view_individual_sequence whose
    country_id is among countryIDList.

    Arguments:
        db_vervet: database handle; its metadata.bind.execute() runs the query.
        countryIDList: country ids, rendered into the SQL "in (...)" clause
            through utils.getStrOutOfList().

    Returns:
        set of ucla_id values found in the query result.

    2012.11.26
    """
    sys.stderr.write("Fetching sequenced monkeys of country %s from db ..." % (
        utils.getStrOutOfList(countryIDList)))

    sql = "select * from view_individual_sequence where country_id in (%s)" % (
        utils.getStrOutOfList(countryIDList))
    result_rows = db_vervet.metadata.bind.execute(sql)
    # a set, so an individual appearing in several rows is counted once
    monkey_UCLAID_set = set(record.ucla_id for record in result_rows)

    sys.stderr.write("%s monkeys.\n" % (len(monkey_UCLAID_set)))
    return monkey_UCLAID_set
def handleTitle(self, ):
    """
    Set the pylab figure title and return the title string.

    When self.title is truthy it is used as is. Otherwise a title is built
    from self.min_overlap_ratio_set, self.total_no_of_results_set and the
    number of entries in self.phenotype_method_id_set.

    2012.8.16 add min_overlap_ratio, no of phenotypes, total_no_of_results into the title.
    """
    title = self.title
    if not title:
        # no user-supplied title: summarize the data that went into the plot
        no_of_phenotypes = len(list(self.phenotype_method_id_set))
        min_overlap_str = utils.getStrOutOfList(list(self.min_overlap_ratio_set))
        no_of_results_str = utils.getStrOutOfList(list(self.total_no_of_results_set))
        title = 'min_overlap %s, #results %s, #phenotypes %s' % (
            min_overlap_str, no_of_results_str, no_of_phenotypes)
        # previously considered alternative (disabled in the original):
        # yh_matplotlib.constructTitleFromTwoDataSummaryStat(self.invariantPData.x_ls, self.invariantPData.y_ls)
    pylab.title(title)
    return title
def discoverAssociationLocus(self, associationPeakGraph=None, min_overlap_ratio=0.1):
    """
    Collapse every connected component of associationPeakGraph into one
    association locus.

    Each node of the graph is expected to carry these attributes:
    'chromosome', 'span' (indexable; [0] is start, [1] is stop),
    'association_peak_ls', 'result_id' and 'phenotype_method_id'.

    Arguments:
        associationPeakGraph: networkx graph of association peaks.
        min_overlap_ratio: not used inside this method; kept so that the
            signature stays compatible with existing callers.

    Returns:
        list of PassingDataList, one per connected component, with attributes
        assigned in this order: chromosome, start (median of node starts),
        stop (median of node stops), no_of_peaks (number of nodes),
        connectivity (edges / possible node pairs, 1 for a single node),
        no_of_results (distinct result ids), association_peak_ls (peaks of all
        nodes concatenated), phenotype_id_ls_in_str (sorted distinct phenotype
        ids formatted by utils.getStrOutOfList).

    Raises:
        SystemExit (code 7, through sys.exit) if one connected component
        contains nodes from more than one chromosome.

    Components are enumerated with nx.connected_components() + Graph.subgraph()
    and node attributes are read through Graph.nodes(data=True), because
    nx.connected_component_subgraphs() and Graph.node were removed in
    networkx 2.4; the replacements are available in 1.x as well.

    2012.12.12 try to output the peaks that are associated with one locus.
    2012.11.20
    2012.6.24
    """
    sys.stderr.write("Discovering association loci from graph of %s nodes. %s edges. %s connected components..." % (
        associationPeakGraph.number_of_nodes(), associationPeakGraph.number_of_edges(),
        nx.number_connected_components(associationPeakGraph)))

    # node -> attribute dict, taken from the original graph so that the very
    # same attribute objects (e.g. the peak lists) are used, not copies
    node2data = dict(associationPeakGraph.nodes(data=True))

    associationLocusList = []
    for component in nx.connected_components(associationPeakGraph):
        cc_graph = associationPeakGraph.subgraph(component)
        # connectivity = observed edges / number of possible node pairs
        ne = cc_graph.number_of_edges()
        nn = cc_graph.number_of_nodes()
        if nn > 1:
            connectivity = ne / float(nn * (nn - 1) / 2)
        else:
            connectivity = 1

        start_ls = []
        stop_ls = []
        association_peak_ls = []
        result_id_set = set()
        chromosome_set = set()  # should be only one chromosome
        phenotype_id_set = set()
        for n in cc_graph:
            nodeObject = node2data[n]
            chromosome_set.add(nodeObject['chromosome'])
            span = nodeObject['span']
            start_ls.append(span[0])
            stop_ls.append(span[1])
            association_peak_ls.extend(nodeObject['association_peak_ls'])
            result_id_set.add(nodeObject['result_id'])
            phenotype_id_set.add(nodeObject['phenotype_method_id'])

        if len(chromosome_set) > 1:
            sys.stderr.write("Error: %s chromosomes (%s) in one connected component.\n" % (
                len(chromosome_set), repr(chromosome_set)))
            sys.exit(7)

        # the locus boundaries are the medians of the member peaks' boundaries
        median_start = numpy.median(start_ls)
        median_stop = numpy.median(stop_ls)

        associationLocus = PassingDataList()
        # assign each value separately to impose the order of variables in
        # associationLocus's internal list
        associationLocus.chromosome = chromosome_set.pop()
        associationLocus.start = median_start
        associationLocus.stop = median_stop
        associationLocus.no_of_peaks = nn
        associationLocus.connectivity = connectivity
        associationLocus.no_of_results = len(result_id_set)
        associationLocus.association_peak_ls = association_peak_ls
        associationLocus.phenotype_id_ls_in_str = utils.getStrOutOfList(sorted(phenotype_id_set))
        # PassingDataList is sortable via (chromosome, start, stop ...)
        associationLocusList.append(associationLocus)

    sys.stderr.write("%s association loci.\n" % (len(associationLocusList)))
    return associationLocusList