def orderListTypeAnalysisMethodID(self, list_type_id_ls, analysis_method_id_ls):
    """
    2008-08-29
        deal with separator (list_type_id=-1) in list_type_id_ls
    """
    sys.stderr.write("Ordering list type id and analysis_method id ... ")
    list_type_id_analysis_method_id_ls = []
    list_type_id_analysis_method_id2index = {}
    list_type_analysis_method_label_ls = []
    no_of_separators = 0
    for list_type_id in list_type_id_ls:
        if list_type_id == -1:  # separator
            no_of_separators += 1
            tup = (-no_of_separators, -1)
            list_type_id_analysis_method_id2index[tup] = len(list_type_id_analysis_method_id_ls)
            list_type_id_analysis_method_id_ls.append(tup)
            list_type_analysis_method_label_ls.append('')
            continue
        list_type_short_name = GeneListType.get(list_type_id).short_name
        for analysis_method_id in analysis_method_id_ls:
            analysis_method_short_name = AnalysisMethod.get(analysis_method_id).short_name
            tup = (list_type_id, analysis_method_id)
            list_type_id_analysis_method_id2index[tup] = len(list_type_id_analysis_method_id_ls)
            list_type_id_analysis_method_id_ls.append(tup)
            list_type_analysis_method_label_ls.append('%s_%s_%s' % (analysis_method_short_name, list_type_short_name, list_type_id))
    return_data = PassingData()
    return_data.list_type_id_analysis_method_id_ls = list_type_id_analysis_method_id_ls
    return_data.list_type_id_analysis_method_id2index = list_type_id_analysis_method_id2index
    return_data.list_type_analysis_method_label_ls = list_type_analysis_method_label_ls
    sys.stderr.write("Done.\n")
    return return_data
def get_symbol2MAJ_MIN(self, symbol2counts):
    # construct a dictionary to map input symbols to MAJ, MIN or '?'
    symbol2MAJ_MIN = {self.input_NA_char: '?'}  # 'NA' is always '?'
    symbols = symbol2counts.keys()
    if len(symbols) == 0:
        major = ''
        minor = ''
    elif len(symbols) == 1:
        symbol2MAJ_MIN[symbols[0]] = MAJ
        major = symbols[0]
        minor = ''
    elif len(symbols) == 2:
        major, minor = symbols
        if symbol2counts[major] < symbol2counts[minor]:
            minor, major = symbols  # reverse them
        symbol2MAJ_MIN[major] = MAJ
        symbol2MAJ_MIN[minor] = MIN
    elif len(symbols) > 2:
        major, minor = None, None
        symbol2MAJ_MIN = 3  # sentinel for >2 alleles; callers check for this value
    passingdata = PassingData()
    passingdata.symbol2MAJ_MIN = symbol2MAJ_MIN
    passingdata.major = major
    passingdata.minor = minor
    return passingdata
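# Illustrative sketch (not from the original source): expected behavior of get_symbol2MAJ_MIN,
# assuming MAJ/MIN are the module-level allele codes and self.input_NA_char is 'NA'.
# The allele with the higher count becomes the major allele.
#
#   symbol2counts = {'A': 12, 'T': 5}
#   passingdata = self.get_symbol2MAJ_MIN(symbol2counts)
#   # passingdata.major == 'A', passingdata.minor == 'T'
#   # passingdata.symbol2MAJ_MIN == {'NA': '?', 'A': MAJ, 'T': MIN}
#   # with more than 2 symbols, passingdata.symbol2MAJ_MIN is the sentinel 3,
#   # which getDataStructureFromSNPsD() treats as an error.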
def getStrainIDInfo(self, db, strain_id_info_query, strain_id_set=None):
    """
    2008-08-29
    """
    sys.stderr.write("Getting strain id info ...")
    rows = db.metadata.bind.execute(strain_id_info_query)
    strain_id_ls = []
    strain_id2index = {}
    strain_label_ls = []
    prev_country_abbr = None
    no_of_separators = 0
    for row in rows:
        if strain_id_set and row.strainid not in strain_id_set:  # skip
            continue
        if prev_country_abbr == None:
            prev_country_abbr = row.abbr
        elif row.abbr != prev_country_abbr:
            prev_country_abbr = row.abbr
            no_of_separators += 1
            strain_id2index[-no_of_separators] = len(strain_id_ls)
            strain_id_ls.append(-no_of_separators)
            strain_label_ls.append('')
        strain_id2index[row.strainid] = len(strain_id_ls)
        strain_id_ls.append(row.strainid)
        if len(row.sitename) > 10:
            sitename = row.sitename[:10]
        else:
            sitename = row.sitename
        strain_label_ls.append('%s_%s_%s_%s' % (row.abbr, sitename, row.nativename, row.strainid))
    strain_id_info = PassingData()
    strain_id_info.strain_id_ls = strain_id_ls
    strain_id_info.strain_id2index = strain_id2index
    strain_id_info.strain_label_ls = strain_label_ls
    sys.stderr.write("Done.\n")
    return strain_id_info
def computing_node_handler(self, communicator, data, param_obj):
    """
    2009-9-16
        parameter test_type is renamed to test_type_id
    2008-08-20
        wrap all parameters into pd and pass it to run_wilcox_test
    2008-07-17
    """
    node_rank = communicator.rank
    sys.stderr.write("Node no.%s working...\n" % node_rank)
    data = cPickle.loads(data)
    result_ls = []
    pd = PassingData(snps_context_wrapper=param_obj.snps_context_wrapper,
                     results_directory=param_obj.results_directory,
                     min_MAF=param_obj.min_MAF, get_closest=self.get_closest, min_distance=self.min_distance,
                     min_sample_size=self.min_sample_size, test_type_id=self.test_type_id,
                     results_type=self.results_type, no_of_permutations=self.no_of_permutations,
                     no_of_min_breaks=self.no_of_min_breaks)
    for results_method_id, list_type_id in data:
        pd.results_id = results_method_id
        pd.list_type_id = list_type_id
        result = self.run_wilcox_test(pd)
        if result is not None:
            result_ls.append(result)
    sys.stderr.write("Node no.%s done with %s results.\n" % (node_rank, len(result_ls)))
    return result_ls
def returnGeneSegments(self, db, elem=None, gene_commentary=None, commentary_type=None):
    """
    2012.5.15
        add argument commentary_type to stop replicating gene_commentary.gene_commentary_type
    2008-07-28
    """
    start_ls, stop_ls, gi_ls = self.return_location_list(elem)
    gene_segments = []
    min_start = start_ls[0]
    max_stop = stop_ls[0]
    if commentary_type:
        gene_commentary_type = db.getGeneCommentaryType(commentary_type=commentary_type)
    else:
        gene_commentary_type = gene_commentary.gene_commentary_type
    for i in range(len(start_ls)):
        start = start_ls[i]
        stop = stop_ls[i]
        min_start_stop = min(start, stop)
        max_start_stop = max(start, stop)
        if min_start_stop < min_start:
            min_start = min_start_stop
        if max_start_stop > max_stop:
            max_stop = max_start_stop
        gi = gi_ls[i]
        gene_segment = GeneSegment(start=start, stop=stop, gi=gi, gene_commentary_type=gene_commentary_type)
        gene_segment.gene_commentary = gene_commentary
        gene_segments.append(gene_segment)
    passingdata = PassingData()
    passingdata.gene_segments = gene_segments
    passingdata.start = min_start
    passingdata.stop = max_stop
    return passingdata
def getPhenotypeInfo(self, db, where_condition):
    """
    2008-08-29
        add -1 as a separator into phenotype_method_id_ls and others
    """
    sys.stderr.write("Getting phenotype method info ...")
    rows = db.metadata.bind.execute("select distinct r.phenotype_method_id, p.biology_category_id from %s p, %s and p.id=r.phenotype_method_id order by p.biology_category_id, r.phenotype_method_id"
                                    % (PhenotypeMethod.table.name, where_condition))
    phenotype_method_id_ls = []
    phenotype_method_id2index = {}
    phenotype_method_label_ls = []
    prev_biology_category_id = None
    no_of_separators = 0
    for row in rows:
        if prev_biology_category_id == None:
            prev_biology_category_id = row.biology_category_id
        elif row.biology_category_id != prev_biology_category_id:
            prev_biology_category_id = row.biology_category_id
            # add a blank phenotype id as separator
            no_of_separators += 1
            phenotype_method_id2index[-no_of_separators] = len(phenotype_method_id_ls)
            phenotype_method_id_ls.append(-no_of_separators)
            phenotype_method_label_ls.append('')
        phenotype_method_id2index[row.phenotype_method_id] = len(phenotype_method_id_ls)
        phenotype_method_id_ls.append(row.phenotype_method_id)
        pm = PhenotypeMethod.get(row.phenotype_method_id)
        phenotype_method_label_ls.append('%s_%s' % (pm.id, pm.short_name))
    phenotype_info = PassingData()
    phenotype_info.phenotype_method_id2index = phenotype_method_id2index
    phenotype_info.phenotype_method_id_ls = phenotype_method_id_ls
    phenotype_info.phenotype_method_label_ls = phenotype_method_label_ls
    sys.stderr.write("Done.\n")
    return phenotype_info
def getCallMethodInfo(cls, affiliated_table_name, extra_condition=None, extra_tables=None):
    """
    2009-1-30
        similar to getPhenotypeInfo, getListTypeInfo, getAnalysisMethodInfo
    """
    table_str = "%s s, %s p" % (affiliated_table_name, model.Stock_250kDB.CallMethod.table.name)
    if extra_tables:
        table_str += ", %s" % extra_tables
    where_condition = "p.id=s.call_method_id"
    if extra_condition:
        where_condition += " and %s" % extra_condition
    rows = model.db.metadata.bind.execute(
        "select distinct p.id, p.short_name from %s where %s order by p.id" % (table_str, where_condition))
    id_ls = []
    id2index = {}
    label_ls = []
    prev_biology_category_id = -1
    no_of_separators = 0
    for row in rows:
        id2index[row.id] = len(id_ls)
        id_ls.append(row.id)
        label_ls.append("%s %s" % (row.id, row.short_name))
    list_info = PassingData()
    list_info.id2index = id2index
    list_info.id_ls = id_ls
    list_info.label_ls = label_ls
    return list_info
def getDataStructureFromSNPsD(self, snpsd):
    """
    05/07/08
    """
    sys.stderr.write("Reading data ...")
    no_of_rows = len(snpsd.positions)
    no_of_cols = len(snpsd.accessions)
    snps = []
    nucs = []
    for i in range(no_of_rows):
        one_snp_ls, symbol2counts = self.get_symbol2counts(snpsd.snps, fixed_index=i, no_of_rolls=no_of_cols, by_row=0)
        passingdata = self.get_symbol2MAJ_MIN(symbol2counts)
        if passingdata.symbol2MAJ_MIN == 3:
            sys.stderr.write("Error: SNP %s (%s) has more than 2 alleles: %s.\n" % (i, snpsd.positions[i], repr(symbol2counts)))
            sys.exit(2)
        map_func = lambda x: passingdata.symbol2MAJ_MIN[x]
        one_snp_ls = map(map_func, one_snp_ls)
        snps.append(''.join(one_snp_ls))
        nucs += [(passingdata.major, passingdata.minor)]
    passingdata = PassingData()
    passingdata.snps = array(snps)
    passingdata.sdps = Set(snps)
    passingdata.nucs = array(nucs)
    passingdata.numSamps = no_of_cols
    sys.stderr.write("Done.\n")
    return passingdata.snps, passingdata.sdps, passingdata.nucs, passingdata.numSamps
def merge_call_on_one_row(cls, ecotypeid_duplicate_index_ls, data_matrix, no_of_cols, NA_set=Set([0, -2])):
    """
    2008-07-11
        calculate the inconsistency ratio among duplicates
    2008-05-12
        -2 is also ruled out, add NA_set
    """
    one_row = numpy.zeros(no_of_cols)
    passingdata = PassingData()
    passingdata.no_of_non_NA_pairs = 0
    passingdata.no_of_non_NA_inconsistent_pairs = 0
    for i in range(no_of_cols):
        call_counter_ls = [0] * 11
        non_NA_call_number_set = Set()
        for index in ecotypeid_duplicate_index_ls:
            call_number = data_matrix[index][i]
            if call_number not in NA_set:  # don't need NA and non-touched bit
                call_counter_ls[call_number] += 1
                non_NA_call_number_set.add(call_number)
        if len(non_NA_call_number_set) > 0:
            passingdata.no_of_non_NA_pairs += 1
            if len(non_NA_call_number_set) > 1:
                passingdata.no_of_non_NA_inconsistent_pairs += 1
        one_row[i] = dbSNP2data.get_majority_call_number(call_counter_ls)
    passingdata.one_row = one_row
    return passingdata
def get_data_matrix(self, db, phenotype_info, list_type_analysis_method_info, where_condition):
    sys.stderr.write("Getting data matrix ...")
    data_matrix = num.zeros([len(list_type_analysis_method_info.list_type_id_analysis_method_id2index),
                             len(phenotype_info.phenotype_method_id2index)], num.float)
    data_matrix[:] = -1
    i = 0
    rows = db.metadata.bind.execute("select r.analysis_method_id, r.phenotype_method_id, c.* from %s order by analysis_method_id"
                                    % (where_condition))
    min_value = None
    max_value = None
    for row in rows:
        tup = (row.list_type_id, row.analysis_method_id)
        row_index = list_type_analysis_method_info.list_type_id_analysis_method_id2index[tup]
        col_index = phenotype_info.phenotype_method_id2index[row.phenotype_method_id]
        if row.pvalue > 0:
            data_value = -math.log10(row.pvalue)
            if min_value == None:
                min_value = data_value
            elif data_value < min_value:
                min_value = data_value
            if max_value == None:
                max_value = data_value
            elif data_value > max_value:
                max_value = data_value
        else:
            data_value = -2  # 0 pvalue
        data_matrix[row_index, col_index] = data_value
    sys.stderr.write("Done.\n")
    return_data = PassingData()
    return_data.data_matrix = data_matrix
    return_data.min_value = min_value
    return_data.max_value = max_value
    return return_data
def preReduce(self, workflow=None, passingData=None, transferOutput=True, **keywords):
    """
    2013.2.10
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    return returnData
def getPhenotypeInfo(cls, affiliated_table_name=None, extra_condition=None, extra_tables=None, with_category_separator=True):
    """
    2009-12-1
        add argument with_category_separator
    2008-10-30
        affiliated_table_name becomes optional
    2008-10-19
        add option extra_tables
    2008-10-16
        sort phenotype by biology_category_id and return other info as well
    """
    if affiliated_table_name:
        table_str = "%s s, %s p" % (affiliated_table_name, model.Stock_250kDB.PhenotypeMethod.table.name)
        where_condition = ["p.id=s.phenotype_method_id"]
    else:
        table_str = "%s p" % (model.Stock_250kDB.PhenotypeMethod.table.name)
        where_condition = []
    if extra_tables:
        table_str += ", %s" % extra_tables
    if extra_condition:
        where_condition.append(extra_condition)
    if where_condition:  # 2009-3-9
        where_condition = "where " + " and ".join(where_condition)
    else:
        where_condition = ""
    rows = model.db.metadata.bind.execute("select distinct p.id, p.biology_category_id, p.short_name from %s %s order by p.biology_category_id, p.id"
                                          % (table_str, where_condition))
    phenotype_method_id_ls = []
    phenotype_method_id2index = {}
    phenotype_method_label_ls = []
    prev_biology_category_id = -1
    no_of_separators = 0
    for row in rows:
        if prev_biology_category_id == -1:
            prev_biology_category_id = row.biology_category_id
        elif with_category_separator and row.biology_category_id != prev_biology_category_id:
            prev_biology_category_id = row.biology_category_id
            # add a blank phenotype id as separator
            no_of_separators += 1
            phenotype_method_id2index[-no_of_separators] = len(phenotype_method_id_ls)
            phenotype_method_id_ls.append(-no_of_separators)
            phenotype_method_label_ls.append("=====")
        phenotype_method_id2index[row.id] = len(phenotype_method_id_ls)
        phenotype_method_id_ls.append(row.id)
        phenotype_method_label_ls.append("%s %s" % (row.id, row.short_name))
    phenotype_info = PassingData()
    phenotype_info.phenotype_method_id2index = phenotype_method_id2index
    phenotype_info.phenotype_method_id_ls = phenotype_method_id_ls
    phenotype_info.phenotype_method_label_ls = phenotype_method_label_ls
    return phenotype_info
def getListTypeInfo(cls, affiliated_table_name=None, extra_condition=None, extra_tables=None):
    """
    2009-3-9
        handle the case in which there is no where_condition at all
    2008-10-30
        affiliated_table_name becomes optional
    2008-10-19
        add option extra_tables
    2008-10-16
        sort gene list type by biology_category_id and return other info as well
        add -1 as a separator into list_type_id_ls
    """
    if affiliated_table_name:
        table_str = "%s s, %s p" % (affiliated_table_name, model.Stock_250kDB.GeneListType.table.name)
        where_condition = ["p.id=s.list_type_id"]
    else:
        table_str = "%s p" % (model.Stock_250kDB.GeneListType.table.name)
        where_condition = []
    if extra_tables:
        table_str += ", %s" % extra_tables
    if extra_condition:
        where_condition.append(extra_condition)
    if where_condition:  # 2009-3-9
        where_condition = "where " + " and ".join(where_condition)
    else:
        where_condition = ""
    rows = model.db.metadata.bind.execute("select distinct p.id, p.biology_category_id, p.short_name from %s %s order by p.biology_category_id, p.id"
                                          % (table_str, where_condition))
    list_type_id_ls = []
    list_type_id2index = {}
    list_type_label_ls = []
    prev_biology_category_id = -1
    no_of_separators = 0
    for row in rows:
        if prev_biology_category_id == -1:
            prev_biology_category_id = row.biology_category_id
        elif row.biology_category_id != prev_biology_category_id:
            prev_biology_category_id = row.biology_category_id
            no_of_separators += 1
            list_type_id2index[-no_of_separators] = len(list_type_id_ls)
            list_type_id_ls.append(-no_of_separators)
            list_type_label_ls.append("====\n====")
        list_type_id2index[row.id] = len(list_type_id_ls)
        list_type_id_ls.append(row.id)
        list_type_label_ls.append("%s %s" % (row.id, row.short_name))
    list_info = PassingData()
    list_info.list_type_id2index = list_type_id2index
    list_info.list_type_id_ls = list_type_id_ls
    list_info.list_type_label_ls = list_type_label_ls
    return list_info
def reduceAfterEachAlignment(self, workflow=None, passingData=None, mapEachChromosomeDataLs=None,
                             reduceAfterEachChromosomeDataLs=None, transferOutput=True, **keywords):
    """
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    returnData.mapEachChromosomeDataLs = mapEachChromosomeDataLs
    returnData.reduceAfterEachChromosomeDataLs = reduceAfterEachChromosomeDataLs
    return returnData
def reduceEachVCF(self, workflow=None, chromosome=None, passingData=None, mapEachIntervalDataLs=None,
                  transferOutput=True, **keywords):
    """
    2013.05.01
        #. concatenate all the sub-VCFs into one
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    returnData.mapEachIntervalDataLs = mapEachIntervalDataLs
    refineGenotypeJobLs = [pdata.refineGenotypeJob for pdata in mapEachIntervalDataLs]
    mergeVCFReplicateColumnsJobLs = [pdata.mergeVCFReplicateColumnsJob for pdata in mapEachIntervalDataLs]
    realInputVolume = passingData.jobData.file.noOfIndividuals * passingData.jobData.file.noOfLoci
    baseInputVolume = 200 * 2000000  # base is 4X coverage in 20Mb region => 120 minutes walltime
    walltime = self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
                                                         baseInputVolume=baseInputVolume, baseJobPropertyValue=60,
                                                         minJobPropertyValue=60, maxJobPropertyValue=500).value
    # base is 4X => 5000M
    job_max_memory = self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
                                                               baseInputVolume=baseInputVolume, baseJobPropertyValue=2000,
                                                               minJobPropertyValue=2000, maxJobPropertyValue=8000).value
    self.concatenateOverlapIntervalsIntoOneVCFSubWorkflow(passingData=passingData,
                                                          intervalJobLs=[pdata.beagleJob for pdata in mapEachIntervalDataLs],
                                                          outputDirJob=self.beagleReduceDirJob,
                                                          transferOutput=True, job_max_memory=job_max_memory, walltime=walltime,
                                                          **keywords)
    self.concatenateOverlapIntervalsIntoOneVCFSubWorkflow(passingData=passingData,
                                                          intervalJobLs=refineGenotypeJobLs, outputDirJob=self.replicateVCFDirJob,
                                                          transferOutput=True, job_max_memory=job_max_memory, walltime=walltime,
                                                          **keywords)
    self.concatenateOverlapIntervalsIntoOneVCFSubWorkflow(passingData=passingData,
                                                          intervalJobLs=mergeVCFReplicateColumnsJobLs, outputDirJob=self.reduceOutputDirJob,
                                                          transferOutput=True, job_max_memory=job_max_memory, walltime=walltime,
                                                          **keywords)
    for pdata in mapEachIntervalDataLs:
        # add this output to the union job
        """
        self.addInputToStatMergeJob(statMergeJob=self.reduceBeaglePhaseReplicateConcordanceJob_AllSites,
                                    parentJobLs=[pdata.beaglePhasedReplicateConcordanceJob])
        self.addInputToStatMergeJob(statMergeJob=self.reduceBeaglePhaseReplicateConcordanceJob_HomoOnly,
                                    parentJobLs=[pdata.beaglePhasedReplicateConcordanceJob])
        """
        self.addInputToStatMergeJob(statMergeJob=self.reduceTrioCallerReplicateConcordanceJob_AllSites,
                                    parentJobLs=[pdata.trioCallerReplicateConcordanceJob])
        self.addInputToStatMergeJob(statMergeJob=self.reduceTrioCallerReplicateConcordanceJob_HomoOnly,
                                    parentJobLs=[pdata.trioCallerReplicateConcordanceJob])
    return returnData
def mapEachInterval(self, workflow=None, alignmentData=None, intervalData=None,
                    VCFJobData=None, passingData=None, transferOutput=True, **keywords):
    """
    2012.9.17
    """
    if workflow is None:
        workflow = self
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    topOutputDirJob = passingData.topOutputDirJob
    alignment = alignmentData.alignment
    parentJobLs = alignmentData.jobLs
    bamF = alignmentData.bamF
    baiF = alignmentData.baiF
    bamFnamePrefix = passingData.bamFnamePrefix
    if intervalData.file:
        mpileupInterval = intervalData.interval
        bcftoolsInterval = intervalData.file
    else:
        mpileupInterval = intervalData.interval
        bcftoolsInterval = intervalData.interval
    intervalFileBasenameSignature = intervalData.intervalFileBasenameSignature
    overlapInterval = intervalData.overlapInterval
    overlapFileBasenameSignature = intervalData.overlapIntervalFnameSignature
    VCFFile = VCFJobData.file
    annotationName = passingData.annotationName
    outputFile = File(os.path.join(topOutputDirJob.output, '%s_%s.%s.vcf' % (bamFnamePrefix, overlapFileBasenameSignature, annotationName)))
    variantAnnotatorJob = self.addGATKVariantAnnotatorJob(workflow, executable=workflow.annotateVariantJava,
                                                          GenomeAnalysisTKJar=workflow.GenomeAnalysisTKJar, bamFile=bamF,
                                                          VCFFile=VCFFile, annotationName=annotationName, interval=bcftoolsInterval,
                                                          outputFile=outputFile,
                                                          refFastaFList=passingData.refFastaFList,
                                                          parentJobLs=[topOutputDirJob] + parentJobLs,
                                                          extraDependentInputLs=[baiF, VCFFile.tbi_F],
                                                          transferOutput=False,
                                                          extraArguments=None, job_max_memory=4000)
    outputFile = File(os.path.join(topOutputDirJob.output, '%s_%s.%s.tsv' % (bamFnamePrefix, overlapFileBasenameSignature, annotationName)))
    extractInfoJob = self.addGenericJob(workflow=workflow, executable=workflow.ExtractInfoFromVCF, inputFile=variantAnnotatorJob.output,
                                        inputArgumentOption="-i",
                                        outputFile=outputFile, outputArgumentOption="-o",
                                        parentJobLs=[variantAnnotatorJob], extraDependentInputLs=None, extraOutputLs=None,
                                        transferOutput=False,
                                        extraArguments="-k %s" % (annotationName), extraArgumentList=None, job_max_memory=2000,
                                        sshDBTunnel=None, key2ObjectForJob=None)
    returnData.jobDataLs.append(PassingData(jobLs=[variantAnnotatorJob, extractInfoJob], file=variantAnnotatorJob.output,
                                            fileLs=[variantAnnotatorJob.output, extractInfoJob.output]))
    returnData.variantAnnotatorJob = variantAnnotatorJob
    returnData.extractInfoJob = extractInfoJob
    # add the sub-alignment to the alignment merge job
    self.no_of_jobs += 2
    return returnData
def getStrainInfoGivenPlateInfo(self, db, plate_info, strain_id_info_query, strain_id_set=None):
    """
    2008-09-13
        order/group the strains according to plate_set, country, strain longitude
        fetch appropriate labels for each strain
    """
    sys.stderr.write("Getting strain_info given plate_info ...")
    # fetch appropriate label, and put strain id in country_longitude order within each plate
    plate_set2strain_id_ls_in_GPS_order = {}
    strain_id2label = {}
    rows = db.metadata.bind.execute(strain_id_info_query)
    for row in rows:
        if strain_id_set and row.strainid not in strain_id_set:  # skip
            continue
        plate_set = plate_info.strain_id2plate_set[row.strainid]
        if plate_set not in plate_set2strain_id_ls_in_GPS_order:
            plate_set2strain_id_ls_in_GPS_order[plate_set] = []
        plate_set2strain_id_ls_in_GPS_order[plate_set].append(row.strainid)
        if len(row.sitename) > 10:  # cut short on the site name
            sitename = row.sitename[:10]
        else:
            sitename = row.sitename
        strain_label = '%s_%s_%s_%s_%s' % (row.abbr, sitename, row.nativename, row.strainid, repr(plate_set)[1:-1])
        strain_id2label[row.strainid] = strain_label
    # put in plate_set order, assign row index
    plate_set_ls = plate_set2strain_id_ls_in_GPS_order.keys()
    plate_set_ls.sort()
    no_of_plates = len(plate_set_ls)
    strain_id_ls = []
    strain_id2index = {}
    strain_label_ls = []
    for i in range(no_of_plates):
        plate_set = plate_set_ls[i]
        plate_strain_id_ls = plate_set2strain_id_ls_in_GPS_order[plate_set]
        if i != 0:  # insert separator, but not before the first plate_set
            strain_id2index[-i] = len(strain_id2index)
            strain_id_ls.append(-i)
            strain_label_ls.append('')
        for strain_id in plate_strain_id_ls:
            strain_id2index[strain_id] = len(strain_id2index)
            strain_id_ls.append(strain_id)
            strain_label_ls.append(strain_id2label[strain_id])
    strain_id_info = PassingData()
    strain_id_info.strain_id_ls = strain_id_ls
    strain_id_info.strain_id2index = strain_id2index
    strain_id_info.strain_label_ls = strain_label_ls
    sys.stderr.write("Done.\n")
    return strain_id_info
def linkMapToReduce(self, workflow=None, mapEachIntervalData=None, preReduceReturnData=None, passingData=None,
                    transferOutput=True, **keywords):
    """
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    for jobData in mapEachIntervalData.jobDataLs:
        calculaJob = jobData.jobLs[0]
        self.addInputToStatMergeJob(workflow, statMergeJob=preReduceReturnData.aggregateAndHClusterDistanceMatrixJob,
                                    inputF=calculaJob.output,
                                    parentJobLs=[calculaJob])
    return returnData
def addOneEntry(self, row):
    from pymodule import PassingData
    data_obj = PassingData()
    for table_field in self.table_field_ls:
        setattr(data_obj, table_field, getattr(row, table_field, None))
    data_obj.chr = row.snp.chromosome
    data_obj.pos = row.snp.position
    chr_pos_key = (data_obj.chr, data_obj.pos)
    if chr_pos_key not in self.chr_pos2index_ls:
        self.chr_pos2index_ls[chr_pos_key] = []
    self.chr_pos2index_ls[chr_pos_key].append(len(self.data_ls))
    self.data_ls.append(data_obj)
def addAllJobs(self, workflow=None, db_250k=None, association_result_ls=None,
               data_dir=None, min_MAF=None,
               neighbor_distance=None, max_neighbor_distance=None,
               min_score_ls=None, min_overlap_ratio_ls=None, ground_score=None,
               peakPadding=None, tax_id=None,
               outputDirPrefix="", transferOutput=True, job_max_memory=2000, **keywords):
    """
    2013.2.27
        run ms
        estimate parameters from ms
        forward simulator with estimated ms-parameters or take the output of ms as input
    """
    if workflow is None:
        workflow = self
    sys.stderr.write("Adding jobs for pop-gen & pedigree sequence simulation #jobs=%s... \n" %
                     (self.no_of_jobs))
    returnData = PassingData()
    returnData.jobDataLs = []
    passingData = PassingData(fileBasenamePrefix=None,
                              outputDirPrefix=outputDirPrefix,
                              jobData=None,
                              preReduceReturnData=None,
                              association_group_key2orderIndex={},
                              association_group_key2resultList={},
                              association_group_key2reduceAssociationPeakJobMatrix={},
                              association_group_key2countAssociationLocusJobList={},
                              resultID2defineLandscapeJobData={},
                              )
    preReduceReturnData = self.preReduce(workflow=workflow, outputDirPrefix=outputDirPrefix,
                                         passingData=passingData, transferOutput=False,
                                         **keywords)
    mapDirJob = preReduceReturnData.mapDirJob
    plotOutputDirJob = preReduceReturnData.plotOutputDirJob
    countAssociationLocusOutputDirJob = preReduceReturnData.countAssociationLocusOutputDirJob
    reduceOutputDirJob = preReduceReturnData.reduceOutputDirJob
    passingData.preReduceReturnData = preReduceReturnData
    # add output pedigree job
    for i in xrange(self.noOfReplicates):
        popGenSimulationFolderJob = self.addMkDirJob(outputDir=os.path.join(mapDirJob.output, 'popGenSim%s' % (i)),
                                                     parentJobLs=[mapDirJob])
        popSimulationJob = self.addPopGenSimulationJob()
def generate_parameters(self, parameter_names, parameter_depth=2):
    """
    2008-05-19
        min_call_probability = self.min_call_probability
    2008-05-11
        put NA rate into passing parameters as well. too much memory consumption on each computing node
    """
    sys.stderr.write("Generating parameter settings ...")
    param_d = PassingData()
    for parameter_name in parameter_names:
        parameter_value = getattr(self, parameter_name)
        parameter_value = parameter_value.split(",")
        parameter_value = map(float, parameter_value)
        setattr(self, parameter_name, parameter_value)
    """
    #2008-05-19 commented out. use self.min_call_probability
    #figure out call probability from input_fname
    import re
    call_prob_pattern = re.compile(r'_(\d+)\.csv')
    call_prob_p_result = call_prob_pattern.search(self.input_fname)
    if call_prob_p_result:
        min_call_probability = float(call_prob_p_result.groups()[0])
    else:
        min_call_probability = -1
    """
    min_call_probability = self.min_call_probability
    # only 1st 4, last 2 passed to computing node
    parameters = []
    for max_call_mismatch_rate in getattr(self, parameter_names[0]):
        for max_call_NA_rate in getattr(self, parameter_names[1]):
            for max_snp_mismatch_rate in getattr(self, parameter_names[2]):
                for max_snp_NA_rate in getattr(self, parameter_names[3]):
                    for npute_window_size in getattr(self, parameter_names[4]):
                        parameters.append([
                            min_call_probability,
                            max_call_mismatch_rate,
                            max_call_NA_rate,
                            max_snp_mismatch_rate,
                            max_snp_NA_rate,
                            npute_window_size,
                        ])
    param_d.parameters = parameters
    param_d.max_snp_NA_rate_ls = self.max_snp_NA_rate_ls
    param_d.npute_window_size_ls = self.npute_window_size_ls
    sys.stderr.write(" %s parameter settings to process. Done.\n" % len(parameters))
    return param_d
def addJobs(self, workflow=None, inputData=None, db_vervet=None, genotypeMethodShortName=None, commit=None,
            data_dir=None, checkEmptyVCFByReading=False, transferOutput=True,
            maxContigID=None, outputDirPrefix="", needSSHDBTunnel=False):
    """
    2012.5.9
    """
    sys.stderr.write("Adding VCF2DB jobs for %s vcf files ... " % (len(inputData.jobDataLs)))
    topOutputDir = "%sVCF2DB" % (outputDirPrefix)
    topOutputDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=topOutputDir)
    firstVCFFile = inputData.jobDataLs[0].vcfFile
    logFile = File(os.path.join(topOutputDir, 'AddGenotypeMethod2DB.log'))
    addGM2DBJob = self.addAddGenotypeMethod2DBJob(executable=self.AddGenotypeMethod2DB, inputFile=firstVCFFile,
                                                  genotypeMethodShortName=genotypeMethodShortName,
                                                  logFile=logFile, data_dir=data_dir, commit=commit, parentJobLs=[],
                                                  extraDependentInputLs=[], transferOutput=True,
                                                  extraArguments=None, job_max_memory=10, sshDBTunnel=needSSHDBTunnel)
    updateGMlogFile = File(os.path.join(topOutputDir, 'updateGM.log'))
    updateGMNoOfLociJob = self.addUpdateGenotypeMethodNoOfLociJob(executable=self.UpdateGenotypeMethodNoOfLoci,
                                                                  genotypeMethodShortName=genotypeMethodShortName,
                                                                  logFile=updateGMlogFile, data_dir=data_dir, commit=commit,
                                                                  parentJobLs=[topOutputDirJob],
                                                                  extraDependentInputLs=[], transferOutput=True,
                                                                  extraArguments=None, job_max_memory=20, sshDBTunnel=needSSHDBTunnel)
    returnData = PassingData()
    returnData.jobDataLs = []
    for jobData in inputData.jobDataLs:
        inputF = jobData.vcfFile
        if maxContigID:
            contig_id = self.getContigIDFromFname(inputF.name)
            try:
                contig_id = int(contig_id)
                if contig_id > maxContigID:  # skip the small contigs
                    continue
            except:
                sys.stderr.write('Except type: %s\n' % repr(sys.exc_info()))
                import traceback
                traceback.print_exc()
        logFile = File(os.path.join(topOutputDir, 'AddVCFFile2DB_%s.log' % (self.getChrFromFname(inputF.name))))
        addVCFJob = self.addAddVCFFile2DBJob(executable=self.AddVCFFile2DB, inputFile=inputF,
                                             genotypeMethodShortName=genotypeMethodShortName,
                                             logFile=logFile, format="VCF", data_dir=data_dir,
                                             checkEmptyVCFByReading=checkEmptyVCFByReading, commit=commit,
                                             parentJobLs=[addGM2DBJob] + jobData.jobLs, extraDependentInputLs=[], transferOutput=True,
                                             extraArguments=None, job_max_memory=1000, sshDBTunnel=needSSHDBTunnel)
        workflow.depends(parent=addVCFJob, child=updateGMNoOfLociJob)
    sys.stderr.write("%s jobs.\n" % (self.no_of_jobs))
    # include the tfam (outputList[1]) into the fileLs
    returnData.jobDataLs.append(PassingData(jobLs=[updateGMNoOfLociJob], file=updateGMlogFile,
                                            fileLs=[updateGMlogFile]))
    return returnData
def get_qccall_results(self, input_dir):
    import os, sys, csv
    from variation.src.MpiQCCall import MpiQCCall
    from pymodule import PassingData
    """
    var_name_ls = ['strain or snp', 'after_imputation'] + MpiQCCall.common_var_name_ls
    avg_var_name_pair_ls, partial_header_avg = MpiQCCall.generate_avg_variable_names(MpiQCCall.avg_var_name_ls)
    var_name_ls += partial_header_avg
    """
    files = os.listdir(input_dir)
    passingdata_ls = []
    no_of_objects = len(files)
    var_name_ls = []
    for i in range(no_of_objects):
        sys.stderr.write("\t%d/%d: from %s ... \n" % (i + 1, no_of_objects, files[i]))
        filename = os.path.join(input_dir, files[i])
        reader = csv.reader(open(filename))
        try:
            row = reader.next()
            if len(var_name_ls) == 0:
                var_name_ls = row
        except:
            if self.debug:
                import traceback
                traceback.print_exc()
                sys.stderr.write('%s\n' % sys.exc_info())
            sys.stderr.write('\terror in reading this file. ignored.\n')
            del reader
            continue
        for row in reader:
            passingdata = PassingData()
            for j in range(len(var_name_ls)):  # use j to avoid shadowing the outer file-loop index i
                var_name = var_name_ls[j]
                if var_name != 'strain or snp':
                    value = float(row[j])
                else:  # the first column is strain or snp, no float conversion
                    value = row[j]
                setattr(passingdata, var_name, value)
            # two new variables record no of accessions/snps lost
            passingdata.no_of_total_accessions_filtered = passingdata.no_of_accessions_filtered_by_mismatch + \
                passingdata.no_of_accessions_filtered_by_na
            passingdata.no_of_total_snps_filtered = passingdata.no_of_snps_filtered_by_mismatch + \
                passingdata.no_of_snps_filtered_by_na
            passingdata.no_of_total_snps_removed = passingdata.no_of_total_snps_filtered + \
                passingdata.no_of_monomorphic_snps_removed
            passingdata_ls.append(passingdata)
        del reader
    return passingdata_ls, var_name_ls
def reduceBeforeEachAlignment(self, workflow=None, passingData=None, preReduceReturnData=None, transferOutput=True, **keywords):
    """
    2012.9.17
        add a merge variant annotation job, GW plot job
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    outputDirPrefix = passingData.outputDirPrefix
    statOutputDirJob = preReduceReturnData.statOutputDirJob
    plotOutputDirJob = preReduceReturnData.plotOutputDirJob
    mergeOutputF = File(os.path.join(statOutputDirJob.output, '%s_%s.tsv' % (passingData.bamFnamePrefix, passingData.annotationName)))
    mergeJob = self.addStatMergeJob(workflow, statMergeProgram=workflow.mergeSameHeaderTablesIntoOne,
                                    outputF=mergeOutputF, transferOutput=transferOutput, parentJobLs=[statOutputDirJob],)
    returnData.jobDataLs.append(PassingData(jobLs=[mergeJob], file=mergeJob.output, fileLs=[mergeJob.output], mergeJob=mergeJob))
    self.no_of_jobs += 1
    outputFnamePrefix = os.path.join(plotOutputDirJob.output, '%s_%s_Plot' % (passingData.bamFnamePrefix, passingData.annotationName))
    # whichColumnPlotLabel and xColumnPlotLabel should not contain spaces or parentheses because they would disrupt the shell commandline
    self.addPlotVCFtoolsStatJob(executable=workflow.PlotVCFtoolsStat, inputFileList=[mergeOutputF],
                                outputFnamePrefix=outputFnamePrefix,
                                whichColumn=None, whichColumnHeader=passingData.annotationName, whichColumnPlotLabel=passingData.annotationName,
                                need_svg=False,
                                logY=0, valueForNonPositiveYValue=-1,
                                xColumnPlotLabel="position", chrLengthColumnHeader=None, chrColumnHeader="CHROM",
                                minChrLength=None, xColumnHeader="POS", minNoOfTotal=50,
                                figureDPI=100, ylim_type=2, samplingRate=0.01,
                                parentJobLs=[mergeJob, plotOutputDirJob],
                                extraDependentInputLs=None,
                                extraArguments=None, transferOutput=True, sshDBTunnel=self.needSSHDBTunnel)
    self.no_of_jobs += 1
    outputFile = File(os.path.join(plotOutputDirJob.output, '%s_%s_Hist.png' % (passingData.bamFnamePrefix, passingData.annotationName)))
    # no spaces, parentheses, or other shell-vulnerable characters in the x or y axis labels (whichColumnPlotLabel, xColumnPlotLabel)
    self.addDrawHistogramJob(workflow=workflow, executable=workflow.DrawHistogram, inputFileList=[mergeJob.output],
                             outputFile=outputFile,
                             whichColumn=None, whichColumnHeader=passingData.annotationName, whichColumnPlotLabel=passingData.annotationName,
                             logY=None, logCount=True, valueForNonPositiveYValue=-1,
                             minNoOfTotal=10,
                             figureDPI=100, samplingRate=0.1,
                             parentJobLs=[plotOutputDirJob, mergeJob],
                             extraDependentInputLs=None,
                             extraArguments=None, transferOutput=True, job_max_memory=2000)
    self.no_of_jobs += 1
    return returnData
def getResultsMethodIDInfo(self, db, call_method_id_ls, min_distance, get_closest, min_MAF):
    """
    2008-09-05
        use results_by_gene.id as main result id
    """
    sys.stderr.write("Getting ResultsMethodIDInfo ...")
    results_method_id_info = PassingData()
    results_method_id_ls = []
    results_method_id2index = {}
    results_method_id_label_ls = []
    rows = db.metadata.bind.execute("select distinct rg.id, rg.results_method_id, r.analysis_method_id, r.phenotype_method_id, \
        p.biology_category_id from %s rg, %s r, %s p \
        where rg.results_method_id=r.id and p.id=r.phenotype_method_id and r.call_method_id in (%s) \
        and rg.min_distance=%s and rg.get_closest=%s and rg.min_MAF>=%s-0.0001 and rg.min_MAF<=%s+0.0001 \
        order by p.biology_category_id, r.phenotype_method_id, r.analysis_method_id" % (ResultsByGene.table.name,
        ResultsMethod.table.name, PhenotypeMethod.table.name, repr(call_method_id_ls)[1:-1], min_distance, get_closest, min_MAF, min_MAF))
    prev_phenotype_method_id = None
    prev_biology_category_id = None
    no_of_separators = 0
    for row in rows:
        if prev_biology_category_id == None:
            prev_biology_category_id = row.biology_category_id
        elif row.biology_category_id != prev_biology_category_id:
            prev_biology_category_id = row.biology_category_id
            no_of_separators += 1
            results_method_id2index[-no_of_separators] = len(results_method_id_ls)
            results_method_id_ls.append(-no_of_separators)
            results_method_id_label_ls.append('')
        if prev_phenotype_method_id == None:
            prev_phenotype_method_id = row.phenotype_method_id
        elif row.phenotype_method_id != prev_phenotype_method_id:
            prev_phenotype_method_id = row.phenotype_method_id
            # add a blank phenotype id as separator
            no_of_separators += 1
            results_method_id2index[-no_of_separators] = len(results_method_id_ls)
            results_method_id_ls.append(-no_of_separators)
            results_method_id_label_ls.append('')
        results_method_id2index[row.id] = len(results_method_id_ls)
        results_method_id_ls.append(row.id)
        am = AnalysisMethod.get(row.analysis_method_id)
        pm = PhenotypeMethod.get(row.phenotype_method_id)
        results_method_id_label_ls.append('%s_%s_%s' % (am.short_name, pm.short_name, pm.id))
    results_method_id_info.results_method_id_ls = results_method_id_ls
    results_method_id_info.results_method_id2index = results_method_id2index
    results_method_id_info.results_method_id_label_ls = results_method_id_label_ls
    sys.stderr.write("%s results. Done.\n" % (len(results_method_id_ls)))
    return results_method_id_info
def run(self):
    """
    2008-10-28
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    db = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.db_user, password=self.db_passwd,
                                   hostname=self.hostname, database=self.dbname, schema=self.schema)
    db.setup(create_tables=False)
    session = db.session
    hist_type = CheckCandidateGeneRank.getHistType(self.call_method_id, self.min_distance, self.get_closest, self.min_MAF,
                                                   self.allow_two_sample_overlapping, self.results_type, self.null_distribution_type_id)
    snps_context_wrapper = self.dealWithSnpsContextWrapper(self.snps_context_picklef, self.min_distance, self.get_closest)
    param_obj = PassingData(call_method_id=self.call_method_id,
                            analysis_method_id=getattr(self, 'analysis_method_id', None),
                            analysis_method_id_ls=getattr(self, 'analysis_method_id_ls', None),
                            phenotype_method_id_ls=getattr(self, 'phenotype_method_id_ls', None),
                            list_type_id_ls=self.list_type_id_ls,
                            results_type=self.results_type,
                            no_check_gene_list=True)
    params_ls = self.generate_params(param_obj)
    pd = PassingData(snps_context_wrapper=snps_context_wrapper,
                     results_directory=self.results_directory,
                     min_MAF=self.min_MAF, get_closest=self.get_closest, min_distance=self.min_distance,
                     no_of_top_snps=self.no_of_top_snps, min_sample_size=self.min_sample_size, test_type_id=self.test_type_id,
                     results_type=self.results_type, no_of_permutations=self.no_of_permutations,
                     no_of_min_breaks=self.no_of_min_breaks, type=hist_type,
                     null_distribution_type_id=self.null_distribution_type_id,
                     allow_two_sample_overlapping=self.allow_two_sample_overlapping, min_score=self.min_score, session=session,
                     commit=self.commit)
    for results_id, list_type_id in params_ls:
        pd.list_type_id = list_type_id
        pd.results_id = results_id
        self.pick_candidate_genes(pd)
def countAlleleComboFrequency(cls, snp1_id, gene1_id, genotype_ls1, snp2_id, gene2_id, genotype_ls2, phenotype_index, phenotype_ls,
                              min_data_point=3):
    """
    2009-2-18
        to test how many distinct allele-combos each SNP pair has
    """
    return_ls = []
    no_of_rows = len(genotype_ls1)
    allele_combo2freq = {}
    for i in range(no_of_rows):
        allele1 = genotype_ls1[i]
        allele2 = genotype_ls2[i]
        phenotype_value = phenotype_ls[i]
        if numpy.isnan(allele1) or numpy.isnan(allele2) or numpy.isnan(phenotype_value):
            continue
        allele_combo = (allele1, allele2)
        if allele_combo not in allele_combo2freq:
            allele_combo2freq[allele_combo] = 0
        allele_combo2freq[allele_combo] += 1
    pdata = PassingData(snp1_id=snp1_id, gene1_id=gene1_id, snp2_id=snp2_id, gene2_id=gene2_id, pvalue=None,
                        count1=len(allele_combo2freq), count2=None, phenotype_index=phenotype_index, coeff_list=allele_combo2freq)
    return_ls.append(pdata)
    return return_ls
def computing_node_handler(self, communicator, data, computing_parameter_obj):
    """
    2008-09-10
        add source_id to PassingData
    2008-08-28
    """
    node_rank = communicator.rank
    sys.stderr.write("Node no.%s working...\n" % node_rank)
    data = cPickle.loads(data)
    result_ls = []
    twoSNPData = computing_parameter_obj.twoSNPData
    QC_method_id = computing_parameter_obj.QC_method_id
    for row_id1, row_id2 in data:
        NA_rate, mismatch_rate, no_of_NAs, no_of_totals, no_of_mismatches, no_of_non_NA_pairs = twoSNPData.cmpOneRow(row_id1, row_id2)
        # the 2nd position in the row-id1 tuple is strain id
        if QC_method_id == 4:  # the 2nd position in the row-id2 tuple is strain id
            target_id = row_id2[1]
        else:
            target_id = row_id2
        qc_cross_match = PassingData(source_id=row_id1[0], strainid=row_id1[1], target_id=target_id, mismatch_rate=mismatch_rate,
                                     no_of_mismatches=no_of_mismatches, no_of_non_NA_pairs=no_of_non_NA_pairs)
        result_ls.append(qc_cross_match)
    sys.stderr.write("Node no.%s done with %s results.\n" % (node_rank, len(result_ls)))
    return result_ls
def readThroughAndProvideSummary(self):
    """
    2013.08.30
        called by vervet/src/db/input/AddAlignmentDepthIntervalFile2DB.py
    """
    col_name2index = self.smartReadHeader()
    if col_name2index is None:
        pdata = self.parseRow(self._row)
        self._postProcessParsedRowDataForSummary(pdata)
    for row in self:
        pdata = self.parseRow(row)
        self._postProcessParsedRowDataForSummary(pdata)
    self.min_interval_length = numpy.min(self.interval_length_ls)
    self.max_interval_length = numpy.max(self.interval_length_ls)
    self.median_interval_length = numpy.median(self.interval_length_ls)
    self.mean_interval_value = numpy.mean(self.interval_value_ls)
    self.median_interval_value = numpy.median(self.interval_value_ls)
    return PassingData(no_of_intervals=self.no_of_intervals, chromosome_size=self.chromosome_size,
                       mean_interval_value=self.mean_interval_value,
                       median_interval_value=self.median_interval_value,
                       min_interval_value=self.min_interval_value,
                       max_interval_value=self.max_interval_value,
                       min_interval_length=self.min_interval_length,
                       max_interval_length=self.max_interval_length,
                       median_interval_length=self.median_interval_length)
def general_output_node(self, output_dir, phenotype_index_ls, phenotype_label_ls, free_computing_nodes):
    """
    2009-2-8
        general strategy for the output node while the computing nodes are working
        refactored out of run()
    """
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    writer_dict = {}
    header_row = ['snp1_id', 'gene1_id', 'snp2_id', 'gene2_id', 'bool_type', 'pvalue',
                  'count1', 'count2', 'var_perc', 'coeff_list', 'coeff_p_value_list']
    for phenotype_index in phenotype_index_ls:
        phenotype_label = phenotype_label_ls[phenotype_index]
        phenotype_label = phenotype_label.replace('/', '_')  # '/' is taken as folder separator
        output_fname = os.path.join(output_dir, 'SNPpair_%s.tsv' % phenotype_label)
        writer = csv.writer(open(output_fname, 'w'), lineterminator='\n', delimiter='\t')
        writer.writerow(header_row)
        writer_dict[phenotype_index] = writer
    param_obj = PassingData(writer_dict=writer_dict, header_row=header_row)
    self.output_node(free_computing_nodes, param_obj, self.output_node_handler)
    del writer_dict
def get_strain_id_info(self, QC_method_id, ignore_strains_with_qc=True):
    """
    2008-08-18
        to generate data structure related to strain_id, preparation to get data_matrix
        strainid not QCed yet, link to tg_ecotypeid
    """
    sys.stderr.write("Getting strain_id info ... ")
    strain_id2index = {}
    strain_id_list = []
    strain_id2acc = {}
    strain_id2category = {}
    rows = StockDB.Strain.query.all()
    for row in rows:
        if ignore_strains_with_qc:
            ignore_this = 0
            for call_qc in row.call_qc_ls:
                if call_qc.qc_method_id == QC_method_id:  # QC already done
                    ignore_this = 1
                    break
            if ignore_this:
                continue
        strain_id = row.id
        strain_index = len(strain_id_list)
        strain_id_list.append(strain_id)
        strain_id2index[strain_id] = strain_index
        strain_id2acc[strain_id] = row.ecotypeid_strainid2tg_ecotypeid.tg_ecotypeid
        strain_id2category[strain_id] = strain_id
    passingdata = PassingData(strain_id2index=strain_id2index, strain_id_list=strain_id_list, strain_id2acc=strain_id2acc,
                              strain_id2category=strain_id2category)
    sys.stderr.write("%s strains. Done.\n" % (len(strain_id_list)))
    return passingdata
def loadDataStructure(self, gene_annotation_picklef, min_MAF=0, min_distance=20000,
                      list_type_id=None, snp_matrix_fname=None, snp_matrix_data_type=1):
    """
    2009-5-30
        add argument snp_matrix_fname
    2008-11-25
    2008-10-01
        wrap a few functions up, convenient for both run() and drawSNPRegion()
    """
    db = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.db_user, password=self.db_passwd,
                                   hostname=self.hostname, database=self.dbname, schema=self.schema)
    db.setup(create_tables=False)
    self.db = db
    snp_info = self.getSNPInfo(db)
    gene_annotation = self.dealWithGeneAnnotation(gene_annotation_picklef)
    if list_type_id:
        candidate_gene_list = self.getGeneList(list_type_id)
        candidate_gene_set = Set(candidate_gene_list)
    else:
        candidate_gene_set = Set()
    if snp_matrix_fname:
        if snp_matrix_data_type == 3:
            matrix_data_type = float  # 2009-3-23 for CNV amplitude file
        else:
            matrix_data_type = int
        snpData = SNPData(input_fname=snp_matrix_fname, turn_into_integer=1, turn_into_array=1, ignore_2nd_column=1,
                          matrix_data_type=matrix_data_type)
        # 2008-12-05 fake a snp_info for findSNPsInRegion
        self.construct_chr_pos2index_forSNPData(snpData)
    else:
        snpData = None
    return_data = PassingData(gene_annotation=gene_annotation, snp_info=snp_info,
                              candidate_gene_set=candidate_gene_set, snpData=snpData)
    return return_data
def parseFastaDescriptionForGenBank(self, descriptionLine=None, FigureOutTaxID_ins=None):
    """
    2011-7-6
        possible header lines:

        >gi|51511461|ref|NC_000001.8|NC_000001 Homo sapiens chromosome 1, complete sequence
        >gi|186497660|ref|NC_003070.6| Arabidopsis thaliana chromosome 1, complete sequence
        >gi|26556996|ref|NC_001284.2| Arabidopsis thaliana mitochondrion, complete genome
        >gi|115442598|ref|NC_008394.1| Oryza sativa (japonica cultivar-group) genomic DNA, chromosome 1
    """
    header = descriptionLine[1:-1]  # discard '>' and '\n'
    header = header.split('|')
    _tax_id = FigureOutTaxID_ins.returnTaxIDGivenSentence(header[4])
    if self.p_chromosome.search(header[4]) is not None:
        chromosome = self.p_chromosome.search(header[4]).groups()[0]
    elif header[4].find('mitochondrion') != -1:
        chromosome = 'mitochondrion'
    elif header[4].find('chloroplast') != -1:
        chromosome = 'chloroplast'
    else:  # something else, take the whole string before ','
        chromosome = header[4].split(',')[0]
    gi = int(header[1])
    acc_ver = header[3]
    comment = header[4]
    return PassingData(tax_id=_tax_id, gi=gi, comment=comment, acc_ver=acc_ver, chromosome=chromosome)
def findSNPsInRegion(self, snp_info, chromosome, start, stop, center_snp_position=None):
    """
    2008-10-1
        called by plotSNPRegion()
        find SNPs in this region; if center_snp_position is not given, find one.
        similar to getSNPsAroundThisSNP()
    """
    if self.report:
        sys.stderr.write("Get SNPs in this region ...")
    from DrawSNPRegion import SNPPassingData
    chr_pos_ls = []
    chr_pos2adjacent_window = {}
    j = 0
    midpoint = (start + stop) / 2.
    if center_snp_position is None:
        _center_snp_position = start
    else:
        _center_snp_position = center_snp_position
    center_snp = SNPPassingData(chromosome=chromosome, position=_center_snp_position, snps_id=None)
    for i in range(start - 1, stop + 2):
        new_pos = i
        new_chr_pos = (chromosome, new_pos)
        if new_chr_pos in snp_info.chr_pos2index:
            if center_snp_position is None and abs(new_pos - midpoint) < abs(center_snp.position - midpoint):
                # this SNP is closer to the center
                center_snp.position = new_pos
            chr_pos_ls.append(new_chr_pos)
            if j != 0:
                # add_mid_point(chr_pos_ls, chr_pos2adjacent_window)
                pass
            j += 1
    center_snp.snps_id = '%s_%s' % (center_snp.chromosome, center_snp.position)
    snp_region = PassingData(chr_pos_ls=chr_pos_ls, chr_pos2adjacent_window=chr_pos2adjacent_window, center_snp=center_snp)
    if self.report:
        sys.stderr.write("Done.\n")
    return snp_region
def returnLocusLowMapQualityIndicator(self, alignedReadLs=None, minMapQGoodRead=2, minFractionOfGoodRead=0.9):
    """
    2013.12.04
    """
    totalNoOfReads = 0
    noOfGoodReads = 0.0
    medianMapQ = -10
    mapQList = []
    for alignedRead in alignedReadLs:
        totalNoOfReads += 1
        mapQList.append(alignedRead.mapq)
        if alignedRead.mapq >= minMapQGoodRead:
            noOfGoodReads += 1
        else:
            pass
    if totalNoOfReads > 0:
        fractionOfGoodRead = noOfGoodReads / (totalNoOfReads)
        medianMapQ = numpy.median(mapQList)
    else:
        fractionOfGoodRead = -1
        medianMapQ = -10
    if fractionOfGoodRead >= minFractionOfGoodRead:
        locusLowMapQIndicator = 0
    else:
        locusLowMapQIndicator = 2
    return PassingData(locusLowMapQIndicator=locusLowMapQIndicator, totalNoOfReads=totalNoOfReads,
                       noOfGoodReads=noOfGoodReads, fractionOfGoodRead=fractionOfGoodRead,
                       medianMapQ=medianMapQ)
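# Illustrative usage sketch (an assumption, not from the original source): alignedReadLs can be
# any iterable of pysam-style aligned reads, i.e. objects exposing a .mapq attribute.
#
#   from collections import namedtuple
#   FakeRead = namedtuple('FakeRead', ['mapq'])
#   reads = [FakeRead(mapq=q) for q in [0, 1, 30, 40, 50]]
#   summary = self.returnLocusLowMapQualityIndicator(alignedReadLs=reads,
#                                                    minMapQGoodRead=2, minFractionOfGoodRead=0.9)
#   # summary.fractionOfGoodRead == 0.6 (3 of the 5 reads have mapq >= 2), which is below 0.9,
#   # so summary.locusLowMapQIndicator == 2 (the locus is flagged as low mapping quality).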
def IntLMOnTwoSNPs(cls, snp1_id, gene1_id, genotype_ls1, snp2_id, gene2_id, genotype_ls2, phenotype_index, phenotype_ls,
                   min_data_point=3):
    """
    2009-2-8
        interaction detection by linear model: y = b + SNP1xSNP2 + SNP1 + SNP2 + e
        the interaction is the 1st term, therefore the pvalue returned directly is also for this term
    """
    return_ls = []
    genotype_ls1 = numpy.resize(genotype_ls1, [len(genotype_ls1), 1])
    genotype_ls2 = numpy.resize(genotype_ls2, [len(genotype_ls2), 1])
    snp_int_matrix = genotype_ls1 * genotype_ls2
    genotype_ls = numpy.hstack((snp_int_matrix, genotype_ls1, genotype_ls2))  # interaction variable is in the 1st position
    pdata = Association.linear_model(genotype_ls, phenotype_ls, min_data_point, snp_index=snp1_id + snp2_id)
    if pdata:
        pdata = PassingData(snp1_id=snp1_id, gene1_id=gene1_id, snp2_id=snp2_id, gene2_id=gene2_id, pvalue=pdata.pvalue,
                            count1=pdata.count_ls[0], count2=pdata.count_ls[1], bool_type=None, phenotype_index=phenotype_index,
                            var_perc=pdata.var_perc, coeff_list=pdata.coeff_list, coeff_p_value_list=pdata.coeff_p_value_list)
        return_ls.append(pdata)
    return return_ls
def get_enrichment_pvalue_by_gw_looping(self, candidate_sample_size, top_snp_index_ls, candidate_gene_set,
                                        snps_context_wrapper,
                                        no_of_total_snps, total_chr_pos_ar=None, no_of_permutations=20000, no_of_min_breaks=30):
    """
    2008-10-30
    2008-10-22
        get enrichment pvalue by genome-wide looping of SNP positions, a permutation that preserves LD
    """
    if self.debug:
        sys.stderr.write("Getting enrichment pvalue by gw-looping ... ")
    i = 0
    no_of_hits = 0
    while i < no_of_permutations:
        looped_chr_pos_ls = self.get_looped_chr_pos_ls(top_snp_index_ls, no_of_total_snps, total_chr_pos_ar)
        looped_candidate_gene_snp_index_ls = self.get_candidate_gene_snp_index_ls(candidate_gene_set,
                                                                                  looped_chr_pos_ls, snps_context_wrapper)
        new_candidate_sample_size = len(looped_candidate_gene_snp_index_ls)
        if new_candidate_sample_size >= candidate_sample_size:  # pvalue = Prob(X>=candidate_sample_size)
            no_of_hits += 1
        i += 1
        if no_of_min_breaks > 0 and no_of_hits >= no_of_min_breaks:  # if no_of_min_breaks<=0, no smart breaking
            break
    pvalue = no_of_hits / float(i)
    return_data = PassingData(pvalue=pvalue, no_of_tests=i, no_of_tests_passed=no_of_hits)
    if self.debug:
        sys.stderr.write("%s/%s tests in total. Done.\n" % (no_of_hits, i))
    return return_data
def get_enrichment_pvalue_by_random_gene_list(self, sample_pvalue, total_gene_id_ls, candidate_gene_set,
                                              total_chr_pos_ls, snps_context_wrapper, top_snp_chr_pos_ls, n, k,
                                              no_of_permutations=20000, no_of_min_breaks=30):
    """
    2008-10-22
    """
    if self.debug:
        sys.stderr.write("Getting enrichment pvalue by random gene list ... ")
    i = 0
    no_of_hits = 0
    no_of_candidate_genes = len(candidate_gene_set)
    no_of_total_snps = len(total_chr_pos_ls)
    no_of_top_snps = len(top_snp_chr_pos_ls)
    while i < no_of_permutations:
        random_candidate_gene_set = Set(random.sample(total_gene_id_ls, no_of_candidate_genes))
        random_candidate_gene_snp_gw_index_ls = self.get_candidate_gene_snp_index_ls(random_candidate_gene_set, total_chr_pos_ls, snps_context_wrapper)
        random_candidate_gene_snp_sample_index_ls = self.get_candidate_gene_snp_index_ls(random_candidate_gene_set, top_snp_chr_pos_ls, snps_context_wrapper)
        x = len(random_candidate_gene_snp_sample_index_ls)
        m = len(random_candidate_gene_snp_gw_index_ls)
        n = no_of_total_snps - m
        k = no_of_top_snps
        new_sample_pvalue = rpy.r.phyper(x - 1, m, n, k, lower_tail=rpy.r.FALSE)
        if new_sample_pvalue <= sample_pvalue:
            # watch: pvalue = Prob(X<=sample_pvalue), the chance of getting more significant (smaller) pvalues
            no_of_hits += 1
        i += 1
        if no_of_min_breaks > 0 and no_of_hits >= no_of_min_breaks:  # if no_of_min_breaks<=0, no smart breaking
            break
    pvalue = no_of_hits / float(i)
    return_data = PassingData(pvalue=pvalue, no_of_tests=i, no_of_tests_passed=no_of_hits)
    if self.debug:
        sys.stderr.write("%s/%s tests in total. Done.\n" % (no_of_hits, i))
    return return_data
def prepareDataForHGTest(self, rm, snps_context_wrapper, candidate_gene_list, results_directory=None, min_MAF=None,
                         no_of_top_snps=None, db_250k=None):
    """
    2012.3.23
        add argument db_250k
    2008-08-20
    """
    sys.stderr.write("Preparing data for HG test ... ")
    genome_wide_result = db_250k.getResultMethodContent(rm.id, results_directory, min_MAF)
    genome_wide_result.data_obj_ls.sort()  # each SNP object defines its comparison method based on its value
    genome_wide_result.data_obj_ls.reverse()  # now in value-descending order
    candidate_gene_set = Set(candidate_gene_list)
    candidate_gene_in_top_set = Set([])
    non_candidate_gene_in_top_set = Set([])
    for i in range(no_of_top_snps):
        data_obj = genome_wide_result.data_obj_ls[i]
        snps_context_matrix = snps_context_wrapper.returnGeneLs(data_obj.chromosome, data_obj.position)
        for snps_context in snps_context_matrix:
            snps_id, disp_pos, gene_id = snps_context
            if gene_id in candidate_gene_set:
                candidate_gene_in_top_set.add(gene_id)
            else:
                non_candidate_gene_in_top_set.add(gene_id)
    passingdata = PassingData(candidate_gene_in_top_set=candidate_gene_in_top_set, non_candidate_gene_in_top_set=non_candidate_gene_in_top_set)
    sys.stderr.write("Done.\n")
    return passingdata
def organizeProbesIntoChromosome(cls, xy_ls, chr_pos_ls, probes_id_ls):
    """
    2010-4-29
        add chr_pos2index to map (chr, pos) to its index in chr_pos_ls
    2009-11-24
        split out of calculateProbeQuartilePerChromosome()
        xy_ls, chr_pos_ls, probes_id_ls are already in chromosomal order.
    """
    sys.stderr.write("Getting probes into each chromosome ...")
    chr2xy_ls = {}
    chr2probe_id_ls = {}
    chr_pos2index = {}  # 2010-4-29. map (chr, pos) to its index in chr_pos_ls
    for i in range(len(xy_ls)):
        chr, pos = chr_pos_ls[i]
        if chr not in chr2xy_ls:
            chr2xy_ls[chr] = []
            chr2probe_id_ls[chr] = []  # initialize with the start_probe_id
        chr2xy_ls[chr].append(xy_ls[i])
        chr2probe_id_ls[chr].append(probes_id_ls[i])
        chr_pos2index[(chr, pos)] = i
    sys.stderr.write("Done.\n")
    return PassingData(chr2xy_ls=chr2xy_ls, chr2probe_id_ls=chr2probe_id_ls, chr_pos2index=chr_pos2index)
def getScoreRankFromPermIndexLs(self, genome_wide_result, candidate_gene_snp_index_ls, non_candidate_gene_snp_index_ls):
    """
    2008-10-21
    """
    sys.stderr.write("Getting Score rank data given index ls...")
    candidate_score_ls = []
    non_candidate_score_ls = []
    candidate_rank_ls = []
    non_candidate_rank_ls = []
    for index in candidate_gene_snp_index_ls:
        if genome_wide_result.data_obj_ls[index]:
            candidate_score_ls.append(genome_wide_result.data_obj_ls[index].value)
    no_of_candidate_scores = len(candidate_score_ls)
    for index in non_candidate_gene_snp_index_ls:
        if genome_wide_result.data_obj_ls[index]:
            non_candidate_score_ls.append(genome_wide_result.data_obj_ls[index].value)
    total_score_ls = candidate_score_ls + non_candidate_score_ls
    import rpy
    rank_ls = rpy.r.rank(total_score_ls)
    candidate_rank_ls = rank_ls[:no_of_candidate_scores]
    non_candidate_rank_ls = rank_ls[no_of_candidate_scores:]
    score_rank_data = PassingData(candidate_score_ls=candidate_score_ls, candidate_rank_ls=candidate_rank_ls,
                                  non_candidate_score_ls=non_candidate_score_ls, non_candidate_rank_ls=non_candidate_rank_ls)
    sys.stderr.write("Done.\n")
    return score_rank_data
def parseFastaDescriptionForFullVervetBACs(self, descriptionLine=None, FigureOutTaxID_ins=None):
    """
    2011-7-6
        possible header lines:

        >gi|285026568|gb|AC239257.2| Chlorocebus aethiops chromosome UNK clone CH252-270J24, WORKING DRAFT SEQUENCE, 2 unordered pieces
        >gi|281332227|gb|AC238852.3| Chlorocebus aethiops BAC clone CH252-133A18 from chromosome 3, complete sequence
        >gi|285002488|gb|AC239185.3| Chlorocebus aethiops BAC clone CH252-404N12 from chromosome unknown, complete sequence
    """
    header = descriptionLine[1:-1]  # discard '>' and '\n'
    header = header.split('|')
    _tax_id = None
    p_chromosome = re.compile(r'UNK clone ([^,]+),')  # 1st type of clone description
    p2_chromosome = re.compile(r'clone ([^,]+),')  # 2nd type of clone description
    if p_chromosome.search(header[4]) is not None:
        chromosome = p_chromosome.search(header[4]).groups()[0]
    else:
        if p2_chromosome.search(header[4]) is not None:
            chromosome = p2_chromosome.search(header[4]).groups()[0]
        else:
            chromosome = None
    gi = int(header[1])
    acc_ver = header[3]
    comment = header[4]
    return PassingData(tax_id=_tax_id, gi=gi, comment=comment, acc_ver=acc_ver, chromosome=chromosome)
def mergeSegmentsForOneArray(self, db_250k, array_id, raw_cnv_method_id=None, max_gap_ratio=0.3,
                             max_gap_len=None, maxDeletionLength=50000, param_obj=None):
    """
    2010-7-29
    """
    sys.stderr.write("Merging segments for array %s ... \n" % array_id)
    query = Stock_250kDB.CNVCall.query.filter_by(cnv_method_id=raw_cnv_method_id).\
        filter_by(array_id=array_id).filter_by(cnv_type_id=param_obj.cnv_type_id).\
        order_by(Stock_250kDB.CNVCall.chromosome).order_by(Stock_250kDB.CNVCall.start).order_by(Stock_250kDB.CNVCall.stop)
    segment_ls = []
    for row in query:
        segment = [row.chromosome, row.start, row.stop, row.start_probe_id, row.stop_probe_id, row.no_of_probes_covered,
                   row.size_affected,
                   row.amplitude, row.probability]
        segment_ls.append(segment)
    merged_segment_ls = self.mergeOverlappingORCloseSegmentsByGraph(segment_ls, max_reciprocal_gap_ratio=max_gap_ratio,
                                                                    max_gap_len=max_gap_len, mergeFunc=self.mergeTwoSegments,
                                                                    maxDeletionLength=maxDeletionLength,
                                                                    maxNeighborDist=getattr(param_obj, 'maxNeighborDist', 5000))
    from CNVPredictDeletionBySVM import CNVPredictDeletionBySVM
    for merged_segment in merged_segment_ls:
        chromosome, start, stop, start_probe_id, stop_probe_id, no_of_probes_covered, size_affected, \
            amplitude, probability = merged_segment[:9]
        cnv_segment_obj = PassingData(array_id=array_id, start_probe_id=start_probe_id, stop_probe_id=stop_probe_id,
                                      no_of_probes=no_of_probes_covered, amplitude=amplitude, segment_length=size_affected,
                                      segment_chromosome=chromosome,
                                      segment_start_pos=start, segment_stop_pos=stop,
                                      median_intensity=None, probability=probability)
        CNVPredictDeletionBySVM.saveSegmentObj(param_obj, cnv_segment_obj)
    sys.stderr.write("Done.\n")
def run(self):
    if self.debug:
        import pdb
        pdb.set_trace()
    db = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.db_user, password=self.db_passwd,
                    hostname=self.hostname, database=self.dbname, schema=self.schema)
    db.setup(create_tables=False)
    session = db.session
    array_id2median_intensity = self.get_array_id2median_intensity(min_array_median_intensity=self.min_array_median_intensity)
    arrays_to_form_model = self.getModelArrays(db, self.training_cnv_method_id, array_id2median_intensity)
    if self.debug:    # 2010-7-25 for debug, temporary
        arrays_to_form_model = arrays_to_form_model[:4]
    array_id2model = self.constructSVMModels(db, arrays_to_form_model, array_id2median_intensity, \
                    minPercUnCoveredByLerContig=self.minPercUnCoveredByLerContig, cnv_method_id=self.training_cnv_method_id, \
                    C=self.SVM_C, gamma=self.SVM_gamma, eps=self.SVM_eps, deletedFractionType=self.deletedFractionType)
    array_id2model_array_id_ls = self.mapAnyArray2ModelArray(array_id2median_intensity, array_id2model, \
                    max_median_intensity_dist=self.max_median_intensity_dist, \
                    minNoOfModelArrays=self.minNoOfModelArrays)
    param_obj = PassingData(session=session, no_of_total=0, no_of_into_db=0, report=self.report, \
                    cnv_method_id=self.cnv_method_id, cnv_type_id=self.cnv_type_id)
    self.predictALLSegments(self.input_fname, array_id2model_array_id_ls, array_id2model, \
                    max_amplitude=self.max_amplitude, param_obj=param_obj)
    session.flush()
    session.expunge_all()
    session.commit()
def run(self):
    """
    11-13-05
        --db_connect()
        --parse_entrezgene_xml_file()
        --is_gi_valid_in_annot_assembly_table()
        --find_info_dict()
        --return_location_list()
        --submit_to_entrezgene_mapping_table()
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    sys.stderr.write("\tIn total, %d files to be processed.\n" % len(self.inputfiles))
    db = GenomeDatabase(drivername=self.drivername, username=self.db_user, password=self.db_passwd,
                hostname=self.hostname, database=self.dbname, schema=self.schema)
    db.setup(create_tables=False)    #2010-6-22
    session = db.session
    param_obj = PassingData(session=db.session, no_of_genes_already_in_db=0, no_of_entrezgene_mappings_already_in_db=0, \
                no_of_total=0, no_of_into_db=0, report=self.report, no_of_commentaries_already_in_db=0, \
                no_of_gene_segments_already_in_db=0, no_of_gene2go_already_in_db=0)
    for i, f in enumerate(self.inputfiles):    #enumerate instead of list.index(), which is O(n) per file and wrong with duplicate filenames
        sys.stderr.write("%d/%d:\t%s\n" % (i + 1, len(self.inputfiles), f))
        self.parse_xml_file(session, f, tax_id=self.tax_id, param_obj=param_obj)
    session.flush()
    if self.commit:
        session.commit()
    else:
        session.rollback()
def avgKey2DataLs(self, key2dataLs, no_of_key_columns=1, header=[]):
    """
    2012.1.9
        1. take the mean/median/stdev of every value column's list in dataLs,
        2. expand the header to reflect that
    """
    sys.stderr.write("Averaging key2dataLs (%s entries) ..." % (len(key2dataLs)))
    newKey2DataLs = {}
    keyColHeader = header[:no_of_key_columns]
    valueColHeader = header[no_of_key_columns:]
    newValueColHeader = []
    no_of_value_columns = len(valueColHeader)
    for i in xrange(no_of_value_columns):
        valueColName = valueColHeader[i]
        newValueColHeader += ['mean_%s' % (valueColName), 'median_%s' % (valueColName), 'stdev_%s' % (valueColName)]
    for key, dataLs in key2dataLs.iteritems():
        if key not in newKey2DataLs:
            newKey2DataLs[key] = []
        no_of_value_columns = len(dataLs)
        for i in xrange(no_of_value_columns):
            meanValue = numpy.mean(dataLs[i])
            medianValue = numpy.median(dataLs[i])
            stdev = numpy.std(dataLs[i])
            newKey2DataLs[key] += [meanValue, medianValue, stdev]
    sys.stderr.write("Done.\n")
    return PassingData(key2dataLs=newKey2DataLs, header=keyColHeader + newValueColHeader)
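# ------------------------------------------------------------------------------------------
# Editor's note: a minimal usage sketch (not from the original source) for avgKey2DataLs().
# `reducer` stands in for whatever object the method lives on. Each key maps to a list of
# per-column value lists; the result keeps the key columns and adds mean/median/stdev columns.
# ------------------------------------------------------------------------------------------
def _example_avgKey2DataLs(reducer):
    key2dataLs = {('chr1',): [[1.0, 2.0, 3.0], [10.0, 20.0, 30.0]]}    # two value columns, three replicates each
    pd = reducer.avgKey2DataLs(key2dataLs, no_of_key_columns=1, header=['chromosome', 'depth', 'quality'])
    print pd.header       # ['chromosome', 'mean_depth', 'median_depth', 'stdev_depth', 'mean_quality', 'median_quality', 'stdev_quality']
    print pd.key2dataLs   # {('chr1',): [2.0, 2.0, 0.816..., 20.0, 20.0, 8.164...]}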
def get_enrichment_pvalue_by_gw_looping(self, candidate_sample_size, top_loci_in_cumu_pos, candidate_gene_set=None, \
                                    genomeRBDict=None, cumuSpan2ChrRBDict=None, no_of_permutations=20000, \
                                    no_of_min_breaks=30, param_data=None):
    """
    2011-3-18
        do the test against permData.captured_candidate_gene_set
    2011-3-12
        get the enrichment pvalue by genome-wide looping of SNP positions, a permutation scheme that preserves LD.
    """
    if self.debug:
        sys.stderr.write("Getting enrichment pvalue by gw-looping ... ")
    i = 0
    no_of_hits = 0
    while i < no_of_permutations:
        permuted_top_loci_in_chr_start_stop = self.applyGWLoopToCumuPos(top_loci_in_cumu_pos, cumuSpan2ChrRBDict)
        permData = self.prepareDataForPermutationRankTest(permuted_top_loci_in_chr_start_stop, genomeRBDict, param_data)
        new_candidate_sample_size = len(permData.captured_candidate_gene_set)
        if new_candidate_sample_size >= candidate_sample_size:    #pvalue = Prob(X>=candidate_sample_size)
            no_of_hits += 1
        i += 1
        if no_of_min_breaks > 0 and no_of_hits >= no_of_min_breaks:    #if no_of_min_breaks<=0, no smart breaking
            break
    pvalue = no_of_hits / float(i)
    return_data = PassingData(pvalue=pvalue, no_of_tests=i, no_of_tests_passed=no_of_hits)
    if self.debug:
        sys.stderr.write("%s/%s tests in total. Done.\n" % (no_of_hits, i))
    return return_data
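# ------------------------------------------------------------------------------------------
# Editor's note: a standalone sketch (not from the original source) of the same early-stopping
# permutation p-value logic: count how often a null statistic is at least as extreme as the
# observed one, and stop once no_of_min_breaks hits have accumulated. `null_statistic_func` is
# a hypothetical callable standing in for one genome-wide-looping permutation.
# ------------------------------------------------------------------------------------------
def _example_permutation_pvalue(observed_statistic, null_statistic_func, no_of_permutations=20000, no_of_min_breaks=30):
    no_of_tests = 0
    no_of_hits = 0
    while no_of_tests < no_of_permutations:
        if null_statistic_func() >= observed_statistic:    # pvalue = Prob(X >= observed)
            no_of_hits += 1
        no_of_tests += 1
        if no_of_min_breaks > 0 and no_of_hits >= no_of_min_breaks:    # early stopping once the estimate is stable enough
            break
    return no_of_hits / float(no_of_tests)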
def getAlignmentMatrix(self, alignment_id):
    sys.stderr.write("Getting alignment matrix for alignment=%s ..." % (alignment_id))
    snp_pos_ls = []
    accession_id_ls = []
    name_ls = []
    data_matrix = []
    rows = Sequence.query.filter_by(alignment=alignment_id).order_by(Sequence.accession).all()
    counter = 0
    for row in rows:
        if counter == 0:
            snp_pos_ls = self.get_snp_pos_ls(row.alignment_obj.target, row.alignment_obj.chromosome, row.alignment_obj.start)
        accession_id_ls.append(row.accession)
        name_ls.append(row.accession_obj.name)
        data_row = dict_map(nt2number, row.bases)
        data_matrix.append(data_row)
        counter += 1
    data_matrix = num.array(data_matrix, num.int8)
    passingdata = PassingData(snp_pos_ls=snp_pos_ls, accession_id_ls=accession_id_ls, name_ls=name_ls, data_matrix=data_matrix)
    sys.stderr.write(' %s accessions, %s bases. Done.\n' % (len(accession_id_ls), len(snp_pos_ls)))
    return passingdata
def getStrainidTargetidFromFile(self, db, QC_method_id, input_fname, max_mismatch_rate, min_no_of_non_NAs=20):
    """
    2008-09-10
        column in input_fname is determined on the fly
    2008-08-29
        get the strain id and target id set from the qc_cross_match result file.
    """
    sys.stderr.write("Getting set of strain_id & target_id ... \n")
    reader = csv.reader(open(input_fname), delimiter='\t')
    #figure out which variable is in which column
    header = reader.next()
    col_name2index = {}
    for i in range(len(header)):
        column_name = header[i]
        col_name2index[column_name] = i
    strain_id_set = Set()
    target_id_set = Set()
    i = 0
    for row in reader:
        #id, strainid, target_id, qc_method_id, mismatch_rate, no_of_mismatches, no_of_non_NA_pairs, readme_id = row
        strainid = int(row[col_name2index['strainid']])    #2008-09-10
        target_id = int(row[col_name2index['target_id']])
        qc_method_id = int(row[col_name2index['qc_method_id']])
        mismatch_rate = float(row[col_name2index['mismatch_rate']])
        no_of_mismatches = int(row[col_name2index['no_of_mismatches']])
        no_of_non_NA_pairs = int(row[col_name2index['no_of_non_NA_pairs']])
        if qc_method_id == QC_method_id and no_of_non_NA_pairs >= min_no_of_non_NAs and mismatch_rate <= max_mismatch_rate:
            if QC_method_id == 4:    #strain_id_set = target_id_set
                strain_id_set.add(strainid)
                strain_id_set.add(target_id)
            else:
                strain_id_set.add(strainid)
                target_id_set.add(target_id)
        i += 1
        if self.report and i % 100000 == 0:
            sys.stderr.write("%s\t%s" % ('\x08'*40, i))
        if self.debug and i > 1000000:
            break
    if self.report:
        sys.stderr.write("%s\t%s\n" % ('\x08'*40, i))
    return_data = PassingData()
    return_data.strain_id_set = strain_id_set
    return_data.target_id_set = target_id_set
    del reader
    sys.stderr.write("%s strainids and %s target_ids. Done.\n" % (len(strain_id_set), len(target_id_set)))
    return return_data
def rm2result(cls, session, rm=None, chr_pos2db_id=None, max_rank=1000, commit=False, min_rank=1, results_directory=None, \
            min_score=None, update=True, db_id2chr_pos=None, db_250k=None):
    """
    2012.3.23
        add argument db_250k
    2010-3-8
        add argument min_score to exclude SNPs whose scores are too low.
        This argument has an AND relationship with max_rank.
        Log transformation is automatically determined based on analysis_method.smaller_score_more_significant in db.
    2009-11-2
        split out of run()
    """
    # 2009-5-1 check whether it's already in db.
    db_entries = Stock_250kDB.Results.query.filter_by(results_id=rm.id)
    if db_entries.count() == max_rank - min_rank + 1:
        if update:
            db_entries.delete()
            sys.stderr.write("%s already in db. Deleting rows.\n" % rm.id)
        else:
            sys.stderr.write("%s already in db. Ignore.\n" % rm.id)
    param_data = PassingData(min_MAC=0, db_id2chr_pos=db_id2chr_pos)
    genome_wide_result = db_250k.getResultMethodContent(rm.id, results_directory=results_directory, min_MAF=0., \
                        pdata=param_data, min_value_cutoff=min_score)
    counter = 0
    no_of_saved = 0
    if genome_wide_result:
        for rank in range(min_rank, max_rank + 1):
            if rank > len(genome_wide_result.data_obj_ls):    # rank has gone past the total number of SNPs. break the for loop.
                break
            data_obj = genome_wide_result.get_data_obj_at_given_rank(rank)
            if data_obj is not None:
                counter += 1
                snps_id = chr_pos2db_id.get((data_obj.chromosome, data_obj.position))
                if data_obj.extra_col_ls:
                    result_obj = cPickle.dumps(data_obj.extra_col_ls)
                else:
                    result_obj = None
                # 2010-3-8 check if it's in db now (prevents duplicate rows even when the bulk check above did nothing).
                db_entries = Stock_250kDB.Results.query.filter_by(results_id=rm.id).filter_by(snps_id=snps_id)
                if db_entries.count() == 0:
                    Stock_250kDB.Results(snps_id=snps_id, results_id=rm.id, score=data_obj.value, rank=rank, \
                                beta=getattr(data_obj, 'beta1', None), \
                                maf=data_obj.maf, mac=data_obj.mac, genotype_var_perc=data_obj.genotype_var_perc, \
                                correlation=getattr(data_obj, 'correlations', None), odds_ratio=getattr(data_obj, "odds_ratio_est", None), \
                                statistic=getattr(data_obj, "statistics", None), object=result_obj)
                    no_of_saved += 1
    if commit:
        session.flush()
        #session.commit()
    else:
        session.rollback()
    sys.stderr.write("%s out of %s saved in db.\n" % (no_of_saved, counter))
def mapEachChromosome(self, workflow=None, alignmentData=None, chromosome=None, \
                    VCFJobData=None, passingData=None, reduceBeforeEachAlignmentData=None, transferOutput=True, **keywords):
    """
    2012.9.17
    """
    if workflow is None:
        workflow = self
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    topOutputDirJob = passingData.topOutputDirJob
    alignment = alignmentData.alignment
    parentJobLs = alignmentData.jobLs
    bamF = alignmentData.bamF
    baiF = alignmentData.baiF
    bamFnamePrefix = passingData.bamFnamePrefix
    """
    #2012.9.21 perhaps a downsampling job
    outputFname = os.path.join(topOutputDirJob.output, '%s_%s.bam'%(bamFnamePrefix, overlapFileBasenameSignature))
    outputFile = File(outputFname)
    selectAlignmentJob, bamIndexJob1 = self.addSelectAlignmentJob(executable=workflow.samtools, inputFile=bamF, \
                    outputFile=outputFile, region=overlapInterval, parentJobLs=[topOutputDirJob] + parentJobLs, \
                    extraDependentInputLs=[baiF], transferOutput=False, \
                    extraArguments=None, job_max_memory=2000, needBAMIndexJob=True)
    """
    """
    #2012.9.21 count covariates job is moved to map()
    recalFile = File(os.path.join(topOutputDirJob.output, '%s_%s.recal_data.csv'%(bamFnamePrefix, chromosome)))
    countCovariatesJob = self.addGATKBaseRecalibratorJob(GenomeAnalysisTKJar=workflow.GenomeAnalysisTK2Jar, inputFile=bamF, \
                    VCFFile=VCFFile, interval=chromosome, outputFile=recalFile, \
                    refFastaFList=passingData.refFastaFList, parentJobLs=[topOutputDirJob]+parentJobLs, \
                    extraDependentInputLs=[baiF, VCFFile.tbi_F], \
                    transferOutput=False, \
                    extraArguments=None, job_max_memory=4000)
    self.no_of_jobs += 1
    returnData.countCovariatesJob = countCovariatesJob
    returnData.jobDataLs.append(PassingData(jobLs=[countCovariatesJob], file=countCovariatesJob.recalFile, \
                    fileLs=[countCovariatesJob.recalFile]))
    """
    return returnData
def preReduce(self, workflow=None, outputDirPrefix="", passingData=None, transferOutput=True, **keywords):
    """
    2012.9.17
    """
    parentPreReduceData = AbstractVervetWorkflow.preReduce(self, workflow=workflow, outputDirPrefix=outputDirPrefix, \
                        passingData=passingData, transferOutput=transferOutput, **keywords)
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    callOutputDir = "call"
    callOutputDirJob = self.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=callOutputDir)
    passingData.callOutputDirJob = callOutputDirJob
    matrixDir = "pairwiseDistMatrix"
    matrixDirJob = self.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=matrixDir)
    passingData.matrixDirJob = matrixDirJob
    reduceOutputDirJob = passingData.reduceOutputDirJob    #2012.10.9 reduceOutputDirJob was added to passingData during AbstractVCFWorkflow.preReduce()
    #reduceOutputDir = "aggregateData"
    #reduceOutputDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=reduceOutputDir)
    #passingData.reduceOutputDirJob = reduceOutputDirJob
    figureFnamePrefix = os.path.join(reduceOutputDirJob.output, 'aggregateDistanceMatrix')
    aggregateDistanceMatrixOutputF = File('%s.tsv' % (figureFnamePrefix))
    PCAFile = File('%s_PCA.tsv' % (figureFnamePrefix))
    aggregateAndHClusterDistanceMatrixJob = self.addStatMergeJob(workflow, statMergeProgram=workflow.AggregateAndHClusterDistanceMatrix, \
                        outputF=aggregateDistanceMatrixOutputF, \
                        parentJobLs=[reduceOutputDirJob], \
                        extraOutputLs=[PCAFile, File('%s.png' % (figureFnamePrefix)), File('%s.svg' % (figureFnamePrefix))], \
                        extraDependentInputLs=[], transferOutput=True, extraArguments="-f %s" % (figureFnamePrefix))
    returnData.aggregateAndHClusterDistanceMatrixJob = aggregateAndHClusterDistanceMatrixJob
    #2012.9.5 add the job to append meta info (country, sex, latitude, etc. of each monkey) to the PCA output
    outputF = File('%s_withMetaInfo.tsv' % (figureFnamePrefix))
    appendInfo2PCAOutputJob = self.addGenericDBJob(executable=self.AppendInfo2SmartPCAOutput, inputFile=PCAFile, \
                        outputFile=outputF, \
                        parentJobLs=[aggregateAndHClusterDistanceMatrixJob], extraDependentInputLs=None, \
                        extraOutputLs=None, \
                        transferOutput=True, \
                        extraArgumentList=None, extraArguments=None, sshDBTunnel=self.needSSHDBTunnel, \
                        key2ObjectForJob=None, job_max_memory=2000)
    return returnData
def run(self):
    self.communicator = MPI.world.duplicate()
    node_rank = self.communicator.rank
    free_computing_nodes = range(1, self.communicator.size - 1)    #exclude the 1st and last node
    free_computing_node_set = Set(free_computing_nodes)
    output_node_rank = self.communicator.size - 1
    db = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.db_user, password=self.db_passwd,
                hostname=self.hostname, database=self.dbname, schema=self.schema)
    db.setup(create_tables=False)
    session = db.session
    session.begin()
    if node_rank == 0:
        snps_context_wrapper = self.dealWithSnpsContextWrapper(self.snps_context_picklef, self.min_distance, self.get_closest)
        if not self.results_id_ls:
            pdata = PassingData(call_method_id=self.call_method_id, analysis_method_id=self.analysis_method_id)
            self.results_id_ls = self.getResultsMethodIDLs(pdata)
        snps_context_wrapper_pickle = cPickle.dumps(snps_context_wrapper, -1)
        for node in free_computing_nodes:    #send it to the computing_node
            sys.stderr.write("passing initial data to nodes from %s to %s ... " % (node_rank, node))
            self.communicator.send(snps_context_wrapper_pickle, node, 0)
            sys.stderr.write(".\n")
        del snps_context_wrapper_pickle, snps_context_wrapper
    elif node_rank in free_computing_node_set:
        data, source, tag = self.communicator.receiveString(0, 0)
        snps_context_wrapper = cPickle.loads(data)
        del data
    else:
        pass
    self.synchronize()
    if node_rank == 0:
        param_obj = PassingData(params_ls=self.results_id_ls, output_node_rank=output_node_rank, report=self.report, counter=0)
        self.input_node(param_obj, free_computing_nodes, input_handler=self.input_fetch_handler, message_size=self.message_size)
    elif node_rank in free_computing_node_set:
        param_data = PassingData(session=session)
        param_data.results_directory = self.input_db_directory
        param_data.default_output_db_directory = self.default_output_db_directory
        param_data.output_db_directory = self.output_db_directory
        param_data.commit = self.commit
        param_data.min_MAF = self.min_MAF
        param_data.min_distance = self.min_distance
        param_data.get_closest = self.get_closest
        param_data.snps_context_wrapper = snps_context_wrapper
        self.computing_node(param_data, self.computing_node_handler)
    else:
        param_obj = PassingData()
        self.output_node(free_computing_nodes, param_obj, self.output_node_handler)
    self.synchronize()    #to avoid some node early exits
def remove_rows_with_too_many_NAs(cls, data_matrix, row_cutoff, cols_with_too_many_NAs_set=None, NA_set=set([0, -2]), \
                                debug=0, is_cutoff_max=0):
    """
    2008-05-19
        if is_cutoff_max=1, anything > row_cutoff is deemed as having too many NAs
        if is_cutoff_max=0 (cutoff is minimum), anything >= row_cutoff is deemed as having too many NAs
    2008-05-12
        made more robust
        add cols_with_too_many_NAs_set
        add NA_set
    2008-05-08
        become classmethod
    """
    sys.stderr.write("Removing rows with NA rate >= %s ..." % (row_cutoff))
    no_of_rows, no_of_cols = data_matrix.shape
    rows_with_too_many_NAs_set = set()
    total_cols_set = set(range(no_of_cols))
    if cols_with_too_many_NAs_set:
        cols_to_be_checked = total_cols_set - cols_with_too_many_NAs_set
    else:
        cols_to_be_checked = total_cols_set
    row_index2no_of_NAs = {}
    for i in range(no_of_rows):
        no_of_NAs = 0.0
        for j in cols_to_be_checked:
            if data_matrix[i][j] in NA_set:
                no_of_NAs += 1
        if no_of_cols != 0:
            NA_ratio = no_of_NAs / no_of_cols
        else:
            NA_ratio = 0.0
        row_index2no_of_NAs[i] = NA_ratio
        if is_cutoff_max:
            if NA_ratio > row_cutoff:
                rows_with_too_many_NAs_set.add(i)
        else:
            if NA_ratio >= row_cutoff:
                rows_with_too_many_NAs_set.add(i)
    if debug:
        print
        print 'rows_with_too_many_NAs_set'
        print rows_with_too_many_NAs_set
    passingdata = PassingData(rows_with_too_many_NAs_set=rows_with_too_many_NAs_set)
    passingdata.row_index2no_of_NAs = row_index2no_of_NAs
    sys.stderr.write("%s strains removed, done.\n" % len(rows_with_too_many_NAs_set))
    return passingdata
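# ------------------------------------------------------------------------------------------
# Editor's note: a minimal usage sketch (not from the original source) for
# remove_rows_with_too_many_NAs() on a toy matrix, where 0 and -2 are the NA codes (the
# default NA_set). `MatrixQC` is a hypothetical holder class standing in for wherever the
# classmethod actually lives.
# ------------------------------------------------------------------------------------------
def _example_remove_rows_with_too_many_NAs():
    import numpy
    data_matrix = numpy.array([[1, 0, 2],     # 1/3 NA -> kept (0.33 < 0.5)
                               [0, 0, 1],     # 2/3 NA -> removed (0.67 >= 0.5)
                               [1, 2, 2]])    # 0/3 NA -> kept
    pd = MatrixQC.remove_rows_with_too_many_NAs(data_matrix, row_cutoff=0.5)    # MatrixQC is hypothetical
    print pd.rows_with_too_many_NAs_set    # expected: set([1])
    print pd.row_index2no_of_NAs           # expected: {0: 0.333..., 1: 0.666..., 2: 0.0}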
def getEcotypeInfo(db, country_order_type=1):
    """
    2009-09-2
        add region into ecotype_obj
    2008-10-08
        use ecotype_id2ecotype_obj to summarize ecotypeid2pos, ecotypeid2nativename, ecotypeid2country
        add option order_by_type
        get country2order
        moved from PlotGroupOfSNPs.py
        the db handle is not restricted to the stock database; it could be any database on the same server.
        BUT StockDB has to be imported in the program where the db connection is established,
        just so that StockDB.Ecotype.table is set up while StockDB.Ecotype.table.metadata is None.
    2008-10-07
    """
    sys.stderr.write("Getting Ecotype info ... ")
    import StockDB
    from pymodule import PassingData
    ecotype_info = PassingData()
    if country_order_type == 1:
        order_seq_sentence = 'c.latitude, c.longitude'
    else:
        order_seq_sentence = 'c.longitude, c.latitude'
    rows = db.metadata.bind.execute("select e.id as ecotype_id, e.nativename, e.latitude, e.longitude, a.region, \
        c.abbr as country, c.latitude as country_latitude, c.longitude as country_longitude \
        from stock.%s e, stock.%s s, stock.%s a, stock.%s c where e.siteid=s.id and s.addressid=a.id and \
        a.countryid=c.id order by %s " % (getattr(StockDB.Ecotype.table, 'name', 'ecotype'), \
        getattr(StockDB.Site.table, 'name', 'site'), getattr(StockDB.Address.table, 'name', 'address'), \
        getattr(StockDB.Country.table, 'name', 'country'), order_seq_sentence))
    ecotype_id2ecotype_obj = {}
    country2order = {}
    for row in rows:
        ecotype_obj = PassingData()
        for key, value in row.items():    #not iteritems() for a RowProxy object
            setattr(ecotype_obj, key, value)
        ecotype_id2ecotype_obj[row.ecotype_id] = ecotype_obj
        if row.country not in country2order:
            country2order[row.country] = len(country2order)
    ecotype_info.ecotype_id2ecotype_obj = ecotype_id2ecotype_obj
    ecotype_info.country2order = country2order
    sys.stderr.write("%s ecotypes.\n" % (len(ecotype_id2ecotype_obj)))
    return ecotype_info
def getTranslationDataStructureForBackgroundLoci(self, db_250k, cnv_method_id=None, min_MAF=0.1):
    """
    2011-4-22
        1. get all loci whose MAF is above min_MAF
        2. construct a (chr, start, stop) -> cumu_start dictionary
        3. construct a (cumu_start, cumu_stop) -> (chr, start, stop) RBDict
    """
    sys.stderr.write("Getting translation structures between (chr, start, stop) and (cumu_start, cumu_stop) for cnv method %s ..." % \
                    cnv_method_id)
    TableClass = Stock_250kDB.CNV
    query = TableClass.query.filter_by(cnv_method_id=cnv_method_id).order_by(TableClass.chromosome).order_by(TableClass.start)
    chrSpan2cumuStartRBDict = RBDict()
    cumuSpan2ChrSpanRBDict = RBDict()
    cumu_start = 0
    counter = 0
    real_counter = 0
    for row in query:
        counter += 1
        maf = min(row.frequency, 1 - row.frequency)
        if maf <= min_MAF:
            continue
        real_counter += 1
        chrSpanKey = CNVSegmentBinarySearchTreeKey(chromosome=row.chromosome, \
                        span_ls=[row.start, row.stop], \
                        min_reciprocal_overlap=0.00000000000001,)    #2010-8-17 overlapping keys are regarded as separate instances as long as they are not identical.
        chrSpan2cumuStartRBDict[chrSpanKey] = cumu_start    #cumu_start is 0-based
        size = row.stop - row.start + 1
        span_ls = [cumu_start + 1, cumu_start + size]
        segmentKey = CNVSegmentBinarySearchTreeKey(chromosome=0, \
                        span_ls=span_ls, \
                        min_reciprocal_overlap=0.00000000000001,)    #2010-8-17 overlapping keys are regarded as separate instances as long as they are not identical.
        if segmentKey not in cumuSpan2ChrSpanRBDict:
            cumuSpan2ChrSpanRBDict[segmentKey] = (row.chromosome, row.start, row.stop)
        else:
            sys.stderr.write("Error: %s of chr %s is already in cumuSpan2ChrSpanRBDict.\n" % (segmentKey, row.chromosome))
        cumu_start += size
    sys.stderr.write("%s out of %s CNVs are included. Done.\n" % (real_counter, counter))
    return PassingData(cumuSpan2ChrSpanRBDict=cumuSpan2ChrSpanRBDict, chrSpan2cumuStartRBDict=chrSpan2cumuStartRBDict)
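# ------------------------------------------------------------------------------------------
# Editor's note: a simplified sketch (not from the original source) of the coordinate
# translation that getTranslationDataStructureForBackgroundLoci() builds with RBDicts: loci
# are laid end to end on one cumulative axis, so a position drawn on [1, total_size] can be
# mapped back to a (chromosome, start, stop) locus. A sorted offset list plus bisect stands
# in for the RBDict here.
# ------------------------------------------------------------------------------------------
import bisect

def _example_build_cumulative_map(loci):
    # loci: list of (chromosome, start, stop), assumed non-overlapping and pre-sorted
    cumu_starts = []    # 0-based cumulative start of each locus
    cumu_start = 0
    for (chromosome, start, stop) in loci:
        cumu_starts.append(cumu_start)
        cumu_start += stop - start + 1
    return cumu_starts, cumu_start    # per-locus offsets and the total cumulative size

def _example_cumu_pos_to_locus(cumu_pos, cumu_starts, loci):
    # cumu_pos is 1-based on the cumulative axis, mirroring span_ls = [cumu_start+1, cumu_start+size]
    index = bisect.bisect_right(cumu_starts, cumu_pos - 1) - 1
    return loci[index]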
def run(self):
    """
    2011-10
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    workflow = self.initiateWorkflow()
    self.registerExecutables()
    self.registerCustomExecutables()
    callMethodID2Data = {}
    for call_method_id in self.call_method_id_ls:
        callMethod = Stock_250kDB.CallMethod.get(call_method_id)
        if callMethod and callMethod.filename:
            #fixed: the original referenced a bare `db`, which is undefined here; self.db_250k is used elsewhere in this statement
            datasetFile = self.registerOneInputFile(inputFname=self.db_250k.supplantFilePathWithNewDataDir(filePath=callMethod.filename, \
                            oldDataDir=self.db_250k.data_dir, \
                            newDataDir=self.data_dir), \
                            folderName=self.pegasusFolderName)
            callMethodID2Data[callMethod.id] = PassingData(datasetFile=datasetFile, db_entry=callMethod)
        else:
            sys.stderr.write("WARNING: call method %s is not in db or its filename column is empty.\n" % (call_method_id))
    if self.kinship_fname:
        kinshipFile = self.registerOneInputFile(inputFname=self.kinship_fname, folderName=self.pegasusFolderName)
    else:
        kinshipFile = None
    if self.eigen_vector_fname:
        eigenVectorFile = self.registerOneInputFile(inputFname=self.eigen_vector_fname, folderName=self.pegasusFolderName)
    else:
        eigenVectorFile = None
    if self.genotype_fname_to_generate_kinship:
        genotypeFileToGenerateKinship = self.registerOneInputFile(inputFname=self.genotype_fname_to_generate_kinship, \
                            folderName=self.pegasusFolderName)
    else:
        genotypeFileToGenerateKinship = None
    self.addJobs(db_250k=self.db_250k, callMethodID2Data=callMethodID2Data, kinshipFile=kinshipFile, \
                eigenVectorFile=eigenVectorFile, phenotype_method_id_ls=self.phenotype_method_id_ls, \
                analysis_method_id_ls=self.analysis_method_id_ls, \
                genotypeFileToGenerateKinship=genotypeFileToGenerateKinship, \
                data_dir=self.data_dir, \
                getPublicPhenotype=self.getPublicPhenotype, \
                commit=self.commit, \
                transferOutput=True, needSSHDBTunnel=self.needSSHDBTunnel, outputDirPrefix="")
    # write the DAX to the output file
    outf = open(self.outputFname, 'w')
    self.writeXML(outf)
def getCNVFeatureData(cls, db_250k, array_id=None, \
                    minPercUnCoveredByLerContig=0.6, cnv_method_id=6, \
                    replaceAmpWithMedianIntensity=False, deletedFractionType=1):
    """
    2010-7-25
        add argument deletedFractionType
            1: CNVCall.percUnCoveredByLerContig
            2: CNVCall.fractionDeletedInPECoverageData
    2010-7-1
        moved from CNV.CNVPredictionBySVM in misc.py
    """
    sys.stderr.write("Getting CNV feature data (amplitude, #probes, probe density) for array %s, cnv_method %s, minPercUnCoveredByLerContig %s ... \n" % \
                    (array_id, cnv_method_id, minPercUnCoveredByLerContig))
    i = 0
    block_size = 5000
    real_counter = 0
    TableClass = Stock_250kDB.CNVCall
    query = TableClass.query.filter_by(array_id=array_id).filter_by(cnv_method_id=cnv_method_id)
    rows = query.offset(i).limit(block_size)
    session = db_250k.session
    ecotype_id = None
    percUnCoveredByLerContig_ls = []
    feature_data = []
    class_label_ls = []
    c_ls = []
    while rows.count() != 0:
        for row in rows:
            ecotype_id = row.array.maternal_ecotype_id
            if deletedFractionType == 1:
                deletedFraction = row.percUnCoveredByLerContig
            else:
                deletedFraction = row.fractionDeletedInPECoverageData
            if deletedFraction is not None:
                #x_ls.append(row.amplitude)
                no_of_probes = math.log10(row.no_of_probes_covered)
                probeDensity = row.no_of_probes_covered * 1000.0 / (row.stop - row.start + 1.0)
                if deletedFraction >= minPercUnCoveredByLerContig:
                    class_label = -1
                    real_counter += 1
                else:
                    class_label = 1
                class_label_ls.append(class_label)
                if replaceAmpWithMedianIntensity:
                    amp = row.median_intensity
                else:
                    amp = row.amplitude
                feature_data.append([amp, no_of_probes, probeDensity])
                percUnCoveredByLerContig_ls.append(deletedFraction)
            i += 1    # i counts every row scanned and drives the offset-based paging below
            if i % 5000 == 0:
                sys.stderr.write("%s%s\t%s" % ('\x08'*80, i, real_counter))
        rows = query.offset(i).limit(block_size)
    sys.stderr.write("%s%s\t%s\n" % ('\x08'*80, i, real_counter))
    return PassingData(feature_data=feature_data, class_label_ls=class_label_ls, \
                    percUnCoveredByLerContig_ls=percUnCoveredByLerContig_ls, ecotype_id=ecotype_id)
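# ------------------------------------------------------------------------------------------
# Editor's note: a hedged sketch (not from the original source) of how the feature_data /
# class_label_ls returned above could train a deletion-vs-normal classifier. The original
# pipeline uses its own CNVPredictDeletionBySVM wrapper; scikit-learn's SVC is assumed here
# purely for illustration, with made-up hyper-parameter values.
# ------------------------------------------------------------------------------------------
def _example_train_deletion_svm(feature_data, class_label_ls):
    from sklearn.svm import SVC
    clf = SVC(kernel='rbf', C=1.0, gamma=0.1)    # hypothetical hyper-parameters
    clf.fit(feature_data, class_label_ls)        # features: [amplitude, log10(#probes), probe density]
    return clf    # clf.predict(new_feature_rows) yields -1 (deleted) or 1 (normal)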
def generate_parameters(self, parameter_names, parameter_depth=2):
    """
    2008-05-19
        min_call_probability = self.min_call_probability
    2008-05-11
        put NA rate into passing parameters as well. too much memory consumption on each computing node
    """
    sys.stderr.write("Generating parameter settings ...")
    param_d = PassingData()
    for parameter_name in parameter_names:
        parameter_value = getattr(self, parameter_name)
        parameter_value = parameter_value.split(',')
        parameter_value = map(float, parameter_value)
        setattr(self, parameter_name, parameter_value)
    """
    #2008-05-19 commented out. use self.min_call_probability
    #figure out call probability from input_fname
    import re
    call_prob_pattern = re.compile(r'_(\d+)\.csv')
    call_prob_p_result = call_prob_pattern.search(self.input_fname)
    if call_prob_p_result:
        min_call_probability = float(call_prob_p_result.groups()[0])
    else:
        min_call_probability = -1
    """
    min_call_probability = self.min_call_probability    #only 1st 4, last 2 passed to computing node
    parameters = []
    for max_call_mismatch_rate in getattr(self, parameter_names[0]):
        for max_call_NA_rate in getattr(self, parameter_names[1]):
            for max_snp_mismatch_rate in getattr(self, parameter_names[2]):
                for max_snp_NA_rate in getattr(self, parameter_names[3]):
                    for npute_window_size in getattr(self, parameter_names[4]):
                        parameters.append([min_call_probability, max_call_mismatch_rate, max_call_NA_rate, \
                                        max_snp_mismatch_rate, max_snp_NA_rate, npute_window_size])
    param_d.parameters = parameters
    param_d.max_snp_NA_rate_ls = self.max_snp_NA_rate_ls
    param_d.npute_window_size_ls = self.npute_window_size_ls
    sys.stderr.write(" %s parameter settings to process. Done.\n" % len(parameters))
    return param_d
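# ------------------------------------------------------------------------------------------
# Editor's note: an equivalent formulation (not from the original source) of the nested loops
# above using itertools.product, to make the Cartesian-product structure of the parameter
# grid explicit. `value_lists` stands in for the five per-parameter value lists.
# ------------------------------------------------------------------------------------------
def _example_generate_parameters(min_call_probability, value_lists):
    # value_lists: the five per-parameter value lists, in the same order as parameter_names[0:5]
    import itertools
    parameters = []
    for combo in itertools.product(*value_lists):
        parameters.append([min_call_probability] + list(combo))
    return parameters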