def addJobs(self, inputURL=None, relativePathList=[], outputDir="", username=None, password=None,
            transferOutput=True):
    """
    2012.6.27
    """
    sys.stderr.write("Adding wget jobs for %s inputs ... " % (len(relativePathList)))
    no_of_jobs = 0

    topOutputDir = outputDir
    topOutputDirJob = yh_pegasus.addMkDirJob(self, mkdir=self.mkdirWrap, outputDir=topOutputDir)
    no_of_jobs += 1

    returnData = PassingData()
    returnData.jobDataLs = []
    for relativePath in relativePathList:
        #2013.06.26 replace every "/" in relativePath in case it's a folder
        relativePathNoFolder = relativePath.replace('/', '_')
        logFile = File('%s.log' % (relativePathNoFolder))
        wgetJob = self.addWgetJob(executable=self.wget, url=inputURL, relativePath=relativePath,
                                  username=username, password=password,
                                  targetFolder=outputDir, logFile=logFile, cut_dir_number=self.cut_dir_number,
                                  parentJobLs=[topOutputDirJob], extraDependentInputLs=[],
                                  transferOutput=transferOutput,
                                  extraArguments=None, job_max_memory=50)
        returnData.jobDataLs.append(PassingData(jobLs=[wgetJob], file=wgetJob.output,
                                                fileLs=wgetJob.outputLs))
        no_of_jobs += 1
    sys.stderr.write("%s jobs.\n" % (no_of_jobs))
    return returnData
def inputNodePrepare(self, snp_info=None):
    """
    2009-2-16
        get phenData.phenotype_method_id_ls in the same order as phenData.col_id_ls
    2009-2-11
        refactored out of run()
    """
    header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname)
    snpData = SNPData(header=header, strain_acc_list=strain_acc_list,
                      data_matrix=data_matrix, turn_into_array=1)  #category_list is not used, to facilitate row-id matching

    picklef = open(self.snps_context_fname)
    snps_context_wrapper = cPickle.load(picklef)
    del picklef
    gene_id2snps_id_ls = self.get_gene_id2snps_id_ls(snps_context_wrapper)
    del snps_context_wrapper
    gene_id_ls = gene_id2snps_id_ls.keys()
    gene_id_ls.sort()

    header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(self.phenotype_fname,
                                                                                        turn_into_integer=0)
    phenData = SNPData(header=header_phen, strain_acc_list=strain_acc_list_phen, data_matrix=data_matrix_phen)
    phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(snpData.row_id_ls,
                                                                                    phenData.row_id_ls,
                                                                                    phenData.data_matrix)
    phenData.phenotype_method_id_ls = get_phenotype_method_id_lsFromPhenData(phenData)  #2009-2-16

    self.phenotype_index_ls = PlotGroupOfSNPs.findOutWhichPhenotypeColumn(phenData,
                                                                          Set(self.phenotype_method_id_ls))
    if not self.phenotype_index_ls:
        self.phenotype_index_ls = range(len(phenData.col_id_ls))

    pdata = PassingData(gene_id_ls=gene_id_ls, gene_id2snps_id_ls=gene_id2snps_id_ls,
                        phenotype_index_ls=self.phenotype_index_ls, snp_info=snp_info)
    params_ls = self.generate_params(self.gene_id_fname, pdata, self.block_size)

    other_data = PassingData(gene_id2snps_id_ls=gene_id2snps_id_ls, gene_id_ls=pdata.gene_id_ls,
                             phenData=phenData,
                             phenotype_index_ls=self.phenotype_index_ls, snp_info=snp_info)
    other_data_pickle = cPickle.dumps(other_data, -1)
    del other_data

    output_node_data = PassingData(phenotype_label_ls=phenData.col_id_ls,
                                   phenotype_index_ls=self.phenotype_index_ls)
    output_node_data_pickle = cPickle.dumps(output_node_data, -1)

    snpData_pickle = cPickle.dumps(snpData, -1)
    del snpData, data_matrix
    return_data = PassingData(snpData_pickle=snpData_pickle, other_data_pickle=other_data_pickle,
                              output_node_data_pickle=output_node_data_pickle, params_ls=params_ls)
    return return_data
def addJobs(self, inputData=None, outputDirPrefix="", ntDatabaseFileList=None, noOfTotalSequences=None,
            transferOutput=True, makeBlastDBJob=None):
    """
    2012.5.24
    """
    sys.stderr.write("Adding blast jobs for %s inputs ... " % (len(inputData.jobDataLs)))
    no_of_jobs = 0

    topOutputDir = "%sBlast" % (outputDirPrefix)
    topOutputDirJob = yh_pegasus.addMkDirJob(self, mkdir=self.mkdirWrap, outputDir=topOutputDir)
    no_of_jobs += 1

    allBlastResultFile = File(os.path.join(topOutputDir, 'blast.tsv'))
    allBlastMergeJob = self.addStatMergeJob(statMergeProgram=self.mergeSameHeaderTablesIntoOne,
                                            outputF=allBlastResultFile, transferOutput=transferOutput,
                                            parentJobLs=[topOutputDirJob])
    no_of_jobs += 1

    ntDatabaseFile = ntDatabaseFileList[0]
    returnData = PassingData()
    returnData.jobDataLs = []
    for jobData in inputData.jobDataLs:
        inputF = jobData.output
        outputFnamePrefix = os.path.join(topOutputDir, os.path.splitext(os.path.basename(inputF.name))[0])
        splitFastaJob = self.addSplitFastaFileJob(executable=self.SplitFastaFile, inputFile=inputF,
                                                  outputFnamePrefix=outputFnamePrefix,
                                                  noOfSequencesPerSplitFile=self.blockSize,
                                                  filenameSuffix=".fasta", noOfTotalSequences=noOfTotalSequences,
                                                  parentJobLs=jobData.jobLs + [topOutputDirJob],
                                                  extraDependentInputLs=[], transferOutput=False,
                                                  extraArguments=None, job_max_memory=500)
        no_of_jobs += 1
        for splitFastaOutput in splitFastaJob.outputList:
            outputFile = File('%s.tsv' % (splitFastaOutput.name))
            blastJob = self.addBlastWrapperJob(executable=self.BlastWrapper, inputFile=splitFastaOutput,
                                               outputFile=outputFile,
                                               outputFnamePrefix=splitFastaOutput.name,
                                               databaseFile=ntDatabaseFile,
                                               maxNoOfMismatches=self.maxNoOfMismatches,
                                               minNoOfIdentities=self.minNoOfIdentities,
                                               minIdentityPercentage=self.minIdentityPercentage,
                                               blastallPath=self.blastallPath,
                                               parentJobLs=[splitFastaJob, makeBlastDBJob],
                                               extraDependentInputLs=ntDatabaseFileList, transferOutput=False,
                                               extraArguments=None, job_max_memory=1000)
            #feed each blast output into the merge (reduce) job
            self.addInputToStatMergeJob(statMergeJob=allBlastMergeJob,
                                        inputF=blastJob.output, parentJobLs=[blastJob])
            no_of_jobs += 1
    sys.stderr.write("%s jobs. Done.\n" % (no_of_jobs))
    returnData.jobDataLs.append(PassingData(jobLs=[allBlastMergeJob], file=allBlastResultFile,
                                            fileLs=[allBlastResultFile]))
    return returnData
def run(self):
    self.communicator = MPI.world.duplicate()
    node_rank = self.communicator.rank
    free_computing_nodes = range(1, self.communicator.size-1)  #exclude the 1st and last node
    free_computing_node_set = Set(free_computing_nodes)
    output_node_rank = self.communicator.size-1

    db = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.db_user, password=self.db_passwd,
                                   hostname=self.hostname, database=self.dbname, schema=self.schema)
    db.setup(create_tables=False)
    session = db.session
    session.begin()

    if node_rank == 0:
        snps_context_wrapper = self.dealWithSnpsContextWrapper(self.snps_context_picklef, self.min_distance,
                                                               self.get_closest)
        if not self.results_id_ls:
            pdata = PassingData(call_method_id=self.call_method_id, analysis_method_id=self.analysis_method_id)
            self.results_id_ls = self.getResultsMethodIDLs(pdata)
        snps_context_wrapper_pickle = cPickle.dumps(snps_context_wrapper, -1)
        for node in free_computing_nodes:  #send it to each computing node
            sys.stderr.write("passing initial data to nodes from %s to %s ... " % (node_rank, node))
            self.communicator.send(snps_context_wrapper_pickle, node, 0)
            sys.stderr.write(".\n")
        del snps_context_wrapper_pickle, snps_context_wrapper
    elif node_rank in free_computing_node_set:
        data, source, tag = self.communicator.receiveString(0, 0)
        snps_context_wrapper = cPickle.loads(data)
        del data
    else:
        pass

    self.synchronize()
    if node_rank == 0:
        param_obj = PassingData(params_ls=self.results_id_ls, output_node_rank=output_node_rank,
                                report=self.report, counter=0)
        self.input_node(param_obj, free_computing_nodes, input_handler=self.input_fetch_handler,
                        message_size=self.message_size)
    elif node_rank in free_computing_node_set:
        param_data = PassingData(session=session)
        param_data.results_directory = self.input_db_directory
        param_data.default_output_db_directory = self.default_output_db_directory
        param_data.output_db_directory = self.output_db_directory
        param_data.commit = self.commit
        param_data.min_MAF = self.min_MAF
        param_data.min_distance = self.min_distance
        param_data.get_closest = self.get_closest
        param_data.snps_context_wrapper = snps_context_wrapper
        self.computing_node(param_data, self.computing_node_handler)
    else:
        param_obj = PassingData()
        self.output_node(free_computing_nodes, param_obj, self.output_node_handler)
    self.synchronize()  #to avoid some nodes exiting early
def run(self):
    """
    2008-10-28
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    db = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.db_user, password=self.db_passwd,
                                   hostname=self.hostname, database=self.dbname, schema=self.schema)
    db.setup(create_tables=False)
    session = db.session
    db_id2chr_pos = db.snp_id2chr_pos

    hist_type = CheckCandidateGeneRank.getHistType(self.call_method_id, self.min_distance, self.get_closest,
                                                   self.min_MAF, self.allow_two_sample_overlapping,
                                                   self.results_type, self.null_distribution_type_id)
    snps_context_wrapper = self.dealWithSnpsContextWrapper(self.snps_context_picklef, self.min_distance,
                                                           self.get_closest)
    param_obj = PassingData(call_method_id=self.call_method_id,
                            analysis_method_id=getattr(self, 'analysis_method_id', None),
                            analysis_method_id_ls=getattr(self, 'analysis_method_id_ls', None),
                            phenotype_method_id_ls=getattr(self, 'phenotype_method_id_ls', None),
                            list_type_id_ls=self.list_type_id_ls,
                            results_type=self.results_type,
                            no_check_gene_list=True)
    params_ls = self.generate_params(param_obj)

    pd = PassingData(snps_context_wrapper=snps_context_wrapper,
                     results_directory=self.results_directory,
                     min_MAF=self.min_MAF, get_closest=self.get_closest, min_distance=self.min_distance,
                     no_of_top_snps=self.no_of_top_snps, min_sample_size=self.min_sample_size,
                     test_type_id=self.test_type_id,
                     results_type=self.results_type, no_of_permutations=self.no_of_permutations,
                     no_of_min_breaks=self.no_of_min_breaks, type=hist_type,
                     null_distribution_type_id=self.null_distribution_type_id,
                     allow_two_sample_overlapping=self.allow_two_sample_overlapping, min_score=self.min_score,
                     session=session,
                     commit=self.commit,
                     db_id2chr_pos=db_id2chr_pos)
    for results_id, list_type_id in params_ls:
        pd.list_type_id = list_type_id
        pd.results_id = results_id
        self.pick_candidate_genes(pd)
def run(self):
    """
    2008-07-17
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    db = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.db_user, password=self.db_passwd,
                                   hostname=self.hostname, database=self.dbname, schema=self.schema)
    db.setup(create_tables=False)
    session = db.session
    session.begin()

    snps_context_wrapper = self.dealWithSnpsContextWrapper(self.snps_context_picklef, self.min_distance,
                                                           self.get_closest)
    param_data = PassingData()
    param_data.results_directory = self.input_db_directory
    param_data.default_output_db_directory = self.default_output_db_directory
    param_data.output_db_directory = self.output_db_directory
    param_data.commit = self.commit
    param_data.min_MAF = self.min_MAF
    param_data.min_distance = self.min_distance
    param_data.get_closest = self.get_closest
    param_data.snps_context_wrapper = snps_context_wrapper

    if not self.results_id_ls:
        pdata = PassingData(call_method_id=self.call_method_id, analysis_method_id=self.analysis_method_id,
                            phenotype_method_id_ls=self.phenotype_method_id_ls)
        self.results_id_ls = self.getResultsMethodIDLs(pdata)

    for results_method_id in self.results_id_ls:
        rm = Stock_250kDB.ResultsMethod.get(results_method_id)
        if not rm:
            sys.stderr.write("No results method available for results_method_id=%s.\n" % results_method_id)
            continue
        self.saveResultsByGene(session, rm, param_data)

    if self.commit:
        session.commit()
        session.clear()
    else:
        session.rollback()
def merge_call_on_one_row(cls, ecotypeid_duplicate_index_ls, data_matrix, no_of_cols, NA_set=Set([0, -2])):
    """
    2008-07-11
        calculate the inconsistency ratio among duplicates
    2008-05-12
        -2 is also ruled out, add NA_set
    """
    one_row = numpy.zeros(no_of_cols)
    passingdata = PassingData()
    passingdata.no_of_non_NA_pairs = 0
    passingdata.no_of_non_NA_inconsistent_pairs = 0
    for i in range(no_of_cols):
        call_counter_ls = [0] * 11
        non_NA_call_number_set = Set()
        for index in ecotypeid_duplicate_index_ls:
            call_number = data_matrix[index][i]
            if call_number not in NA_set:  #don't need NA and the untouched bit
                call_counter_ls[call_number] += 1
                non_NA_call_number_set.add(call_number)
        if len(non_NA_call_number_set) > 0:
            passingdata.no_of_non_NA_pairs += 1
            if len(non_NA_call_number_set) > 1:
                passingdata.no_of_non_NA_inconsistent_pairs += 1
        one_row[i] = dbSNP2data.get_majority_call_number(call_counter_ls)
    passingdata.one_row = one_row
    return passingdata
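# A hedged usage sketch for merge_call_on_one_row() (a classmethod); the matrix
# and indices below are invented. Rows 0 and 2 are duplicate arrays of one
# ecotype; calls are integer-coded, with 0 (NA) and -2 (untouched) skipped via
# NA_set.
#
#   data_matrix = numpy.array([[1, 2, 0],
#                              [3, 3, 3],
#                              [1, 4, 0]])
#   pdata = cls.merge_call_on_one_row([0, 2], data_matrix, no_of_cols=3)
#
# Column 0 (calls 1,1) and column 1 (calls 2,4) each count toward
# pdata.no_of_non_NA_pairs (total 2); only column 1 is inconsistent, so
# pdata.no_of_non_NA_inconsistent_pairs == 1. pdata.one_row holds the majority
# call per column as decided by dbSNP2data.get_majority_call_number().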
def summarize_NA_mismatch_ls(self, NA_mismatch_ls_ls, avg_var_name_pair_ls):
    """
    05/12/2008
        called by output_node_handler()
        calculate average NA_rate, mismatch_rate, relative_NA_rate from NA_mismatch_ls_ls
    """
    passingdata = PassingData()
    for avg_var_name_pair in avg_var_name_pair_ls:
        ls_var_name, avg_var_name, std_var_name = avg_var_name_pair
        setattr(passingdata, ls_var_name, [])
        setattr(passingdata, avg_var_name, -1)
        setattr(passingdata, std_var_name, -1)
    for i in range(len(NA_mismatch_ls_ls)):
        NA_mismatch_ls = NA_mismatch_ls_ls[i]
        NA_rate, mismatch_rate, no_of_NAs, no_of_totals, \
            no_of_mismatches, no_of_non_NA_pairs, \
            relative_NA_rate, relative_no_of_NAs, relative_no_of_totals = NA_mismatch_ls
        if NA_rate != -1 and mismatch_rate != -1 and relative_NA_rate != -1:  #all three rates are valid
            passingdata.NA_rate_ls.append(NA_rate)
            passingdata.mismatch_rate_ls.append(mismatch_rate)
            passingdata.relative_NA_rate_ls.append(relative_NA_rate)
    for avg_var_name_pair in avg_var_name_pair_ls:
        ls_var_name, avg_var_name, std_var_name = avg_var_name_pair
        this_ls = getattr(passingdata, ls_var_name)
        sample_size = len(this_ls)
        setattr(passingdata, 'sample_size', sample_size)
        if sample_size > 0:
            setattr(passingdata, avg_var_name, numpy.average(this_ls))
        if sample_size > 1:
            setattr(passingdata, std_var_name, numpy.std(this_ls))
    return passingdata
def testAllPlateIDinPlateSet(self, plate_id_ls, plate_id2plate_set):
    """
    2008-09-12
    """
    plate_set = None
    all_plate_id_in_plate_set = 1  #test whether all plate ids are in previous plate sets or not
    for plate_id in plate_id_ls:
        if plate_id != 0:
            if plate_id not in plate_id2plate_set:
                all_plate_id_in_plate_set = 0
                break
            else:
                if plate_set is None:
                    plate_set = plate_id2plate_set[plate_id]
                elif plate_id2plate_set[plate_id] != plate_set:
                    sys.stderr.write("This plate_id_ls, %s, has >1 plate_sets: %s, %s.\n" %
                                     (repr(plate_id_ls), plate_set, plate_id2plate_set[plate_id]))
                    all_plate_id_in_plate_set = 0
                    break
    return_data = PassingData()
    return_data.all_plate_id_in_plate_set = all_plate_id_in_plate_set
    return_data.plate_set = plate_set
    return return_data
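# A worked example for testAllPlateIDinPlateSet(); all values are invented.
# Plate id 0 marks an empty slot and is skipped.
#
#   plate_id2plate_set = {1: plate_setA, 2: plate_setA, 3: plate_setB}
#   self.testAllPlateIDinPlateSet((1, 2, 0), plate_id2plate_set)
#       -> all_plate_id_in_plate_set=1, plate_set=plate_setA
#   self.testAllPlateIDinPlateSet((1, 5), plate_id2plate_set)
#       -> plate 5 not in any previous set, so all_plate_id_in_plate_set=0
#   self.testAllPlateIDinPlateSet((1, 3), plate_id2plate_set)
#       -> two different plate sets: warning written, all_plate_id_in_plate_set=0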
def findSNPsInRegion(self, snp_info, chromosome, start, stop, center_snp_position=None):
    """
    2008-10-1
        called by plotSNPRegion()
        find SNPs in this region; if center_snp_position is not given, find one.
        similar to getSNPsAroundThisSNP()
    """
    if self.report:
        sys.stderr.write("Getting SNPs in this region ...")
    from DrawSNPRegion import SNPPassingData
    chr_pos_ls = []
    chr_pos2adjacent_window = {}
    j = 0
    midpoint = (start+stop)/2.
    if center_snp_position is None:
        _center_snp_position = start
    else:
        _center_snp_position = center_snp_position
    center_snp = SNPPassingData(chromosome=chromosome, position=_center_snp_position, snps_id=None)
    for i in range(start-1, stop+2):
        new_pos = i
        new_chr_pos = (chromosome, new_pos)
        if new_chr_pos in snp_info.chr_pos2index:
            if center_snp_position is None and abs(new_pos-midpoint) < abs(center_snp.position-midpoint):
                #this SNP is closer to the center
                center_snp.position = new_pos
            chr_pos_ls.append(new_chr_pos)
            if j != 0:
                #add_mid_point(chr_pos_ls, chr_pos2adjacent_window)
                pass
            j += 1
    center_snp.snps_id = '%s_%s' % (center_snp.chromosome, center_snp.position)
    snp_region = PassingData(chr_pos_ls=chr_pos_ls, chr_pos2adjacent_window=chr_pos2adjacent_window,
                             center_snp=center_snp)
    if self.report:
        sys.stderr.write("Done.\n")
    return snp_region
def mapEachInterval(self, workflow=None, inputJobData=None, selectIntervalJobData=None,
                    chromosome=None, intervalData=None,
                    mapEachChromosomeData=None,
                    passingData=None, transferOutput=False, **keywords):
    """
    2013.04.08 use inputJobData
    2012.10.3
        #. extract flanking sequences from the input (ref sequence file => contig ref sequence)
        #. blast them
        #. run FindSNPPositionOnNewRefFromFlankingBlastOutput.py
            #. where hit length matches query length and no. of mismatches <=2 => good => infer new coordinates
        #. output a mapping file between old SNP and new SNP coordinates
        #. reduce this thing by combining everything
        #. make a new input file based on the split input file (replace contig ID and position with
            the new ones; remove the header part regarding chromosomes or replace it)
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    #passingData.intervalFileBasenamePrefix
    #passingData.splitInputFile
    #passingData.unitNumber
    """
    ## 2013.06.19 structures available from passingData, specific to the interval
    passingData.splitInputFile = splitInputFile
    passingData.unitNumber = unitNumber
    passingData.intervalFileBasenamePrefix = '%s_%s_splitInput_u%s' % (chromosome, commonPrefix, unitNumber)
    passingData.noOfIndividuals = jobData.file.noOfIndividuals
    passingData.span = self.intervalSize + self.intervalOverlapSize*2  #2013.06.19 for memory/walltime gauging
    """
    return returnData
def readInSNPID2GenotypeVectorLs(self, inputFname=None, returnType=1):
    """
    returnType
        1: snp_pos2returnData maps each (chromosome, position) to a list of genotype vectors
        2: snp_pos2returnData maps each (chromosome, position) to its record count
    2013.07.19 bugfix
    2013.07.11
    """
    sys.stderr.write("Finding SNPs that have same positions from %s ..." % (inputFname))
    reader = VCFFile(inputFname=inputFname)
    counter = 0
    real_counter = 0
    snp_pos2returnData = {}
    for vcfRecord in reader:
        key = (vcfRecord.chromosome, vcfRecord.position)
        if key not in snp_pos2returnData:
            if returnType == 1:
                snp_pos2returnData[key] = []
            else:
                snp_pos2returnData[key] = 0
        else:
            real_counter += 1

        if returnType == 1:
            snp_pos2returnData[key].append(vcfRecord.data_row[1:])  #[0] is the reference
        else:
            snp_pos2returnData[key] += 1
        counter += 1
    reader.close()
    sys.stderr.write("%s snp coordinates from %s vcf records. %s entries with same positions.\n" %
                     (len(snp_pos2returnData), counter, real_counter))
    return PassingData(snp_pos2returnData=snp_pos2returnData)
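# A usage sketch for readInSNPID2GenotypeVectorLs(); 'input.vcf' is a
# placeholder path. With returnType=1 each (chromosome, position) key maps to
# the genotype vectors of every record at that position, so a duplicated
# position yields a list longer than one; with returnType=2 it maps to the
# record count instead.
#
#   pdata = self.readInSNPID2GenotypeVectorLs(inputFname='input.vcf', returnType=1)
#   for snp_pos, genotypeVectorLs in pdata.snp_pos2returnData.iteritems():
#       if len(genotypeVectorLs) > 1:
#           pass    #this position appears in multiple VCF records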
def avgKey2DataLs(self, key2dataLs, no_of_key_columns=1, header=[]):
    """
    2012.1.9
        1. take mean/median/stdev of every cell in dataLs
        2. modify newHeader to reflect that
    """
    sys.stderr.write("Averaging key2dataLs (%s entries) ..." % (len(key2dataLs)))
    newKey2DataLs = {}
    keyColHeader = header[:no_of_key_columns]
    valueColHeader = header[no_of_key_columns:]
    newValueColHeader = []
    no_of_value_columns = len(valueColHeader)
    for i in xrange(no_of_value_columns):
        valueColName = valueColHeader[i]
        newValueColHeader += ['mean_%s' % (valueColName), 'median_%s' % (valueColName),
                              'stdev_%s' % (valueColName)]

    for key, dataLs in key2dataLs.iteritems():
        if key not in newKey2DataLs:
            newKey2DataLs[key] = []
        no_of_value_columns = len(dataLs)
        for i in xrange(no_of_value_columns):
            meanValue = numpy.mean(dataLs[i])
            medianValue = numpy.median(dataLs[i])
            stdev = numpy.std(dataLs[i])
            newKey2DataLs[key] += [meanValue, medianValue, stdev]
    sys.stderr.write("Done.\n")
    return PassingData(key2dataLs=newKey2DataLs, header=keyColHeader + newValueColHeader)
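# A hypothetical input/output example for avgKey2DataLs(): every value column
# in dataLs is itself a list of numbers to be summarized.
#
#   key2dataLs = {('chr1',): [[0.2, 0.4, 0.6], [10, 20, 30]]}
#   header = ['chromosome', 'pvalue', 'depth']
#   result = self.avgKey2DataLs(key2dataLs, no_of_key_columns=1, header=header)
#
# result.header == ['chromosome', 'mean_pvalue', 'median_pvalue', 'stdev_pvalue',
#                   'mean_depth', 'median_depth', 'stdev_depth']
# result.key2dataLs[('chr1',)] holds the six corresponding statistics,
# e.g. mean_pvalue 0.4 and mean_depth 20.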
def getDataStructureFromSNPsD(self, snpsd):
    """
    05/07/08
    """
    sys.stderr.write("Reading data ...")
    no_of_rows = len(snpsd.positions)
    no_of_cols = len(snpsd.accessions)
    snps = []
    nucs = []
    for i in range(no_of_rows):
        one_snp_ls, symbol2counts = self.get_symbol2counts(snpsd.snps, fixed_index=i,
                                                           no_of_rolls=no_of_cols, by_row=0)
        passingdata = self.get_symbol2MAJ_MIN(symbol2counts)
        if passingdata.symbol2MAJ_MIN == 3:  #sentinel value: more than 2 alleles
            sys.stderr.write("Error: SNP %s (%s) has more than 2 alleles: %s.\n" %
                             (i, snpsd.positions[i], repr(symbol2counts)))
            sys.exit(2)
        map_func = lambda x: passingdata.symbol2MAJ_MIN[x]
        one_snp_ls = map(map_func, one_snp_ls)
        snps.append(''.join(one_snp_ls))
        nucs += [(passingdata.major, passingdata.minor)]
    passingdata = PassingData()
    passingdata.snps = array(snps)
    passingdata.sdps = set(snps)
    passingdata.nucs = array(nucs)
    passingdata.numSamps = no_of_cols
    sys.stderr.write("Done.\n")
    return passingdata.snps, passingdata.sdps, passingdata.nucs, passingdata.numSamps
def get_symbol2MAJ_MIN(self, symbol2counts):
    #construct a dictionary to map input symbols to MAJ, MIN or '?'
    symbol2MAJ_MIN = {self.input_NA_char: '?'}  #'NA' is always '?'
    symbols = symbol2counts.keys()
    if len(symbols) == 0:
        major = ''
        minor = ''
    elif len(symbols) == 1:
        symbol2MAJ_MIN[symbols[0]] = MAJ
        major = symbols[0]
        minor = ''
    elif len(symbols) == 2:
        major, minor = symbols
        if symbol2counts[major] < symbol2counts[minor]:
            minor, major = symbols  #reverse them
        symbol2MAJ_MIN[major] = MAJ
        symbol2MAJ_MIN[minor] = MIN
    elif len(symbols) > 2:
        major, minor = None, None
        symbol2MAJ_MIN = 3
    passingdata = PassingData()
    passingdata.symbol2MAJ_MIN = symbol2MAJ_MIN
    passingdata.major = major
    passingdata.minor = minor
    return passingdata
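# A worked example for get_symbol2MAJ_MIN() (counts invented; MAJ/MIN are the
# module-level allele codes this class already uses):
#
#   symbol2counts = {'A': 12, 'T': 5}
#       -> symbol2MAJ_MIN == {self.input_NA_char: '?', 'A': MAJ, 'T': MIN},
#          major == 'A', minor == 'T' (swapped if counts favor the other symbol)
#   symbol2counts = {'A': 12, 'T': 5, 'G': 1}
#       -> symbol2MAJ_MIN is the sentinel 3, which callers such as
#          getDataStructureFromSNPsD() treat as a >2-allele error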
def reduceEachChromosome(self, workflow=None, chromosome=None, passingData=None, mapEachInputDataLs=None,
                         chromosome2mapEachIntervalDataLs=None,
                         reduceEachInputDataLs=None,
                         transferOutput=True,
                         **keywords):
    """
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    returnData.mapEachInputDataLs = mapEachInputDataLs
    returnData.reduceEachInputDataLs = reduceEachInputDataLs
    #reduce matrix by the chosen column and average p-value
    outputFile = File(os.path.join(self.reduceEachChromosomeDirJob.output,
                                   'chr_%s_LocusLiftOverProbability.tsv.gz' % (chromosome)))
    reduceChromosomeJob = self.addStatMergeJob(statMergeProgram=self.mergeSameHeaderTablesIntoOne,
                                               outputF=outputFile,
                                               parentJobLs=[self.reduceEachChromosomeDirJob], extraOutputLs=None,
                                               extraDependentInputLs=None, transferOutput=False)
    #extraArgumentList=['--keyColumnLs 0-6 --valueColumnLs 7'],
    mapEachIntervalDataLs = chromosome2mapEachIntervalDataLs.get(chromosome)
    for mapEachIntervalData in mapEachIntervalDataLs:
        for jobData in mapEachIntervalData.jobDataLs:
            self.addInputToStatMergeJob(statMergeJob=reduceChromosomeJob, parentJobLs=[jobData.job])

    #add the reduction job to the final stat merge job
    self.addInputToStatMergeJob(statMergeJob=self.reduceJob, parentJobLs=[reduceChromosomeJob])
    return returnData
def returnLocusLowMapQualityIndicator(self, alignedReadLs=None, minMapQGoodRead=2, minFractionOfGoodRead=0.9):
    """
    2013.12.04
    """
    totalNoOfReads = 0
    noOfGoodReads = 0.0
    medianMapQ = -10
    mapQList = []
    for alignedRead in alignedReadLs:
        totalNoOfReads += 1
        mapQList.append(alignedRead.mapq)
        if alignedRead.mapq >= minMapQGoodRead:
            noOfGoodReads += 1
    if totalNoOfReads > 0:
        fractionOfGoodRead = noOfGoodReads/(totalNoOfReads)
        medianMapQ = numpy.median(mapQList)
    else:
        fractionOfGoodRead = -1
        medianMapQ = -10
    if fractionOfGoodRead >= minFractionOfGoodRead:
        locusLowMapQIndicator = 0
    else:
        locusLowMapQIndicator = 2
    return PassingData(locusLowMapQIndicator=locusLowMapQIndicator, totalNoOfReads=totalNoOfReads,
                       noOfGoodReads=noOfGoodReads, fractionOfGoodRead=fractionOfGoodRead,
                       medianMapQ=medianMapQ)
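# A usage sketch for returnLocusLowMapQualityIndicator(), assuming the reads
# come from a pysam pileup column (anything exposing .mapq works); the BAM
# path and thresholds are illustrative only.
#
#   for pileupColumn in pysam.Samfile('input.bam', 'rb').pileup('Contig791'):
#       alignedReadLs = [p.alignment for p in pileupColumn.pileups]
#       locusStat = self.returnLocusLowMapQualityIndicator(alignedReadLs=alignedReadLs,
#                       minMapQGoodRead=30, minFractionOfGoodRead=0.9)
#       if locusStat.locusLowMapQIndicator == 2:
#           pass    #fewer than 90% of reads reach mapQ 30: flag this locus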
def organizeProbesIntoChromosome(cls, xy_ls, chr_pos_ls, probes_id_ls):
    """
    2010-4-29
        add chr_pos2index to map (chr, pos) to its index in chr_pos_ls
    2009-11-24
        split out of calculateProbeQuartilePerChromosome()
        xy_ls, chr_pos_ls, probes_id_ls are already in chromosomal order.
    """
    sys.stderr.write("Getting probes into each chromosome ...")
    chr2xy_ls = {}
    chr2probe_id_ls = {}
    chr_pos2index = {}  #2010-4-29. map (chr, pos) to its index in chr_pos_ls
    for i in range(len(xy_ls)):
        chr, pos = chr_pos_ls[i]
        if chr not in chr2xy_ls:
            chr2xy_ls[chr] = []
            chr2probe_id_ls[chr] = []  #initialize with the start_probe_id
        chr2xy_ls[chr].append(xy_ls[i])
        chr2probe_id_ls[chr].append(probes_id_ls[i])
        chr_pos2index[(chr, pos)] = i
    sys.stderr.write("Done.\n")
    return PassingData(chr2xy_ls=chr2xy_ls, chr2probe_id_ls=chr2probe_id_ls, chr_pos2index=chr_pos2index)
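# A toy input/output example for organizeProbesIntoChromosome() (a classmethod,
# callable on the class itself); the probe data is invented but already in
# chromosomal order, as the docstring requires.
#
#   xy_ls = [(1, 1), (1, 2), (2, 1)]
#   chr_pos_ls = [(1, 100), (1, 200), (2, 50)]
#   probes_id_ls = [11, 12, 13]
#   pdata = cls.organizeProbesIntoChromosome(xy_ls, chr_pos_ls, probes_id_ls)
#
# pdata.chr2xy_ls == {1: [(1, 1), (1, 2)], 2: [(2, 1)]}
# pdata.chr2probe_id_ls == {1: [11, 12], 2: [13]}
# pdata.chr_pos2index == {(1, 100): 0, (1, 200): 1, (2, 50): 2}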
def getScoreRankFromPermIndexLs(self, genome_wide_result, candidate_gene_snp_index_ls,
                                non_candidate_gene_snp_index_ls):
    """
    2008-10-21
    """
    sys.stderr.write("Getting score rank data given index ls ...")
    candidate_score_ls = []
    non_candidate_score_ls = []
    candidate_rank_ls = []
    non_candidate_rank_ls = []
    for index in candidate_gene_snp_index_ls:
        if genome_wide_result.data_obj_ls[index]:
            candidate_score_ls.append(genome_wide_result.data_obj_ls[index].value)
    no_of_candidate_scores = len(candidate_score_ls)
    for index in non_candidate_gene_snp_index_ls:
        if genome_wide_result.data_obj_ls[index]:
            non_candidate_score_ls.append(genome_wide_result.data_obj_ls[index].value)
    total_score_ls = candidate_score_ls + non_candidate_score_ls
    import rpy
    rank_ls = rpy.r.rank(total_score_ls)
    candidate_rank_ls = rank_ls[:no_of_candidate_scores]
    non_candidate_rank_ls = rank_ls[no_of_candidate_scores:]
    score_rank_data = PassingData(candidate_score_ls=candidate_score_ls, candidate_rank_ls=candidate_rank_ls,
                                  non_candidate_score_ls=non_candidate_score_ls,
                                  non_candidate_rank_ls=non_candidate_rank_ls)
    sys.stderr.write("Done.\n")
    return score_rank_data
def openWriteBeagleFiles(self, pedigreeFamilyData=None, outputFnamePrefix=None):
    """
    2013.05.02
        The non-likelihood (unphased, trios, pairs) Beagle format:
            I id sample1 sample1 sample2 sample2
            A diabetes 1 1 2 2
            M rs12082861 C C C C
            M rs4912233 T C C C
            M rs12732823 G A A A
            M rs17451521 C C C C
            M rs12033358 C T T T

        The likelihood version is:
            marker alleleA alleleB 1000_709_1996093_GA_vs_524 1000_709_1996093_GA_vs_524 1000_709_1996093_GA_vs_524 1001_710_1995025_GA_vs_524 1001_710_1995025_GA_vs_524 1001_710_1995025_GA_vs_524 1002_711_2001039_GA_vs_524
            Contig791:1086 C A 0.9693 0.0307 0.0000 0.6660 0.3338 0.0003 0.0000
            Contig791:1649 G C 0.9406 0.0594 0.0000 0.9693 0.0307 0.0000 0.0000
            Contig791:4084 A C 0.9980 0.0020 0.0000 0.9844 0.0156 0.0000 0.0000

        The markers file has this format (markerID, position, alleleA, alleleB):
            Contig791:1086 1086 C A
    """
    sys.stderr.write("Opening beagle files (outputFnamePrefix=%s) to write ..." % (outputFnamePrefix))
    familySize2BeagleFileHandler = {}
    familySize2SampleIDList = pedigreeFamilyData.familySize2SampleIDList
    counter = 0
    for familySize, sampleIDList in familySize2SampleIDList.iteritems():
        if familySize not in familySize2BeagleFileHandler:
            tmpOutputFnamePrefix = '%s_familySize%s' % (outputFnamePrefix, familySize)
            writer = MatrixFile(inputFname='%s.bgl' % (tmpOutputFnamePrefix), openMode='w', delimiter=' ')
            familySize2BeagleFileHandler[familySize] = writer
            if familySize == 1:
                headerRow = ['marker', 'alleleA', 'alleleB']
            else:
                headerRow = ['I', 'id']
            for sampleID in sampleIDList:
                if familySize == 1:
                    #the likelihood format repeats each sample name three times, rather than two
                    headerRow.extend([sampleID] * 3)
                else:
                    headerRow.extend([sampleID] * 2)
            writer.writeHeader(headerRow)
            counter += 1
    markersFile = MatrixFile(inputFname='%s.markers' % (outputFnamePrefix), openMode='w', delimiter=' ')
    counter += 1
    sys.stderr.write("%s files outputted.\n" % (counter))
    return PassingData(familySize2BeagleFileHandler=familySize2BeagleFileHandler,
                       markersFile=markersFile)
def estimateMeanStdFromData(dataVector=None, excludeTopFraction=0.2):
    """
    2012.10.14 adapted from vervet/src/pedigree/DetectWrongLabelByCompKinshipVsIBD.DetectWrongLabelByCompKinshipVsIBD.estimateAbsDeltaMeanStd()
    2012.8.22
    """
    sys.stderr.write("Estimating mean&std using the middle %.1f%% of data (n=%s) ..." %
                     ((1-excludeTopFraction)*100, len(dataVector)))
    noOfRows = len(dataVector)
    import numpy
    # 2012.8.22 draw some histogram to check what data looks like
    #if len(dataVector) > 10:
    #    outputFname = '%s_kinship_ibd_hist.png' % (self.outputFnamePrefix)
    #    yh_matplotlib.drawHist(dataVector, title='',
    #                           xlabel_1D="kinship-ibd", xticks=None,
    #                           outputFname=outputFname, min_no_of_data_points=10,
    #                           needLog=True,
    #                           dpi=200, min_no_of_bins=25)
    #dataVector = map(abs, dataVector)  #2012.8.23 no abs
    dataVector.sort()
    #trim excludeTopFraction/2 from each end so only the middle portion remains; clamp at 0
    startIndex = max(0, int(len(dataVector) * (excludeTopFraction / 2)) - 1)
    stopIndex = int(len(dataVector) * (1 - excludeTopFraction / 2))
    dataVector = dataVector[startIndex:stopIndex]

    data_mean = numpy.mean(dataVector)
    data_std = numpy.std(dataVector)
    sys.stderr.write(" mean=%.3f, std=%.3f.\n" % (data_mean, data_std))
    return PassingData(mean=data_mean, std=data_std)
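# A minimal, self-contained demo of estimateMeanStdFromData() (a module-level
# function). The data below is synthetic: mostly N(0,1) draws plus a few
# outliers that trimming the top/bottom 10% should discard.
if __name__ == '__main__':
    import random
    random.seed(0)
    demoVector = [random.gauss(0, 1) for i in xrange(100)] + [15.0, 18.5, -20.2]
    trimmedStat = estimateMeanStdFromData(dataVector=demoVector, excludeTopFraction=0.2)
    print 'trimmed mean=%.3f, std=%.3f' % (trimmedStat.mean, trimmedStat.std)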
def get_strain_id_info(self, QC_method_id, ignore_strains_with_qc=True):
    """
    2008-08-18
        generate the data structures related to strain_id, in preparation for getting data_matrix
        only include strains not yet QCed; link strain_id to tg_ecotypeid
    """
    sys.stderr.write("Getting strain_id info ... ")
    strain_id2index = {}
    strain_id_list = []
    strain_id2acc = {}
    strain_id2category = {}
    rows = StockDB.Strain.query.all()
    for row in rows:
        if ignore_strains_with_qc:
            ignore_this = 0
            for call_qc in row.call_qc_ls:
                if call_qc.qc_method_id == QC_method_id:  #QC already done
                    ignore_this = 1
                    break
            if ignore_this:
                continue
        strain_id = row.id
        strain_index = len(strain_id_list)
        strain_id_list.append(strain_id)
        strain_id2index[strain_id] = strain_index
        strain_id2acc[strain_id] = row.ecotypeid_strainid2tg_ecotypeid.tg_ecotypeid
        strain_id2category[strain_id] = strain_id
    passingdata = PassingData(strain_id2index=strain_id2index, strain_id_list=strain_id_list,
                              strain_id2acc=strain_id2acc,
                              strain_id2category=strain_id2category)
    sys.stderr.write("%s strains. Done.\n" % (len(strain_id_list)))
    return passingdata
def computing_node_handler(self, communicator, data, param_obj):
    """
    2009-9-16
        parameter test_type is renamed to test_type_id
    2008-08-20
        wrap all parameters into pd and pass it to run_wilcox_test
    2008-07-17
    """
    node_rank = communicator.rank
    sys.stderr.write("Node no.%s working...\n" % node_rank)
    data = cPickle.loads(data)
    result_ls = []
    pd = PassingData(snps_context_wrapper=param_obj.snps_context_wrapper,
                     results_directory=param_obj.results_directory,
                     min_MAF=param_obj.min_MAF, get_closest=self.get_closest, min_distance=self.min_distance,
                     min_sample_size=self.min_sample_size, test_type_id=self.test_type_id,
                     results_type=self.results_type, no_of_permutations=self.no_of_permutations,
                     no_of_min_breaks=self.no_of_min_breaks)
    for results_method_id, list_type_id in data:
        pd.results_id = results_method_id
        pd.list_type_id = list_type_id
        result = self.run_wilcox_test(pd)
        if result is not None:
            result_ls.append(result)
    sys.stderr.write("Node no.%s done with %s results.\n" % (node_rank, len(result_ls)))
    return result_ls
def getGenomeWideResult(self, call_method_id, phenotype_method_id, analysis_method_id):
    rows = Stock_250kDB.ResultsMethod.query.filter_by(call_method_id=call_method_id).\
        filter_by(analysis_method_id=analysis_method_id).\
        filter_by(phenotype_method_id=phenotype_method_id).filter_by(results_method_type_id=1)
    pdata = PassingData()
    if rows.count() == 1:
        rm = rows.first()
    elif rows.count() == 0:
        sys.stderr.write("No result fetched from db based on call_method_id=%s, analysis_method_id=%s, phenotype_method_id=%s.\n" %
                         (call_method_id, analysis_method_id, phenotype_method_id))
        rm = None
    else:
        sys.stderr.write("First result out of %s results fetched from db based on call_method_id=%s, analysis_method_id=%s, phenotype_method_id=%s.\n" %
                         (rows.count(), call_method_id, analysis_method_id, phenotype_method_id))
        rm = rows.first()
    if rm:
        input_fname = rm.filename
        pdata.gwr_name = '%s_%s_%s' % (rm.analysis_method.short_name, rm.phenotype_method_id,
                                       rm.phenotype_method.short_name)
    else:
        return None
    genome_wide_result = getGenomeWideResultFromFile(input_fname, min_value_cutoff=None,
                                                     do_log10_transformation=True, pdata=pdata)
    return genome_wide_result
def get_no_of_top_snps_info(cls, db, from_where_clause):
    """
    2008-11-04
        a no_of_top_snps value may occur twice due to float differences in min_score
    2008-10-23
    """
    sys.stderr.write("Getting no_of_top_snps_info ...")
    rows = db.metadata.bind.execute("select distinct t.no_of_top_snps, t.min_score %s order by no_of_top_snps" %
                                    from_where_clause)
    id_ls = []
    id2index = {}
    label_ls = []
    no_of_separators = 0
    for row in rows:
        if row.no_of_top_snps not in id2index:
            #there's a chance it occurs twice due to float difference in min_score
            id2index[row.no_of_top_snps] = len(id_ls)
            id_ls.append(row.no_of_top_snps)
            label_ls.append('%s %s' % (row.no_of_top_snps, row.min_score))
    list_info = PassingData()
    list_info.id2index = id2index
    list_info.id_ls = id_ls
    list_info.label_ls = label_ls
    sys.stderr.write("Done.\n")
    return list_info
def run(self):
    if self.debug:
        import pdb
        pdb.set_trace()
    db = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.db_user, password=self.db_passwd,
                                   hostname=self.hostname, database=self.dbname, schema=self.schema)
    db.setup(create_tables=False)
    session = db.session

    array_id2median_intensity = self.get_array_id2median_intensity(
        min_array_median_intensity=self.min_array_median_intensity)
    arrays_to_form_model = self.getModelArrays(db, self.training_cnv_method_id, array_id2median_intensity)
    if self.debug:
        # 2010-7-25 for debug, temporary
        arrays_to_form_model = arrays_to_form_model[:4]
    array_id2model = self.constructSVMModels(db, arrays_to_form_model, array_id2median_intensity,
                                             minPercUnCoveredByLerContig=self.minPercUnCoveredByLerContig,
                                             cnv_method_id=self.training_cnv_method_id,
                                             C=self.SVM_C, gamma=self.SVM_gamma, eps=self.SVM_eps,
                                             deletedFractionType=self.deletedFractionType)
    array_id2model_array_id_ls = self.mapAnyArray2ModelArray(array_id2median_intensity, array_id2model,
                                                             max_median_intensity_dist=self.max_median_intensity_dist,
                                                             minNoOfModelArrays=self.minNoOfModelArrays)
    param_obj = PassingData(session=session, no_of_total=0, no_of_into_db=0, report=self.report,
                            cnv_method_id=self.cnv_method_id, cnv_type_id=self.cnv_type_id)
    self.predictALLSegments(self.input_fname, array_id2model_array_id_ls, array_id2model,
                            max_amplitude=self.max_amplitude, param_obj=param_obj)
    session.flush()
    session.expunge_all()
    session.commit()
def loadDataStructure(self, gene_annotation_picklef, min_MAF=0, min_distance=20000,
                      list_type_id=None, snp_matrix_fname=None, snp_matrix_data_type=1):
    """
    2009-5-30
        add argument snp_matrix_fname
    2008-11-25
    2008-10-01
        wrap a few functions up, convenient for both run() and drawSNPRegion()
    """
    db = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.db_user, password=self.db_passwd,
                                   hostname=self.hostname, database=self.dbname, schema=self.schema)
    db.setup(create_tables=False)
    self.db = db
    snp_info = self.getSNPInfo(db)
    gene_annotation = self.dealWithGeneAnnotation(gene_annotation_picklef)
    if list_type_id:
        candidate_gene_list = self.getGeneList(list_type_id)
        candidate_gene_set = Set(candidate_gene_list)
    else:
        candidate_gene_set = Set()
    if snp_matrix_fname:
        if snp_matrix_data_type == 3:
            matrix_data_type = float  #2009-3-23 for CNV amplitude file
        else:
            matrix_data_type = int
        snpData = SNPData(input_fname=snp_matrix_fname, turn_into_integer=1, turn_into_array=1,
                          ignore_2nd_column=1, matrix_data_type=matrix_data_type)
        #2008-12-05 fake a snp_info for findSNPsInRegion
        self.construct_chr_pos2index_forSNPData(snpData)
    else:
        snpData = None
    return_data = PassingData(gene_annotation=gene_annotation, snp_info=snp_info,
                              candidate_gene_set=candidate_gene_set, snpData=snpData)
    return return_data
def getAlignmentMatrix(self, alignment_id):
    sys.stderr.write("Getting alignment matrix for alignment=%s ..." % (alignment_id))
    snp_pos_ls = []
    accession_id_ls = []
    name_ls = []
    data_matrix = []
    rows = Sequence.query.filter_by(alignment=alignment_id).order_by(Sequence.accession).all()
    counter = 0
    for row in rows:
        if counter == 0:
            snp_pos_ls = self.get_snp_pos_ls(row.alignment_obj.target, row.alignment_obj.chromosome,
                                             row.alignment_obj.start)
        accession_id_ls.append(row.accession)
        name_ls.append(row.accession_obj.name)
        data_row = dict_map(nt2number, row.bases)
        data_matrix.append(data_row)
        counter += 1
    data_matrix = num.array(data_matrix, num.int8)
    passingdata = PassingData(snp_pos_ls=snp_pos_ls, accession_id_ls=accession_id_ls, name_ls=name_ls,
                              data_matrix=data_matrix)
    sys.stderr.write(' %s accessions, %s bases. Done.\n' % (len(accession_id_ls), len(snp_pos_ls)))
    return passingdata
def mapEachAlignment(self, workflow=None, alignmentData=None, passingData=None, transferOutput=True, **keywords):
    """
    2012.9.22
        similar to reduceBeforeEachAlignmentData() but for mapping programs that run on one alignment each.

        structures available from passingData:
            passingData.AlignmentJobAndOutputLs = []
            passingData.bamFnamePrefix = bamFnamePrefix
            passingData.individual_alignment = alignment
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []

    topOutputDirJob = passingData.topOutputDirJob
    refFastaF = passingData.refFastaFList[0]

    alignment = alignmentData.alignment
    parentJobLs = alignmentData.jobLs
    bamF = alignmentData.bamF
    baiF = alignmentData.baiF
    bamFnamePrefix = alignment.getReadGroup()
    return returnData
def computing_node_handler(self, communicator, data, computing_parameter_obj):
    """
    2008-09-10
        add source_id to PassingData
    2008-08-28
    """
    node_rank = communicator.rank
    sys.stderr.write("Node no.%s working...\n" % node_rank)
    data = cPickle.loads(data)
    result_ls = []
    twoSNPData = computing_parameter_obj.twoSNPData
    QC_method_id = computing_parameter_obj.QC_method_id
    for row_id1, row_id2 in data:
        NA_rate, mismatch_rate, no_of_NAs, no_of_totals, no_of_mismatches, no_of_non_NA_pairs = \
            twoSNPData.cmpOneRow(row_id1, row_id2)
        #the 2nd position in the row-id1 tuple is the strain id
        if QC_method_id == 4:
            #the 2nd position in the row-id2 tuple is the strain id
            target_id = row_id2[1]
        else:
            target_id = row_id2
        qc_cross_match = PassingData(source_id=row_id1[0], strainid=row_id1[1], target_id=target_id,
                                     mismatch_rate=mismatch_rate,
                                     no_of_mismatches=no_of_mismatches, no_of_non_NA_pairs=no_of_non_NA_pairs)
        result_ls.append(qc_cross_match)
    sys.stderr.write("Node no.%s done with %s results.\n" % (node_rank, len(result_ls)))
    return result_ls