Example #1
	def orderListTypeAnalysisMethodID(self, list_type_id_ls, analysis_method_id_ls):
		"""
		2008-08-29
			deal with separator (list_type_id=-1) in list_type_id_ls
		"""
		sys.stderr.write("Orderinig list type id and analysis_method id ... ")
		list_type_id_analysis_method_id_ls = []
		list_type_id_analysis_method_id2index = {}
		list_type_analysis_method_label_ls = []
		no_of_separators = 0
		for list_type_id in list_type_id_ls:
			if list_type_id==-1:	#separator
				no_of_separators += 1
				tup = (-no_of_separators,-1)
				list_type_id_analysis_method_id2index[tup] = len(list_type_id_analysis_method_id_ls)
				list_type_id_analysis_method_id_ls.append(tup)
				list_type_analysis_method_label_ls.append('')
				continue
			list_type_short_name = GeneListType.get(list_type_id).short_name
			for analysis_method_id in analysis_method_id_ls:
				analysis_method_short_name = AnalysisMethod.get(analysis_method_id).short_name
				tup = (list_type_id, analysis_method_id)
				list_type_id_analysis_method_id2index[tup] = len(list_type_id_analysis_method_id_ls)
				list_type_id_analysis_method_id_ls.append(tup)
				list_type_analysis_method_label_ls.append('%s_%s_%s'%(analysis_method_short_name, list_type_short_name, list_type_id))
		return_data = PassingData()
		return_data.list_type_id_analysis_method_id_ls = list_type_id_analysis_method_id_ls
		return_data.list_type_id_analysis_method_id2index = list_type_id_analysis_method_id2index
		return_data.list_type_analysis_method_label_ls = list_type_analysis_method_label_ls
		sys.stderr.write("Done.\n")
		return return_data
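Nearly every example on this page builds and returns a PassingData object. A minimal stand-in, written only from how the class is used in these snippets (keyword construction plus free attribute assignment), could look like the sketch below; the real pymodule class may carry extra behaviour.

# Minimal sketch of a PassingData-like attribute bag; an assumption based on
# its usage in these examples, not the actual pymodule implementation.
class PassingData(object):
	def __init__(self, **keywords):
		for name, value in keywords.items():
			setattr(self, name, value)

return_data = PassingData(no_of_jobs=0)	#keyword construction, as in preReduce()
return_data.jobDataLs = []	#free attribute assignment, as in most examples
print('%s %s' % (return_data.no_of_jobs, return_data.jobDataLs))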
Example #2
	def get_symbol2MAJ_MIN(self, symbol2counts):
		#construct a dictionary to map input symbols to MAJ, MIN or '?'
		symbol2MAJ_MIN = {self.input_NA_char:'?'}	#'NA' is always '?'
		symbols = symbol2counts.keys()
		if len(symbols) == 0:
			major = ''
			minor = ''
		elif len(symbols) == 1:
			symbol2MAJ_MIN[symbols[0]] = MAJ
			major = symbols[0]
			minor = ''
		elif len(symbols) ==2:
			major, minor = symbols
			if symbol2counts[major]<symbol2counts[minor]:
				minor, major = symbols	#reverse them
			symbol2MAJ_MIN[major] = MAJ
			symbol2MAJ_MIN[minor] = MIN
		elif len(symbols)>2:
			major, minor = None, None
			symbol2MAJ_MIN = 3	#sentinel for >2 alleles; callers (e.g. getDataStructureFromSNPsD) check for this
		passingdata = PassingData()
		passingdata.symbol2MAJ_MIN = symbol2MAJ_MIN
		passingdata.major = major
		passingdata.minor = minor
		return passingdata
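For illustration, here is a self-contained sketch of the two-allele branch above, with MAJ and MIN replaced by hypothetical marker constants (in the real module they are defined elsewhere):

MAJ, MIN = 'M', 'm'	#hypothetical stand-ins for the module-level constants

def pick_major_minor(symbol2counts):
	#assumes exactly two observed symbols, as in the len(symbols)==2 branch above
	symbols = list(symbol2counts.keys())
	major, minor = symbols
	if symbol2counts[major] < symbol2counts[minor]:
		minor, major = major, minor	#reverse them so major has the higher count
	return {major: MAJ, minor: MIN}, major, minor

print(pick_major_minor({'A': 120, 'T': 35}))	#'A' maps to MAJ, 'T' to MIN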
Example #3
	def getStrainIDInfo(self, db, strain_id_info_query, strain_id_set=None):
		"""
		2008-08-29
		"""
		sys.stderr.write("Getting strain id info ...")
		rows = db.metadata.bind.execute(strain_id_info_query)
		strain_id_ls = []
		strain_id2index = {}
		strain_label_ls = []
		prev_country_abbr = None
		no_of_separators = 0
		for row in rows:
			if strain_id_set and row.strainid not in strain_id_set:	#skip
				continue
			if prev_country_abbr == None:
				prev_country_abbr = row.abbr
			elif row.abbr!=prev_country_abbr:
				prev_country_abbr = row.abbr
				no_of_separators += 1
				strain_id2index[-no_of_separators] = len(strain_id_ls)
				strain_id_ls.append(-no_of_separators)
				strain_label_ls.append('')
			strain_id2index[row.strainid] = len(strain_id_ls)
			strain_id_ls.append(row.strainid)
			if len(row.sitename)>10:
				sitename = row.sitename[:10]
			else:
				sitename = row.sitename
			strain_label_ls.append('%s_%s_%s_%s'%(row.abbr, sitename, row.nativename, row.strainid))
		strain_id_info = PassingData()
		strain_id_info.strain_id_ls = strain_id_ls
		strain_id_info.strain_id2index = strain_id2index
		strain_id_info.strain_label_ls = strain_label_ls
		sys.stderr.write("Done.\n")
		return strain_id_info
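The same separator bookkeeping (a fresh negative id and an empty label whenever the grouping key changes) recurs in getPhenotypeInfo and getListTypeInfo further down; a condensed, hedged sketch of just that pattern with made-up (group, id) pairs:

# Condensed sketch of the separator pattern shared by getStrainIDInfo,
# getPhenotypeInfo and getListTypeInfo: when the grouping key changes,
# a negative id with an empty label is appended as a visual divider.
def insert_separators(group_id_pairs):
	id_ls, id2index, label_ls = [], {}, []
	prev_group, no_of_separators = None, 0
	for group, entity_id in group_id_pairs:
		if prev_group is not None and group != prev_group:
			no_of_separators += 1
			id2index[-no_of_separators] = len(id_ls)
			id_ls.append(-no_of_separators)
			label_ls.append('')
		prev_group = group
		id2index[entity_id] = len(id_ls)
		id_ls.append(entity_id)
		label_ls.append('id_%s' % entity_id)
	return id_ls, id2index, label_ls

print(insert_separators([('FR', 11), ('FR', 12), ('DE', 31)])[0])	#[11, 12, -1, 31]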
Example #4
	def computing_node_handler(self, communicator, data, param_obj):
		"""
		2009-9-16
			parameter test_type is renamed to test_type_id
		2008-08-20
			wrap all parameters into pd and pass it to run_wilcox_test
		2008-07-17
		
		"""
		node_rank = communicator.rank
		sys.stderr.write("Node no.%s working...\n"%node_rank)
		data = cPickle.loads(data)
		result_ls = []
		pd = PassingData(snps_context_wrapper=param_obj.snps_context_wrapper,\
							results_directory=param_obj.results_directory,\
							min_MAF=param_obj.min_MAF, get_closest=self.get_closest, min_distance=self.min_distance, \
							min_sample_size=self.min_sample_size, test_type_id=self.test_type_id, \
							results_type=self.results_type, no_of_permutations=self.no_of_permutations,\
							no_of_min_breaks=self.no_of_min_breaks)
		for results_method_id, list_type_id in data:
			pd.results_id = results_method_id
			pd.list_type_id = list_type_id
			result = self.run_wilcox_test(pd)
			if result is not None:
				result_ls.append(result)
		sys.stderr.write("Node no.%s done with %s results.\n"%(node_rank, len(result_ls)))
		return result_ls
	def returnGeneSegments(self, db, elem=None, gene_commentary=None, commentary_type=None):
		"""
		2012.5.15
			add argument commentary_type to stop replicating gene_commentary.gene_commentary_type
		2008-07-28
		"""
		start_ls, stop_ls, gi_ls = self.return_location_list(elem)
		gene_segments = []
		min_start = start_ls[0]
		max_stop = stop_ls[0]
		if commentary_type:
			gene_commentary_type = db.getGeneCommentaryType(commentary_type=commentary_type)
		else:
			gene_commentary_type = gene_commentary.gene_commentary_type
		for i in range(len(start_ls)):
			start = start_ls[i]
			stop = stop_ls[i]
			min_start_stop = min(start, stop)
			max_start_stop = max(start, stop)
			if min_start_stop < min_start:
				min_start = min_start_stop
			if max_start_stop > max_stop:
				max_stop = max_start_stop
			gi = gi_ls[i]
			gene_segment = GeneSegment(start=start, stop=stop, gi=gi, gene_commentary_type=gene_commentary_type)
			gene_segment.gene_commentary = gene_commentary
			gene_segments.append(gene_segment)
		passingdata = PassingData()
		passingdata.gene_segments = gene_segments
		passingdata.start = min_start
		passingdata.stop = max_stop
		return passingdata
Example #6
	def getPhenotypeInfo(self, db,  where_condition):
		"""
		2008-08-29
			add -1 as a separator into phenotype_method_id_ls and others
		"""
		sys.stderr.write("Getting phenotype method info ...")
		rows = db.metadata.bind.execute("select distinct r.phenotype_method_id, p.biology_category_id from %s p, %s and p.id=r.phenotype_method_id order by p.biology_category_id, r.phenotype_method_id"\
								%(PhenotypeMethod.table.name, where_condition))
		phenotype_method_id_ls = []
		phenotype_method_id2index = {}
		phenotype_method_label_ls = []
		prev_biology_category_id = None
		no_of_separators = 0
		for row in rows:
			if prev_biology_category_id == None:
				prev_biology_category_id = row.biology_category_id
			elif row.biology_category_id!=prev_biology_category_id:
				prev_biology_category_id = row.biology_category_id
				#add a blank phenotype id as separator
				no_of_separators += 1
				phenotype_method_id2index[-no_of_separators] = len(phenotype_method_id_ls)
				phenotype_method_id_ls.append(-no_of_separators)
				phenotype_method_label_ls.append('')
			phenotype_method_id2index[row.phenotype_method_id] = len(phenotype_method_id_ls)
			phenotype_method_id_ls.append(row.phenotype_method_id)
			pm = PhenotypeMethod.get(row.phenotype_method_id)
			phenotype_method_label_ls.append('%s_%s'%(pm.id, pm.short_name))
		phenotype_info = PassingData()
		phenotype_info.phenotype_method_id2index = phenotype_method_id2index
		phenotype_info.phenotype_method_id_ls = phenotype_method_id_ls
		phenotype_info.phenotype_method_label_ls = phenotype_method_label_ls
		sys.stderr.write("Done.\n")
		return phenotype_info
Example #7
    def getCallMethodInfo(cls, affiliated_table_name, extra_condition=None, extra_tables=None):
        """
		2009-1-30
			similar to getPhenotypeInfo, getListTypeInfo, getAnalysisMethodInfo
		"""
        table_str = "%s s, %s p" % (affiliated_table_name, model.Stock_250kDB.CallMethod.table.name)
        if extra_tables:
            table_str += ", %s" % extra_tables
        where_condition = "p.id=s.call_method_id"
        if extra_condition:
            where_condition += " and %s" % extra_condition
        rows = model.db.metadata.bind.execute(
            "select distinct p.id, p.short_name from %s \
			where %s order by p.id"
            % (table_str, where_condition)
        )
        id_ls = []
        id2index = {}
        label_ls = []
        prev_biology_category_id = -1
        no_of_separators = 0
        for row in rows:
            id2index[row.id] = len(id_ls)
            id_ls.append(row.id)
            label_ls.append("%s %s" % (row.id, row.short_name))
        list_info = PassingData()
        list_info.id2index = id2index
        list_info.id_ls = id_ls
        list_info.label_ls = label_ls
        return list_info
Example #8
	def getDataStructureFromSNPsD(self, snpsd):
		"""
		05/07/08
		"""
		sys.stderr.write("Reading data ...")
		no_of_rows = len(snpsd.positions)
		no_of_cols = len(snpsd.accessions)
		snps = []
		nucs = []
		for i in range(no_of_rows):
			one_snp_ls, symbol2counts = self.get_symbol2counts(snpsd.snps, fixed_index=i, no_of_rolls=no_of_cols, by_row=0)
			
			passingdata = self.get_symbol2MAJ_MIN(symbol2counts)
			if passingdata.symbol2MAJ_MIN==3:
				sys.stderr.write("Error: SNP %s (%s) has more than 2 alleles: %s.\n"%(i, snpsd.positions[i], repr(symbol2counts)))
				sys.exit(2)
			
			map_func = lambda x: passingdata.symbol2MAJ_MIN[x]
			one_snp_ls = map(map_func, one_snp_ls)
			
			snps.append(''.join(one_snp_ls))
			nucs += [(passingdata.major, passingdata.minor)]
		passingdata = PassingData()
		passingdata.snps  = array(snps)
		passingdata.sdps = Set(snps)
		passingdata.nucs = array(nucs)
		passingdata.numSamps = no_of_cols
		sys.stderr.write("Done.\n")
		return passingdata.snps, passingdata.sdps, passingdata.nucs, passingdata.numSamps
Example #9
	def merge_call_on_one_row(cls, ecotypeid_duplicate_index_ls, data_matrix, no_of_cols, NA_set=Set([0, -2])):
		"""
		2008-07-11
			calculate the inconsistency ratio among duplicates
		2008-05-12
			-2 is also ruled out, add NA_set
		"""
		one_row = numpy.zeros(no_of_cols)
		passingdata = PassingData()
		passingdata.no_of_non_NA_pairs = 0
		passingdata.no_of_non_NA_inconsistent_pairs = 0
		for i in range(no_of_cols):
			call_counter_ls = [0]*11
			non_NA_call_number_set = Set()
			for index in ecotypeid_duplicate_index_ls:
				call_number = data_matrix[index][i]
				if call_number not in NA_set:	#don't need NA and non-touched bit
					call_counter_ls[call_number] += 1
					non_NA_call_number_set.add(call_number)
			if len(non_NA_call_number_set)>0:
				passingdata.no_of_non_NA_pairs += 1
				if len(non_NA_call_number_set)>1:
					passingdata.no_of_non_NA_inconsistent_pairs += 1
			one_row[i] = dbSNP2data.get_majority_call_number(call_counter_ls)
		passingdata.one_row = one_row
		return passingdata
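A toy, hedged rendering of the per-column merging above: non-NA duplicate calls that disagree count as an inconsistent pair, and the plain majority vote below is only a stand-in for dbSNP2data.get_majority_call_number():

# Toy illustration of duplicate merging; the majority vote is a stand-in,
# not the real dbSNP2data.get_majority_call_number().
def merge_duplicates(rows, NA_set=frozenset([0, -2])):
	no_of_cols = len(rows[0])
	merged, non_NA_pairs, inconsistent_pairs = [], 0, 0
	for i in range(no_of_cols):
		calls = [row[i] for row in rows if row[i] not in NA_set]
		if calls:
			non_NA_pairs += 1
			if len(set(calls)) > 1:
				inconsistent_pairs += 1
			merged.append(max(set(calls), key=calls.count))	#majority vote
		else:
			merged.append(0)	#all duplicates are NA
	return merged, non_NA_pairs, inconsistent_pairs

print(merge_duplicates([[1, 2, 0], [1, 3, 0], [1, 3, 0]]))	#([1, 3, 0], 2, 1)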
Example #10
	def get_data_matrix(self, db, phenotype_info, list_type_analysis_method_info, where_condition):
		sys.stderr.write("Getting data matrix ...")
		data_matrix = num.zeros([len(list_type_analysis_method_info.list_type_id_analysis_method_id2index), len(phenotype_info.phenotype_method_id2index)], num.float)
		data_matrix[:] = -1
		i = 0
		rows = db.metadata.bind.execute("select r.analysis_method_id, r.phenotype_method_id, c.* from %s order by analysis_method_id"\
								%(where_condition))
		min_value = None
		max_value = None
		for row in rows:
			tup = (row.list_type_id, row.analysis_method_id)
			row_index = list_type_analysis_method_info.list_type_id_analysis_method_id2index[tup]
			col_index = phenotype_info.phenotype_method_id2index[row.phenotype_method_id]
			if row.pvalue>0:
				data_value = -math.log10(row.pvalue)
				if min_value==None:
					min_value = data_value
				elif data_value<min_value:
					min_value = data_value
				
				if max_value==None:
					max_value=data_value
				elif data_value>max_value:
					max_value =data_value
			else:
				data_value = -2	#0 pvalue
			data_matrix[row_index, col_index] = data_value
		sys.stderr.write("Done.\n")
		return_data = PassingData()
		return_data.data_matrix = data_matrix
		return_data.min_value = min_value
		return_data.max_value = max_value
		return return_data
	def preReduce(self, workflow=None, passingData=None, transferOutput=True, **keywords):
		"""
		2013.2.10
		"""
		returnData = PassingData(no_of_jobs = 0)
		returnData.jobDataLs = []
		
		return returnData
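In get_data_matrix above, each p-value becomes a heat-map intensity via -log10, with zero p-values flagged by the sentinel -2; a compact hedged sketch of just that step:

import math

# Sketch of the p-value -> matrix-value transform: -log10(p) for positive
# p-values, the sentinel -2 for p == 0 (which would otherwise be infinite).
def pvalue_to_data_value(pvalue):
	if pvalue > 0:
		return -math.log10(pvalue)
	return -2	#0 pvalue

print(pvalue_to_data_value(1e-8))	#8.0
print(pvalue_to_data_value(0))	#-2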
Example #12
    def getPhenotypeInfo(
        cls, affiliated_table_name=None, extra_condition=None, extra_tables=None, with_category_separator=True
    ):
        """
		2009-12-1
			add argument with_category_separator
		2008-10-30
			affiliated_table_name becomes optional
		2008-10-19
			add option extra_tables
		2008-10-16
			sort phenotype by biology_category_id and return other info as well
		"""
        if affiliated_table_name:
            table_str = "%s s, %s p" % (affiliated_table_name, model.Stock_250kDB.PhenotypeMethod.table.name)
            where_condition = ["p.id=s.phenotype_method_id"]
        else:
            table_str = "%s p" % (model.Stock_250kDB.PhenotypeMethod.table.name)
            where_condition = []

        if extra_tables:
            table_str += ", %s" % extra_tables
        if extra_condition:
            where_condition.append(extra_condition)

        if where_condition:  # 2009-3-9
            where_condition = "where " + " and ".join(where_condition)
        else:
            where_condition = ""

        rows = model.db.metadata.bind.execute(
            "select distinct p.id, p.biology_category_id, p.short_name from %s\
				%s order by p.biology_category_id, p.id"
            % (table_str, where_condition)
        )
        phenotype_method_id_ls = []
        phenotype_method_id2index = {}
        phenotype_method_label_ls = []
        prev_biology_category_id = -1
        no_of_separators = 0
        for row in rows:
            if prev_biology_category_id == -1:
                prev_biology_category_id = row.biology_category_id
            elif with_category_separator and row.biology_category_id != prev_biology_category_id:
                prev_biology_category_id = row.biology_category_id
                # add a blank phenotype id as separator
                no_of_separators += 1
                phenotype_method_id2index[-no_of_separators] = len(phenotype_method_id_ls)
                phenotype_method_id_ls.append(-no_of_separators)
                phenotype_method_label_ls.append("=====")
            phenotype_method_id2index[row.id] = len(phenotype_method_id_ls)
            phenotype_method_id_ls.append(row.id)
            phenotype_method_label_ls.append("%s %s" % (row.id, row.short_name))
        phenotype_info = PassingData()
        phenotype_info.phenotype_method_id2index = phenotype_method_id2index
        phenotype_info.phenotype_method_id_ls = phenotype_method_id_ls
        phenotype_info.phenotype_method_label_ls = phenotype_method_label_ls
        return phenotype_info
Example #13
    def getListTypeInfo(cls, affiliated_table_name=None, extra_condition=None, extra_tables=None):
        """
		2009-3-9
			handle the case in which there is no the where_condition at all.
		2008-10-30
			affiliated_table_name becomes optional
		2008-10-19
			add option extra_tables
		2008-10-16
			sort gene list type by biology_category_id and return other info as well
			add -1 as a separator into list_type_id_ls
		"""
        if affiliated_table_name:
            table_str = "%s s, %s p" % (affiliated_table_name, model.Stock_250kDB.GeneListType.table.name)
            where_condition = ["p.id=s.list_type_id"]
        else:
            table_str = "%s p" % (model.Stock_250kDB.GeneListType.table.name)
            where_condition = []

        if extra_tables:
            table_str += ", %s" % extra_tables

        if extra_condition:
            where_condition.append(extra_condition)

        if where_condition:  # 2009-3-9
            where_condition = "where " + " and ".join(where_condition)
        else:
            where_condition = ""
        rows = model.db.metadata.bind.execute(
            "select distinct p.id, p.biology_category_id, p.short_name from %s \
			%s order by p.biology_category_id, p.id"
            % (table_str, where_condition)
        )
        list_type_id_ls = []
        list_type_id2index = {}
        list_type_label_ls = []
        prev_biology_category_id = -1
        no_of_separators = 0
        for row in rows:
            if prev_biology_category_id == -1:
                prev_biology_category_id = row.biology_category_id
            elif row.biology_category_id != prev_biology_category_id:
                prev_biology_category_id = row.biology_category_id
                no_of_separators += 1
                list_type_id2index[-no_of_separators] = len(list_type_id_ls)
                list_type_id_ls.append(-no_of_separators)
                list_type_label_ls.append("====\n====")
            list_type_id2index[row.id] = len(list_type_id_ls)
            list_type_id_ls.append(row.id)
            list_type_label_ls.append("%s %s" % (row.id, row.short_name))
        list_info = PassingData()
        list_info.list_type_id2index = list_type_id2index
        list_info.list_type_id_ls = list_type_id_ls
        list_info.list_type_label_ls = list_type_label_ls
        return list_info
	def reduceAfterEachAlignment(self, workflow=None, passingData=None, mapEachChromosomeDataLs=None,\
								reduceAfterEachChromosomeDataLs=None,\
								transferOutput=True, **keywords):
		"""
		"""
		returnData = PassingData(no_of_jobs = 0)
		returnData.jobDataLs = []
		returnData.mapEachChromosomeDataLs = mapEachChromosomeDataLs
		returnData.reduceAfterEachChromosomeDataLs = reduceAfterEachChromosomeDataLs
		return returnData
	def reduceEachVCF(self, workflow=None, chromosome=None, passingData=None, mapEachIntervalDataLs=None,\
					transferOutput=True, **keywords):
		"""
		2013.05.01
			#. concatenate all the sub-VCFs into one
		"""
		returnData = PassingData(no_of_jobs = 0)
		returnData.jobDataLs = []
		returnData.mapEachIntervalDataLs = mapEachIntervalDataLs
		
		refineGenotypeJobLs = [pdata.refineGenotypeJob for pdata in mapEachIntervalDataLs]
		mergeVCFReplicateColumnsJobLs = [pdata.mergeVCFReplicateColumnsJob for pdata in mapEachIntervalDataLs]
		
		
		realInputVolume = passingData.jobData.file.noOfIndividuals * passingData.jobData.file.noOfLoci
		baseInputVolume = 200*2000000
		#base is 4X coverage in 20Mb region => 120 minutes
		walltime = self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume, \
							baseInputVolume=baseInputVolume, baseJobPropertyValue=60, \
							minJobPropertyValue=60, maxJobPropertyValue=500).value
		#base is 4X, => 5000M
		job_max_memory = self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume, \
							baseInputVolume=baseInputVolume, baseJobPropertyValue=2000, \
							minJobPropertyValue=2000, maxJobPropertyValue=8000).value
		self.concatenateOverlapIntervalsIntoOneVCFSubWorkflow(passingData=passingData, \
						intervalJobLs=[pdata.beagleJob for pdata in mapEachIntervalDataLs],\
						outputDirJob=self.beagleReduceDirJob, \
						transferOutput=True, job_max_memory=job_max_memory, walltime=walltime,\
						**keywords)
		
		self.concatenateOverlapIntervalsIntoOneVCFSubWorkflow(passingData=passingData, \
						intervalJobLs=refineGenotypeJobLs, outputDirJob=self.replicateVCFDirJob, \
						transferOutput=True, job_max_memory=job_max_memory, walltime=walltime, \
						**keywords)
		
		self.concatenateOverlapIntervalsIntoOneVCFSubWorkflow(passingData=passingData, \
						intervalJobLs=mergeVCFReplicateColumnsJobLs, outputDirJob=self.reduceOutputDirJob, \
						transferOutput=True, job_max_memory=job_max_memory, walltime=walltime,\
						**keywords)
		
		
		for pdata in mapEachIntervalDataLs:
			#add this output to the union job
			"""
			self.addInputToStatMergeJob(statMergeJob=self.reduceBeaglePhaseReplicateConcordanceJob_AllSites, \
							parentJobLs=[pdata.beaglePhasedReplicateConcordanceJob])
			self.addInputToStatMergeJob(statMergeJob=self.reduceBeaglePhaseReplicateConcordanceJob_HomoOnly, \
							parentJobLs=[pdata.beaglePhasedReplicateConcordanceJob])
			"""
			self.addInputToStatMergeJob(statMergeJob=self.reduceTrioCallerReplicateConcordanceJob_AllSites, \
							parentJobLs=[pdata.trioCallerReplicateConcordanceJob])
			self.addInputToStatMergeJob(statMergeJob=self.reduceTrioCallerReplicateConcordanceJob_HomoOnly, \
							parentJobLs=[pdata.trioCallerReplicateConcordanceJob])
		
		return returnData
	def mapEachInterval(self, workflow=None, alignmentData=None, intervalData=None,\
			VCFJobData=None, passingData=None, transferOutput=True, **keywords):
		"""
		2012.9.17
		"""
		if workflow is None:
			workflow = self
		returnData = PassingData(no_of_jobs = 0)
		returnData.jobDataLs = []
		
		topOutputDirJob = passingData.topOutputDirJob
		
		alignment = alignmentData.alignment
		parentJobLs = alignmentData.jobLs
		bamF = alignmentData.bamF
		baiF = alignmentData.baiF
		bamFnamePrefix = passingData.bamFnamePrefix
		
		
		if intervalData.file:
			mpileupInterval = intervalData.interval
			bcftoolsInterval = intervalData.file
		else:
			mpileupInterval = intervalData.interval
			bcftoolsInterval = intervalData.interval
		intervalFileBasenameSignature = intervalData.intervalFileBasenameSignature
		overlapInterval = intervalData.overlapInterval
		overlapFileBasenameSignature = intervalData.overlapIntervalFnameSignature
		VCFFile = VCFJobData.file
		annotationName = passingData.annotationName
		outputFile = File(os.path.join(topOutputDirJob.output, '%s_%s.%s.vcf'%(bamFnamePrefix, overlapFileBasenameSignature, annotationName)))
		variantAnnotatorJob = self.addGATKVariantAnnotatorJob(workflow, executable=workflow.annotateVariantJava, \
								GenomeAnalysisTKJar=workflow.GenomeAnalysisTKJar, bamFile=bamF, \
								VCFFile=VCFFile, annotationName=annotationName, interval=bcftoolsInterval, outputFile=outputFile, \
								refFastaFList=passingData.refFastaFList, parentJobLs=[topOutputDirJob]+parentJobLs, 
								extraDependentInputLs=[baiF, VCFFile.tbi_F], \
								transferOutput=False, \
								extraArguments=None, job_max_memory=4000)
		
		outputFile = File(os.path.join(topOutputDirJob.output, '%s_%s.%s.tsv'%(bamFnamePrefix, overlapFileBasenameSignature, annotationName)))
		extractInfoJob = self.addGenericJob(workflow=workflow, executable=workflow.ExtractInfoFromVCF, inputFile=variantAnnotatorJob.output, \
						inputArgumentOption="-i", \
						outputFile=outputFile, outputArgumentOption="-o", \
						parentJobLs=[variantAnnotatorJob], extraDependentInputLs=None, extraOutputLs=None, transferOutput=False, \
						extraArguments="-k %s"%(annotationName), extraArgumentList=None, job_max_memory=2000,  sshDBTunnel=None, \
						key2ObjectForJob=None)
		
		returnData.jobDataLs.append(PassingData(jobLs=[variantAnnotatorJob, extractInfoJob], file=variantAnnotatorJob.output, \
											fileLs=[variantAnnotatorJob.output, extractInfoJob.output]))
		returnData.variantAnnotatorJob=variantAnnotatorJob
		returnData.extractInfoJob=extractInfoJob
		#add the sub-alignment to the alignment merge job
		self.no_of_jobs += 2
		return returnData
Example #17
	def getStrainInfoGivenPlateInfo(self, db, plate_info, strain_id_info_query, strain_id_set=None):
		"""
		2008-09-13
			order/group the strains according to plate_set, country, strain longitude
			fetch appropriate labels for each strain
		"""
		sys.stderr.write("Getting strain_info given plate_info ...")

		
		#fetch appropriate label, and put strain id in country_longitude order within each plate
		plate_set2strain_id_ls_in_GPS_order = {}
		strain_id2label = {}
		rows = db.metadata.bind.execute(strain_id_info_query)
		for row in rows:
			if strain_id_set and row.strainid not in strain_id_set:	#skip
				continue
			plate_set = plate_info.strain_id2plate_set[row.strainid]
			if plate_set not in plate_set2strain_id_ls_in_GPS_order:
				plate_set2strain_id_ls_in_GPS_order[plate_set] = []
			plate_set2strain_id_ls_in_GPS_order[plate_set].append(row.strainid)
			
			if len(row.sitename)>10:	#cut short on the site name
				sitename = row.sitename[:10]
			else:
				sitename = row.sitename
			strain_label = '%s_%s_%s_%s_%s'%(row.abbr, sitename, row.nativename, row.strainid, repr(plate_set)[1:-1])
			strain_id2label[row.strainid] = strain_label
		
		#put in plate_set order, assign row index
		plate_set_ls = plate_set2strain_id_ls_in_GPS_order.keys()
		plate_set_ls.sort()
		no_of_plates = len(plate_set_ls)
		strain_id_ls = []
		strain_id2index = {}
		strain_label_ls = []
		for i in range(no_of_plates):
			plate_set = plate_set_ls[i]
			plate_strain_id_ls = plate_set2strain_id_ls_in_GPS_order[plate_set]
			if i!=0:	#insert separator, but not before the first plate_set
				strain_id2index[-i] = len(strain_id2index)
				strain_id_ls.append(-i)
				strain_label_ls.append('')
			for strain_id in plate_strain_id_ls:
				strain_id2index[strain_id] = len(strain_id2index)
				strain_id_ls.append(strain_id)
				strain_label_ls.append(strain_id2label[strain_id])
		
		strain_id_info = PassingData()
		strain_id_info.strain_id_ls = strain_id_ls
		strain_id_info.strain_id2index = strain_id2index
		strain_id_info.strain_label_ls = strain_label_ls
		sys.stderr.write("Done.\n")
		return strain_id_info
	def linkMapToReduce(self, workflow=None, mapEachIntervalData=None, preReduceReturnData=None, passingData=None, transferOutput=True, **keywords):
		"""
		"""
		returnData = PassingData(no_of_jobs = 0)
		returnData.jobDataLs = []
		
		for jobData in mapEachIntervalData.jobDataLs:
			calculaJob = jobData.jobLs[0]
			self.addInputToStatMergeJob(workflow, statMergeJob=preReduceReturnData.aggregateAndHClusterDistanceMatrixJob, \
						inputF=calculaJob.output, \
						parentJobLs=[calculaJob])
		return returnData
Example #19
	def addOneEntry(self, row):
		from pymodule import PassingData
		data_obj = PassingData()
		
		for table_field in self.table_field_ls:
			setattr(data_obj, table_field, getattr(row, table_field, None))
		data_obj.chr = row.snp.chromosome
		data_obj.pos = row.snp.position
		chr_pos_key = (data_obj.chr, data_obj.pos)
		if chr_pos_key not in self.chr_pos2index_ls:
			self.chr_pos2index_ls[chr_pos_key] = []
		self.chr_pos2index_ls[chr_pos_key].append(len(self.data_ls))
		self.data_ls.append(data_obj)
	def addAllJobs(self, workflow=None, db_250k=None, association_result_ls=None, \
				data_dir=None, min_MAF=None, \
				neighbor_distance=None, max_neighbor_distance=None, \
				min_score_ls=None, min_overlap_ratio_ls=None, ground_score=None,\
				peakPadding=None, tax_id=None, \
				outputDirPrefix="", transferOutput=True, job_max_memory=2000, **keywords):
		"""
		2013.2.27
			run ms
			estimate parameters from ms
			forward simulator with estimated ms-parameters or take the output of ms as input
			
			
		"""
		if workflow is None:
			workflow = self
		
		sys.stderr.write("Adding jobs for pop-gen & pedigree sequence simulation #jobs=%s... \n"%\
							(self.no_of_jobs))
		
		returnData = PassingData()
		returnData.jobDataLs = []
		
		passingData = PassingData(fileBasenamePrefix=None, \
					outputDirPrefix=outputDirPrefix, \
					jobData=None,\
					preReduceReturnData=None,\
					association_group_key2orderIndex = {},\
					association_group_key2resultList = {},\
					association_group_key2reduceAssociationPeakJobMatrix = {},\
					association_group_key2countAssociationLocusJobList = {},\
					resultID2defineLandscapeJobData = {},
					)
		
		preReduceReturnData = self.preReduce(workflow=workflow, outputDirPrefix=outputDirPrefix, \
									passingData=passingData, transferOutput=False,\
									**keywords)
		
		mapDirJob = preReduceReturnData.mapDirJob
		plotOutputDirJob = preReduceReturnData.plotOutputDirJob
		countAssociationLocusOutputDirJob = preReduceReturnData.countAssociationLocusOutputDirJob
		reduceOutputDirJob = preReduceReturnData.reduceOutputDirJob
		
		passingData.preReduceReturnData = preReduceReturnData
		
		#add output pedigree job
		
		for i in xrange(self.noOfReplicates):
			popGenSimulationFolderJob = self.addMkDirJob(outputDir=os.path.join(mapDirJob.output, 'popGenSim%s'%(i)), \
														parentJobLs=[mapDirJob])
			popSimulationJob = self.addPopGenSimulationJob()
Example #21
    def generate_parameters(self, parameter_names, parameter_depth=2):
        """
		2008-05-19
			min_call_probability = self.min_call_probability
		2008-05-11
			put NA rate into passing parameters as well. too much memory consumption on each computing node
		"""
        sys.stderr.write("Generating parameter settings ...")
        param_d = PassingData()
        for parameter_name in parameter_names:
            parameter_value = getattr(self, parameter_name)
            parameter_value = parameter_value.split(",")
            parameter_value = map(float, parameter_value)
            setattr(self, parameter_name, parameter_value)

        """
		#2008-05-19 commented out. use self.min_call_probability
		#figure out call probability from input_fname
		import re
		call_prob_pattern = re.compile(r'_(\d+)\.csv')
		call_prob_p_result = call_prob_pattern.search(self.input_fname)
		if call_prob_p_result:
			min_call_probability = float(call_prob_p_result.groups()[0])
		else:
			min_call_probability = -1
		"""
        min_call_probability = self.min_call_probability

        # only 1st 4, last 2 passed to computing node
        parameters = []
        for max_call_mismatch_rate in getattr(self, parameter_names[0]):
            for max_call_NA_rate in getattr(self, parameter_names[1]):
                for max_snp_mismatch_rate in getattr(self, parameter_names[2]):
                    for max_snp_NA_rate in getattr(self, parameter_names[3]):
                        for npute_window_size in getattr(self, parameter_names[4]):
                            parameters.append(
                                [
                                    min_call_probability,
                                    max_call_mismatch_rate,
                                    max_call_NA_rate,
                                    max_snp_mismatch_rate,
                                    max_snp_NA_rate,
                                    npute_window_size,
                                ]
                            )

        param_d.parameters = parameters
        param_d.max_snp_NA_rate_ls = self.max_snp_NA_rate_ls
        param_d.npute_window_size_ls = self.npute_window_size_ls
        sys.stderr.write(" %s parameter settings to process. Done.\n" % len(parameters))
        return param_d
	def addJobs(self, workflow=None, inputData=None, db_vervet=None, genotypeMethodShortName=None, commit=None,\
			data_dir=None, checkEmptyVCFByReading=False, transferOutput=True,\
			maxContigID=None, outputDirPrefix="", needSSHDBTunnel=False):
		"""
		2012.5.9
		"""
		sys.stderr.write("Adding VCF2DB jobs for %s vcf files ... "%(len(inputData.jobDataLs)))
		
		
		topOutputDir = "%sVCF2DB"%(outputDirPrefix)
		topOutputDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=topOutputDir)
		
		firstVCFFile = inputData.jobDataLs[0].vcfFile
		logFile = File(os.path.join(topOutputDir, 'AddGenotypeMethod2DB.log'))
		addGM2DBJob = self.addAddGenotypeMethod2DBJob(executable=self.AddGenotypeMethod2DB, inputFile=firstVCFFile, \
								genotypeMethodShortName=genotypeMethodShortName,\
								logFile=logFile, data_dir=data_dir, commit=commit, parentJobLs=[], extraDependentInputLs=[], transferOutput=True, \
								extraArguments=None, job_max_memory=10, sshDBTunnel=needSSHDBTunnel)
		updateGMlogFile = File(os.path.join(topOutputDir, 'updateGM.log'))
		updateGMNoOfLociJob = self.addUpdateGenotypeMethodNoOfLociJob(executable=self.UpdateGenotypeMethodNoOfLoci, \
																	genotypeMethodShortName=genotypeMethodShortName,\
								logFile=updateGMlogFile, data_dir=data_dir, commit=commit, parentJobLs=[topOutputDirJob], \
								extraDependentInputLs=[], transferOutput=True, \
								extraArguments=None, job_max_memory=20, sshDBTunnel=needSSHDBTunnel)
		
		returnData = PassingData()
		returnData.jobDataLs = []
		for jobData in inputData.jobDataLs:
			inputF = jobData.vcfFile
			if maxContigID:
				contig_id = self.getContigIDFromFname(inputF.name)
				try:
					contig_id = int(contig_id)
					if contig_id>maxContigID:	#skip the small contigs
						continue
				except:
					sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
					import traceback
					traceback.print_exc()
			logFile = File(os.path.join(topOutputDir, 'AddVCFFile2DB_%s.log'%(self.getChrFromFname(inputF.name))))
			addVCFJob = self.addAddVCFFile2DBJob(executable=self.AddVCFFile2DB, inputFile=inputF, genotypeMethodShortName=genotypeMethodShortName,\
						logFile=logFile, format="VCF", data_dir=data_dir, checkEmptyVCFByReading=checkEmptyVCFByReading, commit=commit, \
						parentJobLs=[addGM2DBJob]+jobData.jobLs, extraDependentInputLs=[], transferOutput=True, \
						extraArguments=None, job_max_memory=1000, sshDBTunnel=needSSHDBTunnel)
			workflow.depends(parent=addVCFJob, child=updateGMNoOfLociJob)
		sys.stderr.write("%s jobs.\n"%(self.no_of_jobs))
		#include the tfam (outputList[1]) into the fileLs
		returnData.jobDataLs.append(PassingData(jobLs=[updateGMNoOfLociJob], file=updateGMlogFile, \
											fileLs=[updateGMlogFile]))
		return returnData
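The five nested loops in generate_parameters above enumerate every combination of the comma-separated option values; an equivalent, hedged rendering with itertools.product (a library swap for illustration; the *_ls values below are hypothetical, not the real defaults):

import itertools

# Hypothetical option values; in the real class they come from parsed options.
max_call_mismatch_rate_ls = [0.1, 0.2]
max_call_NA_rate_ls = [0.4]
max_snp_mismatch_rate_ls = [0.1]
max_snp_NA_rate_ls = [0.4, 0.5]
npute_window_size_ls = [30]
min_call_probability = -1

# Same iteration order as the nested loops: the last list varies fastest.
parameters = [[min_call_probability] + list(combo)
	for combo in itertools.product(max_call_mismatch_rate_ls, max_call_NA_rate_ls,
		max_snp_mismatch_rate_ls, max_snp_NA_rate_ls, npute_window_size_ls)]
print(len(parameters))	#4 parameter settings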
Example #23
	def get_qccall_results(self, input_dir):
		import os,sys,csv
		from variation.src.MpiQCCall import MpiQCCall
		from pymodule import PassingData
		"""
		var_name_ls = ['strain or snp', 'after_imputation'] + MpiQCCall.common_var_name_ls
		avg_var_name_pair_ls, partial_header_avg = MpiQCCall.generate_avg_variable_names(MpiQCCall.avg_var_name_ls)
		var_name_ls += partial_header_avg
		"""
		files = os.listdir(input_dir)
		passingdata_ls = []
		no_of_objects = len(files)
		var_name_ls = []
		for i in range(no_of_objects):
			sys.stderr.write("\t%d/%d: from %s ... \n"%(i+1, no_of_objects, files[i]))
			filename = os.path.join(input_dir, files[i])
			reader = csv.reader(open(filename))
			try:
				row = reader.next()
				if len(var_name_ls)==0:
					var_name_ls = row
			except:
				if self.debug:
					import traceback
					traceback.print_exc()
					sys.stderr.write('%s\n'%sys.exc_info())
				sys.stderr.write('\terror in reading this file. ignored.\n')
				del reader
				continue
			for row in reader:
				passingdata = PassingData()
				for j in range(len(var_name_ls)):
					var_name = var_name_ls[j]
					if var_name!='strain or snp':
						value = float(row[j])
					else:	#the first column is strain or snp, no float conversion
						value = row[j]
					setattr(passingdata, var_name, value)
				#two new variables record no of accessions/snps lost
				passingdata.no_of_total_accessions_filtered = passingdata.no_of_accessions_filtered_by_mismatch + passingdata.no_of_accessions_filtered_by_na
				passingdata.no_of_total_snps_filtered = passingdata.no_of_snps_filtered_by_mismatch +\
					passingdata.no_of_snps_filtered_by_na
				passingdata.no_of_total_snps_removed = passingdata.no_of_total_snps_filtered +\
					passingdata.no_of_monomorphic_snps_removed
				
				passingdata_ls.append(passingdata)
			del reader
		return passingdata_ls, var_name_ls
	def reduceBeforeEachAlignment(self, workflow=None, passingData=None, preReduceReturnData=None, transferOutput=True, **keywords):
		"""
		2012.9.17
			add a merge variant annotation job, GW plot job
			
		"""
		returnData = PassingData(no_of_jobs = 0)
		returnData.jobDataLs = []
		outputDirPrefix = passingData.outputDirPrefix
		
		statOutputDirJob = preReduceReturnData.statOutputDirJob
		plotOutputDirJob = preReduceReturnData.plotOutputDirJob
		
		mergeOutputF = File(os.path.join(statOutputDirJob.output, '%s_%s.tsv'%(passingData.bamFnamePrefix, passingData.annotationName)))
		mergeJob = self.addStatMergeJob(workflow, statMergeProgram=workflow.mergeSameHeaderTablesIntoOne, \
							outputF=mergeOutputF, transferOutput=transferOutput, parentJobLs=[statOutputDirJob],)
		returnData.jobDataLs.append(PassingData(jobLs=[mergeJob ], file=mergeJob.output, fileLs=[mergeJob.output], mergeJob=mergeJob))
		self.no_of_jobs += 1
		
		outputFnamePrefix = os.path.join(plotOutputDirJob.output, '%s_%s_Plot'%(passingData.bamFnamePrefix, passingData.annotationName))
		# whichColumnPlotLabel and xColumnPlotLabel should not contain spaces or ( or ). because they will disrupt shell commandline
		self.addPlotVCFtoolsStatJob(executable=workflow.PlotVCFtoolsStat, inputFileList=[mergeOutputF], \
							outputFnamePrefix=outputFnamePrefix, \
							whichColumn=None, whichColumnHeader=passingData.annotationName, whichColumnPlotLabel=passingData.annotationName, \
							need_svg=False, \
							logY=0, valueForNonPositiveYValue=-1, \
							xColumnPlotLabel="position", chrLengthColumnHeader=None, chrColumnHeader="CHROM", \
							minChrLength=None, xColumnHeader="POS", minNoOfTotal=50,\
							figureDPI=100, ylim_type=2, samplingRate=0.01,\
							parentJobLs=[mergeJob, plotOutputDirJob], \
							extraDependentInputLs=None, \
							extraArguments=None, transferOutput=True, sshDBTunnel=self.needSSHDBTunnel)
		self.no_of_jobs += 1
		
		outputFile = File( os.path.join(plotOutputDirJob.output, '%s_%s_Hist.png'%(passingData.bamFnamePrefix, passingData.annotationName)))
		#no spaces or parenthesis or any other shell-vulnerable letters in the x or y axis labels (whichColumnPlotLabel, xColumnPlotLabel)
		self.addDrawHistogramJob(workflow=workflow, executable=workflow.DrawHistogram, inputFileList=[mergeJob.output], \
					outputFile=outputFile, \
					whichColumn=None, whichColumnHeader=passingData.annotationName, whichColumnPlotLabel=passingData.annotationName, \
					logY=None, logCount=True, valueForNonPositiveYValue=-1,\
					minNoOfTotal=10,\
					figureDPI=100, samplingRate=0.1,\
					parentJobLs=[plotOutputDirJob, mergeJob], \
					extraDependentInputLs=None, \
					extraArguments=None, transferOutput=True,  job_max_memory=2000)
		self.no_of_jobs += 1
		
		return returnData
Example #25
	def getResultsMethodIDInfo(self, db, call_method_id_ls, min_distance, get_closest, min_MAF):
		"""
		2008-09-05
			use results_by_gene.id as main result id
		"""
		sys.stderr.write("Gettiing ResultsMethodIDInfo ...")
		results_method_id_info = PassingData()
		results_method_id_ls = []
		results_method_id2index = {}
		results_method_id_label_ls = []
		rows = db.metadata.bind.execute("select distinct rg.id, rg.results_method_id, r.analysis_method_id, r.phenotype_method_id, \
			p.biology_category_id from %s rg, %s r, %s p \
			where rg.results_method_id=r.id and p.id=r.phenotype_method_id and r.call_method_id in (%s) \
			and rg.min_distance=%s and rg.get_closest=%s and rg.min_MAF>=%s-0.0001 and rg.min_MAF<=%s+0.0001 \
			order by p.biology_category_id, r.phenotype_method_id, r.analysis_method_id"%(ResultsByGene.table.name, \
					ResultsMethod.table.name, PhenotypeMethod.table.name, repr(call_method_id_ls)[1:-1], min_distance, get_closest, min_MAF, min_MAF))
		prev_phenotype_method_id = None
		prev_biology_category_id = None
		no_of_separators = 0
		for row in rows:
			if prev_biology_category_id==None:
				prev_biology_category_id = row.biology_category_id
			elif row.biology_category_id!=prev_biology_category_id:
				prev_biology_category_id = row.biology_category_id
				no_of_separators += 1
				results_method_id2index[-no_of_separators] = len(results_method_id_ls)
				results_method_id_ls.append(-no_of_separators)
				results_method_id_label_ls.append('')
			
			if prev_phenotype_method_id == None:
				prev_phenotype_method_id = row.phenotype_method_id
			elif row.phenotype_method_id!=prev_phenotype_method_id:
				prev_phenotype_method_id = row.phenotype_method_id
				#add a blank phenotype id as separator
				no_of_separators += 1
				results_method_id2index[-no_of_separators] = len(results_method_id_ls)
				results_method_id_ls.append(-no_of_separators)
				results_method_id_label_ls.append('')
			results_method_id2index[row.id] = len(results_method_id_ls)
			results_method_id_ls.append(row.id)
			am = AnalysisMethod.get(row.analysis_method_id)
			pm = PhenotypeMethod.get(row.phenotype_method_id)
			results_method_id_label_ls.append('%s_%s_%s'%(am.short_name, pm.short_name, pm.id))
		results_method_id_info.results_method_id_ls = results_method_id_ls
		results_method_id_info.results_method_id2index = results_method_id2index
		results_method_id_info.results_method_id_label_ls = results_method_id_label_ls
		sys.stderr.write("%s results. Done.\n"%(len(results_method_id_ls)))
		return results_method_id_info
Example #26
	def run(self):
		"""
		2008-10-28
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		db = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.db_user,
				   password=self.db_passwd, hostname=self.hostname, database=self.dbname, schema=self.schema)
		db.setup(create_tables=False)
		session = db.session
		
		hist_type = CheckCandidateGeneRank.getHistType(self.call_method_id, self.min_distance, self.get_closest, self.min_MAF, \
									self.allow_two_sample_overlapping, self.results_type, self.null_distribution_type_id)
		
		snps_context_wrapper = self.dealWithSnpsContextWrapper(self.snps_context_picklef, self.min_distance, self.get_closest)
		
		param_obj = PassingData(call_method_id=self.call_method_id, \
								analysis_method_id=getattr(self, 'analysis_method_id', None),\
								analysis_method_id_ls=getattr(self, 'analysis_method_id_ls', None),\
								phenotype_method_id_ls=getattr(self, 'phenotype_method_id_ls', None),\
								list_type_id_ls=self.list_type_id_ls, \
								results_type=self.results_type,\
								no_check_gene_list=True)
		params_ls = self.generate_params(param_obj)
		
		pd = PassingData(snps_context_wrapper=snps_context_wrapper, \
						results_directory=self.results_directory, \
						min_MAF=self.min_MAF,
						get_closest=self.get_closest,
						min_distance=self.min_distance,\
						no_of_top_snps=self.no_of_top_snps,
						min_sample_size=self.min_sample_size,
						test_type_id=self.test_type_id, \
						results_type=self.results_type,
						no_of_permutations=self.no_of_permutations,\
						no_of_min_breaks=self.no_of_min_breaks,
						type=hist_type,\
						null_distribution_type_id=self.null_distribution_type_id,\
						allow_two_sample_overlapping=self.allow_two_sample_overlapping,
						min_score=self.min_score,
						session=session,\
						commit=self.commit)
		
		for results_id, list_type_id in params_ls:
			pd.list_type_id = list_type_id
			pd.results_id = results_id
			self.pick_candidate_genes(pd)
Example #27
    def countAlleleComboFrequency(cls, snp1_id, gene1_id, genotype_ls1, snp2_id, gene2_id, genotype_ls2, phenotype_index, phenotype_ls, \
        min_data_point=3):
        """
		2009-2-18
			to test how many distinct allele-combos each SNP pair has
		"""
        return_ls = []
        no_of_rows = len(genotype_ls1)
        allele_combo2freq = {}
        for i in range(no_of_rows):
            allele1 = genotype_ls1[i]
            allele2 = genotype_ls2[i]
            phenotype_value = phenotype_ls[i]
            if numpy.isnan(allele1) or numpy.isnan(allele2) or numpy.isnan(
                    phenotype_value):
                continue
            allele_combo = (allele1, allele2)

            if allele_combo not in allele_combo2freq:
                allele_combo2freq[allele_combo] = 0
            allele_combo2freq[allele_combo] += 1
        pdata = PassingData(snp1_id=snp1_id, gene1_id=gene1_id, snp2_id=snp2_id, gene2_id=gene2_id, pvalue=None,\
              count1=len(allele_combo2freq), count2=None, phenotype_index=phenotype_index, coeff_list=allele_combo2freq)
        return_ls.append(pdata)
        return return_ls
    def computing_node_handler(self, communicator, data,
                               computing_parameter_obj):
        """
		2008-09-10
			add source_id to PassingData
		2008-08-28
		"""
        node_rank = communicator.rank
        sys.stderr.write("Node no.%s working...\n" % node_rank)
        data = cPickle.loads(data)
        result_ls = []
        twoSNPData = computing_parameter_obj.twoSNPData
        QC_method_id = computing_parameter_obj.QC_method_id
        for row_id1, row_id2 in data:
            NA_rate, mismatch_rate, no_of_NAs, no_of_totals, no_of_mismatches, no_of_non_NA_pairs = twoSNPData.cmpOneRow(
                row_id1, row_id2)
            #the 2nd position in the row-id1 tuple is strain id
            if QC_method_id == 4:  #the 2nd position in the row-id2 tuple is strain id
                target_id = row_id2[1]
            else:
                target_id = row_id2
            qc_cross_match = PassingData(source_id=row_id1[0], strainid=row_id1[1], target_id=target_id, mismatch_rate=mismatch_rate, \
                     no_of_mismatches=no_of_mismatches, no_of_non_NA_pairs=no_of_non_NA_pairs)
            result_ls.append(qc_cross_match)
        sys.stderr.write("Node no.%s done with %s results.\n" %
                         (node_rank, len(result_ls)))
        return result_ls
Example #29
    def readThroughAndProvideSummary(self):
        """
		2013.08.30
			called by vervet/src/db/input/AddAlignmentDepthIntervalFile2DB.py
		"""

        col_name2index = self.smartReadHeader()
        if col_name2index is None:
            pdata = self.parseRow(self._row)
            self._postProcessParsedRowDataForSummary(pdata)

        for row in self:
            pdata = self.parseRow(row)
            self._postProcessParsedRowDataForSummary(pdata)

        self.min_interval_length = numpy.min(self.interval_length_ls)
        self.max_interval_length = numpy.max(self.interval_length_ls)
        self.median_interval_length = numpy.median(self.interval_length_ls)

        self.mean_interval_value = numpy.mean(self.interval_value_ls)
        self.median_interval_value = numpy.median(self.interval_value_ls)
        return PassingData(no_of_intervals=self.no_of_intervals, chromosome_size=self.chromosome_size, \
            mean_interval_value=self.mean_interval_value,\
            median_interval_value=self.median_interval_value,\
            min_interval_value=self.min_interval_value,\
            max_interval_value=self.max_interval_value,\

            min_interval_length=self.min_interval_length,\
            max_interval_length=self.max_interval_length ,\
            median_interval_length=self.median_interval_length)
Example #30
    def general_output_node(self, output_dir, phenotype_index_ls,
                            phenotype_label_ls, free_computing_nodes):
        """
		2009-2-8
			general strategy for output node to do while it's computing
			
			refactored out of run()
		"""
        if not os.path.isdir(output_dir):
            os.makedirs(output_dir)
        writer_dict = {}
        header_row = [
            'snp1_id', 'gene1_id', 'snp2_id', 'gene2_id', 'bool_type',
            'pvalue', 'count1', 'count2', 'var_perc', 'coeff_list',
            'coeff_p_value_list'
        ]
        for phenotype_index in phenotype_index_ls:
            phenotype_label = phenotype_label_ls[phenotype_index]
            phenotype_label = phenotype_label.replace(
                '/', '_')  #'/' is taken as folder separator
            output_fname = os.path.join(output_dir,
                                        'SNPpair_%s.tsv' % phenotype_label)
            writer = csv.writer(open(output_fname, 'w'),
                                lineterminator='\n',
                                delimiter='\t')
            writer.writerow(header_row)
            writer_dict[phenotype_index] = writer
        param_obj = PassingData(writer_dict=writer_dict, header_row=header_row)
        self.output_node(free_computing_nodes, param_obj,
                         self.output_node_handler)
        del writer_dict
Example #31
    def get_strain_id_info(self, QC_method_id, ignore_strains_with_qc=True):
        """
		2008-08-18
			to generate data structure related to strain_id, preparation to get data_matrix
			strainid not QCed yet
			link to tg_ecotypeid
		"""
        sys.stderr.write("Getting strain_id info  ... ")
        strain_id2index = {}
        strain_id_list = []
        strain_id2acc = {}
        strain_id2category = {}

        rows = StockDB.Strain.query.all()
        for row in rows:
            if ignore_strains_with_qc:
                ignore_this = 0
                for call_qc in row.call_qc_ls:
                    if call_qc.qc_method_id == QC_method_id:  #QC already done
                        ignore_this = 1
                        break
                if ignore_this:
                    continue
            strain_id = row.id
            strain_index = len(strain_id_list)
            strain_id_list.append(strain_id)
            strain_id2index[strain_id] = strain_index
            strain_id2acc[
                strain_id] = row.ecotypeid_strainid2tg_ecotypeid.tg_ecotypeid
            strain_id2category[strain_id] = strain_id
        passingdata = PassingData(strain_id2index=strain_id2index, strain_id_list=strain_id_list, strain_id2acc=strain_id2acc,\
              strain_id2category=strain_id2category)
        sys.stderr.write("%s strains. Done.\n" % (len(strain_id_list)))
        return passingdata
	def loadDataStructure(self, gene_annotation_picklef, min_MAF=0, min_distance=20000, \
						list_type_id=None, snp_matrix_fname=None, snp_matrix_data_type=1):
		"""
		2009-5-30
			add argument snp_matrix_fname
		2008-11-25
		2008-10-01
			wrap a few functions up, convenient for both run() and drawSNPRegion()
		"""
		db = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.db_user,
				   password=self.db_passwd, hostname=self.hostname, database=self.dbname, schema=self.schema)
		db.setup(create_tables=False)
		self.db = db
		snp_info = self.getSNPInfo(db)
		gene_annotation = self.dealWithGeneAnnotation(gene_annotation_picklef)
		if list_type_id:
			candidate_gene_list = self.getGeneList(list_type_id)
			candidate_gene_set = Set(candidate_gene_list)
		else:
			candidate_gene_set = Set()
		
		if snp_matrix_fname:
			if snp_matrix_data_type==3:
				matrix_data_type=float		#2009-3-23 for CNV amplitude file
			else:
				matrix_data_type=int
			snpData = SNPData(input_fname=snp_matrix_fname, turn_into_integer=1, turn_into_array=1, ignore_2nd_column=1, matrix_data_type=matrix_data_type)			
			#2008-12-05 fake a snp_info for findSNPsInRegion
			self.construct_chr_pos2index_forSNPData(snpData)
		else:
			snpData = None
			
		return_data = PassingData(gene_annotation=gene_annotation, snp_info=snp_info, \
								candidate_gene_set=candidate_gene_set, snpData=snpData)
		return return_data
Example #33
	def parseFastaDescriptionForGenBank(self, descriptionLine=None, FigureOutTaxID_ins=None):
		"""
		2011-7-6
			
		"""
		"""
		possible header lines:
		
		>gi|51511461|ref|NC_000001.8|NC_000001 H**o sapiens chromosome 1, complete sequence
		>gi|186497660|ref|NC_003070.6| Arabidopsis thaliana chromosome 1, complete sequence
		>gi|26556996|ref|NC_001284.2| Arabidopsis thaliana mitochondrion, complete genome
		>gi|115442598|ref|NC_008394.1| Oryza sativa (japonica cultivar-group) genomic DNA, chromosome 1
		"""
		header = descriptionLine[1:-1]	#discard '>' and '\n'
		header = header.split('|')
		_tax_id = FigureOutTaxID_ins.returnTaxIDGivenSentence(header[4])
		
		if self.p_chromosome.search(header[4]) is not None:
			chromosome = self.p_chromosome.search(header[4]).groups()[0]
		elif header[4].find('mitochondrion')!=-1:
			chromosome = 'mitochondrion'
		elif header[4].find('chloroplast')!=-1:
			chromosome = 'chloroplast'
		else:	#something else, take the whole before ','
			chromosome = header[4].split(',')[0]
		gi = int(header[1])
		acc_ver = header[3]
		comment = header[4]
		return PassingData(tax_id=_tax_id, gi=gi, comment=comment, acc_ver=acc_ver, chromosome=chromosome)
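A hedged mini-walkthrough of the header splitting above, using one of the listed GenBank lines; the chromosome regex is only a plausible stand-in for self.p_chromosome, and the tax-id lookup is omitted:

import re

# Assumed stand-in for self.p_chromosome; the real pattern may differ.
p_chromosome = re.compile(r'chromosome (\w+)')

descriptionLine = ">gi|186497660|ref|NC_003070.6| Arabidopsis thaliana chromosome 1, complete sequence\n"
header = descriptionLine[1:-1].split('|')	#discard '>' and '\n', then split on '|'
gi, acc_ver, comment = int(header[1]), header[3], header[4]
m = p_chromosome.search(comment)
chromosome = m.groups()[0] if m else comment.split(',')[0]
print('%s %s %s' % (gi, acc_ver, chromosome))	#186497660 NC_003070.6 1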
Example #34
	def findSNPsInRegion(self, snp_info, chromosome, start, stop, center_snp_position=None):
		"""
		2008-10-1
			called by plotSNPRegion()
			find SNPs in this region, if center_snp_position is not given, find one.
			similar to getSNPsAroundThisSNP()
		"""
		if self.report:
			sys.stderr.write("Get SNPs in this region ...")
		from DrawSNPRegion import SNPPassingData
		chr_pos_ls = []
		chr_pos2adjacent_window = {}
		j = 0
		midpoint = (start+stop)/2.
		if center_snp_position is None:
			_center_snp_position = start
		else:
			_center_snp_position = center_snp_position
		center_snp = SNPPassingData(chromosome=chromosome, position=_center_snp_position, snps_id=None)
		for i in range(start-1, stop+2):
			new_pos = i
			new_chr_pos = (chromosome, new_pos)
			if new_chr_pos in snp_info.chr_pos2index:
				if center_snp_position is None and abs(new_pos-midpoint)<abs(center_snp.position-midpoint):	#this SNP is closer to the center
					center_snp.position = new_pos
				chr_pos_ls.append(new_chr_pos)
				if j!=0:
					#add_mid_point(chr_pos_ls, chr_pos2adjacent_window)
					pass
				j += 1
		center_snp.snps_id = '%s_%s'%(center_snp.chromosome, center_snp.position)
		snp_region = PassingData(chr_pos_ls=chr_pos_ls, chr_pos2adjacent_window=chr_pos2adjacent_window, center_snp=center_snp)
		if self.report:
			sys.stderr.write("Done.\n")
		return snp_region
Example #35
	def returnLocusLowMapQualityIndicator(self, alignedReadLs=None, minMapQGoodRead=2, minFractionOfGoodRead=0.9):
		"""
		2013.12.04
		"""
		totalNoOfReads = 0
		noOfGoodReads = 0.0
		medianMapQ=-10
		mapQList=[]
		for alignedRead in alignedReadLs:
			totalNoOfReads +=1
			mapQList.append(alignedRead.mapq)
			if alignedRead.mapq>=minMapQGoodRead:
				noOfGoodReads += 1
			else:
				pass
		if totalNoOfReads>0:
			fractionOfGoodRead = noOfGoodReads/(totalNoOfReads)
			medianMapQ = numpy.median(mapQList)
		else:
			fractionOfGoodRead = -1
			medianMapQ = -10
			
		if fractionOfGoodRead>=minFractionOfGoodRead:
			locusLowMapQIndicator = 0
		else:
			locusLowMapQIndicator = 2
		return PassingData(locusLowMapQIndicator=locusLowMapQIndicator, totalNoOfReads=totalNoOfReads, \
						noOfGoodReads=noOfGoodReads, fractionOfGoodRead=fractionOfGoodRead,\
						medianMapQ=medianMapQ)
Example #36
    def IntLMOnTwoSNPs(cls, snp1_id, gene1_id, genotype_ls1, snp2_id, gene2_id, genotype_ls2, phenotype_index, phenotype_ls, \
        min_data_point=3):
        """
		2009-2-8
			interaction detection linear model
			y = b + SNP1xSNP2 + SNP1 + SNP2 + e
			interaction is the 1st term. therefore the pvalue directly returned is also for this term.
		"""
        return_ls = []
        genotype_ls1 = numpy.resize(genotype_ls1, [len(genotype_ls1), 1])
        genotype_ls2 = numpy.resize(genotype_ls2, [len(genotype_ls2), 1])
        snp_int_matrix = genotype_ls1 * genotype_ls2
        genotype_ls = numpy.hstack(
            (snp_int_matrix, genotype_ls1,
             genotype_ls2))  #interaction variable is the 1st position

        pdata = Association.linear_model(genotype_ls,
                                         phenotype_ls,
                                         min_data_point,
                                         snp_index=snp1_id + snp2_id)

        if pdata:
            pdata = PassingData(snp1_id=snp1_id, gene1_id=gene1_id, snp2_id=snp2_id, gene2_id=gene2_id, pvalue=pdata.pvalue,\
                count1=pdata.count_ls[0], count2=pdata.count_ls[1], bool_type=None, phenotype_index=phenotype_index,\
                var_perc=pdata.var_perc, coeff_list=pdata.coeff_list, coeff_p_value_list=pdata.coeff_p_value_list)
            return_ls.append(pdata)
        return return_ls
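A small hedged sketch of the interaction design matrix built above (interaction column first, as the docstring notes), leaving out the regression call itself:

import numpy

# Interaction term occupies column 0, then SNP1, then SNP2 (hypothetical genotypes).
genotype_ls1 = numpy.array([0, 1, 1, 0]).reshape(-1, 1)
genotype_ls2 = numpy.array([1, 1, 0, 0]).reshape(-1, 1)
design = numpy.hstack((genotype_ls1 * genotype_ls2, genotype_ls1, genotype_ls2))
print(design)
# [[0 0 1]
#  [1 1 1]
#  [0 1 0]
#  [0 0 0]]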
Example #37
	def get_enrichment_pvalue_by_gw_looping(self, candidate_sample_size, top_snp_index_ls, candidate_gene_set, \
										snps_context_wrapper, \
										no_of_total_snps, total_chr_pos_ar=None, no_of_permutations=20000, no_of_min_breaks=30):
		"""
		2008-10-30
		2008-10-22
			get enrichment pvalue by genome-wide looping of SNP positions. a permutation to preserve LD.
		"""
		if self.debug:
			sys.stderr.write("Getting enrichment pvalue by gw-looping ... ")
		i = 0
		no_of_hits = 0
		while i<no_of_permutations:
			looped_chr_pos_ls = self.get_looped_chr_pos_ls(top_snp_index_ls, no_of_total_snps, total_chr_pos_ar)
			looped_candidate_gene_snp_index_ls = self.get_candidate_gene_snp_index_ls(candidate_gene_set, \
																					looped_chr_pos_ls, snps_context_wrapper)
			new_candidate_sample_size = len(looped_candidate_gene_snp_index_ls)
			if new_candidate_sample_size>=candidate_sample_size:	#pvalue = Prob(X>=candidate_sample_size)
				no_of_hits += 1
			i+=1
			if no_of_min_breaks>0 and no_of_hits>=no_of_min_breaks:	#if no_of_min_breaks<=0, no smart breaking
				break
		pvalue = no_of_hits/float(i)
		return_data = PassingData(pvalue=pvalue, no_of_tests=i, no_of_tests_passed=no_of_hits)
		if self.debug:
			sys.stderr.write("%s/%s tests in total. Done.\n"%(no_of_hits, i))
		return return_data
Example #38
0
	def get_enrichment_pvalue_by_random_gene_list(self, sample_pvalue, total_gene_id_ls, candidate_gene_set, \
												total_chr_pos_ls, snps_context_wrapper, top_snp_chr_pos_ls, n,k,\
												no_of_permutations=20000, no_of_min_breaks=30):
		"""
		2008-10-22
		"""
		if self.debug:
			sys.stderr.write("Getting enrichment pvalue by random gene list ... ")
		i = 0
		no_of_hits = 0
		no_of_candidate_genes = len(candidate_gene_set)
		no_of_total_snps = len(total_chr_pos_ls)
		no_of_top_snps = len(top_snp_chr_pos_ls)
		while i<no_of_permutations:
			random_candidate_gene_set = Set(random.sample(total_gene_id_ls, no_of_candidate_genes))
			random_candidate_gene_snp_gw_index_ls = self.get_candidate_gene_snp_index_ls(random_candidate_gene_set, total_chr_pos_ls, snps_context_wrapper)
			random_candidate_gene_snp_sample_index_ls = self.get_candidate_gene_snp_index_ls(random_candidate_gene_set, top_snp_chr_pos_ls, snps_context_wrapper)
			x = len(random_candidate_gene_snp_sample_index_ls)
			m = len(random_candidate_gene_snp_gw_index_ls)
			n = no_of_total_snps - m
			k = no_of_top_snps
			new_sample_pvalue = rpy.r.phyper(x-1,m,n,k, lower_tail = rpy.r.FALSE)
			if new_sample_pvalue<=sample_pvalue:	#watch: pvalue = Prob(X<=sample_pvalue). chance of getting more significant (smaller) pvalues
				no_of_hits += 1
			i+=1
			if no_of_min_breaks>0 and no_of_hits>=no_of_min_breaks:	#if no_of_min_breaks<=0, no smart breaking
				break
		pvalue = no_of_hits/float(i)
		return_data = PassingData(pvalue=pvalue, no_of_tests=i, no_of_tests_passed=no_of_hits)
		if self.debug:
			sys.stderr.write("%s/%s tests in total. Done.\n"%(no_of_hits, i))
		return return_data
Example #39
0
	def prepareDataForHGTest(self, rm, snps_context_wrapper, candidate_gene_list, results_directory=None, min_MAF=None, \
							no_of_top_snps=None,\
							db_250k=None):
		"""
		2012.3.23
			add argument db_250k
		2008-08-20
		"""
		sys.stderr.write("Preparing data for HG test ... ")
		genome_wide_result = db_250k.getResultMethodContent(rm.id, results_directory, min_MAF)
		genome_wide_result.data_obj_ls.sort()	#each SNP data object defines comparison based on its value
		genome_wide_result.data_obj_ls.reverse()	#reverse the ascending sort to get value-descending order
		candidate_gene_set = Set(candidate_gene_list)
		candidate_gene_in_top_set = Set([])
		non_candidate_gene_in_top_set = Set([])
		for i in range(no_of_top_snps):
			data_obj = genome_wide_result.data_obj_ls[i]
			snps_context_matrix = snps_context_wrapper.returnGeneLs(data_obj.chromosome, data_obj.position)
			for snps_context in snps_context_matrix:
				snps_id, disp_pos, gene_id = snps_context
				if gene_id in candidate_gene_set:
					candidate_gene_in_top_set.add(gene_id)
				else:
					non_candidate_gene_in_top_set.add(gene_id)
		passingdata = PassingData(candidate_gene_in_top_set=candidate_gene_in_top_set, non_candidate_gene_in_top_set=non_candidate_gene_in_top_set)
		sys.stderr.write("Done.\n")
		return passingdata
Example #40
0
    def organizeProbesIntoChromosome(cls, xy_ls, chr_pos_ls, probes_id_ls):
        """
		2010-4-29
			add chr_pos2index to map (chr, pos) to its index in chr_pos_ls
		2009-11-24
			split out of calculateProbeQuartilePerChromosome()
			xy_ls, chr_pos_ls, probes_id_ls are already in chromosomal order.
		"""
        sys.stderr.write("Getting probes into each chromosome ...")
        chr2xy_ls = {}
        chr2probe_id_ls = {}
        chr_pos2index = {}  # 2010-4-29. map (chr, pos) to its index in chr_pos_ls
        for i in range(len(xy_ls)):
            chr, pos = chr_pos_ls[i]
            if chr not in chr2xy_ls:
                chr2xy_ls[chr] = []
                chr2probe_id_ls[chr] = []  #start an empty probe-id list for this chromosome
            chr2xy_ls[chr].append(xy_ls[i])
            chr2probe_id_ls[chr].append(probes_id_ls[i])
            chr_pos2index[(chr, pos)] = i
        sys.stderr.write("Done.\n")
        return PassingData(chr2xy_ls=chr2xy_ls,
                           chr2probe_id_ls=chr2probe_id_ls,
                           chr_pos2index=chr_pos2index)
    def getScoreRankFromPermIndexLs(self, genome_wide_result,
                                    candidate_gene_snp_index_ls,
                                    non_candidate_gene_snp_index_ls):
        """
		2008-10-21
		"""
        sys.stderr.write("Getting Score rank data given index ls...")
        candidate_score_ls = []
        non_candidate_score_ls = []
        candidate_rank_ls = []
        non_candidate_rank_ls = []
        for index in candidate_gene_snp_index_ls:
            if genome_wide_result.data_obj_ls[index]:
                candidate_score_ls.append(
                    genome_wide_result.data_obj_ls[index].value)
        no_of_candidate_scores = len(candidate_score_ls)

        for index in non_candidate_gene_snp_index_ls:
            if genome_wide_result.data_obj_ls[index]:
                non_candidate_score_ls.append(
                    genome_wide_result.data_obj_ls[index].value)

        total_score_ls = candidate_score_ls + non_candidate_score_ls
        import rpy
        rank_ls = rpy.r.rank(total_score_ls)
        candidate_rank_ls = rank_ls[:no_of_candidate_scores]
        non_candidate_rank_ls = rank_ls[no_of_candidate_scores:]

        score_rank_data = PassingData(candidate_score_ls=candidate_score_ls, candidate_rank_ls=candidate_rank_ls,\
              non_candidate_score_ls=non_candidate_score_ls, non_candidate_rank_ls=non_candidate_rank_ls)
        sys.stderr.write("Done.\n")
        return score_rank_data
Example #42
0
	def parseFastaDescriptionForFullVervetBACs(self, descriptionLine=None, FigureOutTaxID_ins=None):
		"""
		2011-7-6
			
		possible header lines:
			
		>gi|285026568|gb|AC239257.2| Chlorocebus aethiops chromosome UNK clone CH252-270J24, WORKING DRAFT SEQUENCE, 2 unordered pieces
		>gi|281332227|gb|AC238852.3| Chlorocebus aethiops BAC clone CH252-133A18 from chromosome 3, complete sequence
		>gi|285002488|gb|AC239185.3| Chlorocebus aethiops BAC clone CH252-404N12 from chromosome unknown, complete sequence
		
		
		"""
		header = descriptionLine[1:-1]	#discard '>' and '\n'
		header = header.split('|')
		_tax_id = None
		p_chromosome = re.compile(r'UNK clone ([^,]+),')	# 1st type of clone description
		p2_chromosome = re.compile(r'clone ([^,]+),')	# 2nd type of clone description
		
		if p_chromosome.search(header[4]) is not None:
			chromosome = p_chromosome.search(header[4]).groups()[0]
		else:
			if p2_chromosome.search(header[4]) is not None:
				chromosome = p2_chromosome.search(header[4]).groups()[0]
			else:
				chromosome = None
		gi = int(header[1])
		acc_ver = header[3]
		comment = header[4]
		return PassingData(tax_id=_tax_id, gi=gi, comment=comment, acc_ver=acc_ver, chromosome=chromosome)
Example #43
0
    def mergeSegmentsForOneArray(self, db_250k, array_id, raw_cnv_method_id = None, max_gap_ratio=0.3, \
          max_gap_len=None, maxDeletionLength=50000, param_obj=None):
        """
		2010-7-29
		"""
        sys.stderr.write("Merging segments for array %s ... \n" % array_id)
        query = Stock_250kDB.CNVCall.query.filter_by(cnv_method_id=raw_cnv_method_id).\
         filter_by(array_id=array_id).filter_by(cnv_type_id=param_obj.cnv_type_id).\
         order_by(Stock_250kDB.CNVCall.chromosome).order_by(Stock_250kDB.CNVCall.start).order_by(Stock_250kDB.CNVCall.stop)

        segment_ls = []
        for row in query:
            segment = [row.chromosome, row.start, row.stop, row.start_probe_id, row.stop_probe_id, row.no_of_probes_covered, \
              row.size_affected,\
              row.amplitude, row.probability]
            segment_ls.append(segment)

        merged_segment_ls = self.mergeOverlappingORCloseSegmentsByGraph(segment_ls, max_reciprocal_gap_ratio=max_gap_ratio, \
                      max_gap_len=max_gap_len, mergeFunc=self.mergeTwoSegments,\
                      maxDeletionLength=maxDeletionLength,\
                      maxNeighborDist=getattr(param_obj, 'maxNeighborDist', 5000))

        from CNVPredictDeletionBySVM import CNVPredictDeletionBySVM
        for merged_segment in merged_segment_ls:
            chromosome, start, stop, start_probe_id, stop_probe_id, no_of_probes_covered, size_affected,\
              amplitude, probability = merged_segment[:9]
            cnv_segment_obj = PassingData(array_id=array_id, start_probe_id=start_probe_id, stop_probe_id=stop_probe_id, \
                 no_of_probes=no_of_probes_covered, amplitude=amplitude, segment_length=size_affected, \
                 segment_chromosome=chromosome, \
                 segment_start_pos=start, segment_stop_pos=stop, \
                 median_intensity=None, probability=probability)
            CNVPredictDeletionBySVM.saveSegmentObj(param_obj, cnv_segment_obj)
        sys.stderr.write("Done.\n")
Example #44
0
	def run(self):
		if self.debug:
			import pdb
			pdb.set_trace()
		
		db = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.db_user,
				  			password=self.db_passwd, hostname=self.hostname, database=self.dbname, 
				   			schema=self.schema)
		db.setup(create_tables=False)
		session = db.session
		
		array_id2median_intensity = self.get_array_id2median_intensity(min_array_median_intensity=self.min_array_median_intensity)
		arrays_to_form_model = self.getModelArrays(db, self.training_cnv_method_id, array_id2median_intensity)
		if self.debug:	# 2010-7-25 for debug, temporary
			arrays_to_form_model = arrays_to_form_model[:4]
		
		array_id2model = self.constructSVMModels(db, arrays_to_form_model, array_id2median_intensity,\
						minPercUnCoveredByLerContig=self.minPercUnCoveredByLerContig, cnv_method_id=self.training_cnv_method_id,\
						C=self.SVM_C, gamma=self.SVM_gamma, eps=self.SVM_eps, deletedFractionType=self.deletedFractionType)
		
		array_id2model_array_id_ls = self.mapAnyArray2ModelArray(array_id2median_intensity, array_id2model, \
															max_median_intensity_dist=self.max_median_intensity_dist,\
															minNoOfModelArrays=self.minNoOfModelArrays)
		param_obj = PassingData(session=session, no_of_total=0, no_of_into_db=0, report=self.report,\
							cnv_method_id=self.cnv_method_id, cnv_type_id=self.cnv_type_id)
		
		self.predictALLSegments(self.input_fname, array_id2model_array_id_ls, array_id2model,\
						max_amplitude=self.max_amplitude, param_obj=param_obj)
		session.flush()
		session.expunge_all()
		session.commit()
	def run(self):
		"""
		11-13-05 
			--db_connect()
			--parse_entrezgene_xml_file()
				--is_gi_valid_in_annot_assembly_table()
				--find_info_dict()
					--return_location_list()
				--submit_to_entrezgene_mapping_table()
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		
		sys.stderr.write("\tTotally, %d files to be processed.\n"%len(self.inputfiles))
		db = GenomeDatabase(drivername=self.drivername, username=self.db_user,
						password=self.db_passwd, hostname=self.hostname, database=self.dbname, schema=self.schema)
		db.setup(create_tables=False)	#2010-6-22
		session = db.session
		param_obj = PassingData(session=db.session, no_of_genes_already_in_db=0, no_of_entrezgene_mappings_already_in_db=0,\
					no_of_total=0, no_of_into_db=0, report=self.report, no_of_commentaries_already_in_db=0,\
					no_of_gene_segments_already_in_db=0, no_of_gene2go_already_in_db=0)
		for f in self.inputfiles:
			sys.stderr.write("%d/%d:\t%s\n"%(self.inputfiles.index(f)+1,len(self.inputfiles),f))
			self.parse_xml_file(session, f, tax_id=self.tax_id, param_obj=param_obj)
		
		session.flush()
		if self.commit:
			session.commit()
		else:
			session.rollback()
    def avgKey2DataLs(self, key2dataLs, no_of_key_columns=1, header=[]):
        """
		2012.1.9
			1. take mean/median/stdev of every cell in dataLs,
			2. modify newHeader to reflect that
		"""
        sys.stderr.write("Averaging key2dataLs (%s entries ) ..." %
                         (len(key2dataLs)))
        newKey2DataLs = {}
        newHeader = []
        keyColHeader = header[:no_of_key_columns]
        valueColHeader = header[no_of_key_columns:]
        newValueColHeader = []
        no_of_value_columns = len(valueColHeader)
        for i in xrange(no_of_value_columns):
            valueColName = valueColHeader[i]
            newValueColHeader += [
                'mean_%s' % (valueColName),
                'median_%s' % (valueColName),
                'stdev_%s' % (valueColName)
            ]

        for key, dataLs in key2dataLs.iteritems():
            if key not in newKey2DataLs:
                newKey2DataLs[key] = []
            no_of_value_columns = len(dataLs)
            for i in xrange(no_of_value_columns):
                meanValue = numpy.mean(dataLs[i])
                medianValue = numpy.median(dataLs[i])
                stdev = numpy.std(dataLs[i])
                newKey2DataLs[key] += [meanValue, medianValue, stdev]
        sys.stderr.write("Done.\n")
        return PassingData(key2dataLs=newKey2DataLs,
                           header=keyColHeader + newValueColHeader)
	def get_enrichment_pvalue_by_gw_looping(self, candidate_sample_size, top_loci_in_cumu_pos, candidate_gene_set=None, \
							genomeRBDict=None, cumuSpan2ChrRBDict=None, no_of_permutations=20000, \
							no_of_min_breaks=30,\
							param_data=None):
		"""
		2011-3-18
			do the test against permData.captured_candidate_gene_set
		2011-3-12
			get enrichment pvalue by genome-wide looping of SNP positions, a permutation scheme that preserves LD.
		"""
		if self.debug:
			sys.stderr.write("Getting enrichment pvalue by gw-looping ... ")
		i = 0
		no_of_hits = 0
		while i<no_of_permutations:
			permuted_top_loci_in_chr_start_stop = self.applyGWLoopToCumuPos(top_loci_in_cumu_pos, cumuSpan2ChrRBDict)
			
			permData = self.prepareDataForPermutationRankTest(permuted_top_loci_in_chr_start_stop, genomeRBDict, param_data)
			new_candidate_sample_size = len(permData.captured_candidate_gene_set)
			if new_candidate_sample_size>=candidate_sample_size:	#pvalue = Prob(X>=candidate_sample_size)
				no_of_hits += 1
			i+=1
			if no_of_min_breaks>0 and no_of_hits>=no_of_min_breaks:	#if no_of_min_breaks<=0, no smart breaking
				break
		pvalue = no_of_hits/float(i)
		return_data = PassingData(pvalue=pvalue, no_of_tests=i, no_of_tests_passed=no_of_hits)
		if self.debug:
			sys.stderr.write("%s/%s tests in total. Done.\n"%(no_of_hits, i))
		return return_data
Example #48
0
 def getAlignmentMatrix(self, alignment_id):
     sys.stderr.write("Getting alignment matrix for alignment=%s ..." %
                      (alignment_id))
     snp_pos_ls = []
     accession_id_ls = []
     name_ls = []
     data_matrix = []
     rows = Sequence.query.filter_by(alignment=alignment_id).order_by(
         Sequence.accession).all()
     counter = 0
     for row in rows:
         if counter == 0:
             snp_pos_ls = self.get_snp_pos_ls(row.alignment_obj.target,
                                              row.alignment_obj.chromosome,
                                              row.alignment_obj.start)
         accession_id_ls.append(row.accession)
         name_ls.append(row.accession_obj.name)
         data_row = dict_map(nt2number, row.bases)
         data_matrix.append(data_row)
         counter += 1
     data_matrix = num.array(data_matrix, num.int8)
     passingdata = PassingData(snp_pos_ls=snp_pos_ls,
                               accession_id_ls=accession_id_ls,
                               name_ls=name_ls,
                               data_matrix=data_matrix)
     sys.stderr.write(' %s accessions, %s bases. Done.\n' %
                      (len(accession_id_ls), len(snp_pos_ls)))
     return passingdata
Example #49
0
    def remove_rows_with_too_many_NAs(
        cls, data_matrix, row_cutoff, cols_with_too_many_NAs_set=None, NA_set=Set([0, -2]), debug=0, is_cutoff_max=0
    ):
        """
		2008-05-19
			if is_cutoff_max=1, anything > row_cutoff is deemed as having too many NAs
			if is_cutoff_max=0 (cutoff is minimum), anything >= row_cutoff is deemed as having too many NAs
		2008-05-12
			made more robust
			add cols_with_too_many_NAs_set
			add NA_set
		2008-05-08
			become classmethod
		"""
        sys.stderr.write("Removing rows with NA rate >= %s ..." % (row_cutoff))
        no_of_rows, no_of_cols = data_matrix.shape
        rows_with_too_many_NAs_set = Set()
        total_cols_set = Set(range(no_of_cols))
        if cols_with_too_many_NAs_set:
            cols_to_be_checked = total_cols_set - cols_with_too_many_NAs_set
        else:
            cols_to_be_checked = total_cols_set
        row_index2no_of_NAs = {}
        for i in range(no_of_rows):
            no_of_NAs = 0.0
            for j in cols_to_be_checked:
                if data_matrix[i][j] in NA_set:
                    no_of_NAs += 1
            if no_of_cols != 0:
                NA_ratio = no_of_NAs / no_of_cols
            else:
                NA_ratio = 0.0
            row_index2no_of_NAs[i] = NA_ratio
            if is_cutoff_max:
                if NA_ratio > row_cutoff:
                    rows_with_too_many_NAs_set.add(i)
            else:
                if NA_ratio >= row_cutoff:
                    rows_with_too_many_NAs_set.add(i)
        if debug:
            print
            print "rows_with_too_many_NAs_set"
            print rows_with_too_many_NAs_set
        passingdata = PassingData(rows_with_too_many_NAs_set=rows_with_too_many_NAs_set)
        passingdata.row_index2no_of_NAs = row_index2no_of_NAs
        sys.stderr.write("%s strains removed, done.\n" % len(rows_with_too_many_NAs_set))
        return passingdata
Example #50
0
	def getStrainidTargetidFromFile(self, db, QC_method_id, input_fname, max_mismatch_rate, min_no_of_non_NAs=20):
		"""
		2008-09-10
			column in input_fname is determined on the fly
		2008-08-29
			to get strain id and target id set from the qc_cross_match result file.
		"""
		sys.stderr.write("Getting set of strain_id & target_id ... \n")
		reader = csv.reader(open(input_fname), delimiter='\t')
		#figure out which variable is in which column
		header = reader.next()
		col_name2index = {}
		for i in range(len(header)):
			column_name = header[i]
			col_name2index[column_name] = i
		
		strain_id_set = Set()
		target_id_set = Set()
		i = 0
		for row in reader:
			#id, strainid, target_id, qc_method_id, mismatch_rate, no_of_mismatches, no_of_non_NA_pairs, readme_id =row
			strainid = int(row[col_name2index['strainid']])	#2008-09-10
			target_id = int(row[col_name2index['target_id']])
			qc_method_id = int(row[col_name2index['qc_method_id']])
			mismatch_rate = float(row[col_name2index['mismatch_rate']])
			no_of_mismatches = int(row[col_name2index['no_of_mismatches']])
			no_of_non_NA_pairs = int(row[col_name2index['no_of_non_NA_pairs']])
			if qc_method_id == QC_method_id and no_of_non_NA_pairs>=min_no_of_non_NAs and mismatch_rate<=max_mismatch_rate:
				if QC_method_id==4:	#strain_id_set = target_id_set
					strain_id_set.add(strainid)
					strain_id_set.add(target_id)
				else:
					strain_id_set.add(strainid)
					target_id_set.add(target_id)
			i +=1
			if self.report and i%100000==0:
				sys.stderr.write("%s\t%s"%('\x08'*40, i))
			if self.debug and i>1000000:
				break
		if self.report:
			sys.stderr.write("%s\t%s\n"%('\x08'*40, i))
		return_data = PassingData()
		return_data.strain_id_set = strain_id_set
		return_data.target_id_set = target_id_set
		del reader
		sys.stderr.write("%s strainids and %s target_ids. Done.\n"%(len(strain_id_set), len(target_id_set)))
		return return_data
Example #51
0
    def rm2result(cls, session, rm=None, chr_pos2db_id=None, max_rank=1000, commit=False, min_rank=1, results_directory=None, \
       min_score=None,update=True,db_id2chr_pos=None, db_250k=None):
        """
		2012.3.23
			add argument db_250k
		2010-3-8
			add argument min_score to exclude SNPs whose scores are too low. This argument has an AND relationship with max_rank.
				log transformation is automatically determined based on analysis_method.smaller_score_more_significant in db.
		2009-11-2
			split out of run()
		"""

        # 2009-5-1 check whether it's already in db.
        db_entries = Stock_250kDB.Results.query.filter_by(results_id=rm.id)
        result_exists = False
        if db_entries.count() == max_rank - min_rank + 1:
            if update:
                db_entries.delete()
                sys.stderr.write("%s already in db. Deleting rows.\n" % rm.id)
            else:
                sys.stderr.write("%s already in db. Ignore.\n" % rm.id)

        param_data = PassingData(min_MAC=0, db_id2chr_pos=db_id2chr_pos)
        genome_wide_result = db_250k.getResultMethodContent(rm.id, results_directory=results_directory, min_MAF=0., \
                      pdata=param_data, min_value_cutoff=min_score)

        counter = 0
        no_of_saved = 0
        if genome_wide_result:
            for rank in range(min_rank, max_rank + 1):
                if rank > len(
                        genome_wide_result.data_obj_ls
                ):  # rank has gone past the total number of SNPs. break the for loop.
                    break
                data_obj = genome_wide_result.get_data_obj_at_given_rank(rank)
                if data_obj is not None:
                    counter += 1
                    snps_id = chr_pos2db_id.get(
                        (data_obj.chromosome, data_obj.position))
                    if data_obj.extra_col_ls:
                        result_obj = cPickle.dumps(data_obj.extra_col_ls)
                    else:
                        result_obj = None
                    # 2010-3-8 check if it's in db now.
                    db_entries = Stock_250kDB.Results.query.filter_by(
                        results_id=rm.id).filter_by(snps_id=snps_id)
                    if db_entries.count() == 0:
                        Stock_250kDB.Results(snps_id=snps_id, results_id=rm.id, score=data_obj.value, rank=rank, beta=getattr(data_obj, 'beta1', None),\
                             maf=data_obj.maf, mac=data_obj.mac, genotype_var_perc=data_obj.genotype_var_perc,\
                             correlation=getattr(data_obj, 'correlations', None), odds_ratio=getattr(data_obj, "odds_ratio_est",None),\
                             statistic=getattr(data_obj, "statistics",None), object=result_obj)
                        no_of_saved += 1
        if commit:
            session.flush()
            #session.commit()
        else:
            session.rollback()
        sys.stderr.write("%s out of %s saved in db.\n" %
                         (no_of_saved, counter))
Example #52
0
	def mapEachChromosome(self, workflow=None, alignmentData=None, chromosome=None,\
				VCFJobData=None, passingData=None, reduceBeforeEachAlignmentData=None, transferOutput=True, **keywords):
		"""
		2012.9.17
		"""
		if workflow is None:
			workflow = self
		returnData = PassingData(no_of_jobs = 0)
		returnData.jobDataLs = []
		
		topOutputDirJob = passingData.topOutputDirJob
		
		alignment = alignmentData.alignment
		parentJobLs = alignmentData.jobLs
		bamF = alignmentData.bamF
		baiF = alignmentData.baiF
		bamFnamePrefix = passingData.bamFnamePrefix
		
		
		"""
		#2012.9.21 perhaps a downsampling job
		outputFname = os.path.join(topOutputDirJob.output, '%s_%s.bam'%(bamFnamePrefix, overlapFileBasenameSignature))
		outputFile = File(outputFname)
		selectAlignmentJob, bamIndexJob1 = self.addSelectAlignmentJob(executable=workflow.samtools, inputFile=bamF, \
				outputFile=outputFile, region=overlapInterval, parentJobLs=[topOutputDirJob] + parentJobLs, \
				extraDependentInputLs=[baiF], transferOutput=False, \
				extraArguments=None, job_max_memory=2000, needBAMIndexJob=True)
		"""
		
		"""
		#2012.9.21 count covariates job is moved to map()
		recalFile = File(os.path.join(topOutputDirJob.output, '%s_%s.recal_data.csv'%(bamFnamePrefix, chromosome)))
		countCovariatesJob = self.addGATKBaseRecalibratorJob(GenomeAnalysisTKJar=workflow.GenomeAnalysisTK2Jar, inputFile=bamF, \
								VCFFile=VCFFile, interval=chromosome, outputFile=recalFile, \
								refFastaFList=passingData.refFastaFList, parentJobLs=[topOutputDirJob]+parentJobLs, 
								extraDependentInputLs=[baiF, VCFFile.tbi_F], \
								transferOutput=False, \
								extraArguments=None, job_max_memory=4000)
		
		self.no_of_jobs += 1
		returnData.countCovariatesJob = countCovariatesJob
		returnData.jobDataLs.append(PassingData(jobLs=[countCovariatesJob], file=countCovariatesJob.recalFile, \
											fileLs=[countCovariatesJob.recalFile]))
		"""
		
		return returnData
	def preReduce(self, workflow=None, outputDirPrefix="", passingData=None, transferOutput=True, **keywords):
		"""
		2012.9.17
		"""
		parentPreReduceData = AbstractVervetWorkflow.preReduce(self, workflow=workflow, outputDirPrefix=outputDirPrefix, passingData=passingData, \
							transferOutput=transferOutput, **keywords)
		returnData = PassingData(no_of_jobs = 0)
		returnData.jobDataLs = []
		
		callOutputDir = "call"
		callOutputDirJob = self.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=callOutputDir)
		passingData.callOutputDirJob = callOutputDirJob
		
		matrixDir = "pairwiseDistMatrix"
		matrixDirJob = self.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=matrixDir)
		passingData.matrixDirJob = matrixDirJob
		
		reduceOutputDirJob = passingData.reduceOutputDirJob
		#2012.10.9 reduceOutputDirJob was added to passingData during AbstractVCFWorkflow.preReduce()
		
		#reduceOutputDir = "aggregateData"
		#reduceOutputDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=reduceOutputDir)
		#passingData.reduceOutputDirJob = reduceOutputDirJob
		
		figureFnamePrefix = os.path.join(reduceOutputDirJob.output, 'aggregateDistanceMatrix')
		aggregateDistanceMatrixOutputF = File('%s.tsv'%(figureFnamePrefix))
		PCAFile = File('%s_PCA.tsv'%(figureFnamePrefix))
		aggregateAndHClusterDistanceMatrixJob = self.addStatMergeJob(workflow, statMergeProgram=workflow.AggregateAndHClusterDistanceMatrix, \
									outputF=aggregateDistanceMatrixOutputF, \
									parentJobLs=[reduceOutputDirJob],extraOutputLs=[PCAFile, File('%s.png'%(figureFnamePrefix)), \
																				File('%s.svg'%(figureFnamePrefix))], \
									extraDependentInputLs=[], transferOutput=True, extraArguments="-f %s"%(figureFnamePrefix))
		returnData.aggregateAndHClusterDistanceMatrixJob = aggregateAndHClusterDistanceMatrixJob
		
		#2012.9.5 add the job to append meta info (country, sex, latitude, etc. of each monkey)
		outputF = File('%s_withMetaInfo.tsv'%(figureFnamePrefix))
		appendInfo2PCAOutputJob = self.addGenericDBJob(executable=self.AppendInfo2SmartPCAOutput, inputFile=PCAFile, \
				outputFile=outputF, \
				parentJobLs=[aggregateAndHClusterDistanceMatrixJob], extraDependentInputLs=None, \
				extraOutputLs=None,\
				transferOutput=True, \
				extraArgumentList=None, extraArguments=None, sshDBTunnel=self.needSSHDBTunnel, \
				key2ObjectForJob=None, job_max_memory=2000)
		
		
		return returnData
Example #54
0
	def run(self):
		self.communicator = MPI.world.duplicate()
		node_rank = self.communicator.rank
		free_computing_nodes = range(1, self.communicator.size-1)	#exclude the 1st and last node
		free_computing_node_set = Set(free_computing_nodes)
		output_node_rank = self.communicator.size-1
		
		db = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.db_user,
				   password=self.db_passwd, hostname=self.hostname, database=self.dbname, schema=self.schema)
		db.setup(create_tables=False)
		session = db.session
		session.begin()
		
		if node_rank == 0:
			snps_context_wrapper = self.dealWithSnpsContextWrapper(self.snps_context_picklef, self.min_distance, self.get_closest)
			if not self.results_id_ls:
				pdata = PassingData(call_method_id=self.call_method_id, analysis_method_id=self.analysis_method_id)
				self.results_id_ls = self.getResultsMethodIDLs(pdata)
			
			snps_context_wrapper_pickle = cPickle.dumps(snps_context_wrapper, -1)
			for node in free_computing_nodes:	#send it to the computing_node
				sys.stderr.write("passing initial data to nodes from %s to %s ... "%(node_rank, node))
				self.communicator.send(snps_context_wrapper_pickle, node, 0)
				sys.stderr.write(".\n")
			del snps_context_wrapper_pickle, snps_context_wrapper
		elif node_rank in free_computing_node_set:
			data, source, tag = self.communicator.receiveString(0, 0)
			snps_context_wrapper =  cPickle.loads(data)
			del data
		else:
			pass
		
		self.synchronize()
		if node_rank == 0:
			param_obj = PassingData(params_ls=self.results_id_ls, output_node_rank=output_node_rank, report=self.report, counter=0)
			self.input_node(param_obj, free_computing_nodes, input_handler=self.input_fetch_handler, message_size=self.message_size)
		elif node_rank in free_computing_node_set:
			param_data = PassingData(session=session)
			param_data.results_directory = self.input_db_directory
			param_data.default_output_db_directory = self.default_output_db_directory
			param_data.output_db_directory = self.output_db_directory
			param_data.commit = self.commit
			param_data.min_MAF = self.min_MAF
			param_data.min_distance = self.min_distance
			param_data.get_closest = self.get_closest
			param_data.snps_context_wrapper = snps_context_wrapper
			self.computing_node(param_data, self.computing_node_handler)
		else:
			param_obj = PassingData()
			self.output_node(free_computing_nodes, param_obj, self.output_node_handler)
		self.synchronize()	#to avoid some node early exits
	def remove_rows_with_too_many_NAs(cls, data_matrix, row_cutoff, cols_with_too_many_NAs_set=None, NA_set=set([0, -2]), debug=0, is_cutoff_max=0):
		"""
		2008-05-19
			if is_cutoff_max=1, anything > row_cutoff is deemed as having too many NAs
			if is_cutoff_max=0 (cutoff is minimum), anything >= row_cutoff is deemed as having too many NAs
		2008-05-12
			made more robust
			add cols_with_too_many_NAs_set
			add NA_set
		2008-05-08
			become classmethod
		"""
		sys.stderr.write("Removing rows with NA rate >= %s ..."%(row_cutoff))
		no_of_rows, no_of_cols = data_matrix.shape
		rows_with_too_many_NAs_set = set()
		total_cols_set = set(range(no_of_cols))
		if cols_with_too_many_NAs_set:
			cols_to_be_checked = total_cols_set - cols_with_too_many_NAs_set
		else:
			cols_to_be_checked = total_cols_set
		row_index2no_of_NAs = {}
		for i in range(no_of_rows):
			no_of_NAs = 0.0
			for j in cols_to_be_checked:
				if data_matrix[i][j] in NA_set:
					no_of_NAs += 1
			if no_of_cols!=0:
				NA_ratio = no_of_NAs/no_of_cols
			else:
				NA_ratio = 0.0
			row_index2no_of_NAs[i] = NA_ratio
			if is_cutoff_max:
				if NA_ratio > row_cutoff:
					rows_with_too_many_NAs_set.add(i)
			else:
				if NA_ratio >= row_cutoff:
					rows_with_too_many_NAs_set.add(i)
		if debug:
			print
			print 'rows_with_too_many_NAs_set'
			print rows_with_too_many_NAs_set
		passingdata = PassingData(rows_with_too_many_NAs_set=rows_with_too_many_NAs_set)
		passingdata.row_index2no_of_NAs = row_index2no_of_NAs
		sys.stderr.write("%s strains removed, done.\n"%len(rows_with_too_many_NAs_set))
		return passingdata
Example #56
0
def getEcotypeInfo(db, country_order_type=1):
    """
	2009-09-2
		add region into ecotype_obj
	2008-10-08
		use ecotype_id2ecotype_obj to summarize
			ecotypeid2pos
			ecotypeid2nativename
			ecotypeid2country
	2008-10-08
		add option order_by_type
		get country2order
		moved from PlotGroupOfSNPs.py
		the db handle is not restricted to the stock database. could be any database on the same server.
		BUT StockDB has to be imported in the program where db connection is established just so that StockDB.Ecotype.table is setup while StockDB.Ecotype.table.metadata is None.
	2008-10-07
	"""
    sys.stderr.write("Getting  Ecotype info ... ")
    import StockDB
    from pymodule import PassingData
    ecotype_info = PassingData()
    if country_order_type == 1:
        order_seq_sentence = 'c.latitude, c.longitude'
    else:
        order_seq_sentence = 'c.longitude, c.latitude'
    rows = db.metadata.bind.execute("select e.id as ecotype_id, e.nativename, e.latitude, e.longitude, a.region, \
		c.abbr as country, c.latitude as country_latitude, \
		c.longitude as country_longitude \
		from stock.%s e, stock.%s s, stock.%s a, stock.%s c where e.siteid=s.id and s.addressid=a.id and \
		a.countryid=c.id order by %s "                                   %(getattr(StockDB.Ecotype.table, 'name', 'ecotype'), \
     getattr(StockDB.Site.table, 'name', 'site'), getattr(StockDB.Address.table, 'name', 'address'), \
     getattr(StockDB.Country.table, 'name', 'country'), order_seq_sentence))
    ecotype_id2ecotype_obj = {}
    country2order = {}
    for row in rows:
        ecotype_obj = PassingData()
        for key, value in row.items():  #not iteritems() for RowProxy object
            setattr(ecotype_obj, key, value)
        ecotype_id2ecotype_obj[row.ecotype_id] = ecotype_obj
        if row.country not in country2order:
            country2order[row.country] = len(country2order)
    ecotype_info.ecotype_id2ecotype_obj = ecotype_id2ecotype_obj
    ecotype_info.country2order = country2order
    sys.stderr.write("%s ecotypes.\n" % (len(ecotype_id2ecotype_obj)))
    return ecotype_info
Example #57
0
    def getTranslationDataStructureForBackgroundLoci(self,
                                                     db_250k,
                                                     cnv_method_id=None,
                                                     min_MAF=0.1):
        """
		2011-4-22
			1. get all loci whose MAF is above min_MAF
			2. construct a (chr,start,stop) 2 cumu_start dictionary
			3. construct a (cumu_start, cumu_stop) 2 (chr, start, stop) RBDict
			
		"""
        sys.stderr.write("Getting translation structures between (chr, start, stop) and (cumu_start, cumu_stop) for cnv method %s ..."%\
            cnv_method_id)
        TableClass = Stock_250kDB.CNV
        query = TableClass.query.filter_by(
            cnv_method_id=cnv_method_id).order_by(
                TableClass.chromosome).order_by(TableClass.start)

        chrSpan2cumuStartRBDict = RBDict()
        cumuSpan2ChrSpanRBDict = RBDict()

        cumu_start = 0
        counter = 0
        real_counter = 0
        for row in query:
            counter += 1
            maf = min(row.frequency, 1 - row.frequency)
            if maf <= min_MAF:
                continue

            real_counter += 1
            chrSpanKey = CNVSegmentBinarySearchTreeKey(chromosome=row.chromosome, \
                span_ls=[row.start, row.stop], \
                min_reciprocal_overlap=0.00000000000001,)
            #2010-8-17 overlapping keys are regarded as separate instances as long as they are not identical.
            chrSpan2cumuStartRBDict[
                chrSpanKey] = cumu_start  #cumu_start is 0-based

            size = row.stop - row.start + 1
            span_ls = [cumu_start + 1, cumu_start + size]
            segmentKey = CNVSegmentBinarySearchTreeKey(chromosome=0, \
                span_ls=span_ls, \
                min_reciprocal_overlap=0.00000000000001,)
            #2010-8-17 overlapping keys are regarded as separate instances as long as they are not identical.
            if segmentKey not in cumuSpan2ChrSpanRBDict:
                cumuSpan2ChrSpanRBDict[segmentKey] = (row.chromosome,
                                                      row.start, row.stop)
            else:
                sys.stderr.write(
                    "Error: %s of chr %s is already in cumuSpan2ChrSpanRBDict.\n"
                    % (segmentKey, row.chromosome))

            cumu_start += size
        sys.stderr.write("%s out of %s CNVs are included. Done.\n" %
                         (real_counter, counter))
        return PassingData(cumuSpan2ChrSpanRBDict=cumuSpan2ChrSpanRBDict,
                           chrSpan2cumuStartRBDict=chrSpan2cumuStartRBDict)
    def run(self):
        """
		2011-10
		"""

        if self.debug:
            import pdb
            pdb.set_trace()

        workflow = self.initiateWorkflow()

        self.registerExecutables()
        self.registerCustomExecutables()

        callMethodID2Data = {}
        for call_method_id in self.call_method_id_ls:
            callMethod = Stock_250kDB.CallMethod.get(call_method_id)
            if callMethod and callMethod.filename:
                datasetFile = self.registerOneInputFile(inputFname=self.db_250k.supplantFilePathWithNewDataDir(filePath=callMethod.filename, \
                              oldDataDir=self.db_250k.data_dir, \
                              newDataDir=self.data_dir), \
                         folderName=self.pegasusFolderName)
                callMethodID2Data[callMethod.id] = PassingData(
                    datasetFile=datasetFile, db_entry=callMethod)
            else:
                sys.stderr.write(
                    "WARNING: call method %s is not in db or the filename column is empty.\n"
                    % (call_method_id))
        if self.kinship_fname:
            kinshipFile = self.registerOneInputFile(
                inputFname=self.kinship_fname,
                folderName=self.pegasusFolderName)
        else:
            kinshipFile = None
        if self.eigen_vector_fname:
            eigenVectorFile = self.registerOneInputFile(
                inputFname=self.eigen_vector_fname,
                folderName=self.pegasusFolderName)
        else:
            eigenVectorFile = None
        if self.genotype_fname_to_generate_kinship:
            genotypeFileToGenerateKinship = self.registerOneInputFile(inputFname=self.genotype_fname_to_generate_kinship, \
                         folderName=self.pegasusFolderName)
        else:
            genotypeFileToGenerateKinship = None

        self.addJobs(db_250k=self.db_250k, callMethodID2Data=callMethodID2Data, kinshipFile=kinshipFile, \
           eigenVectorFile=eigenVectorFile, phenotype_method_id_ls=self.phenotype_method_id_ls,\
           analysis_method_id_ls=self.analysis_method_id_ls, \
           genotypeFileToGenerateKinship=genotypeFileToGenerateKinship, \
           data_dir=self.data_dir, \
           getPublicPhenotype=self.getPublicPhenotype,\
           commit=self.commit, \
           transferOutput=True, needSSHDBTunnel=self.needSSHDBTunnel, outputDirPrefix="")
        # Write the workflow DAX to the output file (self.outputFname)
        outf = open(self.outputFname, 'w')
        self.writeXML(outf)
Example #59
0
	def getCNVFeatureData(cls,  db_250k, array_id=None, \
					minPercUnCoveredByLerContig=0.6, cnv_method_id=6, \
					replaceAmpWithMedianIntensity=False, deletedFractionType=1):
		"""
		2010-7-25
			add argument deletedFractionType
				1: CNVCall.percUnCoveredByLerContig
				2: CNVCall.fractionDeletedInPECoverageData
		2010-7-1
			moved from CNV.CNVPredictionBySVM in misc.py
		"""
		sys.stderr.write("Getting CNV feature data (amplitude, #probes, probe density,) array %s, cnv_method %s, minPercUnCoveredByLerContig %s ... \n"%\
						(array_id, cnv_method_id, minPercUnCoveredByLerContig))
		i = 0
		block_size = 5000
		real_counter = 0
		TableClass = Stock_250kDB.CNVCall
		query = TableClass.query.filter_by(array_id=array_id).filter_by(cnv_method_id=cnv_method_id)
		rows = query.offset(i).limit(block_size)
		session = db_250k.session
		
		ecotype_id = None
		percUnCoveredByLerContig_ls = []
		feature_data = []
		class_label_ls = []
		c_ls = []
		while rows.count()!=0:
			for row in rows:
				ecotype_id = row.array.maternal_ecotype_id
				if deletedFractionType==1:
					deletedFraction = row.percUnCoveredByLerContig
				else:
					deletedFraction = row.fractionDeletedInPECoverageData
				if deletedFraction is not None:
					#x_ls.append(row.amplitude)
					no_of_probes = math.log10(row.no_of_probes_covered)
					probeDensity = row.no_of_probes_covered*1000.0/(row.stop-row.start+1.0)
					if deletedFraction>=minPercUnCoveredByLerContig:
						class_label = -1
						real_counter += 1
					else:
						class_label = 1
					class_label_ls.append(class_label)
					if replaceAmpWithMedianIntensity:
						amp = row.median_intensity
					else:
						amp = row.amplitude
					feature_data.append([amp, no_of_probes, probeDensity ])
					percUnCoveredByLerContig_ls.append(deletedFraction)
				
				i += 1
			if i%5000==0:
				sys.stderr.write("%s%s\t%s"%('\x08'*80, i, real_counter))
			rows = query.offset(i).limit(block_size)
		sys.stderr.write("%s%s\t%s\n"%('\x08'*80, i, real_counter))
		return PassingData(feature_data=feature_data, class_label_ls=class_label_ls, \
						percUnCoveredByLerContig_ls=percUnCoveredByLerContig_ls, ecotype_id=ecotype_id)
Example #60
0
    def generate_parameters(self, parameter_names, parameter_depth=2):
        """
		2008-05-19
			min_call_probability = self.min_call_probability
		2008-05-11
			put NA rate into passing parameters as well. too much memory consumption on each computing node
		"""
        sys.stderr.write("Generating parameter settings ...")
        param_d = PassingData()
        for parameter_name in parameter_names:
            parameter_value = getattr(self, parameter_name)
            parameter_value = parameter_value.split(',')
            parameter_value = map(float, parameter_value)
            setattr(self, parameter_name, parameter_value)
        """
		#2008-05-19 commented out. use self.min_call_probability
		#figure out call probability from input_fname
		import re
		call_prob_pattern = re.compile(r'_(\d+)\.csv')
		call_prob_p_result = call_prob_pattern.search(self.input_fname)
		if call_prob_p_result:
			min_call_probability = float(call_prob_p_result.groups()[0])
		else:
			min_call_probability = -1
		"""
        min_call_probability = self.min_call_probability

        #only 1st 4, last 2 passed to computing node
        parameters = []
        for max_call_mismatch_rate in getattr(self, parameter_names[0]):
            for max_call_NA_rate in getattr(self, parameter_names[1]):
                for max_snp_mismatch_rate in getattr(self, parameter_names[2]):
                    for max_snp_NA_rate in getattr(self, parameter_names[3]):
                        for npute_window_size in getattr(
                                self, parameter_names[4]):
                            parameters.append([min_call_probability, max_call_mismatch_rate, max_call_NA_rate, \
                               max_snp_mismatch_rate, max_snp_NA_rate, npute_window_size])

        param_d.parameters = parameters
        param_d.max_snp_NA_rate_ls = self.max_snp_NA_rate_ls
        param_d.npute_window_size_ls = self.npute_window_size_ls
        sys.stderr.write(" %s parameter settings to process. Done.\n" %
                         len(parameters))
        return param_d
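The nested loops in Example #60 enumerate a full Cartesian grid; itertools.product expresses the same thing more compactly (a sketch, assuming the comma-separated option strings have already been parsed into lists):

import itertools

min_call_probability = 0.85
max_call_mismatch_rate_ls = [0.1, 0.2]
max_call_NA_rate_ls = [0.4]
max_snp_mismatch_rate_ls = [0.1, 0.2]
max_snp_NA_rate_ls = [0.4, 0.5]
npute_window_size_ls = [30]

parameters = [[min_call_probability] + list(combo)
              for combo in itertools.product(max_call_mismatch_rate_ls, max_call_NA_rate_ls,
                                             max_snp_mismatch_rate_ls, max_snp_NA_rate_ls,
                                             npute_window_size_ls)]
# len(parameters) == 2 * 1 * 2 * 2 * 1 == 8, the same grid the nested for-loops build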