Example #1
    def addJobs(self, inputURL=None, relativePathList =[], outputDir="", username=None, password=None, \
      transferOutput=True):
        """
		2012.6.27
		"""

        sys.stderr.write("Adding wget jobs for %s input ... " %
                         (len(relativePathList)))
        no_of_jobs = 0

        topOutputDir = outputDir
        topOutputDirJob = yh_pegasus.addMkDirJob(self,
                                                 mkdir=self.mkdirWrap,
                                                 outputDir=topOutputDir)
        no_of_jobs += 1
        returnData = PassingData()
        returnData.jobDataLs = []

        for relativePath in relativePathList:
            #2013.06.26 remove all "/" from  relativePath in case it's a folder
            relativePathNoFolder = relativePath.replace('/', '_')
            logFile = File('%s.log' % (relativePathNoFolder))
            wgetJob = self.addWgetJob(executable=self.wget, url=inputURL, relativePath=relativePath, \
               username=username, password=password,\
               targetFolder=outputDir, logFile=logFile, cut_dir_number=self.cut_dir_number, parentJobLs=[topOutputDirJob], extraDependentInputLs=[], \
               transferOutput=transferOutput, \
               extraArguments=None, job_max_memory=50)
            #collect this wget job and all of its output files into the returned job data
            returnData.jobDataLs.append(PassingData(jobLs=[wgetJob], file=wgetJob.output, \
                    fileLs=wgetJob.outputLs))
            no_of_jobs += 1
        sys.stderr.write("%s jobs.\n" % (no_of_jobs))

        return returnData
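
Every example in this list builds a PassingData, attaches results to it as attributes, and hands it back to the caller. A minimal stand-in, assuming PassingData is essentially a keyword-argument attribute bag (an assumption inferred from how it is used here, not the library's actual source), could look like this:

class PassingData(object):
    """Minimal sketch: keep keyword arguments as instance attributes."""
    def __init__(self, **keywords):
        for name, value in keywords.items():
            setattr(self, name, value)

returnData = PassingData(jobDataLs=[])
returnData.no_of_jobs = 3    # new attributes can be attached at any time
print(returnData.jobDataLs, returnData.no_of_jobs)
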
Example #2
    def inputNodePrepare(self, snp_info=None):
        """
		2009-2-16
			get phenData.phenotype_method_id_ls in the same order as phenData.col_id_ls
		2009-2-11
			refactored out of run()
		"""
        header, strain_acc_list, category_list, data_matrix = read_data(
            self.input_fname)
        snpData = SNPData(header=header, strain_acc_list=strain_acc_list, \
            data_matrix=data_matrix, turn_into_array=1) #category_list is not used to facilitate row-id matching

        picklef = open(self.snps_context_fname)
        snps_context_wrapper = cPickle.load(picklef)
        del picklef
        gene_id2snps_id_ls = self.get_gene_id2snps_id_ls(snps_context_wrapper)
        del snps_context_wrapper
        gene_id_ls = gene_id2snps_id_ls.keys()
        gene_id_ls.sort()

        header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(
            self.phenotype_fname, turn_into_integer=0)
        phenData = SNPData(header=header_phen,
                           strain_acc_list=strain_acc_list_phen,
                           data_matrix=data_matrix_phen)
        phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(
            snpData.row_id_ls, phenData.row_id_ls, phenData.data_matrix)
        phenData.phenotype_method_id_ls = get_phenotype_method_id_lsFromPhenData(
            phenData)  #2009-2-16

        self.phenotype_index_ls = PlotGroupOfSNPs.findOutWhichPhenotypeColumn(
            phenData, Set(self.phenotype_method_id_ls))

        if not self.phenotype_index_ls:
            self.phenotype_index_ls = range(len(phenData.col_id_ls))

        pdata = PassingData(gene_id_ls=gene_id_ls, gene_id2snps_id_ls=gene_id2snps_id_ls, \
            phenotype_index_ls=self.phenotype_index_ls, snp_info=snp_info)
        params_ls = self.generate_params(self.gene_id_fname, pdata,
                                         self.block_size)

        other_data = PassingData(gene_id2snps_id_ls=gene_id2snps_id_ls, gene_id_ls=pdata.gene_id_ls, phenData=phenData, \
              phenotype_index_ls=self.phenotype_index_ls, snp_info=snp_info)
        other_data_pickle = cPickle.dumps(other_data, -1)
        del other_data

        output_node_data = PassingData(phenotype_label_ls=phenData.col_id_ls, \
              phenotype_index_ls=self.phenotype_index_ls)
        output_node_data_pickle = cPickle.dumps(output_node_data, -1)

        snpData_pickle = cPickle.dumps(snpData, -1)
        del snpData, data_matrix
        return_data = PassingData(snpData_pickle=snpData_pickle, other_data_pickle=other_data_pickle,\
              output_node_data_pickle=output_node_data_pickle, params_ls=params_ls)
        return return_data
Example #3
    def addJobs(self, inputData=None, outputDirPrefix="", ntDatabaseFileList=None, noOfTotalSequences=None, \
      transferOutput=True, makeBlastDBJob=None):
        """
		2012.5.24
		"""

        sys.stderr.write("Adding blast jobs for %s input ... " %
                         (len(inputData.jobDataLs)))
        no_of_jobs = 0

        topOutputDir = "%sBlast" % (outputDirPrefix)
        topOutputDirJob = yh_pegasus.addMkDirJob(self,
                                                 mkdir=self.mkdirWrap,
                                                 outputDir=topOutputDir)
        no_of_jobs += 1

        allBlastResultFile = File(os.path.join(topOutputDir, 'blast.tsv'))
        allBlastMergeJob = self.addStatMergeJob(statMergeProgram=self.mergeSameHeaderTablesIntoOne, \
             outputF=allBlastResultFile, transferOutput=transferOutput, parentJobLs=[topOutputDirJob])
        no_of_jobs += 1

        ntDatabaseFile = ntDatabaseFileList[0]
        returnData = PassingData()
        returnData.jobDataLs = []

        for jobData in inputData.jobDataLs:
            inputF = jobData.output
            outputFnamePrefix = os.path.join(
                topOutputDir,
                os.path.splitext(os.path.basename(inputF.name))[0])

            splitFastaJob = self.addSplitFastaFileJob(executable=self.SplitFastaFile, inputFile=inputF, outputFnamePrefix=outputFnamePrefix, \
               noOfSequencesPerSplitFile=self.blockSize, filenameSuffix=".fasta", noOfTotalSequences=noOfTotalSequences,\
               parentJobLs=jobData.jobLs + [topOutputDirJob], extraDependentInputLs=[], transferOutput=False, \
               extraArguments=None, job_max_memory=500)
            no_of_jobs += 1
            for splitFastaOutput in splitFastaJob.outputList:
                outputFile = File('%s.tsv' % (splitFastaOutput.name))
                blastJob = self.addBlastWrapperJob(executable=self.BlastWrapper, inputFile=splitFastaOutput, outputFile=outputFile, \
                    outputFnamePrefix=splitFastaOutput.name , databaseFile=ntDatabaseFile,\
                    maxNoOfMismatches=self.maxNoOfMismatches, minNoOfIdentities=self.minNoOfIdentities, \
                    minIdentityPercentage=self.minIdentityPercentage, blastallPath=self.blastallPath, \
                    parentJobLs=[splitFastaJob, makeBlastDBJob], extraDependentInputLs=ntDatabaseFileList, transferOutput=False, \
                    extraArguments=None, job_max_memory=1000)

                #add output to some reduce job
                self.addInputToStatMergeJob(statMergeJob=allBlastMergeJob, \
                    inputF=blastJob.output, parentJobLs=[blastJob])
                no_of_jobs += 1
        sys.stderr.write("%s jobs. Done.\n" % (no_of_jobs))
        #register the merged blast result as the output of this set of jobs
        returnData.jobDataLs.append(PassingData(jobLs=[allBlastMergeJob], file=allBlastResultFile, \
                 fileLs=[allBlastResultFile]))
        return returnData
Example #4
	def run(self):
		self.communicator = MPI.world.duplicate()
		node_rank = self.communicator.rank
		free_computing_nodes = range(1, self.communicator.size-1)	#exclude the 1st and last node
		free_computing_node_set = Set(free_computing_nodes)
		output_node_rank = self.communicator.size-1
		
		db = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.db_user,
				   password=self.db_passwd, hostname=self.hostname, database=self.dbname, schema=self.schema)
		db.setup(create_tables=False)
		session = db.session
		session.begin()
		
		if node_rank == 0:
			snps_context_wrapper = self.dealWithSnpsContextWrapper(self.snps_context_picklef, self.min_distance, self.get_closest)
			if not self.results_id_ls:
				pdata = PassingData(call_method_id=self.call_method_id, analysis_method_id=self.analysis_method_id)
				self.results_id_ls = self.getResultsMethodIDLs(pdata)
			
			snps_context_wrapper_pickle = cPickle.dumps(snps_context_wrapper, -1)
			for node in free_computing_nodes:	#send it to the computing_node
				sys.stderr.write("passing initial data to nodes from %s to %s ... "%(node_rank, node))
				self.communicator.send(snps_context_wrapper_pickle, node, 0)
				sys.stderr.write(".\n")
			del snps_context_wrapper_pickle, snps_context_wrapper
		elif node_rank in free_computing_node_set:
			data, source, tag = self.communicator.receiveString(0, 0)
			snps_context_wrapper =  cPickle.loads(data)
			del data
		else:
			pass
		
		self.synchronize()
		if node_rank == 0:
			param_obj = PassingData(params_ls=self.results_id_ls, output_node_rank=output_node_rank, report=self.report, counter=0)
			self.input_node(param_obj, free_computing_nodes, input_handler=self.input_fetch_handler, message_size=self.message_size)
		elif node_rank in free_computing_node_set:
			param_data = PassingData(session=session)
			param_data.results_directory = self.input_db_directory
			param_data.default_output_db_directory = self.default_output_db_directory
			param_data.output_db_directory = self.output_db_directory
			param_data.commit = self.commit
			param_data.min_MAF = self.min_MAF
			param_data.min_distance = self.min_distance
			param_data.get_closest = self.get_closest
			param_data.snps_context_wrapper = snps_context_wrapper
			self.computing_node(param_data, self.computing_node_handler)
		else:
			param_obj = PassingData()
			self.output_node(free_computing_nodes, param_obj, self.output_node_handler)
		self.synchronize()	#to prevent some nodes from exiting early
Example #5
	def run(self):
		"""
		2008-10-28
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		db = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.db_user,
				   password=self.db_passwd, hostname=self.hostname, database=self.dbname, schema=self.schema)
		db.setup(create_tables=False)
		session = db.session
		db_id2chr_pos = db.snp_id2chr_pos
		hist_type = CheckCandidateGeneRank.getHistType(self.call_method_id, self.min_distance, self.get_closest, self.min_MAF, \
									self.allow_two_sample_overlapping, self.results_type, self.null_distribution_type_id)
		
		snps_context_wrapper = self.dealWithSnpsContextWrapper(self.snps_context_picklef, self.min_distance, self.get_closest)
		
		param_obj = PassingData(call_method_id=self.call_method_id, \
								analysis_method_id=getattr(self, 'analysis_method_id', None),\
								analysis_method_id_ls=getattr(self, 'analysis_method_id_ls', None),\
								phenotype_method_id_ls=getattr(self, 'phenotype_method_id_ls', None),\
								list_type_id_ls=self.list_type_id_ls, \
								results_type=self.results_type,\
								no_check_gene_list=True)
		params_ls = self.generate_params(param_obj)
		
		pd = PassingData(snps_context_wrapper=snps_context_wrapper, \
						results_directory=self.results_directory, \
						min_MAF=self.min_MAF,
						get_closest=self.get_closest,
						min_distance=self.min_distance,\
						no_of_top_snps=self.no_of_top_snps,
						min_sample_size=self.min_sample_size,
						test_type_id=self.test_type_id, \
						results_type=self.results_type,
						no_of_permutations=self.no_of_permutations,\
						no_of_min_breaks=self.no_of_min_breaks,
						type=hist_type,\
						null_distribution_type_id=self.null_distribution_type_id,\
						allow_two_sample_overlapping=self.allow_two_sample_overlapping,
						min_score=self.min_score,
						session=session,\
						commit=self.commit,\
						db_id2chr_pos = db_id2chr_pos)
		
		for results_id, list_type_id in params_ls:
			pd.list_type_id = list_type_id
			pd.results_id = results_id
			self.pick_candidate_genes(pd)
Example #6
    def run(self):
        """
		2008-07-17
		"""
        if self.debug:
            import pdb
            pdb.set_trace()
        db = Stock_250kDB.Stock_250kDB(drivername=self.drivername,
                                       username=self.db_user,
                                       password=self.db_passwd,
                                       hostname=self.hostname,
                                       database=self.dbname,
                                       schema=self.schema)
        db.setup(create_tables=False)
        session = db.session
        session.begin()
        snps_context_wrapper = self.dealWithSnpsContextWrapper(
            self.snps_context_picklef, self.min_distance, self.get_closest)

        param_data = PassingData()
        param_data.results_directory = self.input_db_directory
        param_data.default_output_db_directory = self.default_output_db_directory
        param_data.output_db_directory = self.output_db_directory
        param_data.commit = self.commit
        param_data.min_MAF = self.min_MAF
        param_data.min_distance = self.min_distance
        param_data.get_closest = self.get_closest
        param_data.snps_context_wrapper = snps_context_wrapper

        if not self.results_id_ls:
            pdata = PassingData(call_method_id=self.call_method_id, analysis_method_id=self.analysis_method_id, \
                phenotype_method_id_ls=self.phenotype_method_id_ls)
            self.results_id_ls = self.getResultsMethodIDLs(pdata)

        for results_method_id in self.results_id_ls:
            rm = Stock_250kDB.ResultsMethod.get(results_method_id)
            if not rm:
                sys.stderr.write(
                    "No results method available for results_method_id=%s.\n" %
                    results_method_id)
                continue
            self.saveResultsByGene(session, rm, param_data)

        if self.commit:
            session.commit()
            session.clear()
        else:
            session.rollback()
Example #7
    def merge_call_on_one_row(cls,
                              ecotypeid_duplicate_index_ls,
                              data_matrix,
                              no_of_cols,
                              NA_set=Set([0, -2])):
        """
		2008-07-11
			calculate the inconsistency ratio among duplicates
		2008-05-12
			-2 is also ruled out, add NA_set
		"""
        one_row = numpy.zeros(no_of_cols)
        passingdata = PassingData()
        passingdata.no_of_non_NA_pairs = 0
        passingdata.no_of_non_NA_inconsistent_pairs = 0
        for i in range(no_of_cols):
            call_counter_ls = [0] * 11
            non_NA_call_number_set = Set()
            for index in ecotypeid_duplicate_index_ls:
                call_number = data_matrix[index][i]
                if call_number not in NA_set:  #don't need NA or the untouched bit
                    call_counter_ls[call_number] += 1
                    non_NA_call_number_set.add(call_number)
            if len(non_NA_call_number_set) > 0:
                passingdata.no_of_non_NA_pairs += 1
                if len(non_NA_call_number_set) > 1:
                    passingdata.no_of_non_NA_inconsistent_pairs += 1
            one_row[i] = dbSNP2data.get_majority_call_number(call_counter_ls)
        passingdata.one_row = one_row
        return passingdata
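
A small hedged illustration of the bookkeeping above (made-up values; 0 and -2 are the NA codes, matching the NA_set default):

# two duplicate arrays of one ecotype sit at rows 0 and 1; three SNP columns
data_matrix = [[1, 1, 0],
               [1, 2, 0]]
ecotypeid_duplicate_index_ls = [0, 1]
# column 0: non-NA calls {1}    -> counted in no_of_non_NA_pairs, consistent
# column 1: non-NA calls {1, 2} -> counted, and also inconsistent
# column 2: both calls are NA   -> skipped
# merge_call_on_one_row() would therefore report no_of_non_NA_pairs == 2 and
# no_of_non_NA_inconsistent_pairs == 1 (an inconsistency ratio of 1/2), with
# passingdata.one_row holding the majority call of each column
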
Example #8
    def summarize_NA_mismatch_ls(self, NA_mismatch_ls_ls,
                                 avg_var_name_pair_ls):
        """
		05/12/2008
			called by output_node_handler()
			calculate average NA_rate, mismatch_rate, relative_NA_rate from NA_mismatch_ls_ls
		"""
        passingdata = PassingData()
        for avg_var_name_pair in avg_var_name_pair_ls:
            ls_var_name, avg_var_name, std_var_name = avg_var_name_pair
            setattr(passingdata, ls_var_name, [])
            setattr(passingdata, avg_var_name, -1)
            setattr(passingdata, std_var_name, -1)
        for i in range(len(NA_mismatch_ls_ls)):
            NA_mismatch_ls = NA_mismatch_ls_ls[i]

            NA_rate, mismatch_rate, no_of_NAs, no_of_totals, \
            no_of_mismatches, no_of_non_NA_pairs, \
            relative_NA_rate, relative_no_of_NAs, relative_no_of_totals = NA_mismatch_ls

            if NA_rate != -1 and mismatch_rate != -1 and relative_NA_rate != -1:  #keep only rows where all three rates are valid (not -1)
                passingdata.NA_rate_ls.append(NA_rate)
                passingdata.mismatch_rate_ls.append(mismatch_rate)
                passingdata.relative_NA_rate_ls.append(relative_NA_rate)
        for avg_var_name_pair in avg_var_name_pair_ls:
            ls_var_name, avg_var_name, std_var_name = avg_var_name_pair
            this_ls = getattr(passingdata, ls_var_name)
            sample_size = len(this_ls)
            setattr(passingdata, 'sample_size', sample_size)
            if sample_size > 0:
                setattr(passingdata, avg_var_name, numpy.average(this_ls))
            if sample_size > 1:
                setattr(passingdata, std_var_name, numpy.std(this_ls))
        return passingdata
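
A hedged sketch of the expected inputs: the first name in each triple must match an attribute the loop above appends to, and the numbers are made up.

avg_var_name_pair_ls = [
    ('NA_rate_ls', 'avg_NA_rate', 'std_NA_rate'),
    ('mismatch_rate_ls', 'avg_mismatch_rate', 'std_mismatch_rate'),
    ('relative_NA_rate_ls', 'avg_relative_NA_rate', 'std_relative_NA_rate'),
]
# each inner list: NA_rate, mismatch_rate, no_of_NAs, no_of_totals, no_of_mismatches,
#                  no_of_non_NA_pairs, relative_NA_rate, relative_no_of_NAs, relative_no_of_totals
NA_mismatch_ls_ls = [
    [0.10, 0.02, 10, 100, 2, 90, 0.05, 5, 100],
    [0.20, 0.04, 20, 100, 4, 80, 0.15, 15, 100],
]
# self.summarize_NA_mismatch_ls(NA_mismatch_ls_ls, avg_var_name_pair_ls) would return a
# PassingData with sample_size == 2, avg_NA_rate == 0.15 and avg_mismatch_rate == 0.03
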
Example #9
    def testAllPlateIDinPlateSet(self, plate_id_ls, plate_id2plate_set):
        """
		2008-09-12
		"""
        plate_set = None
        all_plate_id_in_plate_set = 1  #flag: whether every plate id already belongs to a previous plate set
        for plate_id in plate_id_ls:
            if plate_id != 0:
                if plate_id not in plate_id2plate_set:
                    all_plate_id_in_plate_set = 0
                    break
                else:
                    if plate_set is None:
                        plate_set = plate_id2plate_set[plate_id]
                    elif plate_id2plate_set[plate_id] != plate_set:
                        sys.stderr.write(
                            "This plate_id_ls, %s, has >1 plate_sets: %s, %s.\n"
                            % (repr(plate_id_ls), plate_set,
                               plate_id2plate_set[plate_id]))
                        all_plate_id_in_plate_set = 0
                        break
        return_data = PassingData()
        return_data.all_plate_id_in_plate_set = all_plate_id_in_plate_set
        return_data.plate_set = plate_set
        return return_data
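
A hedged usage sketch with toy plate data (all values are illustrative only):

plate_id2plate_set = {101: 1, 102: 1, 201: 2}
# testAllPlateIDinPlateSet([101, 102, 0], plate_id2plate_set)
#   -> all_plate_id_in_plate_set == 1, plate_set == 1 (plate id 0 is skipped)
# testAllPlateIDinPlateSet([101, 201], plate_id2plate_set)
#   -> all_plate_id_in_plate_set == 0, because the plates span two different plate sets
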
Example #10
	def findSNPsInRegion(self, snp_info, chromosome, start, stop, center_snp_position=None):
		"""
		2008-10-1
			called by plotSNPRegion()
			find SNPs in this region, if center_snp_position is not given, find one.
			similar to getSNPsAroundThisSNP()
		"""
		if self.report:
			sys.stderr.write("Get SNPs in this region ...")
		from DrawSNPRegion import SNPPassingData
		chr_pos_ls = []
		chr_pos2adjacent_window = {}
		j = 0
		midpoint = (start+stop)/2.
		if center_snp_position is None:
			_center_snp_position = start
		else:
			_center_snp_position = center_snp_position
		center_snp = SNPPassingData(chromosome=chromosome, position=_center_snp_position, snps_id=None)
		for i in range(start-1, stop+2):
			new_pos = i
			new_chr_pos = (chromosome, new_pos)
			if new_chr_pos in snp_info.chr_pos2index:
				if center_snp_position is None and abs(new_pos-midpoint)<abs(center_snp.position-midpoint):	#this SNP is closer to the center
					center_snp.position = new_pos
				chr_pos_ls.append(new_chr_pos)
				if j!=0:
					#add_mid_point(chr_pos_ls, chr_pos2adjacent_window)
					pass
				j += 1
		center_snp.snps_id = '%s_%s'%(center_snp.chromosome, center_snp.position)
		snp_region = PassingData(chr_pos_ls=chr_pos_ls, chr_pos2adjacent_window=chr_pos2adjacent_window, center_snp=center_snp)
		if self.report:
			sys.stderr.write("Done.\n")
		return snp_region
Example #11
    def mapEachInterval(self, workflow=None, inputJobData=None, selectIntervalJobData=None, \
        chromosome=None,intervalData=None,\
        mapEachChromosomeData=None, \
        passingData=None, transferOutput=False, **keywords):
        """
		2013.04.08 use inputJobData
		2012.10.3
			#. extract flanking sequences from the input Input (ref sequence file => contig ref sequence)
			#. blast them
			#. run FindSNPPositionOnNewRefFromFlankingBlastOutput.py
				#. where the hit length matches the query length and the no. of mismatches is <=2 => good => infer new coordinates
			#. output a mapping file between old SNP and new SNP coordinates.
				#. reduce this thing by combining everything
			#. make a new Input file based on the input split Input file
				(replace the contig ID and position with the new ones; remove or replace the header part regarding chromosomes)

		"""
        returnData = PassingData(no_of_jobs=0)
        returnData.jobDataLs = []
        #passingData.intervalFileBasenamePrefix
        #passingData.splitInputFile
        #passingData.unitNumber
        """
		## 2013.06.19 structures available from passingData, specific to the interval
		passingData.splitInputFile = splitInputFile
		passingData.unitNumber = unitNumber
		passingData.intervalFileBasenamePrefix = '%s_%s_splitInput_u%s'%(chromosome, commonPrefix, unitNumber)
		passingData.noOfIndividuals = jobData.file.noOfIndividuals
		passingData.span = self.intervalSize + self.intervalOverlapSize*2 	#2013.06.19 for memory/walltime gauging
		"""
        return returnData
Example #12
	def readInSNPID2GenotypeVectorLs(self, inputFname=None, returnType=1):
		"""
		returnType
			1: snp_pos2returnData is snp_pos2genotypeVectorLs
			2: snp_pos2returnData maps each SNP position to its occurrence count
		2013.07.19 bugfix
		2013.07.11
		"""
		sys.stderr.write("Finding SNPs that have same positions from %s ..."%(inputFname))
		
		reader = VCFFile(inputFname=inputFname)
		counter = 0
		real_counter = 0
		snp_pos2returnData = {}
		for vcfRecord in reader:
			key = (vcfRecord.chromosome, vcfRecord.position)
			if key not in snp_pos2returnData:
				if returnType==1:
					snp_pos2returnData[key] = []
				else:
					snp_pos2returnData[key] = 0
			else:
				real_counter += 1
			
			if returnType==1:
				snp_pos2returnData[key].append(vcfRecord.data_row[1:])	#[0] is reference
			else:
				snp_pos2returnData[key] += 1
			
			counter += 1
		reader.close()
		sys.stderr.write("%s snp coordinates from %s vcf records. %s entries with same-positions.\n"%\
						(len(snp_pos2returnData), counter, real_counter))
		return PassingData(snp_pos2returnData=snp_pos2returnData)
Example #13
	def avgKey2DataLs(self, key2dataLs, no_of_key_columns=1, header=[]):
		"""
		2012.1.9
			1. take mean/median/stdev of every cell in dataLs,
			2. modify newHeader to reflect that
		"""
		sys.stderr.write("Averaging key2dataLs (%s entries ) ..."%(len(key2dataLs)))
		newKey2DataLs = {}
		newHeader = []
		keyColHeader = header[:no_of_key_columns]
		valueColHeader = header[no_of_key_columns:]
		newValueColHeader = []
		no_of_value_columns = len(valueColHeader)
		for i in xrange(no_of_value_columns):
			valueColName = valueColHeader[i]
			newValueColHeader += ['mean_%s'%(valueColName), 'median_%s'%(valueColName), 'stdev_%s'%(valueColName)]
		
		for key, dataLs in key2dataLs.iteritems():
			if key not in newKey2DataLs:
				newKey2DataLs[key] = []
			no_of_value_columns = len(dataLs)
			for i in xrange(no_of_value_columns):
				meanValue = numpy.mean(dataLs[i])
				medianValue = numpy.median(dataLs[i])
				stdev = numpy.std(dataLs[i])
				newKey2DataLs[key] += [meanValue, medianValue, stdev]
		sys.stderr.write("Done.\n")
		return PassingData(key2dataLs= newKey2DataLs, header=keyColHeader + newValueColHeader)
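
A hedged sketch of the input and output shapes (toy key and column names):

# key2dataLs maps a key tuple to one list of observed values per value column
key2dataLs = {('Contig1',): [[1.0, 2.0, 3.0], [0.1, 0.2, 0.3]]}
header = ['chromosome', 'depth', 'error_rate']
# avgKey2DataLs(key2dataLs, no_of_key_columns=1, header=header) would return a PassingData with
#   header == ['chromosome', 'mean_depth', 'median_depth', 'stdev_depth',
#              'mean_error_rate', 'median_error_rate', 'stdev_error_rate']
#   key2dataLs[('Contig1',)] == [2.0, 2.0, std([1,2,3]), 0.2, 0.2, std([.1,.2,.3])]
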
Example #14
    def getDataStructureFromSNPsD(self, snpsd):
        """
		05/07/08
		"""
        sys.stderr.write("Reading data ...")
        no_of_rows = len(snpsd.positions)
        no_of_cols = len(snpsd.accessions)
        snps = []
        nucs = []
        for i in range(no_of_rows):
            one_snp_ls, symbol2counts = self.get_symbol2counts(
                snpsd.snps, fixed_index=i, no_of_rolls=no_of_cols, by_row=0)

            passingdata = self.get_symbol2MAJ_MIN(symbol2counts)
            if passingdata.symbol2MAJ_MIN == 3:
                sys.stderr.write(
                    "Error: SNP %s (%s) has more than 2 alleles: %s.\n" %
                    (i, snpsd.positions[i], repr(symbol2counts)))
                sys.exit(2)

            map_func = lambda x: passingdata.symbol2MAJ_MIN[x]
            one_snp_ls = map(map_func, one_snp_ls)

            snps.append(''.join(one_snp_ls))
            nucs += [(passingdata.major, passingdata.minor)]
        passingdata = PassingData()
        passingdata.snps = array(snps)
        passingdata.sdps = set(snps)
        passingdata.nucs = array(nucs)
        passingdata.numSamps = no_of_cols
        sys.stderr.write("Done.\n")
        return passingdata.snps, passingdata.sdps, passingdata.nucs, passingdata.numSamps
Example #15
 def get_symbol2MAJ_MIN(self, symbol2counts):
     #construct a dictionary to map input symbols to MAJ, MIN or '?'
     symbol2MAJ_MIN = {self.input_NA_char: '?'}  #'NA' is always '?'
     symbols = symbol2counts.keys()
     if len(symbols) == 0:
         major = ''
         minor = ''
     elif len(symbols) == 1:
         symbol2MAJ_MIN[symbols[0]] = MAJ
         major = symbols[0]
         minor = ''
     elif len(symbols) == 2:
         major, minor = symbols
         if symbol2counts[major] < symbol2counts[minor]:
             minor, major = symbols  #reverse them
         symbol2MAJ_MIN[major] = MAJ
         symbol2MAJ_MIN[minor] = MIN
     elif len(symbols) > 2:
         major, minor = None, None
         symbol2MAJ_MIN = 3
     passingdata = PassingData()
     passingdata.symbol2MAJ_MIN = symbol2MAJ_MIN
     passingdata.major = major
     passingdata.minor = minor
     return passingdata
Example #16
    def reduceEachChromosome(self, workflow=None, chromosome=None, passingData=None, mapEachInputDataLs=None,
         chromosome2mapEachIntervalDataLs=None,\
         reduceEachInputDataLs=None,\
         transferOutput=True, \
         **keywords):
        """
		"""
        returnData = PassingData(no_of_jobs=0)
        returnData.jobDataLs = []
        returnData.mapEachInputDataLs = mapEachInputDataLs
        returnData.reduceEachInputDataLs = reduceEachInputDataLs
        #reduce matrix by chosen column and average p-value

        outputFile = File(
            os.path.join(
                self.reduceEachChromosomeDirJob.output,
                'chr_%s_LocusLiftOverProbability.tsv.gz' % (chromosome)))
        reduceChromosomeJob = self.addStatMergeJob(statMergeProgram=self.mergeSameHeaderTablesIntoOne, \
               outputF=outputFile, \
               parentJobLs=[self.reduceEachChromosomeDirJob],extraOutputLs=None, \
               extraDependentInputLs=None, transferOutput=False)
        #extraArgumentList=['--keyColumnLs 0-6 --valueColumnLs 7'],\
        mapEachIntervalDataLs = chromosome2mapEachIntervalDataLs.get(
            chromosome)
        for mapEachIntervalData in mapEachIntervalDataLs:
            for jobData in mapEachIntervalData.jobDataLs:
                self.addInputToStatMergeJob(statMergeJob=reduceChromosomeJob,
                                            parentJobLs=[jobData.job])

        #add the reduction job to final stat merge job
        self.addInputToStatMergeJob(statMergeJob=self.reduceJob,
                                    parentJobLs=[reduceChromosomeJob])

        return returnData
Example #17
	def returnLocusLowMapQualityIndicator(self, alignedReadLs=None, minMapQGoodRead=2, minFractionOfGoodRead=0.9):
		"""
		2013.12.04
		"""
		totalNoOfReads = 0
		noOfGoodReads = 0.0
		medianMapQ=-10
		mapQList=[]
		for alignedRead in alignedReadLs:
			totalNoOfReads +=1
			mapQList.append(alignedRead.mapq)
			if alignedRead.mapq>=minMapQGoodRead:
				noOfGoodReads += 1
			else:
				pass
		if totalNoOfReads>0:
			fractionOfGoodRead = noOfGoodReads/(totalNoOfReads)
			medianMapQ = numpy.median(mapQList)
		else:
			fractionOfGoodRead = -1
			medianMapQ = -10
			
		if fractionOfGoodRead>=minFractionOfGoodRead:
			locusLowMapQIndicator = 0
		else:
			locusLowMapQIndicator = 2
		return PassingData(locusLowMapQIndicator=locusLowMapQIndicator, totalNoOfReads=totalNoOfReads, \
						noOfGoodReads=noOfGoodReads, fractionOfGoodRead=fractionOfGoodRead,\
						medianMapQ=medianMapQ)
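
A hedged usage sketch; Read is a stand-in for a pysam-style aligned read with a mapq attribute, and the numbers are made up:

import collections
Read = collections.namedtuple('Read', ['mapq'])
reads = [Read(0), Read(20), Read(30), Read(45)]
# returnLocusLowMapQualityIndicator(alignedReadLs=reads, minMapQGoodRead=2,
#                                   minFractionOfGoodRead=0.9) would return a PassingData with
#   totalNoOfReads == 4, noOfGoodReads == 3.0, fractionOfGoodRead == 0.75,
#   medianMapQ == 25.0 and locusLowMapQIndicator == 2 (since 0.75 < 0.9)
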
Example #18
    def organizeProbesIntoChromosome(cls, xy_ls, chr_pos_ls, probes_id_ls):
        """
		2010-4-29
			add chr_pos2index to map (chr, pos) to its index in chr_pos_ls
		2009-11-24
			split out of calculateProbeQuartilePerChromosome()
			xy_ls, chr_pos_ls, probes_id_ls are already in chromosomal order.
		"""
        sys.stderr.write("Getting probes into each chromosome ...")
        chr2xy_ls = {}
        chr2probe_id_ls = {}
        chr_pos2index = {}  # 2010-4-29. map (chr, pos) to its index in chr_pos_ls
        for i in range(len(xy_ls)):
            chr, pos = chr_pos_ls[i]
            if chr not in chr2xy_ls:
                chr2xy_ls[chr] = []
                chr2probe_id_ls[chr] = []  #initialize the probe id list for this chromosome
            chr2xy_ls[chr].append(xy_ls[i])
            chr2probe_id_ls[chr].append(probes_id_ls[i])
            chr_pos2index[(chr, pos)] = i
        sys.stderr.write("Done.\n")
        return PassingData(chr2xy_ls=chr2xy_ls,
                           chr2probe_id_ls=chr2probe_id_ls,
                           chr_pos2index=chr_pos2index)
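
A hedged sketch of the data shapes involved (toy values; the three lists must already be in chromosomal order and of equal length):

xy_ls = [(1, 10), (3, 12), (7, 4)]          # e.g. array coordinates of each probe
chr_pos_ls = [(1, 100), (1, 250), (2, 80)]  # (chromosome, position)
probes_id_ls = [9001, 9002, 9003]
# organizeProbesIntoChromosome(xy_ls, chr_pos_ls, probes_id_ls) would return a PassingData with
#   chr2xy_ls == {1: [(1, 10), (3, 12)], 2: [(7, 4)]}
#   chr2probe_id_ls == {1: [9001, 9002], 2: [9003]}
#   chr_pos2index == {(1, 100): 0, (1, 250): 1, (2, 80): 2}
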
Example #19
    def getScoreRankFromPermIndexLs(self, genome_wide_result,
                                    candidate_gene_snp_index_ls,
                                    non_candidate_gene_snp_index_ls):
        """
		2008-10-21
		"""
        sys.stderr.write("Getting Score rank data given index ls...")
        candidate_score_ls = []
        non_candidate_score_ls = []
        candidate_rank_ls = []
        non_candidate_rank_ls = []
        for index in candidate_gene_snp_index_ls:
            if genome_wide_result.data_obj_ls[index]:
                candidate_score_ls.append(
                    genome_wide_result.data_obj_ls[index].value)
        no_of_candidate_scores = len(candidate_score_ls)

        for index in non_candidate_gene_snp_index_ls:
            if genome_wide_result.data_obj_ls[index]:
                non_candidate_score_ls.append(
                    genome_wide_result.data_obj_ls[index].value)

        total_score_ls = candidate_score_ls + non_candidate_score_ls
        import rpy
        rank_ls = rpy.r.rank(total_score_ls)
        candidate_rank_ls = rank_ls[:no_of_candidate_scores]
        non_candidate_rank_ls = rank_ls[no_of_candidate_scores:]

        score_rank_data = PassingData(candidate_score_ls=candidate_score_ls, candidate_rank_ls=candidate_rank_ls,\
              non_candidate_score_ls=non_candidate_score_ls, non_candidate_rank_ls=non_candidate_rank_ls)
        sys.stderr.write("Done.\n")
        return score_rank_data
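
The ranking above goes through rpy and R's rank(). A hedged alternative sketch using scipy (not what the original code does) gives the same average-rank treatment of ties:

from scipy.stats import rankdata

total_score_ls = [2.5, 0.1, 3.7, 0.1]
rank_ls = rankdata(total_score_ls)   # array([3. , 1.5, 4. , 1.5]); ties share their average rank
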
Example #20
    def openWriteBeagleFiles(self,
                             pedigreeFamilyData=None,
                             outputFnamePrefix=None):
        """
		2013.05.02
			
		The non-likelihood (unphased, trios, pairs) Beagle format:
			I id sample1 sample1 sample2 sample2
			A diabetes 1 1 2 2
			M rs12082861 C C C C
			M rs4912233 T C C C
			M rs12732823 G A A A
			M rs17451521 C C C C
			M rs12033358 C T T T
		
		The likelihood version is
			marker alleleA alleleB 1000_709_1996093_GA_vs_524 1000_709_1996093_GA_vs_524 1000_709_1996093_GA_vs_524 1001_710_1995025_GA_vs_524 1001_710_1995025_GA_vs_524 1001_710_1995025_GA_vs_524 1002_711_2001039_GA_vs_524
			Contig791:1086 C A 0.9693 0.0307 0.0000 0.6660 0.3338 0.0003 0.0000
			Contig791:1649 G C 0.9406 0.0594 0.0000 0.9693 0.0307 0.0000 0.0000
			Contig791:4084 A C 0.9980 0.0020 0.0000 0.9844 0.0156 0.0000 0.0000
		
		The markers file has this format (markerID, position, alleleA, alleleB)
			Contig791:1086 1086 C A
		"""
        sys.stderr.write(
            "Opening beagle files (outputFnamePrefix =%s) to write ..." %
            (outputFnamePrefix))
        familySize2BeagleFileHandler = {}
        familySize2SampleIDList = pedigreeFamilyData.familySize2SampleIDList
        counter = 0
        for familySize, sampleIDList in familySize2SampleIDList.iteritems():
            if familySize not in familySize2BeagleFileHandler:
                tmpOutputFnamePrefix = '%s_familySize%s' % (outputFnamePrefix,
                                                            familySize)
                writer = MatrixFile(inputFname='%s.bgl' %
                                    (tmpOutputFnamePrefix),
                                    openMode='w',
                                    delimiter=' ')
                familySize2BeagleFileHandler[familySize] = writer
                if familySize == 1:
                    headerRow = ['marker', 'alleleA', 'alleleB']
                else:
                    headerRow = ['I', 'id']
                for sampleID in sampleIDList:
                    if familySize == 1:  #likelihood format has sample name replicated three times, rather than 2 times
                        headerRow.extend([sampleID] * 3)
                    else:
                        headerRow.extend([sampleID] * 2)
                writer.writeHeader(headerRow)
                counter += 1
        markersFile = MatrixFile(inputFname='%s.markers' % (outputFnamePrefix),
                                 openMode='w',
                                 delimiter=' ')

        counter += 1
        sys.stderr.write("%s files outputted.\n" % (counter))

        return PassingData(
            familySize2BeagleFileHandler=familySize2BeagleFileHandler,
            markersFile=markersFile)
Example #21
def estimateMeanStdFromData(dataVector=None, excludeTopFraction=0.2):
    """
	2012.10.14
		adapted from vervet/src/pedigree/DetectWrongLabelByCompKinshipVsIBD.DetectWrongLabelByCompKinshipVsIBD.estimateAbsDeltaMeanStd()
	2012.8.22
	"""
    sys.stderr.write("Estimating mean&std using the middle %.1f%% of data (n=%s) ..."%\
        ((1-excludeTopFraction)*100, len(dataVector)))
    noOfRows = len(dataVector)
    import numpy
    # 2012.8.22 draw some histogram to check what data looks like
    #		if len(dataVector)>10:
    #			outputFname = '%s_kinship_ibd_hist.png'%(self.outputFnamePrefix)
    #			yh_matplotlib.drawHist(dataVector, title='', \
    #							xlabel_1D="kinship-ibd", xticks=None, \
    #							outputFname=outputFname, min_no_of_data_points=10, \
    #							needLog=True, \
    #							dpi=200, min_no_of_bins=25)
    #dataVector = map(abs, dataVector)	#2012.8.23 no abs
    dataVector.sort()
    startIndex = max(0, int(len(dataVector) * (excludeTopFraction / 2)) - 1)  #max(), not min(), so the lower tail is actually excluded
    stopIndex = int(len(dataVector) * (1 - excludeTopFraction / 2))
    dataVector = dataVector[startIndex:stopIndex]

    data_mean = numpy.mean(dataVector)
    data_std = numpy.std(dataVector)

    sys.stderr.write(" mean=%.3f, std=%.3f.\n" % (data_mean, data_std))
    return PassingData(mean=data_mean, std=data_std)
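
A hedged usage sketch, assuming the function above plus its PassingData and sys imports are in scope:

import numpy
data = list(numpy.random.normal(0, 1, 1000)) + [50.0, 60.0]   # two extreme outliers
stats = estimateMeanStdFromData(dataVector=data, excludeTopFraction=0.2)
# stats.mean and stats.std are computed from roughly the central 80% of the sorted values,
# so the two outliers barely move either estimate
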
Example #22
    def get_strain_id_info(self, QC_method_id, ignore_strains_with_qc=True):
        """
		2008-08-18
			to generate data structure related to strain_id, preparation to get data_matrix
			strainid not QCed yet
			link to tg_ecotypeid
		"""
        sys.stderr.write("Getting strain_id info  ... ")
        strain_id2index = {}
        strain_id_list = []
        strain_id2acc = {}
        strain_id2category = {}

        rows = StockDB.Strain.query.all()
        for row in rows:
            if ignore_strains_with_qc:
                ignore_this = 0
                for call_qc in row.call_qc_ls:
                    if call_qc.qc_method_id == QC_method_id:  #QC already done
                        ignore_this = 1
                        break
                if ignore_this:
                    continue
            strain_id = row.id
            strain_index = len(strain_id_list)
            strain_id_list.append(strain_id)
            strain_id2index[strain_id] = strain_index
            strain_id2acc[
                strain_id] = row.ecotypeid_strainid2tg_ecotypeid.tg_ecotypeid
            strain_id2category[strain_id] = strain_id
        passingdata = PassingData(strain_id2index=strain_id2index, strain_id_list=strain_id_list, strain_id2acc=strain_id2acc,\
              strain_id2category=strain_id2category)
        sys.stderr.write("%s strains. Done.\n" % (len(strain_id_list)))
        return passingdata
Example #23
	def computing_node_handler(self, communicator, data, param_obj):
		"""
		2009-9-16
			parameter test_type is renamed to test_type_id
		2008-08-20
			wrap all parameters into pd and pass it to run_wilcox_test
		2008-07-17
		
		"""
		node_rank = communicator.rank
		sys.stderr.write("Node no.%s working...\n"%node_rank)
		data = cPickle.loads(data)
		result_ls = []
		pd = PassingData(snps_context_wrapper=param_obj.snps_context_wrapper,\
							results_directory=param_obj.results_directory,\
							min_MAF=param_obj.min_MAF, get_closest=self.get_closest, min_distance=self.min_distance, \
							min_sample_size=self.min_sample_size, test_type_id=self.test_type_id, \
							results_type=self.results_type, no_of_permutations=self.no_of_permutations,\
							no_of_min_breaks=self.no_of_min_breaks)
		for results_method_id, list_type_id in data:
			pd.results_id = results_method_id
			pd.list_type_id = list_type_id
			result = self.run_wilcox_test(pd)
			if result is not None:
				result_ls.append(result)
		sys.stderr.write("Node no.%s done with %s results.\n"%(node_rank, len(result_ls)))
		return result_ls
Example #24
    def getGenomeWideResult(self, call_method_id, phenotype_method_id,
                            analysis_method_id):
        rows = Stock_250kDB.ResultsMethod.query.filter_by(call_method_id=call_method_id).filter_by(analysis_method_id=analysis_method_id).\
           filter_by(phenotype_method_id=phenotype_method_id).filter_by(results_method_type_id=1)

        pdata = PassingData()
        if rows.count() == 1:
            rm = rows.first()
        elif rows.count() == 0:
            sys.stderr.write("No result fetched from db based on call_method_id=%s, analysis_method_id=%s, phenotype_method_id=%s.\n"%\
                (call_method_id, analysis_method_id, phenotype_method_id))
            rm = None
        else:
            sys.stderr.write("First result out of %s results fetched from db based on call_method_id=%s, analysis_method_id=%s, phenotype_method_id=%s.\n"%\
                (rows.count(), call_method_id, analysis_method_id, phenotype_method_id))
            rm = rows.first()
        if rm:
            input_fname = rm.filename
            pdata.gwr_name = '%s_%s_%s' % (rm.analysis_method.short_name,
                                           rm.phenotype_method_id,
                                           rm.phenotype_method.short_name)
        else:
            return

        genome_wide_result = getGenomeWideResultFromFile(
            input_fname,
            min_value_cutoff=None,
            do_log10_transformation=True,
            pdata=pdata)
        return genome_wide_result
Example #25
    def get_no_of_top_snps_info(cls, db, from_where_clause):
        """
		2008-11-04
			#there's a chance it occurs twice due to float difference in min_score
		2008-10-23
		"""
        sys.stderr.write("Getting no_of_top_snps_info ...")
        rows = db.metadata.bind.execute(
            "select distinct t.no_of_top_snps, t.min_score %s order by no_of_top_snps"
            % from_where_clause)
        id_ls = []
        id2index = {}
        label_ls = []
        no_of_separators = 0
        for row in rows:
            if row.no_of_top_snps not in id2index:  #there's a chance it occurs twice due to float difference in min_score
                id2index[row.no_of_top_snps] = len(id_ls)
                id_ls.append(row.no_of_top_snps)
                label_ls.append('%s %s' % (row.no_of_top_snps, row.min_score))
        list_info = PassingData()
        list_info.id2index = id2index
        list_info.id_ls = id_ls
        list_info.label_ls = label_ls
        sys.stderr.write("Done.\n")
        return list_info
Example #26
	def run(self):
		if self.debug:
			import pdb
			pdb.set_trace()
		
		db = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.db_user,
				  			password=self.db_passwd, hostname=self.hostname, database=self.dbname, 
				   			schema=self.schema)
		db.setup(create_tables=False)
		session = db.session
		
		array_id2median_intensity = self.get_array_id2median_intensity(min_array_median_intensity=self.min_array_median_intensity)
		arrays_to_form_model = self.getModelArrays(db, self.training_cnv_method_id, array_id2median_intensity)
		if self.debug:	# 2010-7-25 for debug, temporary
			arrays_to_form_model = arrays_to_form_model[:4]
		
		array_id2model = self.constructSVMModels(db, arrays_to_form_model, array_id2median_intensity,\
						minPercUnCoveredByLerContig=self.minPercUnCoveredByLerContig, cnv_method_id=self.training_cnv_method_id,\
						C=self.SVM_C, gamma=self.SVM_gamma, eps=self.SVM_eps, deletedFractionType=self.deletedFractionType)
		
		array_id2model_array_id_ls = self.mapAnyArray2ModelArray(array_id2median_intensity, array_id2model, \
															max_median_intensity_dist=self.max_median_intensity_dist,\
															minNoOfModelArrays=self.minNoOfModelArrays)
		param_obj = PassingData(session=session, no_of_total=0, no_of_into_db=0, report=self.report,\
							cnv_method_id=self.cnv_method_id, cnv_type_id=self.cnv_type_id)
		
		self.predictALLSegments(self.input_fname, array_id2model_array_id_ls, array_id2model,\
						max_amplitude=self.max_amplitude, param_obj=param_obj)
		session.flush()
		session.expunge_all()
		session.commit()
Example #27
	def loadDataStructure(self, gene_annotation_picklef, min_MAF=0, min_distance=20000, \
						list_type_id=None, snp_matrix_fname=None, snp_matrix_data_type=1):
		"""
		2009-5-30
			add argument snp_matrix_fname
		2008-11-25
		2008-10-01
			wrap a few functions up, convenient for both run() and drawSNPRegion()
		"""
		db = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.db_user,
				   password=self.db_passwd, hostname=self.hostname, database=self.dbname, schema=self.schema)
		db.setup(create_tables=False)
		self.db = db
		snp_info = self.getSNPInfo(db)
		gene_annotation = self.dealWithGeneAnnotation(gene_annotation_picklef)
		if list_type_id:
			candidate_gene_list = self.getGeneList(list_type_id)
			candidate_gene_set = Set(candidate_gene_list)
		else:
			candidate_gene_set = Set()
		
		if snp_matrix_fname:
			if snp_matrix_data_type==3:
				matrix_data_type=float		#2009-3-23 for CNV amplitude file
			else:
				matrix_data_type=int
			snpData = SNPData(input_fname=snp_matrix_fname, turn_into_integer=1, turn_into_array=1, ignore_2nd_column=1, matrix_data_type=matrix_data_type)			
			#2008-12-05 fake a snp_info for findSNPsInRegion
			self.construct_chr_pos2index_forSNPData(snpData)
		else:
			snpData = None
			
		return_data = PassingData(gene_annotation=gene_annotation, snp_info=snp_info, \
								candidate_gene_set=candidate_gene_set, snpData=snpData)
		return return_data
Example #28
 def getAlignmentMatrix(self, alignment_id):
     sys.stderr.write("Getting alignment matrix for alignment=%s ..." %
                      (alignment_id))
     snp_pos_ls = []
     accession_id_ls = []
     name_ls = []
     data_matrix = []
     rows = Sequence.query.filter_by(alignment=alignment_id).order_by(
         Sequence.accession).all()
     counter = 0
     for row in rows:
         if counter == 0:
             snp_pos_ls = self.get_snp_pos_ls(row.alignment_obj.target,
                                              row.alignment_obj.chromosome,
                                              row.alignment_obj.start)
         accession_id_ls.append(row.accession)
         name_ls.append(row.accession_obj.name)
         data_row = dict_map(nt2number, row.bases)
         data_matrix.append(data_row)
         counter += 1
     data_matrix = num.array(data_matrix, num.int8)
     passingdata = PassingData(snp_pos_ls=snp_pos_ls,
                               accession_id_ls=accession_id_ls,
                               name_ls=name_ls,
                               data_matrix=data_matrix)
     sys.stderr.write(' %s accessions, %s bases. Done.\n' %
                      (len(accession_id_ls), len(snp_pos_ls)))
     return passingdata
Example #29
    def mapEachAlignment(self,
                         workflow=None,
                         alignmentData=None,
                         passingData=None,
                         transferOutput=True,
                         **keywords):
        """
		2012.9.22
			similar to reduceBeforeEachAlignmentData() but for mapping programs that run on one alignment each.
			
			passingData.AlignmentJobAndOutputLs = []
			passingData.bamFnamePrefix = bamFnamePrefix
			passingData.individual_alignment = alignment
		"""
        returnData = PassingData(no_of_jobs=0)
        returnData.jobDataLs = []

        topOutputDirJob = passingData.topOutputDirJob
        refFastaF = passingData.refFastaFList[0]

        alignment = alignmentData.alignment
        parentJobLs = alignmentData.jobLs
        bamF = alignmentData.bamF
        baiF = alignmentData.baiF

        bamFnamePrefix = alignment.getReadGroup()

        return returnData
Example #30
    def computing_node_handler(self, communicator, data,
                               computing_parameter_obj):
        """
		2008-09-10
			add source_id to PassingData
		2008-08-28
		"""
        node_rank = communicator.rank
        sys.stderr.write("Node no.%s working...\n" % node_rank)
        data = cPickle.loads(data)
        result_ls = []
        twoSNPData = computing_parameter_obj.twoSNPData
        QC_method_id = computing_parameter_obj.QC_method_id
        for row_id1, row_id2 in data:
            NA_rate, mismatch_rate, no_of_NAs, no_of_totals, no_of_mismatches, no_of_non_NA_pairs = twoSNPData.cmpOneRow(
                row_id1, row_id2)
            #the 2nd position in the row-id1 tuple is strain id
            if QC_method_id == 4:  #the 2nd position in the row-id2 tuple is strain id
                target_id = row_id2[1]
            else:
                target_id = row_id2
            qc_cross_match = PassingData(source_id=row_id1[0], strainid=row_id1[1], target_id=target_id, mismatch_rate=mismatch_rate, \
                     no_of_mismatches=no_of_mismatches, no_of_non_NA_pairs=no_of_non_NA_pairs)
            result_ls.append(qc_cross_match)
        sys.stderr.write("Node no.%s done with %s results.\n" %
                         (node_rank, len(result_ls)))
        return result_ls