def reduceEachInput(self,
                        chromosome=None,
                        passingData=None,
                        mapEachIntervalDataLs=None,
                        transferOutput=True,
                        **keywords):
        """
        2013.07.10
            #. concatenate all the sub-Inputs into one
        """
        returnData = PassingData(no_of_jobs=0)
        returnData.jobDataLs = []
        returnData.mapEachIntervalDataLs = mapEachIntervalDataLs

        #intervalJobLs = [pdata for pdata in mapEachIntervalDataLs]
        """
        realInputVolume = passingData.jobData.file.noOfIndividuals * \
            passingData.jobData.file.noOfLoci
        baseInputVolume = 200*20000
        walltime = self.scaleJobWalltimeOrMemoryBasedOnInput(
            realInputVolume=realInputVolume, \
            baseInputVolume=baseInputVolume, baseJobPropertyValue=60,
            minJobPropertyValue=60, maxJobPropertyValue=500).value
        job_max_memory = self.scaleJobWalltimeOrMemoryBasedOnInput(
            realInputVolume=realInputVolume, \
            baseInputVolume=baseInputVolume, baseJobPropertyValue=5000,
            minJobPropertyValue=5000, maxJobPropertyValue=10000).value
        """
        return returnData
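
Every example on this page builds and returns a PassingData object, used as a plain keyword-to-attribute container: constructed with keyword arguments, then read and extended through ordinary attribute access. A minimal sketch of such a container, inferred only from the usage visible here (the real PassingData may offer more, e.g. dict-style access):

class PassingData(object):
    """Minimal attribute bag: every keyword argument becomes an attribute."""
    def __init__(self, **keywords):
        for name, value in keywords.items():
            setattr(self, name, value)

    def __repr__(self):
        return 'PassingData(%s)' % ', '.join(
            '%s=%r' % (k, v) for k, v in sorted(self.__dict__.items()))

# usage mirroring the examples on this page
returnData = PassingData(no_of_jobs=0)
returnData.jobDataLs = []    # new attributes can be attached after construction
returnData.jobDataLs.append(PassingData(jobLs=[], file=None))
print(returnData)
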
Example #2
 def getFamilyStructure(self):
     """
     2013.07.19
     """
     sys.stderr.write("Finding unique pairs (singletons or groups) of parents ...\n ")
     noOfParents2FamilyData = {}
     for nodeID in self.pedigreeGraph:
         parents = list(self.pedigreeGraph.predecessors(nodeID))  #list() so len() and sort() also work under networkx>=2, where predecessors() returns an iterator
         noOfParents = len(parents)
         if noOfParents not in noOfParents2FamilyData:
             noOfParents2FamilyData[noOfParents] = PassingData(
                 parentTupleSet=set(), parentIDSet=set(), childIDSet=set(),\
                 individualIDSet=set())
         parents.sort()
         noOfParents2FamilyData[noOfParents].parentTupleSet.add(tuple(parents))
         for parentID in parents:
             noOfParents2FamilyData[noOfParents].parentIDSet.add(parentID)
             noOfParents2FamilyData[noOfParents].individualIDSet.add(parentID)
         noOfParents2FamilyData[noOfParents].childIDSet.add(nodeID)
         noOfParents2FamilyData[noOfParents].individualIDSet.add(nodeID)
     
     #number of distinct two-parent (nuclear) families, if any were found
     twoParentFamilyData = noOfParents2FamilyData.get(2, None)
     noOfNuclearFamilies = len(twoParentFamilyData.parentTupleSet) if twoParentFamilyData else 0
     
     self._reportFamilyStructure(noOfParents2FamilyData)
     return PassingData(noOfParents2FamilyData=noOfParents2FamilyData)
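
getFamilyStructure walks a directed pedigree graph whose edges run parent -> child and groups children by how many parents they have. A standalone sketch of the same grouping on a toy pedigree, assuming networkx for the graph and plain dicts in place of PassingData:

import networkx as nx

# toy pedigree: edges run parent -> child (matching the predecessors() usage above)
pedigreeGraph = nx.DiGraph()
pedigreeGraph.add_edges_from([
    ('dad1', 'kid1'), ('mom1', 'kid1'),   # nuclear family with two parents
    ('mom2', 'kid2'),                     # single known parent
])
pedigreeGraph.add_node('founder1')        # no parents at all

noOfParents2Family = {}
for nodeID in pedigreeGraph:
    parents = sorted(pedigreeGraph.predecessors(nodeID))
    data = noOfParents2Family.setdefault(
        len(parents),
        dict(parentTupleSet=set(), parentIDSet=set(), childIDSet=set(), individualIDSet=set()))
    data['parentTupleSet'].add(tuple(parents))
    data['parentIDSet'].update(parents)
    data['childIDSet'].add(nodeID)
    data['individualIDSet'].update(parents + [nodeID])

for noOfParents, data in sorted(noOfParents2Family.items()):
    print(noOfParents, 'parents:', len(data['parentTupleSet']), 'parent set(s),',
          len(data['childIDSet']), 'child(ren)')
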
Example #3
    def run(self):
        """
        2011-7-11
        """
        self.setup_run()
        
        inputData = PassingData(jobDataLs = [])
        inputFile = self.registerOneInputFile(self.inputFname, folderName=self.pegasusFolderName)
        inputData.jobDataLs.append(PassingData(output=inputFile, jobLs=[]))
        noOfTotalSequences = self.getNoOfSequencesFromFasta(inputFastaFname=self.inputFname)
        
        registerReferenceData = self.registerBlastNucleotideDatabaseFile(
            ntDatabaseFname=self.databaseFname, \
            input_site_handler=self.input_site_handler)
        ntDatabaseFileList = registerReferenceData.refFastaFList
        ntDatabaseFile = ntDatabaseFileList[0]

        if len(ntDatabaseFileList)<4:	#some nt-database index file is missing
            sys.stderr.write("Adding blast-db-making job...")
            makeBlastDBJob = self.addMakeBlastDBJob(executable=self.formatdb,\
                inputFile=ntDatabaseFile, transferOutput=True)
            #add the index files to the ntDatabaseFileList
            ntDatabaseFileList = [ntDatabaseFile] + makeBlastDBJob.outputList
            sys.stderr.write(".\n")
        else:
            makeBlastDBJob = None
        
        self.addJobs(inputData=inputData, outputDirPrefix=self.pegasusFolderName,
            ntDatabaseFileList=ntDatabaseFileList, \
            noOfTotalSequences=noOfTotalSequences, \
            transferOutput=True, makeBlastDBJob=makeBlastDBJob)
        
        self.end_run()
Example #4
    def addJobs(self, inputURL=None, relativePathList=None, outputDir="", username=None, password=None, \
            transferOutput=True):
        """
        2012.6.27
        """
        if relativePathList is None:
            #avoid the mutable-default-argument pitfall
            relativePathList = []
        sys.stderr.write("Adding wget jobs for %s input files ... " %
                         (len(relativePathList)))
        no_of_jobs = 0

        topOutputDir = outputDir
        topOutputDirJob = self.addMkDirJob(outputDir=topOutputDir)
        no_of_jobs += 1
        returnData = PassingData()
        returnData.jobDataLs = []

        for relativePath in relativePathList:
            #2013.06.26 remove all "/" from  relativePath in case it's a folder
            relativePathNoFolder = relativePath.replace('/', '_')
            logFile = File('%s.log' % (relativePathNoFolder))
            wgetJob = self.addWgetJob(executable=self.wget, url=inputURL,
                relativePath=relativePath, \
                username=username, password=password,\
                targetFolder=outputDir, logFile=logFile,
                cut_dir_number=self.cut_dir_number,
                parentJobLs=[topOutputDirJob], extraDependentInputLs=[], \
                transferOutput=transferOutput, \
                extraArguments=None, job_max_memory=50)
            #keep all outputs of the wget job in fileLs
            returnData.jobDataLs.append(PassingData(jobLs=[wgetJob], file=wgetJob.output, \
                fileLs=wgetJob.outputLs))
            no_of_jobs += 1
        sys.stderr.write("%s jobs.\n" % (no_of_jobs))

        return returnData
 def linkMapToReduce(self, mapEachIntervalData=None,
     preReduceReturnData=None, passingData=None, transferOutput=True, **keywords):
     """
     """
     returnData = PassingData(no_of_jobs = 0)
     returnData.jobDataLs = []
     return returnData
Example #6
 def reduceEachChromosome(self, chromosome=None, passingData=None,
     mapEachInputDataLs=None, 
     chromosome2mapEachIntervalDataLs=None,\
     reduceEachInputDataLs=None,\
     transferOutput=True, \
     **keywords):
     """
     """
     returnData = PassingData(no_of_jobs = 0)
     returnData.jobDataLs = []
     returnData.mapEachInputDataLs = mapEachInputDataLs
     returnData.reduceEachInputDataLs = reduceEachInputDataLs
     #reduce matrix by chosen column and average p-value
     
     outputFile = File(os.path.join(self.reduceEachChromosomeDirJob.output,
         'chr_%s_LocusLiftOverProbability.tsv.gz'%(chromosome)))
     reduceChromosomeJob = self.addStatMergeJob(
         statMergeProgram=self.mergeSameHeaderTablesIntoOne, \
         outputF=outputFile, \
         parentJobLs=[self.reduceEachChromosomeDirJob],extraOutputLs=None, \
         extraDependentInputLs=None, transferOutput=False)
         #extraArgumentList=['--keyColumnLs 0-6 --valueColumnLs 7'],\
     mapEachIntervalDataLs = chromosome2mapEachIntervalDataLs.get(chromosome)
     for mapEachIntervalData in mapEachIntervalDataLs:
         for jobData in mapEachIntervalData.jobDataLs:
             self.addInputToMergeJob(reduceChromosomeJob, parentJobLs=[jobData.job])
         
     #add the reduction job to final stat merge job
     self.addInputToMergeJob(self.reduceJob, parentJobLs=[reduceChromosomeJob])
     
     return returnData
    def mapEachAlignment(self, alignmentData=None,  passingData=None,
        transferOutput=True, **keywords):
        """
        2012.9.22
            similar to reduceBeforeEachAlignmentData() but
             for mapping programs that run on one alignment each.

            passingData.alignmentJobAndOutputLs = []
            passingData.bamFnamePrefix = bamFnamePrefix
            passingData.individual_alignment = alignment
        """
        returnData = PassingData(no_of_jobs = 0)
        returnData.jobDataLs = []

        topOutputDirJob = passingData.topOutputDirJob
        refFastaF = passingData.refFastaFList[0]

        alignment = alignmentData.alignment
        parentJobLs = alignmentData.jobLs
        bamF = alignmentData.bamF
        baiF = alignmentData.baiF

        bamFnamePrefix = alignment.getReadGroup()

        return returnData
 def preReduce(self, passingData=None, transferOutput=True, **keywords):
     """
     setup additional mkdir folder jobs, before mapEachAlignment,
         mapEachChromosome, mapReduceOneAlignment
     """
     returnData = PassingData(no_of_jobs = 0)
     returnData.jobDataLs = []
     return returnData
 def mapEachChromosome(self, alignmentData=None, chromosome=None,\
     VCFJobData=None, passingData=None,
     reduceBeforeEachAlignmentData=None, transferOutput=True, **keywords):
     """
     2012.9.17
     """
     returnData = PassingData(no_of_jobs = 0)
     returnData.jobDataLs = []
     return returnData
 def map(self, alignmentData=None, intervalData=None,\
     VCFJobData=None, passingData=None,
     mapEachChromosomeData=None, transferOutput=True, **keywords):
     """
     2012.9.17
     """
     returnData = PassingData(no_of_jobs = 0)
     returnData.jobDataLs = []
     return returnData
 def reduceAfterEachChromosome(self, chromosome=None, passingData=None,
     transferOutput=True,
     mapEachIntervalDataLs=None, **keywords):
     """
     """
     returnData = PassingData(no_of_jobs = 0)
     returnData.jobDataLs = []
     returnData.mapEachIntervalDataLs = mapEachIntervalDataLs
     return returnData
 def reduce(self, passingData=None, reduceAfterEachAlignmentDataLs=None,
         transferOutput=True, **keywords):
     """
     2012.9.17
     """
     returnData = PassingData(no_of_jobs = 0)
     returnData.jobDataLs = []
     returnData.reduceAfterEachAlignmentDataLs = reduceAfterEachAlignmentDataLs
     return returnData
 def reduceBeforeEachAlignment(self, passingData=None,
     transferOutput=True, **keywords):
     """
     2012.9 set up some reduce jobs before the loop over all intervals of one alignment begins.
         these reduce jobs will collect stuff from each map() job.
         the link will be established in linkMapToReduce().
     """
     returnData = PassingData(no_of_jobs = 0)
     returnData.jobDataLs = []
     return returnData
 def reduceAfterEachAlignment(self, passingData=None,
     mapEachChromosomeDataLs=None,
     reduceAfterEachChromosomeDataLs=None,\
     transferOutput=True, **keywords):
     """
     """
     returnData = PassingData(no_of_jobs = 0)
     returnData.jobDataLs = []
     returnData.mapEachChromosomeDataLs = mapEachChromosomeDataLs
     returnData.reduceAfterEachChromosomeDataLs = reduceAfterEachChromosomeDataLs
     return returnData
Example #15
    def mapEachInterval(self, inputJobData=None, selectIntervalJobData=None, \
        chromosome=None,intervalData=None,\
        mapEachChromosomeData=None, \
        passingData=None, transferOutput=False, **keywords):
        """
        #. extract flanking sequences from the input Input (ref sequence file => contig ref sequence)
        #. blast them
        #. run FindSNPPositionOnNewRefFromFlankingBlastOutput.py
            #. where hit length match query length, and
            #    no of mismatches <=2 => good => infer new coordinates
        #. output a mapping file between old SNP and new SNP coordinates.
            #. reduce this thing by combining everything
        #. make a new Input file based on the input split Input file
            (replace contig ID , position with the new one's,
                remove the header part regarding chromosomes or replace it)

        """
        returnData = PassingData(no_of_jobs = 0)
        returnData.jobDataLs = []
        """
        ## 2013.06.19 structures available from passingData, specific to the interval
        passingData.splitInputFile = splitInputFile
        passingData.unitNumber = unitNumber
        passingData.intervalFileBasenamePrefix = '%s_%s_splitInput_u%s'%(
            chromosome, commonPrefix, unitNumber)
        passingData.noOfIndividuals = jobData.file.noOfIndividuals
        passingData.span = self.intervalSize + self.intervalOverlapSize*2
        """
        #add one computing job
        outputFile = File(os.path.join(self.mapDirJob.output,
            "%s.%s.probability.tsv.gz"%(passingData.fileBasenamePrefix,\
            intervalData.interval)))
        locusIntervalDeltaOutputFile = File(os.path.join(self.mapDirJob.output,
            "%s.%s.locusIntervalDelta.tsv.gz"%(passingData.fileBasenamePrefix,
            intervalData.interval)))
        job = self.addAbstractMatrixFileWalkerJob(
            executable=self.ComputeLiftOverLocusProbability, \
            inputFile=selectIntervalJobData.file, outputFile=outputFile, \
            whichColumn=None, whichColumnHeader=None, \
            logY=None, valueForNonPositiveYValue=-1, \
            minNoOfTotal=1, samplingRate=1, \
            inputFileFormat=None, outputFileFormat=None,\
            extraArgumentList=["--locusIntervalDeltaOutputFname", locusIntervalDeltaOutputFile, \
                "--startPosition %s"%(intervalData.start), "--stopPosition %s"%(intervalData.stop)],
            parentJobLs=[selectIntervalJobData.job],
            extraOutputLs=[locusIntervalDeltaOutputFile],\
            transferOutput=transferOutput, job_max_memory=2000, sshDBTunnel=False)
            #For each interval, probabilities are not calculated for loci in
            #  extra segment (from overlapStart to start).
        returnData.jobDataLs.append(self.constructJobDataFromJob(job))
        return returnData
 def mapEachAlignment(self,
                      passingData=None,
                      transferOutput=True,
                      **keywords):
     """
     2012.9.22
         similar to reduceBeforeEachAlignmentData()
          but for mapping programs that run on one alignment each.
     """
     returnData = PassingData(no_of_jobs=0)
     returnData.jobDataLs = []
     return returnData
    def mapEachChromosome(self, alignmentData=None, chromosome=None,\
        VCFJobData=None, passingData=None, reduceBeforeEachAlignmentData=None,
        transferOutput=True, **keywords):
        """
        2012.9.17
        """
        returnData = PassingData(no_of_jobs=0)
        returnData.jobDataLs = []

        topOutputDirJob = passingData.topOutputDirJob

        alignment = alignmentData.alignment
        parentJobLs = alignmentData.jobLs
        bamF = alignmentData.bamF
        baiF = alignmentData.baiF
        bamFnamePrefix = passingData.bamFnamePrefix
        """
        #2012.9.21 perhaps a downsampling job
        outputFname = os.path.join(topOutputDirJob.output, \
            '%s_%s.bam'%(bamFnamePrefix, overlapFileBasenameSignature))
        outputFile = File(outputFname)
        selectAlignmentJob, bamIndexJob1 = self.addSelectAlignmentJob(
            executable=self.samtools, inputFile=bamF, \
            outputFile=outputFile, region=overlapInterval,
            parentJobLs=[topOutputDirJob] + parentJobLs, \
            extraDependentInputLs=[baiF], transferOutput=False, \
            extraArguments=None, job_max_memory=2000, needBAMIndexJob=True)
        """
        """
        #2012.9.21 count covariates job is moved to map()
        recalFile = File(os.path.join(topOutputDirJob.output,
            '%s_%s.recal_data.csv'%(bamFnamePrefix, chromosome)))
        countCovariatesJob = self.addGATKBaseRecalibratorJob(
            GenomeAnalysisTKJar=self.GenomeAnalysisTK2Jar, inputFile=bamF, \
            VCFFile=VCFFile, interval=chromosome, outputFile=recalFile, \
            refFastaFList=passingData.refFastaFList,
            parentJobLs=[topOutputDirJob]+parentJobLs, 
            extraDependentInputLs=[baiF, VCFFile.tbi_F], \
            transferOutput=False, \
            extraArguments=None, job_max_memory=4000)

        self.no_of_jobs += 1
        returnData.countCovariatesJob = countCovariatesJob
        returnData.jobDataLs.append(PassingData(jobLs=[countCovariatesJob],
            file=countCovariatesJob.recalFile, \
            fileLs=[countCovariatesJob.recalFile]))
        """

        return returnData
    def parseQueryLocusID(self, locus_id=None):
        """
        2012.10.8
            locus_id is in the format of '%s_%s_%s_positionInFlank%s'%(chromosome, start, stop, flankingLength+1)
            output of ExtractFlankingSequenceForVCFLoci.py
        """
        search_result = ExtractFlankingSequenceForVCFLoci.sequenceTitlePattern.search(
            locus_id)
        chromosome = None
        start = None
        stop = None
        refBase = None
        altBase = None
        positionInFlank = None
        if search_result:
            chromosome = search_result.group(1)
            start = int(search_result.group(2))
            stop = int(search_result.group(3))
            refBase = search_result.group(4)
            altBase = search_result.group(5)
            positionInFlank = int(search_result.group(6))

        return PassingData(chromosome=chromosome,
                           start=start,
                           stop=stop,
                           refBase=refBase,
                           altBase=altBase,
                           positionInFlank=positionInFlank)
	def returnLocusLowMapQualityIndicator(self, alignedReadLs=None, minMapQGoodRead=2, minFractionOfGoodRead=0.9):
		"""
		2013.12.04
		"""
		totalNoOfReads = 0
		noOfGoodReads = 0.0
		medianMapQ=-10
		mapQList=[]
		for alignedRead in alignedReadLs:
			totalNoOfReads +=1
			mapQList.append(alignedRead.mapq)
			if alignedRead.mapq>=minMapQGoodRead:
				noOfGoodReads += 1
			else:
				pass
		if totalNoOfReads>0:
			fractionOfGoodRead = noOfGoodReads/(totalNoOfReads)
			medianMapQ = numpy.median(mapQList)
		else:
			fractionOfGoodRead = -1
			medianMapQ = -10
			
		if fractionOfGoodRead>=minFractionOfGoodRead:
			locusLowMapQIndicator = 0
		else:
			locusLowMapQIndicator = 2
		return PassingData(locusLowMapQIndicator=locusLowMapQIndicator, totalNoOfReads=totalNoOfReads, \
						noOfGoodReads=noOfGoodReads, fractionOfGoodRead=fractionOfGoodRead,\
						medianMapQ=medianMapQ)
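
returnLocusLowMapQualityIndicator only needs objects exposing a mapq attribute (pysam aligned reads in the real pipeline). A toy illustration of the thresholds, with a stand-in read type and the same logic written as a free function:

from collections import namedtuple
import numpy

Read = namedtuple('Read', ['mapq'])   # stand-in for a pysam aligned read

def locusLowMapQualityIndicator(alignedReadLs, minMapQGoodRead=2, minFractionOfGoodRead=0.9):
    """Same logic as returnLocusLowMapQualityIndicator above, without the class."""
    mapQList = [read.mapq for read in alignedReadLs]
    totalNoOfReads = len(mapQList)
    if totalNoOfReads > 0:
        noOfGoodReads = sum(1 for q in mapQList if q >= minMapQGoodRead)
        fractionOfGoodRead = noOfGoodReads / float(totalNoOfReads)
        medianMapQ = numpy.median(mapQList)
    else:
        fractionOfGoodRead, medianMapQ = -1, -10
    locusLowMapQIndicator = 0 if fractionOfGoodRead >= minFractionOfGoodRead else 2
    return locusLowMapQIndicator, fractionOfGoodRead, medianMapQ

print(locusLowMapQualityIndicator([Read(30), Read(40), Read(0)]))
# fraction of good reads = 2/3 < 0.9, so the locus is flagged (indicator 2)
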
Example #20
 def parseFastaDescriptionForGenBank(self, descriptionLine=None,
     FigureOutTaxID_ins=None):
     """
     possible header lines:
 
 >gi|51511461|ref|NC_000001.8|NC_000001 Homo sapiens chromosome 1, complete sequence
 >gi|186497660|ref|NC_003070.6| Arabidopsis thaliana chromosome 1, complete sequence
 >gi|26556996|ref|NC_001284.2| Arabidopsis thaliana mitochondrion, complete genome
 >gi|115442598|ref|NC_008394.1| Oryza sativa (japonica cultivar-group) genomic DNA, chromosome 1
     """
     #discard '>' and '\n'
     header = descriptionLine[1:-1]
     header = header.split('|')
     _tax_id = FigureOutTaxID_ins.returnTaxIDGivenSentence(header[4])
     
     if self.p_chromosome.search(header[4]) is not None:
         chromosome = self.p_chromosome.search(header[4]).groups()[0]
     elif header[4].find('mitochondrion')!=-1:
         chromosome = 'mitochondrion'
     elif header[4].find('chloroplast')!=-1:
         chromosome = 'chloroplast'
     else:	#something else, take the whole before ','
         chromosome = header[4].split(',')[0]
     gi = int(header[1])
     acc_ver = header[3]
     comment = header[4]
     return PassingData(tax_id=_tax_id, gi=gi, comment=comment, 
         acc_ver=acc_ver, chromosome=chromosome)
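
parseFastaDescriptionForGenBank relies on self.p_chromosome (a regex defined elsewhere in the class) and a FigureOutTaxID helper for the tax_id. A standalone sketch of the same header parsing, with a guessed chromosome pattern and the taxonomy lookup left out (both are assumptions):

import re

# guessed stand-in for self.p_chromosome; the real pattern is defined elsewhere in the class
p_chromosome = re.compile(r'chromosome (\w+)')

def parseGenBankHeader(descriptionLine):
    header = descriptionLine[1:].rstrip('\n').split('|')   # drop '>' and the newline
    comment = header[4]
    match = p_chromosome.search(comment)
    if match:
        chromosome = match.group(1)
    elif 'mitochondrion' in comment:
        chromosome = 'mitochondrion'
    elif 'chloroplast' in comment:
        chromosome = 'chloroplast'
    else:
        chromosome = comment.split(',')[0]
    return dict(gi=int(header[1]), acc_ver=header[3], chromosome=chromosome, comment=comment)

line = ">gi|186497660|ref|NC_003070.6| Arabidopsis thaliana chromosome 1, complete sequence\n"
print(parseGenBankHeader(line))
# gi=186497660, acc_ver='NC_003070.6', chromosome='1'
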
Example #21
 def parseFastaDescriptionForGenebank_hs37d5(self, 
     descriptionLine=None, FigureOutTaxID_ins=None):
     """
     >1 dna:chromosome chromosome:GRCh37:1:1:249250621:1
     >Y dna:chromosome chromosome:GRCh37:Y:2649521:59034049:1
     >MT gi|251831106|ref|NC_012920.1| Homo sapiens mitochondrion, complete genome
     >GL000207.1 dna:supercontig supercontig::GL000207.1:1:4262:1
     >GL000226.1 dna:supercontig supercontig::GL000226.1:1:15008:1
     >NC_007605
     >hs37d5
     """
     header = descriptionLine[1:-1]
     headerList = header.split()        
     chromosome = headerList[0]
     comment = ' '.join(headerList[1:])
     gi = None
     acc_ver = None
     accitem = re.compile(r'supercontig')
     if accitem.search(header) is not None:
         acc_ver = headerList[0]
     else:	  
         commentSplit = comment.split("|")
         if(len(commentSplit) > 4):
             #deal with MT
             gi = int(commentSplit[1])
             acc_ver = commentSplit[3]
             comment = commentSplit[4]
     return PassingData(tax_id=None, gi=gi, comment=comment, acc_ver=acc_ver,
         chromosome=chromosome)
Example #22
    def parseFastaDescriptionForWUSTLVervetScaffolds(self, descriptionLine=None,
        FigureOutTaxID_ins=None):
        """
        2011-7-6
            
        """
        """
        possible header lines:
        >Contig0  12652774 13406928

        """
        header = descriptionLine[1:-1]	#discard '>' and '\n'
        header = header.split()
        chromosome = header[0]	#contig name is taken as chromosome
        """
        p_chromosome = re.compile(r'Contig(\d+)')
        if p_chromosome.search(header[0]) is not None:
            chromosome = p_chromosome.search(header[0]).groups()[0]
        else:
            chromosome = None
        """
        gi = None
        acc_ver = None
        comment = None
        return PassingData(tax_id=None, gi=gi, comment=comment, acc_ver=acc_ver,
            chromosome=chromosome)
Example #23
    def parseFastaDescriptionForFullVervetBACs(self, descriptionLine=None,
        FigureOutTaxID_ins=None):
        """
        2011-7-6
            
        possible header lines:
            
>gi|285026568|gb|AC239257.2| Chlorocebus aethiops chromosome UNK clone CH252-270J24, WORKING DRAFT SEQUENCE, 2 unordered pieces
>gi|281332227|gb|AC238852.3| Chlorocebus aethiops BAC clone CH252-133A18 from chromosome 3, complete sequence
>gi|285002488|gb|AC239185.3| Chlorocebus aethiops BAC clone CH252-404N12 from chromosome unknown, complete sequence


        """
        header = descriptionLine[1:-1]	#discard '>' and '\n'
        header = header.split('|')
        _tax_id = None
        # 1st type of clone description
        p_chromosome = re.compile(r'UNK clone ([^,]+),')
        # 2nd type of clone description
        p2_chromosome = re.compile(r'clone ([^,]+),')
        
        if p_chromosome.search(header[4]) is not None:
            chromosome = p_chromosome.search(header[4]).groups()[0]
        else:
            if p2_chromosome.search(header[4]) is not None:
                chromosome = p2_chromosome.search(header[4]).groups()[0]
            else:
                chromosome = None
        gi = int(header[1])
        acc_ver = header[3]
        comment = header[4]
        return PassingData(tax_id=_tax_id, gi=gi, comment=comment, 
            acc_ver=acc_ver, chromosome=chromosome)
Example #24
def estimateMeanStdFromData(dataVector=None, excludeTopFraction=0.2):
    """
	2012.10.14
		adapted from vervet/src/pedigree/DetectWrongLabelByCompKinshipVsIBD.DetectWrongLabelByCompKinshipVsIBD.estimateAbsDeltaMeanStd()
	2012.8.22
	"""
    sys.stderr.write("Estimating mean&std using the middle %.1f%% of data (n=%s) ..."%\
        ((1-excludeTopFraction)*100, len(dataVector)))
    noOfRows = len(dataVector)
    import numpy
    # 2012.8.22 draw some histogram to check what data looks like
    #		if len(dataVector)>10:
    #			outputFname = '%s_kinship_ibd_hist.png'%(self.outputFnamePrefix)
    #			yh_matplotlib.drawHist(dataVector, title='', \
    #							xlabel_1D="kinship-ibd", xticks=None, \
    #							outputFname=outputFname, min_no_of_data_points=10, \
    #							needLog=True, \
    #							dpi=200, min_no_of_bins=25)
    #dataVector = map(abs, dataVector)	#2012.8.23 no abs
    dataVector.sort()
    startIndex = max(0, int(len(dataVector) * (excludeTopFraction / 2)) - 1)  # max, not min: skip the lowest excludeTopFraction/2 of values
    stopIndex = int(len(dataVector) * (1 - excludeTopFraction / 2))
    dataVector = dataVector[startIndex:stopIndex]

    data_mean = numpy.mean(dataVector)
    data_std = numpy.std(dataVector)

    sys.stderr.write(" mean=%.3f, std=%.3f.\n" % (data_mean, data_std))
    return PassingData(mean=data_mean, std=data_std)
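
A quick illustration of the trimming in estimateMeanStdFromData, assuming the function as defined above (with the max(0, ...) clamp) and that PassingData exposes mean and std as attributes:

dataVector = list(range(100))     # 0..99
result = estimateMeanStdFromData(dataVector=dataVector, excludeTopFraction=0.2)
# indices 9:90 survive the trimming, i.e. values 9..89
print(result.mean, result.std)    # 49.0 and roughly 23.4
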
 def readThroughAndProvideSummary(self):
     """
     2013.08.30
         called by vervet/src/db/import/AddAlignmentDepthIntervalFile2DB.py
     """
     col_name2index= self.smartReadHeader()
     if col_name2index is None:
         pdata = self.parseRow(self._row)
         self._postProcessParsedRowDataForSummary(pdata)
     
     for row in self:
         pdata = self.parseRow(row)
         self._postProcessParsedRowDataForSummary(pdata)
     
     
     self.min_interval_length = numpy.min(self.interval_length_ls)
     self.max_interval_length = numpy.max(self.interval_length_ls)
     self.median_interval_length = numpy.median(self.interval_length_ls)
     
     self.mean_interval_value = numpy.mean(self.interval_value_ls)
     self.median_interval_value = numpy.median(self.interval_value_ls)
     return PassingData(
         no_of_intervals=self.no_of_intervals,
         chromosome_size=self.chromosome_size,
         mean_interval_value=self.mean_interval_value,
         median_interval_value=self.median_interval_value,
         min_interval_value=self.min_interval_value,
         max_interval_value=self.max_interval_value,
         
         min_interval_length=self.min_interval_length,
         max_interval_length=self.max_interval_length,
         median_interval_length=self.median_interval_length)
    def openWriteBeagleFiles(self,
                             pedigreeFamilyData=None,
                             outputFnamePrefix=None):
        """
        2013.05.02
            
        The non-likelihood (unphased, trios, pairs) Beagle format:
            I id sample1 sample1 sample2 sample2
            A diabetes 1 1 2 2
            M rs12082861 C C C C
            M rs4912233 T C C C
            M rs12732823 G A A A
            M rs17451521 C C C C
            M rs12033358 C T T T
        
        The likelihood version is
            marker alleleA alleleB 1000_709_1996093_GA_vs_524 1000_709_1996093_GA_vs_524 1000_709_1996093_GA_vs_524 1001_710_1995025_GA_vs_524 1001_710_1995025_GA_vs_524 1001_710_1995025_GA_vs_524 1002_711_2001039_GA_vs_524
            Contig791:1086 C A 0.9693 0.0307 0.0000 0.6660 0.3338 0.0003 0.0000
            Contig791:1649 G C 0.9406 0.0594 0.0000 0.9693 0.0307 0.0000 0.0000
            Contig791:4084 A C 0.9980 0.0020 0.0000 0.9844 0.0156 0.0000 0.0000
        
        The markers file has this format (markerID, position, alleleA, alleleB)
            Contig791:1086 1086 C A
        """
        sys.stderr.write(
            "Opening beagle files (outputFnamePrefix =%s) to write ..." %
            (outputFnamePrefix))
        familySize2BeagleFileHandler = {}
        familySize2SampleIDList = pedigreeFamilyData.familySize2SampleIDList
        counter = 0
        for familySize, sampleIDList in familySize2SampleIDList.items():
            if familySize not in familySize2BeagleFileHandler:
                tmpOutputFnamePrefix = '%s_familySize%s' % (outputFnamePrefix,
                                                            familySize)
                writer = MatrixFile(path='%s.bgl' % (tmpOutputFnamePrefix),
                                    mode='w',
                                    delimiter=' ')
                familySize2BeagleFileHandler[familySize] = writer
                if familySize == 1:
                    headerRow = ['marker', 'alleleA', 'alleleB']
                else:
                    headerRow = ['I', 'id']
                for sampleID in sampleIDList:
                    if familySize == 1:
                        #likelihood format has sample name replicated three times, rather than 2 times
                        headerRow.extend([sampleID] * 3)
                    else:
                        headerRow.extend([sampleID] * 2)
                writer.writeHeader(headerRow)
                counter += 1
        markersFile = MatrixFile(path='%s.markers' % (outputFnamePrefix),
                                 mode='w',
                                 delimiter=' ')

        counter += 1
        sys.stderr.write("%s files outputted.\n" % (counter))

        return PassingData(
            familySize2BeagleFileHandler=familySize2BeagleFileHandler,
            markersFile=markersFile)
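
The two header layouts produced by the loop in openWriteBeagleFiles (likelihood format for familySize 1 with each sample ID tripled, otherwise the unphased 'I id' format with each sample ID doubled), sketched without the MatrixFile helper:

def beagleHeaderRow(familySize, sampleIDList):
    # mirrors the header logic of openWriteBeagleFiles above
    if familySize == 1:
        headerRow = ['marker', 'alleleA', 'alleleB']
        for sampleID in sampleIDList:
            headerRow.extend([sampleID] * 3)   # likelihood format: each sample appears 3 times
    else:
        headerRow = ['I', 'id']
        for sampleID in sampleIDList:
            headerRow.extend([sampleID] * 2)   # unphased format: each sample appears twice
    return headerRow

print(' '.join(beagleHeaderRow(1, ['1000_709_1996093_GA_vs_524'])))
# marker alleleA alleleB 1000_709_1996093_GA_vs_524 1000_709_1996093_GA_vs_524 1000_709_1996093_GA_vs_524
print(' '.join(beagleHeaderRow(3, ['sample1', 'sample2'])))
# I id sample1 sample1 sample2 sample2
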
Example #27
 def parseInputFile(self, inputFname=None, **keywords):
     """
     2013.08.23
         if a program is adding a file to db-affiliated storage,
          this is used for parsing.
     """
     return PassingData()
Example #28
	def run(self):
		"""
		11-13-05 
			--db_connect()
			--parse_entrezgene_xml_file()
				--is_gi_valid_in_annot_assembly_table()
				--find_info_dict()
					--return_location_list()
				--submit_to_entrezgene_mapping_table()
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		
		sys.stderr.write("\tTotally, %d files to be processed.\n"%len(self.inputfiles))
		db = GenomeDatabase(drivername=self.drivername, username=self.db_user,
						password=self.db_passwd, hostname=self.hostname, database=self.dbname, schema=self.schema)
		db.setup(create_tables=False)	#2010-6-22
		session = db.session
		param_obj = PassingData(session=db.session, no_of_genes_already_in_db=0, no_of_entrezgene_mappings_already_in_db=0,\
					no_of_total=0, no_of_into_db=0, report=self.report, no_of_commentaries_already_in_db=0,\
					no_of_gene_segments_already_in_db=0, no_of_gene2go_already_in_db=0)
		for f in self.inputfiles:
			sys.stderr.write("%d/%d:\t%s\n"%(self.inputfiles.index(f)+1,len(self.inputfiles),f))
			self.parse_xml_file(session, f, tax_id=self.tax_id, param_obj=param_obj)
		
		session.flush()
		if self.commit:
			session.commit()
		else:
			session.rollback()
Example #29
    def avgKey2DataLs(self, key2dataLs, no_of_key_columns=1, header=[]):
        """
        1. take mean/median/stdev of every cell in dataLs,
        2. modify newHeader to reflect that
        """
        print(f"Averaging key2dataLs ({len(key2dataLs)} entries ) ...",
              flush=True)
        newKey2DataLs = {}
        newHeader = []
        keyColHeader = header[:no_of_key_columns]
        valueColHeader = header[no_of_key_columns:]
        newValueColHeader = []
        no_of_value_columns = len(valueColHeader)
        for i in range(no_of_value_columns):
            valueColName = valueColHeader[i]
            newValueColHeader += [
                'mean_%s' % (valueColName),
                'median_%s' % (valueColName),
                'stdev_%s' % (valueColName)
            ]

        for key, dataLs in key2dataLs.items():
            if key not in newKey2DataLs:
                newKey2DataLs[key] = []
            no_of_value_columns = len(dataLs)
            for i in range(no_of_value_columns):
                meanValue = numpy.mean(dataLs[i])
                medianValue = numpy.median(dataLs[i])
                stdev = numpy.std(dataLs[i])
                newKey2DataLs[key] += [meanValue, medianValue, stdev]
        print(f"Done.", flush=True)
        return PassingData(key2dataLs=newKey2DataLs,
                           header=keyColHeader + newValueColHeader)
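
A tiny worked example of avgKey2DataLs (here self is assumed to be any instance of the class defining it): each key maps to a list of per-column value lists, and every value column is replaced by its mean/median/stdev triple:

key2dataLs = {
    ('chr1',): [[1.0, 2.0, 3.0], [10.0, 10.0, 40.0]],   # one key column, two value columns
}
header = ['chromosome', 'depth', 'quality']

result = self.avgKey2DataLs(key2dataLs, no_of_key_columns=1, header=header)
print(result.header)
# ['chromosome', 'mean_depth', 'median_depth', 'stdev_depth', 'mean_quality', 'median_quality', 'stdev_quality']
print(result.key2dataLs[('chr1',)])
# [2.0, 2.0, ~0.816, 20.0, 10.0, ~14.14]
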
    def reduce(self,
               passingData=None,
               reduceEachChromosomeDataLs=None,
               transferOutput=True,
               **keywords):
        """
        #. merge all output of input jobs (passingData.mapEachIntervalDataLsLs) into one big one
        
        """
        returnData = PassingData(no_of_jobs=0)
        returnData.jobDataLs = []
        reduceOutputDirJob = passingData.reduceOutputDirJob

        realInputVolume = passingData.jobData.file.noOfIndividuals * passingData.jobData.file.noOfLoci
        baseInputVolume = 200 * 20000
        walltime = self.scaleJobWalltimeOrMemoryBasedOnInput(
            realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume,
            baseJobPropertyValue=60,
            minJobPropertyValue=60,
            maxJobPropertyValue=500).value
        job_max_memory = self.scaleJobWalltimeOrMemoryBasedOnInput(
            realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume,
            baseJobPropertyValue=5000,
            minJobPropertyValue=5000,
            maxJobPropertyValue=10000).value

        outputFile = File(
            os.path.join(reduceOutputDirJob.output, 'sameSiteConcordance.tsv'))
        reduceJob = self.addStatMergeJob(
            statMergeProgram=self.mergeSameHeaderTablesIntoOne,
            outputF=outputFile,
            parentJobLs=[reduceOutputDirJob],
            transferOutput=transferOutput,
        )
        returnData.jobDataLs.append(
            PassingData(jobLs=[reduceJob],
                        file=reduceJob.output,
                        fileLs=[reduceJob.output]))

        for mapEachIntervalDataLs in passingData.mapEachIntervalDataLsLs:
            for mapEachIntervalData in mapEachIntervalDataLs:
                self.addInputToMergeJob(reduceJob, \
                        parentJobLs=[mapEachIntervalData.mapJob])

        return returnData
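
scaleJobWalltimeOrMemoryBasedOnInput is used above (and in the commented block of the first example) to grow walltime and memory with input volume while clamping to a floor and a ceiling; the call returns an object whose .value holds the scaled number. Its actual formula is not shown on this page, so the sketch below is purely an assumption (simple linear scaling by realInputVolume/baseInputVolume, then clamped) to illustrate the pattern:

def scaleJobPropertyBasedOnInput(realInputVolume, baseInputVolume,
        baseJobPropertyValue, minJobPropertyValue, maxJobPropertyValue):
    """Hypothetical stand-in for scaleJobWalltimeOrMemoryBasedOnInput.
    The real helper may use a different curve and returns an object with a
    .value attribute; this sketch returns the number directly."""
    scaled = baseJobPropertyValue * float(realInputVolume) / float(baseInputVolume)
    return min(maxJobPropertyValue, max(minJobPropertyValue, int(scaled)))

# e.g. 400 individuals x 20000 loci, against the 200 x 20000 baseline used above
realInputVolume = 400 * 20000
print(scaleJobPropertyBasedOnInput(realInputVolume, 200 * 20000, 60, 60, 500))       # walltime -> 120
print(scaleJobPropertyBasedOnInput(realInputVolume, 200 * 20000, 5000, 5000, 10000)) # memory -> 10000
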