Example #1
    def __init__(self,
                 chr=None,
                 chromosome=None,
                 chromosomeSize=None,
                 start=None,
                 stop=None,
                 overlapStart=None,
                 overlapStop=None,
                 **keywords):
        PassingData.__init__(self, chr=chr, chromosome=chromosome,
            chromosomeSize=chromosomeSize, start=start, stop=stop,
            overlapStart=overlapStart, overlapStop=overlapStop, **keywords)
        if not hasattr(self, 'file'):
            self.file = None
        if not hasattr(self, 'jobLs'):
            self.jobLs = []
        if self.chr is None and self.chromosome:
            self.chr = self.chromosome
        elif self.chr and self.chromosome is None:
            self.chromosome = self.chr

        if self.overlapStart is None:
            self.overlapStart = self.start

        if self.overlapStop is None:
            self.overlapStop = self.stop

        self.subIntervalLs = []
        self.subIntervalLs.append((self.overlapStart, self.overlapStop))
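
A minimal usage sketch (the enclosing class name is not shown above; IntervalData below is a hypothetical stand-in, and PassingData is assumed importable from this codebase):

    interval = IntervalData(chromosome='Chr1', start=1, stop=2000000)  #hypothetical class name
    print interval.chr            #mirrored from chromosome: 'Chr1'
    print interval.subIntervalLs  #[(1, 2000000)]: overlapStart/overlapStop default to start/stop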
Example #2
    def __init__(self, **keywords):
        self.isPhased = None
        self.ploidy = None
        self.locusIDList = []
        self.haplotypeList = []
        self.locusPositionList = []
        PassingData.__init__(self, **keywords)
Example #3
    def addIndividual(self, name=None, family_id=None, father_name=None, \
        mother_name=None, sex=None, phenotype=None, \
        populationName=None, speciesName=None, ploidy=None):
        """
        2013.3.8
        """
        if name:
            population_id = None
            species_id = None
            if speciesName:
                species = self.getSpecies(name=speciesName, ploidy=ploidy)
                if species:
                    species_id = species.id
            if populationName:
                population = self.getPopulation(name=populationName,
                                                speciesName=speciesName)
                if population:
                    population_id = population.id
            oneCell = PassingData(name=name, family_id=family_id, father_name=father_name, \
                mother_name=mother_name, sex=sex, phenotype=phenotype, \
                population_id=population_id)
            self.individualTable.writeOneCell(oneCell, cellType=2)
            self.flush()
            if name in self._individualName2ID:
                raise ValueError("Error: individual %s is not unique, already in _individualName2ID with id=%s." %
                    (name, self._individualName2ID.get(name)))
            else:
                self._individualName2ID[
                    name] = self.individualTable.no_of_rows  #nrows is not updated until flush()

        return self.checkIndividual(
            name=name)  #would this work without flush()?
Example #4
    def writeChrStartStopTupleList2LocusTable(self, chr_start_stop_list=None, chromosomeLength=None, \
            speciesName=None, ploidy=None):
        """
        2013.3.7
            #. establish _locus_index2id, to be used in writeIndividualName2PolymorphismData()
            #. make sure chr_start_stop_list is in the same order as the haplotype in writeIndividualName2PolymorphismData()
        """
        sys.stderr.write(
            "Writing a %s-element list of (chr, start, stop) out ..." %
            (len(chr_start_stop_list)))
        chr_start_stop_list.sort()  #make sure it's sorted
        if ploidy is None:
            ploidy = self.ploidy
        for i in xrange(len(chr_start_stop_list)):
            chromosomeName, start, stop = chr_start_stop_list[i][:3]
            if chromosomeName:
                chromosomeEntry = self.getChromosome(name=chromosomeName, length=chromosomeLength, \
                    speciesName=speciesName, ploidy=ploidy)
            else:
                chromosomeEntry = None
            name = '%s_%s_%s' % (chromosomeName, start, stop)
            oneCell = PassingData(name=name,
                                  chromosome_id=getattr(
                                      chromosomeEntry, 'id', None),
                                  start=start,
                                  stop=stop)
            self.locusTable.writeOneCell(oneCell, cellType=2)
            self._locus_index2id[i] = self.locusTable.no_of_rows
        sys.stderr.write("%s loci \n")
        return self._locus_index2id
Example #5
    def writeIndividualName2PolymorphismData(self, individualName2polymorphismData=None, \
            locus_index2id=None, speciesName=None, ploidy=None):
        """
        2013.3.7
            if locus_index2id is not available, raise an error
        """
        sys.stderr.write("Writing individualName2polymorphismData (%s individuals) out ..." %
            (len(individualName2polymorphismData)))
        if locus_index2id is None:
            locus_index2id = self._locus_index2id
        counter = 0
        for individualName, polymorphismData in individualName2polymorphismData.iteritems():
            individual_id = self.getIndividual(individualName,
                                               speciesName=speciesName,
                                               ploidy=ploidy).id
            for i in xrange(len(polymorphismData.haplotypeList)):
                haplotype = polymorphismData.haplotypeList[i]
                for j in xrange(len(haplotype)):
                    locus_id = locus_index2id.get(j)
                    if locus_id is None:
                        raise ValueError(
                            "Error: no locus_id for locus index %s." % (j))
                    oneCell = PassingData(individual_id=individual_id, locus_id=locus_id,\
                         chromosome_copy=i, allele_sequence=haplotype[j],\
                         allele_sequence_length=len(haplotype[j]), allele_type=1)
                    self.polymorphismTable.writeOneCell(oneCell, cellType=2)
                    counter += 1
        sys.stderr.write(" %s alleles outputted.\n" % (counter))
Example #6
def castPyTablesRowIntoPassingData(rowPointer=None):
    """
	2012.12.21 rowPointer from a PyTables iteration is like a C pointer to the current row (no real content).
		It needs to be converted to a real object if you want to store its content in memory and use it later.
	
	rowPointer has these methods: 
		['__class__', '__contains__', '__delattr__', '__delitem__', '__doc__', '__format__', '__getattribute__', 
		'__getitem__', '__hash__', '__init__', '__iter__', '__new__', '__next__', '__pyx_vtable__', '__reduce__', 
		'__reduce_ex__', '__repr__', '__setattr__', '__setitem__', '__sizeof__', '__str__', '__subclasshook__', 
		'_fillCol', '_flushBufferedRows', '_flushModRows', '_getUnsavedNrows', '_iter', 'append', 'fetch_all_fields', 
		'next', 'nrow', 'table', 'update']
		
		However, errors like the following crop up when copying these rowPointers.

  File "/usr/lib/python2.7/copy_reg.py", line 93, in __newobj__
    return cls.__new__(cls, *args)
  File "tableExtension.pyx", line 706, in tables.tableExtension.Row.__cinit__ (tables/tableExtension.c:6910)
TypeError: __cinit__() takes exactly 1 positional argument (0 given)

	2012.12.21 could not use PassingDataList. because of these errors.

  File "/usr/lib/python2.7/copy.py", line 257, in _deepcopy_dict
    y[deepcopy(key, memo)] = deepcopy(value, memo)
  File "/usr/lib/python2.7/copy.py", line 182, in deepcopy
    rv = reductor(2)
TypeError: 'NoneType' object is not callable

	"""
    pdata = PassingData()
    for colname in rowPointer.table.colnames:
        setattr(pdata, colname, rowPointer[colname])
    return pdata
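
A short usage sketch (PyTables 2.x API; the file path and table node below are illustrative):

    import tables
    h5file = tables.openFile('/tmp/data.h5', mode='r')  #openFile is the PyTables 2.x spelling
    rowList = []
    for rowPointer in h5file.root.speciesTable:
        #each rowPointer is re-used by PyTables, so copy it into a real object before keeping it
        rowList.append(castPyTablesRowIntoPassingData(rowPointer))
    h5file.close()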
Example #7
    def traverseBamByRead(self, processor=None):
        """
		2011-7-10
			add samfile to param_obj
		2011-2-8
			a traverser used by other functions
		"""
        self.seek(0)
        it = self.fetch()
        counter = 0
        real_counter = 0
        qname2count = {}
        param_obj = PassingData(real_counter=real_counter,
                                counter=counter,
                                qname2count=qname2count,
                                samfile=self)
        for read in it:
            param_obj.counter += 1  #increment on param_obj; the local counter never reached the progress report
            exitCode = processor.run(read, param_obj=param_obj)

            if param_obj.counter % 10000 == 0:
                sys.stderr.write(
                    "%s\t%s\t\t%s" %
                    ('\x08' * 80, param_obj.counter, param_obj.real_counter))
            if exitCode:  #2011-7-8
                break
        processor.qname2count = param_obj.qname2count  #2011-2-9 pass it to the processor
        max_redundant_read_count = max(param_obj.qname2count.values() or [0])  #avoid max() on an empty dict
        sys.stderr.write("\n %s unique reads among %s mapped reads, max redundant read count=%s. Done.\n"%\
            (len(param_obj.qname2count), param_obj.real_counter, max_redundant_read_count))
Example #8
    def addSpecies(self, name=None, scientific_name=None, ploidy=None):
        """
		2013.3.8
		"""
        if name:
            oneCell = PassingData(name=name,
                                  scientific_name=scientific_name,
                                  ploidy=ploidy)
            self.speciesTable.writeOneCell(oneCell, cellType=2)
            self.flush()
        return self.checkSpecies(name=name)  #would this work without flush()?
Example #9
	def reduce(self, **keywords):
		"""
		2012.10.15
			run after all files have been walked through
		"""
		#sample the data
		probabilityMassContainer = DiscreteProbabilityMassContainer(object2proabilityMassDict=self.individualID2probabilityMass)
				
		noOfTotalRows = len(self.individualID2probabilityMass)
		genotypeSampleID2IBDSampleID = self.mapSampleIDToIDInIBDFile(genotypeSampleIDList=self.individualID2probabilityMass.keys(), \
															ibdFileSampleIDList=self.ibdData.row_id_ls)
		counter = 0
		real_counter = 0
		if self.sampleSize<noOfTotalRows:
			if self.ibdData:
				#complicated sampling starts here
				#
				sampledSetSizeHistoryData = PassingData(historyList=[], sumOfAbsStepDifference=0, \
													noOfLastRounds=20)
					#a gauge of whether sampledIndividualIDSet has stopped growing
				sampledIndividualIDSet = set()
				while len(sampledIndividualIDSet)<self.sampleSize and \
						self.detectSampledSetSizeHistoryChangeInLastRounds(sampledSetSizeHistoryData=sampledSetSizeHistoryData):
					sampledIndividualID = probabilityMassContainer.sampleObject()
					counter += 1
					if sampledIndividualID:
						includeInTheSampling = True
						for alreadySampledIndividualID in sampledIndividualIDSet:	#not too close to anyone previously sampled
							#getting the relatedness
							relatedness = self.ibdData.getCellDataGivenRowColID(genotypeSampleID2IBDSampleID.get(sampledIndividualID), \
																			genotypeSampleID2IBDSampleID.get(alreadySampledIndividualID))
							if relatedness is not None and relatedness>=self.maxPairwiseKinship:
								includeInTheSampling = False
						if includeInTheSampling:
							sampledIndividualIDSet.add(sampledIndividualID)
					sampledSetSizeHistoryData.historyList.append(len(sampledIndividualIDSet))
				#turn into list
				sampledIndividualIDList = list(sampledIndividualIDSet)
			else:
				sampledIndividualIDList = random.sample(self.individualID2probabilityMass.keys(), self.sampleSize)
		else:	#take all
			sampledIndividualIDList = self.individualID2probabilityMass.keys()
		
		#output the sampled individuals
		for individualID in sampledIndividualIDList:
			self.writer.writerow([individualID])
			real_counter += 1
		
		fraction = float(real_counter)/float(noOfTotalRows)
		sys.stderr.write("%s/%s (%.3f) selected out of %s samplings.\n"%(real_counter, noOfTotalRows, fraction, counter))
		
		#close the self.invariantPData.writer and self.writer
		AbstractMatrixFileWalker.reduce(self, **keywords)
Example #10
def castPyTablesEntryIntoPassingData(entry=None):
    """
	2013.3.11 entry is one element (row) of the array that is returned from a readWhere() query.
		The array is a numpy data structure: array([(1L, '1', '', 2)], 
			dtype=[('id', '<u8'), ('name', '|S512'), ('scientific_name', '|S512'), ('ploidy', '<u2')])
		
	"""
    pdata = PassingData()
    for i in xrange(len(entry.dtype.names)):
        colname = entry.dtype.names[i]
        setattr(pdata, colname, entry[i])
    return pdata
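
A usage sketch pairing this with a PyTables 2.x readWhere() query (the table and condition below are illustrative):

    entryList = speciesTable.readWhere('name=="human"')  #returns a numpy record array
    if len(entryList) > 0:
        species = castPyTablesEntryIntoPassingData(entryList[0])
        print species.id, species.name, species.ploidy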
Example #11
    def next(self):
        try:
            row = self.csvFile.next()
        except Exception:  #any failure (including StopIteration) ends the iteration
            raise StopIteration
        if not self.isRealCSV:
            row = row.strip().split()
        markerID, alleleA, alleleB = row[0:3]
        return PassingData(markerID=markerID,
                           alleleA=alleleA,
                           alleleB=alleleB,
                           genotypeLikelihoodList=row[3:])
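
Because next() returns one PassingData per marker, a reader built around it can be consumed as an iterator (the reader class name and input file below are hypothetical; the class is assumed to define __iter__ returning self):

    reader = GenotypeLikelihoodFile(inputFname='input.bgl')  #hypothetical reader exposing the next() above
    for pdata in reader:
        print pdata.markerID, pdata.alleleA, pdata.alleleB, len(pdata.genotypeLikelihoodList)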
Example #12
def getAssociationLandscapeDataFromHDF5File(inputFname=None, associationTableName='association', \
        landscapeTableName='landscape', min_MAF=0.1):
    """
	2012.11.20
		input is in HDF5MatrixFile format (which is output of variation/src/association_peak/DefineAssociationLandscape.py)
		contains two hdf5 groups. one is by associationTableName. the other is by landscapeTableName.
	"""
    pdata = PassingData(min_MAF=min_MAF)
    genome_wide_result = getGenomeWideResultFromHDF5MatrixFile(inputFname=inputFname, \
        min_value_cutoff=None, do_log10_transformation=False, pdata=pdata, \
        construct_chr_pos2index=False, construct_data_obj_id2index=False, \
        construct_locus_db_id2index=True, \
        report=True, tableName=associationTableName)

    returnData = PassingData(genome_wide_result=genome_wide_result)

    sys.stderr.write("Reading landscape from %s ..." % (inputFname))
    current_obj = None
    bridge_ls = []
    locusLandscapeNeighborGraph = nx.Graph()
    reader = HDF5MatrixFile(inputFname, openMode='r')
    landscapeTableObject = reader.getTableObject(tableName=landscapeTableName)
    returnData.HDF5AttributeNameLs = []
    for attributeName, value in landscapeTableObject.getAttributes().iteritems():
        returnData.HDF5AttributeNameLs.append(attributeName)
        setattr(returnData, attributeName, value)

    for row in landscapeTableObject:
        if row.start_locus_id == 0:  #empty data. happens when inputFname contains no valid landscape, but one default null data point.
            continue
        start_locus_id = row.start_locus_id
        stop_locus_id = row.stop_locus_id
        no_of_loci = row.no_of_loci
        deltaX = row.deltaX

        start_obj = genome_wide_result.get_data_obj_by_locus_db_id(
            start_locus_id)
        stop_obj = genome_wide_result.get_data_obj_by_locus_db_id(
            stop_locus_id)

        bridge_ls.append([start_obj, stop_obj, no_of_loci, deltaX])

        source_index = start_obj.index
        #genome_wide_result.get_data_obj_index_by_locus_db_id(start_locus_id)
        target_index = stop_obj.index

        locusLandscapeNeighborGraph.add_edge(source_index, target_index, \
            weight=None, no_of_loci=no_of_loci, deltaX=deltaX)

    del reader
    sys.stderr.write("%s bridges.\n" % (len(bridge_ls)))
    returnData.bridge_ls = bridge_ls
    returnData.locusLandscapeNeighborGraph = locusLandscapeNeighborGraph
    return returnData
Example #13
    def writeRecombinationEvents(self,
                                 parentName=None,
                                 childName=None,
                                 recombinationLocationList=None):
        """
		2013.3.7
		"""
        parent_id = self.getIndividual(parentName).id
        child_id = self.getIndividual(childName).id
        for position in recombinationLocationList:
            oneCell = PassingData(parent_id=parent_id,
                                  child_id=child_id,
                                  position=position)
            self.recombinationTable.writeOneCell(oneCell, cellType=2)
Example #14
    def addPopulation(self, name=None, size=None, speciesName=None):
        """
		2013.3.8
		"""
        if name:
            species_id = None
            if speciesName:
                species = self.getSpecies(name=speciesName)
                if species:
                    species_id = species.id
            oneCell = PassingData(name=name, size=size, species_id=species_id)
            self.populationTable.writeOneCell(oneCell, cellType=2)
            self.flush()
        return self.checkPopulation(
            name=name)  #would this work without flush()?
Example #15
    def addPolymorphism(self, name=None, individualName=None, locusName=None, chromosome_copy=None, \
        allele_sequence=None, allele_sequence_length=None, allele_type=None, **keywords):
        """
		2013.3.10
		"""
        if name:
            individual_id = self.getIndividual(name=individualName).id
            locus_id = self.getLocus(name=locusName).id
            oneCell = PassingData(name=name, individual_id=individual_id, locus_id=locus_id, \
                chromosome_copy=chromosome_copy, \
                allele_sequence=allele_sequence, allele_sequence_length=allele_sequence_length, \
                allele_type=allele_type, **keywords)
            self.polymorphismTable.writeOneCell(oneCell, cellType=2)
            self.flush()
        return self.checkPolymorphism(
            name=name)  #would this work without flush()?
Example #16
    def _readInData(self,
                    tableName=None,
                    tableObject=None,
                    do_log10_transformation=None):
        """
		"""
        YHFile._readInData(self, tableName=tableName, tableObject=tableObject)

        if tableName is None:
            tableName = self.tableName
        if do_log10_transformation is None:
            do_log10_transformation = getattr(self, 'do_log10_transformation',
                                              False)
        pdata = PassingData(min_MAF=self.min_MAF)
        self.genome_wide_result = getGenomeWideResultFromHDF5MatrixFile(reader=self, tableName=tableName, tableObject=tableObject, \
            min_value_cutoff=None, do_log10_transformation=do_log10_transformation, pdata=pdata, \
            construct_chr_pos2index=False, construct_data_obj_id2index=False, \
            construct_locus_db_id2index=True, \
            report=True)
        return self.genome_wide_result
Example #17
def parseChrStartStopFromFilename(filename=None,
                                  chr2size=None,
                                  defaultChromosomeSize=10000000000):
    """
	2013.09.18
		10000000000 is the default used when the filename contains data from a whole chromosome and chr2size is unavailable or does not contain the chromosome.
		It is made very big so that it can be intersected with any interval from any chromosome.
	"""
    searchResult = chr_start_stop_pattern.search(filename)
    if searchResult:
        chromosome = searchResult.group(1)
        start = int(searchResult.group(2))
        stop = int(searchResult.group(3))
    else:  #no match; fall back to parsing the chromosome name from the filename
        chromosome = getChrFromFname(filename=filename)
        start = 1
        if chr2size is not None:
            stop = chr2size.get(chromosome, defaultChromosomeSize)
        else:
            stop = defaultChromosomeSize
    return PassingData(chromosome=chromosome, start=start, stop=stop)
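
A quick usage sketch (the exact filename layout matched by chr_start_stop_pattern is assumed to embed chromosome, start, and stop, as in the hypothetical name below):

    pdata = parseChrStartStopFromFilename(filename='sample_Chr1_1_2000000.vcf.gz',
        chr2size={'Chr1': 30000000})
    print pdata.chromosome, pdata.start, pdata.stop  #Chr1 1 2000000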
Example #18
    def getNoOfFamiliesAndKidsGivenParentSetSize(self,
                                                 noOfParents2FamilyData=None,
                                                 parentSetSize=2):
        """
		2013.07.19
		"""
        familyData = noOfParents2FamilyData.get(parentSetSize, None)

        if familyData:
            noOfFamilies = len(familyData.parentTupleSet)
            noOfParents = len(familyData.parentIDSet)
            noOfKids = len(familyData.childIDSet)
            noOfIndividuals = len(familyData.individualIDSet)
        else:
            noOfFamilies = 0
            noOfParents = 0
            noOfKids = 0
            noOfIndividuals = 0
        return PassingData(noOfFamilies=noOfFamilies,
                           noOfParents=noOfParents,
                           noOfKids=noOfKids,
                           noOfIndividuals=noOfIndividuals)
Example #19
    def addChromosome(self,
                      name=None,
                      length=None,
                      speciesName=None,
                      ploidy=None,
                      path=None):
        """
		2013.3.8
		"""
        if name:
            species_id = None
            speciesEntry = None
            if speciesName:
                speciesEntry = self.getSpecies(name=speciesName, ploidy=ploidy)
            if speciesEntry:
                species_id = speciesEntry.id
            oneCell = PassingData(name=name,
                                  length=length,
                                  species_id=species_id,
                                  path=path)
            self.chromosomeTable.writeOneCell(oneCell, cellType=2)
            self.flush()
        return self.checkChromosome(
            name=name)  #would this work without flush()?
Example #20
    def addLocus(self, name=None, chromosomeName=None, \
        start=None, stop=None, ref_allele=None, ref_allele_length=None, \
        ref_allele_frequency=None, alt_allele=None, alt_allele_length=None, \
        alt_allele_frequency=None, generation_mutation_arose=None, generation_mutation_fixed=None, \
        mutation_type=None, fitness=None, ancestral_amino_acid=None, \
        derived_amino_acid=None, **keywords):
        """
		2013.3.8
		"""
        if name:
            chromosome_id = None
            if chromosomeName:
                chromosome_id = self.getChromosome(name=chromosomeName).id
            oneCell = PassingData(name=name, chromosome_id=chromosome_id, start=start, stop=stop, \
                ref_allele=ref_allele, ref_allele_length=ref_allele_length, \
                ref_allele_frequency=ref_allele_frequency, alt_allele=alt_allele, alt_allele_length=alt_allele_length, \
                alt_allele_frequency=alt_allele_frequency, generation_mutation_arose=generation_mutation_arose, \
                generation_mutation_fixed=generation_mutation_fixed, \
                mutation_type=mutation_type, fitness=fitness, ancestral_amino_acid=ancestral_amino_acid, \
                derived_amino_acid=derived_amino_acid, **keywords)
            self.locusTable.writeOneCell(oneCell, cellType=2)
            self.flush()
        return self.checkLocus(name=name)  #would this work without flush()?
Example #21
def parseOneVCFRow(row, col_name2index, col_index_individual_name_ls, sample_id2index, minDepth=1,\
				dataEntryType=1):
	"""
	2014.01.08 fix a bug that skips calls and shortens data_row. 
	2012.9.6 turn pos into integer
	2012.5.10
		complete representation of one locus
	2012.1.17
		common snippet split out of VCFFile & VCFRecord
		row is a list of input columns from one VCF file line
		dataEntryType
			1: each cell is base call
			2: each cell is a dictionary {'GT': base-call, 'DP': depth}
	"""
	chromosome = row[0]
	pos = int(row[1])	#2012.9.6 turn pos into integer
	vcf_locus_id = row[2]
	quality = row[5]
	filter = row[6]
	info = row[7]
	format = row[8]
	info_ls = info.split(';')
	info_tag2value = {}
	for info_entry in info_ls:
		try:
			tag, value = info_entry.split('=')
		except:
			#sys.stderr.write("Error in splitting %s by =.\n"%info)	###Error in splitting DS by =.
			continue
		info_tag2value[tag] = value
	
	locus_id = (chromosome, pos)
	refBase = row[col_name2index['REF']]
	altBase = row[col_name2index['ALT']]
	
	altBaseLs = altBase.split(',')	#altBase could be just "C" or "C,G" (multi-nucleotide)
	alleleLs = [refBase] + altBaseLs
	alleleNumber2Base = {'.':'NA'}
	for i in xrange(len(alleleLs)):
		alleleNumber2Base[repr(i)] = alleleLs[i]
	
	format_column = row[col_name2index['FORMAT']]
	format_column_ls = format_column.split(':')
	format_column_name2index = getColName2IndexFromHeader(format_column_ls)
		
	if dataEntryType==1:
		data_row = ['NA']*(len(col_index_individual_name_ls)+1)	# extra 1 for the ref
		data_row[0] = refBase
	else:
		data_row = [None]*(len(col_index_individual_name_ls)+1)	# extra 1 for the ref
		data_row[0] = {'GT':refBase, 'DP':-1}
	genotypeCall2Count = {}
	for individual_col_index, individual_name in col_index_individual_name_ls:
		if individual_name not in sample_id2index:
			sample_id2index[individual_name] = len(sample_id2index)
		
		#coverage = read_group2coverage[individual_name]
		genotype_data = row[individual_col_index]
		genotype_data_ls = genotype_data.split(':')
		genotype_call_index = format_column_name2index.get('GT')
		genotype_quality_index = format_column_name2index.get('GQ')
		if genotype_quality_index is None:
			genotype_quality_index = format_column_name2index.get('DP')
		depth_index = format_column_name2index.get("DP")
		#GL_index = format_column_name2index.get('GL')
		genotypeCallInBase = 'NA'
		if genotype_call_index is not None and len(genotype_data_ls)>0:
			if len(genotype_data_ls)>genotype_call_index:
				genotype_call = genotype_data_ls[genotype_call_index]
			else:
				genotype_call = './.'	#genotype call is probably empty ("./.") due to no reads
			callData = {}
			if genotype_call!='./.' and genotype_call!='.' and genotype_call!='.|.':	#missing data
				patternSearchResult = diploidGenotypePattern.search(genotype_call)
				if patternSearchResult:
					allele1 = alleleNumber2Base[patternSearchResult.group(1)]
					allele2 = alleleNumber2Base[patternSearchResult.group(2)]
					if allele1!='N' and allele2!='N':
						genotypeCallInBase = '%s%s'%(allele1, allele2)
				if depth_index is not None:
					if len(genotype_data_ls)>depth_index:
						depth = genotype_data_ls[depth_index]
					else:
						depth = '.'	#missing DP
					if depth=='.':	#this means depth=0
						depth = 0
					else:
						depth = int(depth)
					if minDepth>0 and depth<minDepth:	#no read. samtools would still assign ref/ref to this individual
						genotypeCallInBase = 'NA'	#set it to missing
					#if depth>maxNoOfReads*coverage or depth<minNoOfReads*coverage:	#2011-3-29 skip. coverage too high or too low
					#	continue
					callData['DP'] = depth

		"""
		if genotype_call=='0/1' or genotype_call =='1/0':	#heterozygous, the latter notation is never used though.
			allele = '%s%s'%(refBase, altBase)
			GL_list = genotype_data_ls[GL_index]
			GL_list = GL_list.split(',')
			GL_list = map(float, GL_list)
			GL = GL_list[1]
			sndHighestGL = max([GL_list[0], GL_list[2]])
			deltaGL = GL-sndHighestGL
			
			AD = genotype_data_ls[format_column_name2index.get('AD')]
			AD = map(int, AD.split(','))
			minorAlleleCoverage = min(AD)
			majorAlleleCoverage = max(AD)
			
			if minorAlleleCoverage<=minorAlleleDepthUpperBoundCoeff*coverage and \
					minorAlleleCoverage>=minorAlleleDepthLowerBoundCoeff*coverage and \
					majorAlleleCoverage<=majorAlleleDepthUpperBoundCoeff*coverage:
				DP4_ratio = float(AD[0])/AD[1]
				allele = '%s%s'%(refBase, altBase)

		elif genotype_call=='./.' or genotype_call=='.|.':	#missing
			allele = 'NA'
		elif genotype_call =='1/1' or genotype_call =='1|1':
			allele = '%s%s'%(altBase, altBase)
		elif genotype_call =='0/0' or genotype_call=='0|0':
			allele = '%s%s'%(refBase, refBase)
		"""
		col_index = sample_id2index.get(individual_name)
		if dataEntryType==1:
			data_row[col_index] = genotypeCallInBase
		else:
			callData['GT'] = genotypeCallInBase
			data_row[col_index] = callData
		if genotypeCallInBase!='NA':
			if genotypeCallInBase not in genotypeCall2Count:
				genotypeCall2Count[genotypeCallInBase] = 0
			genotypeCall2Count[genotypeCallInBase] += 1
	return PassingData(chr=chromosome, chromosome=chromosome, pos=pos, position=pos, locus_id=locus_id, quality=quality, \
					info_tag2value=info_tag2value, \
					refBase=refBase, altBase=altBase, \
					alleleLs=alleleLs, alleleNumber2Base=alleleNumber2Base, genotypeCall2Count=genotypeCall2Count, data_row=data_row,\
					info=info, format=format, filter=filter, vcf_locus_id=vcf_locus_id, \
					format_column_name2index=format_column_name2index, format_column_ls=format_column_ls)
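
A usage sketch on a single tab-split VCF data line (the header-derived index maps below are illustrative; col_index_individual_name_ls pairs each sample's column index with its sample ID, and index 0 of data_row is reserved for the reference):

    line = 'Chr1\t100\t.\tA\tG\t50\tPASS\tDP=10\tGT:DP\t0/1:8\t1/1:9'
    row = line.split('\t')
    col_name2index = {'REF': 3, 'ALT': 4, 'FORMAT': 8}
    col_index_individual_name_ls = [(9, 'sample1'), (10, 'sample2')]
    sample_id2index = {'ref': 0}
    vcfRowData = parseOneVCFRow(row, col_name2index, col_index_individual_name_ls,
        sample_id2index, minDepth=1, dataEntryType=1)
    print vcfRowData.locus_id, vcfRowData.data_row  #('Chr1', 100) ['A', 'AG', 'GG']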
Example #22
    def calLD(cls,
              locus1_allele_ls,
              locus2_allele_ls,
              locus1_id=None,
              locus2_id=None):
        """
		2010-9-30
			copied from pymodule/SNP.py.
			locus1_allele_ls, locus2_allele_ls should be bi-allelic.
			If locus1_allele_ls and locus2_allele_ls are of different size, the extra elements are discarded.
		2008-09-05
			adapted from variation.src.misc's LD.calculate_LD class
			only deal with 2-allele loci
			skip if either is NA, or if both are heterozygous (not phased)
		"""
        counter_matrix = numpy.zeros([2, 2])  #only 2 alleles
        snp1_allele2index = {}
        snp2_allele2index = {}
        no_of_individuals = min(len(locus1_allele_ls), len(locus2_allele_ls))
        for k in xrange(no_of_individuals):
            snp1_allele = locus1_allele_ls[k]
            snp2_allele = locus2_allele_ls[k]
            snp1_allele_index = cls.fill_in_snp_allele2index(
                snp1_allele, snp1_allele2index)
            snp2_allele_index = cls.fill_in_snp_allele2index(
                snp2_allele, snp2_allele2index)
            if snp1_allele_index > 1 or snp2_allele_index > 1:  #ignore the 3rd allele
                continue
            counter_matrix[snp1_allele_index, snp2_allele_index] += 1
            #counter_matrix[snp1_allele_index, snp2_allele_index] += 1	#this is to mimic the diploid.
        PA = sum(counter_matrix[0, :])
        Pa = sum(counter_matrix[1, :])
        PB = sum(counter_matrix[:, 0])
        Pb = sum(counter_matrix[:, 1])
        total_num = float(PA + Pa)
        try:
            PA = PA / total_num
            Pa = Pa / total_num
            PB = PB / total_num
            Pb = Pb / total_num
            PAB = counter_matrix[0, 0] / total_num
            D = PAB - PA * PB
            PAPB = PA * PB
            PAPb = PA * Pb
            PaPB = Pa * PB
            PaPb = Pa * Pb
            Dmin = max(-PAPB, -PaPb)
            Dmax = min(PAPb, PaPB)
            if D < 0:
                D_prime = D / Dmin
            else:
                D_prime = D / Dmax
            r2 = D * D / (PA * Pa * PB * Pb)
        except:  #2008-01-23 usually ZeroDivisionError: Dmin, Dmax, or total_num could be 0
            sys.stderr.write('Unknown exception, ignore: %s\n' %
                             repr(sys.exc_info()[0]))
            return None
        allele_freq = (min(PA, Pa), min(PB, Pb))
        return_data = PassingData()
        return_data.D = D
        return_data.D_prime = D_prime
        return_data.r2 = r2
        return_data.allele_freq = allele_freq
        return_data.snp_pair_ls = (locus1_id, locus2_id)
        return_data.no_of_pairs = total_num
        return return_data
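
A worked usage sketch on two toy bi-allelic loci in perfect association (calLD is a classmethod; the owning class name below is a hypothetical stand-in):

    locus1_allele_ls = ['A', 'A', 'T', 'T', 'A', 'T']
    locus2_allele_ls = ['G', 'G', 'C', 'C', 'G', 'C']
    ldData = SNPUtil.calLD(locus1_allele_ls, locus2_allele_ls,
        locus1_id='snp1', locus2_id='snp2')
    print ldData.D_prime, ldData.r2  #1.0 1.0 for perfectly associated loci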
Example #23
def ltsFit(x_ls, y_ls, fractionUsed=0.6, startX=1, stopX=5):
    """
	2010-6-1
		solve the computing node hang-up (I/O stuck) issue by adding these:
			import ROOT
			try:	# 2010-5-31 old version (5.18.0) doesn't have IgnoreCommandLineOptions.
				ROOT.PyConfig.IgnoreCommandLineOptions = True	#otherwise
				# Warning in <TApplication::GetOptions>: file <output file by -o > has size 0, skipping
			except:
				pass
			try:	# 2010-5-31  disable .StartGuiThread
				ROOT.PyConfig.StartGuiThread = 0
			except:
				pass
	2010-5-30
		return chiSquare as well
	2010-5-21
		use ROOT to do least trimmed square (LTS) fitting:
			fit the y=a+bx with trimming fraction = 1-fractionUsed.
	
	Example:
	
	import numpy
	x_ls = numpy.array(range(100), numpy.float)
	y_ls = x_ls/2.
	for i in range(len(y_ls)):
		import random
		new_y = random.random()-0.5
		y_ls[i] += new_y
	
	# mess up some portion of y
	for i in range(5):
		import random
		new_y = random.random()
		new_y_index = random.sample(range(100),1)
		y_ls[new_y_index[0]] = new_y
	import numpy
	x_ls = numpy.array([ 2.64884758,  3.51235008,  2.83090925,  3.41229248,  3.01451969,\
    2.49899888,  3.69988108,  2.74896216,  3.05307841,  3.75705409,\
    3.08653784,  3.10703993,  3.61071348,  3.21285319,  2.91460752,\
    3.53737831,  3.06333303,  3.35391617,  3.43568516,  3.34429312,\
    3.31576061,  2.8007164 ,  2.73639655,  3.14690256,  3.10174704,\
    2.80888581,  2.72754121,  2.90064001,  3.19270658,  3.50596333,\
    2.61804676,  3.18127131,  3.27542663,  3.09586573], dtype=numpy.float32)	# numpy.float32 is not supported by ROOT
	y_ls = numpy.array([ 2.52827311,  3.27265358,  2.36172366,  2.95760489,  2.50920248,\
    2.3443923 ,  3.23502254,  2.35410833,  2.50582743,  2.48501062,\
    2.82510138,  2.70799541,  2.43136382,  2.76342535,  2.45178652,\
    3.08224201,  2.26481771,  2.7387805 ,  3.23274207,  2.82769203,\
    2.25042009,  2.56702638,  2.4082365 ,  2.44793224,  2.65127802,\
    2.57460976,  2.43136382,  2.39005065,  2.70027065,  3.04452848,\
    2.28555727,  2.71933126,  2.6468935 ,  2.54157925], dtype=numpy.float32)
    
	fit_y_ls = ltsFit(x_ls, y_ls)
	
	import pylab
	pylab.plot(x_ls, y_ls, '.')
	pylab.plot(x_ls, fit_y_ls, '.')
	pylab.legend(['raw data','fitted'])
	pylab.show()
	sys.exit(0)
	
	"""
    import ROOT
    try:  # 2010-5-31 old version (5.18.0) doesn't have IgnoreCommandLineOptions.
        ROOT.PyConfig.IgnoreCommandLineOptions = True  #otherwise
        # Warning in <TApplication::GetOptions>: file <output file by -o > has size 0, skipping
    except:
        pass
    try:  # 2010-5-31  disable .StartGuiThread
        ROOT.PyConfig.StartGuiThread = 0
    except:
        pass

    #ROOT.gROOT.Reset()	# 2010-5-31 don't know what this is for.
    ROOT.gROOT.SetBatch(True)  #avoid interactive mode (drawing canvas, etc.)
    from ROOT import TFormula, TF1, TGraph
    import numpy
    lm = TF1(
        'lm', 'pol1', startX, stopX
    )  #[0]+[1]*x is essentially same as pol1 but option rob in Fit() only works with pol1.
    #ROOT is very dtype-sensitive. numpy.float32 won't work.
    if not (hasattr(x_ls, 'dtype') and x_ls.dtype == numpy.float):
        sys.stderr.write('converting x_ls')
        x_ls = numpy.array(x_ls, dtype=numpy.float)
        sys.stderr.write(".\n")
    if not (hasattr(y_ls, 'dtype') and y_ls.dtype == numpy.float):
        sys.stderr.write('converting y_ls')
        y_ls = numpy.array(y_ls, dtype=numpy.float)
        sys.stderr.write(".\n")
    gr = TGraph(len(x_ls), x_ls, y_ls)
    gr.Fit(lm, "+rob=%s" % fractionUsed)
    fit = gr.GetFunction('lm')
    chiSquare = fit.GetChisquare()
    fit_y_ls = []
    for x in x_ls:
        fit_y_ls.append(fit.Eval(x))
    from utils import PassingData
    return PassingData(fit_y_ls=fit_y_ls, chiSquare=chiSquare)
Example #24
def registerRefFastaFile(workflow=None, refFastaFname=None, registerAffiliateFiles=True, input_site_handler='local', \
        checkAffiliateFileExistence=True, addPicardDictFile=True, \
        affiliateFilenameSuffixLs=['fai', 'amb', 'ann', 'bwt', 'pac', 'sa', 'rbwt', 'rpac', 'rsa', \
            'stidx', 'sthash'], folderName="reference"):
    """
	suffix here doesn't include ".".
	
	2013.08.23 bugfix, check if workflow has a file registered before adding it
	2013.3.26 added refSAMtoolsFastaIndexF, refPicardFastaDictF into returnData
	2013.3.20 deduce needBWARefIndexJob, needSAMtoolsFastaIndexJob, needPicardFastaDictJob, needStampyRefIndexJob from missing suffixes
	2010.10.10 added argument folderName
	2012.5.23
		add an argument "addPicardDictFile" to offer user option to exclude this file (i.e. in registerBlastNucleotideDatabaseFile)
	2012.2.24
		dict is via picard, also required for GATK
		fai is via "samtools faidx" (index reference). also required for GATK
		amb', 'ann', 'bwt', 'pac', 'sa', 'rbwt', 'rpac', 'rsa' are all bwa index.
		stidx is stampy index.
		sthash is stampy hash.
	2012.2.23
		add two suffixes, stidx (stampy index) and sthash (stampy hash)
	2011-11-11
		if needAffiliatedFiles,
			all other files, with suffix in affiliateFilenameSuffixLs, will be registered (symlinked or copied) as well.
	"""
    returnData = PassingData(refFastaFList=[], needBWARefIndexJob=False, needSAMtoolsFastaIndexJob=False, \
        needPicardFastaDictJob=False, needStampyRefIndexJob=False, needBlastMakeDBJob=False, \
        refPicardFastaDictF=None, refSAMtoolsFastaIndexF=None)
    missingSuffixSet = set()  #2013.3.20

    if registerAffiliateFiles:
        refFastaF = File(
            os.path.join(folderName, os.path.basename(refFastaFname))
        )  #use relative path, otherwise, it'll go to absolute path
        # Add it into replica only when needed.
        refFastaF.addPFN(PFN("file://" + refFastaFname, input_site_handler))
        if not workflow.hasFile(refFastaF):  #2013.08.12
            workflow.addFile(refFastaF)
        returnData.refFastaFList.append(refFastaF)
        # If it's not needed, assume the index is done and all relevant files are in absolute path.
        # and no replica transfer

        #add extra affiliated files
        suffix2PathToFileLs = {}
        if addPicardDictFile:  #2012.5.23
            picardDictSuffix = 'dict'
            pathToFile = '%s.%s' % (
                os.path.splitext(refFastaFname)[0], picardDictSuffix
            )  #remove ".fasta" from refFastaFname
            if checkAffiliateFileExistence and not os.path.isfile(pathToFile):
                sys.stderr.write(
                    "Warning: %s does not exist or is not a file; skipping registration.\n"
                    % (pathToFile))
                missingSuffixSet.add(picardDictSuffix)
            else:
                suffix2PathToFileLs[picardDictSuffix] = pathToFile
        for suffix in affiliateFilenameSuffixLs:
            pathToFile = '%s.%s' % (refFastaFname, suffix)
            if checkAffiliateFileExistence and not os.path.isfile(pathToFile):
                sys.stderr.write(
                    "Warning: %s does not exist or is not a file; skipping registration.\n"
                    % (pathToFile))
                missingSuffixSet.add(suffix)
                continue
            suffix2PathToFileLs[suffix] = pathToFile
        for suffix, pathToFile in suffix2PathToFileLs.iteritems():
            if checkAffiliateFileExistence and not os.path.isfile(pathToFile):
                sys.stderr.write(
                    "Warning: %s does not exist or is not a file; skipping registration.\n"
                    % (pathToFile))
                continue
            affiliateF = File(
                os.path.join(folderName, os.path.basename(pathToFile)))
            #use relative path, otherwise, it'll go to absolute path
            affiliateF.addPFN(PFN("file://" + pathToFile, input_site_handler))
            if not workflow.hasFile(affiliateF):  #2013.08.12
                workflow.addFile(affiliateF)
            returnData.refFastaFList.append(affiliateF)

            if suffix == 'dict':  #2013.3.26
                returnData.refPicardFastaDictF = affiliateF
            elif suffix == 'fai':
                returnData.refSAMtoolsFastaIndexF = affiliateF
    else:
        refFastaF = File(
            os.path.join(folderName, os.path.basename(refFastaFname)))
        returnData.refFastaFList.append(refFastaF)
    if 'bwt' in missingSuffixSet or 'pac' in missingSuffixSet:
        returnData.needBWARefIndexJob = True
    if 'fai' in missingSuffixSet:
        returnData.needSAMtoolsFastaIndexJob = True
        returnData.needPicardFastaDictJob = True
    if 'stidx' in missingSuffixSet or 'sthash' in missingSuffixSet:
        returnData.needStampyRefIndexJob = True
    if 'dict' in missingSuffixSet:
        returnData.needPicardFastaDictJob = True
    if 'nin' in missingSuffixSet or 'nhr' in missingSuffixSet or 'nsq' in missingSuffixSet:
        returnData.needBlastMakeDBJob = True
    return returnData
Example #25
	def setup(self, **keywords):
		"""
		2012.10.15
			run before anything is run
		"""
		AbstractMatrixFileWalker.setup(self, **keywords)
		#self.writer = BeagleGenotypeFile(inputFname=self.outputFname, openMode='w')
		
		#read in the IBD check result
		self.ibdData = SNP.readAdjacencyListDataIntoMatrix(inputFname=self.pedigreeKinshipFilePath, \
								rowIDHeader=None, colIDHeader=None, \
								rowIDIndex=0, colIDIndex=1, \
								dataHeader=None, dataIndex=2, hasHeader=False)
		
		#. read in the alignment coverage data
		alignmentCoverageFile = MatrixFile(inputFname=self.individualAlignmentCoverageFname)
		alignmentCoverageFile.constructColName2IndexFromHeader()
		alignmentReadGroup2coverageLs = alignmentCoverageFile.constructDictionary(keyColumnIndexList=[0], valueColumnIndexList=[1])
		alignmentCoverageFile.close()
		
		sys.stderr.write("Reading in all samples from %s VCF input files ... \n"%(len(self.inputFnameLs)))
		# read all the Beagle files
		individualID2HaplotypeData = {}
		for inputFname in self.inputFnameLs:
			vcfFile = VCFFile(inputFname=inputFname)
			#vcfFile.readInAllHaplotypes()
			for individualID in vcfFile.getSampleIDList():
				individualID2HaplotypeData[individualID] = None
				#haplotypeList = vcfFile.getHaplotypeListOfOneSample(individualID)
				#individualID2HaplotypeData[individualID] = PassingData(haplotypeList=haplotypeList,
				#													locusIDList=vcfFile.locusIDList)
			# get all haplotypes , etc.
			# get all sample IDs
		sys.stderr.write("%s individuals total.\n"%(len(individualID2HaplotypeData)))
		
		#. read in the pedigree or deduce it from Beagle Trio/Duo genotype file (columns)
		#. construct individualID2pedigreeContext, context: familySize=1/2/3, familyPosition=1/2 (parent/child)
		sys.stderr.write("Constructing individualID2pedigreeContext ...")
		plinkPedigreeFile = PlinkPedigreeFile(inputFname=self.pedigreeFname)
		pGraph = plinkPedigreeFile.pedigreeGraph
		#shrink the graph to only individuals with data
		pGraph = nx.subgraph(pGraph, individualID2HaplotypeData.keys())
		
		cc_subgraph_list = nx.connected_component_subgraphs(pGraph.to_undirected())
		individualID2familyContext = {}
		outDegreeContainer = NumberContainer(minValue=0)
		familySizeContainer = NumberContainer(minValue=0)
		individualCoverageContainer = NumberContainer(minValue=0)
		familyCoverageContainer = NumberContainer(minValue=0)
		for cc_subgraph in cc_subgraph_list:
			familySize= len(cc_subgraph)
			familySizeContainer.addOneValue(familySize)
			
			familyCoverage = 0
			for n in cc_subgraph:	#assuming each family is a two-generation trio/nuclear family
				individualCoverage = self.getIndividualCoverage(individualID=n, alignmentReadGroup2coverageLs=alignmentReadGroup2coverageLs)
				individualCoverage = float(individualCoverage)
				individualCoverageContainer.addOneValue(individualCoverage)
				familyCoverage += individualCoverage
				in_degree = pGraph.in_degree(n)
				out_degree = pGraph.out_degree(n)
				outDegreeContainer.addOneValue(out_degree)
				familyContext = PassingData(familySize=familySize, in_degree=in_degree, out_degree=out_degree, \
										individualCoverage=individualCoverage,\
										familyCoverage=None)
				if n not in individualID2familyContext:
					individualID2familyContext[n] = familyContext
				else:
					sys.stderr.write("Node %s already in individualID2familyContext.\n"%(n))
			familyCoverageContainer.addOneValue(familyCoverage)
			#set the family coverage for each member, used in weighing the individual. better covered family => better haplotype
			for n in cc_subgraph:
				individualID2familyContext[n].familyCoverage = familyCoverage
		plinkPedigreeFile.close()
		sys.stderr.write("%s individuals.\n"%(len(individualID2familyContext)))
		
		
		# weigh each unique individual based on its sequencing coverage + no of offspring => probability mass for each individual
		sys.stderr.write("Weighing each individual , assigning probability mass  ...")
		individualID2probabilityMass = {}
		for individualID, familyContext in individualID2familyContext.iteritems():
			outDegreeQuotient = outDegreeContainer.normalizeValue(familyContext.out_degree)	#normalize against the out-degree (no. of offspring) distribution the container actually holds
			individualCoverageQuotient = individualCoverageContainer.normalizeValue(familyContext.individualCoverage)
			#familyCoverageQuotient = familyCoverageContainer.normalizeValue(familyContext.familyCoverage)
			importanceScore = outDegreeQuotient + individualCoverageQuotient
			representativeImportanceScore = importanceScore
			individualID2probabilityMass[individualID] = representativeImportanceScore
		sys.stderr.write(" %s IDs with probability mass assigned.\n"%(len(individualID2probabilityMass)))
		
		self.individualID2probabilityMass = individualID2probabilityMass
		self.individualID2HaplotypeData = individualID2HaplotypeData