def __init__(self, chr=None, chromosome=None, chromosomeSize=None, start=None, stop=None,
            overlapStart=None, overlapStop=None, **keywords):
    PassingData.__init__(self, chr=chr, chromosome=chromosome, chromosomeSize=chromosomeSize,
                        start=start, stop=stop, overlapStart=overlapStart, overlapStop=overlapStop,
                        **keywords)
    if not hasattr(self, 'file'):
        self.file = None
    if not hasattr(self, 'jobLs'):
        self.jobLs = []
    # keep .chr and .chromosome in sync, whichever one was supplied
    if self.chr is None and self.chromosome:
        self.chr = self.chromosome
    elif self.chr and self.chromosome is None:
        self.chromosome = self.chr
    # the overlap boundaries default to the interval itself
    if self.overlapStart is None:
        self.overlapStart = self.start
    if self.overlapStop is None:
        self.overlapStop = self.stop
    self.subIntervalLs = []
    self.subIntervalLs.append((self.overlapStart, self.overlapStop))
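# A minimal usage sketch (hypothetical class name; this __init__ belongs to a PassingData
# subclass, here called GenomeInterval for illustration):
#
#    interval = GenomeInterval(chromosome='Contig0', chromosomeSize=2000000, start=500, stop=1500)
#    # .chr is synced from .chromosome; overlap boundaries default to the interval itself:
#    # interval.chr == 'Contig0', interval.overlapStart == 500, interval.overlapStop == 1500
#    # interval.subIntervalLs == [(500, 1500)]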
def __init__(self, **keywords):
    self.isPhased = None
    self.ploidy = None
    self.locusIDList = []
    self.haplotypeList = []
    self.locusPositionList = []
    PassingData.__init__(self, **keywords)
def addIndividual(self, name=None, family_id=None, father_name=None,
                mother_name=None, sex=None, phenotype=None,
                populationName=None, speciesName=None, ploidy=None):
    """
    2013.3.8
    """
    if name:
        population_id = None
        species_id = None
        if speciesName:
            species = self.getSpecies(name=speciesName, ploidy=ploidy)
            if species:
                species_id = species.id
        if populationName:
            population = self.getPopulation(name=populationName, speciesName=speciesName)
            if population:
                population_id = population.id
        oneCell = PassingData(name=name, family_id=family_id, father_name=father_name,
                            mother_name=mother_name, sex=sex, phenotype=phenotype,
                            population_id=population_id)
        self.individualTable.writeOneCell(oneCell, cellType=2)
        self.flush()
        if name in self._individualName2ID:
            sys.stderr.write("Error: individual %s is not unique, already in _individualName2ID with id=%s.\n" %
                            (name, self._individualName2ID.get(name)))
            raise ValueError("individual name %s is not unique" % name)
        else:
            self._individualName2ID[name] = self.individualTable.no_of_rows  # no_of_rows is not updated until flush()
        return self.checkIndividual(name=name)  # would this work without flush()?
def writeChrStartStopTupleList2LocusTable(self, chr_start_stop_list=None, chromosomeLength=None,
                                        speciesName=None, ploidy=None):
    """
    2013.3.7
        #. establish _locus_index2id, to be used in writeIndividualName2PolymorphismData()
        #. make sure chr_start_stop_list is in the same order as the haplotypes
            in writeIndividualName2PolymorphismData()
    """
    sys.stderr.write("Writing a %s-element list of (chr, start, stop) out ..." % (len(chr_start_stop_list)))
    chr_start_stop_list.sort()  # make sure it is sorted
    if ploidy is None:
        ploidy = self.ploidy
    for i in xrange(len(chr_start_stop_list)):
        chromosomeName, start, stop = chr_start_stop_list[i][:3]
        if chromosomeName:
            chromosomeEntry = self.getChromosome(name=chromosomeName, length=chromosomeLength,
                                                speciesName=speciesName, ploidy=ploidy)
        else:
            chromosomeEntry = None
        name = '%s_%s_%s' % (chromosomeName, start, stop)
        oneCell = PassingData(name=name, chromosome_id=getattr(chromosomeEntry, 'id', None),
                            start=start, stop=stop)
        self.locusTable.writeOneCell(oneCell, cellType=2)
        self._locus_index2id[i] = self.locusTable.no_of_rows
    sys.stderr.write("%s loci.\n" % (len(chr_start_stop_list)))
    return self._locus_index2id
def writeIndividualName2PolymorphismData(self, individualName2polymorphismData=None,
                                        locus_index2id=None, speciesName=None, ploidy=None):
    """
    2013.3.7
        raise an error if locus_index2id is not available
    """
    sys.stderr.write("Writing individualName2polymorphismData (%s individuals) out ..." %
                    (len(individualName2polymorphismData)))
    if locus_index2id is None:
        locus_index2id = self._locus_index2id
    counter = 0
    for individualName, polymorphismData in individualName2polymorphismData.iteritems():
        individual_id = self.getIndividual(individualName, speciesName=speciesName, ploidy=ploidy).id
        for i in xrange(len(polymorphismData.haplotypeList)):
            haplotype = polymorphismData.haplotypeList[i]
            for j in xrange(len(haplotype)):
                locus_id = locus_index2id.get(j)
                if locus_id is None:
                    sys.stderr.write("Error: no locus_id for locus index %s.\n" % (j))
                    raise ValueError("no locus_id for locus index %s" % j)
                oneCell = PassingData(individual_id=individual_id, locus_id=locus_id,
                                    chromosome_copy=i, allele_sequence=haplotype[j],
                                    allele_sequence_length=len(haplotype[j]), allele_type=1)
                self.polymorphismTable.writeOneCell(oneCell, cellType=2)
                counter += 1
    sys.stderr.write(" %s alleles outputted.\n" % (counter))
def castPyTablesRowIntoPassingData(rowPointer=None):
    """
    2012.12.21
        rowPointer from PyTables iteration is like a C pointer to the current row (no real content).
        It needs to be converted into a real object if you want to store its content in memory
            and use it later.
        rowPointer has these methods:
            ['__class__', '__contains__', '__delattr__', '__delitem__', '__doc__', '__format__',
            '__getattribute__', '__getitem__', '__hash__', '__init__', '__iter__', '__new__',
            '__next__', '__pyx_vtable__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__',
            '__setitem__', '__sizeof__', '__str__', '__subclasshook__', '_fillCol',
            '_flushBufferedRows', '_flushModRows', '_getUnsavedNrows', '_iter', 'append',
            'fetch_all_fields', 'next', 'nrow', 'table', 'update']
        However, errors like this crop up when copying a rowPointer:
            File "/usr/lib/python2.7/copy_reg.py", line 93, in __newobj__
                return cls.__new__(cls, *args)
            File "tableExtension.pyx", line 706, in tables.tableExtension.Row.__cinit__ (tables/tableExtension.c:6910)
            TypeError: __cinit__() takes exactly 1 positional argument (0 given)
    2012.12.21
        could not use PassingDataList because of these errors:
            File "/usr/lib/python2.7/copy.py", line 257, in _deepcopy_dict
                y[deepcopy(key, memo)] = deepcopy(value, memo)
            File "/usr/lib/python2.7/copy.py", line 182, in deepcopy
                rv = reductor(2)
            TypeError: 'NoneType' object is not callable
    """
    pdata = PassingData()
    for colname in rowPointer.table.colnames:
        setattr(pdata, colname, rowPointer[colname])
    return pdata
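# A minimal usage sketch (hypothetical file/table names; assumes PyTables 2.x is installed
# and "individual" is a Table node in an existing HDF5 file):
#
#    import tables
#    h5file = tables.openFile('/tmp/data.h5', mode='r')    # tables.open_file() in PyTables >= 3.0
#    materializedRows = []
#    for rowPointer in h5file.root.individual:
#        # each rowPointer is reused by PyTables during iteration; cast it before keeping it
#        materializedRows.append(castPyTablesRowIntoPassingData(rowPointer))
#    h5file.close()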
def traverseBamByRead(self, processor=None):
    """
    2011-7-10
        add samfile to param_obj
    2011-2-8
        a traverser used by other functions
    """
    self.seek(0)
    it = self.fetch()
    counter = 0
    real_counter = 0
    qname2count = {}
    param_obj = PassingData(real_counter=real_counter, counter=counter,
                            qname2count=qname2count, samfile=self)
    for read in it:
        counter += 1
        param_obj.counter = counter  # keep the shared object in sync with the local tally
        exitCode = processor.run(read, param_obj=param_obj)
        if counter % 10000 == 0:
            sys.stderr.write("%s\t%s\t\t%s" % ('\x08'*80, param_obj.counter, param_obj.real_counter))
        if exitCode:  # 2011-7-8
            break
    processor.qname2count = param_obj.qname2count  # 2011-2-9 pass it to the processor
    max_redundant_read_count = max(param_obj.qname2count.values())
    sys.stderr.write("\n %s unique reads among %s mapped reads, max redundant read count=%s. Done.\n" %
                    (len(param_obj.qname2count), param_obj.real_counter, max_redundant_read_count))
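# A minimal processor sketch for traverseBamByRead() (hypothetical class; the only contract
# assumed from the code above is a run(read, param_obj=...) method whose truthy return value
# stops the traversal, and which may update param_obj.real_counter / param_obj.qname2count):
class CountReadProcessor(object):
    def run(self, read, param_obj=None):
        # tally how many alignments share one query name (pysam's old AlignedRead API)
        param_obj.qname2count[read.qname] = param_obj.qname2count.get(read.qname, 0) + 1
        if not read.is_unmapped:
            param_obj.real_counter += 1
        return 0    # a non-zero/truthy return would break out of the traversal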
def addSpecies(self, name=None, scientific_name=None, ploidy=None):
    """
    2013.3.8
    """
    if name:
        oneCell = PassingData(name=name, scientific_name=scientific_name, ploidy=ploidy)
        self.speciesTable.writeOneCell(oneCell, cellType=2)
        self.flush()
        return self.checkSpecies(name=name)  # would this work without flush()?
def reduce(self, **keywords):
    """
    2012.10.15
        run after all files have been walked through
    """
    # sample the data
    probabilityMassContainer = DiscreteProbabilityMassContainer(
        object2proabilityMassDict=self.individualID2probabilityMass)
    noOfTotalRows = len(self.individualID2probabilityMass)
    genotypeSampleID2IBDSampleID = self.mapSampleIDToIDInIBDFile(
        genotypeSampleIDList=self.individualID2probabilityMass.keys(),
        ibdFileSampleIDList=self.ibdData.row_id_ls)
    counter = 0
    real_counter = 0
    if self.sampleSize < noOfTotalRows:
        if self.ibdData:
            # complicated sampling starts here
            sampledSetSizeHistoryData = PassingData(historyList=[], sumOfAbsStepDifference=0,
                    noOfLastRounds=20)  # a measure of whether sampledIndividualIDSet has stopped growing
            sampledIndividualIDSet = set()
            while len(sampledIndividualIDSet) < self.sampleSize and \
                    self.detectSampledSetSizeHistoryChangeInLastRounds(sampledSetSizeHistoryData=sampledSetSizeHistoryData):
                sampledIndividualID = probabilityMassContainer.sampleObject()
                counter += 1
                if sampledIndividualID:
                    includeInTheSampling = True
                    for alreadySampledIndividualID in sampledIndividualIDSet:
                        # not too close to anyone previously sampled; get the relatedness
                        relatedness = self.ibdData.getCellDataGivenRowColID(
                            genotypeSampleID2IBDSampleID.get(sampledIndividualID),
                            genotypeSampleID2IBDSampleID.get(alreadySampledIndividualID))
                        if relatedness is not None and relatedness >= self.maxPairwiseKinship:
                            includeInTheSampling = False
                    if includeInTheSampling:
                        sampledIndividualIDSet.add(sampledIndividualID)
                sampledSetSizeHistoryData.historyList.append(len(sampledIndividualIDSet))
            # turn the set into a list
            sampledIndividualIDList = list(sampledIndividualIDSet)
        else:
            sampledIndividualIDList = random.sample(self.individualID2probabilityMass.keys(), self.sampleSize)
    else:  # take all
        sampledIndividualIDList = self.individualID2probabilityMass.keys()
    # output the sampled individuals
    for individualID in sampledIndividualIDList:
        self.writer.writerow([individualID])
        real_counter += 1
    fraction = float(real_counter) / float(noOfTotalRows)
    sys.stderr.write("%s/%s (%.3f) selected out of %s samplings.\n" %
                    (real_counter, noOfTotalRows, fraction, counter))
    # close self.invariantPData.writer and self.writer
    AbstractMatrixFileWalker.reduce(self, **keywords)
def castPyTablesEntryIntoPassingData(entry=None):
    """
    2013.3.11
        entry is one cell of the array returned from a readWhere() query.
        The array is a numpy structured array:
            array([(1L, '1', '', 2)],
                dtype=[('id', '<u8'), ('name', '|S512'), ('scientific_name', '|S512'), ('ploidy', '<u2')])
    """
    pdata = PassingData()
    for i in xrange(len(entry.dtype.names)):
        colname = entry.dtype.names[i]
        setattr(pdata, colname, entry[i])
    return pdata
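# A minimal usage sketch (hypothetical table and condition; readWhere() is the PyTables 2.x
# spelling, read_where() in PyTables >= 3.0):
#
#    entries = speciesTable.readWhere('id == 1')
#    if len(entries) > 0:
#        species = castPyTablesEntryIntoPassingData(entries[0])
#        print species.name, species.ploidy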
def next(self):
    try:
        row = self.csvFile.next()
    except:
        raise StopIteration
    if not self.isRealCSV:
        row = row.strip().split()
    markerID, alleleA, alleleB = row[0:3]
    return PassingData(markerID=markerID, alleleA=alleleA, alleleB=alleleB,
                    genotypeLikelihoodList=row[3:])
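# A minimal iteration sketch (assumes this next() belongs to a Beagle genotype-likelihood
# file reader class, here called BeagleLikelihoodFile for illustration, that supports the
# iterator protocol):
#
#    reader = BeagleLikelihoodFile(inputFname='/tmp/sample.bgl')    # hypothetical path
#    for locus in reader:
#        # locus.genotypeLikelihoodList holds the per-individual likelihood columns
#        print locus.markerID, locus.alleleA, locus.alleleB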
def getAssociationLandscapeDataFromHDF5File(inputFname=None, associationTableName='association',
                                            landscapeTableName='landscape', min_MAF=0.1):
    """
    2012.11.20
        The input is in HDF5MatrixFile format (the output of
            variation/src/association_peak/DefineAssociationLandscape.py) and contains two
            HDF5 groups: one named by associationTableName, the other by landscapeTableName.
    """
    pdata = PassingData(min_MAF=min_MAF)
    genome_wide_result = getGenomeWideResultFromHDF5MatrixFile(inputFname=inputFname,
        min_value_cutoff=None, do_log10_transformation=False, pdata=pdata,
        construct_chr_pos2index=False, construct_data_obj_id2index=False,
        construct_locus_db_id2index=True,
        report=True, tableName=associationTableName)
    returnData = PassingData(genome_wide_result=genome_wide_result)
    sys.stderr.write("Reading landscape from %s ..." % (inputFname))
    current_obj = None
    bridge_ls = []
    locusLandscapeNeighborGraph = nx.Graph()
    reader = HDF5MatrixFile(inputFname, openMode='r')
    landscapeTableObject = reader.getTableObject(tableName=landscapeTableName)
    returnData.HDF5AttributeNameLs = []
    for attributeName, value in landscapeTableObject.getAttributes().iteritems():
        returnData.HDF5AttributeNameLs.append(attributeName)
        setattr(returnData, attributeName, value)
    for row in landscapeTableObject:
        if row.start_locus_id == 0:
            # empty data; happens when inputFname contains no valid landscape, just one default null data point
            continue
        start_locus_id = row.start_locus_id
        stop_locus_id = row.stop_locus_id
        no_of_loci = row.no_of_loci
        deltaX = row.deltaX
        start_obj = genome_wide_result.get_data_obj_by_locus_db_id(start_locus_id)
        stop_obj = genome_wide_result.get_data_obj_by_locus_db_id(stop_locus_id)
        bridge_ls.append([start_obj, stop_obj, no_of_loci, deltaX])
        source_index = start_obj.index  # genome_wide_result.get_data_obj_index_by_locus_db_id(start_locus_id)
        target_index = stop_obj.index
        locusLandscapeNeighborGraph.add_edge(source_index, target_index, weight=None)
        locusLandscapeNeighborGraph[source_index][target_index]['no_of_loci'] = no_of_loci
        locusLandscapeNeighborGraph[source_index][target_index]['deltaX'] = deltaX
    del reader
    sys.stderr.write("%s bridges.\n" % (len(bridge_ls)))
    returnData.bridge_ls = bridge_ls
    returnData.locusLandscapeNeighborGraph = locusLandscapeNeighborGraph
    return returnData
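# A minimal usage sketch (hypothetical input path; table names follow the defaults above):
#
#    landscapeData = getAssociationLandscapeDataFromHDF5File(
#        inputFname='/tmp/association_landscape.h5',
#        associationTableName='association', landscapeTableName='landscape', min_MAF=0.1)
#    print len(landscapeData.bridge_ls), "bridges,", \
#        landscapeData.locusLandscapeNeighborGraph.number_of_edges(), "graph edges"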
def writeRecombinationEvents(self, parentName=None, childName=None, recombinationLocationList=None):
    """
    2013.3.7
    """
    parent_id = self.getIndividual(parentName).id
    child_id = self.getIndividual(childName).id
    for position in recombinationLocationList:
        oneCell = PassingData(parent_id=parent_id, child_id=child_id, position=position)
        self.recombinationTable.writeOneCell(oneCell, cellType=2)
def addPopulation(self, name=None, size=None, speciesName=None):
    """
    2013.3.8
    """
    if name:
        species_id = None
        if speciesName:
            species = self.getSpecies(name=speciesName)
            if species:
                species_id = species.id
        oneCell = PassingData(name=name, size=size, species_id=species_id)
        self.populationTable.writeOneCell(oneCell, cellType=2)
        self.flush()
        return self.checkPopulation(name=name)  # would this work without flush()?
def addPolymorphism(self, name=None, individualName=None, locusName=None, chromosome_copy=None,
                    allele_sequence=None, allele_sequence_length=None, allele_type=None, **keywords):
    """
    2013.3.10
    """
    if name:
        individual_id = self.getIndividual(name=individualName).id
        locus_id = self.getLocus(name=locusName).id
        oneCell = PassingData(name=name, individual_id=individual_id, locus_id=locus_id,
                            chromosome_copy=chromosome_copy,
                            allele_sequence=allele_sequence, allele_sequence_length=allele_sequence_length,
                            allele_type=allele_type, **keywords)
        self.polymorphismTable.writeOneCell(oneCell, cellType=2)
        self.flush()
        return self.checkPolymorphism(name=name)  # would this work without flush()?
def _readInData(self, tableName=None, tableObject=None, do_log10_transformation=None):
    """
    """
    YHFile._readInData(self, tableName=tableName, tableObject=tableObject)
    if tableName is None:
        tableName = self.tableName
    if do_log10_transformation is None:
        do_log10_transformation = getattr(self, 'do_log10_transformation', False)
    pdata = PassingData(min_MAF=self.min_MAF)
    self.genome_wide_result = getGenomeWideResultFromHDF5MatrixFile(reader=self,
        tableName=tableName, tableObject=tableObject,
        min_value_cutoff=None, do_log10_transformation=do_log10_transformation, pdata=pdata,
        construct_chr_pos2index=False, construct_data_obj_id2index=False,
        construct_locus_db_id2index=True,
        report=True)
    return self.genome_wide_result
def parseChrStartStopFromFilename(filename=None, chr2size=None, defaultChromosomeSize=10000000000):
    """
    2013.09.18
        defaultChromosomeSize (10000000000) is used when the filename contains data from a whole
            chromosome and chr2size is unavailable or does not contain that chromosome.
            It is deliberately huge so the result can be intersected with any interval
            from any chromosome.
    """
    searchResult = chr_start_stop_pattern.search(filename)
    if searchResult:
        chromosome = searchResult.group(1)
        start = int(searchResult.group(2))
        stop = int(searchResult.group(3))
    else:
        # fall back to chromosome-only parsing, covering the whole chromosome
        chromosome = getChrFromFname(filename=filename)
        start = 1
        if chr2size is not None:
            stop = chr2size.get(chromosome, defaultChromosomeSize)
        else:
            stop = defaultChromosomeSize
    return PassingData(chromosome=chromosome, start=start, stop=stop)
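# A minimal usage sketch (hypothetical filenames; assumes chr_start_stop_pattern matches
# names embedding chromosome_start_stop, e.g. "Contig0_1_2000000"):
#
#    span = parseChrStartStopFromFilename(filename='folder/Contig0_1_2000000.vcf.gz')
#    # span.chromosome == 'Contig0', span.start == 1, span.stop == 2000000
#    span = parseChrStartStopFromFilename(filename='folder/Contig1.vcf.gz',
#        chr2size={'Contig1': 1500000})
#    # falls back to getChrFromFname(): span.start == 1, span.stop == 1500000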
def getNoOfFamiliesAndKidsGivenParentSetSize(self, noOfParents2FamilyData=None, parentSetSize=2):
    """
    2013.07.19
    """
    familyData = noOfParents2FamilyData.get(parentSetSize, None)
    if familyData:
        noOfFamilies = len(familyData.parentTupleSet)
        noOfParents = len(familyData.parentIDSet)
        noOfKids = len(familyData.childIDSet)
        noOfIndividuals = len(familyData.individualIDSet)
    else:
        noOfFamilies = 0
        noOfParents = 0
        noOfKids = 0
        noOfIndividuals = 0
    return PassingData(noOfFamilies=noOfFamilies, noOfParents=noOfParents, noOfKids=noOfKids,
                    noOfIndividuals=noOfIndividuals)
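# A minimal usage sketch (hypothetical data; noOfParents2FamilyData maps a parent-set size
# to a PassingData with parentTupleSet/parentIDSet/childIDSet/individualIDSet attributes,
# and "instance" stands for whatever object carries this method):
#
#    familyData = PassingData(parentTupleSet=set([('p1', 'p2')]), parentIDSet=set(['p1', 'p2']),
#                            childIDSet=set(['c1', 'c2']),
#                            individualIDSet=set(['p1', 'p2', 'c1', 'c2']))
#    stat = instance.getNoOfFamiliesAndKidsGivenParentSetSize(
#        noOfParents2FamilyData={2: familyData}, parentSetSize=2)
#    # stat.noOfFamilies == 1, stat.noOfKids == 2, stat.noOfIndividuals == 4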
def addChromosome(self, name=None, length=None, speciesName=None, ploidy=None, path=None):
    """
    2013.3.8
    """
    if name:
        species_id = None
        if speciesName:
            speciesEntry = self.getSpecies(name=speciesName, ploidy=ploidy)
            if speciesEntry:
                species_id = speciesEntry.id
        oneCell = PassingData(name=name, length=length, species_id=species_id, path=path)
        self.chromosomeTable.writeOneCell(oneCell, cellType=2)
        self.flush()
        return self.checkChromosome(name=name)  # would this work without flush()?
def addLocus(self, name=None, chromosomeName=None,
            start=None, stop=None, ref_allele=None, ref_allele_length=None,
            ref_allele_frequency=None, alt_allele=None, alt_allele_length=None,
            alt_allele_frequency=None, generation_mutation_arose=None, generation_mutation_fixed=None,
            mutation_type=None, fitness=None, ancestral_amino_acid=None,
            derived_amino_acid=None, **keywords):
    """
    2013.3.8
    """
    if name:
        chromosome_id = None
        if chromosomeName:
            chromosome_id = self.getChromosome(name=chromosomeName).id
        oneCell = PassingData(name=name, chromosome_id=chromosome_id, start=start, stop=stop,
                            ref_allele=ref_allele, ref_allele_length=ref_allele_length,
                            ref_allele_frequency=ref_allele_frequency, alt_allele=alt_allele,
                            alt_allele_length=alt_allele_length,
                            alt_allele_frequency=alt_allele_frequency,
                            generation_mutation_arose=generation_mutation_arose,
                            generation_mutation_fixed=generation_mutation_fixed,
                            mutation_type=mutation_type, fitness=fitness,
                            ancestral_amino_acid=ancestral_amino_acid,
                            derived_amino_acid=derived_amino_acid, **keywords)
        self.locusTable.writeOneCell(oneCell, cellType=2)
        self.flush()
        return self.checkLocus(name=name)  # would this work without flush()?
def parseOneVCFRow(row, col_name2index, col_index_individual_name_ls, sample_id2index, minDepth=1,
                dataEntryType=1):
    """
    2014.01.08 fix a bug that skips calls and shortens data_row.
    2012.9.6 turn pos into an integer
    2012.5.10 complete representation of one locus
    2012.1.17 common snippet split out of VCFFile & VCFRecord
        row is a list of input columns from one VCF file line.
        dataEntryType
            1: each cell is a base call
            2: each cell is a dictionary {'GT': base-call, 'DP': depth}
    """
    chromosome = row[0]
    pos = int(row[1])  # 2012.9.6 turn pos into an integer
    vcf_locus_id = row[2]
    quality = row[5]
    filter = row[6]
    info = row[7]
    format = row[8]
    info_ls = info.split(';')
    info_tag2value = {}
    for info_entry in info_ls:
        try:
            tag, value = info_entry.split('=')
        except:
            # flag-type entries have no '=', e.g. "Error in splitting DS by =."
            continue
        info_tag2value[tag] = value
    locus_id = (chromosome, pos)
    refBase = row[col_name2index['REF']]
    altBase = row[col_name2index['ALT']]
    altBaseLs = altBase.split(',')  # altBase could be just "C" or "C,G" (multi-allelic)
    alleleLs = [refBase] + altBaseLs
    alleleNumber2Base = {'.': 'NA'}
    for i in xrange(len(alleleLs)):
        alleleNumber2Base[repr(i)] = alleleLs[i]
    format_column = row[col_name2index['FORMAT']]
    format_column_ls = format_column.split(':')
    format_column_name2index = getColName2IndexFromHeader(format_column_ls)
    if dataEntryType == 1:
        data_row = ['NA'] * (len(col_index_individual_name_ls) + 1)  # extra 1 for the ref
        data_row[0] = refBase
    else:
        data_row = [None] * (len(col_index_individual_name_ls) + 1)  # extra 1 for the ref
        data_row[0] = {'GT': refBase, 'DP': -1}
    genotypeCall2Count = {}
    for individual_col_index, individual_name in col_index_individual_name_ls:
        if individual_name not in sample_id2index:
            sample_id2index[individual_name] = len(sample_id2index)
        # coverage = read_group2coverage[individual_name]
        genotype_data = row[individual_col_index]
        genotype_data_ls = genotype_data.split(':')
        genotype_call_index = format_column_name2index.get('GT')
        genotype_quality_index = format_column_name2index.get('GQ')
        if genotype_quality_index is None:
            genotype_quality_index = format_column_name2index.get('DP')
        depth_index = format_column_name2index.get("DP")
        # GL_index = format_column_name2index.get('GL')
        genotypeCallInBase = 'NA'
        callData = {}
        if genotype_call_index is not None and len(genotype_data_ls) > 0:
            # a genotype_data_ls shorter than the FORMAT column usually means an empty call
            #   ("./.") due to no reads
            if len(genotype_data_ls) > genotype_call_index:
                genotype_call = genotype_data_ls[genotype_call_index]
            else:
                genotype_call = './.'  # missing
            if genotype_call != './.' and genotype_call != '.' and genotype_call != '.|.':  # not missing
                patternSearchResult = diploidGenotypePattern.search(genotype_call)
                if patternSearchResult:
                    allele1 = alleleNumber2Base[patternSearchResult.group(1)]
                    allele2 = alleleNumber2Base[patternSearchResult.group(2)]
                    if allele1 != 'N' and allele2 != 'N':
                        genotypeCallInBase = '%s%s' % (allele1, allele2)
            if depth_index is not None:
                if len(genotype_data_ls) > depth_index:
                    depth = genotype_data_ls[depth_index]
                else:
                    depth = '.'  # missing DP
                if depth == '.':  # this means depth=0
                    depth = 0
                else:
                    depth = int(depth)
                if minDepth > 0 and depth < minDepth:
                    # no read; samtools would still assign ref/ref to this individual
                    genotypeCallInBase = 'NA'  # set it to missing
                # if depth>maxNoOfReads*coverage or depth<minNoOfReads*coverage:
                #   continue  # 2011-3-29 skip; coverage too high or too low
                callData['DP'] = depth
        """
        if genotype_call=='0/1' or genotype_call =='1/0':  # heterozygous; the latter notation is never used though.
            allele = '%s%s'%(refBase, altBase)
            GL_list = genotype_data_ls[GL_index]
            GL_list = GL_list.split(',')
            GL_list = map(float, GL_list)
            GL = GL_list[1]
            sndHighestGL = max([GL_list[0], GL_list[2]])
            deltaGL = GL-sndHighestGL
            AD = genotype_data_ls[format_column_name2index.get('AD')]
            AD = map(int, AD.split(','))
            minorAlleleCoverage = min(AD)
            majorAlleleCoverage = max(AD)
            if minorAlleleCoverage<=minorAlleleDepthUpperBoundCoeff*coverage and \
                    minorAlleleCoverage>=minorAlleleDepthLowerBoundCoeff*coverage and \
                    majorAlleleCoverage<=majorAlleleDepthUpperBoundCoeff*coverage:
                DP4_ratio = float(AD[0])/AD[1]
                allele = '%s%s'%(refBase, altBase)
        elif genotype_call=='./.' or genotype_call=='.|.':  # missing
            allele = 'NA'
        elif genotype_call =='1/1' or genotype_call =='1|1':
            allele = '%s%s'%(altBase, altBase)
        elif genotype_call =='0/0' or genotype_call=='0|0':
            allele = '%s%s'%(refBase, refBase)
        """
        # 2014.01.08 record the call at loop level so data_row never gets shortened
        col_index = sample_id2index.get(individual_name)
        if dataEntryType == 1:
            data_row[col_index] = genotypeCallInBase
        else:
            callData['GT'] = genotypeCallInBase
            data_row[col_index] = callData
        if genotypeCallInBase != 'NA':
            if genotypeCallInBase not in genotypeCall2Count:
                genotypeCall2Count[genotypeCallInBase] = 0
            genotypeCall2Count[genotypeCallInBase] += 1
    return PassingData(chr=chromosome, chromosome=chromosome, pos=pos, position=pos, locus_id=locus_id,
                    quality=quality, info_tag2value=info_tag2value,
                    refBase=refBase, altBase=altBase,
                    alleleLs=alleleLs, alleleNumber2Base=alleleNumber2Base,
                    genotypeCall2Count=genotypeCall2Count, data_row=data_row,
                    info=info, format=format, filter=filter, vcf_locus_id=vcf_locus_id,
                    format_column_name2index=format_column_name2index, format_column_ls=format_column_ls)
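# A minimal usage sketch (hypothetical single-sample VCF line; assumes
# getColName2IndexFromHeader() and diploidGenotypePattern, e.g. something like
# re.compile(r'([\d.])[/|]([\d.])'), are already in module scope as the function requires):
#
#    header = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'sample1']
#    col_name2index = getColName2IndexFromHeader(header)
#    row = ['Contig0', '100', '.', 'A', 'G', '99', 'PASS', 'DP=20', 'GT:DP', '0/1:15']
#    vcfRowData = parseOneVCFRow(row, col_name2index, [(9, 'sample1')],
#        sample_id2index={'ref': 0},  # index 0 is reserved for the reference column
#        minDepth=1, dataEntryType=1)
#    # vcfRowData.data_row == ['A', 'AG'], vcfRowData.genotypeCall2Count == {'AG': 1}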
def calLD(cls, locus1_allele_ls, locus2_allele_ls, locus1_id=None, locus2_id=None):
    """
    2010-9-30
        copied from pymodule/SNP.py. locus1_allele_ls and locus2_allele_ls should be bi-allelic.
        If they are of different sizes, the extra elements are discarded.
    2008-09-05
        adapted from variation.src.misc's LD.calculate_LD class
        deals only with 2-allele loci
        skip if either is NA, or if both are heterozygous (not phased)
    """
    counter_matrix = numpy.zeros([2, 2])  # only 2 alleles
    snp1_allele2index = {}
    snp2_allele2index = {}
    no_of_individuals = min(len(locus1_allele_ls), len(locus2_allele_ls))
    for k in xrange(no_of_individuals):
        snp1_allele = locus1_allele_ls[k]
        snp2_allele = locus2_allele_ls[k]
        snp1_allele_index = cls.fill_in_snp_allele2index(snp1_allele, snp1_allele2index)
        snp2_allele_index = cls.fill_in_snp_allele2index(snp2_allele, snp2_allele2index)
        if snp1_allele_index > 1 or snp2_allele_index > 1:  # ignore any 3rd allele
            continue
        counter_matrix[snp1_allele_index, snp2_allele_index] += 1
        # counter_matrix[snp1_allele_index, snp2_allele_index] += 1  # this would mimic diploid counting
    PA = sum(counter_matrix[0, :])
    Pa = sum(counter_matrix[1, :])
    PB = sum(counter_matrix[:, 0])
    Pb = sum(counter_matrix[:, 1])
    total_num = float(PA + Pa)
    try:
        PA = PA / total_num
        Pa = Pa / total_num
        PB = PB / total_num
        Pb = Pb / total_num
        PAB = counter_matrix[0, 0] / total_num
        D = PAB - PA * PB
        PAPB = PA * PB
        PAPb = PA * Pb
        PaPB = Pa * PB
        PaPb = Pa * Pb
        Dmin = max(-PAPB, -PaPb)
        Dmax = min(PAPb, PaPB)
        if D < 0:
            D_prime = D / Dmin
        else:
            D_prime = D / Dmax
        r2 = D * D / (PA * Pa * PB * Pb)
    except:  # 2008-01-23 e.g. ZeroDivisionError; Dmin or Dmax could be 0
        sys.stderr.write('Unknown except, ignore: %s\n' % repr(sys.exc_info()[0]))
        return None
    allele_freq = (min(PA, Pa), min(PB, Pb))
    return_data = PassingData()
    return_data.D = D
    return_data.D_prime = D_prime
    return_data.r2 = r2
    return_data.allele_freq = allele_freq
    return_data.snp_pair_ls = (locus1_id, locus2_id)
    return_data.no_of_pairs = total_num
    return return_data
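# A minimal worked example (haploid/phased alleles; assumes fill_in_snp_allele2index()
# assigns index 0 to the first allele seen and 1 to the second, per locus, and SNPClass is
# a hypothetical name for whatever class carries this classmethod):
#
#    locus1 = ['A', 'A', 'A', 'G', 'G', 'G']
#    locus2 = ['C', 'C', 'C', 'T', 'T', 'T']
#    ld = SNPClass.calLD(locus1, locus2, locus1_id=1, locus2_id=2)
#    # perfectly coupled loci: PAB=0.5 and PA=PB=0.5, so D=0.25, D_prime=1.0, r2=1.0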
def ltsFit(x_ls, y_ls, fractionUsed=0.6, startX=1, stopX=5):
    """
    2010-6-1
        solve the computing-node hang-up (I/O stuck) issue by adding these:
            import ROOT
            try:  # 2010-5-31 old version (5.18.0) doesn't have IgnoreCommandLineOptions.
                ROOT.PyConfig.IgnoreCommandLineOptions = True
                # otherwise: Warning in <TApplication::GetOptions>: file <output file by -o> has size 0, skipping
            except:
                pass
            try:  # 2010-5-31 disable .StartGuiThread
                ROOT.PyConfig.StartGuiThread = 0
            except:
                pass
    2010-5-30
        return chiSquare as well
    2010-5-21
        use ROOT to do least trimmed squares (LTS) fitting:
        fit y = a + bx with trimming fraction = 1 - fractionUsed.

    Example:
        import numpy
        x_ls = numpy.array(range(100), numpy.float)
        y_ls = x_ls/2.
        # mess up some portion of y
        for i in range(len(y_ls)):
            import random
            new_y = random.random()-0.5
            y_ls[i] += new_y
        for i in range(5):
            import random
            new_y = random.random()
            new_y_index = random.sample(range(100), 1)
            y_ls[new_y_index[0]] = new_y

        import numpy
        # numpy.float32 is not supported by ROOT
        x_ls = numpy.array([2.64884758, 3.51235008, 2.83090925, 3.41229248, 3.01451969,
            2.49899888, 3.69988108, 2.74896216, 3.05307841, 3.75705409,
            3.08653784, 3.10703993, 3.61071348, 3.21285319, 2.91460752,
            3.53737831, 3.06333303, 3.35391617, 3.43568516, 3.34429312,
            3.31576061, 2.8007164, 2.73639655, 3.14690256, 3.10174704,
            2.80888581, 2.72754121, 2.90064001, 3.19270658, 3.50596333,
            2.61804676, 3.18127131, 3.27542663, 3.09586573], dtype=numpy.float32)
        y_ls = numpy.array([2.52827311, 3.27265358, 2.36172366, 2.95760489, 2.50920248,
            2.3443923, 3.23502254, 2.35410833, 2.50582743, 2.48501062,
            2.82510138, 2.70799541, 2.43136382, 2.76342535, 2.45178652,
            3.08224201, 2.26481771, 2.7387805, 3.23274207, 2.82769203,
            2.25042009, 2.56702638, 2.4082365, 2.44793224, 2.65127802,
            2.57460976, 2.43136382, 2.39005065, 2.70027065, 3.04452848,
            2.28555727, 2.71933126, 2.6468935, 2.54157925], dtype=numpy.float32)

        fit_y_ls = ltsFit(x_ls, y_ls).fit_y_ls  # ltsFit() returns a PassingData

        import pylab
        pylab.plot(x_ls, y_ls, '.')
        pylab.plot(x_ls, fit_y_ls, '.')
        pylab.legend(['raw data', 'fitted'])
        pylab.show()
        sys.exit(0)
    """
    import ROOT
    try:  # 2010-5-31 old version (5.18.0) doesn't have IgnoreCommandLineOptions.
        ROOT.PyConfig.IgnoreCommandLineOptions = True
        # otherwise: Warning in <TApplication::GetOptions>: file <output file by -o> has size 0, skipping
    except:
        pass
    try:  # 2010-5-31 disable .StartGuiThread
        ROOT.PyConfig.StartGuiThread = 0
    except:
        pass
    # ROOT.gROOT.Reset()  # 2010-5-31 don't know what this is for.
    ROOT.gROOT.SetBatch(True)  # avoid interactive mode (drawing canvas etc.)
    from ROOT import TFormula, TF1, TGraph
    import numpy
    lm = TF1('lm', 'pol1', startX, stopX)  # [0]+[1]*x is essentially the same as pol1,
        # but the "rob" option in Fit() only works with pol1.
    # ROOT is very dtype-sensitive. numpy.float32 won't work.
    if hasattr(x_ls, 'dtype') and x_ls.dtype == numpy.float:
        pass
    else:
        sys.stderr.write('converting x_ls')
        x_ls = numpy.array(x_ls, dtype=numpy.float)
        sys.stderr.write(".\n")
    if hasattr(y_ls, 'dtype') and y_ls.dtype == numpy.float:
        pass
    else:
        sys.stderr.write('converting y_ls')
        y_ls = numpy.array(y_ls, dtype=numpy.float)
        sys.stderr.write(".\n")
    gr = TGraph(len(x_ls), x_ls, y_ls)
    gr.Fit(lm, "+rob=%s" % fractionUsed)
    fit = gr.GetFunction('lm')
    chiSquare = fit.GetChisquare()
    fit_y_ls = []
    for x in x_ls:
        fit_y_ls.append(fit.Eval(x))
    from utils import PassingData
    return PassingData(fit_y_ls=fit_y_ls, chiSquare=chiSquare)
def registerRefFastaFile(workflow=None, refFastaFname=None, registerAffiliateFiles=True, input_site_handler='local',
                        checkAffiliateFileExistence=True, addPicardDictFile=True,
                        affiliateFilenameSuffixLs=['fai', 'amb', 'ann', 'bwt', 'pac', 'sa', 'rbwt', 'rpac', 'rsa',
                            'stidx', 'sthash'],
                        folderName="reference"):
    """
    Suffixes here do not include the ".".
    2013.08.23 bugfix: check whether workflow already has a file registered before adding it
    2013.3.26 added refSAMtoolsFastaIndexF, refPicardFastaDictF to returnData
    2013.3.20 deduce needBWARefIndexJob, needSAMtoolsFastaIndexJob, needPicardFastaDictJob,
        needStampyRefIndexJob from the missing suffixes
    2010.10.10 added argument folderName
    2012.5.23 added argument "addPicardDictFile" to let the user exclude this file
        (i.e. in registerBlastNucleotideDatabaseFile)
    2012.2.24
        dict is via picard, also required by GATK
        fai is via "samtools faidx" (reference index), also required by GATK
        'amb', 'ann', 'bwt', 'pac', 'sa', 'rbwt', 'rpac', 'rsa' are all bwa indices
        stidx is the stampy index; sthash is the stampy hash
    2012.2.23 added two suffixes, stidx (stampy index) and sthash (stampy hash)
    2011-11-11
        if registerAffiliateFiles is true, all other files with a suffix in affiliateFilenameSuffixLs
            will be registered (symlinked or copied) as well.
    """
    returnData = PassingData(refFastaFList=[], needBWARefIndexJob=False, needSAMtoolsFastaIndexJob=False,
                            needPicardFastaDictJob=False, needStampyRefIndexJob=False, needBlastMakeDBJob=False,
                            refPicardFastaDictF=None, refSAMtoolsFastaIndexF=None)
    missingSuffixSet = set()  # 2013.3.20
    if registerAffiliateFiles:
        # use a relative path; otherwise it would go to an absolute path
        refFastaF = File(os.path.join(folderName, os.path.basename(refFastaFname)))
        # add it into the replica catalog only when needed
        refFastaF.addPFN(PFN("file://" + refFastaFname, input_site_handler))
        if not workflow.hasFile(refFastaF):  # 2013.08.12
            workflow.addFile(refFastaF)
        returnData.refFastaFList.append(refFastaF)
        # If registration is not needed, assume the index is done, all relevant files are
        #   referenced by absolute path, and no replica transfer happens.

        # add the extra affiliated files
        suffix2PathToFileLs = {}
        if addPicardDictFile:  # 2012.5.23
            picardDictSuffix = 'dict'
            # remove ".fasta" from refFastaFname
            pathToFile = '%s.%s' % (os.path.splitext(refFastaFname)[0], picardDictSuffix)
            if checkAffiliateFileExistence and not os.path.isfile(pathToFile):
                sys.stderr.write("Warning: %s doesn't exist or is not a file. Skip registration.\n" % (pathToFile))
                missingSuffixSet.add(picardDictSuffix)
            else:
                suffix2PathToFileLs[picardDictSuffix] = pathToFile
        for suffix in affiliateFilenameSuffixLs:
            pathToFile = '%s.%s' % (refFastaFname, suffix)
            if checkAffiliateFileExistence and not os.path.isfile(pathToFile):
                sys.stderr.write("Warning: %s doesn't exist or is not a file. Skip registration.\n" % (pathToFile))
                missingSuffixSet.add(suffix)
                continue
            suffix2PathToFileLs[suffix] = pathToFile
        for suffix, pathToFile in suffix2PathToFileLs.iteritems():
            if checkAffiliateFileExistence and not os.path.isfile(pathToFile):
                sys.stderr.write("Warning: %s doesn't exist or is not a file. Skip registration.\n" % (pathToFile))
                continue
            # use a relative path; otherwise it would go to an absolute path
            affiliateF = File(os.path.join(folderName, os.path.basename(pathToFile)))
            affiliateF.addPFN(PFN("file://" + pathToFile, input_site_handler))
            if not workflow.hasFile(affiliateF):  # 2013.08.12
                workflow.addFile(affiliateF)
            returnData.refFastaFList.append(affiliateF)
            if suffix == 'dict':  # 2013.3.26
                returnData.refPicardFastaDictF = affiliateF
            elif suffix == 'fai':
                returnData.refSAMtoolsFastaIndexF = affiliateF
    else:
        refFastaF = File(os.path.join(folderName, os.path.basename(refFastaFname)))
        returnData.refFastaFList.append(refFastaF)
    # 2013.3.20 deduce which index/dict jobs are needed from the missing suffixes
    if 'bwt' in missingSuffixSet or 'pac' in missingSuffixSet:
        returnData.needBWARefIndexJob = True
    if 'fai' in missingSuffixSet:
        returnData.needSAMtoolsFastaIndexJob = True
        returnData.needPicardFastaDictJob = True
    if 'stidx' in missingSuffixSet or 'sthash' in missingSuffixSet:
        returnData.needStampyRefIndexJob = True
    if 'dict' in missingSuffixSet:
        returnData.needPicardFastaDictJob = True
    if 'nin' in missingSuffixSet or 'nhr' in missingSuffixSet or 'nsq' in missingSuffixSet:
        returnData.needBlastMakeDBJob = True
    return returnData
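# A minimal usage sketch (hypothetical workflow object and path; File/PFN come from the
# Pegasus DAX3 API that this function already assumes):
#
#    returnData = registerRefFastaFile(workflow=workflow, refFastaFname='/data/ref/ref.fasta',
#        registerAffiliateFiles=True, input_site_handler='local',
#        checkAffiliateFileExistence=True, folderName='reference')
#    if returnData.needBWARefIndexJob:
#        pass  # e.g. add a "bwa index" job upstream of any mapping jobs
#    refFastaFList = returnData.refFastaFList  # [fasta, dict, fai, bwa/stampy indices, ...]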
def setup(self, **keywords):
    """
    2012.10.15
        run before anything else is run
    """
    AbstractMatrixFileWalker.setup(self, **keywords)
    # self.writer = BeagleGenotypeFile(inputFname=self.outputFname, openMode='w')

    # read in the IBD check result
    self.ibdData = SNP.readAdjacencyListDataIntoMatrix(inputFname=self.pedigreeKinshipFilePath,
        rowIDHeader=None, colIDHeader=None,
        rowIDIndex=0, colIDIndex=1,
        dataHeader=None, dataIndex=2, hasHeader=False)

    #. read in the alignment coverage data
    alignmentCoverageFile = MatrixFile(inputFname=self.individualAlignmentCoverageFname)
    alignmentCoverageFile.constructColName2IndexFromHeader()
    alignmentReadGroup2coverageLs = alignmentCoverageFile.constructDictionary(keyColumnIndexList=[0],
        valueColumnIndexList=[1])
    alignmentCoverageFile.close()

    sys.stderr.write("Reading in all samples from %s VCF input files ... \n" % (len(self.inputFnameLs)))
    # read all the Beagle files
    individualID2HaplotypeData = {}
    for inputFname in self.inputFnameLs:
        vcfFile = VCFFile(inputFname=inputFname)
        # vcfFile.readInAllHaplotypes()
        for individualID in vcfFile.getSampleIDList():
            individualID2HaplotypeData[individualID] = None
            # haplotypeList = vcfFile.getHaplotypeListOfOneSample(individualID)
            # individualID2HaplotypeData[individualID] = PassingData(haplotypeList=haplotypeList,
            #   locusIDList=vcfFile.locusIDList)
        # get all haplotypes, etc.; get all sample IDs
    sys.stderr.write("%s individuals in total.\n" % (len(individualID2HaplotypeData)))

    #. read in the pedigree or deduce it from the Beagle Trio/Duo genotype file (columns)
    #. construct individualID2familyContext; context: familySize=1/2/3, familyPosition=1/2 (parent/child)
    sys.stderr.write("Constructing individualID2familyContext ...")
    plinkPedigreeFile = PlinkPedigreeFile(inputFname=self.pedigreeFname)
    pGraph = plinkPedigreeFile.pedigreeGraph
    # shrink the graph to only the individuals with data
    pGraph = nx.subgraph(pGraph, individualID2HaplotypeData.keys())
    cc_subgraph_list = nx.connected_component_subgraphs(pGraph.to_undirected())
    individualID2familyContext = {}
    outDegreeContainer = NumberContainer(minValue=0)
    familySizeContainer = NumberContainer(minValue=0)
    individualCoverageContainer = NumberContainer(minValue=0)
    familyCoverageContainer = NumberContainer(minValue=0)
    for cc_subgraph in cc_subgraph_list:
        familySize = len(cc_subgraph)
        familySizeContainer.addOneValue(familySize)
        familyCoverage = 0
        for n in cc_subgraph:  # assuming each family is a two-generation trio/nuclear family
            individualCoverage = self.getIndividualCoverage(individualID=n,
                alignmentReadGroup2coverageLs=alignmentReadGroup2coverageLs)
            individualCoverage = float(individualCoverage)
            individualCoverageContainer.addOneValue(individualCoverage)
            familyCoverage += individualCoverage
            in_degree = pGraph.in_degree(n)
            out_degree = pGraph.out_degree(n)
            outDegreeContainer.addOneValue(out_degree)
            familyContext = PassingData(familySize=familySize, in_degree=in_degree, out_degree=out_degree,
                individualCoverage=individualCoverage,
                familyCoverage=None)
            if n not in individualID2familyContext:
                individualID2familyContext[n] = familyContext
            else:
                sys.stderr.write("Node %s already in individualID2familyContext.\n" % (n))
        familyCoverageContainer.addOneValue(familyCoverage)
        # set the family coverage for each member, used in weighing the individual:
        #   a better-covered family => better haplotypes
        for n in cc_subgraph:
            individualID2familyContext[n].familyCoverage = familyCoverage
    plinkPedigreeFile.close()
    sys.stderr.write("%s individuals.\n" % (len(individualID2familyContext)))

    # weigh each unique individual based on sequencing coverage + number of offspring => probability mass
    sys.stderr.write("Weighing each individual, assigning probability mass ...")
    individualID2probabilityMass = {}
    for individualID, familyContext in individualID2familyContext.iteritems():
        outDegreeQuotient = outDegreeContainer.normalizeValue(familyContext.familySize)
        individualCoverageQuotient = individualCoverageContainer.normalizeValue(familyContext.individualCoverage)
        # familyCoverageQuotient = familyCoverageContainer.normalizeValue(familyContext.familyCoverage)
        importanceScore = outDegreeQuotient + individualCoverageQuotient
        representativeImportanceScore = importanceScore
        individualID2probabilityMass[individualID] = representativeImportanceScore
    sys.stderr.write(" %s IDs with probability mass assigned.\n" % (len(individualID2probabilityMass)))
    self.individualID2probabilityMass = individualID2probabilityMass
    self.individualID2HaplotypeData = individualID2HaplotypeData