Example #1
def getAssociationLandscapeDataFromHDF5File(inputFname=None, associationTableName='association', \
										landscapeTableName='landscape', min_MAF=0.1):
	"""
	2012.11.20
		input is in HDF5MatrixFile format (which is output of variation/src/association_peak/DefineAssociationLandscape.py)
		contains two hdf5 groups. one is by associationTableName. the other is by landscapeTableName.
	"""
	pdata = PassingData(min_MAF=min_MAF)
	genome_wide_result = getGenomeWideResultFromHDF5MatrixFile(inputFname=inputFname, \
						min_value_cutoff=None, do_log10_transformation=False, pdata=pdata,\
						construct_chr_pos2index=False, construct_data_obj_id2index=False, \
						construct_locus_db_id2index=True,\
						report=True, tableName=associationTableName)
	
	returnData = PassingData(genome_wide_result=genome_wide_result)
	
	sys.stderr.write("Reading landscape from %s ..."%(inputFname))
	current_obj = None
	bridge_ls = []
	locusLandscapeNeighborGraph = nx.Graph()
	reader = HDF5MatrixFile(inputFname, mode='r')
	landscapeTableObject = reader.getTableObject(tableName=landscapeTableName)
	returnData.HDF5AttributeNameLs = []
	for attributeName, value in landscapeTableObject.getAttributes().items():
		returnData.HDF5AttributeNameLs.append(attributeName)
		setattr(returnData, attributeName, value)
	
	for row in landscapeTableObject:
		if row.start_locus_id==0:	#empty data. happens when inputFname contains no valid landscape, but one default null data point.
			continue
		start_locus_id = row.start_locus_id
		stop_locus_id = row.stop_locus_id
		no_of_loci = row.no_of_loci
		deltaX = row.deltaX
		
		start_obj = genome_wide_result.get_data_obj_by_locus_db_id(start_locus_id)
		stop_obj = genome_wide_result.get_data_obj_by_locus_db_id(stop_locus_id)
		
		bridge_ls.append([start_obj, stop_obj, no_of_loci, deltaX])
		
		source_index = start_obj.index
		#genome_wide_result.get_data_obj_index_by_locus_db_id(start_locus_id)
		target_index = stop_obj.index
		
		locusLandscapeNeighborGraph.add_edge(source_index, target_index, \
									weight=None)
		locusLandscapeNeighborGraph[source_index][target_index]['no_of_loci'] = no_of_loci
		locusLandscapeNeighborGraph[source_index][target_index]['deltaX'] = deltaX
		
	del reader
	sys.stderr.write("%s bridges.\n"%(len(bridge_ls)))
	returnData.bridge_ls = bridge_ls
	returnData.locusLandscapeNeighborGraph = locusLandscapeNeighborGraph
	return returnData
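
A minimal usage sketch for the function above; the input path is an assumed example file produced by DefineAssociationLandscape.py, not part of the original code.

# Hypothetical usage sketch -- the HDF5 path is an assumption; the attribute names
# come from the PassingData object returned above.
landscapeData = getAssociationLandscapeDataFromHDF5File(
    inputFname='association_landscape.h5', associationTableName='association',
    landscapeTableName='landscape', min_MAF=0.1)
print("no. of bridges:", len(landscapeData.bridge_ls))
print("no. of graph nodes:", landscapeData.locusLandscapeNeighborGraph.number_of_nodes())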
Example #2
	def addPolymorphism(self, name=None, individualName=None, locusName=None,
		chromosome_copy=None, allele_sequence=None, allele_sequence_length=None,
		allele_type=None, **keywords):
		"""
		2013.3.10
		"""
		if name:
			individual_id = self.getIndividual(name=individualName).id
			locus_id = self.getLocus(name=locusName).id
			oneCell = PassingData(name=name, individual_id=individual_id, locus_id = locus_id,
				chromosome_copy=chromosome_copy,
				allele_sequence = allele_sequence,
				allele_sequence_length=allele_sequence_length,\
				allele_type=allele_type, **keywords)
			self.polymorphismTable.writeOneCell(oneCell, cellType=2)
			self.flush()
		return self.checkPolymorphism(name=name)	#would this work without flush()?
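
A hedged call sketch; polymorphismFile and every argument value are assumptions standing in for an open, writable object of the class that defines addPolymorphism.

# Hypothetical usage sketch -- polymorphismFile and all values are assumptions.
polymorphismEntry = polymorphismFile.addPolymorphism(
    name='ind1_locus5_copy1', individualName='ind1', locusName='locus5',
    chromosome_copy=1, allele_sequence='A', allele_sequence_length=1,
    allele_type='SNP')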
Example #3
    def getNoOfFamiliesAndKidsGivenParentSetSize(self,
                                                 noOfParents2FamilyData=None,
                                                 parentSetSize=2):
        """
		2013.07.19
		"""
        familyData = noOfParents2FamilyData.get(parentSetSize, None)

        if familyData:
            noOfFamilies = len(familyData.parentTupleSet)
            noOfParents = len(familyData.parentIDSet)
            noOfKids = len(familyData.childIDSet)
            noOfIndividuals = len(familyData.individualIDSet)
        else:
            noOfFamilies = 0
            noOfParents = 0
            noOfKids = 0
            noOfIndividuals = 0
        return PassingData(noOfFamilies=noOfFamilies,
                           noOfParents=noOfParents,
                           noOfKids=noOfKids,
                           noOfIndividuals=noOfIndividuals)
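
A hedged usage sketch; pedigreeObject stands in for the class that defines this method, and noOfParents2FamilyData is assumed to map a parent-set size to a FamilyData object carrying the four sets read above.

# Hypothetical usage sketch -- pedigreeObject and noOfParents2FamilyData are assumptions.
twoParentStats = pedigreeObject.getNoOfFamiliesAndKidsGivenParentSetSize(
    noOfParents2FamilyData=noOfParents2FamilyData, parentSetSize=2)
print(twoParentStats.noOfFamilies, twoParentStats.noOfKids, twoParentStats.noOfIndividuals)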
Example #4
def parseChrStartStopFromFilename(filename=None, chr2size=None,
	defaultChromosomeSize=10000000000):
	"""
	2013.09.18
		#10000000000 is used when filename contains data from a whole chromosome
		#  and chr2size is not available or not containing chromosome
		make it very big so that it could be intersected with any interval
			from any chromosome.
	"""
	searchResult = chr_start_stop_pattern.search(filename)
	if searchResult:
		chromosome = searchResult.group(1)
		start = int(searchResult.group(2))
		stop = int(searchResult.group(3))
	else:	#no explicit start/stop in the filename; fall back to whole-chromosome coordinates
		chromosome = getChrFromFname(filename=filename)
		start = 1
		if chr2size is not None:
			stop = chr2size.get(chromosome, defaultChromosomeSize)
		else:
			stop = defaultChromosomeSize
	return PassingData(chromosome=chromosome, start=start, stop=stop)
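
A hedged usage sketch, assuming chr_start_stop_pattern matches a chrN_start_stop token in the filename; the filenames and chr2size below are made up.

# Hypothetical usage sketch -- filenames and chr2size are assumptions.
pdata = parseChrStartStopFromFilename(
    filename='sample.chr2_1000001_2000000.vcf.gz', chr2size={'chr2': 243199373})
# expected: pdata.chromosome == 'chr2', pdata.start == 1000001, pdata.stop == 2000000
wholeChr = parseChrStartStopFromFilename(filename='sample.chr3.vcf.gz', chr2size=None)
# no start/stop in the name: falls back to start=1, stop=defaultChromosomeSize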
Example #5
	def writeChrStartStopTupleList2LocusTable(self, chr_start_stop_list=None, chromosomeLength=None,
											speciesName=None, ploidy=None):
		"""
		2013.3.7
			#. establish _locus_index2id, to be used in writeIndividualName2PolymorphismData()
			#. make sure chr_start_stop_list is in the same order as the haplotype in writeIndividualName2PolymorphismData()
		"""
		sys.stderr.write("Writing a %s-element list of (chr, start,stop) out ..."%(len(chr_start_stop_list)))
		chr_start_stop_list.sort()	#make sure it's sorted
		if ploidy is None:
			ploidy=self.ploidy
		for i in range(len(chr_start_stop_list)):
			chromosomeName, start, stop = chr_start_stop_list[i][:3]
			if chromosomeName:
				chromosomeEntry = self.getChromosome(name=chromosomeName, length=chromosomeLength, speciesName=speciesName,\
													ploidy=ploidy)
			else:
				chromosomeEntry = None
			name = '%s_%s_%s'%(chromosomeName, start, stop)
			oneCell = PassingData(name=name, chromosome_id=getattr(chromosomeEntry, 'id', None), start=start, stop=stop)
			self.locusTable.writeOneCell(oneCell, cellType=2)
			self._locus_index2id[i] = self.locusTable.no_of_rows
		sys.stderr.write("%s loci \n")
		return self._locus_index2id
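
A hedged call sketch; polymorphismFile, the coordinates, the chromosome length, and the species name are assumptions.

# Hypothetical usage sketch -- polymorphismFile and all values are assumptions.
locus_index2id = polymorphismFile.writeChrStartStopTupleList2LocusTable(
    chr_start_stop_list=[('chr1', 1, 1000), ('chr1', 1001, 2000)],
    chromosomeLength=249250621, speciesName='example_species', ploidy=2)
# maps each sorted list index to the row id assigned in the locus table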
Example #6
    def setup(self, **keywords):
        """
		2012.10.15
			run before anything is run
		"""
        AbstractMatrixFileWalker.setup(self, **keywords)
        #self.writer = BeagleGenotypeFile(path=self.outputFname, mode='w')

        #read in the IBD check result
        self.ibdData = SNP.readAdjacencyListDataIntoMatrix(inputFname=self.pedigreeKinshipFilePath, \
            rowIDHeader=None, colIDHeader=None, \
            rowIDIndex=0, colIDIndex=1, \
            dataHeader=None, dataIndex=2, hasHeader=False)

        #. read in the alignment coverage data
        alignmentCoverageFile = MatrixFile(
            path=self.individualAlignmentCoverageFname)
        alignmentCoverageFile.constructColName2IndexFromHeader()
        alignmentReadGroup2coverageLs = alignmentCoverageFile.constructDictionary(
            keyColumnIndexList=[0], valueColumnIndexList=[1])
        alignmentCoverageFile.close()

        sys.stderr.write(
            "Reading in all samples from %s VCF input files ... \n" %
            (len(self.inputFnameLs)))
        # read all the Beagle files
        individualID2HaplotypeData = {}
        for inputFname in self.inputFnameLs:
            vcfFile = VCFFile(inputFname=inputFname)
            #vcfFile.readInAllHaplotypes()
            for individualID in vcfFile.getSampleIDList():
                individualID2HaplotypeData[individualID] = None
                #haplotypeList = vcfFile.getHaplotypeListOfOneSample(individualID)
                #individualID2HaplotypeData[individualID] = PassingData(haplotypeList=haplotypeList,
                #													locusIDList=vcfFile.locusIDList)
            # get all haplotypes , etc.
            # get all sample IDs
        sys.stderr.write("%s individuals total.\n" %
                         (len(individualID2HaplotypeData)))

        #. read in the pedigree or deduce it from Beagle Trio/Duo genotype file (columns)
        #. construct individualID2pedigreeContext, context: familySize=1/2/3, familyPosition=1/2 (parent/child)
        sys.stderr.write("Constructing individualID2pedigreeContext ...")
        plinkPedigreeFile = PlinkPedigreeFile(path=self.pedigreeFname)
        pGraph = plinkPedigreeFile.pedigreeGraph
        #shrink the graph to only individuals with data
        pGraph = nx.subgraph(pGraph, individualID2HaplotypeData.keys())

        #connected_component_subgraphs() was removed in networkx 2.4; build the
        # connected-component subgraphs explicitly instead.
        undirectedGraph = pGraph.to_undirected()
        cc_subgraph_list = [undirectedGraph.subgraph(c)
                            for c in nx.connected_components(undirectedGraph)]
        individualID2familyContext = {}
        outDegreeContainer = NumberContainer(minValue=0)
        familySizeContainer = NumberContainer(minValue=0)
        individualCoverageContainer = NumberContainer(minValue=0)
        familyCoverageContainer = NumberContainer(minValue=0)
        for cc_subgraph in cc_subgraph_list:
            familySize = len(cc_subgraph)
            familySizeContainer.addOneValue(familySize)

            familyCoverage = 0
            for n in cc_subgraph:  #assuming each family is a two-generation trio/nuclear family
                individualCoverage = self.getIndividualCoverage(
                    individualID=n,
                    alignmentReadGroup2coverageLs=alignmentReadGroup2coverageLs
                )
                individualCoverage = float(individualCoverage)
                individualCoverageContainer.addOneValue(individualCoverage)
                familyCoverage += individualCoverage
                in_degree = pGraph.in_degree(n)
                out_degree = pGraph.out_degree(n)
                outDegreeContainer.addOneValue(out_degree)
                familyContext = PassingData(familySize=familySize, in_degree=in_degree, out_degree=out_degree, \
                      individualCoverage=individualCoverage,\
                      familyCoverage=None)
                if n not in individualID2familyContext:
                    individualID2familyContext[n] = familyContext
                else:
                    sys.stderr.write(
                        "Node %s already in individualID2familyContext.\n" %
                        (n))
            familyCoverageContainer.addOneValue(familyCoverage)
            #set the family coverage for each member, used in weighing the individual. better covered family => better haplotype
            for n in cc_subgraph:
                individualID2familyContext[n].familyCoverage = familyCoverage
        plinkPedigreeFile.close()
        sys.stderr.write("%s individuals.\n" %
                         (len(individualID2familyContext)))

        # weigh each unique individual based on its sequencing coverage + no of offspring => probability mass for each individual
        sys.stderr.write(
            "Weighing each individual , assigning probability mass  ...")
        individualID2probabilityMass = {}
        for individualID, familyContext in individualID2familyContext.items():
            outDegreeQuotient = outDegreeContainer.normalizeValue(
                familyContext.out_degree)  #normalize the no. of offspring (out_degree), not familySize
            individualCoverageQuotient = individualCoverageContainer.normalizeValue(
                familyContext.individualCoverage)
            #familyCoverageQuotient = familyCoverageContainer.normalizeValue(familyContext.familyCoverage)
            importanceScore = outDegreeQuotient + individualCoverageQuotient
            representativeImportanceScore = importanceScore
            individualID2probabilityMass[
                individualID] = representativeImportanceScore
        sys.stderr.write(" %s IDs with probability mass assigned.\n" %
                         (len(individualID2probabilityMass)))

        self.individualID2probabilityMass = individualID2probabilityMass
        self.individualID2HaplotypeData = individualID2HaplotypeData
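
The last block above combines a normalized offspring count (out-degree) with normalized sequencing coverage into a per-individual probability mass. A minimal standalone sketch of that idea, assuming NumberContainer.normalizeValue() behaves like a min-max scaler (an assumption about that class):

# Standalone sketch of the weighting step, under the min-max assumption above.
def min_max_normalize(value, minValue, maxValue):
    # Scale value into [0, 1]; a degenerate range maps to 0.
    if maxValue == minValue:
        return 0.0
    return (value - minValue) / float(maxValue - minValue)

out_degrees = {'kid1': 0, 'dad1': 2, 'mom1': 2}      # no. of offspring per individual
coverages = {'kid1': 4.0, 'dad1': 8.0, 'mom1': 2.0}  # sequencing coverage per individual
individualID2probabilityMass = {}
for individualID in out_degrees:
    outDegreeQuotient = min_max_normalize(
        out_degrees[individualID], min(out_degrees.values()), max(out_degrees.values()))
    coverageQuotient = min_max_normalize(
        coverages[individualID], min(coverages.values()), max(coverages.values()))
    individualID2probabilityMass[individualID] = outDegreeQuotient + coverageQuotient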
Example #7
 def calLD(cls,
           locus1_allele_ls,
           locus2_allele_ls,
           locus1_id=None,
           locus2_id=None):
     """
     2010-9-30
         copied from palos/SNP.py.
         locus1_allele_ls, locus2_allele_ls should be bi-allelic.
         If locus1_allele_ls and locus2_allele_ls are of different size, the extra elements are discarded.
     2008-09-05
         adapted from variation.src.misc's LD.calculate_LD class
         only deal with 2-allele loci
         skip if either is NA, or if both are heterozygous (not phased)
     """
     counter_matrix = numpy.zeros([2, 2])  #only 2 alleles
     snp1_allele2index = {}
     snp2_allele2index = {}
     no_of_individuals = min(len(locus1_allele_ls), len(locus2_allele_ls))
     for k in range(no_of_individuals):
         snp1_allele = locus1_allele_ls[k]
         snp2_allele = locus2_allele_ls[k]
         snp1_allele_index = cls.fill_in_snp_allele2index(
             snp1_allele, snp1_allele2index)
         snp2_allele_index = cls.fill_in_snp_allele2index(
             snp2_allele, snp2_allele2index)
         if snp1_allele_index > 1 or snp2_allele_index > 1:  #ignore the 3rd allele
             continue
         counter_matrix[snp1_allele_index, snp2_allele_index] += 1
         #counter_matrix[snp1_allele_index, snp2_allele_index] += 1	#this is to mimic the diploid.
     PA = sum(counter_matrix[0, :])
     Pa = sum(counter_matrix[1, :])
     PB = sum(counter_matrix[:, 0])
     Pb = sum(counter_matrix[:, 1])
     total_num = float(PA + Pa)
     try:
         PA = PA / total_num
         Pa = Pa / total_num
         PB = PB / total_num
         Pb = Pb / total_num
         PAB = counter_matrix[0, 0] / total_num
         D = PAB - PA * PB
         PAPB = PA * PB
         PAPb = PA * Pb
         PaPB = Pa * PB
         PaPb = Pa * Pb
         Dmin = max(-PAPB, -PaPb)
         Dmax = min(PAPb, PaPB)
         if D < 0:
             D_prime = D / Dmin
         else:
             D_prime = D / Dmax
         r2 = D * D / (PA * Pa * PB * Pb)
     except:  #2008-01-23 ZeroDivisionError: total_num may be 0, or Dmin/Dmax may be 0 if one of (-PAPB, -PaPb) is >0 or <0
         sys.stderr.write('Unknown exception, ignoring: %s\n' %
                          repr(sys.exc_info()[0]))
         return None
     allele_freq = (min(PA, Pa), min(PB, Pb))
     return_data = PassingData()
     return_data.D = D
     return_data.D_prime = D_prime
     return_data.r2 = r2
     return_data.allele_freq = allele_freq
     return_data.snp_pair_ls = (locus1_id, locus2_id)
     return_data.no_of_pairs = total_num
     return return_data
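
A hedged usage sketch with two perfectly correlated bi-allelic loci; SomeClass stands in for whichever class hosts this classmethod.

# Hypothetical usage sketch -- SomeClass and the allele lists are assumptions.
locus1_allele_ls = ['A', 'A', 'T', 'T', 'A', 'T']
locus2_allele_ls = ['G', 'G', 'C', 'C', 'G', 'C']
ld = SomeClass.calLD(locus1_allele_ls, locus2_allele_ls,
                     locus1_id='chr1_100', locus2_id='chr1_200')
if ld is not None:
    print(ld.D, ld.D_prime, ld.r2)  # perfectly correlated loci give r2 == 1.0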
Example #8
def ltsFit(x_ls, y_ls, fractionUsed=0.6, startX=1, stopX=5):
    """
    2010-6-1
        solve the computing node hang-up (I/O stuck) issue by adding these:
            import ROOT
            try:	# 2010-5-31 old version (5.18.0) doesn't have IgnoreCommandLineOptions.
                ROOT.PyConfig.IgnoreCommandLineOptions = True	#otherwise
                # Warning in <TApplication::GetOptions>: file <output file by -o > has size 0, skipping
            except:
                pass
            try:	# 2010-5-31  disable .StartGuiThread
                ROOT.PyConfig.StartGuiThread = 0
            except:
                pass
    2010-5-30
        return chiSquare as well
    2010-5-21
        use ROOT to do least trimmed square (LTS) fitting:
            fit the y=a+bx with trimming fraction = 1-fractionUsed.
    
    Example:
    
    import numpy
    x_ls = numpy.array(range(100), numpy.float)
    y_ls = x_ls/2.
    for i in range(len(y_ls)):
        import random
        new_y = random.random()-0.5
        y_ls[i] += new_y
    
    # mess up some portion of y
    for i in range(5):
        import random
        new_y = random.random()
        new_y_index = random.sample(range(100),1)
        y_ls[new_y_index[0]] = new_y
    import numpy
    x_ls = numpy.array([ 2.64884758,  3.51235008,  2.83090925,  3.41229248,  3.01451969,\
    2.49899888,  3.69988108,  2.74896216,  3.05307841,  3.75705409,\
    3.08653784,  3.10703993,  3.61071348,  3.21285319,  2.91460752,\
    3.53737831,  3.06333303,  3.35391617,  3.43568516,  3.34429312,\
    3.31576061,  2.8007164 ,  2.73639655,  3.14690256,  3.10174704,\
    2.80888581,  2.72754121,  2.90064001,  3.19270658,  3.50596333,\
    2.61804676,  3.18127131,  3.27542663,  3.09586573], dtype=numpy.float32)	# numpy.float32 is not supported by ROOT
    y_ls = numpy.array([ 2.52827311,  3.27265358,  2.36172366,  2.95760489,  2.50920248,\
    2.3443923 ,  3.23502254,  2.35410833,  2.50582743,  2.48501062,\
    2.82510138,  2.70799541,  2.43136382,  2.76342535,  2.45178652,\
    3.08224201,  2.26481771,  2.7387805 ,  3.23274207,  2.82769203,\
    2.25042009,  2.56702638,  2.4082365 ,  2.44793224,  2.65127802,\
    2.57460976,  2.43136382,  2.39005065,  2.70027065,  3.04452848,\
    2.28555727,  2.71933126,  2.6468935 ,  2.54157925], dtype=numpy.float32)
    
    fit_y_ls = ltsFit(x_ls, y_ls)
    
    import pylab
    pylab.plot(x_ls, y_ls, '.')
    pylab.plot(x_ls, fit_y_ls, '.')
    pylab.legend(['raw data','fitted'])
    pylab.show()
    sys.exit(0)
    
    """
    import ROOT
    try:  # 2010-5-31 old version (5.18.0) doesn't have IgnoreCommandLineOptions.
        ROOT.PyConfig.IgnoreCommandLineOptions = True  #otherwise
        # Warning in <TApplication::GetOptions>: file <output file by -o > has size 0, skipping
    except:
        pass
    try:  # 2010-5-31  disable .StartGuiThread
        ROOT.PyConfig.StartGuiThread = 0
    except:
        pass

    #ROOT.gROOT.Reset()	# 2010-5-31 don't know what this is for.
    ROOT.gROOT.SetBatch(True)  #avoid interactive mode (drawing canvas, etc.)
    from ROOT import TFormula, TF1, TGraph
    import numpy
    lm = TF1(
        'lm', 'pol1', startX, stopX
    )  #[0]+[1]*x is essentially same as pol1 but option rob in Fit() only works with pol1.
    #ROOT is very dtype-sensitive. numpy.float32 won't work.
    if hasattr(x_ls, 'dtype') and x_ls.dtype == numpy.float64:
        pass
    else:
        sys.stderr.write('converting x_ls')
        x_ls = numpy.array(x_ls, dtype=numpy.float64)
        sys.stderr.write(".\n")
    if hasattr(y_ls, 'dtype') and y_ls.dtype == numpy.float64:
        pass
    else:
        sys.stderr.write('converting y_ls')
        y_ls = numpy.array(y_ls, dtype=numpy.float64)
        sys.stderr.write(".\n")
    gr = TGraph(len(x_ls), x_ls, y_ls)
    gr.Fit(lm, "+rob=%s" % fractionUsed)
    fit = gr.GetFunction('lm')
    chiSquare = fit.GetChisquare()
    fit_y_ls = []
    for x in x_ls:
        fit_y_ls.append(fit.Eval(x))
    from utils import PassingData
    return PassingData(fit_y_ls=fit_y_ls, chiSquare=chiSquare)
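
A minimal call sketch (assumes ROOT is importable and, per the notes above, that the inputs are 1-D float64 arrays).

# Minimal call sketch -- the data below are made up.
import numpy
x_ls = numpy.arange(100, dtype=numpy.float64)
y_ls = 0.5 * x_ls + numpy.random.normal(scale=0.2, size=100)
fitData = ltsFit(x_ls, y_ls, fractionUsed=0.6, startX=0, stopX=100)
print(fitData.chiSquare)
print(fitData.fit_y_ls[:5])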
Example #9
def parseOneVCFRow(row, col_name2index, col_index_individual_name_ls,
    sample_id2index, minDepth=1,\
    dataEntryType=1):
    """
    2014.01.08 fix a bug that skips calls and shortens data_row. 
    2012.9.6 turn pos into integer
    2012.5.10
        complete representation of one locus
    2012.1.17
        common snippet split out of VCFFile & VCFRecord
        row is a list of input columns from one VCF file line
        dataEntryType
            1: each cell is base call
            2: each cell is a dictionary {'GT': base-call, 'DP': depth}
    """
    chromosome = row[0]
    pos = int(row[1])  #2012.9.6 turn pos into integer
    vcf_locus_id = row[2]
    quality = row[5]
    filter = row[6]
    info = row[7]
    format = row[8]
    info_ls = info.split(';')
    info_tag2value = {}
    for info_entry in info_ls:
        try:
            tag, value = info_entry.split('=')
        except:
            #sys.stderr.write("Error in splitting %s by =.\n"%info)
            # ###Error in splitting DS by =.
            continue
        info_tag2value[tag] = value

    locus_id = (chromosome, pos)
    refBase = row[col_name2index['REF']]
    altBase = row[col_name2index['ALT']]

    altBaseLs = altBase.split(',')
    #altBase could be just "C" or "C,G" (multi-nucleotide)
    alleleLs = [refBase] + altBaseLs
    alleleNumber2Base = {'.': 'NA'}
    for i in range(len(alleleLs)):
        alleleNumber2Base[repr(i)] = alleleLs[i]

    format_column = row[col_name2index['FORMAT']]
    format_column_ls = format_column.split(':')
    format_column_name2index = getColName2IndexFromHeader(format_column_ls)

    if dataEntryType == 1:
        data_row = ['NA'] * (len(col_index_individual_name_ls) + 1)  #extra 1 for the ref
        data_row[0] = refBase
    else:
        data_row = [None] * (len(col_index_individual_name_ls) + 1)  #extra 1 for the ref
        data_row[0] = {'GT': refBase, 'DP': -1}
    genotypeCall2Count = {}
    for individual_col_index, individual_name in col_index_individual_name_ls:
        if individual_name not in sample_id2index:
            sample_id2index[individual_name] = len(sample_id2index)

        #coverage = read_group2coverage[individual_name]
        genotype_data = row[individual_col_index]
        genotype_data_ls = genotype_data.split(':')
        genotype_call_index = format_column_name2index.get('GT')
        genotype_quality_index = format_column_name2index.get('GQ')
        if genotype_quality_index is None:
            genotype_quality_index = format_column_name2index.get('DP')
        depth_index = format_column_name2index.get("DP")
        #GL_index = format_column_name2index.get('GL')
        genotypeCallInBase = 'NA'
        if genotype_call_index is not None and len(genotype_data_ls) > 0:
            # or (genotype_call_index is not None and len(genotype_data_ls)<=genotype_call_index):
            # 	#<len(format_column_name2index):
            # #this genotype call is probably empty "./." due to no reads
            #genotype_quality = genotype_data_ls[genotype_quality_index]
            if genotype_call_index is not None and len(
                    genotype_data_ls) > genotype_call_index:
                genotype_call = genotype_data_ls[genotype_call_index]
            else:
                genotype_call = './.'  #missing
            callData = {}
            if genotype_call != './.' and genotype_call != '.' and genotype_call != '.|.':
                #missing data
                patternSearchResult = diploidGenotypePattern.search(
                    genotype_call)
                if patternSearchResult:
                    allele1 = alleleNumber2Base[patternSearchResult.group(1)]
                    allele2 = alleleNumber2Base[patternSearchResult.group(2)]
                    if allele1 != 'N' and allele2 != 'N':
                        genotypeCallInBase = '%s%s' % (allele1, allele2)
                if depth_index is not None:
                    if len(genotype_data_ls) > depth_index:
                        depth = genotype_data_ls[depth_index]
                    else:
                        depth = '.'  #missing DP
                    if depth == '.':  #this means depth=0
                        depth = 0
                    else:
                        depth = int(depth)
                    if minDepth > 0 and depth < minDepth:
                        #no read. samtools would still assign ref/ref to this individual
                        genotypeCallInBase = 'NA'  #set it to missing
                    #if depth>maxNoOfReads*coverage or depth<minNoOfReads*coverage:
                    # #2011-3-29 skip. coverage too high or too low
                    #	continue
                    callData['DP'] = depth
        """
        if genotype_call=='0/1' or genotype_call =='1/0':
            #heterozygous, the latter notation is never used though.
            allele = '%s%s'%(refBase, altBase)
            GL_list = genotype_data_ls[GL_index]
            GL_list = GL_list.split(',')
            GL_list = map(float, GL_list)
            GL = GL_list[1]
            sndHighestGL = max([GL_list[0], GL_list[2]])
            deltaGL = GL-sndHighestGL
            
            AD = genotype_data_ls[format_column_name2index.get('AD')]
            AD = map(int, AD.split(','))
            minorAlleleCoverage = min(AD)
            majorAlleleCoverage = max(AD)
            
            if minorAlleleCoverage<=minorAlleleDepthUpperBoundCoeff*coverage and \
                    minorAlleleCoverage>=minorAlleleDepthLowerBoundCoeff*coverage and \
                    majorAlleleCoverage<=majorAlleleDepthUpperBoundCoeff*coverage:
                DP4_ratio = float(AD[0])/AD[1]
                allele = '%s%s'%(refBase, altBase)

        elif genotype_call=='./.' or genotype_call=='.|.':	#missing
            allele = 'NA'
        elif genotype_call =='1/1' or genotype_call =='1|1':
            allele = '%s%s'%(altBase, altBase)
        elif genotype_call =='0/0' or genotype_call=='0|0':
            allele = '%s%s'%(refBase, refBase)
        """
        col_index = sample_id2index.get(individual_name)
        if dataEntryType == 1:
            data_row[col_index] = genotypeCallInBase
        else:
            callData['GT'] = genotypeCallInBase
            data_row[col_index] = callData
        if genotypeCallInBase != 'NA':
            if genotypeCallInBase not in genotypeCall2Count:
                genotypeCall2Count[genotypeCallInBase] = 0
            genotypeCall2Count[genotypeCallInBase] += 1
    return PassingData(chr=chromosome,
                       chromosome=chromosome,
                       pos=pos,
                       position=pos,
                       locus_id=locus_id,
                       quality=quality,
                       info_tag2value=info_tag2value,
                       refBase=refBase,
                       altBase=altBase,
                       alleleLs=alleleLs,
                       alleleNumber2Base=alleleNumber2Base,
                       genotypeCall2Count=genotypeCall2Count,
                       data_row=data_row,
                       info=info,
                       format=format,
                       filter=filter,
                       vcf_locus_id=vcf_locus_id,
                       format_column_name2index=format_column_name2index,
                       format_column_ls=format_column_ls)
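
A hedged usage sketch; the VCF row, the header mapping, and the sample columns are made-up assumptions, and sample_id2index is assumed to reserve index 0 for the reference column (an assumption about the caller).

# Hypothetical usage sketch -- all input values below are assumptions.
col_name2index = {'CHROM': 0, 'POS': 1, 'ID': 2, 'REF': 3, 'ALT': 4, 'QUAL': 5,
                  'FILTER': 6, 'INFO': 7, 'FORMAT': 8, 'sampleA': 9, 'sampleB': 10}
row = ['chr1', '10583', '.', 'G', 'A', '55', 'PASS', 'DP=20;AF=0.25', 'GT:DP',
       '0/1:12', '1/1:8']
col_index_individual_name_ls = [(9, 'sampleA'), (10, 'sampleB')]
sample_id2index = {'ref': 0}  # assumed convention: index 0 holds the reference allele
vcfRowData = parseOneVCFRow(row, col_name2index, col_index_individual_name_ls,
                            sample_id2index, minDepth=1, dataEntryType=1)
print(vcfRowData.locus_id, vcfRowData.data_row)  # e.g. ('chr1', 10583), ['G', 'GA', 'AA']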