def openWriteBeagleFiles(self,
                             pedigreeFamilyData=None,
                             outputFnamePrefix=None):
        """
        Open one Beagle output file per family size, plus one markers file.

        For every family size found in pedigreeFamilyData.familySize2SampleIDList
        a space-delimited file "<outputFnamePrefix>_familySize<N>.bgl" is opened
        and its header row is written immediately. The markers file
        "<outputFnamePrefix>.markers" is opened but nothing is written to it here.

        Family size 1 uses the likelihood format (each sample ID repeated
        three times in the header):
            marker alleleA alleleB 1000_709_1996093_GA_vs_524 1000_709_1996093_GA_vs_524 1000_709_1996093_GA_vs_524 1001_710_1995025_GA_vs_524 1001_710_1995025_GA_vs_524 1001_710_1995025_GA_vs_524 1002_711_2001039_GA_vs_524
            Contig791:1086 C A 0.9693 0.0307 0.0000 0.6660 0.3338 0.0003 0.0000
            Contig791:1649 G C 0.9406 0.0594 0.0000 0.9693 0.0307 0.0000 0.0000
            Contig791:4084 A C 0.9980 0.0020 0.0000 0.9844 0.0156 0.0000 0.0000

        Any other family size uses the non-likelihood (unphased, trios, pairs)
        format (each sample ID repeated twice in the header):
            I id sample1 sample1 sample2 sample2
            A diabetes 1 1 2 2
            M rs12082861 C C C C
            M rs4912233 T C C C

        The markers file has this format (markerID, position, alleleA, alleleB):
            Contig791:1086 1086 C A

        Returns a PassingData with attributes familySize2BeagleFileHandler
        (family size -> opened MatrixFile) and markersFile.
        """
        sys.stderr.write(
            "Opening beagle files (outputFnamePrefix =%s) to write ..." %
            (outputFnamePrefix))
        handlerByFamilySize = {}
        for familySize, sampleIDList in \
                pedigreeFamilyData.familySize2SampleIDList.items():
            if familySize in handlerByFamilySize:
                continue
            beagleFile = MatrixFile(
                path='%s_familySize%s.bgl' % (outputFnamePrefix, familySize),
                mode='w',
                delimiter=' ')
            handlerByFamilySize[familySize] = beagleFile

            if familySize == 1:
                #likelihood format: three columns per sample
                headerRow = ['marker', 'alleleA', 'alleleB']
                noOfColumnsPerSample = 3
            else:
                #genotype format: two columns per sample
                headerRow = ['I', 'id']
                noOfColumnsPerSample = 2
            for sampleID in sampleIDList:
                headerRow.extend([sampleID] * noOfColumnsPerSample)
            beagleFile.writeHeader(headerRow)

        markersFile = MatrixFile(path='%s.markers' % (outputFnamePrefix),
                                 mode='w',
                                 delimiter=' ')

        #one file per family size, plus the markers file
        noOfFilesOpened = len(handlerByFamilySize) + 1
        sys.stderr.write("%s files outputted.\n" % (noOfFilesOpened))

        return PassingData(
            familySize2BeagleFileHandler=handlerByFamilySize,
            markersFile=markersFile)
Esempio n. 2
0
 def __init__(self, path=None, **keywords):
     """
     Initialise the underlying MatrixFile, then the empty pedigree containers.
     """
     MatrixFile.__init__(self, path=path, **keywords)
     
     #caches, unset until something computes them
     self._childNodeSet = None
     self._pedigreeGraph = None
     #lookup tables, empty until filled
     self.familySize2SampleIDList = {}
     self.familyID2MemberList = {}
Esempio n. 3
0
 def __init__(self, path=None, **keywords):
     """
     Initialise the underlying MatrixFile, then the empty haplotype containers.
     """
     MatrixFile.__init__(self, path=path, **keywords)
     self.header = None
     #key is sampleID, value is index of first haplotype
     self.col_name2index = None
     
     self.haplotypeMatrix = []
     self.locusIDList = []
     #same as col_name2index
     self.sampleID2index = {}
     self.sampleIDList = []
     
     #to store everything above . SNPData type
     self.snpData = None
	def run(self):
		"""
		Set the genotype of self.sampleID to missing at loci with poor alignment support.
		
		For every record of the input VCF (self.inputFname), reads overlapping the locus
		are fetched from self.alignmentFilename and handed to
		self.returnLocusLowMapQualityIndicator(). Two indicators are summed per locus:
			- the helper's locusLowMapQIndicator,
			- 1 if the helper's totalNoOfReads lies outside
				[alignmentMedianDepth/alignmentDepthFold, alignmentMedianDepth*alignmentDepthFold],
				0 otherwise.
		If the sum is positive, the genotype of self.sampleID is set to "./.".
		Every record (modified or not) is written to self.outputFname, and one
		tab-delimited statistics row per record is written to self.missingStatFname.
		
		The summary on stderr reports -1 as the fraction when the input holds no records.
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		
		#make sure the output folder exists
		outputDir = os.path.split(self.outputFname)[0]
		if outputDir and not os.path.isdir(outputDir):
			os.makedirs(outputDir)
		
		reader = VCFFile(inputFname=self.inputFname)
		
		alignmentFile = pysam.Samfile(self.alignmentFilename, "rb")
		
		#output VCF carries the same meta info and header as the input
		writer = VCFFile(outputFname=self.outputFname, mode='w')
		writer.metaInfoLs = reader.metaInfoLs
		writer.header = reader.header
		writer.writeMetaAndHeader()
		
		statWriter = MatrixFile(self.missingStatFname, mode='w', delimiter='\t')
		header = ["sampleID", "locusID", 'chromosome', 'start', 'stop', 'occurrence', 'missingReason', \
				'fractionOfGoodRead', 'medianMapQ', 'totalNoOfReads']
		statWriter.writeHeader(header)
		
		counter = 0
		real_counter = 0
		minDepth = self.alignmentMedianDepth/self.alignmentDepthFold
		maxDepth = self.alignmentMedianDepth*self.alignmentDepthFold
		
		for vcfRecord in reader:
			locusID = "%s_%s"%(vcfRecord.chromosome, vcfRecord.position)
			alignedReadLs = alignmentFile.fetch(vcfRecord.chromosome, vcfRecord.position-1, vcfRecord.position+1)	#start and end in fetch() are 0-based.
			locusLowMapQData = self.returnLocusLowMapQualityIndicator(alignedReadLs=alignedReadLs,\
												minMapQGoodRead=self.minMapQGoodRead, minFractionOfGoodRead=self.minFractionOfGoodRead)
			locusLowMapQIndicator = locusLowMapQData.locusLowMapQIndicator
			depth = locusLowMapQData.totalNoOfReads
			if depth>=minDepth and depth <=maxDepth:
				locusOutOfDepthIndicator = 0 	#good
			else:
				locusOutOfDepthIndicator = 1
			
			#0 = locus is fine; anything above 0 = at least one check failed
			locusLowQualityIndicator = locusOutOfDepthIndicator + locusLowMapQIndicator
			data_row = [self.sampleID, locusID, vcfRecord.chromosome, vcfRecord.position, vcfRecord.position,\
						1, locusLowQualityIndicator, locusLowMapQData.fractionOfGoodRead, \
						locusLowMapQData.medianMapQ, locusLowMapQData.totalNoOfReads]
			statWriter.writerow(data_row)
			if locusLowQualityIndicator>0:
				real_counter += 1
				#modify the VCF record
				#get sample ID column, then set its genotype missing
				vcfRecord.setGenotypeCallForOneSample(sampleID=self.sampleID, genotype="./.", convertGLToPL=True)
			#2014.1.4 output VCF record
			writer.writeVCFRecord(vcfRecord)
			counter += 1
		reader.close()
		statWriter.close()
		writer.close()
		#the alignment handle used to be left open; release it with the others
		alignmentFile.close()
		
		if counter>0:
			missingFraction = real_counter/float(counter)
		else:
			#empty input VCF: nothing to divide by, report the -1 sentinel
			missingFraction = -1
		sys.stderr.write("%s (out of %s, %s) genotypes marked missing.\n"%(real_counter, counter, \
												missingFraction))
Esempio n. 5
0
    def run(self):
        """
        Output pairwise genotype concordance among entries sharing one SNP position.

        self.readInSNPID2GenotypeVectorLs(self.inputFname).snp_pos2returnData maps
        a key (whose first two elements are chromosome and position) to a list
        of genotype vectors. For each key holding more than one vector, every
        pair of vectors is compared element by element on the 'GT' field:
        pairs in which either call is 'NA' are skipped, and the remaining calls
        match when their SNP.nt2number codes are equal. One row
        (chromosome, position, noOfMatches, noOfTotal, concordance) is written
        per pair to self.outputFname; concordance is -1 when no comparable
        call exists.

        The stderr summary reports -1 as the fraction when the input is empty.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        #make sure the output folder exists
        outputDir = os.path.split(self.outputFname)[0]
        if outputDir and not os.path.isdir(outputDir):
            os.makedirs(outputDir)

        snp_pos2genotypeVectorLs = self.readInSNPID2GenotypeVectorLs(
            self.inputFname).snp_pos2returnData

        writer = MatrixFile(self.outputFname, mode='w', delimiter='\t')
        header = [
            'chromosome', 'position', 'noOfMatches', 'noOfTotal', 'concordance'
        ]
        writer.writeHeader(header)

        counter = 0
        real_counter = 0
        no_of_pairs = 0
        for key in sorted(snp_pos2genotypeVectorLs):
            counter += 1
            chromosome, position = key[:2]
            genotypeVectorLs = snp_pos2genotypeVectorLs.get(key)
            noOfVectors = len(genotypeVectorLs)
            if noOfVectors <= 1:
                #nothing to compare against
                continue
            real_counter += 1
            #all unordered pairs (k, l) with k < l
            for k in range(noOfVectors - 1):
                genotypeVector0 = genotypeVectorLs[k]
                for l in range(k + 1, noOfVectors):
                    genotypeVector1 = genotypeVectorLs[l]
                    no_of_pairs += 1
                    noOfMatches = 0
                    noOfTotal = 0
                    #index-based on purpose: a shorter second vector raises
                    # rather than being silently truncated
                    for j in range(len(genotypeVector0)):
                        call1 = genotypeVector0[j]['GT']
                        call2 = genotypeVector1[j]['GT']
                        if call1 == 'NA' or call2 == 'NA':
                            continue
                        noOfTotal += 1
                        if SNP.nt2number[call1] == SNP.nt2number[call2]:
                            noOfMatches += 1
                    if noOfTotal > 0:
                        concordance = float(noOfMatches) / float(noOfTotal)
                    else:
                        concordance = -1
                    writer.writerow([
                        chromosome, position, noOfMatches, noOfTotal,
                        concordance
                    ])
        writer.close()

        if counter > 0:
            multiEntryFraction = real_counter / float(counter)
        else:
            #no SNP was read: avoid dividing by zero, use the -1 sentinel
            multiEntryFraction = -1
        sys.stderr.write("%s (out of %s, %s) snps have >1 same-position entries. %s pairs.\n"%(real_counter, counter, \
                                                multiEntryFraction, no_of_pairs))
 def __init__(self, path=None, **keywords):
     """
     Initialise the underlying MatrixFile, then reset all interval summary fields.
     """
     MatrixFile.__init__(self, path=path, **keywords)
     
     #raw interval data
     self.interval_ls = []
     self.interval_value_ls = []
     self.interval_length_ls = []
     
     #summary counts
     self.no_of_intervals = 0
     self.chromosome_size = 0
     
     #summary of interval values, unset until computed
     self.min_interval_value = None
     self.max_interval_value = None
     self.median_interval_value = None
     
     #summary of interval lengths, unset until computed
     self.min_interval_length = None
     self.max_interval_length = None
     self.median_interval_length = None
 def getLocusID2MissingFraction(self, inputFname=None):
     """
     Read a statistics file and return the lowest value seen per locus.
     
     The file must have the columns "locusID" and "occurrence_byFixedValue".
     The first two '_'-separated fields of locusID are taken as chromosome
     and start. The returned dictionary maps (chromosome, start, start) to
     the smallest "occurrence_byFixedValue" found for that locus.
     """
     sys.stderr.write("Reading in the missing statistics from %s ... "%(inputFname))
     locusID2Stat = {}
     
     reader = MatrixFile(path=inputFname)
     reader.constructColName2IndexFromHeader()
     locusIDIndex = reader.getColIndexGivenColHeader("locusID")
     statIndex = reader.getColIndexGivenColHeader("occurrence_byFixedValue")
     noOfRows = 0
     for row in reader:
         chromosome, start = row[locusIDIndex].split('_')[:2]
         start = int(start)
         stat = float(row[statIndex])
         
         key = (chromosome, start, start)
         #take lowest value; the stored one wins unless the new one is strictly smaller
         locusID2Stat[key] = min(locusID2Stat.get(key, stat), stat)
         noOfRows += 1
     del reader
     sys.stderr.write(" %s unique loci with missing fraction out of %s total loci.\n"%(len(locusID2Stat), noOfRows))
     return locusID2Stat
    def readInSwitchDensity(self, inputFname=None):
        """
        Read the switch density from the first data row of inputFname.

        Only the "noOfSwitchesPerLocus" column of the first row is used.
        The returned PassingData has switchDensity set to that value as a
        float, or to 0 if the file has no data row.
        """
        sys.stderr.write("Reading in switch density from %s ..." %
                         (inputFname))

        reader = MatrixFile(path=inputFname)
        reader.constructColName2IndexFromHeader()
        densityColumnIndex = reader.getColIndexGivenColHeader(
            "noOfSwitchesPerLocus")

        #default when the file has no data row
        switchDensity = 0
        for firstRow in reader:
            switchDensity = float(firstRow[densityColumnIndex])
            #only the first row matters
            break
        del reader
        return PassingData(switchDensity=switchDensity)
 def openOneInputFile(self, inputFname=None):
     """
     Open inputFname with the reader class selected by self.inputFileFormat.
     
     2 -> YHFile (table self.h5TableName), 3 -> HDF5MatrixFile,
     4 -> VCFFile, anything else -> MatrixFile.
     """
     fileFormat = self.inputFileFormat
     if fileFormat==2:
         return YHFile(inputFname, mode='r', tableName=self.h5TableName)
     if fileFormat==3:
         return HDF5MatrixFile(inputFname, mode='r')
     if fileFormat==4:
         return VCFFile(inputFname=inputFname)
     #default: plain matrix file
     return MatrixFile(inputFname)
 def outputFinalData(self,
                     outputFname,
                     key2dataLs=None,
                     delimiter=None,
                     header=None):
     """
     Write the header and the key-sorted rows of key2dataLs to outputFname.
     
     Nothing is written unless delimiter is non-empty. The header is output
     whenever it is given, independently of key2dataLs. Each data row is the
     key (expanded into a list) followed by its data list.
     """
     writer = MatrixFile(path=outputFname, delimiter=delimiter, mode='w')
     if header and delimiter:
         writer.writerow(header)
     if key2dataLs and delimiter:
         for key in sorted(key2dataLs):
             rowOfKeyAndData = list(key) + key2dataLs.get(key)
             writer.writerow(rowOfKeyAndData)
     writer.close()
    def run(self):
        """
        Output how much genome and how many loci remain below each switch frequency.

        The rows returned by self.readInStats() are sorted in descending
        order (switchFrequency first). Walking down that list, the span and
        locus count of each row are subtracted from the running totals, and
        one output row is written per input row: the row's switch frequency,
        the remaining span, its fraction of totalSpan, the remaining number
        of loci and its fraction of totalNoOfLoci.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        #make sure the output folder exists
        outputDir = os.path.split(self.outputFname)[0]
        if outputDir and not os.path.isdir(outputDir):
            os.makedirs(outputDir)

        switchPointData = self.readInStats(inputFname=self.inputFname)

        sys.stderr.write("Processing data ...")
        writer = MatrixFile(self.outputFname, mode='w')
        writer.writeHeader([
            "maxSwitchFrequency", "genomeCovered", 'genomeCoveredFraction',
            "noOfLoci", 'noOfLociFraction'
        ])

        data_matrix = switchPointData.data_matrix
        totalSpan = switchPointData.totalSpan
        totalNoOfLoci = switchPointData.totalNoOfLoci

        #sort it based on switchFrequency, highest first
        data_matrix.sort(reverse=True)
        #what is left after removing all rows visited so far
        remainingSpan = totalSpan
        remainingNoOfLoci = totalNoOfLoci
        for switchFrequency, regionSpan, noOfLoci in data_matrix:
            remainingSpan = remainingSpan - regionSpan
            remainingNoOfLoci = remainingNoOfLoci - noOfLoci
            writer.writerow([
                switchFrequency,
                remainingSpan,
                remainingSpan / float(totalSpan),
                remainingNoOfLoci,
                remainingNoOfLoci / float(totalNoOfLoci),
            ])
        writer.close()
        sys.stderr.write(".\n")
 def setup(self, **keywords):
     """
     do not open the file if it's a png file.
     run before anything is run.
     """
     writer = None
     if self.outputFileFormat in [1,4]:
         suffix = os.path.splitext(self.outputFname)[1]
         if self.outputFname and suffix!='.png':
             writer = MatrixFile(self.outputFname, mode='w', delimiter='\t')
     else:
         #HDF5MatrixFile
         #can't generate HDF5MatrixFile, because it needs dtypeList
         pass
     #pass it to the invariantPData
     self.invariantPData.writer = writer
     self.writer = writer
Esempio n. 13
0
    def getLocusNewID2mapPvalue(self, liftOverLocusMapPvalueFname=None):
        """
        Read a lift-over coordinate map and return the lowest p-value per new locus.

        The input must have the columns oldStrand, newChromosome, newStart,
        newStop and mapPvalue. The returned dictionary maps
        (newChromosome, newStart, newStop) to the smallest mapPvalue found
        for that locus.
        """
        sys.stderr.write("Reading in the coordinate map from %s ..." %
                         (liftOverLocusMapPvalueFname))
        locusNewID2mapPvalue = {}
        reader = MatrixFile(path=liftOverLocusMapPvalueFname)
        reader.constructColName2IndexFromHeader()
        columnIndexOf = reader.getColIndexGivenColHeader
        strandIndex = columnIndexOf("oldStrand")
        newChromosomeIndex = columnIndexOf("newChromosome")
        newStartIndex = columnIndexOf("newStart")
        newStopIndex = columnIndexOf("newStop")
        mapPvalueIndex = columnIndexOf("mapPvalue")
        noOfRows = 0
        for row in reader:
            #the strand is read but not used
            strand = row[strandIndex]
            key = (row[newChromosomeIndex], int(row[newStartIndex]),
                   int(row[newStopIndex]))
            mapPvalue = float(row[mapPvalueIndex])
            #take lowest value; the stored one wins unless the new one is strictly smaller
            locusNewID2mapPvalue[key] = min(
                locusNewID2mapPvalue.get(key, mapPvalue), mapPvalue)
            noOfRows += 1
        del reader
        sys.stderr.write(
            "%s unique loci with map p-value out of %s total loci.\n" %
            (len(locusNewID2mapPvalue), noOfRows))
        return locusNewID2mapPvalue
    def readInStats(self, inputFname=None):
        """
        Read switch frequency, region span and locus count from inputFname.

        Columns used: "noOfSwitchPoints_by_noOfLociWithUniqueHit",
        "regionSpan" and "#sitesInInput2". Rows in which any of the three
        cells is empty are skipped. Returns a PassingData with data_matrix
        (a list of [switchFrequency, regionSpan, noOfLoci]), totalSpan and
        totalNoOfLoci (sums over the kept rows).
        """
        sys.stderr.write("Reading stats from %s ..." % (inputFname))

        reader = MatrixFile(inputFname)
        reader.constructColName2IndexFromHeader()
        switchFrequencyIndex = reader.getColIndexGivenColHeader(
            "noOfSwitchPoints_by_noOfLociWithUniqueHit")
        regionSpanIndex = reader.getColIndexGivenColHeader("regionSpan")
        noOfLociIndex = reader.getColIndexGivenColHeader("#sitesInInput2")

        data_matrix = []
        totalSpan = 0
        totalNoOfLoci = 0
        noOfRows = 0
        for row in reader:
            noOfRows += 1
            switchFrequencyCell = row[switchFrequencyIndex]
            regionSpanCell = row[regionSpanIndex]
            noOfLociCell = row[noOfLociIndex]
            if not (switchFrequencyCell and regionSpanCell and noOfLociCell):
                #at least one cell is empty
                continue
            switchFrequency = float(switchFrequencyCell)
            #counts go through float() first, then int()
            regionSpan = int(float(regionSpanCell))
            noOfLoci = int(float(noOfLociCell))
            data_matrix.append([switchFrequency, regionSpan, noOfLoci])
            totalSpan += regionSpan
            totalNoOfLoci += noOfLoci
        reader.close()
        sys.stderr.write(" %s valid entries (from %s rows) with totalSpan=%s, totalNoOfLoci=%s.\n"%\
            (len(data_matrix), noOfRows, totalSpan, totalNoOfLoci))
        return PassingData(data_matrix=data_matrix,
                           totalSpan=totalSpan,
                           totalNoOfLoci=totalNoOfLoci)
Esempio n. 15
0
 def outputOverlapSites(self, overlapping_sites_set=None, outputFname=None):
     """
     Write the overlapping sites, sorted, to a tab-delimited file.
     
     overlapping_sites_set is a set of (chromosome, pos) tuples.
     output is tab-delimited, 3-column, with the header
     chromosome, position, random. The last column is always 0 to mimic
     output of CalculateSNPMismatchRateOfTwoVCF.py:
         chromosome	position	0
     """
     sys.stderr.write("Outputting overlap %s sites ..." %
                      (len(overlapping_sites_set)))
     header = ['chromosome', 'position', 'random']
     overlapping_sites_list = sorted(overlapping_sites_set)
     writer = MatrixFile(outputFname, mode='w', delimiter='\t')
     writer.writerow(header)
     for chromosome, pos in overlapping_sites_list:
         writer.writerow([chromosome, pos, 0])
     #close explicitly so the rows are flushed before this method returns
     writer.close()
     sys.stderr.write("%s sites.\n" % (len(overlapping_sites_list)))
Esempio n. 16
0
    def calculatePerSampleMismatchFraction(self, vcfFile1=None, vcfFile2=None, outputFname=None, overlapping_sample_id_set=None,\
                                        NA_call_encoding_set = set(['.', 'NA'])):
        """
        Output, for each shared sample, how often its calls agree between two VCF files.

        Both VCF files are reset and parsed here (they are not pre-loaded).
        Overlapping loci are derived from their locus_id_ls. For every
        overlapping locus and every sample in overlapping_sample_id_set, the
        two genotype calls are compared unless either of them is in
        NA_call_encoding_set. Calls are compared through nt2number, so that
        'AT' and 'TA' are the same (no phase).

        One row per sample (in sorted sample ID order) is written to the
        tab-delimited outputFname:
            sample_id, no_of_matches, no_of_non_NA_pairs, matchFraction
        matchFraction is -1 for a sample without any non-NA pair.

        NA_call_encoding_set is only used for membership tests and is never
        modified, so sharing the default set between calls is harmless.
        """
        sys.stderr.write(
            "Finding matches for each sample at overlapping sites ...")
        writer = MatrixFile(outputFname, mode='w', delimiter='\t')
        header = [
            'sample_id', 'no_of_matches', 'no_of_non_NA_pairs', 'matchFraction'
        ]
        overlapping_sample_id_list = sorted(overlapping_sample_id_set)
        no_of_samples_to_compare = len(overlapping_sample_id_list)

        vcfFile1._resetInput()
        vcfFile1.parseFile()
        vcfFile2._resetInput()
        vcfFile2.parseFile()

        overlapping_sites_set = set(vcfFile1.locus_id_ls) & set(
            vcfFile2.locus_id_ls)
        sys.stderr.write(" %s overlapping loci, " %
                         (len(overlapping_sites_set)))

        #The column of a sample does not depend on the locus: look it up once
        # (after parsing), not once per locus.
        col_index_pair_ls = [(vcfFile1.sample_id2index.get(sample_id),
                              vcfFile2.sample_id2index.get(sample_id))
                             for sample_id in overlapping_sample_id_list]

        no_of_matches_per_sample_ls = [0] * no_of_samples_to_compare
        no_of_non_NA_pairs_per_sample_ls = [0] * no_of_samples_to_compare

        for locus_id in overlapping_sites_set:
            call_row1 = vcfFile1.genotype_call_matrix[
                vcfFile1.locus_id2row_index[locus_id]]
            call_row2 = vcfFile2.genotype_call_matrix[
                vcfFile2.locus_id2row_index[locus_id]]
            for j, (col_index1, col_index2) in enumerate(col_index_pair_ls):
                call1 = call_row1[col_index1]
                call2 = call_row2[col_index2]
                if call1 in NA_call_encoding_set or call2 in NA_call_encoding_set:
                    continue
                no_of_non_NA_pairs_per_sample_ls[j] += 1
                #2013.07.03 bugfix, 'AT' and 'TA' should be same. no phase
                if nt2number[call1] == nt2number[call2]:
                    no_of_matches_per_sample_ls[j] += 1

        writer.writerow(header)
        for j in range(no_of_samples_to_compare):
            no_of_non_NA_pairs = no_of_non_NA_pairs_per_sample_ls[j]
            if no_of_non_NA_pairs > 0:
                matchFraction = no_of_matches_per_sample_ls[j] / float(
                    no_of_non_NA_pairs)
            else:
                #no comparable call for this sample
                matchFraction = -1
            writer.writerow([overlapping_sample_id_list[j],
                             no_of_matches_per_sample_ls[j],
                             no_of_non_NA_pairs, matchFraction])
        #close explicitly (rather than "del writer") so the output is flushed
        writer.close()
        sys.stderr.write("%s samples.\n" % (no_of_samples_to_compare))
    def traverse(self):
        """
        Collect the value columns of every input file into one key -> data table.

        For each file in self.inputFnameLs:
          - a missing file either ends the program with exit code 3 (if
            self.exitNonZeroIfAnyInputFileInexistent is set) or is skipped;
          - the file is opened through utils.openGzipFile(); the delimiter is
            guessed from the file when self.inputDelimiter is empty;
          - the first row is passed to self.handleNewHeader() together with
            newHeader and valueColumnLs (presumably filled in by that method;
            the file's rows are only processed if valueColumnLs is non-empty
            afterwards). If self.noHeader is set, the file is rewound so that
            the first row is processed as data as well;
          - every row is passed to self.handleValueColumns();
          - keys not visited in this file get one '' appended per value
            column, so that all data lists keep the same length.

        Errors while opening a file, reading its header or handling a row are
        logged and do not stop the traversal. Only Exception subclasses are
        caught, so KeyboardInterrupt and SystemExit still propagate.

        Returns a PassingData with key2dataLs, delimiter (self.inputDelimiter)
        and header (None if self.noHeader is set).
        """
        import traceback

        newHeader = []
        key2dataLs = {}
        #key is the keyColumn,
        #  dataLs corresponds to the sum of each column from valueColumnLs
        noOfDataColumnsFromPriorFiles = 0
        for inputFname in self.inputFnameLs:
            if not os.path.isfile(inputFname):
                if self.exitNonZeroIfAnyInputFileInexistent:
                    logging.error(f'{inputFname} does not exist.')
                    sys.exit(3)
                continue
            reader = None
            try:
                inputFile = utils.openGzipFile(inputFname)
                if self.inputDelimiter is None or self.inputDelimiter == '':
                    self.inputDelimiter = figureOutDelimiter(inputFile)
                reader = MatrixFile(file_handle=inputFile,
                                    delimiter=self.inputDelimiter)
            except Exception:
                logging.error(f'Except type: {sys.exc_info()}')
                traceback.print_exc()

            valueColumnLs = []
            try:
                header = next(reader)
                self.handleNewHeader(header,
                                     newHeader,
                                     self.keyColumnLs,
                                     valueColumnLs,
                                     keyColumnSet=self.keyColumnSet)
                if self.noHeader:
                    #the first row is data: rewind and read it again as such
                    inputFile.seek(0)
                    reader = MatrixFile(file_handle=inputFile,
                                        delimiter=self.inputDelimiter)
            except Exception:
                #in case something wrong (i.e. file is empty, or it could not
                # be opened above and reader is still None)
                logging.error(f'Except type: {sys.exc_info()}')
                traceback.print_exc()

            if reader is not None and valueColumnLs:
                visitedKeySet = set()
                for row in reader:
                    try:
                        self.handleValueColumns(row,
                                                key2dataLs=key2dataLs,
                                                keyColumnLs=self.keyColumnLs,
                                                valueColumnLs=valueColumnLs,
                                                noOfDataColumnsFromPriorFiles=
                                                noOfDataColumnsFromPriorFiles,
                                                visitedKeySet=visitedKeySet)
                    except Exception:
                        logging.error(f'Ignore this row: {row}.')
                        logging.error(f'Except type: {sys.exc_info()}')
                        traceback.print_exc()
                del reader
                #append empty data to keys who are missing in the current file.
                emptyDataLs = [''] * len(valueColumnLs)
                for key in set(key2dataLs.keys()) - visitedKeySet:
                    key2dataLs[key].extend(emptyDataLs)
            noOfDataColumnsFromPriorFiles += len(valueColumnLs)
        if self.noHeader:
            newHeader = None
        returnData = PassingData(key2dataLs=key2dataLs,
                                 delimiter=self.inputDelimiter,
                                 header=newHeader)
        return returnData
Esempio n. 18
0
    def run(self):
        """
        Append family-structure counts and per-family Mendel error rates to each input row.

        The input (self.inputFname) must have the columns "meanMendelError",
        "sampled_base_count" and "sumOfMendelError". Family counts come from
        the pedigree file (self.pedigreeFname), grouped by the number of
        parents (2, 1, 0). For each input row, one tab-delimited row is
        written to self.outputFname. The two error rates are the row's mean
        and summed Mendel error divided by the number of two-parent families,
        or -1 if there is no such family.
        """

        if self.debug:
            import pdb
            pdb.set_trace()

        reader = MatrixFile(path=self.inputFname)
        reader.constructColName2IndexFromHeader()

        meanMendelErrorIndex = reader.getColIndexGivenColHeader(
            "meanMendelError")
        noOfLociIndex = reader.getColIndexGivenColHeader("sampled_base_count")
        sumOfMendelErrorIndex = reader.getColIndexGivenColHeader(
            "sumOfMendelError")

        plinkPedigreeFile = PlinkPedigreeFile(path=self.pedigreeFname)
        familyStructureData = plinkPedigreeFile.getFamilyStructurePlinkWay()

        twoParentFamilyCountData = self.getNoOfFamiliesAndKidsGivenParentSetSize(
            noOfParents2FamilyData=familyStructureData.noOfParents2FamilyData,
            parentSetSize=2)
        singleParentFamilyCountData = self.getNoOfFamiliesAndKidsGivenParentSetSize(
            noOfParents2FamilyData=familyStructureData.noOfParents2FamilyData,
            parentSetSize=1)
        zeroParentFamilyCountData = self.getNoOfFamiliesAndKidsGivenParentSetSize(
            noOfParents2FamilyData=familyStructureData.noOfParents2FamilyData,
            parentSetSize=0)
        #same order as the column groups in the header below
        familyCountDataLs = [twoParentFamilyCountData,
                             singleParentFamilyCountData,
                             zeroParentFamilyCountData]

        writer = MatrixFile(self.outputFname, mode='w', delimiter='\t')
        header = ["ID", "noOfTotalLoci"]
        header += ["noOfTwoParentFamilies", "noOfParentsInTwoParentFamilies",
                   "noOfKidsInTwoParentFamilies",
                   "noOfIndividualsInTwoParentFamilies"]
        header += ["noOfSingleParentFamilies",
                   "noOfParentsInSingleParentFamilies",
                   "noOfKidsInSingleParentFamilies",
                   "noOfIndividualsInSingleParentFamilies"]
        header += ["noOfZeroParentFamilies", "noOfParentsInZeroParentFamilies",
                   "noOfKidsInZeroParentFamilies",
                   "noOfIndividualsInZeroParentFamilies"]
        header += ["noOfTotalMendelErrors",
                   "noOfMendelErrorsPerLocusPerNuclearFamily",
                   "noOfMendelErrorsPerNuclearFamily"]
        writer.writeHeader(header)
        for row in reader:
            meanMendelError = float(row[meanMendelErrorIndex])
            noOfLoci = int(row[noOfLociIndex])
            sumOfMendelError = int(row[sumOfMendelErrorIndex])
            #nuclear family = family with both parents
            noOfNuclearFamilies = twoParentFamilyCountData.noOfFamilies
            if noOfNuclearFamilies > 0:
                divisor = float(noOfNuclearFamilies)
                errorsPerLocusPerFamily = meanMendelError / divisor
                errorsPerFamily = sumOfMendelError / divisor
            else:
                errorsPerLocusPerFamily = -1
                errorsPerFamily = -1

            data_row = [row[0], noOfLoci]
            for familyCountData in familyCountDataLs:
                data_row.extend([familyCountData.noOfFamilies,
                                 familyCountData.noOfParents,
                                 familyCountData.noOfKids,
                                 familyCountData.noOfIndividuals])
            data_row.extend([sumOfMendelError, errorsPerLocusPerFamily,
                             errorsPerFamily])
            writer.writerow(data_row)

        plinkPedigreeFile.close()
        reader.close()
        writer.close()
Esempio n. 19
0
    def calculateOverlappingSites(self, vcfFile1=None, vcfFile2=None, outputFname=None, overlappingSitesOutputFname=None,\
        chromosome=None, chrLength=None):
        """
        Output the site overlap statistics of two VCF files for one chromosome.

        2013.07.17 vcf files are no longer pre-loaded. Locus ids, taken as
        (row[0], row[1]) of every row of vcfFile.reader, are read in first.

        One header row and one data row are written to the tab-delimited
        outputFname: chromosome, chrLength, number of sites in each input,
        number of overlapping sites, its fraction over the union / input 1 /
        input 2, and the overlap count divided by
        (normalizing constant * chrLength). Any ratio whose denominator is
        not usable is reported as -1; that includes the last column when
        chrLength is None or 0.

        If overlappingSitesOutputFname is given, the overlapping sites are
        also written out through self.outputOverlapSites().

        Returns a PassingData with overlapping_sample_id_set (samples found
        in both files) and overlapping_sites_set.
        """
        def ratioOrSentinel(numerator, denominator):
            #-1 marks "not computable"
            if denominator > 0:
                return numerator / float(denominator)
            return -1

        writer = MatrixFile(outputFname, mode='w', delimiter='\t')
        header = ['#chromosome', 'length', '#sitesInInput1', '#sitesInInput2', '#overlapping', 'overlappingOverTotal', \
                'overlappingOverInput1', 'overlappingOverInput2', '#segregatingSitesNormalized', ]

        vcf1_locus_id_list = [(row[0], row[1]) for row in vcfFile1.reader]
        vcf2_locus_id_list = [(row[0], row[1]) for row in vcfFile2.reader]

        no_of_sites_of_input1 = len(vcf1_locus_id_list)
        no_of_sites_of_input2 = len(vcf2_locus_id_list)
        overlapping_sites_set = set(vcf1_locus_id_list) & set(
            vcf2_locus_id_list)
        if overlappingSitesOutputFname:
            self.outputOverlapSites(
                overlapping_sites_set=overlapping_sites_set,
                outputFname=overlappingSitesOutputFname)

        no_of_overlapping_sites = len(overlapping_sites_set)
        #size of the union of the two site lists
        no_of_total_sites = no_of_sites_of_input1 + no_of_sites_of_input2 - no_of_overlapping_sites
        overlapping_fraction = ratioOrSentinel(no_of_overlapping_sites,
                                               no_of_total_sites)
        overlappingOverInput1 = ratioOrSentinel(no_of_overlapping_sites,
                                                no_of_sites_of_input1)
        overlappingOverInput2 = ratioOrSentinel(no_of_overlapping_sites,
                                                no_of_sites_of_input2)

        no_of_samples = len(vcfFile1.sample_id2index)
        no_of_samples_in_vcf2 = len(vcfFile2.sample_id2index)
        overlapping_sample_id_set = set(vcfFile1.sample_id2index.keys()) & set(
            vcfFile2.sample_id2index.keys())

        if no_of_samples != no_of_samples_in_vcf2:
            sys.stderr.write("Warning: sample size in %s is %s, in %s is %s. not matching.\n"%\
                            (vcfFile1.inputFname, no_of_samples, vcfFile2.inputFname, no_of_samples_in_vcf2))

        #exclude the ref sample in the 1st column
        # NOTE(review): original comment; why no_of_samples*2-1 is the right
        # argument cannot be confirmed from this method alone.
        if no_of_samples > 1:
            normalizingConstant = float(
                utils.sumOfReciprocals(no_of_samples * 2 - 1))
        else:
            normalizingConstant = 1
        if chrLength:
            noOfSegregatesSitesNormalized = no_of_overlapping_sites / (
                normalizingConstant * chrLength)
        else:
            #chrLength is None (the default) or 0: use the same -1 sentinel
            # as the other ratios rather than crash
            noOfSegregatesSitesNormalized = -1

        writer.writerow(header)
        writer.writerow([chromosome, chrLength, no_of_sites_of_input1, no_of_sites_of_input2, no_of_overlapping_sites, \
                        overlapping_fraction, overlappingOverInput1, overlappingOverInput2, \
                        noOfSegregatesSitesNormalized])
        #close explicitly (rather than "del writer") so the output is flushed
        writer.close()
        return PassingData(overlapping_sample_id_set=overlapping_sample_id_set,
                           overlapping_sites_set=overlapping_sites_set)
Esempio n. 20
0
    def setup(self, **keywords):
        """
        2012.10.15
            Run once, before any input is processed.

        Steps, in order:
            1. Delegate to AbstractMatrixFileWalker.setup().
            2. Load the adjacency list in self.pedigreeKinshipFilePath
               (row ID in column 0, column ID in column 1, value in column 2,
               no header) into self.ibdData.
            3. Build a dictionary from self.individualAlignmentCoverageFname
               (key: column 0, value: column 1).
            4. Collect the sample IDs of every VCF file in self.inputFnameLs.
            5. Restrict the pedigree graph of self.pedigreeFname to those
               samples and record, for every member of every connected
               component, its family size, in/out degree and coverage.
            6. Score every individual as normalized out-degree plus
               normalized individual coverage.

        Attributes set on self:
            ibdData: return value of SNP.readAdjacencyListDataIntoMatrix().
            individualID2probabilityMass: individual ID -> importance score.
            individualID2HaplotypeData: sample ID -> None (placeholder;
                haplotypes are not loaded here).

        Arguments:
            **keywords: passed through unchanged to the parent setup().
        """
        AbstractMatrixFileWalker.setup(self, **keywords)

        # read in the IBD check result
        self.ibdData = SNP.readAdjacencyListDataIntoMatrix(
            inputFname=self.pedigreeKinshipFilePath,
            rowIDHeader=None, colIDHeader=None,
            rowIDIndex=0, colIDIndex=1,
            dataHeader=None, dataIndex=2, hasHeader=False)

        # read in the alignment coverage data
        alignmentCoverageFile = MatrixFile(
            path=self.individualAlignmentCoverageFname)
        alignmentCoverageFile.constructColName2IndexFromHeader()
        alignmentReadGroup2coverageLs = alignmentCoverageFile.constructDictionary(
            keyColumnIndexList=[0], valueColumnIndexList=[1])
        alignmentCoverageFile.close()

        sys.stderr.write(
            "Reading in all samples from %s VCF input files ... \n" %
            (len(self.inputFnameLs)))
        # Only the sample IDs of each VCF file are collected; the haplotype
        # data itself is left as None.
        # NOTE(review): the VCFFile objects are not explicitly closed here --
        # confirm whether VCFFile needs a close() call.
        individualID2HaplotypeData = {}
        for inputFname in self.inputFnameLs:
            vcfFile = VCFFile(inputFname=inputFname)
            for individualID in vcfFile.getSampleIDList():
                individualID2HaplotypeData[individualID] = None
        sys.stderr.write("%s individuals total.\n" %
                         (len(individualID2HaplotypeData)))

        # construct individualID2familyContext from the pedigree
        sys.stderr.write("Constructing individualID2pedigreeContext ...")
        plinkPedigreeFile = PlinkPedigreeFile(path=self.pedigreeFname)
        pGraph = plinkPedigreeFile.pedigreeGraph
        # shrink the graph to only individuals with data
        pGraph = nx.subgraph(pGraph, individualID2HaplotypeData.keys())

        # nx.connected_component_subgraphs() was removed in NetworkX 2.4;
        # fall back to connected_components() + subgraph() when it is absent.
        undirectedGraph = pGraph.to_undirected()
        if hasattr(nx, 'connected_component_subgraphs'):
            cc_subgraph_list = nx.connected_component_subgraphs(
                undirectedGraph)
        else:
            cc_subgraph_list = (
                undirectedGraph.subgraph(nodeSet)
                for nodeSet in nx.connected_components(undirectedGraph))

        individualID2familyContext = {}
        outDegreeContainer = NumberContainer(minValue=0)
        # familySizeContainer and familyCoverageContainer are filled below but
        # not used in the final score.
        familySizeContainer = NumberContainer(minValue=0)
        individualCoverageContainer = NumberContainer(minValue=0)
        familyCoverageContainer = NumberContainer(minValue=0)
        for cc_subgraph in cc_subgraph_list:
            # each connected component is treated as one family
            # (assuming each family is a two-generation trio/nuclear family)
            familySize = len(cc_subgraph)
            familySizeContainer.addOneValue(familySize)

            familyCoverage = 0
            for n in cc_subgraph:
                individualCoverage = float(
                    self.getIndividualCoverage(
                        individualID=n,
                        alignmentReadGroup2coverageLs=alignmentReadGroup2coverageLs))
                individualCoverageContainer.addOneValue(individualCoverage)
                familyCoverage += individualCoverage
                # degrees are taken from the directed pedigree graph
                in_degree = pGraph.in_degree(n)
                out_degree = pGraph.out_degree(n)
                outDegreeContainer.addOneValue(out_degree)
                if n in individualID2familyContext:
                    sys.stderr.write(
                        "Node %s already in individualID2familyContext.\n" %
                        (n))
                else:
                    # familyCoverage is only known once the whole family has
                    # been visited; it is filled in right after this loop.
                    individualID2familyContext[n] = PassingData(
                        familySize=familySize,
                        in_degree=in_degree,
                        out_degree=out_degree,
                        individualCoverage=individualCoverage,
                        familyCoverage=None)
            familyCoverageContainer.addOneValue(familyCoverage)
            # set the family coverage for each member
            for n in cc_subgraph:
                individualID2familyContext[n].familyCoverage = familyCoverage
        plinkPedigreeFile.close()
        sys.stderr.write("%s individuals.\n" %
                         (len(individualID2familyContext)))

        # weigh each unique individual based on its sequencing coverage +
        # no of offspring => probability mass for each individual
        sys.stderr.write(
            "Weighing each individual , assigning probability mass  ...")
        individualID2probabilityMass = {}
        for individualID, familyContext in individualID2familyContext.items():
            # Normalize the individual's own out-degree: outDegreeContainer
            # was filled with out-degrees, so feeding it familySize (as the
            # code previously did) mixed two unrelated scales.
            outDegreeQuotient = outDegreeContainer.normalizeValue(
                familyContext.out_degree)
            individualCoverageQuotient = individualCoverageContainer.normalizeValue(
                familyContext.individualCoverage)
            individualID2probabilityMass[individualID] = (
                outDegreeQuotient + individualCoverageQuotient)
        sys.stderr.write(" %s IDs with probability mass assigned.\n" %
                         (len(individualID2probabilityMass)))

        self.individualID2probabilityMass = individualID2probabilityMass
        self.individualID2HaplotypeData = individualID2HaplotypeData
    def traverse(self):
        """
        Walk every file in self.inputFnameLs and accumulate its rows.

        For each input file that exists, the file is opened through
        utils.openGzipFile(), its delimiter is detected with
        figureOutDelimiter(), the first row is handed to
        self.handleNewHeader() and every following row to
        self.handleValueColumns(). When self.noHeader is set, the file is
        rewound after the header call so the first row is also processed as
        data.

        Processing is best effort: a file that cannot be opened is logged and
        skipped, a failure while handling the header is logged and the rows
        are still processed, and a row that fails is logged and ignored.

        Returns:
            PassingData with
                key2dataLs: dictionary filled by self.handleValueColumns().
                delimiter: the most recently detected delimiter, or None if
                    none was detected.
                header: the list filled by self.handleNewHeader(), or None
                    when self.noHeader is set.

        Raises:
            SystemExit: exit code 3, if an input file does not exist and
                self.exitNonZeroIfAnyInputFileInexistent is true.
        """
        import traceback

        newHeader = []
        # key is the keyColumn,
        #  dataLs corresponds to the sum of each column from valueColumnLs
        key2dataLs = {}
        delimiter = None
        for inputFname in self.inputFnameLs:
            if not os.path.isfile(inputFname):
                if self.exitNonZeroIfAnyInputFileInexistent:
                    sys.exit(3)
                continue

            inputFile = None
            try:
                try:
                    inputFile = utils.openGzipFile(inputFname)
                    delimiter = figureOutDelimiter(inputFile)
                    reader = MatrixFile(file_handle=inputFile,
                                        delimiter=delimiter)
                except Exception:
                    # Without a reader nothing below can work; log once and
                    # move on to the next file.
                    logging.error(f'Except type: {sys.exc_info()}')
                    traceback.print_exc()
                    continue

                try:
                    header = next(reader)
                    self.handleNewHeader(header,
                                         newHeader,
                                         self.keyColumnLs,
                                         self.valueColumnLs,
                                         keyColumnSet=self.keyColumnSet)
                    if self.noHeader:
                        # the first row is data too: start over from the top
                        inputFile.seek(0)
                        reader = MatrixFile(file_handle=inputFile,
                                            delimiter=delimiter)
                except Exception:
                    # e.g. the file is empty; keep going, the row loop below
                    # simply finds nothing to read in that case
                    logging.error(f'Except type: {sys.exc_info()}')
                    traceback.print_exc()

                for row in reader:
                    try:
                        self.handleValueColumns(
                            row,
                            key2dataLs=key2dataLs,
                            keyColumnLs=self.keyColumnLs,
                            valueColumnLs=self.valueColumnLs)
                    except Exception:
                        # a bad row must not abort the whole file
                        logging.error(f'Ignore this row: {row}.')
                        logging.error(f'Except type: {sys.exc_info()}')
                        traceback.print_exc()
                del reader
            finally:
                # release the file handle whether or not processing succeeded
                if inputFile is not None:
                    inputFile.close()
        if self.noHeader:
            newHeader = None
        returnData = PassingData(key2dataLs=key2dataLs,
                                 delimiter=delimiter,
                                 header=newHeader)
        return returnData