def run(self):
		"""
		Mark one sample's genotype as missing at low-quality loci.

		For every record of the input VCF, the reads fetched around the locus
		from the alignment file are summarised by
		self.returnLocusLowMapQualityIndicator(). A locus is flagged when
		its read depth falls outside
		[alignmentMedianDepth/alignmentDepthFold,
		 alignmentMedianDepth*alignmentDepthFold]
		and/or when the low-mapping-quality indicator is non-zero. For flagged
		loci the genotype of self.sampleID is set to "./." before the record
		is written to the output VCF.

		One row per locus is also written to self.missingStatFname; its
		'missingReason' column is the sum of the out-of-depth indicator (0/1)
		and the low-mapping-quality indicator.

		A summary line is written to stderr; the fraction is reported as -1
		when the input VCF contains no records.
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		
		outputDir = os.path.split(self.outputFname)[0]
		if outputDir and not os.path.isdir(outputDir):
			os.makedirs(outputDir)
		
		reader = VCFFile(inputFname=self.inputFname)
		
		alignmentFile = pysam.Samfile(self.alignmentFilename, "rb")
		
		#the output VCF carries over the meta-info lines and header of the input
		writer = VCFFile(outputFname=self.outputFname, mode='w')
		writer.metaInfoLs = reader.metaInfoLs
		writer.header = reader.header
		writer.writeMetaAndHeader()
		
		statWriter = MatrixFile(self.missingStatFname, mode='w', delimiter='\t')
		header = ["sampleID", "locusID", 'chromosome', 'start', 'stop', 'occurrence', 'missingReason', \
				'fractionOfGoodRead', 'medianMapQ', 'totalNoOfReads']
		statWriter.writeHeader(header)
		
		counter = 0	#number of VCF records processed
		real_counter = 0	#number of records whose genotype was set to missing
		minDepth = self.alignmentMedianDepth/self.alignmentDepthFold
		maxDepth = self.alignmentMedianDepth*self.alignmentDepthFold
		
		for vcfRecord in reader:
			locusID = "%s_%s"%(vcfRecord.chromosome, vcfRecord.position)
			#start and end in fetch() are 0-based.
			#NOTE(review): the window covers two bases (position-1 and position) -- confirm that
			#	including the base after the locus is intended.
			alignedReadLs = alignmentFile.fetch(vcfRecord.chromosome, vcfRecord.position-1, vcfRecord.position+1)
			locusLowMapQData = self.returnLocusLowMapQualityIndicator(alignedReadLs=alignedReadLs,\
												minMapQGoodRead=self.minMapQGoodRead, minFractionOfGoodRead=self.minFractionOfGoodRead)
			locusLowMapQIndicator = locusLowMapQData.locusLowMapQIndicator
			depth = locusLowMapQData.totalNoOfReads
			if depth>=minDepth and depth <=maxDepth:
				locusOutOfDepthIndicator = 0 	#good
			else:
				locusOutOfDepthIndicator = 1
			
			#>0 means at least one of the two checks failed
			locusLowQualityIndicator = locusOutOfDepthIndicator + locusLowMapQIndicator
			#start and stop are both the locus position; occurrence is always 1
			data_row = [self.sampleID, locusID, vcfRecord.chromosome, vcfRecord.position, vcfRecord.position,\
						1, locusLowQualityIndicator, locusLowMapQData.fractionOfGoodRead, \
						locusLowMapQData.medianMapQ, locusLowMapQData.totalNoOfReads]
			statWriter.writerow(data_row)
			if locusLowQualityIndicator>0:
				real_counter += 1
				#modify the VCF record
				#get sample ID column, then set its genotype missing
				vcfRecord.setGenotypeCallForOneSample(sampleID=self.sampleID, genotype="./.", convertGLToPL=True)
			#2014.1.4 output VCF record
			writer.writeVCFRecord(vcfRecord)
			counter += 1
		reader.close()
		#release the alignment file handle as well; it was previously left open
		alignmentFile.close()
		statWriter.close()
		writer.close()
		#an empty input VCF must not crash the final report with a ZeroDivisionError
		if counter>0:
			fractionMarkedMissing = real_counter/float(counter)
		else:
			fractionMarkedMissing = -1
		sys.stderr.write("%s (out of %s, %s) genotypes marked missing.\n"%(real_counter, counter, \
												fractionMarkedMissing))
Ejemplo n.º 2
0
    def run(self):
        """
        Compute pairwise genotype concordance among entries that share the
        same SNP position.

        self.readInSNPID2GenotypeVectorLs() supplies a mapping from a SNP
        position key (whose first two elements are chromosome and position)
        to a list of genotype vectors. For every position with more than one
        genotype vector, each unordered pair of vectors is compared element
        by element on the 'GT' field; elements where either call is 'NA' are
        skipped, and two calls match when SNP.nt2number maps them to the same
        value.

        One tab-delimited row per pair is written to self.outputFname:
        chromosome, position, noOfMatches, noOfTotal, concordance.
        concordance is -1 when no comparable calls exist for the pair.

        A summary line is written to stderr; its fraction is reported as -1
        when the input contains no SNP positions at all.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        outputDir = os.path.split(self.outputFname)[0]
        if outputDir and not os.path.isdir(outputDir):
            os.makedirs(outputDir)

        snp_pos2genotypeVectorLs = self.readInSNPID2GenotypeVectorLs(
            self.inputFname).snp_pos2returnData

        writer = MatrixFile(self.outputFname, mode='w', delimiter='\t')
        header = [
            'chromosome', 'position', 'noOfMatches', 'noOfTotal', 'concordance'
        ]
        writer.writeHeader(header)

        counter = 0  # number of distinct SNP positions seen
        real_counter = 0  # positions with more than one genotype vector
        no_of_pairs = 0  # total number of vector pairs compared
        for key in sorted(snp_pos2genotypeVectorLs):
            counter += 1
            chromosome, position = key[:2]
            genotypeVectorLs = snp_pos2genotypeVectorLs.get(key)
            noOfVectors = len(genotypeVectorLs)
            if noOfVectors <= 1:
                # a single entry has nothing to be compared against
                continue
            real_counter += 1
            for firstIndex in range(noOfVectors - 1):
                genotypeVector0 = genotypeVectorLs[firstIndex]
                for secondIndex in range(firstIndex + 1, noOfVectors):
                    genotypeVector1 = genotypeVectorLs[secondIndex]
                    no_of_pairs += 1
                    noOfMatches = 0
                    noOfTotal = 0
                    # index into the second vector (rather than zip) so that
                    # a shorter second vector still raises instead of being
                    # silently truncated
                    for j, entry0 in enumerate(genotypeVector0):
                        call1 = entry0['GT']
                        call2 = genotypeVector1[j]['GT']
                        if call1 != 'NA' and call2 != 'NA':
                            noOfTotal += 1
                            if SNP.nt2number[call1] == SNP.nt2number[call2]:
                                noOfMatches += 1
                    if noOfTotal > 0:
                        concordance = float(noOfMatches) / float(noOfTotal)
                    else:
                        concordance = -1
                    writer.writerow([
                        chromosome, position, noOfMatches, noOfTotal,
                        concordance
                    ])
        writer.close()
        # empty input must not crash the final report with ZeroDivisionError
        if counter > 0:
            fractionWithDuplicates = real_counter / float(counter)
        else:
            fractionWithDuplicates = -1
        sys.stderr.write("%s (out of %s, %s) snps have >1 same-position entries. %s pairs.\n"%(real_counter, counter, \
                                                fractionWithDuplicates, no_of_pairs))
Ejemplo n.º 3
0
 def outputOverlapSites(self, overlapping_sites_set=None, outputFname=None):
     """
     Write the overlapping sites, sorted, to a tab-delimited file.

     overlapping_sites_set is a set of (chromosome, pos) tuples.
     output is tab-delimited, 3-column, with header
     chromosome/position/random. Last column is always 0 to mimic output of
     CalculateSNPMismatchRateOfTwoVCF.py
         chromosome	position	0

     The writer is closed before returning so that all rows are flushed to
     outputFname. Progress messages go to stderr.
     """
     sys.stderr.write("Outputting overlap %s sites ..." %
                      (len(overlapping_sites_set)))
     header = ['chromosome', 'position', 'random']
     overlapping_sites_list = sorted(overlapping_sites_set)
     writer = MatrixFile(outputFname, mode='w', delimiter='\t')
     writer.writerow(header)
     for chromosome, pos in overlapping_sites_list:
         writer.writerow([chromosome, pos, 0])
     # close explicitly; the writer was previously left open, leaving the
     # flush of buffered rows to garbage collection
     writer.close()
     sys.stderr.write("%s sites.\n" % (len(overlapping_sites_list)))
 def outputFinalData(self,
                     outputFname,
                     key2dataLs=None,
                     delimiter=None,
                     header=None):
     """
     Dump a keyed table to outputFname.

     The output file is always opened (and closed). The header row is
     written when both header and delimiter are truthy, independently of
     key2dataLs. Data rows are written when both key2dataLs and delimiter
     are truthy: keys are visited in sorted order and each row is the key's
     elements followed by the list stored under that key.
     """
     outputFile = MatrixFile(path=outputFname, delimiter=delimiter, mode='w')
     if header and delimiter:
         outputFile.writerow(header)
     if key2dataLs and delimiter:
         for rowKey in sorted(key2dataLs):
             outputFile.writerow(list(rowKey) + key2dataLs.get(rowKey))
     outputFile.close()
    def run(self):
        """
        Report how much genome span and how many loci remain as regions are
        removed in order of decreasing switch frequency.

        self.readInStats() supplies data_matrix (rows of
        switchFrequency, regionSpan, noOfLoci) together with totalSpan and
        totalNoOfLoci. data_matrix is sorted in place in descending order.
        Starting from the two totals, each row's span and locus count are
        subtracted in turn, and one output row is written per input row:
        the row's switch frequency, the remaining span, the remaining span
        as a fraction of totalSpan, the remaining number of loci, and that
        number as a fraction of totalNoOfLoci.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        targetFolder = os.path.split(self.outputFname)[0]
        if targetFolder and not os.path.isdir(targetFolder):
            os.makedirs(targetFolder)

        switchPointData = self.readInStats(inputFname=self.inputFname)

        sys.stderr.write("Processing data ...")
        writer = MatrixFile(self.outputFname, mode='w')
        writer.writeHeader([
            "maxSwitchFrequency", "genomeCovered", 'genomeCoveredFraction',
            "noOfLoci", 'noOfLociFraction'
        ])

        data_matrix = switchPointData.data_matrix
        totalSpan = switchPointData.totalSpan
        totalNoOfLoci = switchPointData.totalNoOfLoci

        # highest switch frequency first
        data_matrix.sort(reverse=True)

        # running totals of what is left after removing the rows seen so far
        remainingSpan = totalSpan
        remainingNoOfLoci = totalNoOfLoci
        for switchFrequency, regionSpan, noOfLoci in data_matrix:
            remainingSpan = remainingSpan - regionSpan
            remainingNoOfLoci = remainingNoOfLoci - noOfLoci
            writer.writerow([
                switchFrequency,
                remainingSpan,
                remainingSpan / float(totalSpan),
                remainingNoOfLoci,
                remainingNoOfLoci / float(totalNoOfLoci),
            ])
        writer.close()
        sys.stderr.write(".\n")
Ejemplo n.º 6
0
    def calculateOverlappingSites(self, vcfFile1=None, vcfFile2=None, outputFname=None, overlappingSitesOutputFname=None,\
        chromosome=None, chrLength=None):
        """
        Count the sites shared by two VCF files and write a one-row summary.

        2013.07.17 vcf files are no longer pre-loaded. read in locus ids first.

        Locus ids are the first two columns of every row yielded by
        vcfFile1.reader / vcfFile2.reader. The summary row (tab-delimited,
        preceded by a header row) written to outputFname holds: chromosome,
        chrLength, number of sites in each input, number of overlapping
        sites, overlap over the union, overlap over input 1, overlap over
        input 2, and the overlap count divided by
        (normalizing constant * chrLength). Any ratio whose denominator is
        zero (or, for the last one, whose chrLength is None/0) is reported
        as -1.

        If overlappingSitesOutputFname is given, the overlapping sites are
        also written there through self.outputOverlapSites().

        Returns a PassingData with overlapping_sample_id_set (sample ids
        present in both files) and overlapping_sites_set.
        """
        writer = MatrixFile(outputFname, mode='w', delimiter='\t')
        header = ['#chromosome', 'length', '#sitesInInput1', '#sitesInInput2', '#overlapping', 'overlappingOverTotal', \
                'overlappingOverInput1', 'overlappingOverInput2', '#segregatingSitesNormalized', ]

        # lists (not sets) so that the per-input site counts below are the
        # number of rows read
        vcf1_locus_id_list = [(row[0], row[1]) for row in vcfFile1.reader]
        vcf2_locus_id_list = [(row[0], row[1]) for row in vcfFile2.reader]

        no_of_sites_of_input1 = len(vcf1_locus_id_list)
        no_of_sites_of_input2 = len(vcf2_locus_id_list)
        overlapping_sites_set = set(vcf1_locus_id_list) & set(
            vcf2_locus_id_list)
        if overlappingSitesOutputFname:
            self.outputOverlapSites(
                overlapping_sites_set=overlapping_sites_set,
                outputFname=overlappingSitesOutputFname)

        no_of_overlapping_sites = len(overlapping_sites_set)
        no_of_total_sites = no_of_sites_of_input1 + no_of_sites_of_input2 - no_of_overlapping_sites
        if no_of_total_sites > 0:
            overlapping_fraction = no_of_overlapping_sites / float(
                no_of_total_sites)
        else:
            overlapping_fraction = -1

        if no_of_sites_of_input1 > 0:
            overlappingOverInput1 = no_of_overlapping_sites / float(
                no_of_sites_of_input1)
        else:
            overlappingOverInput1 = -1

        if no_of_sites_of_input2 > 0:
            overlappingOverInput2 = no_of_overlapping_sites / float(
                no_of_sites_of_input2)
        else:
            overlappingOverInput2 = -1

        no_of_samples = len(vcfFile1.sample_id2index)
        no_of_samples_in_vcf2 = len(vcfFile2.sample_id2index)
        overlapping_sample_id_set = set(vcfFile1.sample_id2index.keys()) & set(
            vcfFile2.sample_id2index.keys())

        if no_of_samples != no_of_samples_in_vcf2:
            sys.stderr.write("Warning: sample size in %s is %s, in %s is %s. not matching.\n"%\
                            (vcfFile1.inputFname, no_of_samples, vcfFile2.inputFname, no_of_samples_in_vcf2))

        #exclude the ref sample in the 1st column
        # NOTE(review): the sample count comes from vcfFile1 only -- confirm
        # this is intended when the two files differ in sample size.
        if no_of_samples > 1:
            normalizingConstant = float(
                utils.sumOfReciprocals(no_of_samples * 2 - 1))
        else:
            # float so that the division below is never an integer division
            normalizingConstant = 1.0
        if chrLength:
            noOfSegregatesSitesNormalized = no_of_overlapping_sites / (
                normalizingConstant * chrLength)
        else:
            # chrLength missing or zero: use the same -1 sentinel as the
            # other undefined ratios instead of raising
            noOfSegregatesSitesNormalized = -1

        writer.writerow(header)
        writer.writerow([chromosome, chrLength, no_of_sites_of_input1, no_of_sites_of_input2, no_of_overlapping_sites, \
                        overlapping_fraction, overlappingOverInput1, overlappingOverInput2, \
                        noOfSegregatesSitesNormalized])
        # close explicitly rather than relying on "del" to trigger a flush
        writer.close()
        return PassingData(overlapping_sample_id_set=overlapping_sample_id_set,
                           overlapping_sites_set=overlapping_sites_set)
Ejemplo n.º 7
0
    def calculatePerSampleMismatchFraction(self, vcfFile1=None, vcfFile2=None, outputFname=None, overlapping_sample_id_set=None,\
                                        NA_call_encoding_set = set(['.', 'NA'])):
        """
        For every shared sample, count genotype matches between two VCF
        files over their overlapping loci and write the per-sample result.

        2013.08.13 bugfix, derive overlapping_sites_set by itself, rather than use calculateOverlappingSites()
        2013.07.17 vcf files are no longer pre-loaded.
        2012.8.16

        Both VCF files are reset and re-parsed here. A pair of calls is
        counted only when neither call is in NA_call_encoding_set; the pair
        is a match when nt2number maps both calls to the same value (so
        'AT' and 'TA' are the same, no phase).

        Output (tab-delimited, with header): sample_id, no_of_matches,
        no_of_non_NA_pairs, matchFraction. matchFraction is -1 for a sample
        without any non-NA pair. Samples are listed in sorted order.

        NA_call_encoding_set's default is a single set object shared across
        calls; it is only read here, never modified.
        """
        sys.stderr.write(
            "Finding matches for each sample at overlapping sites ...")
        writer = MatrixFile(outputFname, mode='w', delimiter='\t')
        header = [
            'sample_id', 'no_of_matches', 'no_of_non_NA_pairs', 'matchFraction'
        ]
        no_of_samples_to_compare = len(overlapping_sample_id_set)

        vcfFile1._resetInput()
        vcfFile1.parseFile()
        vcfFile2._resetInput()
        vcfFile2.parseFile()

        overlapping_sites_set = set(vcfFile1.locus_id_ls) & set(
            vcfFile2.locus_id_ls)
        sys.stderr.write(" %s overlapping loci, " %
                         (len(overlapping_sites_set)))

        overlapping_sample_id_list = sorted(overlapping_sample_id_set)

        # a sample's column indices do not depend on the locus, so look them
        # up once instead of once per locus
        col_index_pair_ls = [
            (vcfFile1.sample_id2index.get(sample_id),
             vcfFile2.sample_id2index.get(sample_id))
            for sample_id in overlapping_sample_id_list
        ]

        no_of_matches_per_sample_ls = [0] * no_of_samples_to_compare
        no_of_non_NA_pairs_per_sample_ls = [0] * no_of_samples_to_compare

        for locus_id in overlapping_sites_set:
            call_row1 = vcfFile1.genotype_call_matrix[
                vcfFile1.locus_id2row_index[locus_id]]
            call_row2 = vcfFile2.genotype_call_matrix[
                vcfFile2.locus_id2row_index[locus_id]]
            for j, (col_index1, col_index2) in enumerate(col_index_pair_ls):
                call1 = call_row1[col_index1]
                call2 = call_row2[col_index2]
                if call1 not in NA_call_encoding_set and call2 not in NA_call_encoding_set:
                    no_of_non_NA_pairs_per_sample_ls[j] += 1
                    #2012.1.17 bugfix below. so that 'AG' and 'GA' are same.
                    #2013.07.03 bugfix, 'AT' and 'TA' should be same. no phase
                    if nt2number[call1] == nt2number[call2]:
                        no_of_matches_per_sample_ls[j] += 1

        # -1 marks samples without a single comparable (non-NA) pair
        matchFractionLs = [-1] * no_of_samples_to_compare
        for j in range(no_of_samples_to_compare):
            if no_of_non_NA_pairs_per_sample_ls[j] > 0:
                matchFractionLs[j] = no_of_matches_per_sample_ls[j] / float(
                    no_of_non_NA_pairs_per_sample_ls[j])

        writer.writerow(header)
        for i in range(no_of_samples_to_compare):
            data_row = [overlapping_sample_id_list[i], no_of_matches_per_sample_ls[i], no_of_non_NA_pairs_per_sample_ls[i],\
                    matchFractionLs[i]]
            writer.writerow(data_row)
        # close explicitly rather than relying on "del" to trigger a flush
        writer.close()
        sys.stderr.write("%s samples.\n" % (no_of_samples_to_compare))
Ejemplo n.º 8
0
    def run(self):
        """
        Combine per-row Mendel error statistics with family-structure counts
        from a plink pedigree file.

        Every row of self.inputFname supplies meanMendelError,
        sampled_base_count (used as the number of loci) and
        sumOfMendelError. The pedigree in self.pedigreeFname is summarised
        into counts of families, parents, kids and individuals for
        two-parent, single-parent and zero-parent families. Each output row
        (tab-delimited) repeats those counts and adds the Mendel errors
        normalised by the number of two-parent (nuclear) families; both
        normalised values are -1 when there is no two-parent family.
        """

        if self.debug:
            import pdb
            pdb.set_trace()

        reader = MatrixFile(path=self.inputFname)
        reader.constructColName2IndexFromHeader()

        meanMendelErrorIndex = reader.getColIndexGivenColHeader(
            "meanMendelError")
        noOfLociIndex = reader.getColIndexGivenColHeader("sampled_base_count")
        sumOfMendelErrorIndex = reader.getColIndexGivenColHeader(
            "sumOfMendelError")

        plinkPedigreeFile = PlinkPedigreeFile(path=self.pedigreeFname)
        familyStructureData = plinkPedigreeFile.getFamilyStructurePlinkWay()
        noOfParents2FamilyData = familyStructureData.noOfParents2FamilyData

        # one count summary per parent-set size, in the order 2, 1, 0
        twoParentFamilyCountData, singleParentFamilyCountData, zeroParentFamilyCountData = [
            self.getNoOfFamiliesAndKidsGivenParentSetSize(
                noOfParents2FamilyData=noOfParents2FamilyData,
                parentSetSize=parentSetSize)
            for parentSetSize in (2, 1, 0)
        ]

        writer = MatrixFile(self.outputFname, mode='w', delimiter='\t')
        writer.writeHeader([
            "ID",
            "noOfTotalLoci",
            "noOfTwoParentFamilies",
            "noOfParentsInTwoParentFamilies",
            "noOfKidsInTwoParentFamilies",
            "noOfIndividualsInTwoParentFamilies",
            "noOfSingleParentFamilies",
            "noOfParentsInSingleParentFamilies",
            "noOfKidsInSingleParentFamilies",
            "noOfIndividualsInSingleParentFamilies",
            "noOfZeroParentFamilies",
            "noOfParentsInZeroParentFamilies",
            "noOfKidsInZeroParentFamilies",
            "noOfIndividualsInZeroParentFamilies",
            "noOfTotalMendelErrors",
            "noOfMendelErrorsPerLocusPerNuclearFamily",
            "noOfMendelErrorsPerNuclearFamily",
        ])
        for row in reader:
            meanMendelError = float(row[meanMendelErrorIndex])
            noOfLoci = int(row[noOfLociIndex])
            sumOfMendelError = int(row[sumOfMendelErrorIndex])

            # nuclear families are the two-parent families
            noOfNuclearFamilies = twoParentFamilyCountData.noOfFamilies
            if noOfNuclearFamilies > 0:
                familyDivisor = float(noOfNuclearFamilies)
                errorsPerLocusPerFamily = meanMendelError / familyDivisor
                errorsPerFamily = sumOfMendelError / familyDivisor
            else:
                errorsPerLocusPerFamily = -1
                errorsPerFamily = -1

            data_row = [row[0], noOfLoci]
            data_row.extend([
                noOfNuclearFamilies,
                twoParentFamilyCountData.noOfParents,
                twoParentFamilyCountData.noOfKids,
                twoParentFamilyCountData.noOfIndividuals,
            ])
            for countData in (singleParentFamilyCountData,
                              zeroParentFamilyCountData):
                data_row.extend([
                    countData.noOfFamilies,
                    countData.noOfParents,
                    countData.noOfKids,
                    countData.noOfIndividuals,
                ])
            data_row.extend([
                sumOfMendelError, errorsPerLocusPerFamily, errorsPerFamily
            ])
            writer.writerow(data_row)

        plinkPedigreeFile.close()
        reader.close()
        writer.close()