Beispiel #1
0
    def openWriteBeagleFiles(self,
                             pedigreeFamilyData=None,
                             outputFnamePrefix=None):
        """
        Open one Beagle genotype file (.bgl) per distinct family size, plus a
        shared markers file, all in write mode, and return them in a
        PassingData.

        The non-likelihood (unphased, trios, pairs) Beagle format:
            I id sample1 sample1 sample2 sample2
            A diabetes 1 1 2 2
            M rs12082861 C C C C
            M rs4912233 T C C C

        The likelihood version (used when familySize == 1) is:
            marker alleleA alleleB sample1 sample1 sample1 sample2 ...
            Contig791:1086 C A 0.9693 0.0307 0.0000 0.6660 0.3338 0.0003

        The markers file has this format (markerID, position, alleleA, alleleB):
            Contig791:1086 1086 C A

        :param pedigreeFamilyData: object exposing familySize2SampleIDList,
            a dict mapping familySize -> list of sample IDs
        :param outputFnamePrefix: path prefix for every output file
        :return: PassingData(familySize2BeagleFileHandler=dict,
            markersFile=MatrixFile)
        """
        sys.stderr.write(
            "Opening beagle files (outputFnamePrefix =%s) to write ..." %
            (outputFnamePrefix))
        familySize2BeagleFileHandler = {}
        familySize2SampleIDList = pedigreeFamilyData.familySize2SampleIDList
        counter = 0
        # .items() instead of Python-2-only .iteritems(): identical iteration
        # behavior on Python 2, and also runs on Python 3.
        for familySize, sampleIDList in familySize2SampleIDList.items():
            if familySize not in familySize2BeagleFileHandler:
                tmpOutputFnamePrefix = '%s_familySize%s' % (outputFnamePrefix,
                                                            familySize)
                writer = MatrixFile(inputFname='%s.bgl' %
                                    (tmpOutputFnamePrefix),
                                    openMode='w',
                                    delimiter=' ')
                familySize2BeagleFileHandler[familySize] = writer
                if familySize == 1:
                    # Genotype-likelihood format header.
                    headerRow = ['marker', 'alleleA', 'alleleB']
                else:
                    headerRow = ['I', 'id']
                for sampleID in sampleIDList:
                    if familySize == 1:  #likelihood format has sample name replicated three times, rather than 2 times
                        headerRow.extend([sampleID] * 3)
                    else:
                        headerRow.extend([sampleID] * 2)
                writer.writeHeader(headerRow)
                counter += 1
        # One shared markers file for all family sizes.
        markersFile = MatrixFile(inputFname='%s.markers' % (outputFnamePrefix),
                                 openMode='w',
                                 delimiter=' ')

        counter += 1
        sys.stderr.write("%s files outputted.\n" % (counter))

        return PassingData(
            familySize2BeagleFileHandler=familySize2BeagleFileHandler,
            markersFile=markersFile)
	def openOneInputFile(self, inputFname=None):
		"""
		Return a reader for inputFname, chosen by self.inputFileFormat:
		2 -> YHFile (table self.h5TableName), 3 -> HDF5MatrixFile,
		4 -> VCFFile, anything else -> plain MatrixFile.
		"""
		fileFormat = self.inputFileFormat
		if fileFormat == 2:
			return YHFile(inputFname, openMode='r', tableName=self.h5TableName)
		if fileFormat == 3:
			return HDF5MatrixFile(inputFname, openMode='r')
		if fileFormat == 4:
			return VCFFile(inputFname=inputFname)
		return MatrixFile(inputFname)
Beispiel #3
0
	def outputFinalData(self, outputFname, key2dataLs=None, delimiter=None, header=None):
		"""
		Write the merged key->data map to outputFname.

		Opens outputFname regardless of whether there is data. The header
		and the data rows are each written only when delimiter is set (and,
		for data, when key2dataLs is non-empty). Keys are emitted in sorted
		order; each output row is list(key) + dataLs.

		:param outputFname: output file path
		:param key2dataLs: dict mapping key tuple -> list of data values
		:param delimiter: output field delimiter; nothing is written if falsy
		:param header: optional header row
		"""
		writer = MatrixFile(inputFname=outputFname, delimiter=delimiter, openMode='w')
		if header and delimiter:
			writer.writerow(header)
		if key2dataLs and delimiter:
			# sorted(dict) replaces keys()+.sort(): same ordering, but dict
			# views have no .sort() on Python 3, so the old form would crash.
			for key in sorted(key2dataLs):
				dataLs = key2dataLs.get(key)
				writer.writerow(list(key) + dataLs)
		writer.close()
	def setup(self, **keywords):
		"""
		2012.11.22
		2012.10.25
			do not open the file if it's a png file
		2012.10.15
			run before anything is run
		"""
		writer = None
		if self.outputFileFormat in [1,4]:
			suffix = os.path.splitext(self.outputFname)[1]
			if self.outputFname and suffix!='.png':
				writer = MatrixFile(self.outputFname, openMode='w', delimiter='\t')
				#writer = csv.writer(open(self.outputFname, 'w'), delimiter='\t')
		else:	#HDF5MatrixFile
			#can't generate HDF5MatrixFile, because it needs dtypeList
			pass
		#pass it to the invariantPData
		self.invariantPData.writer = writer
		self.writer = writer
	def setup(self, **keywords):
		"""
		2013.09.30

		Run the parent setup, load the trio-inconsistency table into
		self.trioID2dataList, and build self.alignmentID2alignmentReadGroup,
		a map from the alignment ID prefix of each read group back to the
		full read-group name.
		"""
		parentClass.setup(self, **keywords)

		#. read in alignment coverage data
		# key = column 0 (trio ID), values = columns 1-3.
		inconsistencyReader = MatrixFile(inputFname=self.originalTrioInconsistencyFname)
		inconsistencyReader.constructColName2IndexFromHeader()
		self.trioID2dataList = inconsistencyReader.constructDictionary(
			keyColumnIndexList=[0], valueColumnIndexList=[1, 2, 3],
			keyUniqueInInputFile=True)
		inconsistencyReader.close()

		# A read group looks like "<alignmentID>_...": map the ID back to it.
		self.alignmentID2alignmentReadGroup = {
			readGroup.split('_')[0]: readGroup
			for readGroup in self.alignmentReadGroup2individualID}
    def run(self):
        """
        Merge per-row Mendel-error statistics from self.inputFname with
        pedigree family-structure counts from self.pedigreeFname and write
        one tab-delimited summary row per input row to self.outputFname.
        """

        if self.debug:
            import pdb
            pdb.set_trace()

        # Input matrix with Mendel-error statistics; columns found by header name.
        reader = MatrixFile(inputFname=self.inputFname)
        reader.constructColName2IndexFromHeader()

        meanMendelErrorIndex = reader.getColIndexGivenColHeader(
            "meanMendelError")
        noOfLociIndex = reader.getColIndexGivenColHeader("sampled_base_count")
        sumOfMendelErrorIndex = reader.getColIndexGivenColHeader(
            "sumOfMendelError")

        # Family structure from the PLINK pedigree, partitioned by parent count.
        plinkPedigreeFile = PlinkPedigreeFile(inputFname=self.pedigreeFname)
        familyStructureData = plinkPedigreeFile.getFamilyStructurePlinkWay()

        twoParentFamilyCountData = self.getNoOfFamiliesAndKidsGivenParentSetSize(noOfParents2FamilyData=familyStructureData.noOfParents2FamilyData, \
                        parentSetSize=2)
        singleParentFamilyCountData = self.getNoOfFamiliesAndKidsGivenParentSetSize(noOfParents2FamilyData=familyStructureData.noOfParents2FamilyData, \
                        parentSetSize=1)
        zeroParentFamilyCountData = self.getNoOfFamiliesAndKidsGivenParentSetSize(noOfParents2FamilyData=familyStructureData.noOfParents2FamilyData, \
                        parentSetSize=0)

        writer = MatrixFile(self.outputFname, openMode='w', delimiter='\t')
        header = ["ID", "noOfTotalLoci", \
          "noOfTwoParentFamilies", "noOfParentsInTwoParentFamilies", "noOfKidsInTwoParentFamilies", "noOfIndividualsInTwoParentFamilies", \
          "noOfSingleParentFamilies", "noOfParentsInSingleParentFamilies", "noOfKidsInSingleParentFamilies",  "noOfIndividualsInSingleParentFamilies", \
          "noOfZeroParentFamilies", "noOfParentsInZeroParentFamilies", "noOfKidsInZeroParentFamilies", "noOfIndividualsInZeroParentFamilies", \
          "noOfTotalMendelErrors", \
          "noOfMendelErrorsPerLocusPerNuclearFamily", "noOfMendelErrorsPerNuclearFamily"]
        writer.writeHeader(header)
        for row in reader:
            meanMendelError = float(row[meanMendelErrorIndex])
            noOfLoci = int(row[noOfLociIndex])
            sumOfMendelError = int(row[sumOfMendelErrorIndex])
            # "Nuclear families" here = families with exactly two parents.
            noOfNuclearFamilies = twoParentFamilyCountData.noOfFamilies
            if noOfNuclearFamilies > 0:
                noOfMendelErrorsPerLocusPerNuclearFamily = meanMendelError / float(
                    noOfNuclearFamilies)
                noOfMendelErrorsPerNuclearFamily = sumOfMendelError / float(
                    noOfNuclearFamilies)
            else:
                # -1 marks "undefined" when there are no nuclear families.
                noOfMendelErrorsPerLocusPerNuclearFamily = -1
                noOfMendelErrorsPerNuclearFamily = -1
            # row[0] is the first input column, used as the output ID.
            data_row = [row[0], noOfLoci, \
              noOfNuclearFamilies, twoParentFamilyCountData.noOfParents, twoParentFamilyCountData.noOfKids, \
               twoParentFamilyCountData.noOfIndividuals,\
              singleParentFamilyCountData.noOfFamilies,  singleParentFamilyCountData.noOfParents,  singleParentFamilyCountData.noOfKids,\
               singleParentFamilyCountData.noOfIndividuals,\
              zeroParentFamilyCountData.noOfFamilies, zeroParentFamilyCountData.noOfParents,  zeroParentFamilyCountData.noOfKids,\
               zeroParentFamilyCountData.noOfIndividuals,\
              sumOfMendelError, \
              noOfMendelErrorsPerLocusPerNuclearFamily,noOfMendelErrorsPerNuclearFamily ]
            writer.writerow(data_row)

        plinkPedigreeFile.close()
        reader.close()
        writer.close()
Beispiel #7
0
	def setup(self, **keywords):
		"""
		2012.10.15
			run before anything is run

		Prepare per-individual data used later by this walker:
		- self.ibdData: kinship/IBD matrix read from
		  self.pedigreeKinshipFilePath (adjacency list: row ID col 0,
		  col ID col 1, value col 2, no header).
		- self.individualID2HaplotypeData: one entry per sample ID found
		  in the input VCF files (values currently left as None).
		- self.individualID2probabilityMass: an importance weight per
		  individual derived from pedigree out-degree statistics and
		  sequencing coverage.
		"""
		AbstractMatrixFileWalker.setup(self, **keywords)
		#self.writer = BeagleGenotypeFile(inputFname=self.outputFname, openMode='w')
		
		#read in the IBD check result
		self.ibdData = SNP.readAdjacencyListDataIntoMatrix(inputFname=self.pedigreeKinshipFilePath, \
								rowIDHeader=None, colIDHeader=None, \
								rowIDIndex=0, colIDIndex=1, \
								dataHeader=None, dataIndex=2, hasHeader=False)
		
		#. read in the alignment coverage data (read group col 0 -> coverage col 1)
		alignmentCoverageFile = MatrixFile(inputFname=self.individualAlignmentCoverageFname)
		alignmentCoverageFile.constructColName2IndexFromHeader()
		alignmentReadGroup2coverageLs = alignmentCoverageFile.constructDictionary(keyColumnIndexList=[0], valueColumnIndexList=[1])
		alignmentCoverageFile.close()
		
		sys.stderr.write("Reading in all samples from %s VCF input files ... \n"%(len(self.inputFnameLs)))
		# read all the Beagle files
		individualID2HaplotypeData = {}
		for inputFname in self.inputFnameLs:
			vcfFile = VCFFile(inputFname=inputFname)
			#vcfFile.readInAllHaplotypes()
			for individualID in vcfFile.getSampleIDList():
				individualID2HaplotypeData[individualID] = None
				#haplotypeList = vcfFile.getHaplotypeListOfOneSample(individualID)
				#individualID2HaplotypeData[individualID] = PassingData(haplotypeList=haplotypeList,
				#													locusIDList=vcfFile.locusIDList)
			# get all haplotypes , etc.
			# get all sample IDs
		sys.stderr.write("%s individuals total.\n"%(len(individualID2HaplotypeData)))
		
		#. read in the pedigree or deduce it from Beagle Trio/Duo genotype file (columns)
		#. construct individualID2pedigreeContext, context: familySize=1/2/3, familyPosition=1/2 (parent/child)
		sys.stderr.write("Constructing individualID2pedigreeContext ...")
		plinkPedigreeFile = PlinkPedigreeFile(inputFname=self.pedigreeFname)
		pGraph = plinkPedigreeFile.pedigreeGraph
		#shrink the graph to only individuals with data
		pGraph = nx.subgraph(pGraph, individualID2HaplotypeData.keys())
		
		# NOTE(review): connected_component_subgraphs() was removed in
		#	networkx 2.4 -- this code assumes an older networkx release.
		cc_subgraph_list = nx.connected_component_subgraphs(pGraph.to_undirected())
		individualID2familyContext = {}
		# Containers accumulate value ranges so they can normalize later.
		outDegreeContainer = NumberContainer(minValue=0)
		familySizeContainer = NumberContainer(minValue=0)
		individualCoverageContainer = NumberContainer(minValue=0)
		familyCoverageContainer = NumberContainer(minValue=0)
		# Each connected component of the pedigree graph is one family.
		for cc_subgraph in cc_subgraph_list:
			familySize= len(cc_subgraph)
			familySizeContainer.addOneValue(familySize)
			
			familyCoverage = 0
			for n in cc_subgraph:	#assuming each family is a two-generation trio/nuclear family
				individualCoverage = self.getIndividualCoverage(individualID=n, alignmentReadGroup2coverageLs=alignmentReadGroup2coverageLs)
				individualCoverage = float(individualCoverage)
				individualCoverageContainer.addOneValue(individualCoverage)
				familyCoverage += individualCoverage
				in_degree = pGraph.in_degree(n)
				out_degree = pGraph.out_degree(n)
				outDegreeContainer.addOneValue(out_degree)
				familyContext = PassingData(familySize=familySize, in_degree=in_degree, out_degree=out_degree, \
										individualCoverage=individualCoverage,\
										familyCoverage=None)
				if n not in individualID2familyContext:
					individualID2familyContext[n] = familyContext
				else:
					sys.stderr.write("Node %s already in individualID2familyContext.\n"%(n))
			familyCoverageContainer.addOneValue(familyCoverage)
			#set the family coverage for each member, used in weighing the individual. better covered family => better haplotype
			for n in cc_subgraph:
				individualID2familyContext[n].familyCoverage = familyCoverage
		plinkPedigreeFile.close()
		sys.stderr.write("%s individuals.\n"%(len(individualID2familyContext)))
		
		
		# weigh each unique individual based on its sequencing coverage + no of offspring => probability mass for each individual
		sys.stderr.write("Weighing each individual , assigning probability mass  ...")
		individualID2probabilityMass = {}
		for individualID, familyContext in individualID2familyContext.iteritems():	# .iteritems(): Python-2-only API
			# NOTE(review): familySize is normalized against the OUT-DEGREE
			#	container; the variable name suggests familyContext.out_degree
			#	was intended here -- verify before changing.
			outDegreeQuotient = outDegreeContainer.normalizeValue(familyContext.familySize)
			individualCoverageQuotient = individualCoverageContainer.normalizeValue(familyContext.individualCoverage)
			#familyCoverageQuotient = familyCoverageContainer.normalizeValue(familyContext.familyCoverage)
			importanceScore = outDegreeQuotient + individualCoverageQuotient
			representativeImportanceScore = importanceScore
			individualID2probabilityMass[individualID] = representativeImportanceScore
		sys.stderr.write(" %s IDs with probability mass assigned.\n"%(len(individualID2probabilityMass)))
		
		self.individualID2probabilityMass = individualID2probabilityMass
		self.individualID2HaplotypeData = individualID2HaplotypeData
	def setup(self, **keywords):
		"""
		2013.09.30

		Run the parent setup, then load the pedigree graph and the
		per-alignment coverage table, building three lookup maps:
		read_group -> coverage (float), read_group -> individual.id,
		and individual.id -> read_group.
		"""
		parentClass.setup(self, **keywords)

		#. read in pedigree graph
		self.plinkPedigreeFile = PlinkPedigreeFile(inputFname=self.pedigreeFname)
		self.pedigreeGraph = self.plinkPedigreeFile.pedigreeGraph

		#. read in alignment coverage data
		coverageReader = MatrixFile(inputFname=self.individualAlignmentCoverageFname)
		coverageReader.constructColName2IndexFromHeader()
		self.alignmentReadGroup2coverage = coverageReader.constructDictionary(
			keyColumnIndexList=[0], valueColumnIndexList=[1],
			keyUniqueInInputFile=True, valueDataType=float)

		#. also get map from read_group to individual.id (used in graph)
		coverageReader._resetInput()
		self.alignmentReadGroup2individualID = coverageReader.constructDictionary(
			keyColumnIndexList=[0], valueColumnIndexList=[2],
			keyUniqueInInputFile=True)

		#. also get map from individual.id to read_group (used in graph)
		coverageReader._resetInput()
		self.individualID2alignmentReadGroup = coverageReader.constructDictionary(
			keyColumnIndexList=[2], valueColumnIndexList=[0],
			keyUniqueInInputFile=True)
		coverageReader.close()
Beispiel #9
0
	def traverse(self):
		"""
		Walk every file in self.inputFnameLs and merge their rows, keyed by
		self.keyColumnLs, accumulating the value columns of each file side
		by side. Keys missing from a given file get '' padding so every
		dataLs stays rectangular.

		:return: PassingData(key2dataLs=..., delimiter=..., header=...);
			header is None when self.noHeader is set.
		"""
		newHeader = []
		key2dataLs = {}	#key is the keyColumn, dataLs corresponds to the sum of each column from valueColumnLs 
		delimiter = None
		noOfDataColumnsFromPriorFiles = 0	# width of data merged so far; used to pad keys absent from earlier files
		for inputFname in self.inputFnameLs:
			if not os.path.isfile(inputFname):
				if self.exitNonZeroIfAnyInputFileInexistent:
					sys.exit(3)
				else:
					continue
			reader = None
			try:
				inputFile = utils.openGzipFile(inputFname)
				delimiter = figureOutDelimiter(inputFile)
				reader = MatrixFile(inputFile=inputFile, delimiter=delimiter)
			except:	# NOTE(review): bare except -- logs and continues with reader=None
				sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
				import traceback
				traceback.print_exc()
			
			valueColumnLs = []
			try:
				header = reader.next()	# .next(): Python-2 iterator API
				self.handleNewHeader(header, newHeader, self.keyColumnLs, valueColumnLs, keyColumnSet=self.keyColumnSet)
				if self.noHeader:	#2012.8.10 first line was data, not a header: rewind and re-wrap
					inputFile.seek(0)
					reader = MatrixFile(inputFile=inputFile, delimiter=delimiter)
			except:	#in case something wrong (i.e. file is empty)
				sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
				import traceback
				traceback.print_exc()
			
			if reader is not None and valueColumnLs:
				visitedKeySet = set()
				for row in reader:
					try:
						self.handleValueColumns(row, key2dataLs=key2dataLs, keyColumnLs=self.keyColumnLs, \
								valueColumnLs=valueColumnLs, noOfDataColumnsFromPriorFiles=noOfDataColumnsFromPriorFiles, \
								visitedKeySet=visitedKeySet)
					except:	#in case something wrong (i.e. file is empty)
						sys.stderr.write('Ignore this row: %s.\n'%repr(row))
						sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
						import traceback
						traceback.print_exc()
				del reader
				#append empty data to keys who are not present in this current "reader" file
				totalKeySet = set(key2dataLs.keys())
				unvisitedKeySet = totalKeySet - visitedKeySet
				for key in unvisitedKeySet:
					for i in valueColumnLs:
						key2dataLs[key].append('')
			noOfDataColumnsFromPriorFiles += len(valueColumnLs)
		if self.noHeader:	#2012.8.10
			newHeader = None
		returnData = PassingData(key2dataLs=key2dataLs, delimiter=delimiter, header=newHeader)
		return returnData
Beispiel #10
0
    def traverse(self):
        """
        Walk every file in self.inputFnameLs and merge their rows, keyed by
        self.keyColumnLs, accumulating the value columns of each file.

        :return: PassingData(key2dataLs=..., delimiter=..., header=...);
            header is None when self.noHeader is set.
        """
        newHeader = []
        key2dataLs = {
        }  #key is the keyColumn, dataLs corresponds to the sum of each column from valueColumnLs
        delimiter = None
        for inputFname in self.inputFnameLs:
            if not os.path.isfile(inputFname):
                if self.exitNonZeroIfAnyInputFileInexistent:
                    sys.exit(3)
                else:
                    continue
            reader = None
            try:
                inputFile = utils.openGzipFile(inputFname)
                delimiter = figureOutDelimiter(inputFile)
                reader = MatrixFile(inputFile=inputFile, delimiter=delimiter)
            except:  #NOTE(review): bare except -- logs and continues with reader=None
                sys.stderr.write('Except type: %s\n' % repr(sys.exc_info()))
                import traceback
                traceback.print_exc()

            try:
                #if isCSVReader:
                header = reader.next()  # .next(): Python-2 iterator API
                #else:
                #	header = inputFile.readline().strip().split()	#whatever splits them
                self.handleNewHeader(header,
                                     newHeader,
                                     self.keyColumnLs,
                                     self.valueColumnLs,
                                     keyColumnSet=self.keyColumnSet)
                if self.noHeader:  #2012.8.10 first line was data, not a header: rewind and re-wrap
                    inputFile.seek(0)
                    reader = MatrixFile(inputFile=inputFile,
                                        delimiter=delimiter)
            except:  #in case something wrong (i.e. file is empty)
                sys.stderr.write('Except type: %s\n' % repr(sys.exc_info()))
                import traceback
                traceback.print_exc()

            if reader is not None:
                for row in reader:
                    #if not isCSVReader:
                    #	row = row.strip().split()
                    try:
                        self.handleValueColumns(
                            row,
                            key2dataLs=key2dataLs,
                            keyColumnLs=self.keyColumnLs,
                            valueColumnLs=self.valueColumnLs)
                    except:  #in case something wrong (i.e. file is empty)
                        sys.stderr.write('Ignore this row: %s.\n' % repr(row))
                        sys.stderr.write('Except type: %s\n' %
                                         repr(sys.exc_info()))
                        import traceback
                        traceback.print_exc()
                del reader
        if self.noHeader:  #2012.8.10
            newHeader = None
        returnData = PassingData(key2dataLs=key2dataLs,
                                 delimiter=delimiter,
                                 header=newHeader)
        return returnData