def openWriteBeagleFiles(self, pedigreeFamilyData=None, outputFnamePrefix=None):
    """
    Open one Beagle output file per family size, plus one markers file.

    2013.05.02

    For every (familySize, sampleIDList) entry of
    pedigreeFamilyData.familySize2SampleIDList, the space-delimited file
    "<outputFnamePrefix>_familySize<familySize>.bgl" is opened for writing
    and its header row is written immediately. Family size 1 gets the
    likelihood-format header; every other size gets the non-likelihood
    header. "<outputFnamePrefix>.markers" is opened as well, but nothing is
    written to it here. None of the files is closed by this method.

    The non-likelihood (unphased, trios, pairs) Beagle format:
        I id sample1 sample1 sample2 sample2
        A diabetes 1 1 2 2
        M rs12082861 C C C C
        M rs4912233 T C C C
        M rs12732823 G A A A
        M rs17451521 C C C C
        M rs12033358 C T T T
    The likelihood version is
        marker alleleA alleleB 1000_709_1996093_GA_vs_524 1000_709_1996093_GA_vs_524 1000_709_1996093_GA_vs_524 1001_710_1995025_GA_vs_524 1001_710_1995025_GA_vs_524 1001_710_1995025_GA_vs_524 1002_711_2001039_GA_vs_524
        Contig791:1086 C A 0.9693 0.0307 0.0000 0.6660 0.3338 0.0003 0.0000
        Contig791:1649 G C 0.9406 0.0594 0.0000 0.9693 0.0307 0.0000 0.0000
        Contig791:4084 A C 0.9980 0.0020 0.0000 0.9844 0.0156 0.0000 0.0000
    The markers file has this format (markerID, position, alleleA, alleleB)
        Contig791:1086 1086 C A

    Arguments:
        pedigreeFamilyData: object whose familySize2SampleIDList attribute
            maps a family size to the sample IDs that go into that file's
            header.
        outputFnamePrefix: common prefix of every file opened here.

    Returns:
        PassingData with two attributes: familySize2BeagleFileHandler
        (family size -> opened .bgl writer) and markersFile (the opened
        .markers writer).
    """
    sys.stderr.write(
        "Opening beagle files (outputFnamePrefix =%s) to write ..." % (outputFnamePrefix))
    familySize2BeagleFileHandler = {}
    counter = 0
    # .items() rather than .iteritems(): the latter only exists on Python 2
    # dictionaries, while .items() iterates the same pairs on Python 2 and 3.
    for familySize, sampleIDList in pedigreeFamilyData.familySize2SampleIDList.items():
        if familySize in familySize2BeagleFileHandler:
            # Never reopen (and thereby truncate) a file that is already open.
            continue
        writer = MatrixFile(
            inputFname='%s_familySize%s.bgl' % (outputFnamePrefix, familySize),
            openMode='w', delimiter=' ')
        familySize2BeagleFileHandler[familySize] = writer

        if familySize == 1:
            # Likelihood format: each sample name is replicated three times
            # rather than two.
            headerRow = ['marker', 'alleleA', 'alleleB']
            noOfColumnsPerSample = 3
        else:
            headerRow = ['I', 'id']
            noOfColumnsPerSample = 2
        for sampleID in sampleIDList:
            headerRow.extend([sampleID] * noOfColumnsPerSample)
        writer.writeHeader(headerRow)
        counter += 1

    markersFile = MatrixFile(
        inputFname='%s.markers' % (outputFnamePrefix), openMode='w', delimiter=' ')
    counter += 1
    sys.stderr.write("%s files outputted.\n" % (counter))
    return PassingData(
        familySize2BeagleFileHandler=familySize2BeagleFileHandler,
        markersFile=markersFile)
def run(self):
    """
    Merge per-row Mendel-error statistics with pedigree family counts.

    Reads self.inputFname, whose header must contain the columns
    "meanMendelError", "sampled_base_count" and "sumOfMendelError", and
    writes one tab-delimited row per input row to self.outputFname. Family
    counts come from self.pedigreeFname (read through PlinkPedigreeFile)
    and are computed for families with two, one and zero parents; they are
    identical on every output row.

    The last two output columns are the input row's meanMendelError and
    sumOfMendelError, each divided by the number of two-parent families.
    Both are -1 when there is no two-parent family.

    If self.debug is true, a pdb session is started before anything else.

    All opened files are closed even when reading or parsing fails.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    reader = None
    plinkPedigreeFile = None
    writer = None
    try:
        reader = MatrixFile(inputFname=self.inputFname)
        reader.constructColName2IndexFromHeader()
        meanMendelErrorIndex = reader.getColIndexGivenColHeader("meanMendelError")
        noOfLociIndex = reader.getColIndexGivenColHeader("sampled_base_count")
        sumOfMendelErrorIndex = reader.getColIndexGivenColHeader("sumOfMendelError")

        plinkPedigreeFile = PlinkPedigreeFile(inputFname=self.pedigreeFname)
        familyStructureData = plinkPedigreeFile.getFamilyStructurePlinkWay()
        noOfParents2FamilyData = familyStructureData.noOfParents2FamilyData

        # Two-, single- and zero-parent families, in output-column order.
        familyCountDataList = [
            self.getNoOfFamiliesAndKidsGivenParentSetSize(
                noOfParents2FamilyData=noOfParents2FamilyData,
                parentSetSize=parentSetSize)
            for parentSetSize in (2, 1, 0)
        ]
        # Only two-parent families serve as the denominator below.
        noOfNuclearFamilies = familyCountDataList[0].noOfFamilies

        # These twelve columns do not depend on the input row: build them once.
        familyCountColumns = []
        for familyCountData in familyCountDataList:
            familyCountColumns.extend([
                familyCountData.noOfFamilies,
                familyCountData.noOfParents,
                familyCountData.noOfKids,
                familyCountData.noOfIndividuals,
            ])

        writer = MatrixFile(self.outputFname, openMode='w', delimiter='\t')
        header = [
            "ID", "noOfTotalLoci",
            "noOfTwoParentFamilies", "noOfParentsInTwoParentFamilies",
            "noOfKidsInTwoParentFamilies", "noOfIndividualsInTwoParentFamilies",
            "noOfSingleParentFamilies", "noOfParentsInSingleParentFamilies",
            "noOfKidsInSingleParentFamilies", "noOfIndividualsInSingleParentFamilies",
            "noOfZeroParentFamilies", "noOfParentsInZeroParentFamilies",
            "noOfKidsInZeroParentFamilies", "noOfIndividualsInZeroParentFamilies",
            "noOfTotalMendelErrors",
            "noOfMendelErrorsPerLocusPerNuclearFamily",
            "noOfMendelErrorsPerNuclearFamily",
        ]
        writer.writeHeader(header)

        for row in reader:
            meanMendelError = float(row[meanMendelErrorIndex])
            noOfLoci = int(row[noOfLociIndex])
            sumOfMendelError = int(row[sumOfMendelErrorIndex])
            if noOfNuclearFamilies > 0:
                # float() keeps this a true division on Python 2 as well.
                noOfMendelErrorsPerLocusPerNuclearFamily = \
                    meanMendelError / float(noOfNuclearFamilies)
                noOfMendelErrorsPerNuclearFamily = \
                    sumOfMendelError / float(noOfNuclearFamilies)
            else:
                # -1 marks "undefined": there is nothing to divide by.
                noOfMendelErrorsPerLocusPerNuclearFamily = -1
                noOfMendelErrorsPerNuclearFamily = -1
            data_row = [row[0], noOfLoci] + familyCountColumns + [
                sumOfMendelError,
                noOfMendelErrorsPerLocusPerNuclearFamily,
                noOfMendelErrorsPerNuclearFamily,
            ]
            writer.writerow(data_row)
    finally:
        # Close whatever was opened, in the original order, even on error.
        for fileHandle in (plinkPedigreeFile, reader, writer):
            if fileHandle is not None:
                fileHandle.close()