def outputPedigreeForPlink(self, DG=None, db_vervet=None, inputFname=None, outputFname=None, \
						treatEveryOneIndependent=None, sampleIDFormat=1,\
						addUngenotypedDuoParents=False):
	"""
	Output the pedigree in graph DG, restricted to samples present in inputFname (a VCF),
	in PLINK pedigree format (http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml).
	Either space or tab could be the delimiter; space is used here.
	Each row: family_id individual_id father_id mother_id sex phenotype(=1).

	Arguments:
		DG: pedigree directed graph; nodes are individual IDs, edges parent -> child.
		db_vervet: database handle, used to map VCF sample IDs to alignments/individuals.
		treatEveryOneIndependent: if True, force father/mother to 0 (everyone a founder).
		sampleIDFormat:
			1: individual.ucla_id
			2: alignment.read_group (the input sampleID argument)
		addUngenotypedDuoParents: for mendel error detection. If an ungenotyped parent
			in a duo (the other parent is genotyped) is not present in the genotype file
			(PED/TPED/BED), plink won't look for its mendel inconsistency; optionally
			output such parents as extra founder rows.

	2013.07.18 added argument addUngenotypedDuoParents
	2013.06.24 added argument sampleIDFormat
	2013.1.2 copied from run()
	"""
	sys.stderr.write("Outputting pedigree constrained by %s to %s, treatEveryOneIndependent=%s, sampleIDFormat=%s, addUngenotypedDuoParents=%s ... "%\
					(inputFname, outputFname, treatEveryOneIndependent, sampleIDFormat, addUngenotypedDuoParents))
	vcfFile = VCFFile(inputFname=inputFname)
	alignmentLs = []
	alignmentID2sampleData = {}
	individual_id2alignment = {}
	for sampleID in vcfFile.getSampleIDList():
		alignment = db_vervet.parseAlignmentReadGroup(sampleID).individualAlignment
		alignmentLs.append(alignment)
		if alignment.id in alignmentID2sampleData:
			# two VCF samples map to the same alignment => ambiguous mapping, abort
			errorMessage = "Error: alignment %s (%s) for sample %s already in alignmentID2sampleData, with sampleID=%s.\n"%\
							(alignment.id, alignment.read_group, sampleID, \
							alignmentID2sampleData.get(alignment.id).sampleID)
			sys.stderr.write(errorMessage)
			# fix: original used a bare `raise` with no active exception, which itself errors out
			raise ValueError(errorMessage)
		alignmentID2sampleData[alignment.id] = PassingData(sampleID=sampleID, alignment=alignment)

		individual_id = alignment.individual_sequence.individual_id
		if individual_id in individual_id2alignment:
			# fix: original message was a copy-paste of the block above and reported
			# alignmentID2sampleData[alignment.id] (= the sample just inserted), not the
			# conflicting alignment previously stored for this individual.
			priorAlignment = individual_id2alignment.get(individual_id)
			errorMessage = "Error: alignment %s (%s) for sample %s, individual %s already in individual_id2alignment, with alignment %s (%s).\n"%\
							(alignment.id, alignment.read_group, sampleID, individual_id, \
							priorAlignment.id, priorAlignment.read_group)
			sys.stderr.write(errorMessage)
			raise ValueError(errorMessage)
		individual_id2alignment[individual_id] = alignment
	#alignmentLs = db_vervet.getAlignmentsFromVCFFile(inputFname=inputFname)
	"""
	pedigreeGraphData = db_vervet.constructPedgreeGraphOutOfAlignments(alignmentLs)
	DG = pedigreeGraphData.DG
	individual_id2alignmentLs = pedigreeGraphData.individual_id2alignmentLs
	"""
	individual_id2individual = {}

	ungenotypedNodeID2Data = {}
	# fix: keep the file handle so it can be flushed/closed explicitly at the end
	# (original did `del writer`, relying on CPython refcounting to close the file)
	outputFile = open(outputFname, 'w')
	writer = csv.writer(outputFile, delimiter=' ')
	counter = 0
	family_id = 1	#all in one family
	currentNoOfFakes = 0
	for alignment in alignmentLs:
		nodeID = alignment.individual_sequence.individual_id
		individual = self.getIndividual(db_vervet=db_vervet, individual_id=nodeID, \
									individual_id2individual=individual_id2individual)
		if nodeID in DG:
			parents = DG.predecessors(nodeID)
			if len(parents)==2:
				parent1 = self.getIndividual(db_vervet=db_vervet, individual_id=parents[0], \
									individual_id2individual=individual_id2individual)
				parent2 = self.getIndividual(db_vervet=db_vervet, individual_id=parents[1], \
									individual_id2individual=individual_id2individual)
				parent1Sex = parent1.codeSexInNumber()
				parent2Sex = parent2.codeSexInNumber()
				#2013.07.18 one and only one parent genotyped => record the ungenotyped one as an ungenotyped duo-parent
				if parents[0] not in individual_id2alignment and parents[1] in individual_id2alignment:
					if parents[0] not in ungenotypedNodeID2Data:
						ungenotypedNodeID2Data[parents[0]] = PassingData(individualDBEntry=parent1, sex=parent1Sex)
				elif parents[0] in individual_id2alignment and parents[1] not in individual_id2alignment:
					if parents[1] not in ungenotypedNodeID2Data:
						ungenotypedNodeID2Data[parents[1]] = PassingData(individualDBEntry=parent2, sex=parent2Sex)
				if parent1Sex==2:	#swap so the father is first, the mother second
					parent1, parent2 = parent2, parent1
				father_id = self.getProperSampleIDForPlinkOutput(individual=parent1, \
								alignmentID2sampleData=alignmentID2sampleData, \
								individual_id2alignment=individual_id2alignment, sampleIDFormat=sampleIDFormat)
				mother_id = self.getProperSampleIDForPlinkOutput(individual=parent2, \
								alignmentID2sampleData=alignmentID2sampleData, \
								individual_id2alignment=individual_id2alignment, sampleIDFormat=sampleIDFormat)
			elif len(parents)==1:
				parent1 = self.getIndividual(db_vervet=db_vervet, individual_id=parents[0], \
									individual_id2individual=individual_id2individual)
				parent1Sex = parent1.codeSexInNumber()
				if parent1Sex==2:	#known parent is the mother
					parent2Sex = 1
					father_id = 0
					mother_id = self.getProperSampleIDForPlinkOutput(individual=parent1, \
								alignmentID2sampleData=alignmentID2sampleData, \
								individual_id2alignment=individual_id2alignment, sampleIDFormat=sampleIDFormat)
				else:	#known parent is the father
					parent2Sex = 2
					father_id = self.getProperSampleIDForPlinkOutput(individual=parent1, \
								alignmentID2sampleData=alignmentID2sampleData, \
								individual_id2alignment=individual_id2alignment, sampleIDFormat=sampleIDFormat)
					mother_id = 0
				#2013.07.18 parent1 (parents[0]) has to be in individual_id2alignment (genotyped) in order for
				#	the other (fake) parent to qualify as an ungenotyped parent in a duo
				if parents[0] in individual_id2alignment:
					#if parents[0] not in ungenotypedNodeID2Data:
					#	ungenotypedNodeID2Data[parents[0]] = PassingData(individualDBEntry=parent1, sex=parent1Sex)
					fakeParentData = self.generateFakeIndividualID(pedigreeGraph=DG, currentNoOfFakes=currentNoOfFakes)
					currentNoOfFakes = fakeParentData.currentNoOfFakes
					fakeParent2ID = fakeParentData.individualID
					if fakeParent2ID not in individual_id2alignment:
						if fakeParent2ID not in ungenotypedNodeID2Data:
							ungenotypedNodeID2Data[fakeParent2ID] = PassingData(individualDBEntry=None, sex=parent2Sex)
			elif len(parents)==0:
				father_id = 0
				mother_id = 0
			else:
				sys.stderr.write("Error: number of parents (%s) for %s is %s.\n"%(repr(parents), nodeID, len(parents)))
				sys.exit(3)
		else:	# founders: not in the pedigree graph at all
			father_id = 0
			mother_id = 0
		if treatEveryOneIndependent:	#force the parents to be 0, everyone becomes founders
			father_id = 0
			mother_id = 0
		individual_id = self.getProperSampleIDForPlinkOutput(individual=individual, \
							alignmentID2sampleData=alignmentID2sampleData, \
							individual_id2alignment=individual_id2alignment, sampleIDFormat=sampleIDFormat)
		data_row = [family_id, individual_id, father_id, mother_id, \
					individual.codeSexInNumber(), 1]
		writer.writerow(data_row)
		counter += 1

	noOfUngenotypedParentsOutputted = 0
	if addUngenotypedDuoParents:
		# output ungenotyped duo-parents as founder rows so plink sees them
		for ungenotypedNodeID, pdata in ungenotypedNodeID2Data.iteritems():
			individual_id = self.getProperSampleIDForPlinkOutput(individual=pdata.individualDBEntry, \
							alignmentID2sampleData=alignmentID2sampleData, \
							individual_id2alignment=individual_id2alignment, \
							sampleIDFormat=sampleIDFormat, defaultSampleID=ungenotypedNodeID)
			data_row = [family_id, individual_id, 0, 0, pdata.sex, 1]
			writer.writerow(data_row)
			noOfUngenotypedParentsOutputted += 1
	sys.stderr.write("%s individuals and %s ungenotyped duo-parents outputted, number of fake parents %s, addUngenotypedDuoParents=%s.\n"%\
					(counter, noOfUngenotypedParentsOutputted, currentNoOfFakes, addUngenotypedDuoParents))
	outputFile.close()	# fix: explicit flush + close instead of `del writer`
def run(self):
	"""
	Export one genotype file (selected by genotype_method_id + chromosome + format)
	from the database as a tab-delimited matrix: header row of
	['Chromosome', 'position', 'ref', '<sampleID> <scientificName>', ...],
	then one row per VCF locus with each sample's GT call ('NA' when missing).

	2012.7.13
	"""
	if self.debug:
		import pdb
		pdb.set_trace()
	session = self.db_vervet.session
	session.begin()
	if not self.data_dir:
		self.data_dir = self.db_vervet.data_dir
	data_dir = self.data_dir

	genotypeFile = self.db_vervet.getGenotypeFile(genotype_method_id=self.genotypeMethodID,\
						chromosome=self.chromosome, format=self.format)
	#query = VervetDB.GenotypeFile.query.filter_by(genotype_method_id=self.genotypeMethodID).filter_by(format=self.format)
	#for genotypeFile in query:
	if not genotypeFile:
		sys.stderr.write("Error: genotype_method_id %s, chromosome %s does not exist.\n"%(self.genotypeMethodID, self.chromosome))
		sys.exit(2)
	filename = os.path.join(data_dir, genotypeFile.path)
	if os.path.isfile(filename):
		counter = 0
		from pymodule import VCFFile
		vcfFile = VCFFile(inputFname=filename, minDepth=0)
		sampleIDList = vcfFile.getSampleIDList()
		# fix: keep the handle so the output file is flushed/closed explicitly
		# (original did `del writer`, relying on refcounting to close the file)
		outputFile = open(self.outputFname, 'w')
		writer = csv.writer(outputFile, delimiter='\t')
		header = ['Chromosome', 'position', 'ref']
		columnIndexList = []
		for i in xrange(len(sampleIDList)):
			sampleID = sampleIDList[i]
			individualAlignment = self.db_vervet.parseAlignmentReadGroup(sampleID).individualAlignment
			#2012.8.29 get scientific name from the taxonomy db
			# fix: renamed local `scientifcName` -> `scientificName`; dropped the unused
			# `site = individualAlignment.individual_sequence.individual.site` lookup that
			# only served the commented-out country filter below.
			scientificName = self.db_taxonomy.returnScientificNameGivenTaxID(individualAlignment.individual_sequence.individual.tax_id)
			#if individualAlignment.individual_sequence.individual.tax_id==60711 and (site.country_id!=144 and site.country_id!=135 \
			#	and site.country_id!=136 and site.country_id!=148):
			header.append('%s %s'%(sampleID, scientificName))
			columnIndexList.append(i)
		writer.writerow(header)
		for vcfRecord in vcfFile:
			data_row = [vcfRecord.chr, vcfRecord.pos]
			refCall = vcfRecord.data_row[0]	#index 0 is the reference individual (ref column of VCF)
			data_row.append(refCall['GT'])
			# fix: removed dead code that fetched info_tag2value.get('AF') and split it —
			# the result was never used and it raised AttributeError (None.split) on any
			# record without an AF tag.
			for columnIndex in columnIndexList:
				#for vcfCall in vcfRecord.data_row[1:]:
				#data_row is a list of dictionary {'GT': base-call, 'DP': depth}, or None if missing.
				#it includes one extra sample in index 0, which is the reference individual (from the ref column of VCF).
				vcfCall = vcfRecord.data_row[columnIndex+1]
				if vcfCall:
					data_row.append(vcfCall['GT'])
				else:
					data_row.append('NA')
			writer.writerow(data_row)
			counter += 1
		sys.stderr.write("%s loci outputted.\n"%(counter))
		outputFile.close()	# fix: explicit flush + close instead of `del writer`
def run(self):
	"""
	2011-7-11
	Workflow entry point.
	run_type 1: build genotype-calling jobs over alignments (addGenotypeCallJobs).
	run_type 2 or 3: build TrioCaller jobs over pre-existing VCF files
		(addTrioCallerJobsONVCFFiles).
	"""
	if self.run_type!=1:
		self.needSplitChrIntervalData = False	#2013.06.21 turn this off before setup_run() to not construct chr2IntervalDataLs
	else:
		self.needSplitChrIntervalData = True
	pdata = self.setup_run()
	workflow = pdata.workflow
	db_vervet = self.db

	if self.run_type in [2,3]:
		# register the input VCF folder; alignments are derived from the sample IDs
		# of the first registered VCF file
		inputData = self.registerAllInputFiles(workflow, self.inputDir, input_site_handler=self.input_site_handler, \
							checkEmptyVCFByReading=self.checkEmptyVCFByReading,\
							pegasusFolderName=self.pegasusFolderName,\
							maxContigID=self.maxContigID, \
							minContigID=self.minContigID, db_vervet=db_vervet, \
							needToKnowNoOfLoci=abs(1-self.notToKnowNoOfLoci),\
							minNoOfLociInVCF=self.minNoOfLociInVCF)	#ignore files with too few loci
		inputF = inputData.jobDataLs[0].vcfFile
		vcfFile = VCFFile(inputFname=inputF.abspath)
		alignmentLs = db_vervet.getAlignmentsFromVCFSampleIDList(vcfFile.getSampleIDList())
		del vcfFile
	# NOTE(review): depth is computed from pdata.alignmentLs, not the VCF-derived
	#	alignmentLs built above for run_type 2/3 — confirm this is intentional.
	cumulativeMedianDepth = db_vervet.getCumulativeAlignmentMedianDepth(alignmentLs=pdata.alignmentLs, \
								defaultSampleAlignmentDepth=self.defaultSampleAlignmentDepth)
	registerReferenceData = pdata.registerReferenceData
	if self.run_type==1:
		#chr2size = set(['Contig149'])	#temporary when testing Contig149
		#chr2size = set(['1MbBAC'])	#temporary when testing the 1Mb-BAC (formerly vervet_path2)
		#2012.6.12
		#self.outputAlignmentDepthAndOthersForFilter(db_vervet=db_vervet, outputFname=self.alnStatForFilterFname, \
		#					ref_ind_seq_id=self.ref_ind_seq_id, \
		#					foldChange=self.depthFoldChange, minGQ=30)	#minGQ doesn't matter anymore.
		self.addGenotypeCallJobs(workflow=workflow, alignmentDataLs=pdata.alignmentDataLs, chr2IntervalDataLs=self.chr2IntervalDataLs, \
						registerReferenceData=registerReferenceData, \
						site_handler=self.site_handler, input_site_handler=self.input_site_handler,\
						needFastaIndexJob=self.needFastaIndexJob, needFastaDictJob=self.needFastaDictJob, \
						intervalSize=self.intervalSize, intervalOverlapSize=self.intervalOverlapSize, \
						site_type=self.site_type, data_dir=self.data_dir,\
						outputDirPrefix="",\
						genotypeCallerType=self.genotypeCallerType,\
						cumulativeMedianDepth=cumulativeMedianDepth,\
						transferOutput=True)
	elif self.run_type in [2, 3]:
		# hand every executable/jar reference from the workflow object through to the
		# TrioCaller job factory
		self.addTrioCallerJobsONVCFFiles(workflow=workflow, alignmentLs=alignmentLs, inputData=inputData, \
						samtools=workflow.samtools, \
						genotyperJava=workflow.genotyperJava, SelectVariantsJava=workflow.SelectVariantsJava, \
						GenomeAnalysisTKJar=workflow.GenomeAnalysisTKJar, \
						addOrReplaceReadGroupsJava=workflow.addOrReplaceReadGroupsJava, AddOrReplaceReadGroupsJar=workflow.AddOrReplaceReadGroupsJar, \
						CreateSequenceDictionaryJava=workflow.CreateSequenceDictionaryJava, CreateSequenceDictionaryJar=workflow.CreateSequenceDictionaryJar, \
						MergeSamFilesJar=workflow.MergeSamFilesJar, \
						BuildBamIndexFilesJava=workflow.BuildBamIndexFilesJava, BuildBamIndexJar=workflow.BuildBamIndexJar, \
						mv=workflow.mv, CallVariantBySamtools=workflow.CallVariantBySamtools, \
						trioCallerPath=self.trioCallerPath, trioCallerWrapper=workflow.trioCallerWrapper, \
						replicateIndividualTag=self.replicateIndividualTag, treatEveryOneIndependent=self.treatEveryOneIndependent,\
						bgzip_tabix=workflow.bgzip_tabix, vcf_convert=workflow.vcf_convert, \
						vcf_isec=workflow.vcf_isec, vcf_concat=workflow.vcf_concat, \
						concatGATK=workflow.concatGATK, concatSamtools=workflow.concatSamtools,\
						ligateVcf=self.ligateVcf, ligateVcfExecutableFile=self.ligateVcfExecutableFile,\
						registerReferenceData=registerReferenceData, \
						namespace=workflow.namespace, version=workflow.version, site_handler=self.site_handler, input_site_handler=self.input_site_handler,\
						needFastaIndexJob=self.needFastaIndexJob, needFastaDictJob=self.needFastaDictJob, \
						outputDirPrefix="", \
						intervalSize=self.intervalSize, intervalOverlapSize=self.intervalOverlapSize, \
						site_type=self.site_type, data_dir=self.data_dir,\
						onlyKeepBiAllelicSNP=self.onlyKeepBiAllelicSNP, maxSNPMissingRate=self.maxSNPMissingRate,\
						alnStatForFilterF=None, cumulativeMedianDepth=cumulativeMedianDepth,\
						run_type=self.run_type, transferOutput=True)
	self.end_run()