def run(self):
    """Stream every row of self.inputFname through processRow() and write the
    transformed rows, preceded by a fixed header, to self.outputFname.

    2013.07.24
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    source = MatrixFile(inputFname=self.inputFname)
    source.constructColName2IndexFromHeader()
    sink = MatrixFile(inputFname=self.outputFname, openMode='w', delimiter='\t')
    sink.writeHeader(["SNPID", "oldChromosome", "Chromosome", "Start", "Stop", "N"])
    noOfLines = 0
    for row in source:
        sink.writerow(self.processRow(row))
        noOfLines += 1
    sys.stderr.write("%s lines processed.\n" % (noOfLines))
    del source
    del sink
def getMendelErrorIndividualLocusData(self, mendelErrorFname=None, individualID2Index=None):
    """Parse a mendel-error file and map each involved locus (snp_id, column 3)
    to the list of individual indexes (looked up via individualID2Index using
    the 'KID' column) carrying a mendel error there.

    Exits the process with code 3 when an individual is absent from
    individualID2Index.

    2013.1.29
    """
    sys.stderr.write("Getting data on loci involved in mendel-errors from %s ..."%(mendelErrorFname))
    locus_id2individual_index_ls = {}
    reader = MatrixFile(inputFname=mendelErrorFname)
    reader.constructColName2IndexFromHeader()
    kidColumnIndex = reader.getColIndexGivenColHeader('KID')
    noOfCalls = 0
    for row in reader:
        individual_id = row[kidColumnIndex]
        if individual_id not in individualID2Index:
            sys.stderr.write("Individual %s not in individualID2Index.\n"%(individual_id))
            sys.exit(3)
        index = individualID2Index.get(individual_id)
        snp_id = row[3]
        locus_id2individual_index_ls.setdefault(snp_id, []).append(index)
        noOfCalls += 1
    del reader
    sys.stderr.write(" %s calls of %s loci, involved in mendel errors.\n"%\
        (noOfCalls, len(locus_id2individual_index_ls)))
    return locus_id2individual_index_ls
def setup(self, **keywords):
    """Run once before any row is processed (2013.11.24).

    Initializes interval/cross-chromosome counters (floats so later ratios
    divide without truncation), the per-target-chromosome and per-locus
    map-data dictionaries, and the previous-locus tracker; writes the header
    of the main output (via self.writer) and opens the side interval-delta
    output (self.locusIntervalDeltaOutputFname) with its own header.

    NOTE(review): per the original notes, the probability model is
    max(P(left interval), P(right interval)) with interval deltas treated as
    roughly normal per target chromosome (mean/stddev from the middle 80% of
    data, stddev of 0 replaced by 1); a cross-chromosome interval gets
    P(chromosome-cross event). Regional windowing (RBTree of windows) is not
    implemented.
    """
    AbstractMatrixFileWalker.setup(self, **keywords)
    self.noOfTotalIntervals = 0.0
    self.noOfCrossChromosomeIntervals = 0.0
    self.targetChromosome2mapData = {}
    self.locusKey2mapData = {}
    self.previousLocusData = None
    #main output header; flag it written so the parent class does not emit a second one
    mainHeader = ['oldChromosome', 'oldStart', 'oldStop', 'oldStrand', 'newChromosome', 'newStart', 'newStop', 'mapPvalue']
    self.writer.writerow(mainHeader)
    self.invariantPData.headerOutputted = True
    #side output: one interval-delta row per adjacent locus pair
    self.sideOutput = MatrixFile(self.locusIntervalDeltaOutputFname, openMode='w', delimiter='\t')
    self.sideOutput.writeHeader(['oldChromosome', 'oldStart', 'oldStop', 'oldStrand', 'newChromosome', 'newStart', 'newStop', 'intervalDelta'])
def appendInfo(self, inputFname=None, db_vervet=None, outputFname=None,\
    inversePCValue=True):
    """
    Append per-individual metadata (sex, site, coordinates, taxonomy, coverage)
    to a PC-coordinate matrix read from inputFname; write the tab-delimited
    result to outputFname.  If inversePCValue is True, each PC value's sign is
    flipped.

    2012.9.25 skip samples whose individual_alignment entry could not be parsed.
    2012.9.5
    """
    sys.stderr.write("Appending info to %s ..."%(inputFname))
    reader = MatrixFile(inputFname)
    header = reader.next()    #first row: sampleID followed by PC columns
    newHeader = ['individualID']
    for i in xrange(1, len(header)):
        newHeader.append('PC%s'%(i))
    #the "|string" suffixes are column-type hints for the downstream consumer of this table
    newHeader.extend(['sex|string', 'country|string', 'site-id', 'site-name|string', 'latitude', 'longitude', 'ucla_id|string', \
        'tax_id|string',\
        'species|string', 'collectionYear', 'medianDepth'])
    writer = csv.writer(open(outputFname, 'w'), delimiter='\t')
    writer.writerow(newHeader)
    counter = 0
    for row in reader:
        row = row[:len(header)]    #don't take extra columns beyond the header width
        sampleID = row[0]
        individualAlignment = db_vervet.parseAlignmentReadGroup(sampleID).individualAlignment
        if individualAlignment is None:    #2012.9.25
            #sampleID does not begin with an alignment ID (probably "ref" or similar); skip it
            sys.stderr.write("Warning: sampleID %s is not parsable to get alignment out of it. Skip.\n"%(sampleID))
            continue
        individual = individualAlignment.individual_sequence.individual
        data_row = ['%s_%s'%(individual.code, individualAlignment.id)]
        floatValue_row = row[1:]
        if inversePCValue:    #negate every PC value
            floatValue_row = map(float, floatValue_row)
            floatValue_row = numpy.array(floatValue_row)
            floatValue_row = -floatValue_row
        data_row.extend(list(floatValue_row))
        scientifcName = self.db_taxonomy.returnScientificNameGivenTaxID(individual.tax_id)
        if scientifcName is None:
            scientifcName = ""
        if individual.collection_date:
            collectionYear = individual.collection_date.year
        else:
            collectionYear = ''
        data_row.extend([individual.sex, individual.site.country.name, individual.site.id, individual.site.short_name, \
            individual.latitude, individual.longitude, individual.ucla_id, \
            individual.tax_id, scientifcName, collectionYear, individualAlignment.median_depth])
        writer.writerow(data_row)
        counter += 1
    del writer
    sys.stderr.write("%s rows outputted.\n"%(counter))
def run(self):
    """Mark one sample's genotype missing in a VCF wherever the locus fails
    the depth window or the mapping-quality check in that sample's alignment.

    Reads self.inputFname (VCF) and self.alignmentFilename (BAM), writes the
    (possibly modified) VCF to self.outputFname and a per-locus statistics
    table to self.missingStatFname.

    Fix: guard the final summary fraction against ZeroDivisionError when the
    input VCF has no records.
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    outputDir = os.path.split(self.outputFname)[0]
    if outputDir and not os.path.isdir(outputDir):
        os.makedirs(outputDir)
    reader = VCFFile(inputFname=self.inputFname)
    alignmentFile = pysam.Samfile(self.alignmentFilename, "rb")
    writer = VCFFile(outputFname=self.outputFname, openMode='w')
    writer.metaInfoLs = reader.metaInfoLs
    writer.header = reader.header
    writer.writeMetaAndHeader()
    statWriter = MatrixFile(self.missingStatFname, openMode='w', delimiter='\t')
    header = ["sampleID", "locusID", 'chromosome', 'start', 'stop', 'occurrence', 'missingReason', \
        'fractionOfGoodRead', 'medianMapQ', 'totalNoOfReads']
    statWriter.writeHeader(header)
    counter = 0
    real_counter = 0
    #acceptable depth window around the alignment's median depth
    minDepth = self.alignmentMedianDepth/self.alignmentDepthFold
    maxDepth = self.alignmentMedianDepth*self.alignmentDepthFold
    for vcfRecord in reader:
        locusID = "%s_%s"%(vcfRecord.chromosome, vcfRecord.position)
        #start and end in fetch() are 0-based
        alignedReadLs = alignmentFile.fetch(vcfRecord.chromosome, vcfRecord.position-1, vcfRecord.position+1)
        locusLowMapQData = self.returnLocusLowMapQualityIndicator(alignedReadLs=alignedReadLs,\
            minMapQGoodRead=self.minMapQGoodRead, minFractionOfGoodRead=self.minFractionOfGoodRead)
        locusLowMapQIndicator = locusLowMapQData.locusLowMapQIndicator
        depth = locusLowMapQData.totalNoOfReads
        if depth>=minDepth and depth<=maxDepth:
            locusOutOfDepthIndicator = 0    #depth within window: good
        else:
            locusOutOfDepthIndicator = 1
        locusLowQualityIndicator = locusOutOfDepthIndicator + locusLowMapQIndicator
        data_row = [self.sampleID, locusID, vcfRecord.chromosome, vcfRecord.position, vcfRecord.position,\
            1, locusLowQualityIndicator, locusLowMapQData.fractionOfGoodRead, \
            locusLowMapQData.medianMapQ, locusLowMapQData.totalNoOfReads]
        statWriter.writerow(data_row)
        if locusLowQualityIndicator>0:
            real_counter += 1
            #set this sample's genotype to missing in the VCF record
            vcfRecord.setGenotypeCallForOneSample(sampleID=self.sampleID, genotype="./.", convertGLToPL=True)
        #2014.1.4 output the VCF record regardless of whether it was modified
        writer.writeVCFRecord(vcfRecord)
        counter += 1
    reader.close()
    statWriter.close()
    writer.close()
    if counter > 0:
        fraction = real_counter/float(counter)
    else:    #empty VCF: avoid ZeroDivisionError
        fraction = 0
    sys.stderr.write("%s (out of %s, %s) genotypes marked missing.\n"%(real_counter, counter, \
        fraction))
def run(self):
    """Convert a headerless file whose 2nd column holds SNP IDs of the form
    "<chromosome>_<position>" into a two-column (chromosome, position) table.

    Fix: renamed local `chr` to `chromosome` so the builtin chr() is no longer
    shadowed.
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    reader = MatrixFile(self.inputFname)
    #input has no header; the SNP ID lives in column 1 (0-based)
    SNPIDColumnIndex = 1
    writer = csv.writer(open(self.outputFname, 'w'), delimiter='\t')
    writer.writerow(['chromosome', 'position'])
    counter = 0
    real_counter = 0
    for row in reader:
        SNPID = row[SNPIDColumnIndex]
        chromosome, position = SNPID.split('_')
        writer.writerow([chromosome, position])
        real_counter += 1
        counter += 1
    del reader
    del writer
    sys.stderr.write("%s/%s lines outputted.\n" % (real_counter, counter))
def outputOverlapSites(self, overlapping_sites_set=None, outputFname=None):
    """Write the overlapping sites, sorted, as a tab-delimited 3-column table.

    overlapping_sites_set is a set of (chromosome, pos) tuples; the last
    column is always 0 to mimic the output of
    CalculateSNPMismatchRateOfTwoVCF.py (2011-12.9).
    """
    sys.stderr.write("Outputting overlap %s sites ..."%(len(overlapping_sites_set)))
    writer = MatrixFile(outputFname, openMode='w', delimiter='\t')
    writer.writerow(['chromosome', 'position', 'random'])
    site_ls = sorted(overlapping_sites_set)
    for chromosome, pos in site_ls:
        writer.writerow([chromosome, pos, 0])
    sys.stderr.write("%s sites.\n"%(len(site_ls)))
def run(self):
    """Pad every TPED row with "0 0" genotype columns so the number of
    individuals matches the accompanying TFAM file.
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    reader = MatrixFile(inputFname=self.inputFname)    #a TPED file
    writer = csv.writer(open(self.outputFname, 'w'), delimiter='\t')
    tfamIndividualData = self.getIndividualID2IndexFromTFAMFile(tfamFname=self.tfamFname)
    noOfIndividuals = len(tfamIndividualData.individualID2Index)
    counter = 0
    noOfExtraIndividuals = None    #stays None when the input has no rows
    for row in reader:
        #columns 0-3 are chromosome, snp_id, genetic_distance, physical_distance;
        #the rest are 2 allele columns per individual
        noOfExistingIndividuals = len(row[4:])/2
        noOfExtraIndividuals = noOfIndividuals - noOfExistingIndividuals
        writer.writerow(row + [0]*2*noOfExtraIndividuals)
        counter += 1
    del reader
    del writer
    sys.stderr.write("%s rows (loci) and added %s extra individuals.\n"%(counter, noOfExtraIndividuals))
def run(self):
    """For every (chromosome, position) that appears more than once in the
    input VCF, compute pairwise genotype concordance across the duplicate
    entries and write one row per pair.

    Fix: guard the final summary fraction against ZeroDivisionError when the
    input contains no loci.
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    outputDir = os.path.split(self.outputFname)[0]
    if outputDir and not os.path.isdir(outputDir):
        os.makedirs(outputDir)
    snp_pos2genotypeVectorLs = self.readInSNPID2GenotypeVectorLs(self.inputFname).snp_pos2returnData
    writer = MatrixFile(self.outputFname, openMode='w', delimiter='\t')
    header = ['chromosome', 'position', 'noOfMatches', 'noOfTotal', 'concordance']
    writer.writeHeader(header)
    counter = 0
    real_counter = 0
    no_of_pairs = 0
    snp_pos_ls = snp_pos2genotypeVectorLs.keys()
    snp_pos_ls.sort()
    for i in xrange(len(snp_pos_ls)):
        counter += 1
        key = snp_pos_ls[i]
        chromosome, position = snp_pos_ls[i][:2]
        genotypeVectorLs = snp_pos2genotypeVectorLs.get(key)
        if len(genotypeVectorLs)>1:
            real_counter += 1
            #all unordered pairs of genotype vectors at this position
            for k in xrange(0, len(genotypeVectorLs)-1):
                for l in xrange(k+1, len(genotypeVectorLs)):
                    no_of_pairs += 1
                    noOfMatches = 0
                    noOfTotal = 0
                    genotypeVector0 = genotypeVectorLs[k]
                    genotypeVector1 = genotypeVectorLs[l]
                    for j in xrange(len(genotypeVector0)):
                        call1 = genotypeVector0[j]['GT']
                        call2 = genotypeVector1[j]['GT']
                        if call1!='NA' and call2!='NA':
                            noOfTotal += 1
                            #compare via SNP.nt2number numeric codes
                            if SNP.nt2number[call1]==SNP.nt2number[call2]:
                                noOfMatches += 1
                    if noOfTotal>0:
                        concordance = float(noOfMatches)/float(noOfTotal)
                    else:
                        concordance = -1    #no co-called genotypes in this pair
                    writer.writerow([chromosome, position, noOfMatches, noOfTotal, concordance])
    writer.close()
    if counter > 0:
        fraction = real_counter/float(counter)
    else:    #no loci at all: avoid ZeroDivisionError
        fraction = 0
    sys.stderr.write("%s (out of %s, %s) snps have >1 same-position entries. %s pairs.\n"%(real_counter, counter, \
        fraction, no_of_pairs))
def setup(self, **keywords):
    """Build self.invariantPData.individualCode2readGroup from
    self.readGroupFname before any rows are walked; read groups that cannot be
    linked to an alignment are skipped.  Returns 1.
    """
    AbstractMatrixFileWalker.setup(self, **keywords)
    self.invariantPData.individualCode2readGroup = {}
    reader = MatrixFile(inputFname=self.readGroupFname)
    reader.constructColName2IndexFromHeader()
    #default to the first column when no header name is configured
    readGroupIndex = reader.getColIndexGivenColHeader(self.readGroupHeader) if self.readGroupHeader else 0
    for row in reader:
        readGroup = row[readGroupIndex]
        alignment = self.db_vervet.parseAlignmentReadGroup(readGroup).individualAlignment
        if not alignment:
            continue    #read group not linked to an alignment
        code = alignment.individual_sequence.individual.code
        self.invariantPData.individualCode2readGroup[code] = readGroup
    del reader
    return 1
def getSampleID2IndividualData_UNGC(self, inputFname=None):
    """Parse a UNGC (UCLA Neuroscience Genomics Core) sample-sheet CSV into
    {sampleID: PassingData(sampleName=..., libraryIndexList=[...])}.

    2013.04.04; spaces inside "sample ID" are replaced with underscores.  A
    sample ID mapped to two different sample names is an input error.

    Fix: the original executed a bare `raise` with no active exception (a
    TypeError at runtime); now raises ValueError carrying the diagnostic.
    """
    sys.stderr.write("Getting sampleID2IndividualData from %s ..."%(inputFname))
    sampleID2IndividualData = {}
    reader = MatrixFile(inputFname, openMode='r', delimiter=',')
    reader.constructColName2IndexFromHeader()
    sampleIDIndex = reader.getColIndexGivenColHeader("sample ID")
    sampleNameIndex = reader.getColIndexGivenColHeader("sample name")
    libraryIndexIndex = reader.getColIndexGivenColHeader("Index")
    for row in reader:
        sampleID = row[sampleIDIndex].replace(' ', '_')    #2013.04.04 normalize embedded spaces
        sampleName = row[sampleNameIndex]
        libraryIndex = row[libraryIndexIndex]
        if sampleID not in sampleID2IndividualData:
            sampleID2IndividualData[sampleID] = PassingData(sampleName=sampleName, libraryIndexList=[])
        if sampleName!=sampleID2IndividualData[sampleID].sampleName:
            message = "Error: sampleID %s is associated with two different sample names (%s, %s).\n"%\
                (sampleID, sampleName, sampleID2IndividualData[sampleID].sampleName)
            sys.stderr.write(message)
            raise ValueError(message)
        sampleID2IndividualData[sampleID].libraryIndexList.append(libraryIndex)
    sys.stderr.write("%s entries.\n"%(len(sampleID2IndividualData)))
    return sampleID2IndividualData
def getMonkeyID2Coverage(self, inputFname):
    """
    2012.9.4 copied from vervet/src/misc.py
    2012.2.10 inputFname is output of SequencingStrategy.assignVRCSequencePriorityBasedOnPedigree() + manual change of top ones
    """
    #Returns {UCLAID: target coverage}: the max of the "pre-set-coverage" and
    #"future coverage" columns, with missing values treated as 0.
    sys.stderr.write("Reading the list of ranked monkeys from %s ..." % (inputFname))
    reader = MatrixFile(inputFname)
    reader.constructColName2IndexFromHeader()
    monkey_id_index = reader.getColIndexGivenColHeader("UCLAID")
    pre_set_coverage_index = reader.getColIndexGivenColHeader("pre-set-coverage")
    future_coverage_index = reader.getColIndexGivenColHeader("future coverage")
    to_sequence_monkey_id2coverage = {}
    for row in reader:
        monkey_id = row[monkey_id_index]
        pre_set_coverage = row[pre_set_coverage_index]
        if pre_set_coverage:
            pre_set_coverage = float(pre_set_coverage)
        else:
            pre_set_coverage = 0    #empty cell -> no pre-set coverage
        future_coverage = 0
        #the "future coverage" column may be absent from short rows
        if len(row) >= future_coverage_index + 1:
            future_coverage = float(row[future_coverage_index])
        to_sequence_monkey_id2coverage[monkey_id] = max(future_coverage, pre_set_coverage)
    del reader
    sys.stderr.write(" %s monkeys are to-be-sequenced.\n" % (len(to_sequence_monkey_id2coverage)))
    return to_sequence_monkey_id2coverage
#NOTE(review): the trailing triple-quote below appears to open a commented-out
#region whose closing quote lies outside this view; kept as-is.
"""
def outputGenotypeMarkedMissingStat(self, outputFname=None, \
    individual_index2no_of_genotype_marked_missing=None,\
    individualIDList=None):
    """Write a two-column table (individualID, noOfGenotypesMarkedMissing) to
    outputFname; does nothing unless both outputFname and the count dict are
    supplied.

    2013.07.24
    """
    if not outputFname or individual_index2no_of_genotype_marked_missing is None:
        return
    writer = MatrixFile(inputFname=outputFname, openMode='w', delimiter='\t')
    writer.writeHeader(["individualID", "noOfGenotypesMarkedMissing"])
    for individual_index, noOfMissing in individual_index2no_of_genotype_marked_missing.iteritems():
        writer.writerow([individualIDList[individual_index], noOfMissing])
    writer.close()
def constructPedigreeGraphFromPOEdgeFile(self, inputFname=None):
    """Build a directed parent->child graph from the output of
    vervet/src/pedigree/DiscoverParentOffspringFromPlinkIBD.py; each edge's
    weight is that row's distToPOVector value.

    2012.8.23
    """
    sys.stderr.write("Constructing pedigree-graph out of %s ..."%(inputFname))
    DG = nx.DiGraph()
    childNodeSet = set()
    reader = MatrixFile(inputFname)
    reader.constructColName2IndexFromHeader()
    parentIDIndex = reader.getColIndexGivenColHeader("parentID")
    childIDIndex = reader.getColIndexGivenColHeader("childID")
    distToPOVectorIndex = reader.getColIndexGivenColHeader("distToPOVector")
    counter = 0
    for row in reader:
        parentID = row[parentIDIndex]
        childID = row[childIDIndex]
        childNodeSet.add(childID)
        DG.add_edge(parentID, childID, weight=float(row[distToPOVectorIndex]))
        counter += 1
    del reader
    sys.stderr.write("%s children, %s nodes. %s edges. %s connected components.\n"%(\
        len(childNodeSet), DG.number_of_nodes(), DG.number_of_edges(), \
        nx.number_connected_components(DG.to_undirected())))
    return PassingData(DG=DG, childNodeSet=childNodeSet)
def getLocusID2MissingFraction(self, inputFname=None):
    """Read per-locus missing statistics ("occurrence_byFixedValue" column)
    keyed by (chromosome, start, start) parsed from the locusID column
    ("chr_start...").  When a key repeats, the smallest statistic wins.

    2014.01.08
    """
    sys.stderr.write("Reading in the missing statistics from %s ... " % (inputFname))
    locusID2Stat = {}
    reader = MatrixFile(inputFname=inputFname)
    reader.constructColName2IndexFromHeader()
    locusIDIndex = reader.getColIndexGivenColHeader("locusID")
    statIndex = reader.getColIndexGivenColHeader("occurrence_byFixedValue")
    counter = 0
    for row in reader:
        chromosome, start = row[locusIDIndex].split('_')[:2]
        start = int(start)
        stat = float(row[statIndex])
        key = (chromosome, start, start)
        if key in locusID2Stat:
            locusID2Stat[key] = min(locusID2Stat[key], stat)    #keep the lowest value
        else:
            locusID2Stat[key] = stat
        counter += 1
    del reader
    sys.stderr.write(
        " %s unique loci with missing fraction out of %s total loci.\n" %
        (len(locusID2Stat), counter))
    return locusID2Stat
def outputAlignmentDepthAndOthersForFilter(self, db_vervet=None, inputFname=None, \
    ref_ind_seq_id=None, depthFoldChange=2, minGQ=30, \
    outputFname=None, outputFileFormat=1):
    """
    Write per-alignment depth info used by downstream VCF filtering.

    outputFileFormat==1 rows: (alignmentID, medianDepth, individualID);
    otherwise: (alignmentID, minDepth, maxDepth, minGQ) where the depth window
    is median/depthFoldChange .. median*depthFoldChange.

    NOTE(review): the ref_ind_seq_id argument is not used; self.ref_ind_seq_id
    is queried instead — confirm whether that is intentional.

    2012.6.12 added argument db_vervet, moved from FilterVCFPipeline.py
    2011-9-2
    """
    sys.stderr.write("Outputting alignment (from %s) coverage to %s ..."%(inputFname, outputFname))
    if inputFname:
        #restrict to alignments present in the given VCF
        alignmentLs = db_vervet.getAlignmentsFromVCFFile(inputFname=inputFname)
    else:
        alignmentLs = db_vervet.getAlignments(ref_ind_seq_id=self.ref_ind_seq_id, \
            alignment_method_id=self.alignment_method_id, data_dir=self.data_dir,\
            local_realigned=self.local_realigned, outdated_index=self.alignment_outdated_index,\
            completedAlignment=self.completedAlignment, \
            reduce_reads=self.reduce_reads)
    #dead code kept for reference (old direct-query implementation):
    """
    TableClass = VervetDB.IndividualAlignment
    query = TableClass.query.filter(TableClass.median_depth!=None)
    if ref_ind_seq_id:
        query = query.filter(TableClass.ref_ind_seq_id==ref_ind_seq_id)
    alignmentLs = query.order_by(TableClass.id)
    """
    alignmentLs = db_vervet.filterAlignments(data_dir=self.data_dir, alignmentLs=alignmentLs, sequence_filtered=self.sequence_filtered, \
        mask_genotype_method_id=None, parent_individual_alignment_id=None,\
        excludeContaminant=self.excludeContaminant, local_realigned=self.local_realigned,\
        reduce_reads=self.reduce_reads,\
        completedAlignment=self.completedAlignment,\
        alignment_method_id=self.alignment_method_id, \
        outdated_index=self.alignment_outdated_index)
    writer = MatrixFile(inputFname=outputFname, openMode='w', delimiter='\t')
    if outputFileFormat==1:
        header = ['alignmentID', 'medianDepth', "individualID"]
    else:
        header = ['alignmentID', 'minDepth', 'maxDepth', 'minGQ']
    writer.writeHeader(header)
    counter = 0
    for row in alignmentLs:
        read_group = row.read_group
        if outputFileFormat==1:
            data_row = [read_group, row.median_depth, row.individual_sequence.individual.id]
        else:
            minDepth = row.median_depth/float(depthFoldChange)
            if abs(minDepth-0)<=0.001:    #if it's too close to 0, assign 0.
                minDepth = 0
            data_row = [read_group, minDepth, row.median_depth*float(depthFoldChange), minGQ]
        writer.writerow(data_row)
        counter += 1
    writer.close()
    sys.stderr.write("%s entries fetched.\n"%(counter))
def readInDataToPlot(self, input_fname, sampling_probability=1.0):
    """Load a matrix file for plotting: the first two columns stay strings,
    the rest become floats; with 0<sampling_probability<1 each row is kept
    with that probability.

    Fix: the bare `except` around the float conversion also swallowed
    KeyboardInterrupt/SystemExit; narrowed to `except Exception`.

    2015.01.23 added argument sampling_probability to sub-sample data
    2013.07.11 use MatrixFile to read in the file
    2009-5-20 add the column index into the column header for easy picking
    2009-3-13 wrap the float conversion part into try...except to report what goes wrong
    """
    if sampling_probability>1 or sampling_probability<0:
        sampling_probability = 1.0    #out-of-range values disable sampling
    reader = MatrixFile(inputFname=input_fname)
    self.column_header = reader.next()
    for i in range(len(self.column_header)):
        #prefix each header with its column index for easy picking in the UI
        self.column_header[i] = '%s %s'%(i, self.column_header[i])
    no_of_cols = len(self.column_header)
    self.column_types = [str]*2 + [float]*(no_of_cols-2)
    self.column_editable_flag_ls = [True, True] + [False]*(no_of_cols-2)
    self.list_2d = []
    for row in reader:
        if sampling_probability>0 and sampling_probability<1:
            if random.random()>sampling_probability:    #skip this row
                continue
        float_part = row[2:]
        try:
            float_part = map(float, float_part)
        except Exception:    #report the offending row but keep going with the raw values
            sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
            traceback.print_exc()
        new_row = row[:2]+float_part
        self.list_2d.append(new_row)
    reader.close()
    self.setupColumns(self.treeview_matrix)
    #update window title/status bar to reflect the input filename
    self.app1.set_title(os.path.basename(input_fname))
    self.app1_appbar1.push(input_fname)
    self.plotXY(self.ax, self.canvas, self.liststore, self.plot_title)
def run(self):
    """From per-region switch-frequency stats, write the cumulative genome
    span and locus counts that remain under each maxSwitchFrequency cutoff,
    walking regions in descending switch-frequency order.
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    outputDir = os.path.split(self.outputFname)[0]
    if outputDir and not os.path.isdir(outputDir):
        os.makedirs(outputDir)
    switchPointData = self.readInStats(inputFname=self.inputFname)
    sys.stderr.write("Processing data ...")
    writer = MatrixFile(self.outputFname, openMode='w')
    writer.writeHeader(["maxSwitchFrequency", "genomeCovered", 'genomeCoveredFraction', "noOfLoci", 'noOfLociFraction'])
    data_matrix = switchPointData.data_matrix
    totalSpan = switchPointData.totalSpan
    totalNoOfLoci = switchPointData.totalNoOfLoci
    data_matrix.sort(reverse=True)    #descending switchFrequency
    #running subtraction replaces the original cumulative lists
    cumulativeRegionSpan = totalSpan
    cumulativeNoOfLoci = totalNoOfLoci
    for switchFrequency, regionSpan, noOfLoci in data_matrix:
        cumulativeRegionSpan -= regionSpan
        cumulativeNoOfLoci -= noOfLoci
        writer.writerow([switchFrequency, cumulativeRegionSpan, cumulativeRegionSpan/float(totalSpan),\
            cumulativeNoOfLoci, cumulativeNoOfLoci/float(totalNoOfLoci)])
    writer.close()
    sys.stderr.write(".\n")
def constructPedigreeGraphFromPlinkIBD(self, inputFname=None, maxDistanceToPOVector=0.04, drawDistribution=False, outputFnamePrefix=None):
    """Build a parent->child DiGraph from plink IBD output: a pair whose
    (Z0,Z1,Z2) vector lies within maxDistanceToPOVector of (0,1,0) is taken as
    parent-offspring; the numerically larger ID becomes the child (IDs are
    cast to int; original note says this enables age comparison).  Optionally
    draws a histogram of all distances.

    2012.8.14
    """
    sys.stderr.write("Constructing pedigree-graph out of plink-ibd %s ..."%(inputFname))
    DG = nx.DiGraph()
    childNodeSet = set()
    reader = MatrixFile(inputFname)
    reader.constructColName2IndexFromHeader()
    monkey1IDIndex = reader.getColIndexGivenColHeader("IID1")
    monkey2IDIndex = reader.getColIndexGivenColHeader("IID2")
    Z0Index = reader.getColIndexGivenColHeader("Z0")
    Z1Index = reader.getColIndexGivenColHeader("Z1")
    Z2Index = reader.getColIndexGivenColHeader("Z2")
    poVector = numpy.array([0,1,0.0])    #ideal parent-offspring IBD-share vector
    counter = 0
    real_counter = 0
    data_ls = []
    keepDistances = bool(drawDistribution and outputFnamePrefix)
    for row in reader:
        monkey1ID = int(row[monkey1IDIndex])
        monkey2ID = int(row[monkey2IDIndex])
        ZVector = numpy.array([float(row[Z0Index]), float(row[Z1Index]), float(row[Z2Index])])
        dist = numpy.linalg.norm(poVector-ZVector)
        if keepDistances:
            data_ls.append(dist)
        if dist<=maxDistanceToPOVector:
            parentID, childID = min(monkey1ID, monkey2ID), max(monkey1ID, monkey2ID)
            DG.add_edge(parentID, childID, weight=dist)
            childNodeSet.add(childID)
            real_counter += 1
        counter += 1
    del reader
    sys.stderr.write("%s out of %s lines become PO pairs. %s children, %s nodes. %s edges. %s connected components.\n"%(\
        real_counter, counter, len(childNodeSet), DG.number_of_nodes(), DG.number_of_edges(), \
        nx.number_connected_components(DG.to_undirected())))
    if keepDistances:
        outputFname = '%s_IBDVector2POVectorDist_hist.png'%(outputFnamePrefix)
        yh_matplotlib.drawHist(data_ls, title='', \
            xlabel_1D="dist(ZVector,POVector)", xticks=None, \
            outputFname=outputFname, min_no_of_data_points=10, \
            needLog=True, \
            dpi=200, min_no_of_bins=25)
    return PassingData(DG=DG, childNodeSet=childNodeSet)
def run(self):
    """Dispatch each input row through the row-processor selected by
    self.run_type and write results to self.outputFname.  run_type 4 marks
    mendel-error genotypes missing and afterwards writes per-individual
    marked-missing statistics.

    2013.07.24
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    reader = MatrixFile(inputFname=self.inputFname)
    writer = csv.writer(open(self.outputFname, 'w'), delimiter='\t')
    run_type = self.run_type
    individualIDList = None
    mendelErrorLocusData = None
    missingCountPerIndividual = None
    if run_type==4:    #2013.2.1 needs TFAM + mendel-error metadata
        tfamData = self.getIndividualID2IndexFromTFAMFile(tfamFname=self.tfamFname)
        individualIDList = tfamData.individualIDList
        mendelErrorLocusData = self.getMendelErrorIndividualLocusData(mendelErrorFname=self.mendelErrorFname, \
            individualID2Index=tfamData.individualID2Index)
        missingCountPerIndividual = {}
    counter = 0
    for row in reader:
        if run_type==2:
            new_row = self.processRow_ChangeChromosomeIDToX(row)
        elif run_type==3:
            new_row = self.processRow_addPositionStartBase(row)
        elif run_type==4:
            new_row = self.markGenotypeMissingIfInvolvedInMendelError(row=row, \
                locus_id2individual_index_ls=mendelErrorLocusData,\
                individual_index2no_of_genotype_marked_missing=missingCountPerIndividual)
        else:
            new_row = self.processRow(row)
        writer.writerow(new_row)
        counter += 1
    sys.stderr.write("%s lines modified.\n"%(counter))
    del reader
    del writer
    self.outputGenotypeMarkedMissingStat(outputFname=self.markMissingStatFname, \
        individual_index2no_of_genotype_marked_missing=missingCountPerIndividual, \
        individualIDList=individualIDList)
def getIndividualID2IndexFromTFAMFile(self, tfamFname=None):
    """Read a plink TFAM file and return
    PassingData(individualID2Index=..., individualIDList=...), where the index
    is the 0-based row order and the individual ID comes from column 1.

    2013.07.24 return individualIDList as well
    2013.1.29
    """
    sys.stderr.write("Getting individualID2Index from tfam file %s ..."%(tfamFname))
    individualID2Index = {}
    individualIDList = []
    reader = MatrixFile(inputFname=tfamFname)
    for row in reader:
        individualID = row[1]    #column 1 holds the individual ID
        individualID2Index[individualID] = len(individualID2Index)
        individualIDList.append(individualID)
    del reader
    sys.stderr.write(" %s individuals.\n"%(len(individualID2Index)))
    return PassingData(individualID2Index=individualID2Index, individualIDList=individualIDList)
def getLocusNewID2mapPvalue(self, liftOverLocusMapPvalueFname=None):
    """Map (newChromosome, newStart, newStop) -> smallest mapPvalue from a
    liftOver locus-map file (columns: oldChromosome, oldStart, oldStop,
    oldStrand, newChromosome, newStart, newStop, mapPvalue).

    2014.01.04
    """
    sys.stderr.write("Reading in the coordinate map from %s ..." % (liftOverLocusMapPvalueFname))
    locusNewID2mapPvalue = {}
    reader = MatrixFile(inputFname=liftOverLocusMapPvalueFname)
    reader.constructColName2IndexFromHeader()
    strandIndex = reader.getColIndexGivenColHeader("oldStrand")
    newChromosomeIndex = reader.getColIndexGivenColHeader("newChromosome")
    newStartIndex = reader.getColIndexGivenColHeader("newStart")
    newStopIndex = reader.getColIndexGivenColHeader("newStop")
    mapPvalueIndex = reader.getColIndexGivenColHeader("mapPvalue")
    counter = 0
    for row in reader:
        strand = row[strandIndex]    #read as in the original; not used further here
        key = (row[newChromosomeIndex], int(row[newStartIndex]), int(row[newStopIndex]))
        mapPvalue = float(row[mapPvalueIndex])
        if key in locusNewID2mapPvalue:
            locusNewID2mapPvalue[key] = min(locusNewID2mapPvalue[key], mapPvalue)    #take lowest value
        else:
            locusNewID2mapPvalue[key] = mapPvalue
        counter += 1
    del reader
    sys.stderr.write(
        "%s unique loci with map p-value out of %s total loci.\n" %
        (len(locusNewID2mapPvalue), counter))
    return locusNewID2mapPvalue
def getMonkeyPair2IBDVector(self, inputFname=None):
    """From plink IBD output, build {(id1, id2) sorted tuple:
    PassingData(IBD=PI_HAT, IBDVector=["%.2f" Z0/Z1/Z2 strings],
    IBDVectorStr=comma-joined)} plus the set of all monkey IDs seen.
    A duplicate pair overwrites the earlier entry with a warning.

    2012.9.10 return monkeyIDSet as well
    2012.9.6
    """
    sys.stderr.write("Getting monkey pair 2 IBD vector from %s ..." % (inputFname))
    reader = MatrixFile(inputFname)
    reader.constructColName2IndexFromHeader()
    monkey1IDIndex = reader.getColIndexGivenColHeader("IID1")
    monkey2IDIndex = reader.getColIndexGivenColHeader("IID2")
    IBDIndex = reader.getColIndexGivenColHeader("PI_HAT")
    Z0Index = reader.getColIndexGivenColHeader("Z0")
    Z1Index = reader.getColIndexGivenColHeader("Z1")
    Z2Index = reader.getColIndexGivenColHeader("Z2")
    formatFunc = lambda x: "%.2f" % (x)
    monkeyPair2IBDVector = {}
    counter = 0
    monkeyIDSet = set()
    for row in reader:
        monkey1ID = row[monkey1IDIndex]
        monkey2ID = row[monkey2IDIndex]
        key = tuple(sorted([monkey1ID, monkey2ID]))    #order-independent pair key
        IBD = float(row[IBDIndex])
        IBDVector = map(formatFunc, [float(row[Z0Index]), float(row[Z1Index]), float(row[Z2Index])])
        IBDVectorStr = ",".join(IBDVector)
        data = PassingData(IBD=IBD, IBDVector=IBDVector, IBDVectorStr=IBDVectorStr)
        if key in monkeyPair2IBDVector:
            sys.stderr.write(
                "WARNING: key %s has value %s in monkeyPair2IBDVector already. value overwritten with %s.\n"
                % (repr(key), monkeyPair2IBDVector.get(key), data)
            )
        monkeyPair2IBDVector[key] = data
        monkeyIDSet.add(monkey1ID)
        monkeyIDSet.add(monkey2ID)
        counter += 1
    sys.stderr.write(
        " %s pairs of IBD vectors for %s unique monkeys.\n"
        % (len(monkeyPair2IBDVector), len(monkeyIDSet))
    )
    return PassingData(monkeyPair2IBDVector=monkeyPair2IBDVector, monkeyIDSet=monkeyIDSet)
def readInStats(self, inputFname=None):
    """Read switch-point statistics, keeping rows where all three columns are
    non-empty, and accumulate the total span and locus count.

    Returns PassingData(data_matrix=[[switchFrequency, regionSpan, noOfLoci],
    ...], totalSpan=..., totalNoOfLoci=...).

    2013.07.15
    """
    sys.stderr.write("Reading stats from %s ..." % (inputFname))
    data_matrix = []
    reader = MatrixFile(inputFname)
    reader.constructColName2IndexFromHeader()
    switchFrequencyIndex = reader.getColIndexGivenColHeader("noOfSwitchPoints_by_noOfLociWithUniqueHit")
    regionSpanIndex = reader.getColIndexGivenColHeader("regionSpan")
    noOfLociIndex = reader.getColIndexGivenColHeader("#sitesInInput2")
    totalSpan = 0
    totalNoOfLoci = 0
    counter = 0
    for row in reader:
        counter += 1
        rawFrequency = row[switchFrequencyIndex]
        rawSpan = row[regionSpanIndex]
        rawNoOfLoci = row[noOfLociIndex]
        if not (rawFrequency and rawSpan and rawNoOfLoci):
            continue    #skip rows with any empty field
        switchFrequency = float(rawFrequency)
        regionSpan = int(float(rawSpan))
        noOfLoci = int(float(rawNoOfLoci))
        data_matrix.append([switchFrequency, regionSpan, noOfLoci])
        totalSpan += regionSpan
        totalNoOfLoci += noOfLoci
    reader.close()
    sys.stderr.write(" %s valid entries (from %s rows) with totalSpan=%s, totalNoOfLoci=%s.\n"%\
        (len(data_matrix), counter, totalSpan, totalNoOfLoci))
    return PassingData(data_matrix=data_matrix, totalSpan=totalSpan, totalNoOfLoci=totalNoOfLoci)
class ComputeLiftOverLocusProbability(parentClass):
    # Walks a liftOver coordinate-map file and assigns each locus a probability
    # that its mapping onto the new reference is consistent, modeled from the
    # distribution of interval-size deltas between adjacent loci on each
    # target chromosome (see setup() docstring for the model description).
    __doc__ = __doc__
    option_default_dict = parentClass.option_default_dict.copy()
    option_default_dict.update({
        ('locusIntervalDeltaOutputFname', 1, ): ['', '', 1, 'file that would contain delta of intervals from old and new coordinate system. \
Used to check if normal distribution on each chromosome. Output format: oldChromosome, oldStart, oldStop, newChromosome, newStart, newStop, intervalDelta.' , ],\
        ('startPosition', 0, int):[None, '', 1, 'probability for loci whose start positions are bigger than this argument would be computed.\
Model parameters are estimated using all input data. This argument is used to avoid edge/boundary effect.' ],\
        ('stopPosition', 0, int):[None, '', 1, 'probability for loci whose stop positions are less than this argument would be computed.\
Model parameters are estimated using all input data. This argument is used to avoid edge/boundary effect.' ],\
        })

    def __init__(self, inputFnameLs=None, **keywords):
        """
        Pass everything straight through to the parent walker.
        """
        parentClass.__init__(self, inputFnameLs=inputFnameLs, **keywords)

    def setup(self, **keywords):
        """
        noOfTotalIntervals = 0
        noOfCrossChromosomeIntervals = 0
        targetChromosome 2 mapData
            intervalDeltaList => median
            orientation (queryStrand) 0=forward 1=backward
            mean => using 80% of data (sort the delta list, then take 10% to 90% of the list)
            stddev => if stddev is zero, use 1.
        locusKey (oldChromosome, oldStart, oldStop) 2 mapData
            targetCoordinate (newChromosome, newStart, newStop).
            leftIntervalDelta: None = boundary
            rightIntervalDelta: None = boundary, 10E10 = cross chromosome
            probability: max( P(SNP_i_left_interval), P(SNP_i_right_interval)).
                P(interval): If one interval is on the same chromosome, P(target-chromosome)*P(interval delta size)
                    If not, P(chromosome-cross event).
        Not implemented: for a whole genome input (rather than a window), an RBTree of windows
            should be used to counter regional effect.
        2013.11.24 run before anything is run
        """
        AbstractMatrixFileWalker.setup(self, **keywords)
        self.noOfTotalIntervals = 0.0
        self.noOfCrossChromosomeIntervals = 0.0  #make it float for division
        self.targetChromosome2mapData = {}
        self.locusKey2mapData = {}
        self.previousLocusData = None
        #write header for the main output
        header = [
            'oldChromosome', 'oldStart', 'oldStop', 'oldStrand',
            'newChromosome', 'newStart', 'newStop', 'mapPvalue'
        ]
        self.writer.writerow(header)
        self.invariantPData.headerOutputted = True  #avoid double header output
        #open the other writer and write header
        self.sideOutput = MatrixFile(self.locusIntervalDeltaOutputFname,
                                     openMode='w', delimiter='\t')
        header = [
            'oldChromosome', 'oldStart', 'oldStop', 'oldStrand',
            'newChromosome', 'newStart', 'newStop', 'intervalDelta'
        ]
        self.sideOutput.writeHeader(header)

    def processRow(self, row=None, pdata=None):
        """
        Process one coordinate-map row: record the locus, and if this is not
        the first locus seen, compute the interval delta against the previous
        locus and attach it to both loci (right delta of previous, left delta
        of current). Cross-chromosome intervals get the sentinel delta 10E10.

        2012.10.7
        """
        returnValue = 1
        # column positions come from the walker's per-file passing data
        self.col_name2index = getattr(pdata, 'col_name2index', None)
        queryStrandIndex = self.col_name2index.get("queryStrand")
        queryChromosomeIndex = self.col_name2index.get("queryChromosome")
        queryStartIndex = self.col_name2index.get("queryStart")
        queryStopIndex = self.col_name2index.get("queryStop")
        newChrIndex = self.col_name2index.get("newChr")
        newRefStartIndex = self.col_name2index.get("newRefStart")
        newRefStopIndex = self.col_name2index.get("newRefStop")
        queryStrand = row[queryStrandIndex]
        queryChromosome = row[queryChromosomeIndex]
        queryStart = int(row[queryStartIndex])
        queryStop = int(row[queryStopIndex])
        newChr = row[newChrIndex]
        newRefStart = int(row[newRefStartIndex])
        newRefStop = int(row[newRefStopIndex])
        #create current locus data
        locusKey = (queryChromosome, queryStart, queryStop)
        currentLocusData = PassingData(locusKey=locusKey, queryStrand=queryStrand, queryChromosome=queryChromosome,\
            queryStart=queryStart, queryStop=queryStop, \
            newChr=newChr, newRefStart=newRefStart, newRefStop=newRefStop)
        #insert entry into locusKey2mapData; deltas are filled in later
        self.locusKey2mapData[locusKey] = PassingData(locusData = currentLocusData, leftIntervalDelta=None,\
            rightIntervalDelta=None, mapProbability=None)
        if self.previousLocusData is not None:
            #calculate interval delta
            if self.previousLocusData.newChr != currentLocusData.newChr:
                # sentinel value marking a chromosome-crossing interval
                intervalDelta = 10E10
                self.noOfCrossChromosomeIntervals += 1
            else:
                # NOTE(review): both spans are start-stop of the CURRENT locus
                # only (negative lengths); presumably the delta convention was
                # chosen to cancel out -- confirm against the side output.
                querySpan = currentLocusData.queryStart - currentLocusData.queryStop
                targetSpan = currentLocusData.newRefStart - currentLocusData.newRefStop
                if queryStrand == '+':
                    intervalDelta = targetSpan - querySpan
                else:
                    intervalDelta = targetSpan + querySpan
            # insert it into self.targetChromosome2mapData
            if currentLocusData.newChr not in self.targetChromosome2mapData:
                self.targetChromosome2mapData[currentLocusData.newChr] = PassingData(intervalDeltaList=[],\
                    orientation=queryStrand,\
                    mean=None,\
                    stddev=None,\
                    probability=None)
            self.targetChromosome2mapData[currentLocusData.newChr].intervalDeltaList.append(intervalDelta)
            #output to the side
            self.sideOutput.writerow([currentLocusData.queryChromosome,\
                currentLocusData.queryStart, currentLocusData.queryStop, currentLocusData.queryStrand, \
                currentLocusData.newChr, currentLocusData.newRefStart, currentLocusData.newRefStop, intervalDelta])
            #assign it as right interval delta of previous locus
            self.locusKey2mapData[self.previousLocusData.locusKey].rightIntervalDelta = intervalDelta
            # assign it as left interval delta of current locus.
            self.locusKey2mapData[locusKey].leftIntervalDelta = intervalDelta
            self.noOfTotalIntervals += 1
        self.previousLocusData = currentLocusData
        return returnValue

    def calculateLocusMapProbabilityGivenIntervalDelta(self, intervalDelta=None, targetChromosomeMapData=None,
            crossChromosomeProbability=None):
        """
        Return the mapping probability contributed by one interval delta.

        A None delta (locus at a window boundary) contributes probability 1.
        The 10E10 sentinel (cross-chromosome interval) contributes the
        chromosome-crossing probability. Otherwise the contribution is
        P(target chromosome) times a two-sided normal p-value of the delta
        under the chromosome's estimated mean/stddev.

        2013.11.25
        """
        mapProbability = 1
        if intervalDelta is not None:
            if intervalDelta == 10E10:
                mapProbability *= crossChromosomeProbability
            else:
                lessThanGivenValueProb = norm.cdf(
                    intervalDelta,
                    loc=targetChromosomeMapData.mean,
                    scale=targetChromosomeMapData.stddev)
                if intervalDelta > targetChromosomeMapData.mean:
                    #two-sided p-value
                    deltaProb = 2 * (1 - lessThanGivenValueProb)
                else:
                    deltaProb = 2 * lessThanGivenValueProb
                mapProbability *= targetChromosomeMapData.probability * deltaProb
        return mapProbability

    def reduce(self, **keywords):
        """
        2012.10.15
            run after all files have been walked through
        """
        counter = 0
        real_counter = 0
        locusKeyList = self.locusKey2mapData.keys()  # Python 2: keys() is a list
        locusKeyList.sort()
        sys.stderr.write("%s target chromosomes, %s cross-chromosome intervals, %s total intervals .\n "%\
            (len(self.targetChromosome2mapData), self.noOfCrossChromosomeIntervals, self.noOfTotalIntervals))
        if self.noOfTotalIntervals > 0:
            sys.stderr.write("Running estimates for each target chromosome ... ")
            #estimates for each chromosome
            self.crossChromosomeProbability = float(self.noOfCrossChromosomeIntervals) / self.noOfTotalIntervals
            for targetChromosome in self.targetChromosome2mapData:
                mapData = self.targetChromosome2mapData.get(targetChromosome)
                #overall probability for an interval to be on this chromosome
                if len(mapData.intervalDeltaList) == 0:  #just one crossing event
                    mapData.probability = 1 / float(self.noOfTotalIntervals)
                else:
                    mapData.probability = len(mapData.intervalDeltaList) / float(self.noOfTotalIntervals)
                #estimate mean and stddev from the middle 80% of the sorted
                # deltas to damp outliers (incl. the 10E10 cross sentinels)
                mapData.intervalDeltaList.sort()
                startIndex = max(0, int(len(mapData.intervalDeltaList) * 0.1))
                stopIndex = max(int(len(mapData.intervalDeltaList) * 0.9) + 1, 1)
                if startIndex >= stopIndex:
                    stopIndex = startIndex + 1
                robustDataList = mapData.intervalDeltaList[startIndex:stopIndex]
                stddev = 1
                if len(robustDataList) > 0:
                    mapData.mean = numpy.mean(robustDataList)
                    if len(robustDataList) > 1:
                        stddev = numpy.std(robustDataList)
                else:
                    mapData.mean = 0
                if stddev == 0:
                    # a zero stddev would break norm.cdf's scale argument
                    stddev = 1
                mapData.stddev = stddev
            sys.stderr.write(".\n")
            #output: each locus gets the max of its left/right interval p-values
            sys.stderr.write("Output %s SNPs with map p-value ..." % (len(locusKeyList)))
            for locusKey in locusKeyList:
                counter += 1
                locusMapData = self.locusKey2mapData.get(locusKey)
                locusData = locusMapData.locusData
                if locusMapData.leftIntervalDelta != None:
                    leftProbability = self.calculateLocusMapProbabilityGivenIntervalDelta(intervalDelta=locusMapData.leftIntervalDelta, \
                        targetChromosomeMapData=self.targetChromosome2mapData.get(locusData.newChr),\
                        crossChromosomeProbability=self.crossChromosomeProbability)
                else:
                    leftProbability = 0
                if locusMapData.rightIntervalDelta != None:
                    rightProbability = self.calculateLocusMapProbabilityGivenIntervalDelta(intervalDelta=locusMapData.rightIntervalDelta, \
                        targetChromosomeMapData=self.targetChromosome2mapData.get(locusData.newChr),\
                        crossChromosomeProbability=self.crossChromosomeProbability)
                else:
                    rightProbability = 0
                mapProbability = max(leftProbability, rightProbability)
                data_row = [locusData.queryChromosome,\
                    locusData.queryStart, locusData.queryStop, locusData.queryStrand, \
                    locusData.newChr, locusData.newRefStart, locusData.newRefStop, mapProbability]
                self.writer.writerow(data_row)
                real_counter += 1
            sys.stderr.write("\n")
        else:  #single SNP (give a low probability)
            sys.stderr.write("Zero intervals, output %s SNPs with 0.001 map p-value ..." % (len(locusKeyList)))
            for locusKey in locusKeyList:
                counter += 1
                locusMapData = self.locusKey2mapData.get(locusKey)
                locusData = locusMapData.locusData
                mapProbability = 0.001
                data_row = [locusData.queryChromosome,\
                    locusData.queryStart, locusData.queryStop, locusData.queryStrand, \
                    locusData.newChr, locusData.newRefStart, locusData.newRefStop, mapProbability]
                self.writer.writerow(data_row)
                real_counter += 1
            sys.stderr.write("\n")
        if counter > 0:
            fraction = float(real_counter) / float(counter)
        else:
            fraction = -1
        sys.stderr.write("%s/%s (%.3f) outputted.\n" % (real_counter, counter, fraction))
        self.sideOutput.close()
        #close the self.invariantPData.writer
        AbstractMatrixFileWalker.reduce(self, **keywords)
def calculatePerSampleMismatchFraction(self, vcfFile1=None, vcfFile2=None, outputFname=None, overlapping_sample_id_set=None,
        NA_call_encoding_set=frozenset(('.', 'NA'))):
    """
    For every sample present in both VCFs, count genotype matches at the
    loci shared by the two files and write per-sample match fractions to
    outputFname (columns: sample_id, no_of_matches, no_of_non_NA_pairs,
    matchFraction; matchFraction is -1 when a sample has no non-NA pairs).

    NA_call_encoding_set: genotype strings treated as missing. The default
    is now a frozenset (was a mutable default set) -- same membership
    semantics, immune to accidental mutation across calls.

    2013.08.13 bugfix, derive overlapping_sites_set by itself, rather than use calculateOverlappingSites()
    2013.07.17 vcf files are no longer pre-loaded.
    2012.8.16
    """
    sys.stderr.write("Finding matches for each sample at overlapping sites ...")
    writer = MatrixFile(outputFname, openMode='w', delimiter='\t')
    header = ['sample_id', 'no_of_matches', 'no_of_non_NA_pairs', 'matchFraction']
    no_of_samples_to_compare = len(overlapping_sample_id_set)
    # re-parse both files so locus/genotype matrices are freshly loaded
    vcfFile1._resetInput()
    vcfFile1.parseFile()
    vcfFile2._resetInput()
    vcfFile2.parseFile()
    overlapping_sites_set = set(vcfFile1.locus_id_ls) & set(vcfFile2.locus_id_ls)
    sys.stderr.write(" %s overlapping loci, "%(len(overlapping_sites_set)))
    # stable sample order so output rows align with the counter lists
    overlapping_sample_id_list = list(overlapping_sample_id_set)
    overlapping_sample_id_list.sort()
    no_of_matches_per_sample_ls = [0]*no_of_samples_to_compare
    no_of_non_NA_pairs_per_sample_ls = [0]*no_of_samples_to_compare
    for locus_id in overlapping_sites_set:
        row_index1 = vcfFile1.locus_id2row_index[locus_id]
        row_index2 = vcfFile2.locus_id2row_index[locus_id]
        for j in xrange(len(overlapping_sample_id_list)):
            sample_id = overlapping_sample_id_list[j]
            col_index1 = vcfFile1.sample_id2index.get(sample_id)
            col_index2 = vcfFile2.sample_id2index.get(sample_id)
            #2012.1.17 bugfix below. so that 'AG' and 'GA' are same.
            call1 = vcfFile1.genotype_call_matrix[row_index1][col_index1]
            call2 = vcfFile2.genotype_call_matrix[row_index2][col_index2]
            if call1 not in NA_call_encoding_set and call2 not in NA_call_encoding_set:
                no_of_non_NA_pairs_per_sample_ls[j] += 1
                # compare via nt2number so unphased het calls compare equal
                if nt2number[call1]==nt2number[call2]:  #2013.07.03 bugfix, 'AT' and 'TA' should be same. no phase
                    no_of_matches_per_sample_ls[j] += 1
    matchFractionLs = [-1]*no_of_samples_to_compare
    for j in xrange(no_of_samples_to_compare):
        if no_of_non_NA_pairs_per_sample_ls[j]>0:
            matchFractionLs[j] = no_of_matches_per_sample_ls[j]/float(no_of_non_NA_pairs_per_sample_ls[j])
    writer.writerow(header)
    for i in xrange(no_of_samples_to_compare):
        data_row = [overlapping_sample_id_list[i], no_of_matches_per_sample_ls[i], no_of_non_NA_pairs_per_sample_ls[i],
            matchFractionLs[i]]
        writer.writerow(data_row)
    writer.close()  # explicit close instead of relying on `del writer`
    sys.stderr.write("%s samples.\n"%(no_of_samples_to_compare))
def outputSwitchPointInfo(self, querySNPID2NewReferenceCoordinateLs=None, outputFname=None):
    """
    2013.07.11
        output the switch point (adjacent sites mapped to two different chromosomes) information

    Walks loci grouped per old chromosome (in sorted old-coordinate order),
    counting "switch points": a new-reference chromosome change, or an
    orientation flip on the same new chromosome. Loci mapping to more than
    one new coordinate are counted but excluded from switch detection.
    Writes one stats row per old chromosome to outputFname.

    Bugfix: the "noOfLoci" output column previously wrote
    len(oldCoordinateKey2newCoordinateDataLs) (the global total over ALL
    chromosomes) for every row; it now writes the per-chromosome
    switchData.noOfLoci that is tracked for exactly this purpose.
    """
    sys.stderr.write("Converting querySNPID2NewReferenceCoordinateLs to oldCoordinateKey2newCoordinateDataLs ... ")
    oldCoordinateKey2newCoordinateDataLs = {}
    counter = 0
    for querySNPID, newRefCoordinateLs in querySNPID2NewReferenceCoordinateLs.iteritems():
        oldCoordinateKey = None
        counter += len(newRefCoordinateLs)
        for newRefCoordinate in newRefCoordinateLs:
            # all entries of one SNP share the same old coordinate; take the first
            if oldCoordinateKey is None:
                oldCoordinateKey = (newRefCoordinate.queryChromosome, newRefCoordinate.queryStart, newRefCoordinate.queryStop)
            if oldCoordinateKey not in oldCoordinateKey2newCoordinateDataLs:
                oldCoordinateKey2newCoordinateDataLs[oldCoordinateKey] = []
            oldCoordinateKey2newCoordinateDataLs[oldCoordinateKey].append(newRefCoordinate)
    sys.stderr.write(" %s old coordinate keys with %s new coordinates.\n"%(len(oldCoordinateKey2newCoordinateDataLs),\
        counter))

    sys.stderr.write("Finding switch points ...")
    counter = 0
    real_counter = 0
    noOfRecordsWithMultiNewCoords = 0
    oldChromosome2SwitchData = {}
    oldCoordinateKeyLs = oldCoordinateKey2newCoordinateDataLs.keys()  # Python 2: list
    oldCoordinateKeyLs.sort()
    for oldCoordinateKey in oldCoordinateKeyLs:
        counter += 1
        newRefCoordinateLs = oldCoordinateKey2newCoordinateDataLs.get(oldCoordinateKey)
        oldChromosome = oldCoordinateKey[0]
        if oldChromosome not in oldChromosome2SwitchData:
            # first locus of this old chromosome seeds the span boundaries
            oldChromosome2SwitchData[oldChromosome] = PassingData(noOfLociWithUniqueHit=0, noOfLoci=0,
                spanStart=oldCoordinateKey[1],
                spanStop=oldCoordinateKey[2], noOfSwitchPoints=0,
                previousNewChromosome=None, previousNewRefStart=None,
                previousNewRefStop=None,
                previousOrientationOnNewChromosome=None)
        switchData = oldChromosome2SwitchData[oldChromosome]
        switchData.noOfLoci += 1
        if len(newRefCoordinateLs) > 1:
            # ambiguous mapping: counted, but excluded from switch detection
            noOfRecordsWithMultiNewCoords += 1
            continue
        switchData.noOfLociWithUniqueHit += 1
        newRefCoordinate = newRefCoordinateLs[0]
        if switchData.previousNewChromosome is not None:
            if newRefCoordinate.newChr != switchData.previousNewChromosome:
                switchData.noOfSwitchPoints += 1
                #reset the orientation
                switchData.previousOrientationOnNewChromosome = None
            else:
                #on the same chromosome: orientation = moving forward on new ref?
                currentOrientation = (newRefCoordinate.newRefStart - switchData.previousNewRefStart) >= 0
                if switchData.previousOrientationOnNewChromosome is not None:
                    if currentOrientation != switchData.previousOrientationOnNewChromosome:
                        switchData.noOfSwitchPoints += 1
                switchData.previousOrientationOnNewChromosome = currentOrientation
        #adjust the spanStop
        if newRefCoordinate.queryStop > switchData.spanStop:
            switchData.spanStop = newRefCoordinate.queryStop
        switchData.previousNewChromosome = newRefCoordinate.newChr
        switchData.previousNewRefStart = newRefCoordinate.newRefStart
        switchData.previousNewRefStop = newRefCoordinate.newRefStop
        real_counter += 1
    if counter > 0:
        fraction = real_counter/float(counter)
    else:
        fraction = -1
    sys.stderr.write("%s (out of %s, %s) records found new coordinates. %s records with >1 new coordinates, discarded.\n"%(real_counter, counter, \
        fraction, noOfRecordsWithMultiNewCoords))

    sys.stderr.write("Outputting switch points of %s old chromosomes ..."%(len(oldChromosome2SwitchData)))
    statFile = MatrixFile(inputFname=outputFname, openMode='w', delimiter='\t')
    header = ['oldChromosome', "noOfSwitchPoints", "regionSpan", "noOfLociWithUniqueHit", "noOfSwitchesPerLocus", "noOfLoci"]
    statFile.writeHeader(header)
    noOfTotalSwitchPoints = 0
    noOfTotalLoci = 0
    for oldChromosome, switchData in oldChromosome2SwitchData.iteritems():
        if switchData.noOfLociWithUniqueHit > 0:
            switchPointFraction = switchData.noOfSwitchPoints/float(switchData.noOfLociWithUniqueHit)
        else:
            switchPointFraction = -1
        # bugfix: last column is the per-chromosome locus count, not the
        # global len(oldCoordinateKey2newCoordinateDataLs)
        data_row = [oldChromosome, switchData.noOfSwitchPoints, switchData.spanStop - switchData.spanStart + 1,
            switchData.noOfLociWithUniqueHit, switchPointFraction, switchData.noOfLoci]
        statFile.writerow(data_row)
        noOfTotalSwitchPoints += switchData.noOfSwitchPoints
        noOfTotalLoci += switchData.noOfLociWithUniqueHit
    statFile.close()
    sys.stderr.write(' %s total switch points, %s total loci with unique hit.\n'%(noOfTotalSwitchPoints, noOfTotalLoci))
def calculateOverlappingSites(self, vcfFile1=None, vcfFile2=None, outputFname=None, overlappingSitesOutputFname=None,\
        chromosome=None, chrLength=None):
    """
    Compare the loci of two VCFs and write one summary row to outputFname
    (overlap counts and fractions plus a normalized segregating-site count).
    Returns a PassingData with overlapping_sample_id_set and
    overlapping_sites_set for downstream per-sample comparison.

    2013.09.10 added argument overlappingSitesOutputFname
    2013.07.17 vcf files are no longer pre-loaded. read in locus ids first.
    2012.8.16
    """
    writer = MatrixFile(outputFname, openMode='w', delimiter='\t')
    header = ['#chromosome', 'length', '#sitesInInput1', '#sitesInInput2', '#overlapping', 'overlappingOverTotal', \
        'overlappingOverInput1', 'overlappingOverInput2', '#segregatingSitesNormalized', ]
    # locus identity is (chromosome, position) = first two columns of each record
    vcf1_locus_id_list = []
    for row in vcfFile1.reader:
        vcf1_locus_id_list.append((row[0], row[1]))
    vcf2_locus_id_list = []
    for row in vcfFile2.reader:
        vcf2_locus_id_list.append((row[0], row[1]))
    no_of_sites_of_input1 = len(vcf1_locus_id_list)
    no_of_sites_of_input2 = len(vcf2_locus_id_list)
    overlapping_sites_set = set(vcf1_locus_id_list)&set(vcf2_locus_id_list)
    if overlappingSitesOutputFname:
        #outputFname = "%s_overlapSitePos.tsv"%(outputFnamePrefix)
        self.outputOverlapSites(overlapping_sites_set=overlapping_sites_set, outputFname=overlappingSitesOutputFname)
    no_of_overlapping_sites = len(overlapping_sites_set)
    # union size = |A| + |B| - |A & B|
    no_of_total_sites = no_of_sites_of_input1+no_of_sites_of_input2-no_of_overlapping_sites
    # -1 marks an undefined fraction (empty denominator) in each case below
    if no_of_total_sites>0:
        overlapping_fraction = no_of_overlapping_sites/float(no_of_total_sites)
    else:
        overlapping_fraction = -1
    if no_of_sites_of_input1>0:
        overlappingOverInput1 = no_of_overlapping_sites/float(no_of_sites_of_input1)
    else:
        overlappingOverInput1 = -1
    if no_of_sites_of_input2>0:
        overlappingOverInput2 = no_of_overlapping_sites/float(no_of_sites_of_input2)
    else:
        overlappingOverInput2 = -1
    no_of_samples = len(vcfFile1.sample_id2index)
    no_of_samples_in_vcf2 = len(vcfFile2.sample_id2index)
    overlapping_sample_id_set = set(vcfFile1.sample_id2index.keys()) & set(vcfFile2.sample_id2index.keys())
    if no_of_samples!=no_of_samples_in_vcf2:
        sys.stderr.write("Warning: sample size in %s is %s, in %s is %s. not matching.\n"%\
            (vcfFile1.inputFname, no_of_samples, vcfFile2.inputFname, no_of_samples_in_vcf2))
    #exclude the ref sample in the 1st column
    # Watterson-style normalization: harmonic number over 2n-1 chromosomes
    if no_of_samples>1:
        normalizingConstant = float(utils.sumOfReciprocals(no_of_samples*2-1))
    else:
        normalizingConstant = 1
    noOfSegregatesSitesNormalized = no_of_overlapping_sites/(normalizingConstant*chrLength)
    writer.writerow(header)
    # dead code kept from an earlier per-sample-output version:
    """
    #reformat for output
    no_of_matches_per_sample_ls = map(repr, no_of_matches_per_sample_ls)
    no_of_non_NA_pairs_per_sample_ls = map(repr, no_of_non_NA_pairs_per_sample_ls)
    matchFractionLs = map(repr, matchFractionLs)
    """
    writer.writerow([chromosome, chrLength, no_of_sites_of_input1, no_of_sites_of_input2, no_of_overlapping_sites, \
        overlapping_fraction, overlappingOverInput1, overlappingOverInput2, \
        noOfSegregatesSitesNormalized])
    del writer
    return PassingData(overlapping_sample_id_set=overlapping_sample_id_set,overlapping_sites_set=overlapping_sites_set)
def readInCoordinateMap(self, coordinateMapFname=None):
    """
    Parse a liftOver coordinate-map file into a lookup dict:
        (oldChromosome, oldStart) -> list of PassingData(strand, oldRefBase,
            oldAltBase, newChromosome, newStart, newStop, newRefBase)

    2013.07.11
        querySNPID queryStrand queryChromosome queryStart queryStop
        queryRefBase queryAltBase queryAlignmentSpan queryAlignmentStart
        queryAlignmentStop newChr newRefStart newRefStop newRefBase
        targetAlignmentSpan targetAlignmentStart targetAlignmentStop
    """
    sys.stderr.write("Reading in the coordinate map from %s ..." % (coordinateMapFname))
    reader = MatrixFile(inputFname=coordinateMapFname)
    reader.constructColName2IndexFromHeader()
    # hoist the column lookups out of the row loop
    colOf = reader.getColIndexGivenColHeader
    idxOldChromosome = colOf("queryChromosome")
    idxOldStart = colOf("queryStart")
    idxStrand = colOf("queryStrand")
    idxOldRefBase = colOf("queryRefBase")
    idxOldAltBase = colOf("queryAltBase")
    idxNewChromosome = colOf("newChr")
    idxNewStart = colOf("newRefStart")
    idxNewStop = colOf("newRefStop")
    idxNewRefBase = colOf("newRefBase")
    oldCoordinate2newCoordinateDataLs = {}
    counter = 0
    for row in reader:
        mapEntry = PassingData(
            strand=row[idxStrand],
            oldRefBase=row[idxOldRefBase],
            oldAltBase=row[idxOldAltBase],
            newChromosome=row[idxNewChromosome],
            newStart=int(row[idxNewStart]),
            newStop=int(row[idxNewStop]),
            newRefBase=row[idxNewRefBase])
        key = (row[idxOldChromosome], int(row[idxOldStart]))
        # one old coordinate may lift over to several new coordinates
        oldCoordinate2newCoordinateDataLs.setdefault(key, []).append(mapEntry)
        counter += 1
    del reader
    sys.stderr.write("%s old coordinates with %s new coordinates.\n"
        % (len(oldCoordinate2newCoordinateDataLs), counter))
    return oldCoordinate2newCoordinateDataLs
#!/usr/bin/env python
"""
Sum the total span (in bases) of all intervals in a tab-delimited BED-like
file (gzipped or plain). Comment lines starting with '#' and blank lines
are skipped.

Usage: thisScript.py inputFname
"""
import os, sys

# Fix: the original overwrote inputFname five times with hard-coded paths
# before finally using sys.argv[1]; the dead assignments are kept only as
# commented-out examples of past inputs.
#inputFname = os.path.expanduser("~/RefGenomes/dustPlus10_M1-22XY.bed.gz")
#inputFname = os.path.expanduser("~/script/varcmp/scripts/LCR-hs37d5.bed.gz")
#inputFname = os.path.expanduser("~/RefGenomes/dust_M1-22XY.bed.gz")
#inputFname = os.path.expanduser("/illumina/scratch/CompetitiveAnalysis/CAG/Data/AnnotDB/Repeats/SegDups/genomicSuperDups_hg19.bed")
#inputFname = os.path.expanduser("~/RefGenomes/dustPlus10_M1-22XY.overlap.genomicSuperDups_hg19.merged.bed")
if len(sys.argv) < 2:
    sys.stderr.write("Usage: %s inputFname\n" % (sys.argv[0]))
    sys.exit(1)
inputFname = sys.argv[1]

sys.path.insert(0, os.path.expanduser('~/lib/python'))
sys.path.insert(0, os.path.join(os.path.expanduser('~/script')))
from pymodule import utils
from pymodule import MatrixFile

reader = MatrixFile(inputFname=inputFname, openMode='r', delimiter='\t')
span = 0
for row in reader:
    # fix: guard against blank lines/fields before indexing row[0][0]
    if not row or not row[0] or row[0][0] == '#':
        continue
    # NOTE(review): BED coordinates are 0-based half-open, so stop-start is
    # already the interval length; the +1 reproduces the original behavior
    # but overcounts by one base per interval for true BED input -- confirm
    # which convention is intended.
    subSpan = int(row[2]) - int(row[1]) + 1
    span += subSpan
reader.close()  # fix: the reader was never closed
print("span is %s \n" % (span))