def run(self):
    if self.debug:
        import pdb
        pdb.set_trace()
    inconsistent_rate_ls = []
    for inputFname in self.inputFnameLs:
        if os.path.isfile(inputFname):
            try:
                reader = csv.reader(open(inputFname), delimiter=figureOutDelimiter(inputFname))
                header = reader.next()
                col_name2index = getColName2IndexFromHeader(header, skipEmptyColumn=True)
                inconsistent_rate_index = col_name2index.get("inconsistency")
                for row in reader:
                    inconsistency = float(row[inconsistent_rate_index])
                    inconsistent_rate_ls.append(inconsistency)
                del reader
            except:
                sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
                import traceback
                traceback.print_exc()
    if self.title is None:
        title = "histogram of inconsistent rate from %s refs"%(len(inconsistent_rate_ls))
    else:
        title = self.title
    if len(inconsistent_rate_ls)>10:
        medianInconsistentRate = numpy.median(inconsistent_rate_ls)
        title += " median %.4f"%(medianInconsistentRate)
    yh_matplotlib.drawHist(inconsistent_rate_ls, title=title, \
        xlabel_1D="Inconsistent Rate", xticks=None, outputFname=self.outputFname, \
        min_no_of_data_points=20, needLog=False, \
        dpi=200)
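# getColName2IndexFromHeader() and figureOutDelimiter() come from this codebase's shared
# utility module. Judging from how the former is used throughout (col_name2index.get("name")
# yields an index into each row), a minimal stand-in would look like the sketch below --
# an illustration of the assumed behavior, not the real implementation:
def getColName2IndexFromHeaderSketch(header, skipEmptyColumn=False):
    # map each column name in the header row to its 0-based column index
    col_name2index = {}
    for index, col_name in enumerate(header):
        if skipEmptyColumn and not col_name:
            continue    # optionally drop unnamed columns
        col_name2index[col_name] = index
    return col_name2index

assert getColName2IndexFromHeaderSketch(['a', '', 'b'], skipEmptyColumn=True) == {'a': 0, 'b': 2}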
def readDataMatrix(self, inputFname, minExprSumPerGene=180):
    """
    2012.5.8
    """
    sys.stderr.write("Reading the gene expression matrix from %s ..."%(inputFname))
    suffix = os.path.splitext(inputFname)[1]
    if suffix=='.gz':
        import gzip
        inf = gzip.open(inputFname, 'r')
    else:
        inf = open(inputFname, 'r')
    reader = csv.reader(inf, delimiter=figureOutDelimiter(inf))
    header = reader.next()    #first line is taken as header
    colName2Index = getColName2IndexFromHeader(header)
    data_matrix = []
    row_id_ls = []
    counter = 0
    real_counter = 0
    for row in reader:
        data_row = row[1:]
        data_row = map(float, data_row)
        exprSumPerGene = sum(data_row)
        counter += 1
        if exprSumPerGene>=minExprSumPerGene:
            real_counter += 1
            row_id_ls.append(row[0])
            data_matrix.append(data_row)
    data_matrix = numpy.array(data_matrix)
    sys.stderr.write("%s rows out of %s selected. %s rows, %s columns.\n"%(real_counter, counter, \
        len(row_id_ls), len(header)-1))
    return PassingData(row_id_ls=row_id_ls, header=header, data_matrix=data_matrix)
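# The gene filter in readDataMatrix() is just a row-sum threshold. A self-contained
# sketch of the same logic on an in-memory matrix (made-up data, not from the pipeline),
# handy for sanity-checking a minExprSumPerGene value:
import numpy

def filterRowsByRowSum(row_id_ls, row_ls, minRowSum=180):
    # keep only rows whose expression values sum to at least minRowSum
    kept_row_id_ls = []
    kept_row_ls = []
    for row_id, data_row in zip(row_id_ls, row_ls):
        if sum(data_row)>=minRowSum:
            kept_row_id_ls.append(row_id)
            kept_row_ls.append(data_row)
    return kept_row_id_ls, numpy.array(kept_row_ls)

# only geneB clears the threshold: sum(geneA)=30 < 180 <= sum(geneB)=220
assert filterRowsByRowSum(['geneA', 'geneB'], [[10.0, 20.0], [100.0, 120.0]])[0] == ['geneB']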
def trioInconsistentRateFileWalker(self, inputFname, processFunc=None, minNoOfTotal=100, run_type=1):
    """
    2011-11-2
        remove the maxDepth filter. apply afterwards through filterDataByDepth().
    2011-9-30
    """
    reader = csv.reader(open(inputFname), delimiter=figureOutDelimiter(inputFname))
    header = reader.next()
    col_name2index = getColName2IndexFromHeader(header, skipEmptyColumn=True)
    isInconsistent_index = col_name2index.get("isInconsistent")
    index_of_fa_depth = col_name2index.get("depthOfFather")
    index_of_mo_depth = col_name2index.get('depthOfMother')
    index_of_child_depth = col_name2index.get('depthOfChild')
    for row in reader:
        fa_depth = int(float(row[index_of_fa_depth]))
        mo_depth = int(float(row[index_of_mo_depth]))
        child_depth = int(float(row[index_of_child_depth]))
        isInconsistent = float(row[isInconsistent_index])    #was float(float(...)), a redundant double conversion
        #if fa_depth<=self.maxDepth and mo_depth<=self.maxDepth and child_depth<=self.maxDepth:
        self.fa_depth_ls.append(fa_depth)
        self.mo_depth_ls.append(mo_depth)
        self.child_depth_ls.append(child_depth)
        self.inconsistent_ls.append(isInconsistent)
    del reader
def run(self):
    if self.debug:
        import pdb
        pdb.set_trace()
    writer = csv.writer(open(self.outputFname, 'w'), delimiter='\t')
    writer.writerow(['#sampleID', 'chromosome', 'meanDepth', 'medianDepth'])
    for inputFname in self.inputFnameLs:
        inputFile = utils.openGzipFile(inputFname)
        delimiter = figureOutDelimiter(inputFile)
        reader = csv.reader(inputFile, delimiter=delimiter)
        header = reader.next()
        col_name2index = getColName2IndexFromHeader(header)
        intervalIDIndex = col_name2index.get("Target")
        #only the first read group among the output (so don't run the DepthOfCoverageWalker over multi-read-group bam files)
        avgCoverageIndex = 4
        sampleID = header[avgCoverageIndex][:-9]    #this column header is like $sampleID_mean_cvg, so strip the trailing "_mean_cvg" (9 characters)
        medianCoverageIndex = 6
        for row in reader:
            intervalID = row[intervalIDIndex]
            writer.writerow([sampleID, intervalID, row[avgCoverageIndex], row[medianCoverageIndex]])
    del writer
    sys.stderr.write("Done.\n")
def putHaplotypeGroupIntoDB(self, session, input_fname, tg_ecotypeid2row, max_snp_typing_error_rate, snp_id_ls):
    """
    2009-3-31
    2009-4-4
        add argument tg_ecotypeid2row
    """
    sys.stderr.write("Constructing haplotype groups ...\n")
    pattern_ecotypeid = re.compile(r'(?<=\))\d+')
    reader = csv.reader(open(input_fname), delimiter=figureOutDelimiter(input_fname))
    col_name2col_index = getColName2IndexFromHeader(reader.next())
    ecotypeid_idx = col_name2col_index['ecotypeid']
    haplo_name_idx = col_name2col_index['haplogroup']
    geographic_integrity_idx = col_name2col_index['geographic_integrity']
    filtered_SNPs_idx = col_name2col_index['filtered_SNPs']
    counter = 0
    for tg_ecotypeid, row in tg_ecotypeid2row.iteritems():
        ecotypeid = int(row[ecotypeid_idx])
        ecotypeid = tg_ecotypeid    #2009-4-4 use tg_ecotypeid instead
        haplo_name = row[haplo_name_idx]
        geographic_integrity_name = row[geographic_integrity_idx]
        filtered_SNPs = row[filtered_SNPs_idx]
        ref_ecotypeid = int(pattern_ecotypeid.search(haplo_name).group(0))
        haplo_group = StockDB.HaploGroup.query.filter_by(short_name=haplo_name).first()
        if not haplo_group:
            haplo_group = StockDB.HaploGroup(short_name=haplo_name, ref_ecotypeid=ref_ecotypeid, \
                max_snp_typing_error_rate=max_snp_typing_error_rate)
            session.save(haplo_group)
            session.flush()
        ecotype = StockDB.Ecotype.get(ecotypeid)
        haplo_group.ecotypes.append(ecotype)
        geographic_integrity = StockDB.GeographicIntegrity.query.filter_by(short_name=geographic_integrity_name).first()
        if not geographic_integrity:
            geographic_integrity = StockDB.GeographicIntegrity(short_name=geographic_integrity_name)
            session.save(geographic_integrity)
            session.flush()
        ecotype.geographic_integrity = geographic_integrity
        session.save_or_update(ecotype)    #one bit of ecotype: link the ecotypeid to tg_ecotype_id
        #deal with filtered SNPs
        for i in range(len(filtered_SNPs)):
            allele = filtered_SNPs[i]
            if allele=='_':
                continue
            fc = StockDB.FilteredCalls(ecotypeid=ecotypeid, snpid=snp_id_ls[i], allele=allele)
            session.save(fc)
            session.flush()
        counter += 1
        if counter%500==0 and self.report:
            sys.stderr.write('%s%s'%('\x08'*80, counter))
    session.flush()
    sys.stderr.write("Done.\n")
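# pattern_ecotypeid uses a lookbehind assertion: it captures the run of digits that
# immediately follows a ')' in the haplogroup short name. A runnable illustration
# (the haplogroup naming scheme shown here is an assumed example):
import re

pattern_demo = re.compile(r'(?<=\))\d+')
assert pattern_demo.search('haplogroup(ref)6909').group(0) == '6909'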
def drawLD(self, axe_LD, LD_fname, row_snp_region, col_snp_region, min_MAF, which_LD_statistic, min_gap=100000):
    """
    2008-10-03
    """
    sys.stderr.write("Drawing LD from %s ...\n"%(LD_fname))
    reader = csv.reader(open(LD_fname), delimiter='\t')
    col_name2index = getColName2IndexFromHeader(reader.next())
    counter = 0
    real_counter = 0
    xylim_data = PassingData(xlim=[-1, -1], ylim=[-1, -1])
    for row in reader:
        snp1 = row[col_name2index['snp1']].split('_')
        snp1 = tuple(map(int, snp1))
        snp2 = row[col_name2index['snp2']].split('_')
        snp2 = tuple(map(int, snp2))
        allele1_freq = float(row[col_name2index['allele1_freq']])
        allele2_freq = float(row[col_name2index['allele2_freq']])
        if allele1_freq>=min_MAF and allele2_freq>=min_MAF:    #meet the minimum minor-allele-frequency
            LD_stat = float(row[col_name2index[LD_statistic.get_name(which_LD_statistic)]])
            LD_stat = abs(LD_stat)
            fc = self.r2ToRGBColor(LD_stat)
            if snp1 in row_snp_region.chr_pos2adjacent_window and snp2 in col_snp_region.chr_pos2adjacent_window:
                if snp1[0]==snp2[0] and abs(snp1[1]-snp2[1])<=min_gap:    #too close, ignore
                    continue
                self.addOnePolygon(axe_LD, row_snp_region.chr_pos2adjacent_window[snp1], \
                    col_snp_region.chr_pos2adjacent_window[snp2], \
                    fc, xylim_data)
                real_counter += 1
            if snp2 in row_snp_region.chr_pos2adjacent_window and snp1 in col_snp_region.chr_pos2adjacent_window:
                if snp1[0]==snp2[0] and abs(snp1[1]-snp2[1])<=min_gap:    #too close, ignore
                    continue
                self.addOnePolygon(axe_LD, row_snp_region.chr_pos2adjacent_window[snp2], \
                    col_snp_region.chr_pos2adjacent_window[snp1], \
                    fc, xylim_data)
                real_counter += 1
        counter += 1
        if counter%100000==0:
            sys.stderr.write('%s\t%s'%('\x08'*100, counter))
        if counter%500000==0 and self.debug>0:
            break
    del reader
    sys.stderr.write("%s LD drawn. Done.\n"%real_counter)
    return xylim_data
def getSampleID2FamilyCount(self, inputFname):
    """
    2012.3.29
    """
    sys.stderr.write("Getting sampleID2FamilyCount from %s ..."%(inputFname))
    reader = csv.reader(open(inputFname, 'r'), delimiter=figureOutDelimiter(inputFname))
    header = reader.next()
    colName2Index = getColName2IndexFromHeader(header)
    sampleID2FamilyCount = {}
    for row in reader:
        individualID = row[colName2Index.get("individualID")]
        familyCount = int(row[colName2Index.get("familyCount")])
        sampleID2FamilyCount[individualID] = familyCount
    sys.stderr.write("%s individuals.\n"%(len(sampleID2FamilyCount)))
    return sampleID2FamilyCount
def dropRedundantEcotypes(self, input_fname, ecotypeid2tg_ecotypeid):
    """
    2009-4-10
        not used. decided to keep all of them.
    2009-4-4
        retain only one row out of duplicated ecotype rows, based on ecotypeid2tg_ecotypeid.
        it's not random: usually the one whose ecotype id equals tg_ecotypeid, unless tg_ecotypeid doesn't appear.
        if duplicated ecotypes belong to different haplotype groups, choose the one with tg_ecotypeid; otherwise random.
    """
    sys.stderr.write("Dropping redundant ecotypes ...\n")
    reader = csv.reader(open(input_fname), delimiter=figureOutDelimiter(input_fname))
    col_name2col_index = getColName2IndexFromHeader(reader.next())
    ecotypeid_idx = col_name2col_index['ecotypeid']
    haplo_name_idx = col_name2col_index['haplogroup']
    nativename_idx = col_name2col_index['nativename']
    tg_ecotypeid2row = {}
    no_of_duplicates = 0
    no_of_duplicates_with_different_haplogroups = 0
    counter = 0
    for row in reader:
        ecotypeid = int(row[ecotypeid_idx])
        haplo_name = row[haplo_name_idx]
        nativename = row[nativename_idx]
        if ecotypeid in ecotypeid2tg_ecotypeid:
            tg_ecotypeid = ecotypeid2tg_ecotypeid[ecotypeid]
            if tg_ecotypeid not in tg_ecotypeid2row:
                tg_ecotypeid2row[tg_ecotypeid] = row
            else:
                no_of_duplicates += 1
                old_row = tg_ecotypeid2row[tg_ecotypeid]
                old_ecotypeid = int(old_row[ecotypeid_idx])
                old_haplo_name = old_row[haplo_name_idx]
                old_nativename = old_row[nativename_idx]    #bugfix: fetch from old_row, not row
                if old_haplo_name!=haplo_name:
                    sys.stderr.write("ecotype %s(%s) in haplotype group %s, while duplicate %s(%s) in haplotype group %s.\n"%\
                        (ecotypeid, nativename, haplo_name, old_ecotypeid, old_nativename, old_haplo_name))
                    no_of_duplicates_with_different_haplogroups += 1
                if ecotypeid==tg_ecotypeid:    #replace if the new ecotypeid matches tg_ecotypeid, whether the haplotype group is the same or not
                    tg_ecotypeid2row[tg_ecotypeid] = row
        else:
            sys.stderr.write("Warning: ecotype %s not in ecotypeid2tg_ecotypeid.\n"%(ecotypeid))
        counter += 1
    sys.stderr.write("no_of_duplicates: %s, out of which %s encompass different haplotype groups. %s accessions in total. Done.\n"%\
        (no_of_duplicates, no_of_duplicates_with_different_haplogroups, counter))
    return tg_ecotypeid2row
def save_LD(self, session, LD_fname, call_method_id, commit=0):
    """
    2008-10-15
        adapted from DrawSNPRegion.get_LD()
    """
    sys.stderr.write("Reading in LD info from %s ...\n"%(LD_fname))
    reader = csv.reader(open(LD_fname), delimiter='\t')
    col_name2index = getColName2IndexFromHeader(reader.next())
    counter = 0
    for row in reader:
        snp1 = row[col_name2index['snp1']].split('_')
        snp1 = map(int, snp1)
        snp2 = row[col_name2index['snp2']].split('_')
        snp2 = map(int, snp2)
        allele1_freq = float(row[col_name2index['allele1_freq']])
        allele2_freq = float(row[col_name2index['allele2_freq']])
        r2 = float(row[col_name2index['r2']])
        D_prime = float(row[col_name2index['D_prime']])
        D = float(row[col_name2index['D']])
        if snp1<snp2:
            snp_pair = (snp1[0], snp1[1], snp2[0], snp2[1])
        else:
            snp_pair = (snp2[0], snp2[1], snp1[0], snp1[1])
        no_of_pairs = int(float(row[col_name2index['no_of_pairs']])/2)    #MpiLD.py outputs this doubled (haploid regarded as diploid)
        ld = Stock_250kDB.LD(snp1_maf=allele1_freq, snp2_maf=allele2_freq, d=D, d_prime=D_prime, r2=r2, no_of_pairs=no_of_pairs)
        ld.chr1 = snp_pair[0]
        ld.pos1 = snp_pair[1]
        ld.chr2 = snp_pair[2]
        ld.pos2 = snp_pair[3]
        ld.call_method_id = call_method_id
        if commit:
            session.save(ld)
            session.flush()
        counter += 1
        if counter%100000==0:
            sys.stderr.write('%s\t%s'%('\x08'*100, counter))
        if counter%1000==0 and self.debug>0:
            break
    sys.stderr.write("%s entries. Done.\n"%counter)
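# snp1/snp2 above are [chromosome, position] lists, and Python compares lists
# lexicographically, so the `if snp1<snp2` test canonicalizes every pair into
# (chr1, pos1, chr2, pos2) with the smaller SNP first. For instance:
assert [1, 500] < [2, 100]    # chromosome compared first
assert [2, 100] < [2, 300]    # position breaks ties within a chromosome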
def getScoreRankFromRBG(self, rbg, candidate_gene_set, results_directory):
    """
    2008-09-28
        rename getScoreRank to getScoreRankFromRBG
    """
    sys.stderr.write("Getting score & rank list ...")
    if results_directory:    #given a directory where all results are.
        result_fname = os.path.join(results_directory, os.path.basename(rbg.filename))
    else:
        result_fname = rbg.filename
    if not os.path.isfile(result_fname):
        sys.stderr.write("%s doesn't exist.\n"%result_fname)
        return None
    #if rbg.results_method.analysis_method_id==13:
    #    sys.stderr.write("Skip analysis_method_id=13.\n")
    #    return None
    reader = csv.reader(open(result_fname), delimiter='\t')
    col_name2index = getColName2IndexFromHeader(reader.next())
    counter = 0
    candidate_score_ls = []
    non_candidate_score_ls = []
    candidate_rank_ls = []
    non_candidate_rank_ls = []
    for row in reader:
        gene_id = int(row[col_name2index['gene_id']])
        score = float(row[col_name2index['score']])
        if gene_id in candidate_gene_set:
            candidate_score_ls.append(score)
            candidate_rank_ls.append(counter)
        else:
            non_candidate_score_ls.append(score)
            non_candidate_rank_ls.append(counter)
        counter += 1
    del reader
    analysis_method = Stock_250kDB.AnalysisMethod.get(rbg.results_method.analysis_method_id)
    score_rank_data = PassingData(candidate_score_ls=candidate_score_ls, candidate_rank_ls=candidate_rank_ls,\
        non_candidate_score_ls=non_candidate_score_ls, non_candidate_rank_ls=non_candidate_rank_ls,\
        analysis_method=analysis_method)
    sys.stderr.write("Done.\n")
    return score_rank_data
def dropRedundantEcotypes(self, input_fname, ecotypeid2tg_ecotypeid):
    """
    2009-4-4
        retain only one row out of duplicated ecotype rows, based on ecotypeid2tg_ecotypeid.
        it's not random: usually the one whose ecotype id equals tg_ecotypeid, unless tg_ecotypeid doesn't appear.
        if duplicated ecotypes belong to different haplotype groups, choose the one with tg_ecotypeid; otherwise random.
    """
    sys.stderr.write("Dropping redundant ecotypes ...\n")
    reader = csv.reader(open(input_fname), delimiter=figureOutDelimiter(input_fname))
    col_name2col_index = getColName2IndexFromHeader(reader.next())
    ecotypeid_idx = col_name2col_index['ecotypeid']
    haplo_name_idx = col_name2col_index['haplogroup']
    nativename_idx = col_name2col_index['nativename']
    tg_ecotypeid2row = {}
    no_of_duplicates = 0
    no_of_duplicates_with_different_haplogroups = 0
    counter = 0
    for row in reader:
        ecotypeid = int(row[ecotypeid_idx])
        haplo_name = row[haplo_name_idx]
        nativename = row[nativename_idx]
        if ecotypeid in ecotypeid2tg_ecotypeid:
            tg_ecotypeid = ecotypeid2tg_ecotypeid[ecotypeid]
            if tg_ecotypeid not in tg_ecotypeid2row:
                tg_ecotypeid2row[tg_ecotypeid] = row
            else:
                no_of_duplicates += 1
                old_row = tg_ecotypeid2row[tg_ecotypeid]
                old_ecotypeid = int(old_row[ecotypeid_idx])
                old_haplo_name = old_row[haplo_name_idx]
                old_nativename = old_row[nativename_idx]    #bugfix: fetch from old_row, not row
                if old_haplo_name!=haplo_name:
                    sys.stderr.write("ecotype %s(%s) in haplotype group %s, while duplicate %s(%s) in haplotype group %s.\n"%\
                        (ecotypeid, nativename, haplo_name, old_ecotypeid, old_nativename, old_haplo_name))
                    no_of_duplicates_with_different_haplogroups += 1
                if ecotypeid==tg_ecotypeid:    #replace if the new ecotypeid matches tg_ecotypeid, whether the haplotype group is the same or not
                    tg_ecotypeid2row[tg_ecotypeid] = row
        else:
            sys.stderr.write("Warning: ecotype %s not in ecotypeid2tg_ecotypeid.\n"%(ecotypeid))
        counter += 1
    sys.stderr.write("no_of_duplicates: %s, out of which %s encompass different haplotype groups. %s accessions in total. Done.\n"%\
        (no_of_duplicates, no_of_duplicates_with_different_haplogroups, counter))
    return tg_ecotypeid2row
def run(self): """ """ if self.debug: import pdb pdb.set_trace() inf = utils.openGzipFile(self.inputFname, openMode='r') reader = csv.reader(inf, delimiter=figureOutDelimiter(inf)) header = None for i in xrange(self.noOfLinesInHeader): if i==0: header = reader.next() #first line is taken as header else: reader.next() if header is not None: colName2Index = getColName2IndexFromHeader(header) newHeader = ['alignmentID', 'total_base_count', 'sampled_base_count', 'meanDepth', 'medianDepth', 'modeDepth'] inputStatLs = [] writer = csv.writer(utils.openGzipFile(self.outputFname, openMode='w'), delimiter='\t') writer.writerow(newHeader) counter = 0 real_counter = 0 for row in reader: counter += 1 if real_counter <= self.maxNumberOfSamplings: r = random.random() if r<=self.fractionToSample and real_counter<=self.maxNumberOfSamplings: inputStatLs.append(float(row[self.whichColumn])) real_counter += 1 meanDepth = numpy.mean(inputStatLs) medianDepth = numpy.median(inputStatLs) modeDepth = scipy.stats.mode(inputStatLs)[0][0] outputRow = [self.alignmentID, counter, real_counter, meanDepth, medianDepth, modeDepth] writer.writerow(outputRow) del writer
def trioInconsistentRateFileWalker(self, inputFname, processFunc=None, minNoOfTotal=100, run_type=1):
    """
    2012.10.25 only skip except during file opening, not file reading
    2011-9-30
    """
    try:
        reader = csv.reader(open(inputFname), delimiter=figureOutDelimiter(inputFname))
        header = reader.next()
        col_name2index = getColName2IndexFromHeader(header, skipEmptyColumn=True)
    except:
        sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
        import traceback
        traceback.print_exc()
        return
    inconsistent_rate_index = col_name2index.get("inconsistency")
    if run_type==1:
        index_of_x_data = col_name2index.get("stopFrequency")
    elif run_type==2:
        index_of_x_data = col_name2index.get("stop")
    else:
        sys.stderr.write("Unsupported run_type %s in trioInconsistentRateFileWalker().\n"%(run_type))
        sys.exit(3)
    index_of_no_of_total = col_name2index.get("no_of_total")
    inconsistent_rate_ls = []
    x_ls = []
    for row in reader:
        if self.samplingRate<1 and self.samplingRate>=0:    #body uses self.samplingRate, so this is an instance method (was declared with cls)
            r = random.random()
            if r>self.samplingRate:
                continue
        no_of_total = int(float(row[index_of_no_of_total]))
        if no_of_total<=minNoOfTotal:
            continue
        inconsistency = float(row[inconsistent_rate_index])
        inconsistent_rate_ls.append(inconsistency)
        x_data = float(row[index_of_x_data])
        x_ls.append(x_data)
    processFunc(x_ls, inconsistent_rate_ls)
    del reader
def getTopGeneSet(self, rbg, results_directory, no_of_top_genes=1000):
    """
    2008-09-29
        get a set of top genes
    """
    if results_directory:    #given a directory where all results are.
        result_fname = os.path.join(results_directory, os.path.basename(rbg.filename))
    else:
        result_fname = rbg.filename
    if not os.path.isfile(result_fname):
        sys.stderr.write("%s doesn't exist.\n"%result_fname)
        return None
    reader = csv.reader(open(result_fname), delimiter='\t')
    col_name2index = getColName2IndexFromHeader(reader.next())
    counter = 0
    gene_set = Set()
    for row in reader:
        gene_id = int(row[col_name2index['gene_id']])
        gene_set.add(gene_id)
        counter += 1
        if no_of_top_genes is not None and counter>=no_of_top_genes:
            break
    del reader
    return gene_set
def outputSNPDataInNewCoordinate(self, querySNPDataFname=None, querySNPID2NewReferenceCoordinateLs=None,\
            newSNPDataOutputFname=None, newSNPDataOutputFormat=1):
    """
    2013.07.03 added argument newSNPDataOutputFormat
    2012.10.14
        split out of findSNPPositionOnNewRef()
    """
    sys.stderr.write("Converting querySNPDataFname %s into individual X SNP format, format=%s ... "%\
        (querySNPDataFname, newSNPDataOutputFormat))
    """
    Example input (querySNPDataFname):
    Sample	Geno	SNP
    1999010	CC	cs_primer1082_247
    1999068	CC	cs_primer1082_247
    2000022	CT	cs_primer1082_247
    2000064	CT	cs_primer1082_247
    2000117	CC	cs_primer1082_247
    """
    inf = utils.openGzipFile(querySNPDataFname)
    reader = csv.reader(inf, delimiter=figureOutDelimiter(inf))
    col_name2index = getColName2IndexFromHeader(reader.next())
    sampleIndex = col_name2index.get("Sample")
    genotypeIndex = col_name2index.get("Geno")
    SNPIDIndex = col_name2index.get("SNP")
    row_id2index = {}
    row_id_ls = []
    col_id_ls = []
    col_id2index = {}
    row_col_index2genotype = {}
    for row in reader:
        sampleID = row[sampleIndex]
        genotype = row[genotypeIndex]
        querySNPID = row[SNPIDIndex]
        if querySNPID in querySNPID2NewReferenceCoordinateLs:
            newRefCoordinateLs = querySNPID2NewReferenceCoordinateLs.get(querySNPID)
            if len(newRefCoordinateLs)==1:
                newRefCoordinate = newRefCoordinateLs[0]
                if newSNPDataOutputFormat==2:
                    col_id = '%s_%s'%(newRefCoordinate.newChr, newRefCoordinate.newRefStart)
                else:
                    col_id = '%s_%s_%s'%(newRefCoordinate.newChr, newRefCoordinate.newRefStart, newRefCoordinate.newRefStop)
                queryStrand = newRefCoordinate.queryStrand
                if col_id not in col_id2index:
                    col_id2index[col_id] = len(col_id2index)
                    col_id_ls.append(col_id)
                if sampleID not in row_id2index:
                    row_id2index[sampleID] = len(row_id2index)
                    row_id_ls.append(sampleID)
                if queryStrand == "-":
                    genotype = SNP.reverseComplement(genotype)
                row_index = row_id2index[sampleID]
                col_index = col_id2index[col_id]
                row_col_index2genotype[(row_index, col_index)] = genotype
            else:
                continue
    data_matrix = numpy.zeros([len(row_id_ls), len(col_id2index)], dtype=numpy.int8)
    for row_col_index, genotype in row_col_index2genotype.iteritems():
        row_index, col_index = row_col_index[:2]
        data_matrix[row_index, col_index] = SNP.nt2number[genotype]
    sys.stderr.write("\n")
    snpData = SNP.SNPData(row_id_ls=row_id_ls, col_id_ls=col_id_ls, data_matrix=data_matrix)
    snpData.tofile(newSNPDataOutputFname)
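# The row_id2index/col_id2index bookkeeping above is the usual "assign the next integer
# index on first sight" pattern for packing sparse (sample, locus, genotype) triples into
# a dense matrix. A tiny self-contained restatement:
def assignIndexOnFirstSight(id_ls):
    # give each distinct id the next 0-based index, preserving first-seen order
    id2index = {}
    ordered_id_ls = []
    for _id in id_ls:
        if _id not in id2index:
            id2index[_id] = len(id2index)
            ordered_id_ls.append(_id)
    return id2index, ordered_id_ls

assert assignIndexOnFirstSight(['s2', 's1', 's2']) == ({'s2': 0, 's1': 1}, ['s2', 's1'])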
def putQCIntoDB(self, session, input_fname, no_of_lines_to_skip, data_source_obj, cnv_type_obj, cnv_method_obj=None, \
            run_type=1, original_id=None, version=1):
    """
    2009-10-28
    """
    sys.stderr.write("Putting QC data into database ... \n")
    reader = csv.reader(open(input_fname), delimiter=figureOutDelimiter(input_fname))
    input_file_basename = os.path.basename(input_fname)
    if run_type == 7:    # 2010-6-14 need to skip first 4 lines (3 comment-lines + 1 header) for nucmer coords file
        no_of_lines_to_skip = 4
    elif run_type == 8:    # skip 2 lines (1 comment-line + 1 header) for breakdancer output from Quan Long
        no_of_lines_to_skip = 2
    col_name2index = None
    for i in range(no_of_lines_to_skip):
        header = reader.next()    # the last line to be skipped will be the header
    if col_name2index is None:
        col_name2index = getColName2IndexFromHeader(header)
    counter = 0
    for row in reader:
        if run_type == 1:
            db_obj = self.generateCNVQCCallObjFromClark2007(session, row, data_source_obj, cnv_type_obj, cnv_method_obj)
        elif run_type == 2:
            db_obj = self.generateCNVQCCallObjFromSchneebergerOssowski(session, row, data_source_obj, cnv_type_obj, cnv_method_obj)
        elif run_type == 3:
            db_obj = self.generateCNVQCCallObjFromBobSchmitzData(session, row, data_source_obj, cnv_type_obj,\
                cnv_method_obj, original_id=original_id)
        elif run_type == 4:
            db_obj = self.generateCNVQCCallObjFromLerContigDerivedCNVs(session, row, data_source_obj, cnv_type_obj, \
                cnv_method_obj=cnv_method_obj, \
                original_id=original_id, col_name2index=col_name2index)
        elif run_type == 5:
            db_obj = self.generateSequenceFragmentRefPosObjFromLerContigSpansOverCol(session, row, data_source_obj, cnv_type_obj, \
                cnv_method_obj=cnv_method_obj, \
                original_id=original_id, col_name2index=col_name2index, version=version)
        elif run_type == 6:
            db_obj = self.generateSequenceFragment2ProbeObj(session, row, data_source_obj, cnv_type_obj, \
                cnv_method_obj=cnv_method_obj, \
                original_id=original_id, col_name2index=col_name2index)
        elif run_type == 7:
            db_obj = self.generateSequenceFragmentRefPosObjFromNucmerLerContigSpansOverCol(session, row, data_source_obj, \
                cnv_type_obj, \
                cnv_method_obj=cnv_method_obj, \
                original_id=original_id, col_name2index=col_name2index,\
                version=version, comment=input_file_basename)
        elif run_type == 8:
            db_obj = self.generateCNVQCCallObjFromQuanLongBreakDancerOutput(session, row, data_source_obj, \
                cnv_type_obj, cnv_method_obj=cnv_method_obj,\
                original_id=original_id, col_name2index=col_name2index)
        elif run_type == 9:
            db_obj = self.generateCNVQCCallObjFromQuanLongCoverageDerived(session, row, data_source_obj, \
                cnv_type_obj=cnv_type_obj, \
                cnv_method_obj=cnv_method_obj, col_name2index=col_name2index)
        else:
            sys.stderr.write("Run type %s not supported.\n"%run_type)
            db_obj = None    #bugfix: ensure db_obj is defined for unsupported run types
        if db_obj:
            session.add(db_obj)
            session.flush()
        counter += 1
        if counter%5000 == 0:
            sys.stderr.write("%s%s"%('\x08'*40, counter))
    session.flush()
    sys.stderr.write("%s records. Done.\n"%counter)
def findSNPPositionOnNewRef(self, SNPFlankSequenceFname=None, blastHitResultFname=None, \
            querySNPDataFname=None,\
            querySNPID2NewRefCoordinateOutputFname=None, newSNPDataOutputFname=None, minAlignmentSpan=10):
    """
    2012.10.8
        argument minAlignmentSpan: the number of bases involved in the blast query-target alignment
    2012.8.19
        newSNPDataOutputFname will contain the individual X SNP matrix.
    """
    if SNPFlankSequenceFname:
        querySNPID2attributes = self.getQuerySNPID2attributes(SNPFlankSequenceFname=SNPFlankSequenceFname)
    else:
        querySNPID2attributes = None
    sys.stderr.write("Finding blast reference coordinates for SNPs from %s ... \n"%(blastHitResultFname))
    reader = csv.reader(open(blastHitResultFname), delimiter='\t')
    header = reader.next()
    col_name2index = getColName2IndexFromHeader(header)
    #every coordinate in blastHitResultFname is 1-based.
    """
    queryID	queryStart	queryEnd	queryLength	targetChr	targetStart	targetStop	targetLength	noOfIdentities	noOfMismatches	identityPercentage
    34804_309	1	417	417	Contig293	2551654	2552070	3001801	413	4	0.9904076738609112
    43608_166	1	574	574	Contig269	1565599	1566170	3181654	565	9	0.9843205574912892
    44412_392	2	580	580	Contig269	1776095	1776673	3181654	577	3	0.9948275862068966
    """
    queryIDIndex = col_name2index['queryID']
    queryStartIndex = col_name2index['queryStart']
    queryEndIndex = col_name2index['queryEnd']
    targetChrIndex = col_name2index['targetChr']
    targetStartIndex = col_name2index['targetStart']
    targetStopIndex = col_name2index['targetStop']
    querySNPID2NewReferenceCoordinateLs = {}
    counter = 0
    real_counter = 0
    queryIDSet = set()
    for row in reader:
        queryID = row[queryIDIndex].split()[0]    ##get rid of extra comment
        queryStart = int(row[queryStartIndex])
        queryEnd = int(row[queryEndIndex])
        targetChr = row[targetChrIndex]
        targetStart = int(row[targetStartIndex])
        targetStop = int(row[targetStopIndex])
        queryIDSet.add(queryID)
        queryAlignmentSpan = abs(queryEnd-queryStart) + 1
        targetAlignmentSpan = abs(targetStop-targetStart) + 1
        if queryAlignmentSpan == targetAlignmentSpan:
            if querySNPID2attributes and queryID in querySNPID2attributes:
                parseData = querySNPID2attributes.get(queryID)
                locusSpan = parseData.locusSpan
            else:
                parseData = self.parseQueryLocusID(queryID)
                start = parseData.start
                stop = parseData.stop
                if start is not None and stop is not None:
                    stop = int(stop)
                    start = int(start)
                    locusSpan = abs(int(stop)-start)    #length-1
                else:
                    locusSpan = None
            positionInFlank = parseData.positionInFlank
            queryRefBase = parseData.refBase
            queryAltBase = parseData.altBase
            if positionInFlank is not None and locusSpan is not None:
                positionInFlank = int(positionInFlank)
                if targetAlignmentSpan>=minAlignmentSpan and queryAlignmentSpan>=minAlignmentSpan:
                    if queryStart<queryEnd and positionInFlank>queryStart and positionInFlank<queryEnd:
                        #locus must be in the middle of queryStart and queryEnd.
                        newRefStart = targetStart + (positionInFlank - queryStart)
                        newRefStop = targetStop - (queryEnd - positionInFlank - locusSpan)
                        queryStrand = "+"
                        #query alignment start/stop are always in ascending order, regardless of strand
                        queryAlignmentStart = max(1, parseData.start - (positionInFlank-1) + (queryStart-1))
                        queryAlignmentStop = queryAlignmentStart + targetAlignmentSpan - 1
                    elif queryStart>queryEnd and positionInFlank<queryStart and positionInFlank>queryEnd:
                        #could happen. on the opposite strand. targetStart is always bigger than targetStop
                        #locus must be in the middle of queryStart and queryEnd.
                        newRefStart = targetStop - (positionInFlank - queryEnd)
                        newRefStop = targetStart + (queryStart - positionInFlank - locusSpan)
                        queryStrand = "-"
                        #query alignment start/stop are always in ascending order, regardless of strand
                        queryAlignmentStart = max(1, parseData.start - (positionInFlank-1) + (queryEnd-1))
                        queryAlignmentStop = queryAlignmentStart + targetAlignmentSpan - 1
                    else:
                        newRefStart = None
                        newRefStop = None
                    if newRefStart is not None and newRefStop is not None:
                        if queryID not in querySNPID2NewReferenceCoordinateLs:
                            querySNPID2NewReferenceCoordinateLs[queryID] = []
                        newRefCoordinate = PassingData(newChr=targetChr, newRefStart=newRefStart, newRefStop=newRefStop, \
                            queryStrand=queryStrand, newRefBase="", \
                            targetAlignmentSpan=targetAlignmentSpan,\
                            targetAlignmentStart=targetStart,\
                            targetAlignmentStop=targetStop,\
                            queryAlignmentSpan=queryAlignmentSpan,\
                            queryAlignmentStart=queryAlignmentStart,\
                            queryAlignmentStop=queryAlignmentStop,\
                            queryChromosome=parseData.chromosome, \
                            queryStart=parseData.start, queryStop=parseData.stop,\
                            queryRefBase=queryRefBase, queryAltBase=queryAltBase)
                        querySNPID2NewReferenceCoordinateLs[queryID].append(newRefCoordinate)
                        real_counter += 1
        counter += 1
    sys.stderr.write(" from %s blast results. %s/%s SNPs found blast-reference coordinates.\n"%\
        (counter, real_counter, len(queryIDSet)))
    if querySNPDataFname and newSNPDataOutputFname:
        self.outputSNPDataInNewCoordinate(querySNPDataFname=querySNPDataFname, \
            querySNPID2NewReferenceCoordinateLs=querySNPID2NewReferenceCoordinateLs, \
            newSNPDataOutputFname=newSNPDataOutputFname, \
            newSNPDataOutputFormat=self.newSNPDataOutputFormat)
    return querySNPID2NewReferenceCoordinateLs
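# The '+'-strand branch above is plain offset arithmetic: the SNP sits
# (positionInFlank - queryStart) bases into the alignment, so it lands the same number
# of bases into the target. A self-contained check with made-up coordinates:
def snpPositionOnTargetPlusStrand(targetStart, targetStop, queryStart, queryEnd, positionInFlank, locusSpan):
    # mirrors newRefStart/newRefStop in findSNPPositionOnNewRef() for the queryStart<queryEnd case
    newRefStart = targetStart + (positionInFlank - queryStart)
    newRefStop = targetStop - (queryEnd - positionInFlank - locusSpan)
    return newRefStart, newRefStop

# a 1-100 query aligned to target 1001-1100, SNP at flank position 50, single base (locusSpan=0)
assert snpPositionOnTargetPlusStrand(1001, 1100, 1, 100, 50, 0) == (1050, 1050)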
def putHaplotypeGroupIntoDB(self, session, input_fname, max_snp_typing_error_rate, snp_id_ls):
    """
    2009-4-10
        remove tg_ecotypeid2row
    2009-4-4
        add argument tg_ecotypeid2row
    2009-3-31
    """
    sys.stderr.write("Constructing haplotype groups ...\n")
    pattern_ecotypeid = re.compile(r'(?<=\))\d+')
    reader = csv.reader(open(input_fname), delimiter=figureOutDelimiter(input_fname))
    col_name2col_index = getColName2IndexFromHeader(reader.next())
    ecotypeid_idx = col_name2col_index['ecotypeid']
    haplo_name_idx = col_name2col_index['haplogroup']
    geographic_integrity_idx = col_name2col_index['geographic_integrity']
    filtered_SNPs_idx = col_name2col_index['filtered_SNPs']
    counter = 0
    #for tg_ecotypeid, row in tg_ecotypeid2row.iteritems():
    for row in reader:
        ecotypeid = int(row[ecotypeid_idx])
        #ecotypeid = tg_ecotypeid    #2009-4-4 use tg_ecotypeid instead
        haplo_name = row[haplo_name_idx]
        geographic_integrity_name = row[geographic_integrity_idx]
        filtered_SNPs = row[filtered_SNPs_idx]
        ref_ecotypeid = int(pattern_ecotypeid.search(haplo_name).group(0))
        haplo_group = StockDB.HaploGroup.query.filter_by(short_name=haplo_name).first()
        if not haplo_group:
            haplo_group = StockDB.HaploGroup(short_name=haplo_name, ref_ecotypeid=ref_ecotypeid, \
                max_snp_typing_error_rate=max_snp_typing_error_rate)
            session.save(haplo_group)
            session.flush()
        ecotype = StockDB.Ecotype.get(ecotypeid)
        haplo_group.ecotypes.append(ecotype)
        geographic_integrity = StockDB.GeographicIntegrity.query.filter_by(short_name=geographic_integrity_name).first()
        if not geographic_integrity:
            geographic_integrity = StockDB.GeographicIntegrity(short_name=geographic_integrity_name)
            session.save(geographic_integrity)
            session.flush()
        ecotype.geographic_integrity = geographic_integrity
        session.save_or_update(ecotype)    #one bit of ecotype: link the ecotypeid to tg_ecotype_id
        #deal with filtered SNPs
        for i in range(len(filtered_SNPs)):
            allele = filtered_SNPs[i]
            if allele == '_':
                continue
            fc = StockDB.FilteredCalls(ecotypeid=ecotypeid, snpid=snp_id_ls[i], allele=allele)
            session.save(fc)
            session.flush()
        counter += 1
        if counter%500 == 0 and self.report:
            sys.stderr.write('%s%s'%('\x08'*80, counter))
    session.flush()
    sys.stderr.write("Done.\n")
def vcftoolsOutputStatFileWalker(self, inputFname, processFunc=None, run_type=1, \
            chrColumnHeader='CHR', minChrLength=1000000, chrLengthColumnHeader='chrLength',\
            xColumnHeader="BIN_START", valueForNonPositiveYValue=-1):
    """
    2012.10.26 skip sites if chr_cumu_start is not available
    2012.10.25 only skip except during file opening, not file reading
    2012.9.18 chrLengthColumnHeader could be nothing
    2012.8.31 add argument valueForNonPositiveYValue
    2012.8.13 bugfix. pass inf to figureOutDelimiter
    2012.8.1
    2011-11-2
        remove the maxDepth filter. apply afterwards through filterDataByDepth().
    2011-9-30
    """
    sys.stderr.write("walking through %s ..."%(inputFname))
    counter = 0
    chr2xy_ls = self.chr2xy_ls
    try:
        inf = utils.openGzipFile(inputFname)
        delimiter = figureOutDelimiter(inf)    #2012.8.13 bugfix. pass inf to figureOutDelimiter
        sys.stderr.write(" delimiter is '%s' "%(delimiter))
        reader = csv.reader(inf, delimiter=delimiter)
        header = reader.next()
        col_name2index = getColName2IndexFromHeader(header, skipEmptyColumn=True)
    except:    #in case something wrong (i.e. file is empty)
        sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
        import traceback
        traceback.print_exc()
        print sys.exc_info()
        return
    chr_id_index = col_name2index.get(chrColumnHeader, None)
    if chr_id_index is None:
        chr_id_index = col_name2index.get("CHROM", None)
    if chr_id_index is None:
        chr_id_index = col_name2index.get("CHR", None)
    if chr_id_index is None:
        sys.stderr.write("Error chr_id_index is None.\n")
        sys.exit(3)
    bin_start_index = col_name2index.get(xColumnHeader, None)
    if chrLengthColumnHeader:    #could be nothing
        chrLength_index = col_name2index.get(chrLengthColumnHeader, None)
    else:
        chrLength_index = None
    if self.whichColumnHeader:
        whichColumn = col_name2index.get(self.whichColumnHeader, None)
    else:
        whichColumn = self.whichColumn
    for row in reader:
        if self.samplingRate<1 and self.samplingRate>=0:
            r = random.random()
            if r>self.samplingRate:
                continue
        if chrLength_index:
            chrLength = int(row[chrLength_index])
            if chrLength<minChrLength:
                continue
        chr_id = row[chr_id_index]
        bin_start = int(float(row[bin_start_index]))
        yValue = row[whichColumn]
        yValue = self.handleYValue(yValue)
        if chr_id not in chr2xy_ls:
            chr2xy_ls[chr_id] = [[],[]]
        chr_cumu_start = self.chr_id2cumu_start.get(chr_id)
        if chr_cumu_start is None:    #2012.10.26 skip sites
            sys.stderr.write("Chromosome %s does not have chr_cumu_start.\n"%(chr_id))
            continue
        chr2xy_ls[chr_id][0].append(chr_cumu_start + bin_start + 1)
        chr2xy_ls[chr_id][1].append(yValue)
        counter += 1
    del reader
    inf.close()
    sys.stderr.write("%s data.\n"%(counter))
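# The samplingRate test above thins rows with a Bernoulli draw: each row survives with
# probability samplingRate. The same idea as a standalone sketch:
import random

def thinByRate(rows, samplingRate=0.1):
    kept = []
    for row in rows:
        if samplingRate<1 and random.random()>samplingRate:
            continue    # row skipped with probability 1 - samplingRate
        kept.append(row)
    return kept

# e.g. thinByRate(range(10000), 0.1) keeps roughly 1000 rows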
def run(self):
    if self.debug:
        import pdb
        pdb.set_trace()
    #['trio_set', 'chromosome', 'pos', 'depthOfFather', 'depthOfMother', 'depthOfChild', 'isInconsistent']
    chr_pos2inconsistentData = {}
    #key is (chr, pos),
    #value is [noOfInconsistencyInTrio, noOfTotalInTrio, noOfInconsistencyInDuo, noOfTotalInDuo]
    sys.stderr.write("Reading from %s files ...\n"%(len(self.inputFnameLs)))
    for inputFname in self.inputFnameLs:
        if not os.path.isfile(inputFname):
            continue
        reader = None
        trioSetStrIndex = None
        chromosomeIndex = None
        posIndex = None
        isInconsistentIndex = None
        try:
            inputFile = utils.openGzipFile(inputFname)
            delimiter = figureOutDelimiter(inputFile)
            reader = csv.reader(inputFile, delimiter=delimiter)
            header = reader.next()
            col_name2index = getColName2IndexFromHeader(header)
            trioSetStrIndex = col_name2index.get("#trio_set")
            chromosomeIndex = col_name2index.get("chromosome")
            posIndex = col_name2index.get("pos")
            isInconsistentIndex = col_name2index.get("isInconsistent")
        except:
            sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
            import traceback
            traceback.print_exc()
        if reader is not None and isInconsistentIndex is not None:
            for row in reader:
                trio_set_str = row[trioSetStrIndex]
                chromosome = row[chromosomeIndex]
                pos = int(row[posIndex])
                isInconsistent = int(row[isInconsistentIndex])
                chr_pos = (chromosome, pos)
                if chr_pos not in chr_pos2inconsistentData:
                    chr_pos2inconsistentData[chr_pos] = [0, 0, 0, 0]
                #trio_set_ls = trio_set_str.split(',')
                if trio_set_str.find("0")==0 or trio_set_str.find(",0")!=-1:
                    #it's a duo. one parent is missing.
                    chr_pos2inconsistentData[chr_pos][2] += isInconsistent
                    chr_pos2inconsistentData[chr_pos][3] += 1
                else:
                    #it's a trio
                    chr_pos2inconsistentData[chr_pos][0] += isInconsistent
                    chr_pos2inconsistentData[chr_pos][1] += 1
    sys.stderr.write("Done.\n")
    sys.stderr.write("Outputting ...")
    writer = csv.writer(open(self.outputFname, 'w'), delimiter='\t')
    writer.writerow(['#chromosome', 'pos', 'noOfInconsistencyInTrio', 'noOfTotalInTrio', 'inconsistencyRateInTrio',\
        'noOfInconsistencyInDuo', 'noOfTotalInDuo', 'inconsistencyRateInDuo'])
    chr_pos_ls = chr_pos2inconsistentData.keys()
    chr_pos_ls.sort()
    for chr_pos in chr_pos_ls:
        chromosome, pos = chr_pos
        noOfInconsistencyInTrio, noOfTotalInTrio, noOfInconsistencyInDuo, noOfTotalInDuo = chr_pos2inconsistentData.get(chr_pos)
        if noOfTotalInTrio>0:
            inconsistencyRateInTrio = noOfInconsistencyInTrio/float(noOfTotalInTrio)
        else:
            inconsistencyRateInTrio = -1
        if noOfTotalInDuo>0:
            inconsistencyRateInDuo = noOfInconsistencyInDuo/float(noOfTotalInDuo)
        else:
            inconsistencyRateInDuo = -1
        writer.writerow([chromosome, pos, noOfInconsistencyInTrio, noOfTotalInTrio, inconsistencyRateInTrio,\
            noOfInconsistencyInDuo, noOfTotalInDuo, inconsistencyRateInDuo])
    del writer
    sys.stderr.write("Done.\n")
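# The duo-vs-trio split above is driven purely by the trio_set string: a missing parent
# is encoded as individual id 0, so "0" at the start or ",0" anywhere marks a duo.
# A self-contained restatement with made-up ids:
def isDuo(trio_set_str):
    # mirrors: trio_set_str.find("0")==0 or trio_set_str.find(",0")!=-1
    return trio_set_str.find("0")==0 or trio_set_str.find(",0")!=-1

assert isDuo("0,551,1524") == True      # father missing
assert isDuo("551,0,1524") == True      # mother missing
assert isDuo("551,1524,2023") == False  # full trio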
def predictALLSegments(self, input_fname, array_id2model_array_id_ls, array_id2model,\
            max_amplitude=-0.1, param_obj=None):
    """
    2010-7-25
        handle the situation that an array has >=3 model-arrays
    2010-7-1
    """
    sys.stderr.write('Predicting for all segments from %s ... \n'%(input_fname))
    reader = csv.reader(open(input_fname), delimiter=figureOutDelimiter(input_fname))
    header = reader.next()
    col_name2index = getColName2IndexFromHeader(header)
    median_col_index = col_name2index.get('median')
    ecotype_id_idx = col_name2index.get('ecotype_id', col_name2index.get('array_id'))
    counter = 0
    no_of_segments_in_model = 0
    no_of_predicted_deletions = 0
    for row in reader:
        counter += 1
        amplitude = float(row[col_name2index['amplitude']])
        if amplitude>max_amplitude:
            continue
        cnv_ecotype_id = int(row[ecotype_id_idx])
        array_id = int(row[col_name2index.get('array_id')])
        if array_id not in array_id2model_array_id_ls:
            continue
        no_of_probes = int(row[col_name2index['length']])
        start_probe = row[col_name2index['start_probe']].split('_')    # split chr_pos
        start_probe = map(int, start_probe)
        start_probe_id = row[col_name2index['start_probe_id']]
        stop_probe = row[col_name2index['end_probe']].split('_')
        stop_probe = map(int, stop_probe)
        stop_probe_id = row[col_name2index['end_probe_id']]
        segment_chromosome = start_probe[0]
        if start_probe[0]!=stop_probe[0]:    #spurious. on different chromosomes.
            continue
        segment_start_pos = start_probe[1]-12
        segment_stop_pos = stop_probe[1]+12
        segment_length = abs(segment_stop_pos-segment_start_pos+1)
        if median_col_index is not None:
            median_intensity = float(row[median_col_index])
        else:
            median_intensity = None
        cnv_segment_obj = PassingData(ecotype_id=cnv_ecotype_id, start_probe=start_probe, stop_probe=stop_probe,\
            no_of_probes=no_of_probes, amplitude=amplitude, segment_length=segment_length,\
            segment_chromosome=segment_chromosome, array_id=array_id,\
            start_probe_id=start_probe_id, stop_probe_id=stop_probe_id,\
            segment_start_pos=segment_start_pos, segment_stop_pos=segment_stop_pos,\
            median_intensity=median_intensity)
        model_array_id_ls = array_id2model_array_id_ls.get(array_id)
        no_of_segments_in_model += 1
        label_predicted, label_predicted2probability = self.predictOneSegmentByMultipleModels(cnv_segment_obj, \
            model_array_id_ls, array_id2model)
        if label_predicted==-1:    # predicted to be deletion.
            cnv_segment_obj.probability = label_predicted2probability[-1]
            cnv_segment_obj.comment = 'model arrays: %s'%(repr(model_array_id_ls)[1:-1])
            self.saveSegmentObj(param_obj, cnv_segment_obj)
            no_of_predicted_deletions += 1
            if no_of_predicted_deletions%5000==0:
                sys.stderr.write('%s%s\t%s\t%s'%('\x08'*100, counter, no_of_segments_in_model, no_of_predicted_deletions))
    sys.stderr.write('%s%s\t%s\t%s\n'%('\x08'*100, counter, no_of_segments_in_model, no_of_predicted_deletions))
    sys.stderr.write('%s out of %s segments were used in prediction. %s predicted deletions.\n'%\
        (no_of_segments_in_model, counter, no_of_predicted_deletions))
def run(self):
    if self.debug:
        import pdb
        pdb.set_trace()
    no_of_result1_peaks_ls = []
    no_of_result2_peaks_ls = []
    fraction_of_result1_peaks_in_result2_ls = []
    fraction_of_result2_peaks_in_result1_ls = []
    no_of_combined_peaks_ls = []
    fraction_of_overlap_in_combined_peaks_ls = []
    for inputFname in self.inputFnameLs:
        reader = csv.reader(open(inputFname), delimiter=figureOutDelimiter(inputFname))
        header = reader.next()
        col_name2index = getColName2IndexFromHeader(header, skipEmptyColumn=True)
        no_of_result1_peaks_index = col_name2index.get("no_of_result1_peaks")
        no_of_result2_peaks_index = col_name2index.get("no_of_result2_peaks")
        no_of_result1_peaks_in_result2_index = col_name2index.get("no_of_result1_peaks_in_result2")
        no_of_result2_peaks_in_result1_index = col_name2index.get("no_of_result2_peaks_in_result1")
        for row in reader:
            no_of_result1_peaks = float(row[no_of_result1_peaks_index])
            no_of_result2_peaks = float(row[no_of_result2_peaks_index])
            no_of_result1_peaks_in_result2 = float(row[no_of_result1_peaks_in_result2_index])
            no_of_result2_peaks_in_result1 = float(row[no_of_result2_peaks_in_result1_index])
            no_of_result1_peaks_ls.append(no_of_result1_peaks)
            no_of_result2_peaks_ls.append(no_of_result2_peaks)
            fraction_of_result1_peaks_in_result2_ls.append(no_of_result1_peaks_in_result2/no_of_result1_peaks)
            fraction_of_result2_peaks_in_result1_ls.append(no_of_result2_peaks_in_result1/no_of_result2_peaks)
            no_of_combined_peaks_ls.append(no_of_result1_peaks + no_of_result2_peaks)
            fraction_of_overlap_in_combined_peaks_ls.append((no_of_result1_peaks_in_result2 + no_of_result2_peaks_in_result1)/\
                (no_of_result1_peaks + no_of_result2_peaks))
        del reader
    title = "%s pairs"%(len(fraction_of_result1_peaks_in_result2_ls))
    if len(fraction_of_result1_peaks_in_result2_ls)>10:
        medianFraction = numpy.median(fraction_of_result1_peaks_in_result2_ls)
        title += " median %.3f"%(medianFraction)
    yh_matplotlib.drawHist(fraction_of_result1_peaks_in_result2_ls, title=title, \
        xlabel_1D="fraction of result1 peaks in result2", xticks=None, \
        outputFname="%s_hist_of_fraction_of_result1_peaks_in_result2.png"%self.outputFnamePrefix, \
        min_no_of_data_points=20, needLog=False, \
        dpi=200)
    title = "%s pairs"%(len(fraction_of_result2_peaks_in_result1_ls))
    if len(fraction_of_result2_peaks_in_result1_ls)>10:
        medianFraction = numpy.median(fraction_of_result2_peaks_in_result1_ls)
        title += " median %.3f"%(medianFraction)
    yh_matplotlib.drawHist(fraction_of_result2_peaks_in_result1_ls, title=title, \
        xlabel_1D="fraction of result2 peaks in result1", xticks=None, \
        outputFname="%s_hist_of_fraction_of_result2_peaks_in_result1.png"%self.outputFnamePrefix, \
        min_no_of_data_points=20, needLog=False, \
        dpi=200)
    title = "%s pairs"%(len(fraction_of_overlap_in_combined_peaks_ls))
    if len(fraction_of_overlap_in_combined_peaks_ls)>10:
        medianFraction = numpy.median(fraction_of_overlap_in_combined_peaks_ls)
        title += " median %.3f"%(medianFraction)
    yh_matplotlib.drawHist(fraction_of_overlap_in_combined_peaks_ls, title=title, \
        xlabel_1D="fraction of recurrent peaks in combined", xticks=None, \
        outputFname="%s_hist_of_fraction_of_recurrent_peaks_in_combined.png"%self.outputFnamePrefix, \
        min_no_of_data_points=20, needLog=False, \
        dpi=200)
    title = "%s results"%(len(no_of_result1_peaks_ls))
    yh_matplotlib.drawScatter(no_of_result1_peaks_ls, no_of_result2_peaks_ls, \
        fig_fname="%s_no_of_peaks_result1_vs_result2.png"%self.outputFnamePrefix, \
        title=title, xlabel='No. of peaks in result1', \
        ylabel='No. of peaks in result2', dpi=300)
    title = "%s results"%(len(no_of_result1_peaks_ls))
    yh_matplotlib.drawScatter(no_of_result1_peaks_ls, fraction_of_result1_peaks_in_result2_ls, \
        fig_fname="%s_result1_no_of_peak_vs_fraction.png"%self.outputFnamePrefix, \
        title=title, xlabel='No. of peaks in result1', \
        ylabel='Fraction found in result2', dpi=300)
    title = "%s results"%(len(no_of_result2_peaks_ls))
    yh_matplotlib.drawScatter(no_of_result2_peaks_ls, fraction_of_result2_peaks_in_result1_ls, \
        fig_fname="%s_result2_no_of_peak_vs_fraction.png"%self.outputFnamePrefix, \
        title=title, xlabel='No. of peaks in result2', \
        ylabel='Fraction found in result1', dpi=300)
    title = "%s pairs"%(len(fraction_of_result1_peaks_in_result2_ls))
    yh_matplotlib.drawScatter(fraction_of_result1_peaks_in_result2_ls, fraction_of_result2_peaks_in_result1_ls, \
        fig_fname="%s_1_fraction_in2_vs_2_fraction_in1.png"%self.outputFnamePrefix, \
        title=title, xlabel='result1 fraction found in result2', \
        ylabel='result2 fraction found in result1', dpi=300)
    title = "%s pairs"%(len(no_of_combined_peaks_ls))
    yh_matplotlib.drawScatter(no_of_combined_peaks_ls, fraction_of_overlap_in_combined_peaks_ls, \
        fig_fname="%s_combined_no_of_peak_vs_fraction.png"%self.outputFnamePrefix, \
        title=title, xlabel='No. of peaks combined', \
        ylabel='Fraction recurrent', dpi=300)