def run(self):
    if self.debug:
        import pdb
        pdb.set_trace()
    writer = csv.writer(open(self.outputFname, 'w'), delimiter='\t')
    writer.writerow(['#sampleID', 'chromosome', 'meanDepth', 'medianDepth'])
    for inputFname in self.inputFnameLs:
        inputFile = utils.openGzipFile(inputFname)
        delimiter = figureOutDelimiter(inputFile)
        reader = csv.reader(inputFile, delimiter=delimiter)
        header = next(reader)
        col_name2index = getColName2IndexFromHeader(header)
        intervalIDIndex = col_name2index.get("Target")
        # Only the first read group in the output is used, so don't run
        #  the DepthOfCoverageWalker over multi-read-group bam files.
        avgCoverageIndex = 4
        # The column header looks like $sampleID_mean_cvg, so strip the
        #  trailing "_mean_cvg" (9 characters).
        sampleID = header[avgCoverageIndex][:-9]
        medianCoverageIndex = 6
        for row in reader:
            intervalID = row[intervalIDIndex]
            writer.writerow([sampleID, intervalID, row[avgCoverageIndex],
                row[medianCoverageIndex]])
    del writer
    sys.stderr.write("Done.\n")
def run(self):
    """
    In case the whole figure is chopped into blocks, swap col_block_index and
    row_block_index so that the row index comes first and the column index
    second in the output filenames.
    """
    from palos.polymorphism.SNP import read_data
    from palos.utils import figureOutDelimiter, PassingData
    delimiter = figureOutDelimiter(self.input_fname)
    print(delimiter)
    header, row_label_ls1, row_label_ls2, data_matrix = read_data(
        self.input_fname, matrix_data_type=float, delimiter='\t')
    import numpy
    data_matrix = numpy.array(data_matrix)
    min_value = numpy.min(data_matrix)
    if self.min_value_non_negative and min_value < 0:
        min_value = 0
    max_value = numpy.max(data_matrix)
    font = get_font(self.font_path, font_size=self.font_size)
    Value2Color.special_value2color[-2] = self.super_value_color
    value2color_func = lambda x: Value2Color.value2HSLcolor(
        x, min_value, max_value)
    im_legend = drawContinousLegend(min_value, max_value, self.no_of_ticks,
        value2color_func, font)
    fig_fname_prefix = os.path.splitext(self.fig_fname)[0]
    if self.split_legend_and_matrix:
        im_legend.save('%s_legend.png' % fig_fname_prefix)
    no_of_rows, no_of_cols = data_matrix.shape
    passParam = PassingData(value2color_func=value2color_func,
        im_legend=im_legend, font=font,
        split_legend_and_matrix=self.split_legend_and_matrix,
        no_grid=self.no_grid)
    if no_of_cols <= self.blockColUnit:
        self._drawMatrix(data_matrix, row_label_ls1, header[2:],
            self.fig_fname, passParam)
    else:  # split into blocks
        # integer division; float division would break range() below
        no_of_col_blocks = no_of_cols // self.blockColUnit + 1
        no_of_row_blocks = no_of_rows // self.blockRowUnit + 1
        for i in range(no_of_col_blocks):
            col_start_index = i * self.blockColUnit
            col_end_index = (i + 1) * self.blockColUnit
            if col_start_index < no_of_cols:
                for j in range(no_of_row_blocks):
                    row_start_index = j * self.blockRowUnit
                    row_end_index = (j + 1) * self.blockRowUnit
                    if row_start_index < no_of_rows:
                        # row first, column 2nd
                        fig_fname = '%s_%s_%s.png' % (fig_fname_prefix, j, i)
                        self._drawMatrix(
                            data_matrix[row_start_index:row_end_index,
                                col_start_index:col_end_index],
                            row_label_ls1[row_start_index:row_end_index],
                            header[2 + col_start_index:2 + col_end_index],
                            fig_fname, passParam)
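# A minimal standalone sketch (illustrative only; names and numbers are made
# up) of the block-splitting arithmetic above. Computing a ceiling block
# count avoids the "+ 1 then bounds-check" pattern:
def iterBlocks(no_of_rows, no_of_cols, blockRowUnit, blockColUnit):
    """Yield (row_slice, col_slice) pairs covering a no_of_rows x no_of_cols matrix."""
    no_of_row_blocks = (no_of_rows + blockRowUnit - 1) // blockRowUnit
    no_of_col_blocks = (no_of_cols + blockColUnit - 1) // blockColUnit
    for i in range(no_of_col_blocks):
        for j in range(no_of_row_blocks):
            yield (slice(j * blockRowUnit, (j + 1) * blockRowUnit),
                slice(i * blockColUnit, (i + 1) * blockColUnit))

# e.g. list(iterBlocks(5, 3, 2, 2)) yields 3 x 2 = 6 blocks; slices that run
# past the matrix edge are clipped automatically by numpy indexing.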
def run(self):
    if self.debug:
        import pdb
        pdb.set_trace()
    writer = csv.writer(open(self.outputFname, 'w'), delimiter='\t')
    writer.writerow(['#sampleID', 'chromosome', 'length',
        'noOfReadsAlignedByLength', 'noOfSingletonsByLength',
        'noOfPairsOnSameContigByLength', 'meanInferInsertSize',
        'noOfPairsOnDifferentContigsByLength'])
    for inputFname in self.inputFnameLs:
        inputFile = utils.openGzipFile(inputFname)
        delimiter = figureOutDelimiter(inputFile)
        reader = csv.reader(inputFile, delimiter=delimiter)
        header = next(reader)
        col_name2index = getColName2IndexFromHeader(header)
        sampleIDIndex = col_name2index.get("readGroup")
        chromosomeIndex = col_name2index.get("firstReferenceName")
        chromosomeLengthIndex = col_name2index.get("firstReferenceLength")
        numberOfReadsIndex = col_name2index.get("numberOfReads")
        numberOfReadsAlignedIndex = col_name2index.get("numberOfReadsAligned")
        numberOfSingletonsMappedIndex = col_name2index.get(
            "numberOfSingletonsMapped")
        numberOfPairsOnSameContigIndex = col_name2index.get(
            "numberOfPairsOnSameContig")
        numberOfPairsOnDifferentContigsIndex = col_name2index.get(
            "numberOfPairsOnDifferentContigs")
        meanInsertSizeIndex = col_name2index.get("meanInsertSize")
        for row in reader:
            sampleID = row[sampleIDIndex]
            chromosome = row[chromosomeIndex]
            chromosomeLength = int(row[chromosomeLengthIndex])
            numberOfReads = float(row[numberOfReadsIndex])
            numberOfReadsAligned = float(row[numberOfReadsAlignedIndex])
            numberOfSingletonsMapped = float(
                row[numberOfSingletonsMappedIndex])
            numberOfPairsOnSameContig = float(
                row[numberOfPairsOnSameContigIndex])
            numberOfPairsOnDifferentContigs = float(
                row[numberOfPairsOnDifferentContigsIndex])
            meanInsertSize = row[meanInsertSizeIndex]
            writer.writerow([sampleID, chromosome, chromosomeLength,
                numberOfReadsAligned / chromosomeLength,
                numberOfSingletonsMapped / chromosomeLength,
                numberOfPairsOnSameContig / chromosomeLength,
                meanInsertSize,
                numberOfPairsOnDifferentContigs / chromosomeLength])
    del writer
    sys.stderr.write("Done.\n")
def run(self):
    """
    2008-5-12
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    # database connection etc.
    db = self.db_250k
    session = db.session
    session.begin()

    delimiter = figureOutDelimiter(self.inputFname, report=self.report)
    header, strain_acc_list, category_list, data_matrix = read_data(
        self.inputFname, delimiter=delimiter, matrix_data_type=int)
    if self.snp_id_type == 1:
        # 2011-2-27 Translate the db_id into chr_pos because the new
        #  StrainXSNP dataset uses db_id to identify SNPs. If the col-id is
        #  already chr_pos, it's fine.
        new_header = header[:2]
        data_matrix_col_index_to_be_kept = []
        for i in range(2, len(header)):
            snp_id = header[i]
            chr_pos = db.get_chr_pos_given_db_id2chr_pos(snp_id)
            if chr_pos is not None:
                data_matrix_col_index_to_be_kept.append(i - 2)
                new_header.append(chr_pos)
        # remove columns without a db_id from the data matrix
        data_matrix = numpy.array(data_matrix)
        data_matrix = data_matrix[:, data_matrix_col_index_to_be_kept]
        header = new_header

    if self.array_id_2nd_column:
        snpData = SNPData(header=header, strain_acc_list=strain_acc_list,
            category_list=category_list, data_matrix=data_matrix)
    else:
        snpData = SNPData(header=header, strain_acc_list=strain_acc_list,
            data_matrix=data_matrix)  # ignore category_list

    rawSnpsData_ls = SNPData2RawSnpsData_ls(snpData, need_transposeSNPData=1,
        report=self.report)
    chromosomes = [rawSnpsData.chromosome for rawSnpsData in rawSnpsData_ls]
    snpsdata.writeRawSnpsDatasToFile(self.outputFname, rawSnpsData_ls,
        chromosomes=chromosomes, deliminator=',',
        withArrayIds=self.array_id_2nd_column)
def run(self):
    """
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    inf = utils.openGzipFile(self.inputFname, mode='r')
    reader = csv.reader(inf, delimiter=figureOutDelimiter(inf))
    header = None
    for i in range(self.noOfLinesInHeader):
        if i == 0:
            header = next(reader)
        else:
            next(reader)
    if header is not None:
        colName2Index = getColName2IndexFromHeader(header)
    newHeader = ['alignmentID', 'total_base_count', 'sampled_base_count',
        'meanDepth', 'medianDepth', 'modeDepth']
    inputStatLs = []

    writer = csv.writer(utils.openGzipFile(self.outputFname, mode='w'),
        delimiter='\t')
    writer.writerow(newHeader)
    counter = 0
    real_counter = 0
    for row in reader:
        counter += 1
        if real_counter <= self.maxNumberOfSamplings:
            r = random.random()
            if r <= self.fractionToSample and real_counter <= self.maxNumberOfSamplings:
                inputStatLs.append(float(row[self.whichColumn]))
                real_counter += 1

    meanDepth = numpy.mean(inputStatLs)
    medianDepth = numpy.median(inputStatLs)
    modeDepth = scipy.stats.mode(inputStatLs)[0][0]
    outputRow = [self.alignmentID, counter, real_counter, meanDepth,
        medianDepth, modeDepth]
    writer.writerow(outputRow)
    del writer
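# A minimal sketch (illustrative only; names are made up) of the capped
# Bernoulli sampling used above: each value is kept with probability
# `fraction`, until `cap` values have been kept.
import random

def sampleValues(values, fraction=0.1, cap=1000, rng=random):
    kept = []
    for v in values:
        if len(kept) >= cap:
            break
        if rng.random() <= fraction:
            kept.append(v)
    return kept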
def get_isqID2coverage(self, seqCoverageFname, defaultCoverage=None):
    """
    2011-9-2
    """
    sys.stderr.write("Fetching sequence coverage info from %s ..." % (
        seqCoverageFname))
    reader = csv.reader(open(seqCoverageFname, 'r'),
        delimiter=figureOutDelimiter(seqCoverageFname))
    isqID2coverage = {}
    header = next(reader)
    for row in reader:
        isqID = int(row[0])
        coverage = float(row[1])
        isqID2coverage[isqID] = coverage
    sys.stderr.write("%s entries.\n" % len(isqID2coverage))
    return isqID2coverage
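# Hypothetical usage sketch (file name and IDs are made up). The input is a
# two-column delimited file: isq id, coverage. `defaultCoverage` is the
# natural fallback for ids absent from the file:
#   isqID2coverage = self.get_isqID2coverage('isq_coverage.tsv')
#   coverage = isqID2coverage.get(isq_id, defaultCoverage)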
def trioInconsistentRateFileWalker(cls, inputFname, processFunc=None,
        minNoOfTotal=100, run_type=1):
    """
    Only catch exceptions during file opening, not during file reading.
    """
    try:
        reader = csv.reader(open(inputFname),
            delimiter=figureOutDelimiter(inputFname))
        header = next(reader)
        col_name2index = getColName2IndexFromHeader(header,
            skipEmptyColumn=True)
    except:
        sys.stderr.write('Except type: %s\n' % repr(sys.exc_info()))
        import traceback
        traceback.print_exc()
        return
    inconsistent_rate_index = col_name2index.get("inconsistency")
    if run_type == 1:
        index_of_x_data = col_name2index.get("stopFrequency")
    elif run_type == 2:
        index_of_x_data = col_name2index.get("stop")
    else:
        sys.stderr.write(
            "Unsupported run_type %s in trioInconsistentRateFileWalker().\n" % (
                run_type))
        sys.exit(3)
    index_of_no_of_total = col_name2index.get("no_of_total")
    inconsistent_rate_ls = []
    x_ls = []
    for row in reader:
        # this is a classmethod, so `self` (used in the original) was
        #  undefined here; cls is the intended receiver
        if cls.samplingRate < 1 and cls.samplingRate >= 0:
            r = random.random()
            if r > cls.samplingRate:
                continue
        no_of_total = int(float(row[index_of_no_of_total]))
        if no_of_total <= minNoOfTotal:
            continue
        inconsistency = float(row[inconsistent_rate_index])
        inconsistent_rate_ls.append(inconsistency)
        x_data = float(row[index_of_x_data])
        x_ls.append(x_data)
    processFunc(x_ls, inconsistent_rate_ls)
    del reader
def __init__(self, path=None, **keywords):
    self.ad = ProcessOptions.process_function_arguments(keywords,
        self.option_default_dict, error_doc=self.__doc__,
        class_to_have_attr=self)
    if not self.path:
        self.path = path
    if self.path and self.file_handle is None:
        self.file_handle = utils.openGzipFile(self.path, mode=self.mode)
    # 2013.05.03 for easy access
    self.filename = self.path
    self.csvFile = None
    self.isRealCSV = False
    if self.mode == 'r':  # reading mode
        if self.delimiter is None:
            self.delimiter = figureOutDelimiter(self.file_handle)
        if self.delimiter == '\t' or self.delimiter == ',':
            self.csvFile = csv.reader(self.file_handle,
                delimiter=self.delimiter)
            self.isRealCSV = True
        else:
            self.csvFile = self.file_handle
            self.isRealCSV = False
    else:  # writing mode
        if not self.delimiter:
            self.delimiter = '\t'
        self.csvFile = csv.writer(self.file_handle, delimiter=self.delimiter)
        self.isRealCSV = True
        #else:
        #    self.csvFile = self.file_handle
        #    self.isRealCSV = False
    self.col_name2index = None
    self._row = None  # stores the current row being read
    # default header pattern: line begins with a letter
    self.headerPattern = re.compile(r'^[a-zA-Z]')
    # default comment pattern: line begins with #
    self.commentPattern = re.compile(r'^#')
    self.comment_row_list = []
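# Hypothetical usage sketch, assuming this is the __init__ of the MatrixFile
# wrapper used elsewhere in this section (exact option names are inferred
# from the code above, not confirmed):
#   reader = MatrixFile(path='input.tsv.gz', mode='r')  # delimiter auto-detected
#   for row in reader.csvFile:
#       ...
#   writer = MatrixFile(path='output.tsv', mode='w')  # delimiter defaults to tab
#   writer.csvFile.writerow(['col1', 'col2'])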
def run(self):
    """
    2008-9-7
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    delimiter = figureOutDelimiter(self.input_fname, report=self.report)
    header, strain_acc_list, category_list, data_matrix = read_data(
        self.input_fname, delimiter=delimiter)
    snpData = SNPData(header=header, strain_acc_list=strain_acc_list,
        category_list=category_list, data_matrix=data_matrix)
    newSnpData, allele_index2allele_ls = snpData.convert2Binary(self.report)
    if self.mapping_fname:  # output allele_index2allele_ls
        self.output_allele2index_ls(snpData, allele_index2allele_ls,
            self.mapping_fname)
    newSnpData.tofile(self.output_fname)
def outputSNPDataInNewCoordinate(self, querySNPDataFname=None,
        querySNPID2NewReferenceCoordinateLs=None,
        newSNPDataOutputFname=None, newSNPDataOutputFormat=1):
    """
    2013.07.03 added argument newSNPDataOutputFormat
    2012.10.14 split out of findSNPPositionOnNewRef()
    """
    sys.stderr.write(
        "Converting querySNPDataFname %s into individual X SNP format, format=%s ... " % (
            querySNPDataFname, newSNPDataOutputFormat))
    """
    Example input:
    Sample  Geno    SNP
    1999010 CC      cs_primer1082_247
    1999068 CC      cs_primer1082_247
    2000022 CT      cs_primer1082_247
    2000064 CT      cs_primer1082_247
    2000117 CC      cs_primer1082_247
    """
    inf = utils.openGzipFile(querySNPDataFname)
    reader = csv.reader(inf, delimiter=figureOutDelimiter(inf))
    col_name2index = getColName2IndexFromHeader(next(reader))
    sampleIndex = col_name2index.get("Sample")
    genotypeIndex = col_name2index.get("Geno")
    SNPIDIndex = col_name2index.get("SNP")
    row_id2index = {}
    row_id_ls = []
    col_id_ls = []
    col_id2index = {}
    row_col_index2genotype = {}
    for row in reader:
        sampleID = row[sampleIndex]
        genotype = row[genotypeIndex]
        querySNPID = row[SNPIDIndex]
        if querySNPID in querySNPID2NewReferenceCoordinateLs:
            newRefCoordinateLs = querySNPID2NewReferenceCoordinateLs.get(
                querySNPID)
            if len(newRefCoordinateLs) == 1:
                newRefCoordinate = newRefCoordinateLs[0]
                if newSNPDataOutputFormat == 2:
                    col_id = '%s_%s' % (newRefCoordinate.newChr,
                        newRefCoordinate.newRefStart)
                else:
                    col_id = '%s_%s_%s' % (newRefCoordinate.newChr,
                        newRefCoordinate.newRefStart,
                        newRefCoordinate.newRefStop)
                queryStrand = newRefCoordinate.queryStrand
                if col_id not in col_id2index:
                    col_id2index[col_id] = len(col_id2index)
                    col_id_ls.append(col_id)
                if sampleID not in row_id2index:
                    row_id2index[sampleID] = len(row_id2index)
                    row_id_ls.append(sampleID)
                if queryStrand == "-":
                    genotype = SNP.reverseComplement(genotype)
                row_index = row_id2index[sampleID]
                col_index = col_id2index[col_id]
                row_col_index2genotype[(row_index, col_index)] = genotype
            else:
                continue
    data_matrix = numpy.zeros([len(row_id_ls), len(col_id2index)],
        dtype=numpy.int8)
    for row_col_index, genotype in row_col_index2genotype.items():
        row_index, col_index = row_col_index[:2]
        data_matrix[row_index, col_index] = SNP.nt2number[genotype]
    sys.stderr.write("\n")
    snpData = SNP.SNPData(row_id_ls=row_id_ls, col_id_ls=col_id_ls,
        data_matrix=data_matrix)
    snpData.tofile(newSNPDataOutputFname)
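# A stand-in sketch for SNP.reverseComplement as used above (assumption:
# genotypes are short ACGT strings). Genotypes from queries mapped to the
# minus strand of the new reference are flipped, e.g. 'CT' -> 'AG':
_COMPLEMENT = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}

def reverseComplement(genotype):
    return ''.join(_COMPLEMENT[base] for base in reversed(genotype))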
def run(self):
    """
    2012.5.7 new input looks like this (tab-delimited):
        alignmentID total_base_count sampled_base_count meanDepth medianDepth modeDepth
        100 1005506 301614 70.0441756682 9.0 8.0
        27 1005506 301614 70.0441756682 9.0 8.0
    2012.4.3 each input looks like this:
        sample_id total mean granular_third_quartile granular_median granular_first_quartile %_bases_above_15
        553_2_VRC_ref_GA_vs_524 2434923137 8.25 11 9 6 4.4
        Total 2434923137 8.25 N/A N/A N/A
        554_3_Barbados_GA_vs_524 2136011136 7.23 11 8 6 3.5
        Total 2136011136 7.23 N/A N/A N/A
        ...
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    session = self.db_main.session
    session.begin()
    no_of_total_lines = 0
    for inputFname in self.inputFnameLs:
        reader = csv.reader(open(inputFname),
            delimiter=figureOutDelimiter(inputFname))
        header = next(reader)
        col_name2index = utils.getColName2IndexFromHeader(header,
            skipEmptyColumn=True)
        sample_id_index = col_name2index.get("alignmentID")
        total_base_count_index = col_name2index.get('total_base_count')
        mean_depth_index = col_name2index.get("meanDepth")
        median_depth_index = col_name2index.get("medianDepth")
        mode_depth_index = col_name2index.get("modeDepth")
        for row in reader:
            sample_id = row[sample_id_index]
            if sample_id == 'Total':  # ignore rows with this as sample id
                continue
            alignment_id = int(sample_id.split("_")[0])
            total_base_count = int(row[total_base_count_index])
            mean_depth = float(row[mean_depth_index])
            median_depth = float(row[median_depth_index])
            mode_depth = float(row[mode_depth_index])
            individual_alignment = self.db_main.queryTable(
                SunsetDB.IndividualAlignment).get(alignment_id)
            # 2012.9.17 no longer trustworthy because
            #  CalculateMedianMeanOfInputColumn skips data.
            individual_alignment.pass_qc_read_base_count = total_base_count
            individual_alignment.mean_depth = mean_depth
            individual_alignment.median_depth = median_depth
            individual_alignment.mode_depth = mode_depth
            session.add(individual_alignment)
            no_of_total_lines += 1
        del reader
    sys.stderr.write("%s alignments in total.\n" % (no_of_total_lines))

    if self.logFilename:
        logF = open(self.logFilename, 'w')
        logF.write("%s alignments in total.\n" % (no_of_total_lines))
        del logF

    if self.commit:
        session.flush()
        session.commit()
def vcftoolsOutputStatFileWalker(self, inputFname, processFunc=None,
        run_type=1, chrColumnHeader='CHR', minChrLength=1000000,
        chrLengthColumnHeader='chrLength', xColumnHeader="BIN_START",
        valueForNonPositiveYValue=-1):
    """
    2012.10.26 skip sites if chr_cumu_start is not available
    2012.10.25 only catch exceptions during file opening, not file reading
    2012.9.18 chrLengthColumnHeader could be nothing
    """
    sys.stderr.write("walking through %s ..." % (inputFname))
    counter = 0
    chr2xy_ls = self.chr2xy_ls
    try:
        inf = utils.openGzipFile(inputFname)
        delimiter = figureOutDelimiter(inf)
        sys.stderr.write(" delimiter is '%s' " % (delimiter))
        reader = csv.reader(inf, delimiter=delimiter)
        header = next(reader)
        col_name2index = getColName2IndexFromHeader(header,
            skipEmptyColumn=True)
    except:  # in case something is wrong, e.g. the file is empty
        sys.stderr.write('Except type: %s\n' % repr(sys.exc_info()))
        import traceback
        traceback.print_exc()
        print(sys.exc_info())
        return
    chr_id_index = col_name2index.get(chrColumnHeader, None)
    if chr_id_index is None:
        chr_id_index = col_name2index.get("CHROM", None)
    if chr_id_index is None:
        chr_id_index = col_name2index.get("CHR", None)
    if chr_id_index is None:
        sys.stderr.write("Error: chr_id_index is None.\n")
        sys.exit(3)
    bin_start_index = col_name2index.get(xColumnHeader, None)
    if chrLengthColumnHeader:  # could be nothing
        chrLength_index = col_name2index.get(chrLengthColumnHeader, None)
    else:
        chrLength_index = None
    if self.whichColumnHeader:
        whichColumn = col_name2index.get(self.whichColumnHeader, None)
    else:
        whichColumn = self.whichColumn
    for row in reader:
        if self.samplingRate < 1 and self.samplingRate >= 0:
            r = random.random()
            if r > self.samplingRate:
                continue
        if chrLength_index:
            chrLength = int(row[chrLength_index])
            if chrLength < minChrLength:
                continue
        chr_id = row[chr_id_index]
        bin_start = int(float(row[bin_start_index]))
        yValue = row[whichColumn]
        yValue = self.handleYValue(yValue)
        if chr_id not in chr2xy_ls:
            chr2xy_ls[chr_id] = [[], []]
        chr_cumu_start = self.chr_id2cumu_start.get(chr_id)
        if chr_cumu_start is None:  # 2012.10.26 skip sites
            sys.stderr.write(
                "Chromosome %s does not have chr_cumu_start.\n" % (chr_id))
            continue
        chr2xy_ls[chr_id][0].append(chr_cumu_start + bin_start + 1)
        chr2xy_ls[chr_id][1].append(yValue)
        counter += 1
    del reader
    inf.close()
    sys.stderr.write("%s data.\n" % (counter))
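# A minimal sketch (not the module's actual helper) of how a
# chr_id2cumu_start map like the one used above can be built, so that
# per-chromosome bin starts land on a single genome-wide x axis:
def buildChrId2CumuStart(chr_id2size):
    chr_id2cumu_start = {}
    cumu_start = 0
    for chr_id in sorted(chr_id2size):
        chr_id2cumu_start[chr_id] = cumu_start
        cumu_start += chr_id2size[chr_id]
    return chr_id2cumu_start

# e.g. buildChrId2CumuStart({'1': 100, '2': 50}) == {'1': 0, '2': 100}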
def run(self):
    """
    2008-12-02
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    delimiter = figureOutDelimiter(self.input_fname, report=self.report)
    if self.matrix_data_type_int == 2:
        matrix_data_type = float
    else:
        matrix_data_type = int
    header, strain_acc_list, category_list, data_matrix = read_data(
        self.input_fname, matrix_data_type=matrix_data_type,
        delimiter=delimiter)
    if self.array_id_2nd_column:
        snpData = SNPData(header=header, strain_acc_list=strain_acc_list,
            category_list=category_list, data_matrix=data_matrix)
    else:
        snpData = SNPData(header=header, strain_acc_list=strain_acc_list,
            data_matrix=data_matrix)  # ignore category_list
    newSnpData, allele_index2allele_ls = snpData.convert2Binary(self.report)

    if self.phenotype_fname and self.phenotype_method_id:
        header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(
            self.phenotype_fname, turn_into_integer=0)
        # Row labels follow the SNP matrix because the phenotype matrix is
        #  going to be re-ordered that way.
        phenData = SNPData(header=header_phen,
            strain_acc_list=newSnpData.strain_acc_list,
            data_matrix=data_matrix_phen)
        # tricky: pass strain_acc_list_phen, not phenData.strain_acc_list
        phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(
            newSnpData.row_id_ls, strain_acc_list_phen, phenData.data_matrix)
        phenotype_col_index = PlotGroupOfSNPs.findOutWhichPhenotypeColumn(
            phenData, set([self.phenotype_method_id]))[0]
        phenotype_label = phenData.col_id_ls[phenotype_col_index]
        phenotype_f = open('%s_%s.pheno' % (self.output_fname_prefix,
            phenotype_label.replace('/', '_')), 'w')
        for phenotype_value in phenData.data_matrix[:, phenotype_col_index]:
            # binary and non-binary phenotypes use different NA designators
            if self.phenotype_is_binary:
                if numpy.isnan(phenotype_value):
                    phenotype_value = 9
                else:
                    phenotype_value = int(phenotype_value)
            else:
                if numpy.isnan(phenotype_value):
                    phenotype_value = -100.0
            phenotype_f.write('%s\n' % phenotype_value)
        del phenotype_f

    genotype_f = open('%s.geno' % self.output_fname_prefix, 'w')
    ind_writer = csv.writer(open('%s.ind' % self.output_fname_prefix, 'w'),
        delimiter='\t')
    snp_writer = csv.writer(open('%s.snp' % self.output_fname_prefix, 'w'),
        delimiter='\t')

    # transpose it
    newSnpData = transposeSNPData(newSnpData)

    no_of_rows = len(newSnpData.data_matrix)
    no_of_cols = len(newSnpData.data_matrix[0])
    for i in range(no_of_rows):
        snp_id = newSnpData.row_id_ls[i]
        chr, pos = snp_id.split('_')
        allele1 = allele_index2allele_ls[i][0]  # major allele
        allele2 = allele_index2allele_ls[i][1]  # minor allele
        snp_writer.writerow([snp_id, chr, 0.0, pos, allele1, allele2])
        geno_line = ''
        for j in range(no_of_cols):
            if i == 0:  # write out the accessions
                ind_writer.writerow([newSnpData.col_id_ls[j], 'U', 'Case'])
            allele = newSnpData.data_matrix[i][j]
            if allele == 0:
                geno_line += '0'
            elif allele == 1:
                geno_line += '2'
            else:
                geno_line += '9'
        geno_line += '\n'
        genotype_f.write(geno_line)
    del genotype_f, ind_writer, snp_writer
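# Note on the files written above: the .geno/.ind/.snp triple follows
# EIGENSTRAT conventions. .geno holds one line per SNP with one digit per
# individual (0/2 allele counts for this binary data, 9 = missing); .ind has
# one line per individual (id, gender, label); .snp has one line per SNP
# (id, chromosome, genetic distance, position, allele1, allele2).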
def traverse(self):
    """
    """
    newHeader = []
    # key is the keyColumn; dataLs corresponds to the sum of each column
    #  from valueColumnLs
    key2dataLs = {}
    delimiter = None
    for inputFname in self.inputFnameLs:
        if not os.path.isfile(inputFname):
            if self.exitNonZeroIfAnyInputFileInexistent:
                sys.exit(3)
            else:
                continue
        reader = None
        try:
            inputFile = utils.openGzipFile(inputFname)
            delimiter = figureOutDelimiter(inputFile)
            reader = MatrixFile(file_handle=inputFile, delimiter=delimiter)
        except:
            logging.error(f'Except type: {sys.exc_info()}')
            import traceback
            traceback.print_exc()
        try:
            header = next(reader)
            self.handleNewHeader(header, newHeader, self.keyColumnLs,
                self.valueColumnLs, keyColumnSet=self.keyColumnSet)
            if self.noHeader:
                inputFile.seek(0)
                reader = MatrixFile(file_handle=inputFile,
                    delimiter=delimiter)
        except:
            logging.error(f'Except type: {sys.exc_info()}')
            import traceback
            traceback.print_exc()
        if reader is not None:
            for row in reader:
                try:
                    self.handleValueColumns(row, key2dataLs=key2dataLs,
                        keyColumnLs=self.keyColumnLs,
                        valueColumnLs=self.valueColumnLs)
                except:  # in case something is wrong, e.g. the file is empty
                    logging.error(f'Ignore this row: {row}.')
                    logging.error(f'Except type: {sys.exc_info()}')
                    import traceback
                    traceback.print_exc()
            del reader
    if self.noHeader:
        newHeader = None
    returnData = PassingData(key2dataLs=key2dataLs, delimiter=delimiter,
        header=newHeader)
    return returnData
def traverse(self):
    """
    """
    newHeader = []
    # key is the keyColumn; dataLs corresponds to the sum of each column
    #  from valueColumnLs
    key2dataLs = {}
    noOfDataColumnsFromPriorFiles = 0
    for inputFname in self.inputFnameLs:
        if not os.path.isfile(inputFname):
            if self.exitNonZeroIfAnyInputFileInexistent:
                logging.error(f'{inputFname} does not exist.')
                sys.exit(3)
            else:
                continue
        reader = None
        try:
            inputFile = utils.openGzipFile(inputFname)
            if self.inputDelimiter is None or self.inputDelimiter == '':
                self.inputDelimiter = figureOutDelimiter(inputFile)
            reader = MatrixFile(file_handle=inputFile,
                delimiter=self.inputDelimiter)
        except:
            logging.error(f'Except type: {sys.exc_info()}')
            import traceback
            traceback.print_exc()
        valueColumnLs = []
        try:
            header = next(reader)
            self.handleNewHeader(header, newHeader, self.keyColumnLs,
                valueColumnLs, keyColumnSet=self.keyColumnSet)
            if self.noHeader:
                inputFile.seek(0)
                reader = MatrixFile(file_handle=inputFile,
                    delimiter=self.inputDelimiter)
        except:  # in case something is wrong, e.g. the file is empty
            logging.error(f'Except type: {sys.exc_info()}')
            import traceback
            traceback.print_exc()
        if reader is not None and valueColumnLs:
            visitedKeySet = set()
            for row in reader:
                try:
                    self.handleValueColumns(row, key2dataLs=key2dataLs,
                        keyColumnLs=self.keyColumnLs,
                        valueColumnLs=valueColumnLs,
                        noOfDataColumnsFromPriorFiles=noOfDataColumnsFromPriorFiles,
                        visitedKeySet=visitedKeySet)
                except:
                    logging.error(f'Ignore this row: {row}.')
                    logging.error(f'Except type: {sys.exc_info()}')
                    import traceback
                    traceback.print_exc()
            del reader
            # append empty data for keys missing from the current file
            totalKeySet = set(key2dataLs.keys())
            unvisitedKeySet = totalKeySet - visitedKeySet
            for key in unvisitedKeySet:
                for i in valueColumnLs:
                    key2dataLs[key].append('')
            noOfDataColumnsFromPriorFiles += len(valueColumnLs)
    if self.noHeader:
        newHeader = None
    returnData = PassingData(key2dataLs=key2dataLs,
        delimiter=self.inputDelimiter, header=newHeader)
    return returnData
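# A minimal sketch (illustrative; names are made up) of the outer-join
# padding performed above: every key ends up with one entry per input file,
# with '' filling the files where the key was absent.
def mergeKeyedColumns(perFileDicts):
    merged = {}
    for fileIndex, key2value in enumerate(perFileDicts):
        for key, value in key2value.items():
            merged.setdefault(key, [''] * fileIndex).append(value)
        for dataLs in merged.values():
            if len(dataLs) < fileIndex + 1:
                dataLs.append('')
    return merged

# e.g. mergeKeyedColumns([{'k1': 1, 'k2': 2}, {'k1': 3}])
#   == {'k1': [1, 3], 'k2': [2, '']}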
def run(self):
    """
    2012.4.3 each input has this as its header:
        ['alignmentID', 'total_no_of_reads', 'perc_reads_mapped',
         'perc_duplicates', 'perc_paired', 'perc_properly_paired',
         'perc_both_mates_mapped', 'perc_singletons',
         'perc_mapped_to_diff_chrs']
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    session = self.db_main.session
    session.begin()
    no_of_total_lines = 0
    for inputFname in self.inputFnameLs:
        reader = csv.reader(open(inputFname),
            delimiter=figureOutDelimiter(inputFname))
        header = next(reader)
        colName2Index = utils.getColName2IndexFromHeader(header,
            skipEmptyColumn=True)
        alignment_id_index = colName2Index.get('alignmentID')
        total_no_of_reads_index = colName2Index.get('total_no_of_reads')
        perc_secondary_index = colName2Index.get("perc_secondary")
        perc_supplementary_index = colName2Index.get("perc_supplementary")
        perc_reads_mapped_index = colName2Index.get("perc_reads_mapped")
        perc_duplicates_index = colName2Index.get("perc_duplicates")
        perc_paired_index = colName2Index.get("perc_paired")
        perc_properly_paired_index = colName2Index.get("perc_properly_paired")
        perc_both_mates_mapped_index = colName2Index.get(
            "perc_both_mates_mapped")
        perc_singletons_index = colName2Index.get("perc_singletons")
        perc_mapped_to_diff_chrs_index = colName2Index.get(
            "perc_mapped_to_diff_chrs")
        perc_mapq5_mapped_to_diff_chrs_index = colName2Index.get(
            "perc_mapq5_mapped_to_diff_chrs")
        for row in reader:
            alignmentID = int(row[alignment_id_index])
            alignment = self.db_main.queryTable(
                SunsetDB.IndividualAlignment).get(alignmentID)
            alignment.perc_reads_mapped = float(row[perc_reads_mapped_index])
            alignment.perc_secondary = float(row[perc_secondary_index])
            alignment.perc_supplementary = float(
                row[perc_supplementary_index])
            alignment.perc_duplicates = float(row[perc_duplicates_index])
            alignment.perc_paired = float(row[perc_paired_index])
            alignment.perc_properly_paired = float(
                row[perc_properly_paired_index])
            alignment.perc_both_mates_mapped = float(
                row[perc_both_mates_mapped_index])
            alignment.perc_singletons = float(row[perc_singletons_index])
            alignment.perc_mapped_to_diff_chrs = float(
                row[perc_mapped_to_diff_chrs_index])
            alignment.perc_mapq5_mapped_to_diff_chrs = float(
                row[perc_mapq5_mapped_to_diff_chrs_index])
            alignment.total_no_of_reads = int(
                float(row[total_no_of_reads_index]))
            session.add(alignment)
            no_of_total_lines += 1
        del reader
    sys.stderr.write("%s alignments in total.\n" % (no_of_total_lines))

    if self.logFilename:
        logF = open(self.logFilename, 'w')
        logF.write("%s alignments in total.\n" % (no_of_total_lines))
        del logF

    if self.commit:
        session.flush()
        session.commit()