def makeDatasetDS(self, test):
    """Generate a simulated data set for *test* on a downsampled reference.

    Fills in any unset sampling parameters (region length, padding, ratio),
    subsamples the reference via gsample, simulates reads from the sampled
    reference, then translates the gold-standard alignments back into the
    coordinate system of the full reference.
    """
    self.ch(test)  # change into the test's working directory
    idx = gsample.index(test["reference"], self.config["fastindex_path"])

    # Derive any sampling parameters the test definition left unset.
    if test["sampling"]["region_len"] is None:
        # NOTE: "calcualteRegionLen" is the (misspelled) name of the helper
        # defined elsewhere in this class; keep the call in sync with it.
        test["sampling"]["region_len"] = self.calcualteRegionLen(test)
    if test["sampling"]["region_pad"] is None:
        test["sampling"]["region_pad"] = self.calculateRegionPadding(test)
    if test["sampling"]["ratio"] is None:
        test["sampling"]["ratio"] = self.calculateSamplingRatio(idx["contig_len"])

    sampled_file, sampled_index_file = gsample.csample(
        test["reference"], test["sampling"]["region_len"],
        test["sampling"]["ratio"], test["sampling"]["region_pad"],
        self.config["fastindex_path"])
    test["reference_sim"] = sampled_file

    # Estimate the read count from the requested coverage when not given.
    if "coverage" in test and test["read_count"] is None:
        test["read_count"] = int(
            ((idx["contig_len"] * test["sampling"]["ratio"]) / test["read_length"])
            * test["coverage"])
        # Never sample fewer bases than the configured minimum.
        test["read_count"] = max(test["read_count"],
                                 self.getMinimumSampledBases() / test["read_length"])

    self.simulate(test)
    self.ch(test)

    # Translate simulated alignments from sampled-reference coordinates back
    # to the original reference, then drop the temporary files.
    self.mv("mapping_comparison.sam", "mapping_comparison_unfixed.sam")
    gsample.ctranslate(test["reference"], sampled_index_file,
                       "mapping_comparison_unfixed.sam", "mapping_comparison.sam",
                       self.config["fastindex_path"])
    self.rm(sampled_file)
    self.rm(sampled_index_file)
    self.rm("mapping_comparison_unfixed.sam")
def makeDatasetNoDS(self, test):
    """Generate a simulated data set from the full (non-subsampled) reference.

    When no read count is configured, it defaults to roughly 1x coverage:
    reference length divided by read length.
    """
    if test["read_count"] is None:
        index = gsample.index(test["reference"], self.config["fastindex_path"])
        # NOTE(review): "/" truncates under Python 2 semantics, yielding an
        # integer read count — presumably intended; confirm before a Py3 port.
        test["read_count"] = index["contig_len"] / test["read_length"]
    test["reference_sim"] = test["reference"]
    self.ch(test)
    self.simulate(test)
def makeDatasetNoDS(self, test):
    """Simulate reads directly from the full reference (no downsampling).

    If the test does not specify a read count, one is derived so that the
    simulated reads cover the reference approximately once.
    """
    if test["read_count"] is None:
        ref_index = gsample.index(test["reference"], self.config["fastindex_path"])
        # ~1x coverage default; integer division under Python 2 semantics.
        test["read_count"] = ref_index["contig_len"] / test["read_length"]
    test["reference_sim"] = test["reference"]
    self.ch(test)
    self.simulate(test)
def makeDatasetDS(self, test):
    """Build a simulated data set for *test* using a subsampled reference.

    Steps: complete missing sampling parameters, downsample the reference
    with gsample, simulate reads from it, and map the resulting gold-standard
    SAM back onto full-reference coordinates.
    """
    self.ch(test)  # work inside the test's directory
    idx = gsample.index(test["reference"], self.config["fastindex_path"])
    sampling = test["sampling"]  # alias; mutations update the test dict

    if sampling["region_len"] is None:
        # sic: the helper's name is misspelled where it is defined.
        sampling["region_len"] = self.calcualteRegionLen(test)
    if sampling["region_pad"] is None:
        sampling["region_pad"] = self.calculateRegionPadding(test)
    if sampling["ratio"] is None:
        sampling["ratio"] = self.calculateSamplingRatio(idx["contig_len"])

    sampled_file, sampled_index_file = gsample.csample(
        test["reference"], sampling["region_len"], sampling["ratio"],
        sampling["region_pad"], self.config["fastindex_path"])
    test["reference_sim"] = sampled_file

    # No explicit read count: derive one from coverage over the sampled bases.
    if "coverage" in test and test["read_count"] is None:
        sampled_bases = idx["contig_len"] * sampling["ratio"]
        test["read_count"] = int(
            (sampled_bases / test["read_length"]) * test["coverage"])
        # Enforce the configured lower bound on sampled bases.
        test["read_count"] = max(
            test["read_count"],
            self.getMinimumSampledBases() / test["read_length"])

    self.simulate(test)
    self.ch(test)

    # Rewrite gold-standard positions from sampled to original coordinates,
    # then clean up the intermediates.
    self.mv("mapping_comparison.sam", "mapping_comparison_unfixed.sam")
    gsample.ctranslate(
        test["reference"], sampled_index_file,
        "mapping_comparison_unfixed.sam", "mapping_comparison.sam",
        self.config["fastindex_path"])
    self.rm(sampled_file)
    self.rm(sampled_index_file)
    self.rm("mapping_comparison_unfixed.sam")
def importDatasetReal(self, test):
    """Import a real read data set for *test*, optionally subsampling it.

    With sampling disabled, all reads are imported verbatim. Otherwise a
    target read count is estimated (when unset) from reference length,
    average read length, sampling ratio and coverage, and the input FASTQ
    files are downsampled to approximately that many reads.

    Raises RuntimeError if an input FASTQ file cannot be line-counted or
    the files have differing line counts.
    """
    if test["sampling"]["enable"] == False:
        self.log("Subsampling disabled; Importing all reads")
        test["read_count"] = self.importReadFiles(test)
        self.log("Data set import successful")
        return

    if test["read_count"] == None:
        self.log(
            "No target read count given for real data test, estimating using reference and avg. read length"
        )
        # Estimate the average read length from (at most) the first
        # 10000 reads of the first input file.
        fastq = sam.FASTQ(test["import_read_files"][0])
        i = 0
        read = fastq.next_read()
        length_sum = 0
        while read.valid and i < 10000:
            length_sum += len(read.seq)
            i += 1
            read = fastq.next_read()
        avg_len = length_sum / i
        contig_len = gsample.index(
            test["reference"], self.config["fastindex_path"])["contig_len"]
        if test["sampling"]["ratio"] == None:
            sampling_ratio = self.calculateSamplingRatio(contig_len)
        else:
            sampling_ratio = test["sampling"]["ratio"]
        # Target count = reads needed for the requested coverage of the
        # sampled fraction of the reference.
        test["read_count"] = (contig_len / avg_len) * sampling_ratio * test["coverage"]
        # Enforce the configured minimum number of sampled bases.
        test["read_count"] = max(
            test["read_count"],
            self.getMinimumSampledBases() / test["read_length"])
        self.log(
            "Reference length: %d, Sampling Ratio: %f, Estimated avg. read length: %d"
            % (contig_len, sampling_ratio, avg_len))

    self.log("Sampling reads.")
    # All input FASTQ files must have the same number of lines (4 per read).
    line_counts = []
    for file in test["import_read_files"]:
        count = util.line_count(file)
        if count == -1:
            self.mate.error(
                "Teaser: Real data import: Failed to get line count for '%s' during data set import."
                % file)
            raise RuntimeError
        line_counts.append(count)
    for c in line_counts:
        if c != line_counts[0]:
            self.mate.error(
                "Teaser: Real data import: FASTQ files to import have different line counts"
            )
            raise RuntimeError
    line_count = line_counts[0]
    if line_count % 4 != 0:
        # Warning only: import continues with the count rounded down below.
        self.mate.error(
            "Teaser: Real data import: FASTQ file line count is not a multiple of four. This may lead to errors."
        )
    per_file_readcount = test["read_count"]
    if test["paired"]:
        per_file_readcount /= 2  # target is split across the two mate files
    line_count -= line_count % 4
    import_files_readcount = line_counts[0] / 4
    if import_files_readcount < per_file_readcount:
        self.mate.warning(
            "Teaser: Real data import: Tried to sample more reads than present in FASTQ files. Using all input instead."
        )
        per_file_readcount = import_files_readcount
        # NOTE(review): doubles even for unpaired input — harmless here since
        # read_count is overwritten with the actual sampled count below, but
        # confirm the intended nesting against the upstream source.
        test["read_count"] = per_file_readcount * 2
    sample_fraction = float(per_file_readcount) / import_files_readcount
    self.log("Fraction: %f, Import file readcount: %d" %
             (sample_fraction, import_files_readcount))
    sampled_readcount = 0
    if test["paired"]:
        # Paired-end: sample both mate files with the same fraction,
        # tagging read names with /1 and /2.
        sampled_readcount += self.sampleReads(
            test["import_read_files"][0], test["dir"] + "/reads1.fastq",
            sample_fraction, "/1")
        sampled_readcount += self.sampleReads(
            test["import_read_files"][1], test["dir"] + "/reads2.fastq",
            sample_fraction, "/2")
    else:
        sampled_readcount += self.sampleReads(test["import_read_files"][0],
                                              test["dir"] + "/reads.fastq",
                                              sample_fraction)
    # Record the number of reads actually sampled, not the target.
    test["read_count"] = sampled_readcount
    self.log("Data set import successful. Total sampled reads: %d" %
             sampled_readcount)
def importDatasetReal(self, test):
    """Import a real (non-simulated) read data set for *test*.

    If subsampling is disabled, all reads are imported verbatim. Otherwise
    a target read count is estimated (from reference length, average read
    length, sampling ratio and coverage) when not configured, and the input
    FASTQ files are randomly downsampled to roughly that many reads.

    Raises RuntimeError when an input FASTQ file cannot be line-counted or
    the files have mismatching line counts.
    """
    if test["sampling"]["enable"] == False:
        self.log("Subsampling disabled; Importing all reads")
        test["read_count"] = self.importReadFiles(test)
        self.log("Data set import successful")
        return

    if test["read_count"] is None:
        self.log("No target read count given for real data test, estimating using reference and avg. read length")
        # Average the read length over (at most) the first 10000 reads.
        fastq = sam.FASTQ(test["import_read_files"][0])
        i = 0
        read = fastq.next_read()
        length_sum = 0
        while read.valid and i < 10000:
            length_sum += len(read.seq)
            i += 1
            read = fastq.next_read()
        avg_len = length_sum / i

        contig_len = gsample.index(test["reference"], self.config["fastindex_path"])["contig_len"]
        if test["sampling"]["ratio"] is None:
            sampling_ratio = self.calculateSamplingRatio(contig_len)
        else:
            sampling_ratio = test["sampling"]["ratio"]

        # Reads needed for the requested coverage of the sampled fraction.
        test["read_count"] = (contig_len / avg_len) * sampling_ratio * test["coverage"]
        # Never sample fewer bases than the configured minimum.
        test["read_count"] = max(test["read_count"], self.getMinimumSampledBases() / test["read_length"])
        self.log("Reference length: %d, Sampling Ratio: %f, Estimated avg. read length: %d" % (contig_len, sampling_ratio, avg_len))

    self.log("Sampling reads.")

    # Sanity-check the inputs: every FASTQ file must have the same line count.
    line_counts = []
    for file in test["import_read_files"]:
        count = util.line_count(file)
        if count == -1:
            self.mate.error("Teaser: Real data import: Failed to get line count for '%s' during data set import." % file)
            raise RuntimeError
        line_counts.append(count)

    for c in line_counts:
        if c != line_counts[0]:
            self.mate.error("Teaser: Real data import: FASTQ files to import have different line counts")
            raise RuntimeError

    line_count = line_counts[0]
    if line_count % 4 != 0:
        # Fixed here: this message was a string literal broken across a
        # physical line (a syntax error as previously written).
        self.mate.error("Teaser: Real data import: FASTQ file line count is not a multiple of four. This may lead to errors.")

    per_file_readcount = test["read_count"]
    if test["paired"]:
        per_file_readcount /= 2  # split the target across the two mate files

    line_count -= line_count % 4
    import_files_readcount = line_counts[0] / 4  # 4 FASTQ lines per read
    if import_files_readcount < per_file_readcount:
        self.mate.warning("Teaser: Real data import: Tried to sample more reads than present in FASTQ files. Using all input instead.")
        per_file_readcount = import_files_readcount
        # NOTE(review): doubles even for unpaired input; harmless since
        # read_count is overwritten with the actual sampled count below.
        test["read_count"] = per_file_readcount * 2

    sample_fraction = float(per_file_readcount) / import_files_readcount
    self.log("Fraction: %f, Import file readcount: %d" % (sample_fraction, import_files_readcount))

    sampled_readcount = 0
    if test["paired"]:
        # Paired-end: sample both mates with the same fraction, tagging /1, /2.
        sampled_readcount += self.sampleReads(test["import_read_files"][0], test["dir"] + "/reads1.fastq", sample_fraction, "/1")
        sampled_readcount += self.sampleReads(test["import_read_files"][1], test["dir"] + "/reads2.fastq", sample_fraction, "/2")
    else:
        sampled_readcount += self.sampleReads(test["import_read_files"][0], test["dir"] + "/reads.fastq", sample_fraction)

    # Record what was actually sampled, not the target.
    test["read_count"] = sampled_readcount
    self.log("Data set import successful. Total sampled reads: %d" % sampled_readcount)