コード例 #1
0
ファイル: teaser.py プロジェクト: Cibiv/Teaser
	def makeDatasetDS(self, test):
		"""Simulate a data set for `test` from a subsampled copy of its reference.

		Sampling parameters that are None (``region_len``, ``region_pad``,
		``ratio``) are filled in on ``test["sampling"]``. The reference is then
		subsampled with ``gsample.csample`` and ``test["reference_sim"]`` is
		pointed at the sampled file before ``self.simulate`` runs. Afterwards
		``mapping_comparison.sam`` is rewritten through ``gsample.ctranslate``
		(presumably translating positions back onto the full reference --
		confirm against gsample) and the temporary files are removed.

		If ``test`` has a ``coverage`` entry and ``test["read_count"]`` is None,
		the read count is derived from the sampled reference length and is
		always stored as an integer.

		NOTE(review): the temporary files are only removed when every step
		succeeds; an exception in between leaves them behind.
		"""
		self.ch(test)

		idx = gsample.index(test["reference"], self.config["fastindex_path"])

		if test["sampling"]["region_len"] is None:
			test["sampling"]["region_len"] = self.calcualteRegionLen(test)

		if test["sampling"]["region_pad"] is None:
			test["sampling"]["region_pad"] = self.calculateRegionPadding(test)

		if test["sampling"]["ratio"] is None:
			test["sampling"]["ratio"] = self.calculateSamplingRatio(idx["contig_len"])

		sampled_file, sampled_index_file = gsample.csample(
			test["reference"],
			test["sampling"]["region_len"],
			test["sampling"]["ratio"],
			test["sampling"]["region_pad"],
			self.config["fastindex_path"])

		test["reference_sim"] = sampled_file

		if "coverage" in test and test["read_count"] is None:
			sampled_bases = idx["contig_len"] * test["sampling"]["ratio"]
			test["read_count"] = int((sampled_bases / test["read_length"]) * test["coverage"])

			# int() keeps the lower bound integral as well; with true division
			# the bare quotient would turn the read count into a float.
			min_read_count = int(self.getMinimumSampledBases() / test["read_length"])
			test["read_count"] = max(test["read_count"], min_read_count)

		self.simulate(test)

		# ch() is called again after simulate(), exactly as before it.
		self.ch(test)
		unfixed_sam = "mapping_comparison_unfixed.sam"
		self.mv("mapping_comparison.sam", unfixed_sam)
		gsample.ctranslate(test["reference"], sampled_index_file, unfixed_sam, "mapping_comparison.sam", self.config["fastindex_path"])

		self.rm(sampled_file)
		self.rm(sampled_index_file)
		self.rm(unfixed_sam)
コード例 #2
0
ファイル: teaser.py プロジェクト: Cibiv/Teaser
	def makeDatasetNoDS(self, test):
		"""Simulate a data set for `test` directly from the full reference.

		When ``test["read_count"]`` is None it defaults to the reference length
		(``contig_len`` as reported by ``gsample.index``) divided by
		``test["read_length"]``, rounded down. ``test["reference_sim"]`` is set
		to the unmodified reference before ``self.ch`` and ``self.simulate``
		are called.
		"""
		if test["read_count"] is None:
			index = gsample.index(test["reference"], self.config["fastindex_path"])
			# Floor division: a read count has to be a whole number, and "/"
			# yields a float under true division.
			test["read_count"] = index["contig_len"] // test["read_length"]

		test["reference_sim"] = test["reference"]
		self.ch(test)
		self.simulate(test)
コード例 #3
0
ファイル: teaser.py プロジェクト: alxsimon/Teaser
    def makeDatasetNoDS(self, test):
        """Simulate a data set for `test` directly from the full reference.

        When ``test["read_count"]`` is None it defaults to the reference length
        (``contig_len`` as reported by ``gsample.index``) divided by
        ``test["read_length"]``, rounded down. ``test["reference_sim"]`` is set
        to the unmodified reference before ``self.ch`` and ``self.simulate``
        are called.
        """
        if test["read_count"] is None:
            index = gsample.index(test["reference"],
                                  self.config["fastindex_path"])
            # Floor division: a read count has to be a whole number, and "/"
            # yields a float under true division.
            test["read_count"] = index["contig_len"] // test["read_length"]

        test["reference_sim"] = test["reference"]
        self.ch(test)
        self.simulate(test)
コード例 #4
0
ファイル: teaser.py プロジェクト: alxsimon/Teaser
    def makeDatasetDS(self, test):
        """Simulate a data set for `test` from a subsampled copy of its reference.

        Sampling parameters that are None (``region_len``, ``region_pad``,
        ``ratio``) are filled in on ``test["sampling"]``. The reference is then
        subsampled with ``gsample.csample`` and ``test["reference_sim"]`` is
        pointed at the sampled file before ``self.simulate`` runs. Afterwards
        ``mapping_comparison.sam`` is rewritten through ``gsample.ctranslate``
        (presumably translating positions back onto the full reference --
        confirm against gsample) and the temporary files are removed.

        If ``test`` has a ``coverage`` entry and ``test["read_count"]`` is None,
        the read count is derived from the sampled reference length and is
        always stored as an integer.

        NOTE(review): the temporary files are only removed when every step
        succeeds; an exception in between leaves them behind.
        """
        self.ch(test)

        idx = gsample.index(test["reference"], self.config["fastindex_path"])

        if test["sampling"]["region_len"] is None:
            test["sampling"]["region_len"] = self.calcualteRegionLen(test)

        if test["sampling"]["region_pad"] is None:
            test["sampling"]["region_pad"] = self.calculateRegionPadding(test)

        if test["sampling"]["ratio"] is None:
            test["sampling"]["ratio"] = self.calculateSamplingRatio(
                idx["contig_len"])

        sampled_file, sampled_index_file = gsample.csample(
            test["reference"], test["sampling"]["region_len"],
            test["sampling"]["ratio"], test["sampling"]["region_pad"],
            self.config["fastindex_path"])

        test["reference_sim"] = sampled_file

        if "coverage" in test and test["read_count"] is None:
            sampled_bases = idx["contig_len"] * test["sampling"]["ratio"]
            test["read_count"] = int(
                (sampled_bases / test["read_length"]) * test["coverage"])

            # int() keeps the lower bound integral as well; with true division
            # the bare quotient would turn the read count into a float.
            min_read_count = int(
                self.getMinimumSampledBases() / test["read_length"])
            test["read_count"] = max(test["read_count"], min_read_count)

        self.simulate(test)

        # ch() is called again after simulate(), exactly as before it.
        self.ch(test)
        unfixed_sam = "mapping_comparison_unfixed.sam"
        self.mv("mapping_comparison.sam", unfixed_sam)
        gsample.ctranslate(test["reference"], sampled_index_file,
                           unfixed_sam,
                           "mapping_comparison.sam",
                           self.config["fastindex_path"])

        self.rm(sampled_file)
        self.rm(sampled_index_file)
        self.rm(unfixed_sam)
コード例 #5
0
ファイル: teaser.py プロジェクト: alxsimon/Teaser
    def importDatasetReal(self, test):
        """Import real reads for `test`, subsampling them to a target read count.

        With ``test["sampling"]["enable"]`` equal to False, all reads are
        imported through ``self.importReadFiles`` and its return value becomes
        ``test["read_count"]``.

        Otherwise the target read count is ``test["read_count"]`` or, when
        that is None, an estimate built from the reference length, the average
        length of up to the first 10000 reads of the first input file, the
        sampling ratio and ``test["coverage"]``. Every input FASTQ file is
        then passed to ``self.sampleReads`` with the resulting fraction, and
        ``test["read_count"]`` is set to the number of reads actually sampled.

        Raises:
            RuntimeError: if the first FASTQ file yields no valid read while
                the read count is being estimated, if a line count cannot be
                determined, if the input files differ in line count, or if
                the input files contain no complete read.
        """
        if test["sampling"]["enable"] == False:
            self.log("Subsampling disabled; Importing all reads")
            test["read_count"] = self.importReadFiles(test)
            self.log("Data set import successful")
            return

        if test["read_count"] is None:
            self.log(
                "No target read count given for real data test, estimating using reference and avg. read length"
            )

            # Average the read length over at most the first 10000 reads.
            fastq = sam.FASTQ(test["import_read_files"][0])
            reads_seen = 0
            length_sum = 0
            read = fastq.next_read()
            while read.valid and reads_seen < 10000:
                length_sum += len(read.seq)
                reads_seen += 1
                read = fastq.next_read()

            if reads_seen == 0:
                # Without this guard the average below divides by zero.
                self.mate.error(
                    "Teaser: Real data import: No valid reads found in '%s', cannot estimate read count."
                    % test["import_read_files"][0])
                raise RuntimeError

            avg_len = length_sum / reads_seen
            contig_len = gsample.index(
                test["reference"], self.config["fastindex_path"])["contig_len"]

            if test["sampling"]["ratio"] is None:
                sampling_ratio = self.calculateSamplingRatio(contig_len)
            else:
                sampling_ratio = test["sampling"]["ratio"]

            test["read_count"] = (contig_len /
                                  avg_len) * sampling_ratio * test["coverage"]
            test["read_count"] = max(
                test["read_count"],
                self.getMinimumSampledBases() / test["read_length"])

            self.log(
                "Reference length: %d, Sampling Ratio: %f, Estimated avg. read length: %d"
                % (contig_len, sampling_ratio, avg_len))

        self.log("Sampling reads.")

        line_counts = []
        for path in test["import_read_files"]:
            count = util.line_count(path)
            if count == -1:
                self.mate.error(
                    "Teaser: Real data import: Failed to get line count for '%s' during data set import."
                    % path)
                raise RuntimeError
            line_counts.append(count)

        for count in line_counts:
            if count != line_counts[0]:
                self.mate.error(
                    "Teaser: Real data import: FASTQ files to import have different line counts"
                )
                raise RuntimeError

        line_count = line_counts[0]
        if line_count % 4 != 0:
            self.mate.error(
                "Teaser: Real data import: FASTQ file line count is not a multiple of four. This may lead to errors."
            )

        per_file_readcount = test["read_count"]
        if test["paired"]:
            per_file_readcount /= 2

        # Each read is counted as four lines; floor division drops a trailing
        # partial record and keeps the count integral.
        import_files_readcount = line_count // 4
        if import_files_readcount == 0:
            # Nothing to sample from; the fraction below would divide by zero.
            self.mate.error(
                "Teaser: Real data import: FASTQ files to import contain no reads"
            )
            raise RuntimeError

        if import_files_readcount < per_file_readcount:
            self.mate.warning(
                "Teaser: Real data import: Tried to sample more reads than present in FASTQ files. Using all input instead."
            )
            per_file_readcount = import_files_readcount
            # Only paired input contributes two files' worth of reads.
            if test["paired"]:
                test["read_count"] = per_file_readcount * 2
            else:
                test["read_count"] = per_file_readcount

        sample_fraction = float(per_file_readcount) / import_files_readcount
        self.log("Fraction: %f, Import file readcount: %d" %
                 (sample_fraction, import_files_readcount))

        sampled_readcount = 0
        if test["paired"]:
            sampled_readcount += self.sampleReads(
                test["import_read_files"][0], test["dir"] + "/reads1.fastq",
                sample_fraction, "/1")
            sampled_readcount += self.sampleReads(
                test["import_read_files"][1], test["dir"] + "/reads2.fastq",
                sample_fraction, "/2")
        else:
            sampled_readcount += self.sampleReads(test["import_read_files"][0],
                                                  test["dir"] + "/reads.fastq",
                                                  sample_fraction)

        test["read_count"] = sampled_readcount

        self.log("Data set import successful. Total sampled reads: %d" %
                 sampled_readcount)
コード例 #6
0
ファイル: teaser.py プロジェクト: Cibiv/Teaser
	def importDatasetReal(self,test):
		"""Import real reads for `test`, subsampling them to a target read count.

		With ``test["sampling"]["enable"]`` equal to False, all reads are
		imported through ``self.importReadFiles`` and its return value becomes
		``test["read_count"]``.

		Otherwise the target read count is ``test["read_count"]`` or, when
		that is None, an estimate built from the reference length, the average
		length of up to the first 10000 reads of the first input file, the
		sampling ratio and ``test["coverage"]``. Every input FASTQ file is
		then passed to ``self.sampleReads`` with the resulting fraction, and
		``test["read_count"]`` is set to the number of reads actually sampled.

		Raises:
			RuntimeError: if the first FASTQ file yields no valid read while
				the read count is being estimated, if a line count cannot be
				determined, if the input files differ in line count, or if
				the input files contain no complete read.
		"""
		if test["sampling"]["enable"] == False:
			self.log("Subsampling disabled; Importing all reads")
			test["read_count"] = self.importReadFiles(test)
			self.log("Data set import successful")
			return

		if test["read_count"] is None:
			self.log("No target read count given for real data test, estimating using reference and avg. read length")

			# Average the read length over at most the first 10000 reads.
			fastq = sam.FASTQ(test["import_read_files"][0])
			reads_seen = 0
			length_sum = 0
			read = fastq.next_read()
			while read.valid and reads_seen < 10000:
				length_sum += len(read.seq)
				reads_seen += 1
				read = fastq.next_read()

			if reads_seen == 0:
				# Without this guard the average below divides by zero.
				self.mate.error("Teaser: Real data import: No valid reads found in '%s', cannot estimate read count."%test["import_read_files"][0])
				raise RuntimeError

			avg_len = length_sum/reads_seen
			contig_len = gsample.index(test["reference"],self.config["fastindex_path"])["contig_len"]

			if test["sampling"]["ratio"] is None:
				sampling_ratio = self.calculateSamplingRatio(contig_len)
			else:
				sampling_ratio = test["sampling"]["ratio"]

			test["read_count"] = (contig_len/avg_len) * sampling_ratio * test["coverage"]
			test["read_count"] = max(test["read_count"],self.getMinimumSampledBases() / test["read_length"])

			self.log("Reference length: %d, Sampling Ratio: %f, Estimated avg. read length: %d"%(contig_len,sampling_ratio,avg_len))

		self.log("Sampling reads.")

		line_counts = []
		for path in test["import_read_files"]:
			count = util.line_count(path)
			if count == -1:
				self.mate.error("Teaser: Real data import: Failed to get line count for '%s' during data set import."%path)
				raise RuntimeError
			line_counts.append(count)

		for count in line_counts:
			if count != line_counts[0]:
				self.mate.error("Teaser: Real data import: FASTQ files to import have different line counts")
				raise RuntimeError

		line_count = line_counts[0]
		if line_count % 4 != 0:
			self.mate.error("Teaser: Real data import: FASTQ file line count is not a multiple of four. This may lead to errors.")

		per_file_readcount = test["read_count"]
		if test["paired"]:
			per_file_readcount /= 2

		# Each read is counted as four lines; floor division drops a trailing
		# partial record and keeps the count integral.
		import_files_readcount = line_count // 4
		if import_files_readcount == 0:
			# Nothing to sample from; the fraction below would divide by zero.
			self.mate.error("Teaser: Real data import: FASTQ files to import contain no reads")
			raise RuntimeError

		if import_files_readcount < per_file_readcount:
			self.mate.warning("Teaser: Real data import: Tried to sample more reads than present in FASTQ files. Using all input instead.")
			per_file_readcount = import_files_readcount
			# Only paired input contributes two files' worth of reads.
			if test["paired"]:
				test["read_count"] = per_file_readcount * 2
			else:
				test["read_count"] = per_file_readcount

		sample_fraction = float(per_file_readcount)/import_files_readcount
		self.log("Fraction: %f, Import file readcount: %d"%(sample_fraction,import_files_readcount))

		sampled_readcount = 0
		if test["paired"]:
			sampled_readcount += self.sampleReads(test["import_read_files"][0],test["dir"]+"/reads1.fastq",sample_fraction,"/1")
			sampled_readcount += self.sampleReads(test["import_read_files"][1],test["dir"]+"/reads2.fastq",sample_fraction,"/2")
		else:
			sampled_readcount += self.sampleReads(test["import_read_files"][0],test["dir"]+"/reads.fastq",sample_fraction)

		test["read_count"] = sampled_readcount

		self.log("Data set import successful. Total sampled reads: %d"%sampled_readcount)