Beispiel #1
0
	def autoGeneratePrereqs(self):
		"""Create the upstream step objects and store them as attributes.

		Builds, in order: Input, Sort (plus its molecule-stats object),
		Split with its Summarize, and PairwiseAlignment with its Summarize.
		Each parameterised step is handed its own copy of
		``self.vital_parameters``.
		"""
		ws = self.workspace
		params = self.vital_parameters

		self.inpt = Input(ws)

		sorter = Sort(ws, copy(params))
		self.sort = sorter
		self.molecule_stats = sorter.getMoleculeStats()

		splitter = Split(ws, copy(params))
		self.split = splitter
		self.split_summary = Summarize(ws, splitter)

		aligner = PairwiseAlignment(ws, copy(params))
		self.pairwise_alignment = aligner
		self.pairwise_summary = Summarize(ws, aligner)
Beispiel #2
0
	def autoGeneratePrereqs(self):
		"""Create the upstream step objects and store them as attributes.

		Builds, in order: Input, Sort (plus its molecule-stats object),
		Split and PairwiseAlignment with their Summarize objects, and
		Assembly with its Summarize, Merge and GroupManifest objects.
		Each parameterised step is handed its own copy of
		``self.vital_parameters``.
		"""
		ws = self.workspace
		params = self.vital_parameters

		self.inpt = Input(ws)

		sorter = Sort(ws, copy(params))
		self.sort = sorter
		self.molecule_stats = sorter.getMoleculeStats()

		splitter = Split(ws, copy(params))
		self.split = splitter
		self.split_summary = Summarize(ws, splitter)

		aligner = PairwiseAlignment(ws, copy(params))
		self.pairwise_alignment = aligner
		self.pairwise_summary = Summarize(ws, aligner)

		assembler = Assembly(ws, copy(params))
		self.assembly = assembler
		self.assembly_summary = Summarize(ws, assembler)
		self.merge_assembly = Merge(ws, assembler)
		self.group_manifest = GroupManifest(ws, assembler)
Beispiel #3
0
class RefineB0(Step):
	"""Pipeline step that generates the shell commands for the "refineB0" stage.

	One ``bng_ref_aligner`` command is emitted per block of the Split step,
	with the merged RefineA output passed as ``-ref``.  The commands are
	grouped into scripts holding at most ``max_job_count`` commands each.
	"""

	def __init__(self, workspace, vital_parameters):
		"""Record the workspace and parameters, set option defaults, build prerequisites.

		workspace -- supplies work_dir, binaries and resources to this step
		vital_parameters -- supplies fp, fn, pval, min_molecule_len and
			min_molecule_sites, all of which end up on the command line
		"""
		self.workspace=workspace
		self.vital_parameters=vital_parameters
		self.quality=None

		self.output_prefix="refineB0"
		self.color=1
		self.aligned_site_threshold=5
		self.max_coverage=100
		self.enable_multi_mode=True
		self.internal_split_ratio=0.20
		self.internal_trimmed_coverage_ratio=0.35
		# TODO this file doesn't exist in other assemblies...
		self.cnt_file="refineB0_max_id"
		self.min_contig_len=100.0
		self.allow_no_splits=True
		self.allow_infinite_splits=False
		self.min_end_coverage=6.99
		self.scale_bias_wt=0
		self.min_likelihood_ratio=1e2
		self.max_query_alignment=4
		self.max_reference_alignment=6
		self.max_repeat_shift=2
		self.repeat_pval_ratio=0.01
		self.repeat_log_pval_ratio=0.7
		self.repeat_min_shift_ratio=0.6
		self.min_gap_flanking_sites=2
		self.output_trimmed_coverage=True
		self.normalize_trimmed_coverage=True
		self.min_gap_flanking_len=55
		self.last_non_chimeric_site_after_gap=2
		self.split_molecules_with_outliers=True
		self.outlier_pvals_per_true_positive=1e-5
		self.end_outlier_prior_probability=1e-4
		self.pval_after_refinement=1
		self.faster_refinement_resolution=""
		self.count_splits_with_largest_ids=True
		self.contig_split_version=""
		self.reduced_contig_resolution_divided_by_two=2.0
		self.overwrite_output=True
		self.hash_window=5
		self.hash_min_sites=3
		self.hash_sd_max=2.4
		self.hash_sd_rms=1.5
		self.hash_relative_error=0.05
		self.hash_offset_kb=5.0
		self.hash_max_insert_errors=1
		self.hash_max_probe_errors=1
		self.hash_max_unresolved_sites=1
		self.hash_file=""
		self.hash_threshold=""
		self.hashdelta=10
		self.reduced_molecule_resolution=1.2
		self.insert_threads=4
		self.skip_alignment_statistic_computation=True
		self.sd=0.2
		self.sf=0.2
		self.sr=0.03
		self.res=3.3
		self.regex_acceptible_output_file=".*.bnx"
		self.write_output_to_file=True
		self.write_errors_to_file=True

		# Upper bound on the number of aligner commands placed in one script.
		self.max_job_count=2
		self.autoGeneratePrereqs()

	def writeCode(self):
		"""Build the shell scripts for this step.

		Returns a list of strings.  Every string is a script that enters the
		step directory (creating it if needed) and then runs up to
		``max_job_count`` aligner commands, one per Split block.
		"""
		code="cd " + self.workspace.work_dir + "\n"
		code+="mkdir -p " + self.getStepDir() + "\n"
		code+="cd " + self.getStepDir() + "\n"
		code+="pwd\n"

		# Insertion order is the order of the flags on the command line.
		# "-i", "-mapped" and "-id" hold placeholders here and are overwritten
		# for every block in the loop below.
		param_values=OrderedDict()
		param_values["-i"]="placeholder"
		param_values["-o"]=self.output_prefix
		param_values["-maxthreads"]=str(self.getThreads())
		param_values["-ref"]=self.merge_refineA.getOutputFile()
		param_values["-T"]=str(self.vital_parameters.pval)
		param_values["-usecolor"]=str(self.color)
		param_values["-A"]=str(self.aligned_site_threshold)
		param_values["-extend"]="1"
		param_values["-MaxCov"]=str(self.max_coverage)
		if self.enable_multi_mode:
			param_values["-MultiMode"]=""
		param_values["-contigsplit"]=" ".join([str(self.internal_split_ratio), str(self.internal_trimmed_coverage_ratio), self.cnt_file])
		param_values["-MinSplitLen"]=str(self.min_contig_len)
		param_values["-nosplit"]="2" if self.allow_no_splits else ("0" if self.allow_infinite_splits else "1")
		param_values["-EndTrim"]=str(self.min_end_coverage)
		param_values["-biaswt"]=str(self.scale_bias_wt)
		param_values["-LRbias"]=str(self.min_likelihood_ratio)
		param_values["-deltaX"]=str(self.max_query_alignment)
		param_values["-deltaY"]=str(self.max_reference_alignment)
		param_values["-RepeatMask"]=" ".join([str(self.max_repeat_shift), str(self.repeat_pval_ratio)])
		param_values["-RepeatRec"]=" ".join([str(self.repeat_log_pval_ratio), str(self.repeat_min_shift_ratio)])
		param_values["-CovTrim"]=str(self.min_gap_flanking_sites)
		if self.output_trimmed_coverage:
			param_values["-ReplaceCov"]=""
		if self.normalize_trimmed_coverage:
			param_values["-TrimNorm"]=""
		param_values["-CovTrimLen"]=str(self.min_gap_flanking_len)
		param_values["-TrimNormChim"]=str(self.last_non_chimeric_site_after_gap)
		if self.split_molecules_with_outliers:
			param_values["-TrimOutlier"]=""
		param_values["-outlier"]=str(self.outlier_pvals_per_true_positive)
		param_values["-endoutlier"]=str(self.end_outlier_prior_probability)
		param_values["-endoutlierFinal"]=str(self.pval_after_refinement)
		param_values["-Mprobeval"]=str(self.faster_refinement_resolution)
		if self.count_splits_with_largest_ids:
			param_values["-splitcnt"]=""
		param_values["-splitrev"]=str(self.contig_split_version)
		param_values["-rres"]=str(self.reduced_contig_resolution_divided_by_two)
		if self.overwrite_output:
			param_values["-f"]=""
		param_values["-refine"]="0"
		param_values["-hashgen"]=" ".join([str(self.hash_window), str(self.hash_min_sites), str(self.hash_sd_max), str(self.hash_sd_rms), str(self.hash_relative_error), str(self.hash_offset_kb), str(self.hash_max_insert_errors), str(self.hash_max_probe_errors), str(self.hash_max_unresolved_sites)])
		param_values["-hash"]=" ".join([self.hash_file, str(self.hash_threshold)])
		param_values["-hashdelta"]=str(self.hashdelta)
		param_values["-mres"]=str(self.reduced_molecule_resolution)
		# The flag was previously misspelled "-insertThreasds".
		param_values["-insertThreads"]=str(self.insert_threads)
		if self.skip_alignment_statistic_computation:
			param_values["-nostat"]=""
		param_values["-maxmem"]=str(self.getMem())
		param_values["-FP"]=str(self.vital_parameters.fp)
		param_values["-FN"]=str(self.vital_parameters.fn)
		param_values["-sd"]=str(self.sd)
		param_values["-sf"]=str(self.sf)
		param_values["-sr"]=str(self.sr)
		param_values["-res"]=str(self.res)
		param_values["-grouped"]="../" + self.group_manifest.getOutputFile()
		param_values["-mapped"]="placeholder"
		param_values["-output-filter"]=self.regex_acceptible_output_file
		param_values["-id"]="placeholder"
		if self.write_output_to_file:
			param_values["-stdout"]=""
		if self.write_errors_to_file:
			param_values["-stderr"]=""
		param_values["-XmapStatRead"]="../"+self.molecule_stats.getOutputFile()
		param_values["-minlen"]=str(self.vital_parameters.min_molecule_len)
		param_values["-minsites"]=str(self.vital_parameters.min_molecule_sites)

		tmp_code=""
		cur_jobs=0
		code_parts=[]
		for block in range(1, self.split.vital_parameters.blocks+1):
			cur_jobs+=1
			param_values["-i"]=self.split.getOutputFile(block)
			param_values["-mapped"]="refineB0_id"+str(block)+"_mapped"
			param_values["-id"]=str(block)

			param_list=[self.workspace.binaries["bng_ref_aligner"]]
			for key in param_values:
				param_list.append(key)
				param_list.append(param_values[key])
			tmp_code+=" ".join(param_list) + "\n"

			# Close the current script once it holds max_job_count commands.
			if cur_jobs>=self.max_job_count:
				code_parts.append(code+tmp_code)
				tmp_code=""
				cur_jobs=0
		# Flush whatever is left over after the last full script.
		if tmp_code:
			code_parts.append(code+tmp_code)

		return code_parts

	def getStepDir(self):
		"""Return the step directory name, derived from the vital parameters."""
		return "_".join(["refineB0", "fp"+str(self.vital_parameters.fp), "fn"+str(self.vital_parameters.fn), "pval"+str(self.vital_parameters.pval), "minlen"+str(self.vital_parameters.min_molecule_len), "minsites"+str(self.vital_parameters.min_molecule_sites)])

	def getOutputFile(self):
		"""Return "<step dir>/<output prefix>.<extension>"."""
		return self.getStepDir() + "/" + self.output_prefix + "." + self.getOutputFileExtension()

	def getOutputFileExtension(self):
		"""Return the output file extension ("bnx")."""
		# An earlier duplicate of this method returned "contigs" but was always
		# shadowed by this definition, so "bnx" is the value callers have seen.
		return "bnx"

	def autoGeneratePrereqs(self):
		"""Create the upstream step objects (Input through RefineA) as attributes.

		Each parameterised step receives its own copy of the vital parameters.
		"""
		self.inpt=Input(self.workspace)
		self.sort=Sort(self.workspace, copy(self.vital_parameters))
		self.molecule_stats=self.sort.getMoleculeStats()
		self.split=Split(self.workspace, copy(self.vital_parameters))
		self.split_summary=Summarize(self.workspace, self.split)
		self.pairwise_alignment=PairwiseAlignment(self.workspace, copy(self.vital_parameters))
		self.pairwise_summary=Summarize(self.workspace, self.pairwise_alignment)
		self.assembly=Assembly(self.workspace, copy(self.vital_parameters))
		self.assembly_summary=Summarize(self.workspace, self.assembly)
		self.merge_assembly=Merge(self.workspace, self.assembly)
		self.refineA=RefineA(self.workspace, copy(self.vital_parameters))
		self.refineA_summary=Summarize(self.workspace, self.refineA)
		self.merge_refineA=Merge(self.workspace, self.refineA)
		self.group_manifest=GroupManifest(self.workspace, self.refineA)

	def getPrereq(self):
		"""Return the step that must precede this one (the group manifest)."""
		return self.group_manifest

	def getMem(self):
		"""Return the memory budget for this step (the workspace's "medium" tier)."""
		return self.workspace.resources.getMediumMemory()
	def getTime(self):
		"""Return the time budget for this step (the workspace's "large" tier)."""
		return self.workspace.resources.getLargeTime()
	def getThreads(self):
		"""Return the thread count for this step (the workspace's "medium" tier)."""
		return self.workspace.resources.getMediumThreads()
Beispiel #4
0
class PairwiseAlignment(Step):
	"""Step that generates alignment commands for every pair of Split blocks.

	For each pair of block numbers (i, j) with i <= j one
	``bng_ref_aligner`` command is emitted.  The commands are grouped into
	scripts holding at most ``max_job_count`` commands each.
	"""

	def __init__(self, workspace, vital_parameters):
		"""Record the workspace and parameters, set option defaults, build prerequisites.

		workspace -- supplies work_dir, binaries and resources to this step
		vital_parameters -- supplies fp, fn and pval; it is also passed
			(uncopied) to a Split object to obtain the block count
		"""
		self.workspace=workspace
		self.vital_parameters=vital_parameters

		self.color=1
		self.sd=0.2
		self.sf=0.2
		self.sr=0.03
		self.res=3.3
		self.min_alignment_sites=5
		self.min_alignment_score=1
		self.outlier_pval=0.0001
		self.endoutlier_pval=0
		self.repeat_max_shift=2
		self.repeat_pval_change=0.01
		self.repeat_pval_ratio=0.7
		self.repeat_min_change=0.6
		self.hash_window=5
		self.hash_min_sites=3
		self.hash_sd_max=2.2
		self.hash_sd_rms=1.2
		self.hash_relative_error=0.05
		self.hash_offset_kb=3.0
		self.hash_max_insert_errors=1
		self.hash_max_probe_errors=1
		self.hash_max_unresolved_sites=1
		self.target_resolution=1.2
		self.allow_no_splits=True
		self.allow_infinite_splits=False
		self.overwrite_output=True
		self.send_output_to_file=True
		self.send_error_to_file=True

		split=Split(self.workspace, self.vital_parameters)
		total_blocks=split.total_job_count
		# Number of pairs (i, j) with i <= j.  n*(n+1) is always even, so the
		# floor division is exact and keeps the count an integer on Python 3.
		self.total_job_count=total_blocks*(total_blocks+1)//2

		approx_mins_per_job=270.0
		self.max_job_count=self.getTime() * (60.0/approx_mins_per_job) - 1
		if self.max_job_count<1:
			self.max_job_count=1

		self.autoGeneratePrereqs()

	def _scriptPreamble(self):
		"""Return the shell lines that create and enter the step directory."""
		step_dir=self.getStepDir()
		preamble="cd " + self.workspace.work_dir + "\n"
		preamble+="mkdir -p " + step_dir + "\n"
		preamble+="cd " + step_dir + "\n"
		preamble+="pwd\n"
		return preamble

	def writeCode(self):
		"""Build the shell scripts for this step.

		Returns a list of script strings.  Pairs whose ".align" output
		already exists under the step directory are skipped; when nothing is
		left to do the list is ``["# do nothing"]``.
		"""
		code_parts=[]

		# Insertion order is the order of the flags on the command line.
		param_values=OrderedDict()
		param_values["-usecolor"]=str(self.color)
		param_values["-FP"]=str(self.vital_parameters.fp)
		param_values["-FN"]=str(self.vital_parameters.fn)
		param_values["-sd"]=str(self.sd)
		param_values["-sf"]=str(self.sf)
		param_values["-sr"]=str(self.sr)
		param_values["-res"]=str(self.res)
		param_values["-T"]=str(self.vital_parameters.pval)
		# Memory per thread, never reported as less than 1.
		maxmem=max(1, int(self.getMem()/self.getThreads()))
		param_values["-maxmem"]=str(maxmem)
		param_values["-o"]="placeholder"
		param_values["-A"]=str(self.min_alignment_sites)
		param_values["-S"]=str(self.min_alignment_score)
		param_values["-outlier"]=str(self.outlier_pval)
		param_values["-endoutlier"]=str(self.endoutlier_pval)
		param_values["-RepeatMask"]=" ".join([str(self.repeat_max_shift), str(self.repeat_pval_change)])
		param_values["-RepeatRec"]=" ".join([str(self.repeat_pval_ratio), str(self.repeat_min_change)])
		param_values["-hashgen"]=" ".join([str(self.hash_window), str(self.hash_min_sites), str(self.hash_sd_max), str(self.hash_sd_rms), str(self.hash_relative_error), str(self.hash_offset_kb), str(self.hash_max_insert_errors), str(self.hash_max_probe_errors), str(self.hash_max_unresolved_sites)])
		param_values["-hash"]=""
		param_values["-mres"]=str(self.target_resolution)
		param_values["-nosplit"]="2" if self.allow_no_splits else ("0" if self.allow_infinite_splits else "1")
		param_values["-maxthreads"]=str(self.getThreads())
		param_values["-XmapStatRead"]="../" + str(self.molecule_stats.getOutputFile())

		if self.overwrite_output:
			param_values["-f"]=""
		if self.send_output_to_file:
			param_values["-stdout"]=""
		if self.send_error_to_file:
			param_values["-stderr"]=""

		tmp_code=""
		cur_jobs=0
		total_blocks=self.split.total_job_count
		current_job=0
		for i in range(1, total_blocks+1):
			file1="../" + self.split.getOutputFile(i)
			for j in range(i, total_blocks+1):
				file2="../" + self.split.getOutputFile(j)

				# The job number advances even for skipped pairs so that output
				# names stay stable between runs.
				current_job+=1
				param_values["-o"]='pairwise%dof%d' % (current_job, self.total_job_count)
				if path.exists(self.getStepDir() + "/" + param_values["-o"] + ".align"):
					continue

				param_values["-i"]=file1
				if i==j:
					# A block aligned against itself takes a single input;
					# drop the two-input flags a previous pair may have set.
					for stale_key in ("-first", "-1", "-i "):
						param_values.pop(stale_key, None)
				else:
					param_values["-first"]=""
					param_values["-1"]=""
					# The key carries a trailing space so that it can coexist
					# with "-i" in the dict and yield a second "-i" flag.
					param_values["-i "]=file2

				param_list=[self.workspace.binaries["bng_ref_aligner"]]
				for key in param_values:
					param_list.append(key)
					param_list.append(param_values[key])
				# Guard in the script as well, in case the output appears
				# between script generation and execution.
				tmp_code+="if [ ! -e " + param_values["-o"] + ".align ]\n"
				tmp_code+="then\n"
				tmp_code+="  " + " ".join(param_list) + "\n"
				tmp_code+="fi\n"

				cur_jobs+=1
				if cur_jobs>=self.max_job_count:
					code_parts.append(self._scriptPreamble() + tmp_code)
					tmp_code=""
					cur_jobs=0
		# Flush whatever is left over after the last full script.
		if tmp_code:
			code_parts.append(self._scriptPreamble() + tmp_code)

		if not code_parts:
			return ["# do nothing"]
		return code_parts

	def getStepDir(self):
		"""Return the step directory name, derived from the input step and fp/fn/pval."""
		return "_".join(["pairwise", self.inpt.getStepDir(), "fp"+str(self.vital_parameters.fp), "fn"+str(self.vital_parameters.fn), "pval"+str(self.vital_parameters.pval)])

	def autoGeneratePrereqs(self):
		"""Create the upstream step objects (Input, Sort, Split and its summary).

		Sort and Split each receive their own copy of the vital parameters.
		"""
		self.inpt=Input(self.workspace)
		self.sort=Sort(self.workspace, copy(self.vital_parameters))
		self.molecule_stats=self.sort.getMoleculeStats()
		self.split=Split(self.workspace, copy(self.vital_parameters))
		self.split_summary=Summarize(self.workspace, self.split)

	def getPrereq(self):
		"""Return the step that must precede this one (the Split summary)."""
		return self.split_summary

	def getMem(self):
		"""Return the memory budget for this step (the workspace's "large" tier)."""
		return self.workspace.resources.getLargeMemory()
	def getTime(self):
		"""Return the time budget for this step (the workspace's "large" tier)."""
		return self.workspace.resources.getLargeTime()
	def getThreads(self):
		"""Return the thread count for this step (the workspace's "large" tier)."""
		return self.workspace.resources.getLargeThreads()

	def getOutputFile(self):
		"""Always raises: this step has no single output file."""
		raise Exception("Pairwise doesn't have an output file, per se")
	def getOutputFileExtension(self):
		"""Return the extension of the per-pair output files ("align")."""
		return "align"
Beispiel #5
0
class Assembly(GenericAssembly):
	"""Assembly step that generates a single ``bng_assembler`` command.

	The command consumes the Split summary (``-if``) and the pairwise
	alignment summary (``-af``).  The class can also compute contig quality
	statistics from the step's .cmap files once the step has completed.
	"""

	def __init__(self, workspace, vital_parameters):
		"""Record the workspace and parameters, set option defaults, build prerequisites.

		workspace -- supplies work_dir, binaries and resources to this step
		vital_parameters -- supplies fp, fn, pval, min_molecule_len and
			min_molecule_sites, all of which end up on the command line
		"""
		self.workspace=workspace
		self.vital_parameters=vital_parameters
		self.quality=None

		self.sd=0.2
		self.sf=0.2
		self.sr=0.03
		self.res=3.3
		self.color=1
		self.alignment_score_threshold=1
		self.max_rel_coverage_multiple=100
		self.max_rel_coverage_absolute=200
		self.max_rel_coverage_absolute_2=30
		self.bulge_coverage=20
		self.max_coverage=10
		self.min_coverage=10
		self.min_average_coverage=5
		self.min_maps=5
		self.min_contig_len=0.0
		self.end_trim=1
		self.chimera_pval=0.001
		self.chimera_num=3
		self.fast_bulge=1000
		self.fragile_preserve=False
		self.draftsize=1
		self.min_duplicate_len=1
		self.binary_output=True
		self.min_snr=2
		self.output_prefix="unrefined"
		self.add_alignment_filter=True
		self.alignment_filter_threshold=100
		self.alignment_filter_minlen_change=2.0
		self.alignment_filter_pval_change=0.5
		self.overwrite_output=True
		self.hide_branches=True
		self.send_output_to_file=True
		self.send_errors_to_file=True

		self.total_job_count=1

		self.autoGeneratePrereqs()

	def writeCode(self):
		"""Return a one-element list holding the shell script for this step."""
		code="cd " + self.workspace.work_dir + "\n"
		# "-p" keeps a re-run from failing on an existing directory and
		# matches the other steps.
		code+="mkdir -p " + self.getStepDir() + "\n"
		code+="cd " + self.getStepDir() + "\n"
		code+="pwd\n"

		# Insertion order is the order of the flags on the command line.
		param_values=OrderedDict()
		param_values["-if"]="../" + str(self.split_summary.getOutputFile())
		param_values["-af"]="../" + str(self.pairwise_summary.getOutputFile())
		param_values["-XmapStatRead"]="../" + str(self.molecule_stats.getOutputFile())
		param_values["-usecolor"]=str(self.color)
		param_values["-FP"]=str(self.vital_parameters.fp)
		param_values["-FN"]=str(self.vital_parameters.fn)
		param_values["-sd"]=str(self.sd)
		param_values["-sf"]=str(self.sf)
		param_values["-sr"]=str(self.sr)
		param_values["-res"]=str(self.res)
		param_values["-T"]=str(self.vital_parameters.pval)
		param_values["-S"]=str(self.alignment_score_threshold)
		param_values["-MaxRelCoverage"]=" ".join([str(self.max_rel_coverage_multiple), str(self.max_rel_coverage_absolute), str(self.max_rel_coverage_absolute_2)])
		param_values["-BulgeCoverage"]=str(self.bulge_coverage)
		param_values["-MaxCoverage"]=str(self.max_coverage)
		param_values["-MinCov"]=str(self.min_coverage)
		param_values["-MinAvCov"]=str(self.min_average_coverage)
		param_values["-MinMaps"]=str(self.min_maps)
		param_values["-MinContigLen"]=str(self.min_contig_len)
		param_values["-EndTrim"]=str(self.end_trim)
		param_values["-refine"]="0"
		param_values["-PVchim"]=" ".join([str(self.chimera_pval), str(self.chimera_num)])
		param_values["-FastBulge"]=str(self.fast_bulge)
		param_values["-FragilePreserve"]="1" if self.fragile_preserve else "0"
		# Honour the attribute rather than a hard-coded "1" (same default).
		param_values["-draftsize"]=str(self.draftsize)
		param_values["-SideBranch"]=str(self.min_duplicate_len)
		param_values["-contigs_format"]="1" if self.binary_output else "0"
		param_values["-maxthreads"]=str(self.getThreads())
		# Memory per thread, never reported as less than 1.
		maxmem=max(1, int(self.getMem()/self.getThreads()))
		param_values["-maxmem"]=str(maxmem)
		param_values["-minlen"]=str(self.vital_parameters.min_molecule_len)
		param_values["-minsites"]=str(self.vital_parameters.min_molecule_sites)
		param_values["-minSNR"]=str(self.min_snr)
		param_values["-o"]=str(self.output_prefix)

		if self.add_alignment_filter:
			param_values["-AlignmentFilter"]=" ".join([str(self.alignment_filter_threshold), str(self.alignment_filter_minlen_change), str(self.alignment_filter_pval_change)])
		if self.overwrite_output:
			param_values["-force"]=""
		if self.hide_branches:
			param_values["-SideChain"]=""
		if self.send_output_to_file:
			param_values["-stdout"]=""
		if self.send_errors_to_file:
			param_values["-stderr"]=""

		param_list=[self.workspace.binaries["bng_assembler"]]
		for key in param_values:
			param_list.append(key)
			param_list.append(param_values[key])
		code+=" ".join(param_list) + "\n"

		return [code]

	def getStepDir(self):
		"""Return the step directory name, derived from the input step and vital parameters."""
		return "_".join(["assembly", self.inpt.getStepDir(), "fp"+str(self.vital_parameters.fp), "fn"+str(self.vital_parameters.fn), "pval"+str(self.vital_parameters.pval), "minlen"+str(self.vital_parameters.min_molecule_len), "minsites"+str(self.vital_parameters.min_molecule_sites)])

	def autoGeneratePrereqs(self):
		"""Create the upstream step objects (Input through PairwiseAlignment summary).

		Each parameterised step receives its own copy of the vital parameters.
		"""
		self.inpt=Input(self.workspace)
		self.sort=Sort(self.workspace, copy(self.vital_parameters))
		self.molecule_stats=self.sort.getMoleculeStats()
		self.split=Split(self.workspace, copy(self.vital_parameters))
		self.split_summary=Summarize(self.workspace, self.split)
		self.pairwise_alignment=PairwiseAlignment(self.workspace, copy(self.vital_parameters))
		self.pairwise_summary=Summarize(self.workspace, self.pairwise_alignment)

	def getPrereq(self):
		"""Return the step that must precede this one (the pairwise summary)."""
		return self.pairwise_summary

	def isComplete(self):
		"""Return True when the step's output file exists."""
		return path.exists(self.getOutputFile())

	def createQualityObject(self):
		"""Compute contig statistics, store them in ``self.quality`` and save them.

		Reads every .cmap file in the step directory for contig lengths and
		label occurrences, and the output file for the molecule count.

		Raises Exception when the step is not complete, when no contigs are
		found, or when the output file has no line starting with "C".
		"""
		if not self.isComplete():
			raise Exception("The step is not complete yet")
		count=0
		total_length=0.0
		lengths=[]
		label_occurrences=0
		label_count=0
		for cmap_name in glob(self.getStepDir() + "/*.cmap"): # This glob relies on there not being a merged .cmap in the same directory (i.e. Summarize has not been run)
			# Contig ids are tracked per file, so each contig is counted once
			# per file it appears in.
			contigs=set()
			cmap_file=CmapFile(cmap_name)
			for label in cmap_file.parse():
				if label.contig_id not in contigs:
					count+=1
					total_length+=label.contig_len
					contigs.add(label.contig_id)
					lengths.append(label.contig_len)
				label_occurrences+=label.occurrences
				label_count+=1

		# Without this guard the statistics below fail with an IndexError or
		# ZeroDivisionError that hides the real cause.
		if count == 0:
			raise Exception("No contigs found in " + self.getStepDir())

		sorted_lengths=sorted(lengths, reverse=True)
		minlen=sorted_lengths[-1]
		maxlen=sorted_lengths[0]
		# N50: length of the contig at which the running total, taken from
		# the longest contig downwards, first reaches half the total length.
		n50=0
		length_included_in_n50=0
		target_length_included=total_length/2.0
		for length in sorted_lengths:
			length_included_in_n50+=length
			if length_included_in_n50 >= target_length_included:
				n50=length
				break

		nummaps=None
		with open(self.getOutputFile()) as contig_file:
			for line in contig_file:
				if not line.startswith("C"):
					continue
				# Text after the last "=" in the last comma-separated field,
				# without the line terminator.
				# NOTE(review): only the value from the last "C" line is kept;
				# confirm that this is the intended "total".
				nummaps=line.split(",")[-1].split("=")[-1].strip()
		if nummaps is None:
			raise Exception("No contig lines found in " + self.getOutputFile())

		self.quality=Quality(length=total_length, count=count, average_length=total_length/count, n50=n50, min=minlen, max=maxlen, average_occurrences=float(label_occurrences)/label_count, total_mols_aligned=nummaps, avg_mols_aligned=float(nummaps)/count)
		self.saveQualityObjectToFile()

	def _loadedQuality(self):
		"""Return ``self.quality``, loading it from file first when it is unset."""
		if self.quality is None:
			self.loadQualityObjectFromFile()
		return self.quality

	def getQuality_count(self):
		"""Return the ``count`` field of the quality object."""
		return self._loadedQuality().count
	def getQuality_length(self):
		"""Return the ``length`` field of the quality object."""
		return self._loadedQuality().length
	def getQuality_averageLength(self):
		"""Return the ``average_length`` field of the quality object."""
		return self._loadedQuality().average_length
	def getQuality_n50(self):
		"""Return the ``n50`` field of the quality object."""
		return self._loadedQuality().n50
	def getQuality_max(self):
		"""Return the ``max`` field of the quality object."""
		return self._loadedQuality().max
	def getQuality_min(self):
		"""Return the ``min`` field of the quality object."""
		return self._loadedQuality().min
	def getQuality_averageOccurrences(self):
		"""Return the ``average_occurrences`` field of the quality object."""
		return self._loadedQuality().average_occurrences
	def getQuality_totalMolsAligned(self):
		"""Return the ``total_mols_aligned`` field of the quality object."""
		return self._loadedQuality().total_mols_aligned
	def getQuality_avgMolsAligned(self):
		"""Return the ``avg_mols_aligned`` field of the quality object."""
		return self._loadedQuality().avg_mols_aligned

	def getMem(self):
		"""Return the memory budget for this step (the workspace's "large" tier)."""
		return self.workspace.resources.getLargeMemory()
	def getTime(self):
		"""Return the time budget for this step (the workspace's "medium" tier)."""
		return self.workspace.resources.getMediumTime()
	def getThreads(self):
		"""Return the thread count for this step (always 1)."""
		return 1
Beispiel #6
0
class RefineA(GenericAssembly):
	"""Assembly step that generates the "refineA" shell script.

	The script reads the group manifest line by line and runs the
	``bng_assembler`` binary once per non-comment line, passing the first
	and last whitespace-separated fields of that line as the contig range.
	"""

	def __init__(self, workspace, vital_parameters):
		"""Record the workspace and parameters, set option defaults, build prerequisites.

		workspace -- supplies work_dir, binaries and resources to this step
		vital_parameters -- supplies fp, fn, pval, min_molecule_len and
			min_molecule_sites (used on the command line and in the step
			directory name)
		"""
		self.workspace=workspace
		self.vital_parameters=vital_parameters
		self.quality=None

		self.sd=0.2
		self.sf=0.2
		self.sr=0.03
		self.res=3.3
		self.usecolor=1
		self.use_multi_mode=True
		self.consensus_end_coverage=0.99
		self.bias_for_low_likelihood_ratio=1e2
		self.refinement_length_accuracy=""
		self.largest_query_map_interval=4
		self.largest_reference_map_interval=6
		self.outlier_pval=1e-5
		self.end_outlier_prior_probability=0.00001
		self.contigs_format=1
		self.overwrite_output=True
		self.output_prefix="refineA"
		self.send_output_to_file=True
		self.send_errors_to_file=True

		self.total_job_count=1

		self.autoGeneratePrereqs()

	def writeCode(self):
		"""Return a one-element list holding the shell script for this step."""
		code="cd " + self.workspace.work_dir + "\n"
		# "-p" keeps a re-run from failing on an existing directory and
		# matches the other steps.
		code+="mkdir -p " + self.getStepDir() + "\n"
		code+="cd " + self.getStepDir() + "\n"
		code+="pwd\n"

		# Insertion order is the order of the flags on the command line.
		param_values=OrderedDict()
		param_values["-i"]="../" + self.sort.getOutputFile()
		# $group_start / $group_end are shell variables set by the loop
		# emitted further down.
		param_values["-contigs"]=" ".join(["../" + self.assembly.getOutputFile(), "$group_start", "$group_end"])
		param_values["-maxthreads"]=str(self.getThreads())
		param_values["-T"]=str(self.vital_parameters.pval)
		param_values["-usecolor"]=str(self.usecolor)
		param_values["-extend"]="1"
		param_values["-refine"]="2"
		if self.use_multi_mode:
			param_values["-MultiMode"]=""
		param_values["-EndTrim"]=str(self.consensus_end_coverage)
		param_values["-LRbias"]=str(self.bias_for_low_likelihood_ratio)
		param_values["-Mprobeval"]=str(self.refinement_length_accuracy)
		param_values["-deltaX"]=str(self.largest_query_map_interval)
		param_values["-deltaY"]=str(self.largest_reference_map_interval)
		param_values["-outlier"]=str(self.outlier_pval)
		param_values["-endoutlier"]=str(self.end_outlier_prior_probability)
		param_values["-contigs_format"]=str(self.contigs_format)
		if self.overwrite_output:
			param_values["-force"]=""
		param_values["-FP"]=str(self.vital_parameters.fp)
		param_values["-FN"]=str(self.vital_parameters.fn)
		param_values["-sd"]=str(self.sd)
		param_values["-sf"]=str(self.sf)
		param_values["-sr"]=str(self.sr)
		param_values["-res"]=str(self.res)
		param_values["-o"]=self.output_prefix
		if self.send_output_to_file:
			param_values["-stdout"]=""
		if self.send_errors_to_file:
			param_values["-stderr"]=""
		param_values["-XmapStatRead"]="../" + self.molecule_stats.getOutputFile()

		param_list=[self.workspace.binaries["bng_assembler"]]
		for key in param_values:
			param_list.append(key)
			param_list.append(param_values[key])

		# Shell loop over the manifest: skip "#" lines, take the first and
		# last field of every other line as the group bounds.
		code+="let contig_num=0\n"
		code+="while read line\n"
		code+="do\n"
		code+="  if [[ $line == \"#\"* ]]; then continue; fi\n"
		code+="  let contig_num+=1\n"
		code+="  group_start=`echo $line | awk '{print $1}'`\n"
		code+="  group_end=`echo $line | awk '{print $NF}'`\n"
		code+="    " + " ".join(param_list) + "\n"
		# Terminate the last line so the script stays valid if text is
		# appended to it, as every other step's script does.
		code+="done < ../" + self.group_manifest.getOutputFile() + "\n"

		return [code]

	def getStepDir(self):
		"""Return the step directory name, derived from the input step and vital parameters."""
		return "_".join(["refineA", self.inpt.getStepDir(), "fp"+str(self.vital_parameters.fp), "fn"+str(self.vital_parameters.fn), "pval"+str(self.vital_parameters.pval), "minlen"+str(self.vital_parameters.min_molecule_len), "minsites"+str(self.vital_parameters.min_molecule_sites)])

	def autoGeneratePrereqs(self):
		"""Create the upstream step objects (Input through Assembly's manifest).

		Each parameterised step receives its own copy of the vital parameters.
		"""
		self.inpt=Input(self.workspace)
		self.sort=Sort(self.workspace, copy(self.vital_parameters))
		self.molecule_stats=self.sort.getMoleculeStats()
		self.split=Split(self.workspace, copy(self.vital_parameters))
		self.split_summary=Summarize(self.workspace, self.split)
		self.pairwise_alignment=PairwiseAlignment(self.workspace, copy(self.vital_parameters))
		self.pairwise_summary=Summarize(self.workspace, self.pairwise_alignment)
		self.assembly=Assembly(self.workspace, copy(self.vital_parameters))
		self.assembly_summary=Summarize(self.workspace, self.assembly)
		self.merge_assembly=Merge(self.workspace, self.assembly)
		self.group_manifest=GroupManifest(self.workspace, self.assembly)

	def getPrereq(self):
		"""Return the step that must precede this one (the group manifest)."""
		return self.group_manifest

	def getMem(self):
		"""Return the memory budget for this step (the workspace's "medium" tier)."""
		return self.workspace.resources.getMediumMemory()
	def getTime(self):
		"""Return the time budget for this step (the workspace's "large" tier)."""
		return self.workspace.resources.getLargeTime()
	def getThreads(self):
		"""Return the thread count for this step (the workspace's "medium" tier)."""
		return self.workspace.resources.getMediumThreads()
Beispiel #7
0
	def autoGeneratePrereqs(self):
		"""Create the upstream step objects and store them as attributes.

		Builds an Input, then a Sort that is handed its own copy of
		``self.vital_parameters``, and keeps the Sort's molecule-stats object.
		"""
		ws = self.workspace

		self.inpt = Input(ws)

		sorter = Sort(ws, copy(self.vital_parameters))
		self.sort = sorter
		self.molecule_stats = sorter.getMoleculeStats()
Beispiel #8
0
class Split(Step):
	"""Step that generates commands splitting the sorted input into blocks.

	One ``bng_ref_aligner`` command with ``-subsetbin <k> <total>`` is
	emitted per block.  The block count comes from
	``vital_parameters.blocks`` or, when that is None, from the size of the
	input file.
	"""

	# Divisors used to derive the block count from the input file: lines
	# starting with "0" per block, and summed field counts of lines starting
	# with "1" per block.
	# NOTE(review): presumably BNX molecule headers and label-position lines
	# respectively -- confirm against the BNX format.
	_RECORDS_PER_BLOCK=80000.0
	_SITES_PER_BLOCK=1e6

	def __init__(self, workspace, vital_parameters):
		"""Record the workspace and parameters, build prerequisites, fix the block count.

		workspace -- supplies work_dir, binaries and resources to this step
		vital_parameters -- its ``blocks`` attribute is used when set; when
			it is None the count is computed and written back to it
		"""
		self.workspace=workspace
		self.vital_parameters=vital_parameters

		self.overwrite_output=True
		self.send_output_to_file=True
		self.send_error_to_file=True

		self.autoGeneratePrereqs()

		if vital_parameters.blocks is None:
			count=0
			site_count=0
			with open(self.workspace.work_dir + "/" + self.inpt.getOutputFile()) as iFile:
				for line in iFile:
					if line[0] == "0":
						count+=1
					elif line[0] == "1":
						# Fields on the line, not counting the leading tag.
						site_count+=len(line.split())-1

			# Use whichever criterion asks for more blocks.
			blocks=int(math.ceil(count/self._RECORDS_PER_BLOCK))
			site_blocks=int(math.ceil(site_count/self._SITES_PER_BLOCK))
			if site_blocks>blocks:
				blocks=site_blocks
			self.total_job_count=blocks
			self.vital_parameters.blocks=blocks
		else:
			self.total_job_count=vital_parameters.blocks

		approx_mins_per_job=5
		self.max_job_count=self.getTime()*(60.0/approx_mins_per_job)-3
		if self.max_job_count<1:
			self.max_job_count=1

	def writeCode(self):
		"""Build the shell scripts for this step.

		Returns a list of script strings, each entering the step directory
		and running up to ``max_job_count`` split commands; the list is empty
		when the block count is zero.
		"""
		code_parts=[]

		# Insertion order is the order of the flags on the command line.
		# "-o" holds a placeholder that is overwritten for every block.
		param_values=OrderedDict()
		param_values["-i"]="../" + self.sort.getOutputFile()
		param_values["-o"]="placeholder"
		param_values["-maxthreads"]=str(self.getThreads())
		param_values["-merge"]=""
		param_values["-bnx"]=""

		if self.overwrite_output:
			param_values["-f"]=""
		if self.send_output_to_file:
			param_values["-stdout"]=""
		if self.send_error_to_file:
			param_values["-stderr"]=""

		tmp_code=""
		cur_jobs=0
		for cur_block in range(1, self.total_job_count+1):
			param_list=[self.workspace.binaries["bng_ref_aligner"]]
			param_values["-o"]="split_" + str(cur_block) + "_of_" + str(self.total_job_count)
			param_values["-subsetbin"]=str(cur_block) + " " + str(self.total_job_count)
			for key in param_values:
				param_list.append(key)
				param_list.append(param_values[key])

			tmp_code+=" ".join(param_list) + "\n"
			cur_jobs+=1

			# Close the script when it is full or when this was the last block.
			if cur_jobs>=self.max_job_count or cur_block==self.total_job_count:
				code="cd " + self.workspace.work_dir + "\n"
				code+="mkdir -p " + self.getStepDir() + "\n"
				code+="cd " + self.getStepDir() + "\n"
				code+=tmp_code
				code+="pwd\n"

				code_parts.append(code)

				cur_jobs=0
				tmp_code=""

		return code_parts

	def getStepDir(self):
		"""Return the step directory name, derived from the input step and block count."""
		return "_".join(["split", self.inpt.getStepDir(), "blockCount"+str(self.total_job_count)])

	def autoGeneratePrereqs(self):
		"""Create the upstream step objects (Input and Sort) as attributes.

		Sort receives its own copy of the vital parameters.
		"""
		self.inpt=Input(self.workspace)
		self.sort=Sort(self.workspace, copy(self.vital_parameters))
		self.molecule_stats=self.sort.getMoleculeStats()

	def getPrereq(self):
		"""Return the step that must precede this one (the Sort step)."""
		return self.sort


	def getMem(self):
		"""Return the memory budget for this step (the workspace's "medium" tier)."""
		return self.workspace.resources.getMediumMemory()
	def getTime(self):
		"""Return the time budget for this step (the workspace's "small" tier)."""
		return self.workspace.resources.getSmallTime()
	def getThreads(self):
		"""Return the thread count for this step (the workspace's "small" tier)."""
		return self.workspace.resources.getSmallThreads()

	def getOutputFile(self, block_num):
		"""Return the path of the .bnx file written for block ``block_num``."""
		return self.getStepDir() + "/split_" + str(block_num) + "_of_" + str(self.total_job_count) + ".bnx"
	def getOutputFileExtension(self):
		"""Return the output file extension ("bnx")."""
		return "bnx"