Ejemplo n.º 1
0
def _cluster_into_handle (infile, search_hndl, out_hndl, cluster_fn):
	"""
	Write one surviving sequence per cluster to `out_hndl` and return the count.

	`search_hndl` is an open handle on the same file as `infile`. It is rewound
	and re-read for every outer sequence, so only one extra handle is ever used.

	"""
	seq_cnt = 0
	# ids of sequences that have already been absorbed into an earlier cluster
	absorbed_ids = set()
	rdr = ExSeqReader (infile, fmt=SCRATCH_FORMAT, merge_quals=False)
	for i, seq_1 in enumerate (rdr.read()):
		log.debug ("Reading sequence '%s' ..." % seq_1.id)
		# skip anything that was already merged into an earlier sequence
		if seq_1.id in absorbed_ids:
			continue
		# move to start of search file & start reading
		search_hndl.seek (0)
		rdr_2 = ExSeqReader (search_hndl, fmt=SCRATCH_FORMAT, merge_quals=False)
		# only compare against sequences beyond the current one
		for seq_2 in islice (rdr_2.read(), i+1, None):
			if seq_2.id in absorbed_ids:
				continue
			cluster_seq = cluster_fn (seq_1, seq_2)
			# any truthy result counts as a match: remember the absorbed
			# sequence and carry the preferred one forward as the search term
			if cluster_seq:
				absorbed_ids.add (seq_2.id)
				seq_1 = cluster_seq
		# save the surviving search term
		SeqIO.write ([seq_1], out_hndl, SCRATCH_FORMAT)
		seq_cnt += 1
	return seq_cnt


def cluster_seqs (infile, scratch_dir, cluster_fn):
	"""
	Cluster the sequences in a file, writing one survivor per cluster.

	:Parameters:
		infile
			Path of the sequence file to read, in the scratch format.
		scratch_dir
			Directory passed to `create_intermediate_file` for the output.
		cluster_fn
			Called as `cluster_fn (seq_1, seq_2)`. A falsy result (e.g. None)
			means the two do not cluster; otherwise the result is the
			preferred sequence of the pair.

	:Returns:
		The name of the intermediate file holding the clustered sequences.

	"""
	# NOTE: the logic of this is quite hairy. We wish to traverse the collection
	# a number of times looking for similar seqs. At the same time, we don't
	# wish to load the entirety of the sequences into memory. So we open the
	# infile and read it one by one. For each sequence read, we read the file
	# again and compare it to every sequence after it. The comparison
	# (clustering) function returns None if the two sequences do not cluster.
	# Otherwise it returns the preferred / better sequence of the two, which is
	# then used for subsequent comparisons on this loop. We save on unnecessary
	# comparisons by storing any successful matches in a "don't check" set. We
	# use a single file handle to search on, rewinding as need be, to avoid
	# opening and closing thousands.

	## Main:
	log.info ('Clustering sequences ...')
	# create & open file for results
	clustered_out_hndl = create_intermediate_file ('clustered', scratch_dir)
	try:
		# open handle for searching / comparing
		search_hndl = open (infile, 'r')
		try:
			log.info ("Reading '%s' ..." % infile)
			seq_cnt = _cluster_into_handle (infile, search_hndl,
				clustered_out_hndl, cluster_fn)
		finally:
			# release the search handle even if clustering fails part-way
			search_hndl.close()
	finally:
		clustered_out_hndl.close()
	## Postconditions & return:
	log.debug ('%s sequences remain after clustering ...' % seq_cnt)
	return clustered_out_hndl.name
Ejemplo n.º 2
0
def blast_seqs (in_file, scratch_dir, result_dir, e_threshold=None,
		max_hits=None):
	"""
	Blast every sequence in a file, saving each response as XML.

	:Parameters:
		in_file
			Path of the sequence file to read, in the scratch format.
		scratch_dir
			Directory that receives the raw response for each sequence.
		result_dir
			Directory that receives the per-sequence result file.
		e_threshold
			Passed through to `blast_ncbi`.
		max_hits
			Passed through to `blast_ncbi`.

	Each sequence produces a file named '<seq id>.xml' in both directories.
	Nothing is returned.

	"""
	log.info ('Blasting reads ...')
	# read in seqfile
	blast_cnt = 0
	rdr = ExSeqReader (in_file, fmt=SCRATCH_FORMAT, merge_quals=False)
	# for each sequence
	for seq in rdr.read():
		log.info ("Blasting sequence '%s' ..." % seq.id)
		# blast it
		res = blast_ncbi (seq.format ('fasta'),
			e_threshold=e_threshold,
			max_hits=max_hits,
		)
		blast_cnt += 1
		# read the response once and reuse it for both output files
		raw_xml = res.read()
		xml_name = '%s.xml' % seq.id
		# write the raw response down to scratch (this previously went to
		# result_dir, leaving scratch_dir unused)
		fileutils.write_to_file ([scratch_dir, xml_name], raw_xml)
		# TODO: reduce before writing to results; for now the results copy is
		# the same raw response
		fileutils.write_to_file ([result_dir, xml_name], raw_xml)
		# TODO: summarize

	## Postconditions & return:
	log.info ('%s sequences were blasted ...' % blast_cnt)
Ejemplo n.º 3
0
def merge_and_trim_seqs (input_files, scratch_dir, input_format=None,
		merge_quals=True, trim_right=None):
	"""
	Combine input sequence files, merging in quality data and trimming.

	This reads and merges all the inputs into a single file in the designated
	intermediate format, combining them with any quality data available and
	trimming the 3' end if required. It is possible that this stage may be
	unnecessary (if the data is in a single file in the right format already and
	no trimming is required) but this will almost never be the case.

	:Parameters:
		input_files
			Paths of the sequence files to read.
		scratch_dir
			Directory passed to `create_intermediate_file` for the output.
		input_format
			Format handed to the reader; None and 'auto' are both passed on
			as None.
		merge_quals
			Passed through to the reader.
		trim_right
			Number of residues to drop from the end of each sequence. A falsy
			value (None or 0) disables trimming.

	:Returns:
		The name of the intermediate file holding the merged sequences.

	"""
	# TODO: check for rare conditions where we don't need to do this

	## Main:
	if trim_right:
		log.info ('Merging & trimming sequences ...')
	else:
		log.info ('Merging sequences ...')

	# the format is the same for every input, so settle it once up front
	if input_format in (None, 'auto'):
		fmt = None
	else:
		fmt = input_format

	# create & open file for merged seqs
	merged_out_hndl = create_intermediate_file ('merged_and_trimmed', scratch_dir)
	seq_cnt = 0
	try:
		# read in seqfiles
		for f in input_files:
			log.info ("Reading '%s' ..." % f)
			# check file exists
			assert (path.exists (f)), "the sequence file '%s' does not exist" % f
			# make reader & read
			rdr = ExSeqReader (f, fmt=fmt, merge_quals=merge_quals)
			for seq in rdr.read():
				log.debug ("Reading sequence '%s' ..." % seq.id)
				# trim the sequence if requested
				if trim_right:
					# NOTE(review): if these are Biopython SeqRecords carrying
					# per-letter qualities, assigning a shorter `.seq` may be
					# rejected by the library - confirm against the reader
					seq.seq = seq.seq[:-trim_right]
				# write it out
				SeqIO.write ([seq], merged_out_hndl, SCRATCH_FORMAT)
				seq_cnt += 1
	finally:
		# close the output even when an input is missing or unreadable
		merged_out_hndl.close()
	## Postconditions & return:
	log.info ('%s sequences merged ...' % seq_cnt)
	return merged_out_hndl.name
Ejemplo n.º 4
0
def filter_seqs (infile, scratch_dir, filters):
	"""
	Copy the sequences that pass every filter into a new intermediate file.

	:Parameters:
		infile
			Path of the sequence file to read, in the scratch format.
		scratch_dir
			Directory passed to `create_intermediate_file` for the output.
		filters
			Iterable of filters. Each is assumed to be callable with a single
			sequence and to return a truthy value when the sequence should be
			kept - TODO confirm against `make_filters`.

	:Returns:
		The name of the intermediate file holding the accepted sequences.

	"""
	## Main:
	log.info ('Filtering sequences ...')
	# create & open file for filtered seqs
	filtered_out_hndl = create_intermediate_file ('filtered', scratch_dir)
	filter_cnt = 0
	try:
		log.info ("Reading '%s' ..." % infile)
		# make reader & read
		rdr = ExSeqReader (infile, fmt=SCRATCH_FORMAT, merge_quals=False)
		for seq in rdr.read():
			log.debug ("Reading sequence '%s' ..." % seq.id)
			# apply each filter to this sequence; testing the filter objects
			# themselves (as before) accepted every sequence unconditionally
			if all (fltr (seq) for fltr in filters):
				log.debug ("Accepting '%s' ..." % seq.id)
				SeqIO.write ([seq], filtered_out_hndl, SCRATCH_FORMAT)
				filter_cnt += 1
			else:
				log.debug ("Rejecting '%s' ..." % seq.id)
	finally:
		# close the output even if reading or a filter raises
		filtered_out_hndl.close()
	## Postconditions & return:
	log.debug ('%s sequences remain after filtering ...' % filter_cnt)
	return filtered_out_hndl.name
Ejemplo n.º 5
0
def main():
	"""
	Run the pipeline: merge & trim, then optionally filter, cluster and blast.

	Command-line arguments are parsed first. Any failure in the stages that
	follow is logged (with a traceback if `options.traceback` is set) and the
	process exits with status 1.

	"""
	input_files, options = parse_args()

	try:
		# auxiliary and help functions
		if options.dump_options:
			dump_options (options)

		# setup
		scriptlog.init_logger (
			format="%(message)s",
			verbosity=options.verbosity,
		)

		scratch_dir = fileutils.create_scratch_dir (SCRATCH_DIR_PREFIX,
			options.scratch_path)
		log.info ("Making temporary files at '%s' ..." % scratch_dir)

		# read in, merge and trim seq files
		work_file = merge_and_trim_seqs (input_files, scratch_dir,
			merge_quals=not options.ignore_qual_files,
			trim_right=options.trim_right,
			input_format=options.input_format
		)

		# filter stuff, if requested
		filters = make_filters (
			length = options.filter_length,
			base_quality = options.filter_base_qual_threshold,
			avg_quality = options.filter_avg_qual_threshold,
		)
		if filters:
			work_file = filter_seqs (work_file, scratch_dir, filters)

		# cluster stuff, if requested
		clusterer = make_clusterer (
			options.cluster_identity,
			options.cluster_subsequence,
			options.cluster_similar,
		)
		if clusterer:
			work_file = cluster_seqs (work_file, scratch_dir, clusterer)

		# blast stuff, if not a dryrun
		if not options.dryrun:
			# directory for raw results straight from server
			blast_out_dir = make_blast_raw_dir (scratch_dir)
			# directory for distilled results
			results_dir = fileutils.make_output_dir (RESULTS_DIR_PREFIX,
				options.results_path)
			log.info ("Making final results directory at '%s' ..." % results_dir)

			# blast_seqs returns nothing, so its result is not kept
			blast_seqs (work_file,
				scratch_dir=blast_out_dir,
				result_dir=results_dir,
				e_threshold=options.blast_expect,
				max_hits=options.blast_max_hits,
			)
			# and combine, reduce & summarize if requested

	except SystemExit:
		# a deliberate exit from a helper is not a failure; let it through
		raise
	except BaseException as err:
		# top-level boundary: report everything else (including Ctrl-C) and
		# exit with a failure status
		if options.traceback:
			traceback.print_exc()
		log.critical ("A problem: %s" % err)
		log.info ('Fatal error, terminating.')
		sys.exit(1)
Ejemplo n.º 6
0
			# directory for distilled results
			results_dir = fileutils.make_output_dir (RESULTS_DIR_PREFIX,
				options.results_path)
			log.info ("Making final results directory at '%s' ..." % results_dir)
		
			result_dir = blast_seqs (work_file,
				scratch_dir=blast_out_dir,
				result_dir=results_dir,
				e_threshold=options.blast_expect,
				max_hits=options.blast_max_hits,
			)
			# and combine, reduce & summarize if requested

	except BaseException, err:
		if options.traceback:
			traceback.print_exc()
		log.critical ("A problem: %s" % err)
		log.info ('Fatal error, terminating.')
		sys.exit(1)
		return
		
	log.info ('Finished.')
	sys.exit(0)
	

# run the pipeline only when executed as a script, not when imported
if __name__ == '__main__':
	main()


### END #######################################################################