def blast_seqs (in_file, scratch_dir, result_dir, e_threshold=None, max_hits=None):
    ## Main:
    log.info ('Blasting reads ...')
    # read in seqfile
    blast_cnt = 0
    rdr = ExSeqReader (in_file, fmt=SCRATCH_FORMAT, merge_quals=False)
    # for each sequence
    for seq in rdr.read():
        log.info ("Blasting sequence '%s' ..." % seq.id)
        # blast it and write the raw result down to scratch
        res = blast_ncbi (seq.format ('fasta'), e_threshold=e_threshold,
            max_hits=max_hits)
        blast_cnt += 1
        fileutils.write_to_file ([scratch_dir, '%s.xml' % seq.id], res.read())
        # reduce and write to results
        res.seek (0)
        fileutils.write_to_file ([result_dir, '%s.xml' % seq.id], res.read())
    ## Postconditions & return:
    log.info ('%s sequences were blasted ...' % blast_cnt)
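# The `blast_ncbi` helper called above is assumed to submit a FASTA string to
# NCBI BLAST and hand back a rewindable handle to the XML result. Below is a
# minimal sketch of such a helper using Biopython's NCBIWWW.qblast; the
# 'blastn' / 'nt' program and database choices, the fallback defaults, and the
# function name itself are illustrative assumptions, not this module's actual
# implementation.

from io import StringIO
from Bio.Blast import NCBIWWW

def example_blast_ncbi (fasta_str, e_threshold=None, max_hits=None):
    """A hypothetical stand-in for blast_ncbi, for illustration only."""
    res = NCBIWWW.qblast ('blastn', 'nt', fasta_str,
        expect=(e_threshold if e_threshold is not None else 10.0),
        hitlist_size=(max_hits if max_hits is not None else 50))
    # buffer the reply so callers can seek(0) and re-read it
    return StringIO (res.read())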
def merge_and_trim_seqs (input_files, scratch_dir, input_format=None,
        merge_quals=True, trim_right=None):
    """
    Combine input sequence files, merging with quality data and trimming.

    This reads and merges all the inputs into a single file in the designated
    intermediate format, combining them with any quality data available and
    trimming the 3' end if required. This stage may be unnecessary (if the
    data is already in a single file in the right format and no trimming is
    required) but that will almost never be the case.
    """
    # TODO: check for the rare case where we don't need to do this
    ## Main:
    if trim_right:
        log.info ('Merging & trimming sequences ...')
    else:
        log.info ('Merging sequences ...')
    # create & open file for merged seqs
    merged_out_hndl = create_intermediate_file ('merged_and_trimmed', scratch_dir)
    # read in seqfiles
    seq_cnt = 0
    for f in input_files:
        log.info ("Reading '%s' ..." % f)
        # check file exists
        assert (path.exists (f)), "the sequence file '%s' does not exist" % f
        # set default format
        if (input_format in [None, 'auto']):
            fmt = None
        else:
            fmt = input_format
        # make reader & read
        rdr = ExSeqReader (f, fmt=fmt, merge_quals=merge_quals)
        for seq in rdr.read():
            log.debug ("Reading sequence '%s' ..." % seq.id)
            # trim the sequence if requested
            if trim_right:
                seq.seq = seq.seq[:-trim_right]
            # write it out
            SeqIO.write ([seq], merged_out_hndl, SCRATCH_FORMAT)
            seq_cnt += 1
    merged_out_hndl.close()
    ## Postconditions & return:
    log.info ('%s sequences merged ...' % seq_cnt)
    return merged_out_hndl.name
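# A usage sketch for the merge step (the file names and trim length are
# hypothetical): merge two read files into a single intermediate file,
# picking up any available quality data and clipping 20 bases from the
# 3' end of every read:
#
#   merged_path = merge_and_trim_seqs (['reads1.fasta', 'reads2.fasta'],
#       scratch_dir, input_format='fasta', merge_quals=True, trim_right=20)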
def cluster_seqs (infile, scratch_dir, cluster_fn):
    # NOTE: the logic of this is quite hairy. We wish to make multiple passes
    # through the collection looking for similar seqs. At the same time, we
    # don't wish to load the entirety of the sequences into memory. So we open
    # the infile and read it one sequence at a time. For each sequence read,
    # we read the file again and compare it to every sequence after it. The
    # comparison (clustering) function returns None if the two sequences do
    # not cluster. Otherwise it returns the preferred / better sequence of the
    # two, which is then used for subsequent comparisons on this loop. We save
    # on unnecessary comparisons by storing any successful matches in a "don't
    # check" dict. We use a single file handle for searching, rewinding as
    # need be, to avoid opening and closing thousands of handles.
    ## Main:
    log.info ('Clustering sequences ...')
    # create & open file for results, & open handle for searching / comparing
    clustered_out_hndl = create_intermediate_file ('clustered', scratch_dir)
    search_hndl = open (infile, 'r')
    # read in seqfile and check seqs one-by-one
    seq_cnt = 0
    already_tested = {}
    log.info ("Reading '%s' ..." % infile)
    rdr = ExSeqReader (infile, fmt=SCRATCH_FORMAT, merge_quals=False)
    for i, seq_1 in enumerate (rdr.read()):
        log.debug ("Reading sequence '%s' ..." % seq_1.id)
        # if this seq hasn't previously been clustered
        if seq_1.id not in already_tested:
            # move to start of search file & start reading
            search_hndl.seek (0)
            rdr_2 = ExSeqReader (search_hndl, fmt=SCRATCH_FORMAT,
                merge_quals=False)
            # for every seq beyond the current one
            for seq_2 in islice (rdr_2.read(), i+1, None):
                # if it hasn't previously been clustered
                if seq_2.id not in already_tested:
                    cluster_seq = cluster_fn (seq_1, seq_2)
                    # if it clusters, place in "done" dict & update search term
                    if cluster_seq:
                        already_tested[seq_2.id] = True
                        seq_1 = cluster_seq
            # save the surviving search term
            SeqIO.write ([seq_1], clustered_out_hndl, SCRATCH_FORMAT)
            seq_cnt += 1
    clustered_out_hndl.close()
    search_hndl.close()
    ## Postconditions & return:
    log.debug ('%s sequences remain after clustering ...' % seq_cnt)
    return clustered_out_hndl.name
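# `cluster_fn` above can be any callable that takes two SeqRecords and returns
# the preferred record if the pair belongs in one cluster, or None if they
# don't. A minimal sketch (clustering exact duplicates and keeping the
# incumbent record is an arbitrary rule for illustration, not the project's
# actual clustering criterion):

def example_cluster_fn (seq_1, seq_2):
    """Cluster exact duplicate sequences, preferring the incumbent record."""
    if str (seq_1.seq) == str (seq_2.seq):
        return seq_1
    return None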
def filter_seqs (infile, scratch_dir, filters):
    ## Main:
    log.info ('Filtering sequences ...')
    # create & open file for filtered seqs
    filtered_out_hndl = create_intermediate_file ('filtered', scratch_dir)
    # read in seqfile
    filter_cnt = 0
    log.info ("Reading '%s' ..." % infile)
    # make reader & read
    rdr = ExSeqReader (infile, fmt=SCRATCH_FORMAT, merge_quals=False)
    for seq in rdr.read():
        log.debug ("Reading sequence '%s' ..." % seq.id)
        # if it passes all filters, write it out
        if all (f (seq) for f in filters):
            log.debug ("Accepting '%s' ..." % seq.id)
            SeqIO.write ([seq], filtered_out_hndl, SCRATCH_FORMAT)
            filter_cnt += 1
        else:
            log.debug ("Rejecting '%s' ..." % seq.id)
    filtered_out_hndl.close()
    ## Postconditions & return:
    log.debug ('%s sequences remain after filtering ...' % filter_cnt)
    return filtered_out_hndl.name
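# Each entry in `filters` above is assumed to be a predicate that takes a
# SeqRecord and returns True to keep it. A minimal sketch (the 100 bp cutoff
# is an arbitrary illustration):

def example_min_length_filter (seq, min_len=100):
    """Accept only sequences at least min_len bases long."""
    return min_len <= len (seq)

# e.g. filter_seqs (clustered_path, scratch_dir, [example_min_length_filter])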