def buildSeqRecord(db_record, id_field, seq_field, meta_fields=None):
    """
    Parses a database record into a SeqRecord

    Arguments:
      db_record : a dictionary containing a database record.
      id_field : the field containing identifiers.
      seq_field : the field containing sequences.
      meta_fields : a list of fields to add to sequence annotations.

    Returns:
      Bio.SeqRecord.SeqRecord: record.
    """
    # Return None if ID or sequence fields are empty
    if not db_record[id_field] or not db_record[seq_field]:
        return None

    # Create description string
    desc_dict = OrderedDict([('ID', db_record[id_field])])
    if meta_fields is not None:
        desc_dict.update([(f, db_record[f]) for f in meta_fields if f in db_record])
    desc_str = flattenAnnotation(desc_dict)

    # Create SeqRecord
    seq_record = SeqRecord(Seq(db_record[seq_field], IUPAC.ambiguous_dna),
                           id=desc_str, name=desc_str, description='')

    return seq_record
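# --- Hedged usage sketch (not part of the original source) ---
# Illustrates how buildSeqRecord might be called on one database row. The
# column names below ('SEQUENCE_ID', 'SEQUENCE_IMGT', 'V_CALL') are purely
# hypothetical examples; substitute the fields actually present in your table.
def _example_buildSeqRecord():
    db_record = {'SEQUENCE_ID': 'READ1',
                 'SEQUENCE_IMGT': 'ACGTNACGT',
                 'V_CALL': 'IGHV1-2*02'}
    rec = buildSeqRecord(db_record, id_field='SEQUENCE_ID',
                         seq_field='SEQUENCE_IMGT', meta_fields=['V_CALL'])
    # Under the default pRESTO delimiters the header should look something
    # like 'READ1|V_CALL=IGHV1-2*02'
    return rec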
def getDbSeqRecord(db_record, id_field, seq_field, meta_fields=None,
                   delimiter=default_delimiter):
    """
    Parses a database record into a SeqRecord

    Arguments:
      db_record = a dictionary containing a database record
      id_field = the field containing identifiers
      seq_field = the field containing sequences
      meta_fields = a list of fields to add to sequence annotations
      delimiter = a tuple of delimiters for (fields, values, value lists)

    Returns:
      a SeqRecord
    """
    # Return None if ID or sequence fields are empty
    if not db_record[id_field] or not db_record[seq_field]:
        return None

    # Create description string
    desc_dict = OrderedDict([('ID', db_record[id_field])])
    if meta_fields is not None:
        desc_dict.update([(f, db_record[f]) for f in meta_fields if f in db_record])
    desc_str = flattenAnnotation(desc_dict, delimiter=delimiter)

    # Create SeqRecord
    seq_record = SeqRecord(Seq(db_record[seq_field], IUPAC.ambiguous_dna),
                           id=desc_str, name=desc_str, description='')

    return seq_record
def _header(seq, cluster, field=cluster_field, prefix=cluster_prefix,
            delimiter=out_args['delimiter']):
    label = '%s%i' % (prefix, cluster)
    header = parseAnnotation(seq.description, delimiter=delimiter)
    header = mergeAnnotation(header, {field: label}, delimiter=delimiter)
    seq.id = seq.name = flattenAnnotation(header, delimiter=delimiter)
    seq.description = ''
    return seq
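# --- Hedged illustration (not part of the original source) ---
# Shows the annotation round trip that _header performs on a single record:
# parse the existing description, merge in the cluster label, then flatten it
# back into a header string. The delimiter tuple ('|', '=', ',') is assumed to
# be the pRESTO default for (fields, field/value, value lists).
def _example_cluster_header(delimiter=('|', '=', ',')):
    description = 'READ1|BARCODE=ACGT'
    ann = parseAnnotation(description, delimiter=delimiter)
    ann = mergeAnnotation(ann, {'CLUSTER': 'CLUST1'}, delimiter=delimiter)
    # Expected to produce something like 'READ1|BARCODE=ACGT|CLUSTER=CLUST1'
    return flattenAnnotation(ann, delimiter=delimiter)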
def processBCQueue(alive, data_queue, result_queue, cons_func, cons_args={}, min_count=default_min_count, primer_field=None, primer_freq=None, max_gap=None, max_error=None, max_diversity=None, copy_fields=None, copy_actions=None, delimiter=default_delimiter): """ Pulls from data queue, performs calculations, and feeds results queue Arguments: alive = a multiprocessing.Value boolean controlling whether processing continues; when False function returns data_queue = a multiprocessing.Queue holding data to process result_queue = a multiprocessing.Queue to hold processed results cons_func = the function to use for consensus generation cons_args = a dictionary of optional arguments for the consensus function min_count = threshold number of sequences to define a consensus primer_field = the annotation field containing primer names; if None do not annotate with primer names primer_freq = the maximum primer frequency that must be meet to build a consensus; if None do not filter by primer frequency max_gap = the maximum frequency of (., -) characters allowed before deleting a position; if None do not delete positions max_error = the minimum error rate to retain a set; if None do not calculate error rate max_diversity = a threshold defining the average pairwise error rate required to retain a read group; if None do not calculate diversity copy_fields = a list of annotations to copy into consensus sequence annotations; if None no additional annotations will be copied copy_actions = the list of actions to take for each copy_fields; one of ['set', 'majority', 'min', 'max', 'sum'] delimiter = a tuple of delimiters for (annotations, field/values, value lists) Returns: None """ try: # Iterator over data queue until sentinel object reached while alive.value: # Get data from queue if data_queue.empty(): continue else: data = data_queue.get() # Exit upon reaching sentinel if data is None: break # Define result dictionary for iteration result = SeqResult(data.id, data.data) result.log['BARCODE'] = data.id result.log['SEQCOUNT'] = len(data) # Define primer annotations and consensus primer if applicable if primer_field is None: primer_ann = None seq_list = data.data else: # Calculate consensus primer primer_ann = OrderedDict() prcons = annotationConsensus(data.data, primer_field, delimiter=delimiter) result.log['PRIMER'] = ','.join(prcons['set']) result.log['PRCOUNT'] = ','.join([str(c) for c in prcons['count']]) result.log['PRCONS'] = prcons['cons'] result.log['PRFREQ'] = prcons['freq'] if primer_freq is None: # Retain full sequence set if not in primer consensus mode seq_list = data.data primer_ann = mergeAnnotation(primer_ann, {'PRIMER':prcons['set']}, delimiter=delimiter) primer_ann = mergeAnnotation(primer_ann, {'PRCOUNT':prcons['count']}, delimiter=delimiter) elif prcons['freq'] >= primer_freq: # Define consensus subset seq_list = subsetSeqSet(data.data, primer_field, prcons['cons'], delimiter=delimiter) primer_ann = mergeAnnotation(primer_ann, {'PRCONS':prcons['cons']}, delimiter=delimiter) primer_ann = mergeAnnotation(primer_ann, {'PRFREQ':prcons['freq']}, delimiter=delimiter) else: # If set fails primer consensus, feed result queue and continue result_queue.put(result) continue # Check count threshold cons_count = len(seq_list) result.log['CONSCOUNT'] = cons_count if cons_count < min_count: # If set fails count threshold, feed result queue and continue result_queue.put(result) continue # Update log with input sequences for i, s in enumerate(seq_list): result.log['INSEQ%i' % (i + 1)] = str(s.seq) # If 
primer and count filters pass, generate consensus sequence consensus = cons_func(seq_list, **cons_args) # Delete positions with gap frequency over max_gap and update log with consensus if max_gap is not None: gap_positions = set(findGapPositions(seq_list, max_gap)) result.log['CONSENSUS'] = ''.join([' ' if i in gap_positions else x \ for i, x in enumerate(consensus.seq)]) if 'phred_quality' in consensus.letter_annotations: result.log['QUALITY'] = ''.join([' ' if i in gap_positions else chr(q + 33) \ for i, q in enumerate(consensus.letter_annotations['phred_quality'])]) consensus = deleteSeqPositions(consensus, gap_positions) else: gap_positions = None result.log['CONSENSUS'] = str(consensus.seq) if 'phred_quality' in consensus.letter_annotations: result.log['QUALITY'] = ''.join([chr(q + 33) for q in consensus.letter_annotations['phred_quality']]) # Calculate nucleotide diversity if max_diversity is not None: diversity = calculateDiversity(seq_list) result.log['DIVERSITY'] = diversity # If diversity exceeds threshold, feed result queue and continue if diversity > max_diversity: result_queue.put(result) continue # Calculate set error against consensus if max_error is not None: # Delete positions if required and calculate error if gap_positions is not None: seq_check = [deleteSeqPositions(s, gap_positions) for s in seq_list] else: seq_check = seq_list error = calculateSetError(seq_check, consensus) result.log['ERROR'] = error # If error exceeds threshold, feed result queue and continue if error > max_error: result_queue.put(result) continue # TODO: should move this into an improved annotationConsensus function with an action argument # Parse copy_field annotations and define consensus annotations if copy_fields is not None and copy_actions is not None: copy_ann = OrderedDict() for f, act in zip(copy_fields, copy_actions): # Numeric operations if act == 'min': vals = getAnnotationValues(seq_list, f, delimiter=delimiter) copy_ann[f] = '%.12g' % min([float(x or 0) for x in vals]) elif act == 'max': vals = getAnnotationValues(seq_list, f, delimiter=delimiter) copy_ann[f] = '%.12g' % max([float(x or 0) for x in vals]) elif act == 'sum': vals = getAnnotationValues(seq_list, f, delimiter=delimiter) copy_ann[f] = '%.12g' % sum([float(x or 0) for x in vals]) elif act == 'set': vals = annotationConsensus(seq_list, f, delimiter=delimiter) copy_ann[f] = vals['set'] copy_ann['%s_COUNT' % f] = vals['count'] elif act == 'majority': vals = annotationConsensus(seq_list, f, delimiter=delimiter) copy_ann[f] = vals['cons'] copy_ann['%s_FREQ' % f] = vals['freq'] else: copy_ann = None # Define annotation for output sequence cons_ann = OrderedDict([('ID', data.id), ('CONSCOUNT', cons_count)]) # Merge addition consensus annotations into output sequence annotations if primer_ann is not None: cons_ann = mergeAnnotation(cons_ann, primer_ann, delimiter=delimiter) if copy_ann is not None: cons_ann = mergeAnnotation(cons_ann, copy_ann, delimiter=delimiter) # Add output sequence annotations to consensus sequence consensus.id = consensus.name = flattenAnnotation(cons_ann, delimiter=delimiter) consensus.description = '' result.results = consensus result.valid = True # Feed results to result queue result_queue.put(result) else: sys.stderr.write('PID %s: Error in sibling process detected. Cleaning up.\n' \ % os.getpid()) return None except: alive.value = False sys.stderr.write('Error processing sequence set with ID: %s\n' % data.id) raise return None
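# --- Hedged sketch (not part of the original source) ---
# Shows how a queue worker such as processBCQueue is typically driven: a feeder
# fills data_queue, one None sentinel per worker signals shutdown, and one
# result per input item is drained from result_queue before joining. Extra
# worker arguments (e.g. cons_func) are assumed to be bound beforehand, for
# example with functools.partial(processBCQueue, cons_func=...).
def _example_worker_wiring(worker, data_iter, nproc=2):
    import multiprocessing as mp
    alive = mp.Value('b', True)
    data_queue, result_queue = mp.Queue(), mp.Queue()
    workers = [mp.Process(target=worker, args=(alive, data_queue, result_queue))
               for _ in range(nproc)]
    for p in workers:
        p.start()
    # Feed data and count expected results
    item_count = 0
    for item in data_iter:
        data_queue.put(item)
        item_count += 1
    # One sentinel per worker
    for _ in workers:
        data_queue.put(None)
    # Drain results before joining to avoid blocking on a full queue
    results = [result_queue.get() for _ in range(item_count)]
    for p in workers:
        p.join()
    return results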
def pairSeq(seq_file_1, seq_file_2, fields_1=None, fields_2=None, coord_type=default_coord_type, out_args=default_out_args): """ Generates consensus sequences Arguments: seq_file_1 = the file containing the grouped sequences and annotations seq_file_2 = the file to assign annotations to from seq_file_1 fields_1 = list of annotations in seq_file_1 records to copy to seq_file_2 records; if None do not copy any annotations fields_2 = list of annotations in seq_file_2 records to copy to seq_file_1 records; if None do not copy any annotations coord_type = the sequence header format out_args = common output argument dictionary from parseCommonArgs Returns: a list of tuples holding successfully paired filenames for (seq_file_1, seq_file_2) """ # Define private functions def _key_func(x): return getCoordKey(x, coord_type=coord_type, delimiter=out_args['delimiter']) log = OrderedDict() log['START'] = 'PairSeq' log['FILE1'] = os.path.basename(seq_file_1) log['FILE2'] = os.path.basename(seq_file_2) log['FIELDS_1'] = ','.join(fields_1) if fields_1 is not None else None log['FIELDS_2'] = ','.join(fields_2) if fields_2 is not None else None log['COORD_TYPE'] = coord_type printLog(log) # Define output type if out_args['out_type'] is None: out_type_1 = getFileType(seq_file_1) out_type_2 = getFileType(seq_file_2) else: out_type_1 = out_type_2 = out_args['out_type'] # Define output name if out_args['out_name'] is None: out_name_1 = out_name_2 = None else: out_name_1 = '%s-1' % out_args['out_name'] out_name_2 = '%s-2' % out_args['out_name'] # Open and count files start_time = time() printMessage("Indexing files", start_time=start_time) # Index file 1 seq_count_1 = countSeqFile(seq_file_1) seq_dict_1 = readSeqFile(seq_file_1, index=True, key_func=_key_func) # Define file 2 iterator seq_count_2 = countSeqFile(seq_file_2) seq_iter_2 = readSeqFile(seq_file_2, index=False) printMessage("Done", start_time=start_time, end=True) # Open output file handles pass_handle_1 = getOutputHandle(seq_file_1, 'pair-pass', out_args['out_dir'], out_name=out_name_1, out_type=out_type_1) pass_handle_2 = getOutputHandle(seq_file_2, 'pair-pass', out_args['out_dir'], out_name=out_name_2, out_type=out_type_2) if out_args['failed']: fail_handle_1 = getOutputHandle(seq_file_1, 'pair-fail', out_dir=out_args['out_dir'], out_name=out_name_1, out_type=out_type_1) fail_handle_2 = getOutputHandle(seq_file_2, 'pair-fail', out_dir=out_args['out_dir'], out_name=out_name_2, out_type=out_type_2) pass_keys = list() # Iterate over pairs and write to output files start_time = time() rec_count = pair_count = 0 for seq_2 in seq_iter_2: # Print progress for previous iteration printProgress(rec_count, seq_count_2, 0.05, start_time) rec_count += 1 # Check for file 2 mate pair in file 1 coord_2 = getCoordKey(seq_2.id, coord_type=coord_type, delimiter=out_args['delimiter']) seq_1 = seq_dict_1.get(coord_2, None) if seq_1 is not None: # Record paired keys pair_count += 1 if fields_1 is not None or fields_2 is not None: ann_1 = parseAnnotation(seq_1.description, delimiter=out_args['delimiter']) ann_2 = parseAnnotation(seq_2.description, delimiter=out_args['delimiter']) # Prepend annotations from seq_1 to seq_2 if fields_1 is not None: copy_ann = OrderedDict([(k, v) for k, v in ann_1.items() \ if k in fields_1]) merge_ann = mergeAnnotation(ann_2, copy_ann, prepend=True, delimiter=out_args['delimiter']) seq_2.id = flattenAnnotation(merge_ann, delimiter=out_args['delimiter']) seq_2.description = '' # Append annotations from seq_2 to seq_1 if fields_2 is not None: 
copy_ann = OrderedDict([(k, v) for k, v in ann_2.items() \ if k in fields_2]) merge_ann = mergeAnnotation(ann_1, copy_ann, prepend=False, delimiter=out_args['delimiter']) seq_1.id = flattenAnnotation(merge_ann, delimiter=out_args['delimiter']) seq_1.description = '' # Write paired records SeqIO.write(seq_1, pass_handle_1, out_type_1) SeqIO.write(seq_2, pass_handle_2, out_type_2) # Write unpaired file 2 records and updated paired key list for finding unpaired file 1 records if out_args['failed']: if seq_1 is not None: pass_keys.append(coord_2) else: SeqIO.write(seq_2, fail_handle_2, out_type_2) # Print final progress printProgress(rec_count, seq_count_2, 0.05, start_time) # Find and write unpaired file 1 records if out_args['failed']: start_time = time() printMessage("Finding unpaired", start_time=start_time) # Find file 1 unpaired keys pass_keys = set(pass_keys) unpaired = set(seq_dict_1).difference(pass_keys) # Write unpaired file 1 records for k in unpaired: SeqIO.write(seq_dict_1[k], fail_handle_1, out_type_1) printMessage("Done", start_time=start_time, end=True) # Print log log = OrderedDict() log['OUTPUT1'] = os.path.basename(pass_handle_1.name) log['OUTPUT2'] = os.path.basename(pass_handle_2.name) log['SEQUENCES1'] = seq_count_1 log['SEQUENCES2'] = seq_count_2 log['PASS'] = pair_count log['END'] = 'PairSeq' printLog(log) # Close file handles pass_handle_1.close() pass_handle_2.close() return [(pass_handle_1.name, pass_handle_2.name)]
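# --- Hedged, simplified illustration (not part of the original source) ---
# In-memory version of the pairing strategy used by pairSeq above: index mate 1
# records by their coordinate key, stream mate 2, and split records into
# paired and unpaired sets. The real function streams file 2 from disk and
# uses an indexed reader for file 1 instead of plain lists.
def _example_pair_by_coordinate(mate1_records, mate2_records, key_func):
    index_1 = {key_func(rec.id): rec for rec in mate1_records}
    paired_keys = set()
    pairs, unpaired_2 = [], []
    for rec_2 in mate2_records:
        key = key_func(rec_2.id)
        rec_1 = index_1.get(key)
        if rec_1 is not None:
            paired_keys.add(key)
            pairs.append((rec_1, rec_2))
        else:
            unpaired_2.append(rec_2)
    unpaired_1 = [index_1[k] for k in index_1 if k not in paired_keys]
    return pairs, unpaired_1, unpaired_2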
def processCSQueue(alive, data_queue, result_queue, cluster_field,
                   cluster_args={}, delimiter=default_delimiter):
    """
    Pulls from data queue, performs calculations, and feeds results queue

    Arguments:
      alive = a multiprocessing.Value boolean controlling whether processing
              continues; when False function returns
      data_queue = a multiprocessing.Queue holding data to process
      result_queue = a multiprocessing.Queue to hold processed results
      cluster_field = string defining the output cluster field name
      cluster_args = a dictionary of optional arguments for the clustering function
      delimiter = a tuple of delimiters for (annotations, field/values, value lists)

    Returns:
      None
    """
    try:
        # Iterate over data queue until sentinel object reached
        while alive.value:
            # Get data from queue
            if data_queue.empty():
                continue
            else:
                data = data_queue.get()
            # Exit upon reaching sentinel
            if data is None:
                break

            # Define result object
            result = SeqResult(data.id, data.data)
            result.log['BARCODE'] = data.id
            result.log['SEQCOUNT'] = len(data)

            # Perform clustering
            cluster_dict = runUClust(data.data, **cluster_args)

            # Process failed result
            if cluster_dict is None:
                # Update log
                result.log['CLUSTERS'] = 0
                for i, seq in enumerate(data.data, start=1):
                    result.log['CLUST0-%i' % i] = str(seq.seq)
                # Feed results queue and continue
                result_queue.put(result)
                continue

            # Get number of clusters
            result.log['CLUSTERS'] = len(cluster_dict)

            # Update sequence annotations with cluster assignments
            results = list()
            seq_dict = {s.id: s for s in data.data}
            for clust, id_list in cluster_dict.items():
                for i, seq_id in enumerate(id_list, start=1):
                    # Add cluster annotation
                    seq = seq_dict[seq_id]
                    header = parseAnnotation(seq.description, delimiter=delimiter)
                    header = mergeAnnotation(header, {cluster_field: clust},
                                             delimiter=delimiter)
                    seq.id = seq.name = flattenAnnotation(header, delimiter=delimiter)
                    seq.description = ''
                    # Update log and results
                    result.log['CLUST%i-%i' % (clust, i)] = str(seq.seq)
                    results.append(seq)

            # Check results
            result.results = results
            result.valid = (len(results) == len(seq_dict))

            # Feed results to result queue
            result_queue.put(result)
        else:
            sys.stderr.write('PID %s: Error in sibling process detected. Cleaning up.\n' \
                             % os.getpid())
            return None
    except:
        alive.value = False
        sys.stderr.write('Error processing sequence set with ID: %s.\n' % data.id)
        raise

    return None
def modifyHeaders(seq_file, modify_func, modify_args, out_args=default_out_args): """ Modifies sequence headers Arguments: seq_file = the sequence file name modify_func = the function defining the modification operation modify_args = a dictionary of arguments to pass to modify_func out_args = common output argument dictionary from parseCommonArgs Returns: the output file name """ # Define subcommand label dictionary cmd_dict = {addHeader: 'add', copyHeader: 'copy', collapseHeader: 'collapse', deleteHeader: 'delete', expandHeader: 'expand', renameHeader: 'rename'} # Print parameter info log = OrderedDict() log['START'] = 'ParseHeaders' log['COMMAND'] = cmd_dict.get(modify_func, modify_func.__name__) log['FILE'] = os.path.basename(seq_file) for k in sorted(modify_args): v = modify_args[k] log[k.upper()] = ','.join(v) if isinstance(v, list) else v printLog(log) # Open file handles in_type = getFileType(seq_file) seq_iter = readSeqFile(seq_file) if out_args['out_type'] is None: out_args['out_type'] = in_type out_handle = getOutputHandle(seq_file, 'reheader', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type=out_args['out_type']) # Count records result_count = countSeqFile(seq_file) # Iterate over sequences start_time = time() seq_count = 0 for seq in seq_iter: # Print progress for previous iteration printProgress(seq_count, result_count, 0.05, start_time) #Update counts seq_count += 1 # Modify header header = parseAnnotation(seq.description, delimiter=out_args['delimiter']) header = modify_func(header, delimiter=out_args['delimiter'], **modify_args) # Write new sequence seq.id = seq.name = flattenAnnotation(header, delimiter=out_args['delimiter']) seq.description = '' SeqIO.write(seq, out_handle, out_args['out_type']) # print counts printProgress(seq_count, result_count, 0.05, start_time) log = OrderedDict() log['OUTPUT'] = os.path.basename(out_handle.name) log['SEQUENCES'] = seq_count log['END'] = 'ParseHeaders' printLog(log) # Close file handles out_handle.close() return out_handle.name
def modifyHeaders(seq_file, modify_func, modify_args, out_file=None, out_args=default_out_args): """ Modifies sequence headers Arguments: seq_file : the sequence file name. modify_func : the function defining the modification operation. modify_args : a dictionary of arguments to pass to modify_func. out_file : output file name. Automatically generated from the input file if None. out_args : common output argument dictionary from parseCommonArgs. Returns: str: output file name. """ # Define subcommand label dictionary cmd_dict = {addHeader: 'add', copyHeader: 'copy', collapseHeader: 'collapse', deleteHeader: 'delete', expandHeader: 'expand', renameHeader: 'rename'} # Print parameter info log = OrderedDict() log['START'] = 'ParseHeaders' log['COMMAND'] = cmd_dict.get(modify_func, modify_func.__name__) log['FILE'] = os.path.basename(seq_file) for k in sorted(modify_args): v = modify_args[k] log[k.upper()] = ','.join(v) if isinstance(v, list) else v printLog(log) # Open file handles in_type = getFileType(seq_file) seq_iter = readSeqFile(seq_file) if out_args['out_type'] is None: out_args['out_type'] = in_type if out_file is not None: out_handle = open(out_file, 'w') else: out_handle = getOutputHandle(seq_file, 'reheader', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type=out_args['out_type']) # Count records result_count = countSeqFile(seq_file) # Iterate over sequences start_time = time() seq_count = 0 for seq in seq_iter: # Print progress for previous iteration printProgress(seq_count, result_count, 0.05, start_time=start_time) #Update counts seq_count += 1 # Modify header header = parseAnnotation(seq.description, delimiter=out_args['delimiter']) header = modify_func(header, delimiter=out_args['delimiter'], **modify_args) # Write new sequence seq.id = seq.name = flattenAnnotation(header, delimiter=out_args['delimiter']) seq.description = '' SeqIO.write(seq, out_handle, out_args['out_type']) # print counts printProgress(seq_count, result_count, 0.05, start_time=start_time) log = OrderedDict() log['OUTPUT'] = os.path.basename(out_handle.name) log['SEQUENCES'] = seq_count log['END'] = 'ParseHeaders' printLog(log) # Close file handles out_handle.close() return out_handle.name
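# --- Hedged sketch (not part of the original source) ---
# A minimal modify_func compatible with the modifyHeaders call signature
# modify_func(header, delimiter=..., **modify_args). The function below is a
# hypothetical stand-in that renames one annotation field; the real
# ParseHeaders operations (addHeader, renameHeader, ...) are defined elsewhere.
def _example_rename_field(header, old_field, new_field, delimiter=('|', '=', ',')):
    from collections import OrderedDict
    return OrderedDict((new_field if k == old_field else k, v)
                       for k, v in header.items())
# Example (filename is illustrative only):
#   modifyHeaders('reads.fastq', _example_rename_field,
#                 {'old_field': 'PRIMER', 'new_field': 'PRCONS'})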
def convertHeaders(seq_file, convert_func, convert_args={}, out_file=None,
                   out_args=default_out_args):
    """
    Converts sequence headers to the pRESTO format

    Arguments:
      seq_file : the sequence file name.
      convert_func : the function used to convert sequence headers.
      convert_args : a dictionary of arguments to pass to convert_func.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      str: the output sequence file name.
    """
    # Define subcommand label dictionary
    cmd_dict = {convertGenericHeader: 'generic', convert454Header: '454',
                convertGenbankHeader: 'genbank', convertIlluminaHeader: 'illumina',
                convertIMGTHeader: 'imgt', convertMIGECHeader: 'migec',
                convertSRAHeader: 'sra'}

    log = OrderedDict()
    log['START'] = 'ConvertHeaders'
    log['COMMAND'] = cmd_dict[convert_func]
    log['FILE'] = os.path.basename(seq_file)
    printLog(log)

    # Open input file
    in_type = getFileType(seq_file)
    seq_iter = readSeqFile(seq_file)
    if out_args['out_type'] is None:
        out_args['out_type'] = in_type

    # Wrapper for opening handles and writers
    def _open(x, out_file=out_file):
        if out_file is not None and x == 'pass':
            handle = open(out_file, 'w')
        else:
            handle = getOutputHandle(seq_file, 'convert-%s' % x,
                                     out_dir=out_args['out_dir'],
                                     out_name=out_args['out_name'],
                                     out_type=out_args['out_type'])
        return handle

    # Count records
    result_count = countSeqFile(seq_file)

    # Set additional conversion arguments
    if convert_func in [convertGenericHeader, convertGenbankHeader]:
        convert_args.update({'delimiter': out_args['delimiter']})

    # Initialize file handles
    pass_handle, fail_handle = None, None

    # Iterate over sequences
    start_time = time()
    seq_count = pass_count = fail_count = 0
    for seq in seq_iter:
        # Print progress for previous iteration and update count
        printProgress(seq_count, result_count, 0.05, start_time=start_time)
        seq_count += 1

        # Convert header
        header = convert_func(seq.description, **convert_args)

        if header is not None:
            # Write successfully converted sequences
            pass_count += 1
            seq.id = seq.name = flattenAnnotation(header, out_args['delimiter'])
            seq.description = ''
            try:
                SeqIO.write(seq, pass_handle, out_args['out_type'])
            except AttributeError:
                # Open output file
                pass_handle = _open('pass')
                SeqIO.write(seq, pass_handle, out_args['out_type'])
        else:
            fail_count += 1
            if out_args['failed']:
                # Write unconverted sequences
                try:
                    SeqIO.write(seq, fail_handle, out_args['out_type'])
                except AttributeError:
                    # Open output file
                    fail_handle = _open('fail')
                    SeqIO.write(seq, fail_handle, out_args['out_type'])

    # Print counts
    printProgress(seq_count, result_count, 0.05, start_time=start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name) if pass_handle is not None else None
    log['SEQUENCES'] = seq_count
    log['PASS'] = pass_count
    log['FAIL'] = fail_count
    log['END'] = 'ConvertHeaders'
    printLog(log)

    # Close file handles
    if pass_handle is not None:
        pass_handle.close()
    if fail_handle is not None:
        fail_handle.close()

    return pass_handle.name
def collapseSeq(seq_file, max_missing=default_max_missing, uniq_fields=None, copy_fields=None, copy_actions=None, max_field=None, min_field=None, inner=False, keep_missing=False, out_file=None, out_args=default_out_args): """ Removes duplicate sequences from a file Arguments: seq_file : filename of the sequence file to sample from. max_missing : number of ambiguous characters to allow in a unique sequence. uniq_fields : a list of annotations that define a sequence as unique if they differ. copy_fields : a list of annotations to copy into unique sequence annotations. copy_actions : the list of collapseAnnotation actions to take on copy_fields. max_field : a numeric field whose maximum value determines the retained sequence. min_field : a numeric field whose minimum value determines the retained sequence. inner : if True exclude consecutive outer ambiguous characters from iterations and matching. keep_missing : if True retain sequences with more ambiguous characters than max_missing as unique. out_file : output file name. Automatically generated from the input file if None. out_args : common output argument dictionary from parseCommonArgs. Returns: str: the collapsed output file name. """ log = OrderedDict() log['START'] = 'CollapseSeq' log['FILE'] = os.path.basename(seq_file) log['MAX_MISSING'] = max_missing log['UNIQ_FIELDS'] = ','.join([str(x) for x in uniq_fields]) \ if uniq_fields is not None else None log['COPY_FIELDS'] = ','.join([str(x) for x in copy_fields]) \ if copy_fields is not None else None log['COPY_ACTIONS'] = ','.join([str(x) for x in copy_actions]) \ if copy_actions is not None else None log['MAX_FIELD'] = max_field log['MIN_FIELD'] = min_field log['INNER'] = inner log['KEEP_MISSING'] = keep_missing printLog(log) # Read input file in_type = getFileType(seq_file) seq_dict = SeqIO.to_dict(readSeqFile(seq_file, index=False)) if out_args['out_type'] is None: out_args['out_type'] = in_type # Count total sequences rec_count = len(seq_dict) # Open unique record output handle if out_file is not None: pass_handle = open(out_file, 'w') else: pass_handle = getOutputHandle(seq_file, 'collapse-unique', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type=out_args['out_type']) # Define log handle if out_args['log_file'] is None: log_handle = None else: log_handle = open(out_args['log_file'], 'w') # Find sequences with duplicates uniq_dict = {} # Added list typing for compatibility issue with Python 2.7.5 on OS X # TypeError: object of type 'dictionary-keyiterator' has no len() search_keys = list(seq_dict.keys()) dup_keys = [] for n in range(0, max_missing + 1): # Find unique sequences uniq_dict, search_keys, dup_list = findUniqueSeq(uniq_dict, search_keys, seq_dict, n, uniq_fields, copy_fields, max_field, min_field, inner, out_args['delimiter']) # Update list of duplicates dup_keys.extend(dup_list) # Break if no keys to search remain if len(search_keys) == 0: break # Write unique sequences for val in uniq_dict.values(): # Define output sequence out_seq = val.seq out_ann = parseAnnotation(out_seq.description, delimiter=out_args['delimiter']) out_app = OrderedDict() if copy_fields is not None and copy_actions is not None: for f, a in zip(copy_fields, copy_actions): x = collapseAnnotation(val.annotations, a, f, delimiter=out_args['delimiter']) out_app[f] = x[f] out_ann.pop(f, None) out_app['DUPCOUNT'] = val.count out_ann = mergeAnnotation(out_ann, out_app, delimiter=out_args['delimiter']) out_seq.id = out_seq.name = flattenAnnotation(out_ann, 
delimiter=out_args['delimiter']) out_seq.description = '' # Write unique sequence SeqIO.write(out_seq, pass_handle, out_args['out_type']) # Update log log = OrderedDict() log['HEADER'] = out_seq.id log['DUPCOUNT'] = val.count for i, k in enumerate(val.keys, start=1): log['ID%i' % i] = k for i, k in enumerate(val.keys, start=1): log['SEQ%i' % i] = str(seq_dict[k].seq) printLog(log, handle=log_handle) # Write sequence with high missing character counts if keep_missing: for k in search_keys: out_seq = seq_dict[k] out_ann = parseAnnotation(out_seq.description, delimiter=out_args['delimiter']) out_ann = mergeAnnotation(out_ann, {'DUPCOUNT':1}, delimiter=out_args['delimiter']) out_seq.id = out_seq.name = flattenAnnotation(out_ann, delimiter=out_args['delimiter']) out_seq.description = '' SeqIO.write(out_seq, pass_handle, out_args['out_type']) # Write sequence with high missing character counts if out_args['failed'] and not keep_missing: with getOutputHandle(seq_file, 'collapse-undetermined', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type=out_args['out_type']) \ as missing_handle: for k in search_keys: SeqIO.write(seq_dict[k], missing_handle, out_args['out_type']) if out_args['failed']: # Write duplicate sequences with getOutputHandle(seq_file, 'collapse-duplicate', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type=out_args['out_type']) \ as dup_handle: for k in dup_keys: SeqIO.write(seq_dict[k], dup_handle, out_args['out_type']) # Print log log = OrderedDict() log['OUTPUT'] = os.path.basename(pass_handle.name) log['SEQUENCES'] = rec_count log['UNIQUE'] = len(uniq_dict) log['DUPLICATE'] = len(dup_keys) log['UNDETERMINED'] = len(search_keys) log['END'] = 'CollapseSeq' printLog(log) # Close file handles pass_file = pass_handle.name if pass_handle is not None: pass_handle.close() if log_handle is not None: log_handle.close() return pass_file
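# --- Hedged, simplified illustration (not part of the original source) ---
# Collapses exact duplicates and records a DUPCOUNT annotation, to show what
# the unique-sequence output above encodes. The real findUniqueSeq also
# handles ambiguous characters, uniq_fields, copy_fields and max/min fields,
# which this sketch ignores. The delimiter tuple ('|', '=', ',') is assumed.
def _example_collapse_exact(records, delimiter=('|', '=', ',')):
    from collections import OrderedDict
    groups = OrderedDict()
    for rec in records:
        groups.setdefault(str(rec.seq), []).append(rec)
    unique = []
    for members in groups.values():
        keep = members[0]
        ann = parseAnnotation(keep.description, delimiter=delimiter)
        ann = mergeAnnotation(ann, {'DUPCOUNT': len(members)}, delimiter=delimiter)
        keep.id = keep.name = flattenAnnotation(ann, delimiter=delimiter)
        keep.description = ''
        unique.append(keep)
    return unique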
def convertHeaders(seq_file, convert_func, convert_args={}, out_args=default_out_args): """ Converts sequence headers to the pRESTO format Arguments: seq_file = the sequence file name convert_func = the function used to convert sequence headers convert_args = a dictionary of arguments to pass to convert_func out_args = common output argument dictionary from parseCommonArgs Returns: the output sequence file name """ # Define subcommand label dictionary cmd_dict = {convertGenericHeader:'generic', convert454Header:'454', convertGenbankHeader:'genbank', convertIlluminaHeader:'illumina', convertIMGTHeader:'imgt', convertSRAHeader:'sra'} log = OrderedDict() log['START'] = 'ConvertHeaders' log['COMMAND'] = cmd_dict[convert_func] log['FILE'] = os.path.basename(seq_file) printLog(log) # Open input file in_type = getFileType(seq_file) seq_iter = readSeqFile(seq_file) if out_args['out_type'] is None: out_args['out_type'] = in_type # Count records result_count = countSeqFile(seq_file) # Open output file handles pass_handle = getOutputHandle(seq_file, 'convert-pass', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type=out_args['out_type']) if out_args['failed']: fail_handle = getOutputHandle(seq_file, 'convert-fail', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type=out_args['out_type']) else: fail_handle = None # Set additional conversion arguments if convert_func in [convertGenericHeader, convertGenbankHeader]: convert_args.update({'delimiter':out_args['delimiter']}) # Iterate over sequences start_time = time() seq_count = pass_count = fail_count = 0 for seq in seq_iter: # Print progress for previous iteration and update count printProgress(seq_count, result_count, 0.05, start_time) seq_count += 1 # Convert header header = convert_func(seq.description, **convert_args) if header is not None: # Write successfully converted sequences pass_count += 1 seq.id = seq.name = flattenAnnotation(header, out_args['delimiter']) seq.description = '' SeqIO.write(seq, pass_handle, out_args['out_type']) else: fail_count += 1 if fail_handle is not None: # Write successfully unconverted sequences SeqIO.write(seq, fail_handle, out_args['out_type']) # Print counts printProgress(seq_count, result_count, 0.05, start_time) log = OrderedDict() log['OUTPUT'] = os.path.basename(pass_handle.name) log['SEQUENCES'] = seq_count log['PASS'] = pass_count log['FAIL'] = fail_count log['END'] = 'ConvertHeaders' printLog(log) # Close file handles pass_handle.close() if fail_handle is not None: fail_handle.close() return pass_handle.name
def collapseSeq(seq_file, max_missing=default_max_missing, uniq_fields=None, copy_fields=None, copy_actions=None, max_field=None, min_field=None, inner=False, keep_missing=False, out_args=default_out_args): """ Removes duplicate sequences from a file Arguments: seq_file = filename of the sequence file to sample from max_missing = number of ambiguous characters to allow in a unique sequence uniq_fields = a list of annotations that define a sequence as unique if they differ copy_fields = a list of annotations to copy into unique sequence annotations copy_actions = the list of collapseAnnotation actions to take on copy_fields max_field = a numeric field whose maximum value determines the retained sequence min_field = a numeric field whose minimum value determines the retained sequence inner = if True exclude consecutive outer ambiguous characters from iterations and matching keep_missing = if True retain sequences with more ambiguous characters than max_missing as unique out_args = common output argument dictionary from parseCommonArgs Returns: the collapsed output file name """ log = OrderedDict() log['START'] = 'CollapseSeq' log['FILE'] = os.path.basename(seq_file) log['MAX_MISSING'] = max_missing log['UNIQ_FIELDS'] = ','.join([str(x) for x in uniq_fields]) \ if uniq_fields is not None else None log['COPY_FIELDS'] = ','.join([str(x) for x in copy_fields]) \ if copy_fields is not None else None log['COPY_ACTIONS'] = ','.join([str(x) for x in copy_actions]) \ if copy_actions is not None else None log['MAX_FIELD'] = max_field log['MIN_FIELD'] = min_field log['INNER'] = inner log['KEEP_MISSING'] = keep_missing printLog(log) # TODO: storing all sequences in memory is faster # Read input file in_type = getFileType(seq_file) #seq_dict = readSeqFile(seq_file, index=True) seq_dict = SeqIO.to_dict(readSeqFile(seq_file, index=False)) if out_args['out_type'] is None: out_args['out_type'] = in_type # Count total sequences rec_count = len(seq_dict) # Define log handle if out_args['log_file'] is None: log_handle = None else: log_handle = open(out_args['log_file'], 'w') # Find sequences with duplicates uniq_dict = {} # Added list typing for compatibility issue with Python 2.7.5 on OS X # TypeError: object of type 'dictionary-keyiterator' has no len() search_keys = list(seq_dict.keys()) dup_keys = [] for n in range(0, max_missing + 1): # Find unique sequences uniq_dict, search_keys, dup_list = findUniqueSeq(uniq_dict, search_keys, seq_dict, n, uniq_fields, copy_fields, max_field, min_field, inner, out_args['delimiter']) # Update list of duplicates dup_keys.extend(dup_list) # Update log log = OrderedDict() log['ITERATION'] = n + 1 log['MISSING'] = n log['UNIQUE'] = len(uniq_dict) log['DUPLICATE'] = len(dup_keys) log['UNDETERMINED'] = len(search_keys) printLog(log, handle=log_handle) # Break if no keys to search remain if len(search_keys) == 0: break # Write unique sequences with getOutputHandle(seq_file, 'collapse-unique', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type=out_args['out_type']) \ as uniq_handle: for val in uniq_dict.values(): # Define output sequence out_seq = val[0] out_ann = parseAnnotation(out_seq.description, delimiter=out_args['delimiter']) out_app = OrderedDict() if copy_fields is not None and copy_actions is not None: for f, a, s in zip(copy_fields, copy_actions, val[3:]): out_app[f] = s out_app = collapseAnnotation(out_app, a, f, delimiter=out_args['delimiter']) out_ann.pop(f, None) out_app['DUPCOUNT'] = val[1] out_ann = mergeAnnotation(out_ann, out_app, 
delimiter=out_args['delimiter']) out_seq.id = out_seq.name = flattenAnnotation(out_ann, delimiter=out_args['delimiter']) out_seq.description = '' # Write unique sequence SeqIO.write(out_seq, uniq_handle, out_args['out_type']) # Write sequence with high missing character counts if keep_missing: for k in search_keys: out_seq = seq_dict[k] out_ann = parseAnnotation(out_seq.description, delimiter=out_args['delimiter']) out_ann = mergeAnnotation(out_ann, {'DUPCOUNT':1}, delimiter=out_args['delimiter']) out_seq.id = out_seq.name = flattenAnnotation(out_ann, delimiter=out_args['delimiter']) out_seq.description = '' SeqIO.write(out_seq, uniq_handle, out_args['out_type']) # Write sequence with high missing character counts if out_args['failed'] and not keep_missing: with getOutputHandle(seq_file, 'collapse-undetermined', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type=out_args['out_type']) \ as missing_handle: for k in search_keys: SeqIO.write(seq_dict[k], missing_handle, out_args['out_type']) if out_args['failed']: # Write duplicate sequences with getOutputHandle(seq_file, 'collapse-duplicate', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type=out_args['out_type']) \ as dup_handle: for k in dup_keys: SeqIO.write(seq_dict[k], dup_handle, out_args['out_type']) # Print log log = OrderedDict() log['OUTPUT'] = os.path.basename(uniq_handle.name) log['SEQUENCES'] = rec_count log['UNIQUE'] = len(uniq_dict) log['DUPLICATE'] = len(dup_keys) log['UNDETERMINED'] = len(search_keys) log['END'] = 'CollapseSeq' printLog(log) # Close file handles if log_handle is not None: log_handle.close() return uniq_handle.name
def processAssembly(data, assemble_func, assemble_args={}, rc=None,
                    fields_1=None, fields_2=None, delimiter=default_delimiter):
    """
    Performs assembly of a sequence pair

    Arguments:
      data = a SeqData object with a list of exactly two SeqRecords
      assemble_func = the function to use to assemble paired ends
      assemble_args = a dictionary of arguments to pass to the assembly function
      rc = defines which sequences ('head', 'tail', 'both') to reverse complement
           before assembly; if None do not reverse complement sequences
      fields_1 = list of annotations in head SeqRecord to copy to assembled record;
                 if None do not copy an annotation
      fields_2 = list of annotations in tail SeqRecord to copy to assembled record;
                 if None do not copy an annotation
      delimiter = a tuple of delimiters for (fields, values, value lists)

    Returns:
      a SeqResult object
    """
    # Reverse complement sequences if required
    head_seq = data.data[0] if rc not in ('head', 'both') \
               else reverseComplement(data.data[0])
    tail_seq = data.data[1] if rc not in ('tail', 'both') \
               else reverseComplement(data.data[1])

    # Define result object
    result = SeqResult(data.id, [head_seq, tail_seq])

    # Define stitched sequence annotation
    stitch_ann = OrderedDict([('ID', data.id)])
    if fields_1 is not None:
        head_ann = parseAnnotation(head_seq.description, fields_1,
                                   delimiter=delimiter)
        stitch_ann = mergeAnnotation(stitch_ann, head_ann, delimiter=delimiter)
        result.log['FIELDS1'] = '|'.join(['%s=%s' % (k, v) for k, v in head_ann.items()])
    if fields_2 is not None:
        tail_ann = parseAnnotation(tail_seq.description, fields_2,
                                   delimiter=delimiter)
        stitch_ann = mergeAnnotation(stitch_ann, tail_ann, delimiter=delimiter)
        result.log['FIELDS2'] = '|'.join(['%s=%s' % (k, v) for k, v in tail_ann.items()])

    # Assemble sequences
    stitch = assemble_func(head_seq, tail_seq, **assemble_args)
    ab = stitch.head_pos
    xy = stitch.tail_pos
    result.valid = stitch.valid

    # Add reference to log
    if stitch.ref_seq is not None and stitch.ref_pos is not None:
        result.log['REFID'] = stitch.ref_seq.id
        result.log['REFSEQ'] = ' ' * stitch.ref_pos[0] + stitch.ref_seq.seq

    if ab is not None and xy is not None:
        result.log['SEQ1'] = ' ' * xy[0] + head_seq.seq
        result.log['SEQ2'] = ' ' * ab[0] + tail_seq.seq
    else:
        result.log['SEQ1'] = head_seq.seq
        result.log['SEQ2'] = ' ' * (len(head_seq) + (stitch.gap or 0)) + tail_seq.seq

    # Define stitching log
    if stitch.seq is not None:
        # Update stitch annotation
        stitch.seq.id = flattenAnnotation(stitch_ann, delimiter=delimiter)
        stitch.seq.name = stitch.seq.id
        stitch.seq.description = ''
        result.results = stitch.seq
        # Add assembly to log
        result.log['ASSEMBLY'] = stitch.seq.seq
        if 'phred_quality' in stitch.seq.letter_annotations:
            result.log['QUALITY'] = ''.join([chr(q + 33) for q in
                                             stitch.seq.letter_annotations['phred_quality']])
        result.log['LENGTH'] = len(stitch)
        result.log['OVERLAP'] = stitch.overlap
    else:
        result.log['ASSEMBLY'] = None

    # Add mode specific log results
    if stitch.gap is not None:
        result.log['GAP'] = stitch.gap
    if stitch.error is not None:
        result.log['ERROR'] = '%.4f' % stitch.error
    if stitch.pvalue is not None:
        result.log['PVALUE'] = '%.4e' % stitch.pvalue
    if stitch.evalue is not None:
        result.log['EVALUE1'] = '%.4e' % stitch.evalue[0]
        result.log['EVALUE2'] = '%.4e' % stitch.evalue[1]
    if stitch.ident is not None:
        result.log['IDENTITY'] = '%.4f' % stitch.ident

    return result
def assemblyWorker(data, assemble_func, assemble_args={}, rc='tail', fields_1=None, fields_2=None, delimiter=default_delimiter): """ Performs assembly of a sequence pair Arguments: data : a SeqData object with a list of exactly two SeqRecords. assemble_func : the function to use to assemble paired ends. assemble_args : a dictionary of arguments to pass to the assembly function. rc : Defines which sequences ('head', 'tail', 'both', 'none') to reverse complement before assembly; if None do not reverse complement sequences. fields_1 : list of annotations in head SeqRecord to copy to assembled record; if None do not copy an annotation. fields_2 : list of annotations in tail SeqRecord to copy to assembled record; if None do not copy an annotation. delimiter : a tuple of delimiters for (fields, values, value lists). Returns: SeqResult: a SeqResult object """ # Define result object result = SeqResult(data.id, data.data) # Reverse complement sequences if required head_seq = data.data[0] if rc not in ('head', 'both') \ else reverseComplement(data.data[0]) tail_seq = data.data[1] if rc not in ('tail', 'both') \ else reverseComplement(data.data[1]) # Define stitched sequence annotation stitch_ann = OrderedDict([('ID', data.id)]) if fields_1 is not None: head_ann = parseAnnotation(head_seq.description, fields_1, delimiter=delimiter) stitch_ann = mergeAnnotation(stitch_ann, head_ann, delimiter=delimiter) result.log['FIELDS1'] = '|'.join( ['%s=%s' % (k, v) for k, v in head_ann.items()]) if fields_2 is not None: tail_ann = parseAnnotation(tail_seq.description, fields_2, delimiter=delimiter) stitch_ann = mergeAnnotation(stitch_ann, tail_ann, delimiter=delimiter) result.log['FIELDS2'] = '|'.join( ['%s=%s' % (k, v) for k, v in tail_ann.items()]) # Assemble sequences stitch = assemble_func(head_seq, tail_seq, **assemble_args) ab = stitch.head_pos xy = stitch.tail_pos result.valid = stitch.valid # Add reference to log if stitch.ref_seq is not None and stitch.ref_pos is not None: result.log['REFID'] = stitch.ref_seq.id result.log['REFSEQ'] = ' ' * stitch.ref_pos[0] + stitch.ref_seq.seq if ab is not None and xy is not None: result.log['SEQ1'] = ' ' * xy[0] + head_seq.seq result.log['SEQ2'] = ' ' * ab[0] + tail_seq.seq else: result.log['SEQ1'] = head_seq.seq result.log['SEQ2'] = ' ' * (len(head_seq) + (stitch.gap or 0)) + tail_seq.seq # Define stitching log if stitch.seq is not None: # Update stitch annotation stitch.seq.id = flattenAnnotation(stitch_ann, delimiter=delimiter) stitch.seq.name = stitch.seq.id stitch.seq.description = '' result.results = stitch.seq # Add assembly to log result.log['ASSEMBLY'] = stitch.seq.seq if 'phred_quality' in stitch.seq.letter_annotations: result.log['QUALITY'] = ''.join([ chr(q + 33) for q in stitch.seq.letter_annotations['phred_quality'] ]) result.log['LENGTH'] = len(stitch) result.log['OVERLAP'] = stitch.overlap else: result.log['ASSEMBLY'] = None # Add mode specific log results if stitch.gap is not None: result.log['GAP'] = stitch.gap if stitch.error is not None: result.log['ERROR'] = '%.4f' % stitch.error if stitch.pvalue is not None: result.log['PVALUE'] = '%.4e' % stitch.pvalue if stitch.evalue is not None: result.log['EVALUE1'] = '%.4e' % stitch.evalue[0] result.log['EVALUE2'] = '%.4e' % stitch.evalue[1] if stitch.ident is not None: result.log['IDENTITY'] = '%.4f' % stitch.ident return result
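# --- Hedged toy example (not part of the original source) ---
# A naive overlap join, only to make the stitch/overlap log fields above
# concrete. This is NOT the alignment or scoring performed by the real
# assemble_func implementations; it simply concatenates two reads at a known,
# already-verified overlap length.
def _example_naive_stitch(head, tail, overlap):
    assert head[-overlap:] == tail[:overlap], 'reads do not overlap as stated'
    return head + tail[overlap:]
# Example: _example_naive_stitch('ACGTTGCA', 'TGCAGGTT', 4) -> 'ACGTTGCAGGTT'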
def getMaskedSeq(align, mode='mask', barcode=False, delimiter=default_delimiter):
    """
    Create an output sequence with primers masked or cut

    Arguments:
      align = a PrimerAlignment object returned from alignPrimers or scorePrimers
      mode = defines the action taken; one of ['cut', 'mask', 'tag', 'trim']
      barcode = if True add sequence preceding primer to description
      delimiter = a tuple of delimiters for (annotations, field/values, value lists)

    Returns:
      output SeqRecord object
    """
    seq = align.seq

    # Build output sequence
    if mode == 'tag' or not align.align_primer:
        # Do not modify sequence
        out_seq = seq
    elif mode == 'trim':
        # Remove region before primer
        if not align.rev_primer:
            out_seq = seq[align.start:]
        else:
            out_seq = seq[:align.end]
    elif mode == 'cut':
        # Remove primer and preceding region
        if not align.rev_primer:
            out_seq = seq[align.end:]
        else:
            out_seq = seq[:align.start]
    elif mode == 'mask':
        # Mask primer with Ns and remove preceding region
        if not align.rev_primer:
            mask_len = align.end - align.start + align.gaps
            out_seq = 'N' * mask_len + seq[align.end:]
            if hasattr(seq, 'letter_annotations') and \
                    'phred_quality' in seq.letter_annotations:
                out_seq.letter_annotations['phred_quality'] = \
                    [0] * mask_len + \
                    seq.letter_annotations['phred_quality'][align.end:]
        else:
            mask_len = min(align.end, len(seq)) - align.start + align.gaps
            out_seq = seq[:align.start] + 'N' * mask_len
            if hasattr(seq, 'letter_annotations') and \
                    'phred_quality' in seq.letter_annotations:
                out_seq.letter_annotations['phred_quality'] = \
                    seq.letter_annotations['phred_quality'][:align.start] + \
                    [0] * mask_len

    # Add alignment annotations to output SeqRecord
    out_seq.annotations = seq.annotations
    out_seq.annotations['primer'] = align.primer
    out_seq.annotations['prstart'] = align.start
    out_seq.annotations['error'] = align.error

    # Parse seq annotation and create output annotation
    seq_ann = parseAnnotation(seq.description, delimiter=delimiter)
    out_ann = OrderedDict([('SEQORIENT', seq.annotations['seqorient']),
                           ('PRIMER', align.primer)])

    # Add ID sequence to description
    if barcode:
        seq_code = seq[:align.start].seq if not align.rev_primer \
                   else seq[align.end:].seq
        out_seq.annotations['barcode'] = seq_code
        out_ann['BARCODE'] = seq_code

    out_ann = mergeAnnotation(seq_ann, out_ann, delimiter=delimiter)
    out_seq.id = flattenAnnotation(out_ann, delimiter=delimiter)
    out_seq.description = ''

    return out_seq
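# --- Hedged, string-level illustration (not part of the original source) ---
# Mirrors the arithmetic of the 'mask' mode above for a forward primer: the
# primer match (plus any alignment gaps) is replaced with Ns, its quality
# scores are set to 0, and the region preceding the primer is dropped.
def _example_mask_forward(seq, qual, start, end, gaps=0):
    mask_len = end - start + gaps
    masked_seq = 'N' * mask_len + seq[end:]
    masked_qual = [0] * mask_len + qual[end:]
    return masked_seq, masked_qual
# Example: _example_mask_forward('TTACGTACGGCC', [30] * 12, 2, 6)
#          -> ('NNNNACGGCC', [0, 0, 0, 0, 30, 30, 30, 30, 30, 30])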
def processQueue(alive, data_queue, result_queue, cluster_func, cluster_args={},
                 cluster_field=default_cluster_field,
                 cluster_prefix=default_cluster_prefix,
                 delimiter=default_delimiter):
    """
    Pulls from data queue, performs calculations, and feeds results queue

    Arguments:
      alive : a multiprocessing.Value boolean controlling whether processing
              continues; when False function returns.
      data_queue : a multiprocessing.Queue holding data to process.
      result_queue : a multiprocessing.Queue to hold processed results.
      cluster_func : the function to use for clustering.
      cluster_args : a dictionary of optional arguments for the clustering function.
      cluster_field : string defining the output cluster field name.
      cluster_prefix : string defining a prefix for the cluster identifier.
      delimiter : a tuple of delimiters for (annotations, field/values, value lists).

    Returns:
      None
    """
    try:
        # Iterate over data queue until sentinel object reached
        while alive.value:
            # Get data from queue
            if data_queue.empty():
                continue
            else:
                data = data_queue.get()
            # Exit upon reaching sentinel
            if data is None:
                break

            # Define result object
            result = SeqResult(data.id, data.data)
            result.log['BARCODE'] = data.id
            result.log['SEQCOUNT'] = len(data)

            # Perform clustering
            cluster_dict = cluster_func(data.data, **cluster_args)

            # Process failed result
            if cluster_dict is None:
                # Update log
                result.log['CLUSTERS'] = 0
                for i, seq in enumerate(data.data, start=1):
                    result.log['CLUST0-%i' % i] = str(seq.seq)
                # Feed results queue and continue
                result_queue.put(result)
                continue

            # Get number of clusters
            result.log['CLUSTERS'] = len(cluster_dict)

            # Update sequence annotations with cluster assignments
            results = list()
            seq_dict = {s.id: s for s in data.data}
            for cluster, id_list in cluster_dict.items():
                for i, seq_id in enumerate(id_list, start=1):
                    # Add cluster annotation
                    seq = seq_dict[seq_id]
                    label = '%s%i' % (cluster_prefix, cluster)
                    header = parseAnnotation(seq.description, delimiter=delimiter)
                    header = mergeAnnotation(header, {cluster_field: label},
                                             delimiter=delimiter)
                    seq.id = seq.name = flattenAnnotation(header, delimiter=delimiter)
                    seq.description = ''
                    # Update log and results
                    result.log['CLUST%i-%i' % (cluster, i)] = str(seq.seq)
                    results.append(seq)

            # Check results
            result.results = results
            result.valid = (len(results) == len(seq_dict))

            # Feed results to result queue
            result_queue.put(result)
        else:
            sys.stderr.write('PID %s> Error in sibling process detected. Cleaning up.\n' \
                             % os.getpid())
            return None
    except:
        alive.value = False
        printError('Error processing sequence set with ID: %s.' % data.id, exit=False)
        raise

    return None
def processQueue(alive, data_queue, result_queue, cons_func, cons_args={}, min_count=default_consensus_min_count, primer_field=None, primer_freq=None, max_gap=None, max_error=None, max_diversity=None, copy_fields=None, copy_actions=None, delimiter=default_delimiter): """ Pulls from data queue, performs calculations, and feeds results queue Arguments: alive : a multiprocessing.Value boolean controlling whether processing continues; when False function returns. data_queue : a multiprocessing.Queue holding data to process. result_queue : a multiprocessing.Queue to hold processed results. cons_func : the function to use for consensus generation. cons_args : a dictionary of optional arguments for the consensus function. min_count : threshold number of sequences to define a consensus. primer_field : the annotation field containing primer names; if None do not annotate with primer names. primer_freq : the maximum primer frequency that must be meet to build a consensus; if None do not filter by primer frequency. max_gap : the maximum frequency of (., -) characters allowed before deleting a position; if None do not delete positions. max_error : the minimum error rate to retain a set; if None do not calculate error rate. max_diversity : a threshold defining the average pairwise error rate required to retain a read group; if None do not calculate diversity. copy_fields : a list of annotations to copy into consensus sequence annotations; if None no additional annotations will be copied. copy_actions : the list of actions to take for each copy_fields; one of ['set', 'majority', 'min', 'max', 'sum']. delimiter : a tuple of delimiters for (annotations, field/values, value lists). Returns: None """ try: # Iterator over data queue until sentinel object reached while alive.value: # Get data from queue if data_queue.empty(): continue else: data = data_queue.get() # Exit upon reaching sentinel if data is None: break # Define result dictionary for iteration result = SeqResult(data.id, data.data) result.log['BARCODE'] = data.id result.log['SEQCOUNT'] = len(data) # Define primer annotations and consensus primer if applicable if primer_field is None: primer_ann = None seq_list = data.data else: # Calculate consensus primer primer_ann = OrderedDict() prcons = annotationConsensus(data.data, primer_field, delimiter=delimiter) result.log['PRIMER'] = ','.join(prcons['set']) result.log['PRCOUNT'] = ','.join( [str(c) for c in prcons['count']]) result.log['PRCONS'] = prcons['cons'] result.log['PRFREQ'] = prcons['freq'] if primer_freq is None: # Retain full sequence set if not in primer consensus mode seq_list = data.data primer_ann = mergeAnnotation(primer_ann, {'PRIMER': prcons['set']}, delimiter=delimiter) primer_ann = mergeAnnotation(primer_ann, {'PRCOUNT': prcons['count']}, delimiter=delimiter) elif prcons['freq'] >= primer_freq: # Define consensus subset seq_list = subsetSeqSet(data.data, primer_field, prcons['cons'], delimiter=delimiter) primer_ann = mergeAnnotation(primer_ann, {'PRCONS': prcons['cons']}, delimiter=delimiter) primer_ann = mergeAnnotation(primer_ann, {'PRFREQ': prcons['freq']}, delimiter=delimiter) else: # If set fails primer consensus, feed result queue and continue result_queue.put(result) continue # Check count threshold cons_count = len(seq_list) result.log['CONSCOUNT'] = cons_count if cons_count < min_count: #print(cons_count, min_count) # If set fails count threshold, feed result queue and continue result_queue.put(result) continue # Update log with input sequences for i, s in 
enumerate(seq_list): result.log['INSEQ%i' % (i + 1)] = str(s.seq) # If primer and count filters pass, generate consensus sequence consensus = cons_func(seq_list, **cons_args) # Delete positions with gap frequency over max_gap and update log with consensus if max_gap is not None: gap_positions = set(findGapPositions(seq_list, max_gap)) result.log['CONSENSUS'] = ''.join([' ' if i in gap_positions else x \ for i, x in enumerate(consensus.seq)]) if 'phred_quality' in consensus.letter_annotations: result.log['QUALITY'] = ''.join([' ' if i in gap_positions else chr(q + 33) \ for i, q in enumerate(consensus.letter_annotations['phred_quality'])]) consensus = deleteSeqPositions(consensus, gap_positions) else: gap_positions = None result.log['CONSENSUS'] = str(consensus.seq) if 'phred_quality' in consensus.letter_annotations: result.log['QUALITY'] = ''.join([ chr(q + 33) for q in consensus.letter_annotations['phred_quality'] ]) # Calculate nucleotide diversity if max_diversity is not None: diversity = calculateDiversity(seq_list) result.log['DIVERSITY'] = diversity # If diversity exceeds threshold, feed result queue and continue if diversity > max_diversity: result_queue.put(result) continue # Calculate set error against consensus if max_error is not None: # Delete positions if required and calculate error if gap_positions is not None: seq_check = [ deleteSeqPositions(s, gap_positions) for s in seq_list ] else: seq_check = seq_list error = calculateSetError(seq_check, consensus) result.log['ERROR'] = error # If error exceeds threshold, feed result queue and continue if error > max_error: result_queue.put(result) continue # TODO: should move this into an improved annotationConsensus function with an action argument # Parse copy_field annotations and define consensus annotations if copy_fields is not None and copy_actions is not None: copy_ann = OrderedDict() for f, act in zip(copy_fields, copy_actions): # Numeric operations if act == 'min': vals = getAnnotationValues(seq_list, f, delimiter=delimiter) copy_ann[f] = '%.12g' % min( [float(x or 0) for x in vals]) elif act == 'max': vals = getAnnotationValues(seq_list, f, delimiter=delimiter) copy_ann[f] = '%.12g' % max( [float(x or 0) for x in vals]) elif act == 'sum': vals = getAnnotationValues(seq_list, f, delimiter=delimiter) copy_ann[f] = '%.12g' % sum( [float(x or 0) for x in vals]) elif act == 'set': vals = annotationConsensus(seq_list, f, delimiter=delimiter) copy_ann[f] = vals['set'] copy_ann['%s_COUNT' % f] = vals['count'] elif act == 'majority': vals = annotationConsensus(seq_list, f, delimiter=delimiter) copy_ann[f] = vals['cons'] copy_ann['%s_FREQ' % f] = vals['freq'] else: copy_ann = None # Define annotation for output sequence cons_ann = OrderedDict([('ID', data.id), ('CONSCOUNT', cons_count)]) # Merge addition consensus annotations into output sequence annotations if primer_ann is not None: cons_ann = mergeAnnotation(cons_ann, primer_ann, delimiter=delimiter) if copy_ann is not None: cons_ann = mergeAnnotation(cons_ann, copy_ann, delimiter=delimiter) # Add output sequence annotations to consensus sequence consensus.id = consensus.name = flattenAnnotation( cons_ann, delimiter=delimiter) consensus.description = '' result.results = consensus result.valid = True # Feed results to result queue result_queue.put(result) else: sys.stderr.write('PID %s> Error in sibling process detected. Cleaning up.\n' \ % os.getpid()) return None except: alive.value = False printError('Processing sequence set with ID: %s' % data.id, exit=False) raise return None
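# --- Hedged illustration (not part of the original source) ---
# The numeric copy_fields actions above ('min', 'max', 'sum') reduce the
# per-sequence string values of one annotation field to a single number, with
# empty values treated as 0, formatted exactly as in the worker code.
def _example_copy_action(values, action):
    numbers = [float(x or 0) for x in values]
    if action == 'min':
        return '%.12g' % min(numbers)
    elif action == 'max':
        return '%.12g' % max(numbers)
    elif action == 'sum':
        return '%.12g' % sum(numbers)
    else:
        raise ValueError('unknown action: %s' % action)
# Example: _example_copy_action(['2', '3.5', ''], 'sum') -> '5.5'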
def pairSeq(seq_file_1, seq_file_2, fields_1=None, fields_2=None, action=None, coord_type=default_coord, out_args=default_out_args): """ Syncronized paired end files and copies annotations between them Arguments: seq_file_1 : the file containing the grouped sequences and annotations. seq_file_2 : the file to assign annotations to from seq_file_1. fields_1 : list of annotations in seq_file_1 records to copy to seq_file_2 records; if None do not copy any annotations. fields_2 : list of annotations in seq_file_2 records to copy to seq_file_1 records; if None do not copy any annotations. action : the collapse action to take on all copied annotation if they already exist in the target header. coord_type : the sequence header format. out_args : common output argument dictionary from parseCommonArgs. Returns: list: a list of tuples holding successfully paired filenames for (seq_file_1, seq_file_2). """ # Define private functions def _key_func(x): return getCoordKey(x, coord_type=coord_type, delimiter=out_args['delimiter']) log = OrderedDict() log['START'] = 'PairSeq' log['FILE1'] = os.path.basename(seq_file_1) log['FILE2'] = os.path.basename(seq_file_2) log['FIELDS_1'] = ','.join(fields_1) if fields_1 is not None else None log['FIELDS_2'] = ','.join(fields_2) if fields_2 is not None else None log['COORD_TYPE'] = coord_type printLog(log) # Define output type if out_args['out_type'] is None: out_type_1 = getFileType(seq_file_1) out_type_2 = getFileType(seq_file_2) else: out_type_1 = out_type_2 = out_args['out_type'] # Define output name if out_args['out_name'] is None: out_name_1 = out_name_2 = None else: out_name_1 = '%s-1' % out_args['out_name'] out_name_2 = '%s-2' % out_args['out_name'] # Open and count files start_time = time() printMessage("Indexing files", start_time=start_time) # Index file 1 seq_count_1 = countSeqFile(seq_file_1) seq_dict_1 = readSeqFile(seq_file_1, index=True, key_func=_key_func) # Define file 2 iterator seq_count_2 = countSeqFile(seq_file_2) seq_iter_2 = readSeqFile(seq_file_2, index=False) printMessage("Done", start_time=start_time, end=True) # Open output file handles pass_handle_1 = getOutputHandle(seq_file_1, 'pair-pass', out_args['out_dir'], out_name=out_name_1, out_type=out_type_1) pass_handle_2 = getOutputHandle(seq_file_2, 'pair-pass', out_args['out_dir'], out_name=out_name_2, out_type=out_type_2) if out_args['failed']: fail_handle_1 = getOutputHandle(seq_file_1, 'pair-fail', out_dir=out_args['out_dir'], out_name=out_name_1, out_type=out_type_1) fail_handle_2 = getOutputHandle(seq_file_2, 'pair-fail', out_dir=out_args['out_dir'], out_name=out_name_2, out_type=out_type_2) pass_keys = list() # Iterate over pairs and write to output files start_time = time() rec_count = pair_count = 0 for seq_2 in seq_iter_2: # Print progress for previous iteration printProgress(rec_count, seq_count_2, 0.05, start_time=start_time) rec_count += 1 # Check for file 2 mate pair in file 1 coord_2 = getCoordKey(seq_2.id, coord_type=coord_type, delimiter=out_args['delimiter']) seq_1 = seq_dict_1.get(coord_2, None) if seq_1 is not None: # Record paired keys pair_count += 1 if fields_1 is not None or fields_2 is not None: ann_1 = parseAnnotation(seq_1.description, delimiter=out_args['delimiter']) ann_2 = parseAnnotation(seq_2.description, delimiter=out_args['delimiter']) # Prepend annotations from seq_1 to seq_2 if fields_1 is not None: copy_ann = OrderedDict([(k, v) for k, v in ann_1.items() \ if k in fields_1]) merge_ann = mergeAnnotation( ann_2, copy_ann, prepend=True, 
delimiter=out_args['delimiter']) # Collapse if necessary if action is not None: merge_ann = collapseAnnotation( merge_ann, action, fields=fields_1, delimiter=out_args['delimiter']) # Flatten seq_2.id = flattenAnnotation( merge_ann, delimiter=out_args['delimiter']) seq_2.description = '' # Append annotations from seq_2 to seq_1 if fields_2 is not None: copy_ann = OrderedDict([(k, v) for k, v in ann_2.items() \ if k in fields_2]) merge_ann = mergeAnnotation( ann_1, copy_ann, prepend=False, delimiter=out_args['delimiter']) # Collapse if necessary if action is not None: merge_ann = collapseAnnotation( merge_ann, action, fields=fields_2, delimiter=out_args['delimiter']) # Flatten seq_1.id = flattenAnnotation( merge_ann, delimiter=out_args['delimiter']) seq_1.description = '' # Write paired records SeqIO.write(seq_1, pass_handle_1, out_type_1) SeqIO.write(seq_2, pass_handle_2, out_type_2) # Write unpaired file 2 records and updated paired key list for finding unpaired file 1 records if out_args['failed']: if seq_1 is not None: pass_keys.append(coord_2) else: SeqIO.write(seq_2, fail_handle_2, out_type_2) # Print final progress printProgress(rec_count, seq_count_2, 0.05, start_time=start_time) # Find and write unpaired file 1 records if out_args['failed']: start_time = time() printMessage("Finding unpaired", start_time=start_time) # Find file 1 unpaired keys pass_keys = set(pass_keys) unpaired = set(seq_dict_1).difference(pass_keys) # Write unpaired file 1 records for k in unpaired: SeqIO.write(seq_dict_1[k], fail_handle_1, out_type_1) printMessage("Done", start_time=start_time, end=True) # Print log log = OrderedDict() log['OUTPUT1'] = os.path.basename(pass_handle_1.name) log['OUTPUT2'] = os.path.basename(pass_handle_2.name) log['SEQUENCES1'] = seq_count_1 log['SEQUENCES2'] = seq_count_2 log['PASS'] = pair_count log['END'] = 'PairSeq' printLog(log) # Close file handles pass_handle_1.close() pass_handle_2.close() return [(pass_handle_1.name, pass_handle_2.name)]