def filterSeq(seq_file, filter_func, filter_args={}, out_args=default_out_args,
              nproc=None, queue_size=None):
    """
    Filters sequences using the specified filter function

    Arguments:
      seq_file = the sequence file to filter
      filter_func = the function to use for filtering sequences
      filter_args = a dictionary of arguments to pass to filter_func
      out_args = common output argument dictionary from parseCommonArgs
      nproc = the number of processQueue processes; if None defaults to the number of CPUs
      queue_size = maximum size of the argument queue; if None defaults to 2*nproc

    Returns:
      a list of successful output file names
    """
    # Define output file label dictionary
    cmd_dict = {filterLength: 'length', filterMissing: 'missing', filterRepeats: 'repeats',
                filterQuality: 'quality', maskQuality: 'maskqual', trimQuality: 'trimqual'}

    # Print parameter info
    log = OrderedDict()
    log['START'] = 'FilterSeq'
    log['COMMAND'] = cmd_dict.get(filter_func, filter_func.__name__)
    log['FILE'] = os.path.basename(seq_file)
    for k in sorted(filter_args):
        log[k.upper()] = filter_args[k]
    log['NPROC'] = nproc
    printLog(log)

    # Check input type
    in_type = getFileType(seq_file)
    if in_type != 'fastq' and filter_func in (filterQuality, maskQuality, trimQuality):
        sys.exit('ERROR: Input file must be FASTQ for %s mode' % cmd_dict[filter_func])

    # Define feeder function and arguments
    feed_func = feedSeqQueue
    feed_args = {'seq_file': seq_file}
    # Define worker function and arguments
    work_func = processSeqQueue
    work_args = {'process_func': filter_func, 'process_args': filter_args}
    # Define collector function and arguments
    collect_func = collectSeqQueue
    collect_args = {'seq_file': seq_file, 'task_label': cmd_dict[filter_func], 'out_args': out_args}

    # Call process manager
    result = manageProcesses(feed_func, work_func, collect_func,
                             feed_args, work_args, collect_args,
                             nproc, queue_size)

    # Print log
    result['log']['END'] = 'FilterSeq'
    printLog(result['log'])

    return result['out_files']

def clusterSets(seq_file, barcode_field=default_barcode_field, cluster_field=default_cluster_field,
                ident=default_ident, seq_start=None, seq_end=None,
                usearch_exec=default_usearch_exec, out_args=default_out_args,
                nproc=None, queue_size=None):
    """
    Performs clustering on sets of sequences

    Arguments:
      seq_file = the sample sequence file name
      barcode_field = the annotation containing set IDs
      cluster_field = the name of the output cluster field
      ident = the identity threshold for clustering sequences
      seq_start = the start position to trim sequences at before clustering
      seq_end = the end position to trim sequences at before clustering
      usearch_exec = the path to the usearch executable
      out_args = common output argument dictionary from parseCommonArgs
      nproc = the number of processQueue processes; if None defaults to the number of CPUs
      queue_size = maximum size of the argument queue; if None defaults to 2*nproc

    Returns:
      the clustered output file name
    """
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'ClusterSets'
    log['FILE'] = os.path.basename(seq_file)
    log['BARCODE_FIELD'] = barcode_field
    log['CLUSTER_FIELD'] = cluster_field
    log['IDENTITY'] = ident
    log['SEQUENCE_START'] = seq_start
    log['SEQUENCE_END'] = seq_end
    log['NPROC'] = nproc
    printLog(log)

    # Define cluster function parameters
    cluster_args = {'usearch_exec': usearch_exec, 'ident': ident,
                    'seq_start': seq_start, 'seq_end': seq_end}

    # Define feeder function and arguments
    index_args = {'field': barcode_field, 'delimiter': out_args['delimiter']}
    feed_func = feedSeqQueue
    feed_args = {'seq_file': seq_file, 'index_func': indexSeqSets, 'index_args': index_args}
    # Define worker function and arguments
    work_func = processCSQueue
    work_args = {'cluster_field': cluster_field, 'cluster_args': cluster_args,
                 'delimiter': out_args['delimiter']}
    # Define collector function and arguments
    collect_func = collectSeqQueue
    collect_args = {'seq_file': seq_file, 'task_label': 'cluster',
                    'out_args': out_args, 'index_field': barcode_field}

    # Call process manager
    result = manageProcesses(feed_func, work_func, collect_func,
                             feed_args, work_args, collect_args,
                             nproc, queue_size)

    # Print log
    log = OrderedDict()
    log['OUTPUT'] = result['log'].pop('OUTPUT')
    for k, v in result['log'].items():
        log[k] = v
    log['END'] = 'ClusterSets'
    printLog(log)

    return result['out_files']

def filterSeq(seq_file, filter_func, filter_args={}, out_file=None, out_args=default_out_args,
              nproc=None, queue_size=None):
    """
    Filters sequences using the specified filter function

    Arguments:
      seq_file : the sequence file to filter.
      filter_func : the function to use for filtering sequences.
      filter_args : a dictionary of arguments to pass to filter_func.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.
      nproc : the number of processQueue processes; if None defaults to the number of CPUs.
      queue_size : maximum size of the argument queue; if None defaults to 2*nproc.

    Returns:
      list: a list of successful output file names
    """
    # Define output file label dictionary
    cmd_dict = {filterLength: 'length', filterMissing: 'missing', filterRepeats: 'repeats',
                filterQuality: 'quality', maskQuality: 'maskqual', trimQuality: 'trimqual'}

    # Print parameter info
    log = OrderedDict()
    log['START'] = 'FilterSeq'
    log['COMMAND'] = cmd_dict.get(filter_func, filter_func.__name__)
    log['FILE'] = os.path.basename(seq_file)
    for k in sorted(filter_args):
        log[k.upper()] = filter_args[k]
    log['NPROC'] = nproc
    printLog(log)

    # Check input type
    in_type = getFileType(seq_file)
    if in_type != 'fastq' and filter_func in (filterQuality, maskQuality, trimQuality):
        printError('Input file must be FASTQ for %s mode.' % cmd_dict[filter_func])

    # Define feeder function and arguments
    feed_func = feedSeqQueue
    feed_args = {'seq_file': seq_file}
    # Define worker function and arguments
    work_func = processSeqQueue
    work_args = {'process_func': filter_func, 'process_args': filter_args}
    # Define collector function and arguments
    collect_func = collectSeqQueue
    collect_args = {'seq_file': seq_file, 'label': cmd_dict[filter_func],
                    'out_file': out_file, 'out_args': out_args}

    # Call process manager
    result = manageProcesses(feed_func, work_func, collect_func,
                             feed_args, work_args, collect_args,
                             nproc, queue_size)

    # Print log
    result['log']['END'] = 'FilterSeq'
    printLog(result['log'])

    return result['out_files']

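# Usage sketch (hypothetical file name and filter settings): the example below would run
# quality filtering over a FASTQ file on four processes. The 'min_qual' key inside
# filter_args is an assumed argument name for filterQuality, not taken from this module.
#
#   out_files = filterSeq('reads.fastq', filter_func=filterQuality,
#                         filter_args={'min_qual': 20}, nproc=4)
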
def alignRecords(db_file, seq_fields, group_func, align_func, group_args={}, align_args={},
                 out_args=default_out_args, nproc=None, queue_size=None):
    """
    Performs a multiple alignment on sets of sequences

    Arguments:
      db_file : filename of the input database.
      seq_fields : the sequence fields to multiple align.
      group_func : function to use to group records.
      align_func : function to use to multiple align sequence groups.
      group_args : dictionary of arguments to pass to group_func.
      align_args : dictionary of arguments to pass to align_func.
      out_args : common output argument dictionary from parseCommonArgs.
      nproc : the number of processQueue processes. if None defaults to the number of CPUs.
      queue_size : maximum size of the argument queue. if None defaults to 2*nproc.

    Returns:
      tuple : a tuple of (align-pass, align-fail) filenames.
    """
    # Define subcommand label dictionary
    cmd_dict = {alignAcross: 'across', alignWithin: 'within', alignBlocks: 'block'}

    # Print parameter info
    log = OrderedDict()
    log['START'] = 'AlignRecords'
    log['COMMAND'] = cmd_dict.get(align_func, align_func.__name__)
    log['FILE'] = os.path.basename(db_file)
    log['SEQ_FIELDS'] = ','.join(seq_fields)
    if 'group_fields' in group_args: log['GROUP_FIELDS'] = ','.join(group_args['group_fields'])
    if 'mode' in group_args: log['MODE'] = group_args['mode']
    if 'action' in group_args: log['ACTION'] = group_args['action']
    log['NPROC'] = nproc
    printLog(log)

    # Define feeder function and arguments
    feed_func = feedDbQueue
    feed_args = {'db_file': db_file, 'group_func': group_func, 'group_args': group_args}
    # Define worker function and arguments
    align_args['seq_fields'] = seq_fields
    work_func = processDbQueue
    work_args = {'process_func': align_func, 'process_args': align_args}
    # Define collector function and arguments
    collect_func = collectDbQueue
    collect_args = {'db_file': db_file, 'task_label': 'align', 'out_args': out_args,
                    'add_fields': ['%s_ALIGN' % f for f in seq_fields]}

    # Call process manager
    result = manageProcesses(feed_func, work_func, collect_func,
                             feed_args, work_args, collect_args,
                             nproc, queue_size)

    # Print log
    result['log']['END'] = 'AlignRecords'
    printLog(result['log'])

    return result['out_files']

def alignSets(seq_file, align_func, align_args, barcode_field=default_barcode_field,
              calc_div=False, out_file=None, out_args=default_out_args,
              nproc=None, queue_size=None):
    """
    Performs a multiple alignment on sets of sequences

    Arguments:
      seq_file : the sample sequence file name.
      align_func : the function to use to align sequence sets.
      align_args : a dictionary of arguments to pass to align_func.
      barcode_field : the annotation containing set IDs.
      calc_div : if True calculate average pairwise error for each sequence set.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.
      nproc : the number of processQueue processes; if None defaults to the number of CPUs.
      queue_size : maximum size of the argument queue; if None defaults to 2*nproc.

    Returns:
      tuple: a tuple of (passing, failing) filenames.
    """
    # Define subcommand label dictionary
    cmd_dict = {runMuscle: 'muscle', offsetSeqSet: 'offset'}

    # Print parameter info
    log = OrderedDict()
    log['START'] = 'AlignSets'
    log['COMMAND'] = cmd_dict[align_func]
    log['FILE'] = os.path.basename(seq_file)
    if 'mode' in align_args: log['MODE'] = align_args['mode']
    log['BARCODE_FIELD'] = barcode_field
    if 'field' in align_args: log['OFFSET_FIELD'] = align_args['field']
    log['CALC_DIV'] = calc_div
    log['NPROC'] = nproc
    printLog(log)

    # Define feeder function and arguments
    index_args = {'field': barcode_field, 'delimiter': out_args['delimiter']}
    feed_func = feedSeqQueue
    feed_args = {'seq_file': seq_file, 'index_func': indexSeqSets, 'index_args': index_args}
    # Define worker function and arguments
    work_func = processQueue
    work_args = {'align_func': align_func, 'align_args': align_args,
                 'calc_div': calc_div, 'delimiter': out_args['delimiter']}
    # Define collector function and arguments
    collect_func = collectSeqQueue
    collect_args = {'seq_file': seq_file, 'label': 'align', 'out_file': out_file,
                    'out_args': out_args, 'index_field': barcode_field}

    # Call process manager
    result = manageProcesses(feed_func, work_func, collect_func,
                             feed_args, work_args, collect_args,
                             nproc, queue_size)

    # Print log
    result['log']['END'] = 'AlignSets'
    printLog(result['log'])

    return result['out_files']

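# Usage sketch (hypothetical input file): a MUSCLE-based alignment of each BARCODE set with
# per-set diversity calculation. The 'muscle_exec' key inside align_args is an assumed
# argument name for the runMuscle wrapper defined elsewhere in this package.
#
#   out_files = alignSets('reads.fastq', align_func=runMuscle,
#                         align_args={'muscle_exec': 'muscle'},
#                         barcode_field='BARCODE', calc_div=True)
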
def buildConsensus(seq_file, barcode_field=default_barcode_field,
                   min_count=default_consensus_min_count, min_freq=default_consensus_min_freq,
                   min_qual=default_consensus_min_qual, primer_field=None, primer_freq=None,
                   max_gap=None, max_error=None, max_diversity=None,
                   copy_fields=None, copy_actions=None, dependent=False,
                   out_file=None, out_args=default_out_args, nproc=None, queue_size=None):
    """
    Generates consensus sequences

    Arguments:
      seq_file : the sample sequence file name.
      barcode_field : the annotation field containing set IDs.
      min_count : threshold number of sequences to define a consensus.
      min_freq : the frequency cutoff to assign a base.
      min_qual : the quality cutoff to assign a base.
      primer_field : the annotation field containing primer tags;
                     if None do not annotate with primer tags.
      primer_freq : the minimum primer tag frequency that must be met to build a consensus;
                    if None do not filter by primer frequency.
      max_gap : the maximum frequency of (., -) characters allowed before deleting a position;
                if None do not delete positions.
      max_error : a threshold defining the maximum allowed error rate to retain a read group;
                  if None do not calculate error rate.
      max_diversity : a threshold defining the average pairwise error rate required to retain
                      a read group; if None do not calculate diversity.
      dependent : if False treat barcode group sequences as independent data.
      copy_fields : a list of annotations to copy into consensus sequence annotations;
                    if None no additional annotations will be copied.
      copy_actions : the list of actions to take for each copy_fields;
                     one of ['set', 'majority', 'min', 'max', 'sum'].
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.
      nproc : the number of processQueue processes; if None defaults to the number of CPUs.
      queue_size : maximum size of the argument queue; if None defaults to 2*nproc.

    Returns:
      list : a list of successful output file names.
    """
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'BuildConsensus'
    log['FILE'] = os.path.basename(seq_file)
    log['BARCODE_FIELD'] = barcode_field
    log['MIN_COUNT'] = min_count
    log['MIN_FREQUENCY'] = min_freq
    log['MIN_QUALITY'] = min_qual
    log['MAX_GAP'] = max_gap
    log['PRIMER_FIELD'] = primer_field
    log['PRIMER_FREQUENCY'] = primer_freq
    log['MAX_ERROR'] = max_error
    log['MAX_DIVERSITY'] = max_diversity
    log['DEPENDENT'] = dependent
    log['COPY_FIELDS'] = ','.join(copy_fields) if copy_fields is not None else None
    log['COPY_ACTIONS'] = ','.join(copy_actions) if copy_actions is not None else None
    log['NPROC'] = nproc
    printLog(log)

    # Set consensus building function
    in_type = getFileType(seq_file)
    if in_type == 'fastq':
        cons_func = qualityConsensus
        cons_args = {'min_qual': min_qual, 'min_freq': min_freq, 'dependent': dependent}
    elif in_type == 'fasta':
        cons_func = frequencyConsensus
        cons_args = {'min_freq': min_freq}
    else:
        printError('Input file must be FASTA or FASTQ.')

    # Define feeder function and arguments
    index_args = {'field': barcode_field, 'delimiter': out_args['delimiter']}
    feed_func = feedSeqQueue
    feed_args = {'seq_file': seq_file, 'index_func': indexSeqSets, 'index_args': index_args}
    # Define worker function and arguments
    work_func = processQueue
    work_args = {'cons_func': cons_func, 'cons_args': cons_args,
                 'min_count': min_count, 'primer_field': primer_field, 'primer_freq': primer_freq,
                 'max_gap': max_gap, 'max_error': max_error, 'max_diversity': max_diversity,
                 'copy_fields': copy_fields, 'copy_actions': copy_actions,
                 'delimiter': out_args['delimiter']}
    # Define collector function and arguments
    collect_func = collectSeqQueue
    collect_args = {'seq_file': seq_file, 'label': 'consensus', 'out_file': out_file,
                    'out_args': out_args, 'index_field': barcode_field}

    # Call process manager
    result = manageProcesses(feed_func, work_func, collect_func,
                             feed_args, work_args, collect_args,
                             nproc, queue_size)

    # Print log
    result['log']['END'] = 'BuildConsensus'
    printLog(result['log'])

    return result['out_files']

def defineClones(db_file, feed_func, work_func, collect_func, clone_func, cluster_func=None,
                 group_func=None, group_args={}, clone_args={}, cluster_args={},
                 max_missing=default_max_missing, out_args=default_out_args,
                 nproc=None, queue_size=None):
    """
    Define clonally related sequences

    Arguments:
      db_file = filename of input database
      feed_func = the function that feeds the queue
      work_func = the worker function that will run on each CPU
      collect_func = the function that collects results from the workers
      group_func = the function to use for assigning preclones
      clone_func = the function to use for determining clones within preclonal groups
      group_args = a dictionary of arguments to pass to group_func
      clone_args = a dictionary of arguments to pass to clone_func
      max_missing = maximum number of non-ACGT characters to allow in the junction sequence
      out_args = common output argument dictionary from parseCommonArgs
      nproc = the number of processQueue processes; if None defaults to the number of CPUs
      queue_size = maximum size of the argument queue; if None defaults to 2*nproc

    Returns:
      a list of successful output file names
    """
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'DefineClones'
    log['DB_FILE'] = os.path.basename(db_file)
    log['MAX_MISSING'] = max_missing
    if group_func is not None:
        log['GROUP_FUNC'] = group_func.__name__
        log['GROUP_ARGS'] = group_args
    log['CLONE_FUNC'] = clone_func.__name__
    # TODO: this is yucky, but can be fixed by using a model class
    clone_log = clone_args.copy()
    if 'dist_mat' in clone_log:
        del clone_log['dist_mat']
    log['CLONE_ARGS'] = clone_log
    if cluster_func is not None:
        log['CLUSTER_FUNC'] = cluster_func.__name__
        log['CLUSTER_ARGS'] = cluster_args
    log['NPROC'] = nproc
    printLog(log)

    # Define feeder function and arguments
    feed_args = {'db_file': db_file, 'group_func': group_func, 'group_args': group_args}
    # Define worker function and arguments
    work_args = {'max_missing': max_missing, 'clone_func': clone_func, 'clone_args': clone_args}
    # Define collector function and arguments
    collect_args = {'db_file': db_file, 'out_args': out_args,
                    'cluster_func': cluster_func, 'cluster_args': cluster_args}

    # Call process manager
    result = manageProcesses(feed_func=feed_func, work_func=work_func, collect_func=collect_func,
                             feed_args=feed_args, work_args=work_args, collect_args=collect_args,
                             nproc=nproc, queue_size=queue_size)

    # Print log
    result['log']['END'] = 'DefineClones'
    printLog(result['log'])

    return result['out_files']

def alignRecords(db_file, seq_fields, group_func, align_func, group_args={}, align_args={},
                 format='changeo', out_file=None, out_args=default_out_args,
                 nproc=None, queue_size=None):
    """
    Performs a multiple alignment on sets of sequences

    Arguments:
      db_file : filename of the input database.
      seq_fields : the sequence fields to multiple align.
      group_func : function to use to group records.
      align_func : function to use to multiple align sequence groups.
      group_args : dictionary of arguments to pass to group_func.
      align_args : dictionary of arguments to pass to align_func.
      format : output format. One of 'changeo' or 'airr'.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.
      nproc : the number of processQueue processes. if None defaults to the number of CPUs.
      queue_size : maximum size of the argument queue. if None defaults to 2*nproc.

    Returns:
      dict : names of the 'pass' and 'fail' output files.
    """
    # Define subcommand label dictionary
    cmd_dict = {alignAcross: 'across', alignWithin: 'within', alignBlocks: 'block'}

    # Print parameter info
    log = OrderedDict()
    log['START'] = 'AlignRecords'
    log['COMMAND'] = cmd_dict.get(align_func, align_func.__name__)
    log['FILE'] = os.path.basename(db_file)
    log['SEQ_FIELDS'] = ','.join(seq_fields)
    if 'group_fields' in group_args: log['GROUP_FIELDS'] = ','.join(group_args['group_fields'])
    if 'mode' in group_args: log['MODE'] = group_args['mode']
    if 'action' in group_args: log['ACTION'] = group_args['action']
    log['NPROC'] = nproc
    printLog(log)

    # Define format operators
    try:
        reader, writer, schema = getFormatOperators(format)
    except ValueError:
        printError('Invalid format %s.' % format)

    # Define feeder function and arguments
    if 'group_fields' in group_args and group_args['group_fields'] is not None:
        group_args['group_fields'] = [schema.toReceptor(f) for f in group_args['group_fields']]
    feed_func = feedDbQueue
    feed_args = {'db_file': db_file, 'reader': reader,
                 'group_func': group_func, 'group_args': group_args}
    # Define worker function and arguments
    field_map = OrderedDict([(schema.toReceptor(f), '%s_align' % f) for f in seq_fields])
    align_args['field_map'] = field_map
    work_func = processDbQueue
    work_args = {'process_func': align_func, 'process_args': align_args}
    # Define collector function and arguments
    out_fields = getDbFields(db_file, add=list(field_map.values()), reader=reader)
    out_args['out_type'] = schema.out_type
    collect_func = collectDbQueue
    collect_args = {'db_file': db_file, 'label': 'align', 'fields': out_fields,
                    'writer': writer, 'out_file': out_file, 'out_args': out_args}

    # Call process manager
    result = manageProcesses(feed_func, work_func, collect_func,
                             feed_args, work_args, collect_args,
                             nproc, queue_size)

    # Print log
    result['log']['END'] = 'AlignRecords'
    printLog(result['log'])

    output = {k: v for k, v in result.items() if k in ('pass', 'fail')}

    return output

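# Usage sketch (hypothetical database and grouping callable): align the junction column
# within groups defined by clone_id. 'groupRecords' is a placeholder name for whatever
# grouping function the caller supplies; alignWithin comes from the subcommand dictionary
# above.
#
#   output = alignRecords('db.tsv', seq_fields=['junction'], group_func=groupRecords,
#                         align_func=alignWithin, group_args={'group_fields': ['clone_id']},
#                         format='airr')
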
def assemblePairs(head_file, tail_file, assemble_func, assemble_args={},
                  coord_type=default_coord_type, rc=None,
                  head_fields=None, tail_fields=None,
                  out_args=default_out_args, nproc=None, queue_size=None):
    """
    Assembles paired-end reads into single sequences

    Arguments:
      head_file = the head sequence file name
      tail_file = the tail sequence file name
      assemble_func = the function to use to assemble paired ends
      assemble_args = a dictionary of arguments to pass to the assembly function
      coord_type = the sequence header format
      rc = defines which sequences ('head', 'tail', 'both') to reverse complement
           before assembly; if None do not reverse complement sequences
      head_fields = list of annotations in head_file records to copy to assembled record;
                    if None do not copy an annotation
      tail_fields = list of annotations in tail_file records to copy to assembled record;
                    if None do not copy an annotation
      out_args = common output argument dictionary from parseCommonArgs
      nproc = the number of processQueue processes; if None defaults to the number of CPUs
      queue_size = maximum size of the argument queue; if None defaults to 2*nproc

    Returns:
      a list of successful output file names
    """
    # Define subcommand label dictionary
    cmd_dict = {alignAssembly: 'align', joinSeqPair: 'join', referenceAssembly: 'reference'}

    # Print parameter info
    log = OrderedDict()
    log['START'] = 'AssemblePairs'
    log['COMMAND'] = cmd_dict.get(assemble_func, assemble_func.__name__)
    log['FILE1'] = os.path.basename(head_file)
    log['FILE2'] = os.path.basename(tail_file)
    log['COORD_TYPE'] = coord_type
    if 'ref_file' in assemble_args: log['REFFILE'] = assemble_args['ref_file']
    if 'alpha' in assemble_args: log['ALPHA'] = assemble_args['alpha']
    if 'max_error' in assemble_args: log['MAX_ERROR'] = assemble_args['max_error']
    if 'min_len' in assemble_args: log['MIN_LEN'] = assemble_args['min_len']
    if 'max_len' in assemble_args: log['MAX_LEN'] = assemble_args['max_len']
    if 'scan_reverse' in assemble_args: log['SCAN_REVERSE'] = assemble_args['scan_reverse']
    if 'gap' in assemble_args: log['GAP'] = assemble_args['gap']
    if 'min_ident' in assemble_args: log['MIN_IDENT'] = assemble_args['min_ident']
    if 'evalue' in assemble_args: log['EVALUE'] = assemble_args['evalue']
    if 'max_hits' in assemble_args: log['MAX_HITS'] = assemble_args['max_hits']
    if 'fill' in assemble_args: log['FILL'] = assemble_args['fill']
    log['NPROC'] = nproc
    printLog(log)

    # Count input files
    head_count = countSeqFile(head_file)
    tail_count = countSeqFile(tail_file)
    if head_count != tail_count:
        sys.exit('Error: FILE1 (n=%i) and FILE2 (n=%i) must have the same number of records' \
                 % (head_count, tail_count))

    # Define feeder function and arguments
    feed_func = feedPairQueue
    # feed_args = {'seq_file_1': head_file,
    #              'seq_file_2': tail_file,
    #              'index_dict': index_dict}
    feed_args = {'seq_file_1': head_file, 'seq_file_2': tail_file,
                 'coord_type': coord_type, 'delimiter': out_args['delimiter']}
    # Define worker function and arguments
    process_args = {'assemble_func': assemble_func, 'assemble_args': assemble_args, 'rc': rc,
                    'fields_1': head_fields, 'fields_2': tail_fields,
                    'delimiter': out_args['delimiter']}
    work_func = processSeqQueue
    work_args = {'process_func': processAssembly, 'process_args': process_args}
    # Define collector function and arguments
    collect_func = collectPairQueue
    # collect_args = {'result_count': pair_count,
    #                 'seq_file_1': head_file,
    #                 'seq_file_2': tail_file,
    #                 'out_args': out_args}
    collect_args = {'result_count': head_count, 'seq_file_1': head_file,
                    'seq_file_2': tail_file, 'out_args': out_args}

    # Call process manager
    result = manageProcesses(feed_func, work_func, collect_func,
                             feed_args, work_args, collect_args,
                             nproc, queue_size)

    # Print log
    log = OrderedDict()
    log['OUTPUT'] = result['log'].pop('OUTPUT')
    for k, v in result['log'].items():
        log[k] = v
    log['END'] = 'AssemblePairs'
    printLog(log)

    return result['out_files']

def unifyHeaders(seq_file, collapse_func, set_field=default_barcode_field,
                 unify_field=default_unify_field, out_file=None, out_args=default_out_args,
                 nproc=None, queue_size=None):
    """
    Merges and filters annotation fields within groups

    Arguments:
      seq_file : the sample sequence file name.
      collapse_func : the function to use for collapsing annotations.
      set_field : the annotation containing set IDs.
      unify_field : the field for collection criteria.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.
      nproc : the number of processQueue processes; if None defaults to the number of CPUs.
      queue_size : maximum size of the argument queue; if None defaults to 2*nproc.

    Returns:
      str: output file name.
    """
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'UnifyHeaders'
    log['FILE'] = os.path.basename(seq_file)
    log['SET_FIELD'] = set_field
    log['UNIFY_FIELD'] = unify_field
    log['NPROC'] = nproc
    printLog(log)

    # Define feeder function and arguments
    index_args = {'field': set_field, 'delimiter': out_args['delimiter']}
    feed_func = feedSeqQueue
    feed_args = {'seq_file': seq_file, 'index_func': indexSeqSets, 'index_args': index_args}
    # Define worker function and arguments
    collapse_args = {'field': unify_field, 'delimiter': out_args['delimiter']}
    work_func = processSeqQueue
    work_args = {'process_func': collapse_func, 'process_args': collapse_args}
    # Define collector function and arguments
    collect_func = collectSeqQueue
    collect_args = {'seq_file': seq_file, 'label': 'unify', 'out_file': out_file,
                    'out_args': out_args, 'index_field': set_field}

    # Call process manager
    result = manageProcesses(feed_func, work_func, collect_func,
                             feed_args, work_args, collect_args,
                             nproc, queue_size)

    # Print log
    log = OrderedDict()
    log['OUTPUT'] = result['log'].pop('OUTPUT')
    for k, v in result['log'].items():
        log[k] = v
    log['END'] = 'UnifyHeaders'
    printLog(log)

    return result['out_files']

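# Usage sketch (hypothetical inputs): make the SAMPLE annotation consistent within each
# BARCODE set. 'consensusUnify' is a placeholder for whichever collapsing routine the
# caller selects as collapse_func.
#
#   out_files = unifyHeaders('reads.fastq', collapse_func=consensusUnify,
#                            set_field='BARCODE', unify_field='SAMPLE')
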
def estimateError(seq_file, cons_func=frequencyConsensus, cons_args={},
                  set_field=default_barcode_field, min_count=default_min_count,
                  max_diversity=None, out_args=default_out_args,
                  nproc=None, queue_size=None):
    """
    Calculates error rates of sequence sets

    Arguments:
      seq_file = the sample sequence file name
      cons_func = the function to use for consensus generation
      cons_args = a dictionary of arguments for the consensus function
      set_field = the annotation field containing set IDs
      min_count = threshold number of sequences to consider a set
      max_diversity = a threshold defining the average pairwise error rate required to retain
                      a read group; if None do not calculate diversity
      out_args = common output argument dictionary from parseCommonArgs
      nproc = the number of processQueue processes; if None defaults to the number of CPUs
      queue_size = maximum size of the argument queue; if None defaults to 2*nproc

    Returns:
      a list of tuples of (position error, quality error, nucleotide pairwise error)
      output file names
    """
    # Define subcommand label dictionary
    cmd_dict = {frequencyConsensus: 'freq', qualityConsensus: 'qual'}

    # Print parameter info
    log = OrderedDict()
    log['START'] = 'EstimateError'
    log['FILE'] = os.path.basename(seq_file)
    log['MODE'] = cmd_dict.get(cons_func, cons_func.__name__)
    log['SET_FIELD'] = set_field
    log['MIN_COUNT'] = min_count
    log['MAX_DIVERSITY'] = max_diversity
    log['NPROC'] = nproc
    printLog(log)

    # Check input file type
    in_type = getFileType(seq_file)
    if in_type != 'fastq':
        sys.exit('ERROR: Input file must be FASTQ')

    # Define feeder function and arguments
    index_args = {'field': set_field, 'delimiter': out_args['delimiter']}
    feed_func = feedSeqQueue
    feed_args = {'seq_file': seq_file, 'index_func': indexSeqSets, 'index_args': index_args}
    # Define worker function and arguments
    work_func = processEEQueue
    work_args = {'cons_func': cons_func, 'cons_args': cons_args,
                 'min_count': min_count, 'max_diversity': max_diversity}
    # Define collector function and arguments
    collect_func = collectEEQueue
    collect_args = {'seq_file': seq_file, 'out_args': out_args, 'set_field': set_field}

    # Call process manager
    result = manageProcesses(feed_func, work_func, collect_func,
                             feed_args, work_args, collect_args,
                             nproc, queue_size)

    # Print log
    result['log']['END'] = 'EstimateError'
    printLog(result['log'])

    return result['out_files']

def assemblePairs(head_file, tail_file, assemble_func, assemble_args={},
                  coord_type=default_coord, rc='tail',
                  head_fields=None, tail_fields=None,
                  out_file=None, out_args=default_out_args, nproc=None, queue_size=None):
    """
    Assembles paired-end reads into single sequences

    Arguments:
      head_file : the head sequence file name.
      tail_file : the tail sequence file name.
      assemble_func : the function to use to assemble paired ends.
      assemble_args : a dictionary of arguments to pass to the assembly function.
      coord_type : the sequence header format.
      rc : defines which sequences ('head', 'tail', 'both', 'none') to reverse complement
           before assembly; if 'none' do not reverse complement sequences.
      head_fields : list of annotations in head_file records to copy to assembled record;
                    if None do not copy an annotation.
      tail_fields : list of annotations in tail_file records to copy to assembled record;
                    if None do not copy an annotation.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.
      nproc : the number of processQueue processes; if None defaults to the number of CPUs.
      queue_size : maximum size of the argument queue; if None defaults to 2*nproc.

    Returns:
      list: a list of successful output file names.
    """
    # Define subcommand label dictionary
    cmd_dict = {alignAssembly: 'align', joinAssembly: 'join',
                referenceAssembly: 'reference', sequentialAssembly: 'sequential'}
    cmd_name = cmd_dict.get(assemble_func, assemble_func.__name__)

    # Print parameter info
    log = OrderedDict()
    log['START'] = 'AssemblePairs'
    log['COMMAND'] = cmd_name
    log['FILE1'] = os.path.basename(head_file)
    log['FILE2'] = os.path.basename(tail_file)
    log['COORD_TYPE'] = coord_type
    if 'ref_file' in assemble_args: log['REFFILE'] = assemble_args['ref_file']
    if 'alpha' in assemble_args: log['ALPHA'] = assemble_args['alpha']
    if 'max_error' in assemble_args: log['MAX_ERROR'] = assemble_args['max_error']
    if 'min_len' in assemble_args: log['MIN_LEN'] = assemble_args['min_len']
    if 'max_len' in assemble_args: log['MAX_LEN'] = assemble_args['max_len']
    if 'scan_reverse' in assemble_args: log['SCAN_REVERSE'] = assemble_args['scan_reverse']
    if 'gap' in assemble_args: log['GAP'] = assemble_args['gap']
    if 'min_ident' in assemble_args: log['MIN_IDENT'] = assemble_args['min_ident']
    if 'evalue' in assemble_args: log['EVALUE'] = assemble_args['evalue']
    if 'max_hits' in assemble_args: log['MAX_HITS'] = assemble_args['max_hits']
    if 'fill' in assemble_args: log['FILL'] = assemble_args['fill']
    if 'aligner' in assemble_args: log['ALIGNER'] = assemble_args['aligner']
    log['NPROC'] = nproc
    printLog(log)

    # Count input files
    head_count = countSeqFile(head_file)
    tail_count = countSeqFile(tail_file)
    if head_count != tail_count:
        printError('FILE1 (n=%i) and FILE2 (n=%i) must have the same number of records.' \
                   % (head_count, tail_count))

    # Setup for reference alignment
    if cmd_name in ('reference', 'sequential'):
        ref_file = assemble_args.pop('ref_file')
        db_exec = assemble_args.pop('db_exec')
        # Build reference sequence dictionary
        assemble_args['ref_dict'] = readReferenceFile(ref_file)
        # Build reference database files
        try:
            db_func = {'blastn': makeBlastnDb, 'usearch': makeUBlastDb}[assemble_args['aligner']]
            ref_db, db_handle = db_func(ref_file, db_exec)
            assemble_args['ref_db'] = ref_db
        except:
            printError('Error building reference database for aligner %s with executable %s.' \
                       % (assemble_args['aligner'], db_exec))

    # Define feeder function and arguments
    feed_func = feedPairQueue
    feed_args = {'seq_file_1': head_file, 'seq_file_2': tail_file,
                 'coord_type': coord_type, 'delimiter': out_args['delimiter']}
    # Define worker function and arguments
    process_args = {'assemble_func': assemble_func, 'assemble_args': assemble_args, 'rc': rc,
                    'fields_1': head_fields, 'fields_2': tail_fields,
                    'delimiter': out_args['delimiter']}
    work_func = processSeqQueue
    work_args = {'process_func': assemblyWorker, 'process_args': process_args}
    # Define collector function and arguments
    collect_func = collectPairQueue
    collect_args = {'seq_file_1': head_file, 'seq_file_2': tail_file,
                    'label': 'assemble', 'out_file': out_file, 'out_args': out_args}

    # Call process manager
    result = manageProcesses(feed_func, work_func, collect_func,
                             feed_args, work_args, collect_args,
                             nproc, queue_size)

    # Close reference database handle
    if cmd_name in ('reference', 'sequential'):
        try:
            db_handle.close()
        except AttributeError:
            db_handle.cleanup()
        except:
            printError('Cannot close reference database file.')

    # Print log
    log = OrderedDict()
    log['OUTPUT'] = result['log'].pop('OUTPUT')
    for k, v in result['log'].items():
        log[k] = v
    log['END'] = 'AssemblePairs'
    printLog(log)

    return result['out_files']

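# Usage sketch (hypothetical file names): de novo overlap assembly of a read pair set,
# reverse complementing the tail file before alignment. The assemble_args keys mirror the
# parameters logged above; the coordinate type value is assumed.
#
#   out_files = assemblePairs('reads_R1.fastq', 'reads_R2.fastq', assemble_func=alignAssembly,
#                             assemble_args={'alpha': 1e-5, 'max_error': 0.3},
#                             coord_type='illumina', rc='tail', nproc=4)
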
def maskPrimers(seq_file, primer_file, mode, align_func, align_args={},
                max_error=default_max_error, barcode=False,
                out_args=default_out_args, nproc=None, queue_size=None):
    """
    Masks or cuts primers from sample sequences using local alignment

    Arguments:
      seq_file = name of file containing sample sequences
      primer_file = name of the file containing primer sequences
      mode = defines the action taken; one of 'cut', 'mask', 'tag'
      align_func = the function to call for alignment
      align_args = a dictionary of arguments to pass to align_func
      max_error = maximum acceptable error rate for a valid alignment
      barcode = if True add sequence preceding primer to description
      out_args = common output argument dictionary from parseCommonArgs
      nproc = the number of processQueue processes; if None defaults to the number of CPUs
      queue_size = maximum size of the argument queue; if None defaults to 2*nproc

    Returns:
      a list of successful output file names
    """
    # Define subcommand label dictionary
    cmd_dict = {alignPrimers: 'align', scorePrimers: 'score'}

    # Print parameter info
    log = OrderedDict()
    log['START'] = 'MaskPrimers'
    log['COMMAND'] = cmd_dict.get(align_func, align_func.__name__)
    log['SEQ_FILE'] = os.path.basename(seq_file)
    log['PRIMER_FILE'] = os.path.basename(primer_file)
    log['MODE'] = mode
    log['BARCODE'] = barcode
    log['MAX_ERROR'] = max_error
    if 'start' in align_args: log['START_POS'] = align_args['start']
    if 'max_len' in align_args: log['MAX_LEN'] = align_args['max_len']
    if 'rev_primer' in align_args: log['REV_PRIMER'] = align_args['rev_primer']
    if 'skip_rc' in align_args: log['SKIP_RC'] = align_args['skip_rc']
    if 'gap_penalty' in align_args:
        log['GAP_PENALTY'] = ', '.join([str(x) for x in align_args['gap_penalty']])
    log['NPROC'] = nproc
    printLog(log)

    # Create dictionary of primer sequences to pass to maskPrimers
    primers = readPrimerFile(primer_file)
    if 'rev_primer' in align_args and align_args['rev_primer']:
        primers = {k: reverseComplement(v) for k, v in primers.items()}

    # Define alignment arguments and compile primers for align mode
    align_args['primers'] = primers
    align_args['score_dict'] = getDNAScoreDict(mask_score=(0, 1), gap_score=(0, 0))
    if align_func is alignPrimers:
        align_args['max_error'] = max_error
        align_args['primers_regex'] = compilePrimers(primers)

    # Define sequence masking arguments
    mask_args = {'mode': mode, 'barcode': barcode, 'delimiter': out_args['delimiter']}

    # Define feeder function and arguments
    feed_func = feedSeqQueue
    feed_args = {'seq_file': seq_file}
    # Define worker function and arguments
    work_func = processMPQueue
    work_args = {'align_func': align_func, 'align_args': align_args,
                 'mask_args': mask_args, 'max_error': max_error}
    # Define collector function and arguments
    collect_func = collectSeqQueue
    collect_args = {'seq_file': seq_file, 'task_label': 'primers', 'out_args': out_args}

    # Call process manager
    result = manageProcesses(feed_func, work_func, collect_func,
                             feed_args, work_args, collect_args,
                             nproc, queue_size)

    # Print log
    result['log']['END'] = 'MaskPrimers'
    printLog(result['log'])

    return result['out_files']

def alignSets(seq_file, align_func, align_args, barcode_field=default_barcode_field,
              calc_div=False, out_args=default_out_args, nproc=None, queue_size=None):
    """
    Performs a multiple alignment on sets of sequences

    Arguments:
      seq_file = the sample sequence file name
      align_func = the function to use to align sequence sets
      align_args = a dictionary of arguments to pass to align_func
      barcode_field = the annotation containing set IDs
      calc_div = if True calculate average pairwise error for each sequence set
      out_args = common output argument dictionary from parseCommonArgs
      nproc = the number of processQueue processes; if None defaults to the number of CPUs
      queue_size = maximum size of the argument queue; if None defaults to 2*nproc

    Returns:
      a tuple of (valid_file, invalid_file) names
    """
    # Define subcommand label dictionary
    cmd_dict = {runMuscle: 'align', offsetSeqSet: 'offset'}

    # Print parameter info
    log = OrderedDict()
    log['START'] = 'AlignSets'
    log['COMMAND'] = cmd_dict[align_func]
    log['FILE'] = os.path.basename(seq_file)
    if 'mode' in align_args: log['MODE'] = align_args['mode']
    log['BARCODE_FIELD'] = barcode_field
    if 'field' in align_args: log['OFFSET_FIELD'] = align_args['field']
    log['CALC_DIV'] = calc_div
    log['NPROC'] = nproc
    printLog(log)

    # Define feeder function and arguments
    index_args = {'field': barcode_field, 'delimiter': out_args['delimiter']}
    feed_func = feedSeqQueue
    feed_args = {'seq_file': seq_file, 'index_func': indexSeqSets, 'index_args': index_args}
    # Define worker function and arguments
    work_func = processASQueue
    work_args = {'align_func': align_func, 'align_args': align_args,
                 'calc_div': calc_div, 'delimiter': out_args['delimiter']}
    # Define collector function and arguments
    collect_func = collectSeqQueue
    collect_args = {'seq_file': seq_file, 'task_label': 'align',
                    'out_args': out_args, 'index_field': barcode_field}

    # Call process manager
    result = manageProcesses(feed_func, work_func, collect_func,
                             feed_args, work_args, collect_args,
                             nproc, queue_size)

    # Print log
    result['log']['END'] = 'AlignSets'
    printLog(result['log'])

    return result['out_files']

def clusterSets(seq_file, ident=default_cluster_ident, length_ratio=default_length_ratio,
                seq_start=0, seq_end=None, set_field=default_barcode_field,
                cluster_field=default_cluster_field, cluster_prefix=default_cluster_prefix,
                cluster_tool=default_cluster_tool, cluster_exec=default_cluster_exec,
                out_file=None, out_args=default_out_args, nproc=None, queue_size=None):
    """
    Performs clustering on sets of sequences

    Arguments:
      seq_file : the sample sequence file name.
      ident : the identity threshold for clustering sequences.
      length_ratio : minimum short/long length ratio allowed within a cluster.
      seq_start : the start position to trim sequences at before clustering.
      seq_end : the end position to trim sequences at before clustering.
      set_field : the annotation containing set IDs.
      cluster_field : the name of the output cluster field.
      cluster_prefix : string defining a prefix for the cluster identifier.
      cluster_tool : the clustering tool to use; one of cd-hit or usearch.
      cluster_exec : the path to the clustering executable.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.
      nproc : the number of processQueue processes; if None defaults to the number of CPUs.
      queue_size : maximum size of the argument queue; if None defaults to 2*nproc.

    Returns:
      str: the clustered output file name.
    """
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'ClusterSets'
    log['COMMAND'] = 'set'
    log['FILE'] = os.path.basename(seq_file)
    log['IDENTITY'] = ident
    log['SEQUENCE_START'] = seq_start
    log['SEQUENCE_END'] = seq_end
    log['SET_FIELD'] = set_field
    log['CLUSTER_FIELD'] = cluster_field
    log['CLUSTER_PREFIX'] = cluster_prefix
    log['CLUSTER_TOOL'] = cluster_tool
    log['NPROC'] = nproc
    printLog(log)

    # Set cluster tool
    try:
        cluster_func = map_cluster_tool.get(cluster_tool)
    except:
        printError('Invalid clustering tool %s.' % cluster_tool)

    # Check the minimum identity
    if ident < min_cluster_ident[cluster_tool]:
        printError('Minimum identity %s too low for clustering tool %s.' % (str(ident), cluster_tool))

    # Define cluster function parameters
    cluster_args = {'cluster_exec': cluster_exec, 'ident': ident, 'length_ratio': length_ratio,
                    'seq_start': seq_start, 'seq_end': seq_end}

    # Define feeder function and arguments
    index_args = {'field': set_field, 'delimiter': out_args['delimiter']}
    feed_func = feedSeqQueue
    feed_args = {'seq_file': seq_file, 'index_func': indexSeqSets, 'index_args': index_args}
    # Define worker function and arguments
    work_func = processQueue
    work_args = {'cluster_func': cluster_func, 'cluster_args': cluster_args,
                 'cluster_field': cluster_field, 'cluster_prefix': cluster_prefix,
                 'delimiter': out_args['delimiter']}
    # Define collector function and arguments
    collect_func = collectSeqQueue
    collect_args = {'seq_file': seq_file, 'label': 'cluster', 'out_file': out_file,
                    'out_args': out_args, 'index_field': set_field}

    # Call process manager
    result = manageProcesses(feed_func, work_func, collect_func,
                             feed_args, work_args, collect_args,
                             nproc, queue_size)

    # Print log
    log = OrderedDict()
    log['OUTPUT'] = result['log'].pop('OUTPUT')
    for k, v in result['log'].items():
        log[k] = v
    log['END'] = 'ClusterSets'
    printLog(log)

    return result['out_files']

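# Usage sketch (hypothetical input): cluster the reads within each BARCODE set at 90%
# identity using CD-HIT, writing the assignment to a new CLUSTER annotation. Values are
# illustrative only.
#
#   out_files = clusterSets('reads.fastq', ident=0.90, set_field='BARCODE',
#                           cluster_field='CLUSTER', cluster_tool='cd-hit')
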
def buildConsensus(seq_file, barcode_field=default_barcode_field, min_count=default_min_count,
                   min_freq=default_min_freq, min_qual=default_min_qual,
                   primer_field=None, primer_freq=None, max_gap=None,
                   max_error=None, max_diversity=None,
                   copy_fields=None, copy_actions=None, dependent=False,
                   out_args=default_out_args, nproc=None, queue_size=None):
    """
    Generates consensus sequences

    Arguments:
      seq_file = the sample sequence file name
      barcode_field = the annotation field containing set IDs
      min_count = threshold number of sequences to define a consensus
      min_freq = the frequency cutoff to assign a base
      min_qual = the quality cutoff to assign a base
      primer_field = the annotation field containing primer tags;
                     if None do not annotate with primer tags
      primer_freq = the minimum primer tag frequency that must be met to build a consensus;
                    if None do not filter by primer frequency
      max_gap = the maximum frequency of (., -) characters allowed before deleting a position;
                if None do not delete positions
      max_error = a threshold defining the maximum allowed error rate to retain a read group;
                  if None do not calculate error rate
      max_diversity = a threshold defining the average pairwise error rate required to retain
                      a read group; if None do not calculate diversity
      dependent = if False treat barcode group sequences as independent data
      copy_fields = a list of annotations to copy into consensus sequence annotations;
                    if None no additional annotations will be copied
      copy_actions = the list of actions to take for each copy_fields;
                     one of ['set', 'majority', 'min', 'max', 'sum']
      out_args = common output argument dictionary from parseCommonArgs
      nproc = the number of processQueue processes; if None defaults to the number of CPUs
      queue_size = maximum size of the argument queue; if None defaults to 2*nproc

    Returns:
      a list of successful output file names
    """
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'BuildConsensus'
    log['FILE'] = os.path.basename(seq_file)
    log['BARCODE_FIELD'] = barcode_field
    log['MIN_COUNT'] = min_count
    log['MIN_FREQUENCY'] = min_freq
    log['MIN_QUALITY'] = min_qual
    log['MAX_GAP'] = max_gap
    log['PRIMER_FIELD'] = primer_field
    log['PRIMER_FREQUENCY'] = primer_freq
    log['MAX_ERROR'] = max_error
    log['MAX_DIVERSITY'] = max_diversity
    log['DEPENDENT'] = dependent
    log['COPY_FIELDS'] = ','.join(copy_fields) if copy_fields is not None else None
    log['COPY_ACTIONS'] = ','.join(copy_actions) if copy_actions is not None else None
    log['NPROC'] = nproc
    printLog(log)

    # Set consensus building function
    in_type = getFileType(seq_file)
    if in_type == 'fastq':
        cons_func = qualityConsensus
        cons_args = {'min_qual': min_qual, 'min_freq': min_freq, 'dependent': dependent}
    elif in_type == 'fasta':
        cons_func = frequencyConsensus
        cons_args = {'min_freq': min_freq}
    else:
        sys.exit('ERROR: Input file must be FASTA or FASTQ')

    # Define feeder function and arguments
    index_args = {'field': barcode_field, 'delimiter': out_args['delimiter']}
    feed_func = feedSeqQueue
    feed_args = {'seq_file': seq_file, 'index_func': indexSeqSets, 'index_args': index_args}
    # Define worker function and arguments
    work_func = processBCQueue
    work_args = {'cons_func': cons_func, 'cons_args': cons_args,
                 'min_count': min_count, 'primer_field': primer_field, 'primer_freq': primer_freq,
                 'max_gap': max_gap, 'max_error': max_error, 'max_diversity': max_diversity,
                 'copy_fields': copy_fields, 'copy_actions': copy_actions,
                 'delimiter': out_args['delimiter']}
    # Define collector function and arguments
    collect_func = collectSeqQueue
    collect_args = {'seq_file': seq_file, 'task_label': 'consensus',
                    'out_args': out_args, 'index_field': barcode_field}

    # Call process manager
    result = manageProcesses(feed_func, work_func, collect_func,
                             feed_args, work_args, collect_args,
                             nproc, queue_size)

    # Print log
    result['log']['END'] = 'BuildConsensus'
    printLog(result['log'])

    return result['out_files']

def maskPrimers(seq_file, primer_file, align_func, align_args={}, out_file=None,
                out_args=default_out_args, nproc=None, queue_size=None):
    """
    Masks or cuts primers from sample sequences using local alignment

    Arguments:
      seq_file : name of file containing sample sequences.
      primer_file : name of the file containing primer sequences.
      align_func : the function to call for alignment.
      align_args : a dictionary of arguments to pass to align_func.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.
      nproc : the number of processQueue processes; if None defaults to the number of CPUs.
      queue_size : maximum size of the argument queue; if None defaults to 2*nproc.

    Returns:
      list: a list of successful output file names.
    """
    # Define subcommand label dictionary
    cmd_dict = {alignPrimers: 'align', scorePrimers: 'score', extractPrimers: 'extract'}

    # Print parameter info
    log = OrderedDict()
    log['START'] = 'MaskPrimers'
    log['COMMAND'] = cmd_dict.get(align_func, align_func.__name__)
    log['SEQ_FILE'] = os.path.basename(seq_file)
    if primer_file is not None: log['PRIMER_FILE'] = os.path.basename(primer_file)
    if 'mode' in align_args: log['MODE'] = align_args['mode']
    if 'max_error' in align_args: log['MAX_ERROR'] = align_args['max_error']
    if 'start' in align_args: log['START_POS'] = align_args['start']
    if 'length' in align_args: log['LENGTH'] = align_args['length']
    if 'max_len' in align_args: log['MAX_LEN'] = align_args['max_len']
    if 'rev_primer' in align_args: log['REV_PRIMER'] = align_args['rev_primer']
    if 'skip_rc' in align_args: log['SKIP_RC'] = align_args['skip_rc']
    if 'gap_penalty' in align_args:
        log['GAP_PENALTY'] = ', '.join([str(x) for x in align_args['gap_penalty']])
    if 'barcode' in align_args: log['BARCODE'] = align_args['barcode']
    if 'barcode' in align_args and align_args['barcode']:
        log['BARCODE_FIELD'] = align_args['barcode_field']
        log['PRIMER_FIELD'] = align_args['primer_field']
    log['NPROC'] = nproc
    printLog(log)

    # Define alignment arguments and compile primers for align mode
    if primer_file is not None:
        primers = readPrimerFile(primer_file)
        if 'rev_primer' in align_args and align_args['rev_primer']:
            primers = {k: reverseComplement(v) for k, v in primers.items()}
        align_args['primers'] = primers
        align_args['score_dict'] = getDNAScoreDict(mask_score=(0, 1), gap_score=(0, 0))
        if align_func is alignPrimers:
            align_args['primers_regex'] = compilePrimers(primers)
    align_args['delimiter'] = out_args['delimiter']

    # Define feeder function and arguments
    feed_func = feedSeqQueue
    feed_args = {'seq_file': seq_file}
    # Define worker function and arguments
    work_func = processSeqQueue
    work_args = {'process_func': align_func, 'process_args': align_args}
    # Define collector function and arguments
    collect_func = collectSeqQueue
    collect_args = {'seq_file': seq_file, 'label': 'primers', 'out_file': out_file,
                    'out_args': out_args}

    # Call process manager
    result = manageProcesses(feed_func, work_func, collect_func,
                             feed_args, work_args, collect_args,
                             nproc, queue_size)

    # Print log
    result['log']['END'] = 'MaskPrimers'
    printLog(result['log'])

    return result['out_files']

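# Usage sketch (hypothetical files and settings): score primers at a fixed start position
# and mask them. The align_args keys shown mirror the options logged above; values are
# illustrative.
#
#   out_files = maskPrimers('reads.fastq', 'vprimers.fasta', align_func=scorePrimers,
#                           align_args={'mode': 'mask', 'start': 0, 'max_error': 0.2})
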
def estimateSets(seq_file, cons_func=frequencyConsensus, cons_args={},
                 set_field=default_barcode_field, min_count=default_min_count,
                 max_diversity=None, out_args=default_out_args,
                 nproc=None, queue_size=None):
    """
    Calculates error rates of sequence sets

    Arguments:
      seq_file : the sample sequence file name
      cons_func : the function to use for consensus generation
      cons_args : a dictionary of arguments for the consensus function
      set_field : the annotation field containing set IDs
      min_count : threshold number of sequences to consider a set
      max_diversity : a threshold defining the average pairwise error rate required to retain
                      a read group; if None do not calculate diversity
      out_args : common output argument dictionary from parseCommonArgs
      nproc : the number of processQueue processes; if None defaults to the number of CPUs
      queue_size : maximum size of the argument queue; if None defaults to 2*nproc

    Returns:
      tuple : (position error, quality error, nucleotide pairwise error) output file names
    """
    # Define subcommand label dictionary
    cmd_dict = {frequencyConsensus: 'freq', qualityConsensus: 'qual'}

    # Print parameter info
    log = OrderedDict()
    log['START'] = 'EstimateError'
    log['FILE'] = os.path.basename(seq_file)
    log['MODE'] = cmd_dict.get(cons_func, cons_func.__name__)
    log['SET_FIELD'] = set_field
    log['MIN_COUNT'] = min_count
    log['MAX_DIVERSITY'] = max_diversity
    log['NPROC'] = nproc
    printLog(log)

    # Check input file type
    in_type = getFileType(seq_file)
    if in_type != 'fastq':
        printError('Input file must be FASTQ.')

    # Define feeder function and arguments
    index_args = {'field': set_field, 'delimiter': out_args['delimiter']}
    feed_func = feedSeqQueue
    feed_args = {'seq_file': seq_file, 'index_func': indexSeqSets, 'index_args': index_args}
    # Define worker function and arguments
    work_func = processEEQueue
    work_args = {'cons_func': cons_func, 'cons_args': cons_args,
                 'min_count': min_count, 'max_diversity': max_diversity}
    # Define collector function and arguments
    collect_func = collectEEQueue
    collect_args = {'seq_file': seq_file, 'out_args': out_args, 'set_field': set_field}

    # Call process manager
    result = manageProcesses(feed_func, work_func, collect_func,
                             feed_args, work_args, collect_args,
                             nproc, queue_size)

    # Print log
    result['log']['END'] = 'EstimateError'
    printLog(result['log'])

    return result['out_files']

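# Usage sketch (hypothetical input): estimate per-position and per-quality error rates from
# BARCODE groups containing at least 20 reads.
#
#   out_files = estimateSets('reads.fastq', cons_func=qualityConsensus,
#                            set_field='BARCODE', min_count=20)
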
def defineClones(db_file, feed_func, work_func, collect_func, clone_func, cluster_func=None,
                 group_func=None, group_args={}, clone_args={}, cluster_args={},
                 out_args=default_out_args, nproc=None, queue_size=None):
    """
    Define clonally related sequences

    Arguments:
      db_file = filename of input database
      feed_func = the function that feeds the queue
      work_func = the worker function that will run on each CPU
      collect_func = the function that collects results from the workers
      group_func = the function to use for assigning preclones
      clone_func = the function to use for determining clones within preclonal groups
      group_args = a dictionary of arguments to pass to group_func
      clone_args = a dictionary of arguments to pass to clone_func
      out_args = common output argument dictionary from parseCommonArgs
      nproc = the number of processQueue processes; if None defaults to the number of CPUs
      queue_size = maximum size of the argument queue; if None defaults to 2*nproc

    Returns:
      a list of successful output file names
    """
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'DefineClones'
    log['DB_FILE'] = os.path.basename(db_file)
    if group_func is not None:
        log['GROUP_FUNC'] = group_func.__name__
        log['GROUP_ARGS'] = group_args
    log['CLONE_FUNC'] = clone_func.__name__
    # TODO: this is yucky, but can be fixed by using a model class
    clone_log = clone_args.copy()
    if 'dist_mat' in clone_log:
        del clone_log['dist_mat']
    log['CLONE_ARGS'] = clone_log
    if cluster_func is not None:
        log['CLUSTER_FUNC'] = cluster_func.__name__
        log['CLUSTER_ARGS'] = cluster_args
    log['NPROC'] = nproc
    printLog(log)

    # Define feeder function and arguments
    feed_args = {'db_file': db_file, 'group_func': group_func, 'group_args': group_args}
    # Define worker function and arguments
    work_args = {'clone_func': clone_func, 'clone_args': clone_args}
    # Define collector function and arguments
    collect_args = {'db_file': db_file, 'out_args': out_args,
                    'cluster_func': cluster_func, 'cluster_args': cluster_args}

    # Call process manager
    result = manageProcesses(feed_func, work_func, collect_func,
                             feed_args, work_args, collect_args,
                             nproc, queue_size)

    # Print log
    result['log']['END'] = 'DefineClones'
    printLog(result['log'])

    return result['out_files']

def defineClones(db_file, seq_field=default_junction_field, v_field=default_v_field,
                 j_field=default_j_field, max_missing=default_max_missing,
                 group_fields=None, group_func=groupByGene, group_args={},
                 clone_func=distanceClones, clone_args={},
                 format=default_format, out_file=None, out_args=default_out_args,
                 nproc=None, queue_size=None):
    """
    Define clonally related sequences

    Arguments:
      db_file : filename of input database.
      seq_field : sequence field used to determine clones.
      v_field : field containing the V call.
      j_field : field containing the J call.
      max_missing : maximum number of non-ACGT characters to allow in the junction sequence.
      group_fields : additional annotation fields to use to group preclones;
                     if None use only V and J.
      group_func : the function to use for assigning preclones.
      group_args : a dictionary of arguments to pass to group_func.
      clone_func : the function to use for determining clones within preclonal groups.
      clone_args : a dictionary of arguments to pass to clone_func.
      format : input and output format.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.
      nproc : the number of processQueue processes; if None defaults to the number of CPUs.
      queue_size : maximum size of the argument queue; if None defaults to 2*nproc.

    Returns:
      dict: dictionary of output pass and fail files.
    """
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'DefineClones'
    log['FILE'] = os.path.basename(db_file)
    log['SEQ_FIELD'] = seq_field
    log['V_FIELD'] = v_field
    log['J_FIELD'] = j_field
    log['MAX_MISSING'] = max_missing
    log['GROUP_FIELDS'] = ','.join(group_fields) if group_fields is not None else None
    for k in sorted(group_args):
        log[k.upper()] = group_args[k]
    for k in sorted(clone_args):
        if k != 'dist_mat':
            log[k.upper()] = clone_args[k]
    log['NPROC'] = nproc
    printLog(log)

    # Define format operators
    try:
        reader, writer, schema = getFormatOperators(format)
    except ValueError:
        printError('Invalid format %s.' % format)

    # Translate to Receptor attribute names
    seq_field = schema.toReceptor(seq_field)
    v_field = schema.toReceptor(v_field)
    j_field = schema.toReceptor(j_field)
    if group_fields is not None:
        group_fields = [schema.toReceptor(f) for f in group_fields]

    # Define feeder function and arguments
    group_args['group_fields'] = group_fields
    group_args['v_field'] = v_field
    group_args['j_field'] = j_field
    feed_args = {'db_file': db_file, 'reader': reader,
                 'group_func': group_func, 'group_args': group_args}
    # Define worker function and arguments
    filter_args = {'seq_field': seq_field, 'v_field': v_field, 'j_field': j_field,
                   'max_missing': max_missing}
    clone_args['seq_field'] = seq_field
    work_args = {'process_func': clone_func, 'process_args': clone_args,
                 'filter_func': filterMissing, 'filter_args': filter_args}
    # Define collector function and arguments
    out_fields = getDbFields(db_file, add=schema.fromReceptor('clone'), reader=reader)
    out_args['out_type'] = schema.out_type
    collect_args = {'db_file': db_file, 'fields': out_fields, 'writer': writer,
                    'out_file': out_file, 'out_args': out_args}

    # Check for required columns
    try:
        required = ['junction']
        checkFields(required, out_fields, schema=schema)
    except LookupError as e:
        printError(e)

    # Call process manager
    result = manageProcesses(feed_func=feedDbQueue, work_func=processDbQueue,
                             collect_func=collectQueue,
                             feed_args=feed_args, work_args=work_args, collect_args=collect_args,
                             nproc=nproc, queue_size=queue_size)

    # Print log
    result['log']['END'] = 'DefineClones'
    printLog(result['log'])

    output = {k: v for k, v in result.items() if k in ('pass', 'fail')}

    return output

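# Usage sketch (hypothetical database): assign clones on the junction column using the
# default V/J grouping and distance-based clonal assignment; the 'distance' key inside
# clone_args is an assumed argument name for distanceClones.
#
#   output = defineClones('db.tsv', group_func=groupByGene, clone_func=distanceClones,
#                         clone_args={'distance': 0.12}, max_missing=5, format='airr')
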